xref: /haiku/src/system/kernel/fs/vfs.cpp (revision 3995592cdf304335132305e27c40cbb0b1ac46e3)
1 /*
2  * Copyright 2005-2013, Ingo Weinhold, ingo_weinhold@gmx.de.
3  * Copyright 2002-2017, Axel Dörfler, axeld@pinc-software.de.
4  * Distributed under the terms of the MIT License.
5  *
6  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7  * Distributed under the terms of the NewOS License.
8  */
9 
10 
11 /*! Virtual File System and File System Interface Layer */
12 
13 
14 #include <ctype.h>
15 #include <fcntl.h>
16 #include <limits.h>
17 #include <stddef.h>
18 #include <stdio.h>
19 #include <string.h>
20 #include <sys/file.h>
21 #include <sys/resource.h>
22 #include <sys/stat.h>
23 #include <unistd.h>
24 
25 #include <fs_attr.h>
26 #include <fs_info.h>
27 #include <fs_interface.h>
28 #include <fs_volume.h>
29 #include <OS.h>
30 #include <StorageDefs.h>
31 
32 #include <AutoDeleter.h>
33 #include <block_cache.h>
34 #include <boot/kernel_args.h>
35 #include <debug_heap.h>
36 #include <disk_device_manager/KDiskDevice.h>
37 #include <disk_device_manager/KDiskDeviceManager.h>
38 #include <disk_device_manager/KDiskDeviceUtils.h>
39 #include <disk_device_manager/KDiskSystem.h>
40 #include <fd.h>
41 #include <file_cache.h>
42 #include <fs/node_monitor.h>
43 #include <KPath.h>
44 #include <lock.h>
45 #include <low_resource_manager.h>
46 #include <syscalls.h>
47 #include <syscall_restart.h>
48 #include <tracing.h>
49 #include <util/atomic.h>
50 #include <util/AutoLock.h>
51 #include <util/DoublyLinkedList.h>
52 #include <vfs.h>
53 #include <vm/vm.h>
54 #include <vm/VMCache.h>
55 
56 #include "EntryCache.h"
57 #include "fifo.h"
58 #include "IORequest.h"
59 #include "unused_vnodes.h"
60 #include "vfs_tracing.h"
61 #include "Vnode.h"
62 #include "../cache/vnode_store.h"
63 
64 
65 //#define TRACE_VFS
66 #ifdef TRACE_VFS
67 #	define TRACE(x) dprintf x
68 #	define FUNCTION(x) dprintf x
69 #else
70 #	define TRACE(x) ;
71 #	define FUNCTION(x) ;
72 #endif
73 
74 #define ADD_DEBUGGER_COMMANDS
75 
76 
77 #define HAS_FS_CALL(vnode, op)			(vnode->ops->op != NULL)
78 #define HAS_FS_MOUNT_CALL(mount, op)	(mount->volume->ops->op != NULL)
79 
80 #if KDEBUG
81 #	define FS_CALL(vnode, op, params...) \
82 		( HAS_FS_CALL(vnode, op) ? \
83 			vnode->ops->op(vnode->mount->volume, vnode, params) \
84 			: (panic("FS_CALL op " #op " is NULL"), 0))
85 #	define FS_CALL_NO_PARAMS(vnode, op) \
86 		( HAS_FS_CALL(vnode, op) ? \
87 			vnode->ops->op(vnode->mount->volume, vnode) \
88 			: (panic("FS_CALL_NO_PARAMS op " #op " is NULL"), 0))
89 #	define FS_MOUNT_CALL(mount, op, params...) \
90 		( HAS_FS_MOUNT_CALL(mount, op) ? \
91 			mount->volume->ops->op(mount->volume, params) \
92 			: (panic("FS_MOUNT_CALL op " #op " is NULL"), 0))
93 #	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
94 		( HAS_FS_MOUNT_CALL(mount, op) ? \
95 			mount->volume->ops->op(mount->volume) \
96 			: (panic("FS_MOUNT_CALL_NO_PARAMS op " #op " is NULL"), 0))
97 #else
98 #	define FS_CALL(vnode, op, params...) \
99 			vnode->ops->op(vnode->mount->volume, vnode, params)
100 #	define FS_CALL_NO_PARAMS(vnode, op) \
101 			vnode->ops->op(vnode->mount->volume, vnode)
102 #	define FS_MOUNT_CALL(mount, op, params...) \
103 			mount->volume->ops->op(mount->volume, params)
104 #	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
105 			mount->volume->ops->op(mount->volume)
106 #endif
107 
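/*	Example (illustrative only): with KDEBUG enabled, a call such as

		status_t status = FS_CALL(vnode, read_stat, &stat);

	first checks HAS_FS_CALL(vnode, read_stat) and panics if the file system
	does not implement the hook; without KDEBUG it expands to a direct,
	unchecked call to vnode->ops->read_stat(vnode->mount->volume, vnode,
	&stat).
*/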
108 
109 const static size_t kMaxPathLength = 65536;
110 	// The absolute maximum path length (for getcwd()); it does not depend
111 	// on PATH_MAX.
112 
113 
114 typedef DoublyLinkedList<vnode> VnodeList;
115 
116 /*!	\brief Structure to manage a mounted file system
117 
118 	Note: The root_vnode and root_vnode->covers fields (what others?) are
119 	initialized in fs_mount() and not changed afterwards. That is, as soon
120 	as the mount is mounted and it is ensured that it won't be unmounted
121 	(e.g. by holding a reference to a vnode of that mount), (read) access
122 	to those fields is always safe, even without additional locking. Moreover,
123 	while mounted the mount holds a reference to the root_vnode->covers vnode,
124 	thus making the access path vnode->mount->root_vnode->covers->mount->...
125 	safe if a reference to vnode is held (note that for the root mount
126 	root_vnode->covers is NULL, though).
127 */
128 struct fs_mount {
129 	fs_mount()
130 		:
131 		volume(NULL),
132 		device_name(NULL)
133 	{
134 		recursive_lock_init(&rlock, "mount rlock");
135 	}
136 
137 	~fs_mount()
138 	{
139 		recursive_lock_destroy(&rlock);
140 		free(device_name);
141 
142 		while (volume) {
143 			fs_volume* superVolume = volume->super_volume;
144 
145 			if (volume->file_system != NULL)
146 				put_module(volume->file_system->info.name);
147 
148 			free(volume->file_system_name);
149 			free(volume);
150 			volume = superVolume;
151 		}
152 	}
153 
154 	struct fs_mount* next;
155 	dev_t			id;
156 	fs_volume*		volume;
157 	char*			device_name;
158 	recursive_lock	rlock;	// guards the vnodes list
159 		// TODO: Make this a mutex! It is never used recursively.
160 	struct vnode*	root_vnode;
161 	struct vnode*	covers_vnode;	// immutable
162 	KPartition*		partition;
163 	VnodeList		vnodes;
164 	EntryCache		entry_cache;
165 	bool			unmounting;
166 	bool			owns_file_device;
167 };
168 
169 
170 namespace {
171 
172 struct advisory_lock : public DoublyLinkedListLinkImpl<advisory_lock> {
173 	list_link		link;
174 	team_id			team;
175 	pid_t			session;
176 	off_t			start;
177 	off_t			end;
178 	bool			shared;
179 };
180 
181 typedef DoublyLinkedList<advisory_lock> LockList;
182 
183 } // namespace
184 
185 
186 struct advisory_locking {
187 	sem_id			lock;
188 	sem_id			wait_sem;
189 	LockList		locks;
190 
191 	advisory_locking()
192 		:
193 		lock(-1),
194 		wait_sem(-1)
195 	{
196 	}
197 
198 	~advisory_locking()
199 	{
200 		if (lock >= 0)
201 			delete_sem(lock);
202 		if (wait_sem >= 0)
203 			delete_sem(wait_sem);
204 	}
205 };
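
/*	A sketch of how these structures are populated (illustrative only): a
	POSIX byte-range lock taken from userland, e.g.

		struct flock flock;
		flock.l_type = F_WRLCK;
		flock.l_whence = SEEK_SET;
		flock.l_start = 0;
		flock.l_len = 100;
		fcntl(fd, F_SETLK, &flock);

	eventually reaches acquire_advisory_lock() below, which appends one
	advisory_lock entry (team, session == -1 for POSIX locks, start == 0,
	end == 99, shared == false) to the vnode's advisory_locking list.
*/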
206 
207 /*!	\brief Guards sMountsTable.
208 
209 	The holder is allowed read/write access to sMountsTable.
210 	Manipulation of the fs_mount structures themselves
211 	(and their destruction) requires different locks though.
212 */
213 static mutex sMountMutex = MUTEX_INITIALIZER("vfs_mount_lock");
214 
215 /*!	\brief Guards mount/unmount operations.
216 
217 	fs_mount() and fs_unmount() hold the lock during their whole operation.
218 	That is, locking the lock ensures that no FS is mounted or unmounted. In
219 	particular this means that
220 	- sMountsTable will not be modified,
221 	- the fields of the fs_mount structures in sMountsTable that are immutable
222 	  after initialization will not be modified.
223 
224 	The thread trying to lock the lock must not hold sVnodeLock or
225 	sMountMutex.
226 */
227 static recursive_lock sMountOpLock;
228 
229 /*!	\brief Guards sVnodeTable.
230 
231 	The holder is allowed read/write access to sVnodeTable and to
232 	any unbusy vnode in that table, save for the immutable fields (device, id,
233 	private_node, mount) to which only read-only access is allowed.
234 	The mutable fields advisory_locking, mandatory_locked_by, and ref_count, as
235 	well as the busy, removed, unused flags, and the vnode's type can also be
236 	write accessed when holding a read lock to sVnodeLock *and* having the vnode
237 	locked. Write access to covered_by and covers requires write locking
238 	sVnodeLock.
239 
240 	The thread trying to acquire the lock must not hold sMountMutex.
241 	You must not hold this lock when calling create_sem(), as this might call
242 	vfs_free_unused_vnodes() and thus cause a deadlock.
243 */
244 static rw_lock sVnodeLock = RW_LOCK_INITIALIZER("vfs_vnode_lock");
245 
246 /*!	\brief Guards io_context::root.
247 
248 	Must be held when setting or getting the io_context::root field.
249 	The only operation allowed while holding this lock besides getting or
250 	setting the field is inc_vnode_ref_count() on io_context::root.
251 */
252 static mutex sIOContextRootLock = MUTEX_INITIALIZER("io_context::root lock");
253 
254 
255 namespace {
256 
257 struct vnode_hash_key {
258 	dev_t	device;
259 	ino_t	vnode;
260 };
261 
262 struct VnodeHash {
263 	typedef vnode_hash_key	KeyType;
264 	typedef	struct vnode	ValueType;
265 
266 #define VHASH(mountid, vnodeid) \
267 	(((uint32)((vnodeid) >> 32) + (uint32)(vnodeid)) ^ (uint32)(mountid))
268 
269 	size_t HashKey(KeyType key) const
270 	{
271 		return VHASH(key.device, key.vnode);
272 	}
273 
274 	size_t Hash(ValueType* vnode) const
275 	{
276 		return VHASH(vnode->device, vnode->id);
277 	}
278 
279 #undef VHASH
280 
281 	bool Compare(KeyType key, ValueType* vnode) const
282 	{
283 		return vnode->device == key.device && vnode->id == key.vnode;
284 	}
285 
286 	ValueType*& GetLink(ValueType* value) const
287 	{
288 		return value->next;
289 	}
290 };
291 
292 typedef BOpenHashTable<VnodeHash> VnodeTable;
293 
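/*	Worked example of the VHASH computation above (illustrative): for
	mountid = 3 and vnodeid = 0x100000002, the upper and lower 32 bits of the
	node ID are summed (1 + 2 = 3) and XORed with the mount ID (3 ^ 3 = 0),
	so this vnode lands in hash bucket 0 (modulo the table size).
*/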
294 
295 struct MountHash {
296 	typedef dev_t			KeyType;
297 	typedef	struct fs_mount	ValueType;
298 
299 	size_t HashKey(KeyType key) const
300 	{
301 		return key;
302 	}
303 
304 	size_t Hash(ValueType* mount) const
305 	{
306 		return mount->id;
307 	}
308 
309 	bool Compare(KeyType key, ValueType* mount) const
310 	{
311 		return mount->id == key;
312 	}
313 
314 	ValueType*& GetLink(ValueType* value) const
315 	{
316 		return value->next;
317 	}
318 };
319 
320 typedef BOpenHashTable<MountHash> MountTable;
321 
322 } // namespace
323 
324 
325 #define VNODE_HASH_TABLE_SIZE 1024
326 static VnodeTable* sVnodeTable;
327 static struct vnode* sRoot;
328 
329 #define MOUNTS_HASH_TABLE_SIZE 16
330 static MountTable* sMountsTable;
331 static dev_t sNextMountID = 1;
332 
333 #define MAX_TEMP_IO_VECS 8
334 
335 // How long to wait for busy vnodes: 2000 retries with a 5000 µs delay (10s)
336 #define BUSY_VNODE_RETRIES 2000
337 #define BUSY_VNODE_DELAY 5000
338 
339 mode_t __gUmask = 022;
340 
341 /* function declarations */
342 
343 static void free_unused_vnodes();
344 
345 // file descriptor operation prototypes
346 static status_t file_read(struct file_descriptor* descriptor, off_t pos,
347 	void* buffer, size_t* _bytes);
348 static status_t file_write(struct file_descriptor* descriptor, off_t pos,
349 	const void* buffer, size_t* _bytes);
350 static off_t file_seek(struct file_descriptor* descriptor, off_t pos,
351 	int seekType);
352 static void file_free_fd(struct file_descriptor* descriptor);
353 static status_t file_close(struct file_descriptor* descriptor);
354 static status_t file_select(struct file_descriptor* descriptor, uint8 event,
355 	struct selectsync* sync);
356 static status_t file_deselect(struct file_descriptor* descriptor, uint8 event,
357 	struct selectsync* sync);
358 static status_t dir_read(struct io_context* context,
359 	struct file_descriptor* descriptor, struct dirent* buffer,
360 	size_t bufferSize, uint32* _count);
361 static status_t dir_read(struct io_context* ioContext, struct vnode* vnode,
362 	void* cookie, struct dirent* buffer, size_t bufferSize, uint32* _count);
363 static status_t dir_rewind(struct file_descriptor* descriptor);
364 static void dir_free_fd(struct file_descriptor* descriptor);
365 static status_t dir_close(struct file_descriptor* descriptor);
366 static status_t attr_dir_read(struct io_context* context,
367 	struct file_descriptor* descriptor, struct dirent* buffer,
368 	size_t bufferSize, uint32* _count);
369 static status_t attr_dir_rewind(struct file_descriptor* descriptor);
370 static void attr_dir_free_fd(struct file_descriptor* descriptor);
371 static status_t attr_dir_close(struct file_descriptor* descriptor);
372 static status_t attr_read(struct file_descriptor* descriptor, off_t pos,
373 	void* buffer, size_t* _bytes);
374 static status_t attr_write(struct file_descriptor* descriptor, off_t pos,
375 	const void* buffer, size_t* _bytes);
376 static off_t attr_seek(struct file_descriptor* descriptor, off_t pos,
377 	int seekType);
378 static void attr_free_fd(struct file_descriptor* descriptor);
379 static status_t attr_close(struct file_descriptor* descriptor);
380 static status_t attr_read_stat(struct file_descriptor* descriptor,
381 	struct stat* statData);
382 static status_t attr_write_stat(struct file_descriptor* descriptor,
383 	const struct stat* stat, int statMask);
384 static status_t index_dir_read(struct io_context* context,
385 	struct file_descriptor* descriptor, struct dirent* buffer,
386 	size_t bufferSize, uint32* _count);
387 static status_t index_dir_rewind(struct file_descriptor* descriptor);
388 static void index_dir_free_fd(struct file_descriptor* descriptor);
389 static status_t index_dir_close(struct file_descriptor* descriptor);
390 static status_t query_read(struct io_context* context,
391 	struct file_descriptor* descriptor, struct dirent* buffer,
392 	size_t bufferSize, uint32* _count);
393 static status_t query_rewind(struct file_descriptor* descriptor);
394 static void query_free_fd(struct file_descriptor* descriptor);
395 static status_t query_close(struct file_descriptor* descriptor);
396 
397 static status_t common_ioctl(struct file_descriptor* descriptor, ulong op,
398 	void* buffer, size_t length);
399 static status_t common_read_stat(struct file_descriptor* descriptor,
400 	struct stat* statData);
401 static status_t common_write_stat(struct file_descriptor* descriptor,
402 	const struct stat* statData, int statMask);
403 static status_t common_path_read_stat(int fd, char* path, bool traverseLeafLink,
404 	struct stat* stat, bool kernel);
405 
406 static status_t vnode_path_to_vnode(struct vnode* vnode, char* path,
407 	bool traverseLeafLink, int count, bool kernel,
408 	struct vnode** _vnode, ino_t* _parentID);
409 static status_t dir_vnode_to_path(struct vnode* vnode, char* buffer,
410 	size_t bufferSize, bool kernel);
411 static status_t fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
412 	struct vnode** _vnode, ino_t* _parentID, bool kernel);
413 static void inc_vnode_ref_count(struct vnode* vnode);
414 static status_t dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree,
415 	bool reenter);
416 static inline void put_vnode(struct vnode* vnode);
417 static status_t fs_unmount(char* path, dev_t mountID, uint32 flags,
418 	bool kernel);
419 static int open_vnode(struct vnode* vnode, int openMode, bool kernel);
420 
421 
422 static struct fd_ops sFileOps = {
423 	file_read,
424 	file_write,
425 	file_seek,
426 	common_ioctl,
427 	NULL,		// set_flags
428 	file_select,
429 	file_deselect,
430 	NULL,		// read_dir()
431 	NULL,		// rewind_dir()
432 	common_read_stat,
433 	common_write_stat,
434 	file_close,
435 	file_free_fd
436 };
437 
438 static struct fd_ops sDirectoryOps = {
439 	NULL,		// read()
440 	NULL,		// write()
441 	NULL,		// seek()
442 	common_ioctl,
443 	NULL,		// set_flags
444 	NULL,		// select()
445 	NULL,		// deselect()
446 	dir_read,
447 	dir_rewind,
448 	common_read_stat,
449 	common_write_stat,
450 	dir_close,
451 	dir_free_fd
452 };
453 
454 static struct fd_ops sAttributeDirectoryOps = {
455 	NULL,		// read()
456 	NULL,		// write()
457 	NULL,		// seek()
458 	common_ioctl,
459 	NULL,		// set_flags
460 	NULL,		// select()
461 	NULL,		// deselect()
462 	attr_dir_read,
463 	attr_dir_rewind,
464 	common_read_stat,
465 	common_write_stat,
466 	attr_dir_close,
467 	attr_dir_free_fd
468 };
469 
470 static struct fd_ops sAttributeOps = {
471 	attr_read,
472 	attr_write,
473 	attr_seek,
474 	common_ioctl,
475 	NULL,		// set_flags
476 	NULL,		// select()
477 	NULL,		// deselect()
478 	NULL,		// read_dir()
479 	NULL,		// rewind_dir()
480 	attr_read_stat,
481 	attr_write_stat,
482 	attr_close,
483 	attr_free_fd
484 };
485 
486 static struct fd_ops sIndexDirectoryOps = {
487 	NULL,		// read()
488 	NULL,		// write()
489 	NULL,		// seek()
490 	NULL,		// ioctl()
491 	NULL,		// set_flags
492 	NULL,		// select()
493 	NULL,		// deselect()
494 	index_dir_read,
495 	index_dir_rewind,
496 	NULL,		// read_stat()
497 	NULL,		// write_stat()
498 	index_dir_close,
499 	index_dir_free_fd
500 };
501 
502 #if 0
503 static struct fd_ops sIndexOps = {
504 	NULL,		// read()
505 	NULL,		// write()
506 	NULL,		// seek()
507 	NULL,		// ioctl()
508 	NULL,		// set_flags
509 	NULL,		// select()
510 	NULL,		// deselect()
511 	NULL,		// dir_read()
512 	NULL,		// dir_rewind()
513 	index_read_stat,	// read_stat()
514 	NULL,		// write_stat()
515 	NULL,		// dir_close()
516 	NULL		// free_fd()
517 };
518 #endif
519 
520 static struct fd_ops sQueryOps = {
521 	NULL,		// read()
522 	NULL,		// write()
523 	NULL,		// seek()
524 	NULL,		// ioctl()
525 	NULL,		// set_flags
526 	NULL,		// select()
527 	NULL,		// deselect()
528 	query_read,
529 	query_rewind,
530 	NULL,		// read_stat()
531 	NULL,		// write_stat()
532 	query_close,
533 	query_free_fd
534 };
535 
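/*	The fd_ops tables above drive dispatch in the generic file descriptor
	layer: each descriptor type fills in only the hooks that make sense for
	it, and a NULL slot means the operation is unsupported for that type (the
	generic layer is expected to reject it). For example, read() on a
	directory fd fails because sDirectoryOps leaves the read slot NULL, while
	read_dir() on a regular file fd fails because sFileOps has no read_dir
	hook.
*/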
536 
537 namespace {
538 
539 class VNodePutter {
540 public:
541 	VNodePutter(struct vnode* vnode = NULL) : fVNode(vnode) {}
542 
543 	~VNodePutter()
544 	{
545 		Put();
546 	}
547 
548 	void SetTo(struct vnode* vnode)
549 	{
550 		Put();
551 		fVNode = vnode;
552 	}
553 
554 	void Put()
555 	{
556 		if (fVNode) {
557 			put_vnode(fVNode);
558 			fVNode = NULL;
559 		}
560 	}
561 
562 	struct vnode* Detach()
563 	{
564 		struct vnode* vnode = fVNode;
565 		fVNode = NULL;
566 		return vnode;
567 	}
568 
569 private:
570 	struct vnode* fVNode;
571 };
572 
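/*	Typical VNodePutter use within this file (illustrative sketch): it is a
	RAII guard that pairs a successful vnode lookup with put_vnode() on every
	exit path:

		struct vnode* vnode;
		status_t status = get_vnode(mountID, vnodeID, &vnode, true, false);
		if (status != B_OK)
			return status;
		VNodePutter putter(vnode);

		// ... use vnode; on success the reference can be kept alive via
		// putter.Detach(), otherwise ~VNodePutter() puts it automatically
*/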
573 
574 class FDCloser {
575 public:
576 	FDCloser() : fFD(-1), fKernel(true) {}
577 
578 	FDCloser(int fd, bool kernel) : fFD(fd), fKernel(kernel) {}
579 
580 	~FDCloser()
581 	{
582 		Close();
583 	}
584 
585 	void SetTo(int fd, bool kernel)
586 	{
587 		Close();
588 		fFD = fd;
589 		fKernel = kernel;
590 	}
591 
592 	void Close()
593 	{
594 		if (fFD >= 0) {
595 			if (fKernel)
596 				_kern_close(fFD);
597 			else
598 				_user_close(fFD);
599 			fFD = -1;
600 		}
601 	}
602 
603 	int Detach()
604 	{
605 		int fd = fFD;
606 		fFD = -1;
607 		return fd;
608 	}
609 
610 private:
611 	int		fFD;
612 	bool	fKernel;
613 };
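
/*	FDCloser works the same way for file descriptors (illustrative sketch):

		FDCloser fdCloser(fd, kernel);
		// ... operations that may fail and return early ...
		return fdCloser.Detach();
			// success: hand the still-open fd to the caller

	On an early return the destructor closes the descriptor via _kern_close()
	or _user_close(), depending on whether it is a kernel fd.
*/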
614 
615 } // namespace
616 
617 
618 #if VFS_PAGES_IO_TRACING
619 
620 namespace VFSPagesIOTracing {
621 
622 class PagesIOTraceEntry : public AbstractTraceEntry {
623 protected:
624 	PagesIOTraceEntry(struct vnode* vnode, void* cookie, off_t pos,
625 		const generic_io_vec* vecs, uint32 count, uint32 flags,
626 		generic_size_t bytesRequested, status_t status,
627 		generic_size_t bytesTransferred)
628 		:
629 		fVnode(vnode),
630 		fMountID(vnode->mount->id),
631 		fNodeID(vnode->id),
632 		fCookie(cookie),
633 		fPos(pos),
634 		fCount(count),
635 		fFlags(flags),
636 		fBytesRequested(bytesRequested),
637 		fStatus(status),
638 		fBytesTransferred(bytesTransferred)
639 	{
640 		fVecs = (generic_io_vec*)alloc_tracing_buffer_memcpy(vecs,
641 			sizeof(generic_io_vec) * count, false);
642 	}
643 
644 	void AddDump(TraceOutput& out, const char* mode)
645 	{
646 		out.Print("vfs pages io %5s: vnode: %p (%" B_PRId32 ", %" B_PRId64 "), "
647 			"cookie: %p, pos: %" B_PRIdOFF ", size: %" B_PRIu64 ", vecs: {",
648 			mode, fVnode, fMountID, fNodeID, fCookie, fPos,
649 			(uint64)fBytesRequested);
650 
651 		if (fVecs != NULL) {
652 			for (uint32 i = 0; i < fCount; i++) {
653 				if (i > 0)
654 					out.Print(", ");
655 				out.Print("(%" B_PRIx64 ", %" B_PRIu64 ")", (uint64)fVecs[i].base,
656 					(uint64)fVecs[i].length);
657 			}
658 		}
659 
660 		out.Print("}, flags: %#" B_PRIx32 " -> status: %#" B_PRIx32 ", "
661 			"transferred: %" B_PRIu64, fFlags, fStatus,
662 			(uint64)fBytesTransferred);
663 	}
664 
665 protected:
666 	struct vnode*	fVnode;
667 	dev_t			fMountID;
668 	ino_t			fNodeID;
669 	void*			fCookie;
670 	off_t			fPos;
671 	generic_io_vec*	fVecs;
672 	uint32			fCount;
673 	uint32			fFlags;
674 	generic_size_t	fBytesRequested;
675 	status_t		fStatus;
676 	generic_size_t	fBytesTransferred;
677 };
678 
679 
680 class ReadPages : public PagesIOTraceEntry {
681 public:
682 	ReadPages(struct vnode* vnode, void* cookie, off_t pos,
683 		const generic_io_vec* vecs, uint32 count, uint32 flags,
684 		generic_size_t bytesRequested, status_t status,
685 		generic_size_t bytesTransferred)
686 		:
687 		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
688 			bytesRequested, status, bytesTransferred)
689 	{
690 		Initialized();
691 	}
692 
693 	virtual void AddDump(TraceOutput& out)
694 	{
695 		PagesIOTraceEntry::AddDump(out, "read");
696 	}
697 };
698 
699 
700 class WritePages : public PagesIOTraceEntry {
701 public:
702 	WritePages(struct vnode* vnode, void* cookie, off_t pos,
703 		const generic_io_vec* vecs, uint32 count, uint32 flags,
704 		generic_size_t bytesRequested, status_t status,
705 		generic_size_t bytesTransferred)
706 		:
707 		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
708 			bytesRequested, status, bytesTransferred)
709 	{
710 		Initialized();
711 	}
712 
713 	virtual void AddDump(TraceOutput& out)
714 	{
715 		PagesIOTraceEntry::AddDump(out, "write");
716 	}
717 };
718 
719 }	// namespace VFSPagesIOTracing
720 
721 #	define TPIO(x) new(std::nothrow) VFSPagesIOTracing::x;
722 #else
723 #	define TPIO(x) ;
724 #endif	// VFS_PAGES_IO_TRACING
725 
726 
727 /*! Finds the mounted device (the fs_mount structure) with the given ID.
728 	Note, you must hold the sMountMutex lock when you call this function.
729 */
730 static struct fs_mount*
731 find_mount(dev_t id)
732 {
733 	ASSERT_LOCKED_MUTEX(&sMountMutex);
734 
735 	return sMountsTable->Lookup(id);
736 }
737 
738 
739 static status_t
740 get_mount(dev_t id, struct fs_mount** _mount)
741 {
742 	struct fs_mount* mount;
743 
744 	ReadLocker nodeLocker(sVnodeLock);
745 	MutexLocker mountLocker(sMountMutex);
746 
747 	mount = find_mount(id);
748 	if (mount == NULL)
749 		return B_BAD_VALUE;
750 
751 	struct vnode* rootNode = mount->root_vnode;
752 	if (mount->unmounting || rootNode == NULL || rootNode->IsBusy()
753 		|| rootNode->ref_count == 0) {
754 		// might have been called during a mount/unmount operation
755 		return B_BUSY;
756 	}
757 
758 	inc_vnode_ref_count(rootNode);
759 	*_mount = mount;
760 	return B_OK;
761 }
762 
763 
764 static void
765 put_mount(struct fs_mount* mount)
766 {
767 	if (mount)
768 		put_vnode(mount->root_vnode);
769 }
770 
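/*	get_mount()/put_mount() form a bracket pair (illustrative sketch): the
	mount is kept alive by the reference to its root vnode that get_mount()
	acquires:

		struct fs_mount* mount;
		status_t status = get_mount(id, &mount);
		if (status != B_OK)
			return status;
		// ... safely use mount ...
		put_mount(mount);
*/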
771 
772 /*!	Tries to open the specified file system module.
773 	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1".
774 	Returns a pointer to the file system module interface, or NULL if it
775 	could not open the module.
776 */
777 static file_system_module_info*
778 get_file_system(const char* fsName)
779 {
780 	char name[B_FILE_NAME_LENGTH];
781 	if (strncmp(fsName, "file_systems/", strlen("file_systems/"))) {
782 		// construct module name if we didn't get one
783 		// (we currently support only one API)
784 		snprintf(name, sizeof(name), "file_systems/%s/v1", fsName);
785 		fsName = NULL;
786 	}
787 
788 	file_system_module_info* info;
789 	if (get_module(fsName ? fsName : name, (module_info**)&info) != B_OK)
790 		return NULL;
791 
792 	return info;
793 }
794 
795 
796 /*!	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1"
797 	and returns a compatible fs_info.fsh_name name ("bfs" in both cases).
798 	The name is allocated for you, and you have to free() it when you're
799 	done with it.
800 	Returns NULL if the required memory is not available.
801 */
802 static char*
803 get_file_system_name(const char* fsName)
804 {
805 	const size_t length = strlen("file_systems/");
806 
807 	if (strncmp(fsName, "file_systems/", length)) {
808 		// no module name prefix -- the name already is a short fs name
809 		return strdup(fsName);
810 	}
811 
812 	fsName += length;
813 	const char* end = strchr(fsName, '/');
814 	if (end == NULL) {
815 		// this doesn't seem to be a valid name, but well...
816 		return strdup(fsName);
817 	}
818 
819 	// cut off the trailing /v1
820 
821 	char* name = (char*)malloc(end + 1 - fsName);
822 	if (name == NULL)
823 		return NULL;
824 
825 	strlcpy(name, fsName, end + 1 - fsName);
826 	return name;
827 }
828 
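/*	Examples (illustrative): get_file_system_name("bfs") returns a copy of
	"bfs"; get_file_system_name("file_systems/bfs/v1") strips the prefix and
	the trailing "/v1", also returning "bfs".
*/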
829 
830 /*!	Accepts a list of file system names separated by a colon, one for each
831 	layer and returns the file system name for the specified layer.
832 	The name is allocated for you, and you have to free() it when you're
833 	done with it.
834 	Returns NULL if the required memory is not available or if there is no
835 	name for the specified layer.
836 */
837 static char*
838 get_file_system_name_for_layer(const char* fsNames, int32 layer)
839 {
840 	while (layer >= 0) {
841 		const char* end = strchr(fsNames, ':');
842 		if (end == NULL) {
843 			if (layer == 0)
844 				return strdup(fsNames);
845 			return NULL;
846 		}
847 
848 		if (layer == 0) {
849 			size_t length = end - fsNames + 1;
850 			char* result = (char*)malloc(length);
			if (result == NULL)
				return NULL;
851 			strlcpy(result, fsNames, length);
852 			return result;
853 		}
854 
855 		fsNames = end + 1;
856 		layer--;
857 	}
858 
859 	return NULL;
860 }
861 
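/*	Example (illustrative): given the layered name "bfs:overlay",
	get_file_system_name_for_layer("bfs:overlay", 0) returns "bfs", layer 1
	returns "overlay", and any higher layer yields NULL.
*/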
862 
863 static void
864 add_vnode_to_mount_list(struct vnode* vnode, struct fs_mount* mount)
865 {
866 	RecursiveLocker _(mount->rlock);
867 	mount->vnodes.Add(vnode);
868 }
869 
870 
871 static void
872 remove_vnode_from_mount_list(struct vnode* vnode, struct fs_mount* mount)
873 {
874 	RecursiveLocker _(mount->rlock);
875 	mount->vnodes.Remove(vnode);
876 }
877 
878 
879 /*!	\brief Looks up a vnode by mount and node ID in the sVnodeTable.
880 
881 	The caller must hold the sVnodeLock (read lock at least).
882 
883 	\param mountID the mount ID.
884 	\param vnodeID the node ID.
885 
886 	\return The vnode structure, if it was found in the hash table, \c NULL
887 			otherwise.
888 */
889 static struct vnode*
890 lookup_vnode(dev_t mountID, ino_t vnodeID)
891 {
892 	struct vnode_hash_key key;
893 
894 	key.device = mountID;
895 	key.vnode = vnodeID;
896 
897 	return sVnodeTable->Lookup(key);
898 }
899 
900 
901 /*!	\brief Checks whether or not a busy vnode should be waited for (again).
902 
903 	This will also wait for BUSY_VNODE_DELAY before returning if one should
904 	still wait for the vnode to become unbusy.
905 
906 	\return \c true if one should retry, \c false if not.
907 */
908 static bool
909 retry_busy_vnode(int32& tries, dev_t mountID, ino_t vnodeID)
910 {
911 	if (--tries < 0) {
912 		// vnode doesn't seem to become unbusy
913 		dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO
914 			" is not becoming unbusy!\n", mountID, vnodeID);
915 		return false;
916 	}
917 	snooze(BUSY_VNODE_DELAY);
918 	return true;
919 }
920 
921 
922 /*!	Creates a new vnode with the given mount and node ID.
923 	If the node already exists, it is returned instead and no new node is
924 	created. In either case -- but not if an error occurs -- the function write
925 	locks \c sVnodeLock and keeps it locked for the caller when returning. On
926 	error the lock is not held on return.
927 
928 	\param mountID The mount ID.
929 	\param vnodeID The vnode ID.
930 	\param _vnode Will be set to the new vnode on success.
931 	\param _nodeCreated Will be set to \c true when the returned vnode has
932 		been newly created, \c false when it already existed. Will not be
933 		changed on error.
934 	\return \c B_OK, when the vnode was successfully created and inserted or
935 		a node with the given ID was found, \c B_NO_MEMORY or
936 		\c B_ENTRY_NOT_FOUND on error.
937 */
938 static status_t
939 create_new_vnode_and_lock(dev_t mountID, ino_t vnodeID, struct vnode*& _vnode,
940 	bool& _nodeCreated)
941 {
942 	FUNCTION(("create_new_vnode_and_lock()\n"));
943 
944 	struct vnode* vnode = (struct vnode*)malloc(sizeof(struct vnode));
945 	if (vnode == NULL)
946 		return B_NO_MEMORY;
947 
948 	// initialize basic values
949 	memset(vnode, 0, sizeof(struct vnode));
950 	vnode->device = mountID;
951 	vnode->id = vnodeID;
952 	vnode->ref_count = 1;
953 	vnode->SetBusy(true);
954 
955 	// look up the node -- it might have been added by someone else in the
956 	// meantime
957 	rw_lock_write_lock(&sVnodeLock);
958 	struct vnode* existingVnode = lookup_vnode(mountID, vnodeID);
959 	if (existingVnode != NULL) {
960 		free(vnode);
961 		_vnode = existingVnode;
962 		_nodeCreated = false;
963 		return B_OK;
964 	}
965 
966 	// get the mount structure
967 	mutex_lock(&sMountMutex);
968 	vnode->mount = find_mount(mountID);
969 	if (!vnode->mount || vnode->mount->unmounting) {
970 		mutex_unlock(&sMountMutex);
971 		rw_lock_write_unlock(&sVnodeLock);
972 		free(vnode);
973 		return B_ENTRY_NOT_FOUND;
974 	}
975 
976 	// add the vnode to the mount's node list and the hash table
977 	sVnodeTable->Insert(vnode);
978 	add_vnode_to_mount_list(vnode, vnode->mount);
979 
980 	mutex_unlock(&sMountMutex);
981 
982 	_vnode = vnode;
983 	_nodeCreated = true;
984 
985 	// keep the vnode lock locked
986 	return B_OK;
987 }
988 
989 
990 /*!	Frees the vnode and all resources it has acquired, and removes
991 	it from the vnode hash as well as from its mount structure.
992 	Will also make sure that any cache modifications are written back.
993 */
994 static void
995 free_vnode(struct vnode* vnode, bool reenter)
996 {
997 	ASSERT_PRINT(vnode->ref_count == 0 && vnode->IsBusy(), "vnode: %p\n",
998 		vnode);
999 
1000 	// write back any changes in this vnode's cache -- but only
1001 	// if the vnode won't be deleted, in which case the changes
1002 	// will be discarded
1003 
1004 	if (!vnode->IsRemoved() && HAS_FS_CALL(vnode, fsync))
1005 		FS_CALL_NO_PARAMS(vnode, fsync);
1006 
1007 	// Note: If this vnode has a cache attached, there will still be two
1008 	// references to that cache at this point. The last one belongs to the vnode
1009 	// itself (cf. vfs_get_vnode_cache()) and one belongs to the node's file
1010 	// cache. Each but the last reference to a cache also includes a reference
1011 	// to the vnode. The file cache, however, released its reference (cf.
1012 	// file_cache_create()), so that this vnode's ref count has the chance to
1013 	// ever drop to 0. Deleting the file cache now will cause the next-to-last
1014 	// cache reference to be released, which will also release a (no longer
1015 	// existing) vnode reference. To avoid problems, we set the vnode's ref
1016 	// count, so that it will neither become negative nor 0.
1017 	vnode->ref_count = 2;
1018 
1019 	if (!vnode->IsUnpublished()) {
1020 		if (vnode->IsRemoved())
1021 			FS_CALL(vnode, remove_vnode, reenter);
1022 		else
1023 			FS_CALL(vnode, put_vnode, reenter);
1024 	}
1025 
1026 	// If the vnode has a VMCache attached, make sure that it won't try to get
1027 	// another reference via VMVnodeCache::AcquireUnreferencedStoreRef(). As
1028 	// long as the vnode is busy and in the hash, that won't happen, but as
1029 	// soon as we've removed it from the hash, it could reload the vnode -- with
1030 	// a new cache attached!
1031 	if (vnode->cache != NULL)
1032 		((VMVnodeCache*)vnode->cache)->VnodeDeleted();
1033 
1034 	// The file system has removed the resources of the vnode now, so we can
1035 	// make it available again (by removing the busy vnode from the hash).
1036 	rw_lock_write_lock(&sVnodeLock);
1037 	sVnodeTable->Remove(vnode);
1038 	rw_lock_write_unlock(&sVnodeLock);
1039 
1040 	// if we have a VMCache attached, remove it
1041 	if (vnode->cache)
1042 		vnode->cache->ReleaseRef();
1043 
1044 	vnode->cache = NULL;
1045 
1046 	remove_vnode_from_mount_list(vnode, vnode->mount);
1047 
1048 	free(vnode);
1049 }
1050 
1051 
1052 /*!	\brief Decrements the reference counter of the given vnode and deletes it,
1053 	if the counter dropped to 0.
1054 
1055 	The caller must, of course, own a reference to the vnode to call this
1056 	function.
1057 	The caller must not hold the sVnodeLock or the sMountMutex.
1058 
1059 	\param vnode the vnode.
1060 	\param alwaysFree don't move this vnode into the unused list, but really
1061 		   delete it if possible.
1062 	\param reenter \c true, if this function is called (indirectly) from within
1063 		   a file system. This will be passed to file system hooks only.
1064 	\return \c B_OK, if everything went fine, an error code otherwise.
1065 */
1066 static status_t
1067 dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree, bool reenter)
1068 {
1069 	ReadLocker locker(sVnodeLock);
1070 	AutoLocker<Vnode> nodeLocker(vnode);
1071 
1072 	int32 oldRefCount = atomic_add(&vnode->ref_count, -1);
1073 
1074 	ASSERT_PRINT(oldRefCount > 0, "vnode %p\n", vnode);
1075 
1076 	TRACE(("dec_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
1077 		vnode->ref_count));
1078 
1079 	if (oldRefCount != 1)
1080 		return B_OK;
1081 
1082 	if (vnode->IsBusy())
1083 		panic("dec_vnode_ref_count: called on busy vnode %p\n", vnode);
1084 
1085 	bool freeNode = false;
1086 	bool freeUnusedNodes = false;
1087 
1088 	// Just insert the vnode into an unused list if we don't need
1089 	// to delete it
1090 	if (vnode->IsRemoved() || alwaysFree) {
1091 		vnode_to_be_freed(vnode);
1092 		vnode->SetBusy(true);
1093 		freeNode = true;
1094 	} else
1095 		freeUnusedNodes = vnode_unused(vnode);
1096 
1097 	nodeLocker.Unlock();
1098 	locker.Unlock();
1099 
1100 	if (freeNode)
1101 		free_vnode(vnode, reenter);
1102 	else if (freeUnusedNodes)
1103 		free_unused_vnodes();
1104 
1105 	return B_OK;
1106 }
1107 
1108 
1109 /*!	\brief Increments the reference counter of the given vnode.
1110 
1111 	The caller must make sure that the node isn't deleted while this function
1112 	is called. This can be done either:
1113 	- by ensuring that a reference to the node exists and remains in existence,
1114 	  or
1115 	- by holding the vnode's lock (which also requires read locking sVnodeLock)
1116 	  or by holding sVnodeLock write locked.
1117 
1118 	In the second case the caller is responsible for dealing with the ref count
1119 	0 -> 1 transition. That is: 1. this function must not be invoked when the
1120 	node is busy in the first place and 2. vnode_used() must be called for the
1121 	node.
1122 
1123 	\param vnode the vnode.
1124 */
1125 static void
1126 inc_vnode_ref_count(struct vnode* vnode)
1127 {
1128 	atomic_add(&vnode->ref_count, 1);
1129 	TRACE(("inc_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
1130 		vnode->ref_count));
1131 }
1132 
1133 
1134 static bool
1135 is_special_node_type(int type)
1136 {
1137 	// at the moment only FIFOs are supported
1138 	return S_ISFIFO(type);
1139 }
1140 
1141 
1142 static status_t
1143 create_special_sub_node(struct vnode* vnode, uint32 flags)
1144 {
1145 	if (S_ISFIFO(vnode->Type()))
1146 		return create_fifo_vnode(vnode->mount->volume, vnode);
1147 
1148 	return B_BAD_VALUE;
1149 }
1150 
1151 
1152 /*!	\brief Retrieves a vnode for a given mount ID, node ID pair.
1153 
1154 	If the node is not yet in memory, it will be loaded.
1155 
1156 	The caller must not hold the sVnodeLock or the sMountMutex.
1157 
1158 	\param mountID the mount ID.
1159 	\param vnodeID the node ID.
1160 	\param _vnode Pointer to a vnode* variable into which the pointer to the
1161 		   retrieved vnode structure shall be written.
	\param canWait \c true, if the function may wait for a busy vnode to
		   become unbusy; if \c false, \c B_BUSY is returned immediately.
1162 	\param reenter \c true, if this function is called (indirectly) from within
1163 		   a file system.
1164 	\return \c B_OK, if everything went fine, an error code otherwise.
1165 */
1166 static status_t
1167 get_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode, bool canWait,
1168 	int reenter)
1169 {
1170 	FUNCTION(("get_vnode: mountid %" B_PRId32 " vnid 0x%" B_PRIx64 " %p\n",
1171 		mountID, vnodeID, _vnode));
1172 
1173 	rw_lock_read_lock(&sVnodeLock);
1174 
1175 	int32 tries = BUSY_VNODE_RETRIES;
1176 restart:
1177 	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
1178 	AutoLocker<Vnode> nodeLocker(vnode);
1179 
1180 	if (vnode && vnode->IsBusy()) {
1181 		nodeLocker.Unlock();
1182 		rw_lock_read_unlock(&sVnodeLock);
1183 		if (!canWait) {
1184 			dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO " is busy!\n",
1185 				mountID, vnodeID);
1186 			return B_BUSY;
1187 		}
1188 		if (!retry_busy_vnode(tries, mountID, vnodeID))
1189 			return B_BUSY;
1190 
1191 		rw_lock_read_lock(&sVnodeLock);
1192 		goto restart;
1193 	}
1194 
1195 	TRACE(("get_vnode: tried to lookup vnode, got %p\n", vnode));
1196 
1197 	status_t status;
1198 
1199 	if (vnode) {
1200 		if (vnode->ref_count == 0) {
1201 			// this vnode has been unused before
1202 			vnode_used(vnode);
1203 		}
1204 		inc_vnode_ref_count(vnode);
1205 
1206 		nodeLocker.Unlock();
1207 		rw_lock_read_unlock(&sVnodeLock);
1208 	} else {
1209 		// we need to create a new vnode and read it in
1210 		rw_lock_read_unlock(&sVnodeLock);
1211 			// unlock -- create_new_vnode_and_lock() write-locks on success
1212 		bool nodeCreated;
1213 		status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
1214 			nodeCreated);
1215 		if (status != B_OK)
1216 			return status;
1217 
1218 		if (!nodeCreated) {
1219 			rw_lock_read_lock(&sVnodeLock);
1220 			rw_lock_write_unlock(&sVnodeLock);
1221 			goto restart;
1222 		}
1223 
1224 		rw_lock_write_unlock(&sVnodeLock);
1225 
1226 		int type;
1227 		uint32 flags;
1228 		status = FS_MOUNT_CALL(vnode->mount, get_vnode, vnodeID, vnode, &type,
1229 			&flags, reenter);
1230 		if (status == B_OK && vnode->private_node == NULL)
1231 			status = B_BAD_VALUE;
1232 
1233 		bool gotNode = status == B_OK;
1234 		bool publishSpecialSubNode = false;
1235 		if (gotNode) {
1236 			vnode->SetType(type);
1237 			publishSpecialSubNode = is_special_node_type(type)
1238 				&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
1239 		}
1240 
1241 		if (gotNode && publishSpecialSubNode)
1242 			status = create_special_sub_node(vnode, flags);
1243 
1244 		if (status != B_OK) {
1245 			if (gotNode)
1246 				FS_CALL(vnode, put_vnode, reenter);
1247 
1248 			rw_lock_write_lock(&sVnodeLock);
1249 			sVnodeTable->Remove(vnode);
1250 			remove_vnode_from_mount_list(vnode, vnode->mount);
1251 			rw_lock_write_unlock(&sVnodeLock);
1252 
1253 			free(vnode);
1254 			return status;
1255 		}
1256 
1257 		rw_lock_read_lock(&sVnodeLock);
1258 		vnode->Lock();
1259 
1260 		vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
1261 		vnode->SetBusy(false);
1262 
1263 		vnode->Unlock();
1264 		rw_lock_read_unlock(&sVnodeLock);
1265 	}
1266 
1267 	TRACE(("get_vnode: returning %p\n", vnode));
1268 
1269 	*_vnode = vnode;
1270 	return B_OK;
1271 }
1272 
1273 
1274 /*!	\brief Decrements the reference counter of the given vnode and deletes it,
1275 	if the counter dropped to 0.
1276 
1277 	The caller must, of course, own a reference to the vnode to call this
1278 	function.
1279 	The caller must not hold the sVnodeLock or the sMountMutex.
1280 
1281 	\param vnode the vnode.
1282 */
1283 static inline void
1284 put_vnode(struct vnode* vnode)
1285 {
1286 	dec_vnode_ref_count(vnode, false, false);
1287 }
1288 
1289 
1290 static void
1291 free_unused_vnodes(int32 level)
1292 {
1293 	unused_vnodes_check_started();
1294 
1295 	if (level == B_NO_LOW_RESOURCE) {
1296 		unused_vnodes_check_done();
1297 		return;
1298 	}
1299 
1300 	flush_hot_vnodes();
1301 
1302 	// determine how many nodes to free
1303 	uint32 count = 1;
1304 	{
1305 		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
1306 
1307 		switch (level) {
1308 			case B_LOW_RESOURCE_NOTE:
1309 				count = sUnusedVnodes / 100;
1310 				break;
1311 			case B_LOW_RESOURCE_WARNING:
1312 				count = sUnusedVnodes / 10;
1313 				break;
1314 			case B_LOW_RESOURCE_CRITICAL:
1315 				count = sUnusedVnodes;
1316 				break;
1317 		}
1318 
1319 		if (count > sUnusedVnodes)
1320 			count = sUnusedVnodes;
1321 	}
1322 
1323 	// Write back the modified pages of some unused vnodes and free them.
1324 
1325 	for (uint32 i = 0; i < count; i++) {
1326 		ReadLocker vnodesReadLocker(sVnodeLock);
1327 
1328 		// get the first node
1329 		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
1330 		struct vnode* vnode = (struct vnode*)list_get_first_item(
1331 			&sUnusedVnodeList);
1332 		unusedVnodesLocker.Unlock();
1333 
1334 		if (vnode == NULL)
1335 			break;
1336 
1337 		// lock the node
1338 		AutoLocker<Vnode> nodeLocker(vnode);
1339 
1340 		// Check whether the node is still unused -- since we only append to the
1341 		// tail of the unused queue, the vnode should still be at its head.
1342 		// Alternatively we could check its ref count for 0 and its busy flag,
1343 		// but if the node is no longer at the head of the queue, it means it
1344 		// has been touched in the meantime, i.e. it is no longer the least
1345 		// recently used unused vnode, so we'd rather not free it.
1346 		unusedVnodesLocker.Lock();
1347 		if (vnode != list_get_first_item(&sUnusedVnodeList))
1348 			continue;
1349 		unusedVnodesLocker.Unlock();
1350 
1351 		ASSERT(!vnode->IsBusy());
1352 
1353 		// grab a reference
1354 		inc_vnode_ref_count(vnode);
1355 		vnode_used(vnode);
1356 
1357 		// write back changes and free the node
1358 		nodeLocker.Unlock();
1359 		vnodesReadLocker.Unlock();
1360 
1361 		if (vnode->cache != NULL)
1362 			vnode->cache->WriteModified();
1363 
1364 		dec_vnode_ref_count(vnode, true, false);
1365 			// this should free the vnode when it's still unused
1366 	}
1367 
1368 	unused_vnodes_check_done();
1369 }
1370 
1371 
1372 /*!	Gets the vnode the given vnode is covering.
1373 
1374 	The caller must have \c sVnodeLock read-locked at least.
1375 
1376 	The function returns a reference to the retrieved vnode (if any), which
1377 	the caller is responsible for releasing.
1378 
1379 	\param vnode The vnode whose covered node shall be returned.
1380 	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
1381 		vnode.
1382 */
1383 static inline Vnode*
1384 get_covered_vnode_locked(Vnode* vnode)
1385 {
1386 	if (Vnode* coveredNode = vnode->covers) {
1387 		while (coveredNode->covers != NULL)
1388 			coveredNode = coveredNode->covers;
1389 
1390 		inc_vnode_ref_count(coveredNode);
1391 		return coveredNode;
1392 	}
1393 
1394 	return NULL;
1395 }
1396 
1397 
1398 /*!	Gets the vnode the given vnode is covering.
1399 
1400 	The caller must not hold \c sVnodeLock. Note that this implies a race
1401 	condition, since the situation can change at any time.
1402 
1403 	The function returns a reference to the retrieved vnode (if any), which
1404 	the caller is responsible for releasing.
1405 
1406 	\param vnode The vnode whose covered node shall be returned.
1407 	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
1408 		vnode.
1409 */
1410 static inline Vnode*
1411 get_covered_vnode(Vnode* vnode)
1412 {
1413 	if (!vnode->IsCovering())
1414 		return NULL;
1415 
1416 	ReadLocker vnodeReadLocker(sVnodeLock);
1417 	return get_covered_vnode_locked(vnode);
1418 }
1419 
1420 
1421 /*!	Gets the vnode the given vnode is covered by.
1422 
1423 	The caller must have \c sVnodeLock read-locked at least.
1424 
1425 	The function returns a reference to the retrieved vnode (if any), which
1426 	the caller is responsible for releasing.
1427 
1428 	\param vnode The vnode whose covering node shall be returned.
1429 	\return The covering vnode, or \c NULL if the given vnode isn't covered by
1430 		any vnode.
1431 */
1432 static Vnode*
1433 get_covering_vnode_locked(Vnode* vnode)
1434 {
1435 	if (Vnode* coveringNode = vnode->covered_by) {
1436 		while (coveringNode->covered_by != NULL)
1437 			coveringNode = coveringNode->covered_by;
1438 
1439 		inc_vnode_ref_count(coveringNode);
1440 		return coveringNode;
1441 	}
1442 
1443 	return NULL;
1444 }
1445 
1446 
1447 /*!	Gets the vnode the given vnode is covered by.
1448 
1449 	The caller must not hold \c sVnodeLock. Note that this implies a race
1450 	condition, since the situation can change at any time.
1451 
1452 	The function returns a reference to the retrieved vnode (if any), which
1453 	the caller is responsible for releasing.
1454 
1455 	\param vnode The vnode whose covering node shall be returned.
1456 	\return The covering vnode, or \c NULL if the given vnode isn't covered by
1457 		any vnode.
1458 */
1459 static inline Vnode*
1460 get_covering_vnode(Vnode* vnode)
1461 {
1462 	if (!vnode->IsCovered())
1463 		return NULL;
1464 
1465 	ReadLocker vnodeReadLocker(sVnodeLock);
1466 	return get_covering_vnode_locked(vnode);
1467 }
1468 
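/*	Illustrative picture of the covers/covered_by chains resolved above: when
	a volume is mounted on a directory, the mount's root vnode covers the
	directory vnode, and the directory is covered_by that root. With stacked
	mounts this forms a chain, which the helpers above walk to its ends:
	get_covered_vnode() descends to the bottommost (covered) node, while
	get_covering_vnode() ascends to the topmost (covering) node.
*/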
1469 
1470 static void
1471 free_unused_vnodes()
1472 {
1473 	free_unused_vnodes(
1474 		low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1475 			| B_KERNEL_RESOURCE_ADDRESS_SPACE));
1476 }
1477 
1478 
1479 static void
1480 vnode_low_resource_handler(void* /*data*/, uint32 resources, int32 level)
1481 {
1482 	TRACE(("vnode_low_resource_handler(level = %" B_PRId32 ")\n", level));
1483 
1484 	free_unused_vnodes(level);
1485 }
1486 
1487 
1488 static inline void
1489 put_advisory_locking(struct advisory_locking* locking)
1490 {
1491 	release_sem(locking->lock);
1492 }
1493 
1494 
1495 /*!	Returns the advisory_locking object of the \a vnode in case it
1496 	has one, and locks it.
1497 	You have to call put_advisory_locking() when you're done with
1498 	it.
1499 	Note, you must not have the vnode mutex locked when calling
1500 	this function.
1501 */
1502 static struct advisory_locking*
1503 get_advisory_locking(struct vnode* vnode)
1504 {
1505 	rw_lock_read_lock(&sVnodeLock);
1506 	vnode->Lock();
1507 
1508 	struct advisory_locking* locking = vnode->advisory_locking;
1509 	sem_id lock = locking != NULL ? locking->lock : B_ERROR;
1510 
1511 	vnode->Unlock();
1512 	rw_lock_read_unlock(&sVnodeLock);
1513 
1514 	if (lock >= 0)
1515 		lock = acquire_sem(lock);
1516 	if (lock < 0) {
1517 		// This means the locking has been deleted in the meantime
1518 		// or had never existed in the first place - otherwise, we
1519 		// would get the lock at some point.
1520 		return NULL;
1521 	}
1522 
1523 	return locking;
1524 }
1525 
1526 
1527 /*!	Creates a locked advisory_locking object, and attaches it to the
1528 	given \a vnode.
1529 	Returns B_OK in case of success - also if the vnode got such an
1530 	object from someone else in the meantime, you'll still get this
1531 	one locked then.
1532 */
1533 static status_t
1534 create_advisory_locking(struct vnode* vnode)
1535 {
1536 	if (vnode == NULL)
1537 		return B_FILE_ERROR;
1538 
1539 	ObjectDeleter<advisory_locking> lockingDeleter;
1540 	struct advisory_locking* locking = NULL;
1541 
1542 	while (get_advisory_locking(vnode) == NULL) {
1543 		// no locking object set on the vnode yet, create one
1544 		if (locking == NULL) {
1545 			locking = new(std::nothrow) advisory_locking;
1546 			if (locking == NULL)
1547 				return B_NO_MEMORY;
1548 			lockingDeleter.SetTo(locking);
1549 
1550 			locking->wait_sem = create_sem(0, "advisory lock");
1551 			if (locking->wait_sem < 0)
1552 				return locking->wait_sem;
1553 
1554 			locking->lock = create_sem(0, "advisory locking");
1555 			if (locking->lock < 0)
1556 				return locking->lock;
1557 		}
1558 
1559 		// set our newly created locking object
1560 		ReadLocker _(sVnodeLock);
1561 		AutoLocker<Vnode> nodeLocker(vnode);
1562 		if (vnode->advisory_locking == NULL) {
1563 			vnode->advisory_locking = locking;
1564 			lockingDeleter.Detach();
1565 			return B_OK;
1566 		}
1567 	}
1568 
1569 	// The vnode already had a locking object. That's just as well.
1570 
1571 	return B_OK;
1572 }
1573 
1574 
1575 /*! Returns \c true when either \a flock is \c NULL or \a flock intersects
1576 	with the advisory_lock \a lock.
1577 */
1578 static bool
1579 advisory_lock_intersects(struct advisory_lock* lock, struct flock* flock)
1580 {
1581 	if (flock == NULL)
1582 		return true;
1583 
1584 	return lock->start <= flock->l_start - 1 + flock->l_len
1585 		&& lock->end >= flock->l_start;
1586 }
1587 
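/*	Worked example (illustrative): for a lock covering bytes [10, 19] and a
	flock with l_start = 15, l_len = 10 (bytes [15, 24]), the test above is
	10 <= 24 && 19 >= 15, so the ranges intersect. A flock starting at 20
	would not intersect, since 19 >= 20 fails.
*/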
1588 
1589 /*!	Tests whether acquiring a lock would block.
1590 */
1591 static status_t
1592 test_advisory_lock(struct vnode* vnode, struct flock* flock)
1593 {
1594 	flock->l_type = F_UNLCK;
1595 
1596 	struct advisory_locking* locking = get_advisory_locking(vnode);
1597 	if (locking == NULL)
1598 		return B_OK;
1599 
1600 	team_id team = team_get_current_team_id();
1601 
1602 	LockList::Iterator iterator = locking->locks.GetIterator();
1603 	while (iterator.HasNext()) {
1604 		struct advisory_lock* lock = iterator.Next();
1605 
1606 		 if (lock->team != team && advisory_lock_intersects(lock, flock)) {
1607 			// locks do overlap
1608 			if (flock->l_type != F_RDLCK || !lock->shared) {
1609 				// collision
1610 				flock->l_type = lock->shared ? F_RDLCK : F_WRLCK;
1611 				flock->l_whence = SEEK_SET;
1612 				flock->l_start = lock->start;
1613 				flock->l_len = lock->end - lock->start + 1;
1614 				flock->l_pid = lock->team;
1615 				break;
1616 			}
1617 		}
1618 	}
1619 
1620 	put_advisory_locking(locking);
1621 	return B_OK;
1622 }
1623 
1624 
1625 /*!	Removes the specified lock, or all locks of the calling team
1626 	if \a flock is NULL.
1627 */
1628 static status_t
1629 release_advisory_lock(struct vnode* vnode, struct flock* flock)
1630 {
1631 	FUNCTION(("release_advisory_lock(vnode = %p, flock = %p)\n", vnode, flock));
1632 
1633 	struct advisory_locking* locking = get_advisory_locking(vnode);
1634 	if (locking == NULL)
1635 		return B_OK;
1636 
1637 	// TODO: use the thread ID instead??
1638 	team_id team = team_get_current_team_id();
1639 	pid_t session = thread_get_current_thread()->team->session_id;
1640 
1641 	// find matching lock entries
1642 
1643 	LockList::Iterator iterator = locking->locks.GetIterator();
1644 	while (iterator.HasNext()) {
1645 		struct advisory_lock* lock = iterator.Next();
1646 		bool removeLock = false;
1647 
1648 		if (lock->session == session)
1649 			removeLock = true;
1650 		else if (lock->team == team && advisory_lock_intersects(lock, flock)) {
1651 			bool endsBeyond = false;
1652 			bool startsBefore = false;
1653 			if (flock != NULL) {
1654 				startsBefore = lock->start < flock->l_start;
1655 				endsBeyond = lock->end > flock->l_start - 1 + flock->l_len;
1656 			}
1657 
1658 			if (!startsBefore && !endsBeyond) {
1659 				// lock is completely contained in flock
1660 				removeLock = true;
1661 			} else if (startsBefore && !endsBeyond) {
1662 				// cut the end of the lock
1663 				lock->end = flock->l_start - 1;
1664 			} else if (!startsBefore && endsBeyond) {
1665 				// cut the start of the lock
1666 				lock->start = flock->l_start + flock->l_len;
1667 			} else {
1668 				// divide the lock into two locks
1669 				struct advisory_lock* secondLock = new advisory_lock;
1670 				if (secondLock == NULL) {
1671 					// TODO: we should probably revert the locks we already
1672 					// changed... (ie. allocate upfront)
1673 					put_advisory_locking(locking);
1674 					return B_NO_MEMORY;
1675 				}
1676 
1677 				secondLock->team = lock->team;
1678 				secondLock->session = lock->session;
1679 				// values must already be normalized when getting here
1680 				secondLock->start = flock->l_start + flock->l_len;
1681 				secondLock->end = lock->end;
1682 				secondLock->shared = lock->shared;
1683 
1684 				lock->end = flock->l_start - 1;
					// truncate the original lock only after secondLock has
					// taken over its old end above
1685 
1686 				locking->locks.Add(secondLock);
1687 			}
1688 		}
1689 
1690 		if (removeLock) {
1691 			// this lock is no longer used
1692 			iterator.Remove();
1693 			free(lock);
1694 		}
1695 	}
1696 
1697 	bool removeLocking = locking->locks.IsEmpty();
1698 	release_sem_etc(locking->wait_sem, 1, B_RELEASE_ALL);
1699 
1700 	put_advisory_locking(locking);
1701 
1702 	if (removeLocking) {
1703 		// We can remove the whole advisory locking structure; it's no
1704 		// longer used
1705 		locking = get_advisory_locking(vnode);
1706 		if (locking != NULL) {
1707 			ReadLocker locker(sVnodeLock);
1708 			AutoLocker<Vnode> nodeLocker(vnode);
1709 
1710 			// the locking could have been changed in the meantime
1711 			if (locking->locks.IsEmpty()) {
1712 				vnode->advisory_locking = NULL;
1713 				nodeLocker.Unlock();
1714 				locker.Unlock();
1715 
1716 				// we've detached the locking from the vnode, so we can
1717 				// safely delete it
1718 				delete locking;
1719 			} else {
1720 				// the locking is in use again
1721 				nodeLocker.Unlock();
1722 				locker.Unlock();
1723 				release_sem_etc(locking->lock, 1, B_DO_NOT_RESCHEDULE);
1724 			}
1725 		}
1726 	}
1727 
1728 	return B_OK;
1729 }
1730 
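/*	Worked example of the splitting logic above (illustrative): releasing
	bytes [40, 59] from a held lock covering [0, 99] hits the startsBefore &&
	endsBeyond case, so the original lock is truncated to [0, 39] and a new
	secondLock is created for [60, 99]. Releasing [0, 49] instead only cuts
	the start, leaving [50, 99].
*/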
1731 
1732 /*!	Acquires an advisory lock for the \a vnode. If \a wait is \c true, it
1733 	will wait for the lock to become available if there are any collisions
1734 	(with \a wait \c false, B_WOULD_BLOCK or B_PERMISSION_DENIED is returned).
1735 
1736 	If \a session is -1, POSIX semantics are used for this lock. Otherwise,
1737 	BSD flock() semantics are used, that is, all children can unlock the file
1738 	in question (we even allow parents to remove the lock, though that
1739 	seems to be in line with what the BSDs are doing).
1740 */
1741 static status_t
1742 acquire_advisory_lock(struct vnode* vnode, pid_t session, struct flock* flock,
1743 	bool wait)
1744 {
1745 	FUNCTION(("acquire_advisory_lock(vnode = %p, flock = %p, wait = %s)\n",
1746 		vnode, flock, wait ? "yes" : "no"));
1747 
1748 	bool shared = flock->l_type == F_RDLCK;
1749 	status_t status = B_OK;
1750 
1751 	// TODO: do deadlock detection!
1752 
1753 	struct advisory_locking* locking;
1754 
1755 	while (true) {
1756 		// if this vnode has an advisory_locking structure attached,
1757 		// lock that one and search for any colliding file lock
1758 		status = create_advisory_locking(vnode);
1759 		if (status != B_OK)
1760 			return status;
1761 
1762 		locking = vnode->advisory_locking;
1763 		team_id team = team_get_current_team_id();
1764 		sem_id waitForLock = -1;
1765 
1766 		// test for collisions
1767 		LockList::Iterator iterator = locking->locks.GetIterator();
1768 		while (iterator.HasNext()) {
1769 			struct advisory_lock* lock = iterator.Next();
1770 
1771 			// TODO: locks from the same team might be joinable!
1772 			if (lock->team != team && advisory_lock_intersects(lock, flock)) {
1773 				// locks do overlap
1774 				if (!shared || !lock->shared) {
1775 					// we need to wait
1776 					waitForLock = locking->wait_sem;
1777 					break;
1778 				}
1779 			}
1780 		}
1781 
1782 		if (waitForLock < 0)
1783 			break;
1784 
1785 		// We need to wait. Do that, or fail now if we've been asked not to.
1786 
1787 		if (!wait) {
1788 			put_advisory_locking(locking);
1789 			return session != -1 ? B_WOULD_BLOCK : B_PERMISSION_DENIED;
1790 		}
1791 
1792 		status = switch_sem_etc(locking->lock, waitForLock, 1,
1793 			B_CAN_INTERRUPT, 0);
1794 		if (status != B_OK && status != B_BAD_SEM_ID)
1795 			return status;
1796 
1797 		// We have been notified, but we need to re-lock the locking object. So
1798 		// go another round...
1799 	}
1800 
1801 	// install new lock
1802 
1803 	struct advisory_lock* lock = (struct advisory_lock*)malloc(
1804 		sizeof(struct advisory_lock));
1805 	if (lock == NULL) {
1806 		put_advisory_locking(locking);
1807 		return B_NO_MEMORY;
1808 	}
1809 
1810 	lock->team = team_get_current_team_id();
1811 	lock->session = session;
1812 	// values must already be normalized when getting here
1813 	lock->start = flock->l_start;
1814 	lock->end = flock->l_start - 1 + flock->l_len;
1815 	lock->shared = shared;
1816 
1817 	locking->locks.Add(lock);
1818 	put_advisory_locking(locking);
1819 
1820 	return status;
1821 }
1822 
1823 
1824 /*!	Normalizes the \a flock structure to make it easier to compare the
1825 	structure with others. The l_start and l_len fields are set to absolute
1826 	values according to the l_whence field.
1827 */
1828 static status_t
1829 normalize_flock(struct file_descriptor* descriptor, struct flock* flock)
1830 {
1831 	switch (flock->l_whence) {
1832 		case SEEK_SET:
1833 			break;
1834 		case SEEK_CUR:
1835 			flock->l_start += descriptor->pos;
1836 			break;
1837 		case SEEK_END:
1838 		{
1839 			struct vnode* vnode = descriptor->u.vnode;
1840 			struct stat stat;
1841 			status_t status;
1842 
1843 			if (!HAS_FS_CALL(vnode, read_stat))
1844 				return B_UNSUPPORTED;
1845 
1846 			status = FS_CALL(vnode, read_stat, &stat);
1847 			if (status != B_OK)
1848 				return status;
1849 
1850 			flock->l_start += stat.st_size;
1851 			break;
1852 		}
1853 		default:
1854 			return B_BAD_VALUE;
1855 	}
1856 
1857 	if (flock->l_start < 0)
1858 		flock->l_start = 0;
1859 	if (flock->l_len == 0)
1860 		flock->l_len = OFF_MAX;
1861 
1862 	// don't let the offset and length overflow
1863 	if (flock->l_start > 0 && OFF_MAX - flock->l_start < flock->l_len)
1864 		flock->l_len = OFF_MAX - flock->l_start;
1865 
1866 	if (flock->l_len < 0) {
1867 		// a negative length reverses the region
1868 		flock->l_start += flock->l_len;
1869 		flock->l_len = -flock->l_len;
1870 	}
1871 
1872 	return B_OK;
1873 }
1874 
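/*	Worked examples for normalize_flock() (illustrative): with SEEK_SET,
	l_start = 100 and l_len = -10 describe the region before the offset, so
	the lock becomes l_start = 90, l_len = 10. With SEEK_END on a 100 byte
	file, l_start = -10 and l_len = 0 become l_start = 90 and
	l_len = OFF_MAX - 90, i.e. "from offset 90 to the end of the file".
*/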
1875 
1876 static void
1877 replace_vnode_if_disconnected(struct fs_mount* mount,
1878 	struct vnode* vnodeToDisconnect, struct vnode*& vnode,
1879 	struct vnode* fallBack, bool lockRootLock)
1880 {
1881 	struct vnode* givenVnode = vnode;
1882 	bool vnodeReplaced = false;
1883 
1884 	ReadLocker vnodeReadLocker(sVnodeLock);
1885 
1886 	if (lockRootLock)
1887 		mutex_lock(&sIOContextRootLock);
1888 
1889 	while (vnode != NULL && vnode->mount == mount
1890 		&& (vnodeToDisconnect == NULL || vnodeToDisconnect == vnode)) {
1891 		if (vnode->covers != NULL) {
1892 			// redirect the vnode to the covered vnode
1893 			vnode = vnode->covers;
1894 		} else
1895 			vnode = fallBack;
1896 
1897 		vnodeReplaced = true;
1898 	}
1899 
1900 	// If we've replaced the node, grab a reference for the new one.
1901 	if (vnodeReplaced && vnode != NULL)
1902 		inc_vnode_ref_count(vnode);
1903 
1904 	if (lockRootLock)
1905 		mutex_unlock(&sIOContextRootLock);
1906 
1907 	vnodeReadLocker.Unlock();
1908 
1909 	if (vnodeReplaced)
1910 		put_vnode(givenVnode);
1911 }
1912 
1913 
1914 /*!	Disconnects all file descriptors that are associated with the
1915 	\a vnodeToDisconnect, or if this is NULL, all vnodes of the specified
1916 	\a mount object.
1917 
1918 	Note, after you've called this function, there might still be ongoing
1919 	accesses -- they won't be interrupted if they were already in progress.
1920 	However, any subsequent access will fail.
1921 
1922 	This is not a cheap function and should be used with care and rarely.
1923 	TODO: there is currently no means to stop a blocking read/write!
1924 */
1925 static void
1926 disconnect_mount_or_vnode_fds(struct fs_mount* mount,
1927 	struct vnode* vnodeToDisconnect)
1928 {
1929 	// iterate over all teams and peek into their file descriptors
1930 	TeamListIterator teamIterator;
1931 	while (Team* team = teamIterator.Next()) {
1932 		BReference<Team> teamReference(team, true);
1933 		TeamLocker teamLocker(team);
1934 
1935 		// lock the I/O context
1936 		io_context* context = team->io_context;
1937 		if (context == NULL)
1938 			continue;
1939 		MutexLocker contextLocker(context->io_mutex);
1940 
1941 		teamLocker.Unlock();
1942 
1943 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->root,
1944 			sRoot, true);
1945 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->cwd,
1946 			sRoot, false);
1947 
1948 		for (uint32 i = 0; i < context->table_size; i++) {
1949 			if (struct file_descriptor* descriptor = context->fds[i]) {
1950 				inc_fd_ref_count(descriptor);
1951 
1952 				// if this descriptor points at this mount, we
1953 				// need to disconnect it to be able to unmount
1954 				struct vnode* vnode = fd_vnode(descriptor);
1955 				if (vnodeToDisconnect != NULL) {
1956 					if (vnode == vnodeToDisconnect)
1957 						disconnect_fd(descriptor);
1958 				} else if ((vnode != NULL && vnode->mount == mount)
1959 					|| (vnode == NULL && descriptor->u.mount == mount))
1960 					disconnect_fd(descriptor);
1961 
1962 				put_fd(descriptor);
1963 			}
1964 		}
1965 	}
1966 }
1967 
1968 
1969 /*!	\brief Gets the root node of the current IO context.
1970 	If \a kernel is \c true, the kernel IO context will be used.
1971 	The caller obtains a reference to the returned node.
1972 */
1973 struct vnode*
1974 get_root_vnode(bool kernel)
1975 {
1976 	if (!kernel) {
1977 		// Get the root of the current IO context
1978 		struct io_context* context = get_current_io_context(kernel);
1979 
1980 		mutex_lock(&sIOContextRootLock);
1981 
1982 		struct vnode* root = context->root;
1983 		if (root != NULL)
1984 			inc_vnode_ref_count(root);
1985 
1986 		mutex_unlock(&sIOContextRootLock);
1987 
1988 		if (root != NULL)
1989 			return root;
1990 
1991 		// That should never happen.
1992 		dprintf("get_root_vnode(): IO context for team %" B_PRId32 " doesn't "
1993 			"have a root\n", team_get_current_team_id());
1994 	}
1995 
1996 	inc_vnode_ref_count(sRoot);
1997 	return sRoot;
1998 }
1999 
2000 
2001 /*!	\brief Gets the directory path and leaf name for a given path.
2002 
2003 	The supplied \a path is transformed to refer to the directory part of
2004 	the entry identified by the original path, and into the buffer \a filename
2005 	the leaf name of the original entry is written.
2006 	Neither the returned path nor the leaf name can be expected to be
2007 	canonical.
2008 
2009 	\param path The path to be analyzed. Must be able to store at least one
2010 		   additional character.
2011 	\param filename The buffer into which the leaf name will be written.
2012 		   Must be of size B_FILE_NAME_LENGTH at least.
2013 	\return \c B_OK, if everything went fine, \c B_NAME_TOO_LONG, if the leaf
2014 		   name is longer than \c B_FILE_NAME_LENGTH, or \c B_ENTRY_NOT_FOUND,
2015 		   if the given path name is empty.
2016 */
2017 static status_t
2018 get_dir_path_and_leaf(char* path, char* filename)
2019 {
2020 	if (*path == '\0')
2021 		return B_ENTRY_NOT_FOUND;
2022 
2023 	char* last = strrchr(path, '/');
2024 		// '/' are not allowed in file names!
2025 
2026 	FUNCTION(("get_dir_path_and_leaf(path = %s)\n", path));
2027 
2028 	if (last == NULL) {
2029 		// this path is a single segment with no '/' in it,
2030 		// e.g. "foo"
2031 		if (strlcpy(filename, path, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2032 			return B_NAME_TOO_LONG;
2033 
2034 		strcpy(path, ".");
2035 	} else {
2036 		last++;
2037 		if (last[0] == '\0') {
2038 			// special case: the path ends in one or more '/' - remove them
2039 			while (*--last == '/' && last != path);
2040 			last[1] = '\0';
2041 
2042 			if (last == path && last[0] == '/') {
2043 				// This path points to the root of the file system
2044 				strcpy(filename, ".");
2045 				return B_OK;
2046 			}
2047 			for (; last != path && *(last - 1) != '/'; last--);
2048 				// rewind to the start of the leaf before the '/'
2049 		}
2050 
2051 		// normal leaf: replace the leaf portion of the path with a '.'
2052 		if (strlcpy(filename, last, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2053 			return B_NAME_TOO_LONG;
2054 
2055 		last[0] = '.';
2056 		last[1] = '\0';
2057 	}
2058 	return B_OK;
2059 }
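
// Worked example (illustrative only): get_dir_path_and_leaf() rewrites
// "/boot/home/Desktop" to "/boot/home/." with filename "Desktop"; plain
// "foo" becomes "." with filename "foo"; trailing slashes are stripped
// first, so "/boot/home/" yields "/boot/." and filename "home".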
2060 
2061 
2062 static status_t
2063 entry_ref_to_vnode(dev_t mountID, ino_t directoryID, const char* name,
2064 	bool traverse, bool kernel, struct vnode** _vnode)
2065 {
2066 	char clonedName[B_FILE_NAME_LENGTH + 1];
2067 	if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2068 		return B_NAME_TOO_LONG;
2069 
2070 	// get the directory vnode and let vnode_path_to_vnode() do the rest
2071 	struct vnode* directory;
2072 
2073 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
2074 	if (status < 0)
2075 		return status;
2076 
2077 	return vnode_path_to_vnode(directory, clonedName, traverse, 0, kernel,
2078 		_vnode, NULL);
2079 }
2080 
2081 
2082 /*!	Looks up the entry with name \a name in the directory represented by \a dir
2083 	and returns the respective vnode.
2084 	On success a reference to the vnode is acquired for the caller.
2085 */
2086 static status_t
2087 lookup_dir_entry(struct vnode* dir, const char* name, struct vnode** _vnode)
2088 {
2089 	ino_t id;
2090 	bool missing;
2091 
2092 	if (dir->mount->entry_cache.Lookup(dir->id, name, id, missing)) {
2093 		return missing ? B_ENTRY_NOT_FOUND
2094 			: get_vnode(dir->device, id, _vnode, true, false);
2095 	}
2096 
2097 	status_t status = FS_CALL(dir, lookup, name, &id);
2098 	if (status != B_OK)
2099 		return status;
2100 
2101 	// The lookup() hook calls get_vnode() or publish_vnode(), so we already
2102 	// have a reference and just need to look the node up.
2103 	rw_lock_read_lock(&sVnodeLock);
2104 	*_vnode = lookup_vnode(dir->device, id);
2105 	rw_lock_read_unlock(&sVnodeLock);
2106 
2107 	if (*_vnode == NULL) {
2108 		panic("lookup_dir_entry(): could not lookup vnode (mountid 0x%" B_PRIx32
2109 			" vnid 0x%" B_PRIx64 ")\n", dir->device, id);
2110 		return B_ENTRY_NOT_FOUND;
2111 	}
2112 
2113 //	ktrace_printf("lookup_dir_entry(): dir: %p (%ld, %lld), name: \"%s\" -> "
2114 //		"%p (%ld, %lld)", dir, dir->mount->id, dir->id, name, *_vnode,
2115 //		(*_vnode)->mount->id, (*_vnode)->id);
2116 
2117 	return B_OK;
2118 }
2119 
2120 
2121 /*!	Returns the vnode for the relative path starting at the specified \a vnode.
2122 	\a path must not be NULL.
2123 	If it returns successfully, \a path contains the name of the last path
2124 	component. This function clobbers the buffer pointed to by \a path only
2125 	if it does contain more than one component.
2126 	Note, this reduces the ref_count of the starting \a vnode, no matter if
2127 	it is successful or not!
2128 */
2129 static status_t
2130 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2131 	int count, struct io_context* ioContext, struct vnode** _vnode,
2132 	ino_t* _parentID)
2133 {
2134 	status_t status = B_OK;
2135 	ino_t lastParentID = vnode->id;
2136 
2137 	FUNCTION(("vnode_path_to_vnode(vnode = %p, path = %s)\n", vnode, path));
2138 
2139 	if (path == NULL) {
2140 		put_vnode(vnode);
2141 		return B_BAD_VALUE;
2142 	}
2143 
2144 	if (*path == '\0') {
2145 		put_vnode(vnode);
2146 		return B_ENTRY_NOT_FOUND;
2147 	}
2148 
2149 	while (true) {
2150 		struct vnode* nextVnode;
2151 		char* nextPath;
2152 
2153 		TRACE(("vnode_path_to_vnode: top of loop. p = %p, p = '%s'\n", path,
2154 			path));
2155 
2156 		// done?
2157 		if (path[0] == '\0')
2158 			break;
2159 
2160 		// walk to find the next path component ("path" will point to a single
2161 		// path component), and filter out multiple slashes
2162 		for (nextPath = path + 1; *nextPath != '\0' && *nextPath != '/';
2163 				nextPath++);
2164 
2165 		if (*nextPath == '/') {
2166 			*nextPath = '\0';
2167 			do
2168 				nextPath++;
2169 			while (*nextPath == '/');
2170 		}
2171 
2172 		// If the path component is '..' and we are at a covering vnode,
2173 		// move to the covered vnode, so the '..' is passed on to the
2174 		// underlying file system. Also prevent escaping the IO context's root.
2175 		if (strcmp("..", path) == 0) {
2176 			if (vnode == ioContext->root) {
2177 				// Attempted prison break! Keep it contained.
2178 				path = nextPath;
2179 				continue;
2180 			}
2181 
2182 			if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2183 				nextVnode = coveredVnode;
2184 				put_vnode(vnode);
2185 				vnode = nextVnode;
2186 			}
2187 		}
2188 
2189 		// check if vnode is really a directory
2190 		if (status == B_OK && !S_ISDIR(vnode->Type()))
2191 			status = B_NOT_A_DIRECTORY;
2192 
2193 		// Check if we have the right to search the current directory vnode.
2194 		// If a file system doesn't have the access() function, we assume that
2195 		// searching a directory is always allowed
2196 		if (status == B_OK && HAS_FS_CALL(vnode, access))
2197 			status = FS_CALL(vnode, access, X_OK);
2198 
2199 		// Tell the filesystem to get the vnode of this path component (if we
2200 		// got the permission from the call above)
2201 		if (status == B_OK)
2202 			status = lookup_dir_entry(vnode, path, &nextVnode);
2203 
2204 		if (status != B_OK) {
2205 			put_vnode(vnode);
2206 			return status;
2207 		}
2208 
2209 		// If the new node is a symbolic link, resolve it (if we've been told
2210 		// to do it)
2211 		if (S_ISLNK(nextVnode->Type())
2212 			&& (traverseLeafLink || nextPath[0] != '\0')) {
2213 			size_t bufferSize;
2214 			char* buffer;
2215 
2216 			TRACE(("traverse link\n"));
2217 
2218 			// it's not exactly nice style using goto in this way, but hey,
2219 			// it works :-/
2220 			if (count + 1 > B_MAX_SYMLINKS) {
2221 				status = B_LINK_LIMIT;
2222 				goto resolve_link_error;
2223 			}
2224 
2225 			buffer = (char*)malloc(bufferSize = B_PATH_NAME_LENGTH);
2226 			if (buffer == NULL) {
2227 				status = B_NO_MEMORY;
2228 				goto resolve_link_error;
2229 			}
2230 
2231 			if (HAS_FS_CALL(nextVnode, read_symlink)) {
2232 				bufferSize--;
2233 				status = FS_CALL(nextVnode, read_symlink, buffer, &bufferSize);
2234 				// null-terminate
2235 				if (status >= 0)
2236 					buffer[bufferSize] = '\0';
2237 			} else
2238 				status = B_BAD_VALUE;
2239 
2240 			if (status != B_OK) {
2241 				free(buffer);
2242 
2243 		resolve_link_error:
2244 				put_vnode(vnode);
2245 				put_vnode(nextVnode);
2246 
2247 				return status;
2248 			}
2249 			put_vnode(nextVnode);
2250 
2251 			// Check if we start from the root directory or the current
2252 			// directory ("vnode" still points to that one).
2253 			// Cut off all leading slashes if it's the root directory
2254 			path = buffer;
2255 			bool absoluteSymlink = false;
2256 			if (path[0] == '/') {
2257 				// we don't need the old directory anymore
2258 				put_vnode(vnode);
2259 
2260 				while (*++path == '/')
2261 					;
2262 
2263 				mutex_lock(&sIOContextRootLock);
2264 				vnode = ioContext->root;
2265 				inc_vnode_ref_count(vnode);
2266 				mutex_unlock(&sIOContextRootLock);
2267 
2268 				absoluteSymlink = true;
2269 			}
2270 
2271 			inc_vnode_ref_count(vnode);
2272 				// balance the next recursion - we will decrement the
2273 				// ref_count of the vnode, no matter if we succeeded or not
2274 
2275 			if (absoluteSymlink && *path == '\0') {
2276 				// symlink was just "/"
2277 				nextVnode = vnode;
2278 			} else {
2279 				status = vnode_path_to_vnode(vnode, path, true, count + 1,
2280 					ioContext, &nextVnode, &lastParentID);
2281 			}
2282 
2283 			free(buffer);
2284 
2285 			if (status != B_OK) {
2286 				put_vnode(vnode);
2287 				return status;
2288 			}
2289 		} else
2290 			lastParentID = vnode->id;
2291 
2292 		// decrease the ref count on the old dir we just looked up into
2293 		put_vnode(vnode);
2294 
2295 		path = nextPath;
2296 		vnode = nextVnode;
2297 
2298 		// see if we hit a covered node
2299 		if (Vnode* coveringNode = get_covering_vnode(vnode)) {
2300 			put_vnode(vnode);
2301 			vnode = coveringNode;
2302 		}
2303 	}
2304 
2305 	*_vnode = vnode;
2306 	if (_parentID)
2307 		*_parentID = lastParentID;
2308 
2309 	return B_OK;
2310 }
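
// Reference counting invariant of the loop above: exactly one reference is
// held at the top of each iteration -- the one to the directory currently
// being examined. lookup_dir_entry() hands us a referenced vnode for the
// next component, the old directory is put, and covering vnodes (mount
// points) are entered via get_covering_vnode(). Symlinks are resolved by
// recursion, bounded by B_MAX_SYMLINKS.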
2311 
2312 
2313 static status_t
2314 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2315 	int count, bool kernel, struct vnode** _vnode, ino_t* _parentID)
2316 {
2317 	return vnode_path_to_vnode(vnode, path, traverseLeafLink, count,
2318 		get_current_io_context(kernel), _vnode, _parentID);
2319 }
2320 
2321 
2322 static status_t
2323 path_to_vnode(char* path, bool traverseLink, struct vnode** _vnode,
2324 	ino_t* _parentID, bool kernel)
2325 {
2326 	struct vnode* start = NULL;
2327 
2328 	FUNCTION(("path_to_vnode(path = \"%s\")\n", path));
2329 
2330 	if (!path)
2331 		return B_BAD_VALUE;
2332 
2333 	if (*path == '\0')
2334 		return B_ENTRY_NOT_FOUND;
2335 
2336 	// figure out if we need to start at root or at cwd
2337 	if (*path == '/') {
2338 		if (sRoot == NULL) {
2339 			// we're a bit early, aren't we?
2340 			return B_ERROR;
2341 		}
2342 
2343 		while (*++path == '/')
2344 			;
2345 		start = get_root_vnode(kernel);
2346 
2347 		if (*path == '\0') {
2348 			*_vnode = start;
2349 			return B_OK;
2350 		}
2351 
2352 	} else {
2353 		struct io_context* context = get_current_io_context(kernel);
2354 
2355 		mutex_lock(&context->io_mutex);
2356 		start = context->cwd;
2357 		if (start != NULL)
2358 			inc_vnode_ref_count(start);
2359 		mutex_unlock(&context->io_mutex);
2360 
2361 		if (start == NULL)
2362 			return B_ERROR;
2363 	}
2364 
2365 	return vnode_path_to_vnode(start, path, traverseLink, 0, kernel, _vnode,
2366 		_parentID);
2367 }
2368 
2369 
2370 /*! Returns the vnode for the next-to-last segment of the path, and writes
2371 	the last path component into \a filename.
2372 	The path buffer must be able to store at least one additional character.
2373 */
2374 static status_t
2375 path_to_dir_vnode(char* path, struct vnode** _vnode, char* filename,
2376 	bool kernel)
2377 {
2378 	status_t status = get_dir_path_and_leaf(path, filename);
2379 	if (status != B_OK)
2380 		return status;
2381 
2382 	return path_to_vnode(path, true, _vnode, NULL, kernel);
2383 }
2384 
2385 
2386 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2387 		   to by a FD + path pair.
2388 
2389 	\a path must be given in either case. \a fd might be omitted, in which
2390 	case \a path is either an absolute path or one relative to the current
2391 	directory. If both are supplied and \a path is relative, it is reckoned off
2392 	of the directory referred to by \a fd. If \a path is absolute \a fd is
2393 	ignored.
2394 
2395 	The caller has the responsibility to call put_vnode() on the returned
2396 	directory vnode.
2397 
2398 	\param fd The FD. May be < 0.
2399 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2400 	       is modified by this function. It must have at least room for a
2401 	       string one character longer than the path it contains.
2402 	\param _vnode A pointer to a variable the directory vnode shall be written
2403 		   into.
2404 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2405 		   the leaf name of the specified entry will be written.
2406 	\param kernel \c true, if invoked from inside the kernel, \c false if
2407 		   invoked from userland.
2408 	\return \c B_OK, if everything went fine, another error code otherwise.
2409 */
2410 static status_t
2411 fd_and_path_to_dir_vnode(int fd, char* path, struct vnode** _vnode,
2412 	char* filename, bool kernel)
2413 {
2414 	if (!path)
2415 		return B_BAD_VALUE;
2416 	if (*path == '\0')
2417 		return B_ENTRY_NOT_FOUND;
2418 	if (fd < 0)
2419 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2420 
2421 	status_t status = get_dir_path_and_leaf(path, filename);
2422 	if (status != B_OK)
2423 		return status;
2424 
2425 	return fd_and_path_to_vnode(fd, path, true, _vnode, NULL, kernel);
2426 }
2427 
2428 
2429 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2430 		   to by a vnode + path pair.
2431 
2432 	\a path must be given in either case. \a vnode might be omitted, in which
2433 	directory. If both are supplied and \a path is relative, it is reckoned off
2434 	directory. If both a supplied and \a path is relative it is reckoned off
2435 	of the directory referred to by \a vnode. If \a path is absolute \a vnode is
2436 	ignored.
2437 
2438 	The caller has the responsibility to call put_vnode() on the returned
2439 	directory vnode.
2440 
2441 	\param vnode The vnode. May be \c NULL.
2442 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2443 	       is modified by this function. It must have at least room for a
2444 	       string one character longer than the path it contains.
2445 	\param _vnode A pointer to a variable the directory vnode shall be written
2446 		   into.
2447 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2448 		   the leaf name of the specified entry will be written.
2449 	\param kernel \c true, if invoked from inside the kernel, \c false if
2450 		   invoked from userland.
2451 	\return \c B_OK, if everything went fine, another error code otherwise.
2452 */
2453 static status_t
2454 vnode_and_path_to_dir_vnode(struct vnode* vnode, char* path,
2455 	struct vnode** _vnode, char* filename, bool kernel)
2456 {
2457 	if (!path)
2458 		return B_BAD_VALUE;
2459 	if (*path == '\0')
2460 		return B_ENTRY_NOT_FOUND;
2461 	if (vnode == NULL || path[0] == '/')
2462 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2463 
2464 	status_t status = get_dir_path_and_leaf(path, filename);
2465 	if (status != B_OK)
2466 		return status;
2467 
2468 	inc_vnode_ref_count(vnode);
2469 		// vnode_path_to_vnode() always decrements the ref count
2470 
2471 	return vnode_path_to_vnode(vnode, path, true, 0, kernel, _vnode, NULL);
2472 }
2473 
2474 
2475 /*! Returns a vnode's name in the d_name field of a supplied dirent buffer.
2476 */
2477 static status_t
2478 get_vnode_name(struct vnode* vnode, struct vnode* parent, struct dirent* buffer,
2479 	size_t bufferSize, struct io_context* ioContext)
2480 {
2481 	if (bufferSize < sizeof(struct dirent))
2482 		return B_BAD_VALUE;
2483 
2484 	// See if the vnode is covering another vnode and move to the covered
2485 	// vnode so we get the underlying file system
2486 	VNodePutter vnodePutter;
2487 	if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2488 		vnode = coveredVnode;
2489 		vnodePutter.SetTo(vnode);
2490 	}
2491 
2492 	if (HAS_FS_CALL(vnode, get_vnode_name)) {
2493 		// The FS supports getting the name of a vnode.
2494 		if (FS_CALL(vnode, get_vnode_name, buffer->d_name,
2495 			(char*)buffer + bufferSize - buffer->d_name) == B_OK)
2496 			return B_OK;
2497 	}
2498 
2499 	// The FS doesn't support getting the name of a vnode. So we search the
2500 	// parent directory for the vnode, if the caller let us.
2501 
2502 	if (parent == NULL || !HAS_FS_CALL(parent, read_dir))
2503 		return B_UNSUPPORTED;
2504 
2505 	void* cookie;
2506 
2507 	status_t status = FS_CALL(parent, open_dir, &cookie);
2508 	if (status >= B_OK) {
2509 		while (true) {
2510 			uint32 num = 1;
2511 			// We use the FS hook directly instead of dir_read(), since we don't
2512 			// want the entries to be fixed up for covering vnodes. We have
2513 			// already resolved vnode to the covered node.
2514 			status = FS_CALL(parent, read_dir, cookie, buffer, bufferSize,
2515 				&num);
2516 			if (status != B_OK)
2517 				break;
2518 			if (num == 0) {
2519 				status = B_ENTRY_NOT_FOUND;
2520 				break;
2521 			}
2522 
2523 			if (vnode->id == buffer->d_ino) {
2524 				// found correct entry!
2525 				break;
2526 			}
2527 		}
2528 
2529 		FS_CALL(parent, close_dir, cookie);
2530 		FS_CALL(parent, free_dir_cookie, cookie);
2531 	}
2532 	return status;
2533 }
2534 
2535 
2536 static status_t
2537 get_vnode_name(struct vnode* vnode, struct vnode* parent, char* name,
2538 	size_t nameSize, bool kernel)
2539 {
2540 	char buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2541 	struct dirent* dirent = (struct dirent*)buffer;
2542 
2543 	status_t status = get_vnode_name(vnode, parent, dirent, sizeof(buffer),
2544 		get_current_io_context(kernel));
2545 	if (status != B_OK)
2546 		return status;
2547 
2548 	if (strlcpy(name, dirent->d_name, nameSize) >= nameSize)
2549 		return B_BUFFER_OVERFLOW;
2550 
2551 	return B_OK;
2552 }
2553 
2554 
2555 /*!	Gets the full path to a given directory vnode.
2556 	It uses the fs_get_vnode_name() call to get the name of a vnode; if a
2557 	file system doesn't support this call, it will fall back to iterating
2558 	through the parent directory to get the name of the child.
2559 
2560 	To protect against circular loops, it supports a maximum tree depth
2561 	of 256 levels.
2562 
2563 	Note that the path may no longer be correct by the time this function
2564 	returns! No locking is used to guarantee a correct path, as paths
2565 	aren't stable anyway: the path to a file can change at any time.
2566 
2567 	It might be a good idea, though, to check if the returned path exists
2568 	in the calling function (it's not done here because of efficiency)
2569 */
2570 static status_t
2571 dir_vnode_to_path(struct vnode* vnode, char* buffer, size_t bufferSize,
2572 	bool kernel)
2573 {
2574 	FUNCTION(("dir_vnode_to_path(%p, %p, %lu)\n", vnode, buffer, bufferSize));
2575 
2576 	if (vnode == NULL || buffer == NULL || bufferSize == 0)
2577 		return B_BAD_VALUE;
2578 
2579 	if (!S_ISDIR(vnode->Type()))
2580 		return B_NOT_A_DIRECTORY;
2581 
2582 	char* path = buffer;
2583 	int32 insert = bufferSize;
2584 	int32 maxLevel = 256;
2585 	int32 length;
2586 	status_t status = B_OK;
2587 	struct io_context* ioContext = get_current_io_context(kernel);
2588 
2589 	// we don't use get_vnode() here because this call is more
2590 	// efficient and does all we need from get_vnode()
2591 	inc_vnode_ref_count(vnode);
2592 
2593 	path[--insert] = '\0';
2594 		// the path is filled right to left
2595 
2596 	while (true) {
2597 		// If the node is the context's root, bail out. Otherwise resolve mount
2598 		// points.
2599 		if (vnode == ioContext->root)
2600 			break;
2601 
2602 		if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2603 			put_vnode(vnode);
2604 			vnode = coveredVnode;
2605 		}
2606 
2607 		// lookup the parent vnode
2608 		struct vnode* parentVnode;
2609 		status = lookup_dir_entry(vnode, "..", &parentVnode);
2610 		if (status != B_OK)
2611 			goto out;
2612 
2613 		if (parentVnode == vnode) {
2614 			// The caller apparently got their hands on a node outside of their
2615 			// context's root. Now we've hit the global root.
2616 			put_vnode(parentVnode);
2617 			break;
2618 		}
2619 
2620 		// get the node's name
2621 		char nameBuffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2622 			// also used for fs_read_dir()
2623 		char* name = &((struct dirent*)nameBuffer)->d_name[0];
2624 		status = get_vnode_name(vnode, parentVnode, (struct dirent*)nameBuffer,
2625 			sizeof(nameBuffer), ioContext);
2626 
2627 		// release the current vnode, we only need its parent from now on
2628 		put_vnode(vnode);
2629 		vnode = parentVnode;
2630 
2631 		if (status != B_OK)
2632 			goto out;
2633 
2634 		// TODO: add an explicit check for loops in about 10 levels to do
2635 		// real loop detection
2636 
2637 		// don't go deeper than 'maxLevel' to prevent circular loops
2638 		if (maxLevel-- < 0) {
2639 			status = B_LINK_LIMIT;
2640 			goto out;
2641 		}
2642 
2643 		// add the name in front of the current path
2644 		name[B_FILE_NAME_LENGTH - 1] = '\0';
2645 		length = strlen(name);
2646 		insert -= length;
2647 		if (insert <= 0) {
2648 			status = B_RESULT_NOT_REPRESENTABLE;
2649 			goto out;
2650 		}
2651 		memcpy(path + insert, name, length);
2652 		path[--insert] = '/';
2653 	}
2654 
2655 	// the root dir will result in an empty path: fix it
2656 	if (path[insert] == '\0')
2657 		path[--insert] = '/';
2658 
2659 	TRACE(("  path is: %s\n", path + insert));
2660 
2661 	// move the path to the start of the buffer
2662 	length = bufferSize - insert;
2663 	memmove(buffer, path + insert, length);
2664 
2665 out:
2666 	put_vnode(vnode);
2667 	return status;
2668 }
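
// Worked example (illustrative only): for a directory at /boot/home and a
// 16 byte buffer, the path is assembled right to left: first the '\0' at
// index 15, then "home", '/', "boot", '/', giving "/boot/home" at the end
// of the buffer, which is finally memmove()d to the buffer's start.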
2669 
2670 
2671 /*!	Checks the length of every path component, and adds a '.'
2672 	if the path ends in a slash.
2673 	The given path buffer must be able to store at least one
2674 	additional character.
2675 */
2676 static status_t
2677 check_path(char* to)
2678 {
2679 	int32 length = 0;
2680 
2681 	// check length of every path component
2682 
2683 	while (*to) {
2684 		char* begin;
2685 		if (*to == '/')
2686 			to++, length++;
2687 
2688 		begin = to;
2689 		while (*to != '/' && *to)
2690 			to++, length++;
2691 
2692 		if (to - begin > B_FILE_NAME_LENGTH)
2693 			return B_NAME_TOO_LONG;
2694 	}
2695 
2696 	if (length == 0)
2697 		return B_ENTRY_NOT_FOUND;
2698 
2699 	// complete path if there is a slash at the end
2700 
2701 	if (*(to - 1) == '/') {
2702 		if (length > B_PATH_NAME_LENGTH - 2)
2703 			return B_NAME_TOO_LONG;
2704 
2705 		to[0] = '.';
2706 		to[1] = '\0';
2707 	}
2708 
2709 	return B_OK;
2710 }
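
// Worked example (illustrative only): check_path() leaves "/boot/home"
// untouched, completes "/boot/home/" to "/boot/home/.", rejects an empty
// path with B_ENTRY_NOT_FOUND, and rejects any component longer than
// B_FILE_NAME_LENGTH with B_NAME_TOO_LONG.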
2711 
2712 
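/*!	Looks up the file descriptor \a fd in the current IO context and returns
	it together with the vnode it refers to (via \a _vnode). Returns \c NULL
	if the FD is invalid or not backed by a vnode. The descriptor is
	referenced for the caller; the vnode is not referenced additionally.
*/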
2713 static struct file_descriptor*
2714 get_fd_and_vnode(int fd, struct vnode** _vnode, bool kernel)
2715 {
2716 	struct file_descriptor* descriptor
2717 		= get_fd(get_current_io_context(kernel), fd);
2718 	if (descriptor == NULL)
2719 		return NULL;
2720 
2721 	struct vnode* vnode = fd_vnode(descriptor);
2722 	if (vnode == NULL) {
2723 		put_fd(descriptor);
2724 		return NULL;
2725 	}
2726 
2727 	// TODO: when we can close a file descriptor at any point, investigate
2728 	//	if this is still valid to do (accessing the vnode without ref_count
2729 	//	or locking)
2730 	*_vnode = vnode;
2731 	return descriptor;
2732 }
2733 
2734 
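/*!	Returns the vnode the given \a fd refers to, with a reference acquired
	for the caller, or \c NULL if the FD is invalid or not backed by a vnode.
*/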
2735 static struct vnode*
2736 get_vnode_from_fd(int fd, bool kernel)
2737 {
2738 	struct file_descriptor* descriptor;
2739 	struct vnode* vnode;
2740 
2741 	descriptor = get_fd(get_current_io_context(kernel), fd);
2742 	if (descriptor == NULL)
2743 		return NULL;
2744 
2745 	vnode = fd_vnode(descriptor);
2746 	if (vnode != NULL)
2747 		inc_vnode_ref_count(vnode);
2748 
2749 	put_fd(descriptor);
2750 	return vnode;
2751 }
2752 
2753 
2754 /*!	Gets the vnode from an FD + path combination. If \a fd is lower than zero,
2755 	only the path will be considered. In this case, the \a path must not be
2756 	NULL.
2757 	If \a fd is a valid file descriptor, \a path may be NULL for directories,
2758 	and should be NULL for files.
2759 */
2760 static status_t
2761 fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
2762 	struct vnode** _vnode, ino_t* _parentID, bool kernel)
2763 {
2764 	if (fd < 0 && !path)
2765 		return B_BAD_VALUE;
2766 
2767 	if (path != NULL && *path == '\0')
2768 		return B_ENTRY_NOT_FOUND;
2769 
2770 	if (fd < 0 || (path != NULL && path[0] == '/')) {
2771 		// no FD or absolute path
2772 		return path_to_vnode(path, traverseLeafLink, _vnode, _parentID, kernel);
2773 	}
2774 
2775 	// FD only, or FD + relative path
2776 	struct vnode* vnode = get_vnode_from_fd(fd, kernel);
2777 	if (vnode == NULL)
2778 		return B_FILE_ERROR;
2779 
2780 	if (path != NULL) {
2781 		return vnode_path_to_vnode(vnode, path, traverseLeafLink, 0, kernel,
2782 			_vnode, _parentID);
2783 	}
2784 
2785 	// there is no relative path to take into account
2786 
2787 	*_vnode = vnode;
2788 	if (_parentID)
2789 		*_parentID = -1;
2790 
2791 	return B_OK;
2792 }
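
// Case summary for fd_and_path_to_vnode(): (fd < 0, path) resolves like
// path_to_vnode(); (fd, NULL) yields the FD's own vnode; (fd, relative
// path) resolves the path starting at the FD's vnode; an absolute path
// always ignores the FD.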
2793 
2794 
2795 static int
2796 get_new_fd(int type, struct fs_mount* mount, struct vnode* vnode,
2797 	void* cookie, int openMode, bool kernel)
2798 {
2799 	struct file_descriptor* descriptor;
2800 	int fd;
2801 
2802 	// If the vnode is locked, we don't allow creating a new file/directory
2803 	// file_descriptor for it
2804 	if (vnode && vnode->mandatory_locked_by != NULL
2805 		&& (type == FDTYPE_FILE || type == FDTYPE_DIR))
2806 		return B_BUSY;
2807 
2808 	descriptor = alloc_fd();
2809 	if (!descriptor)
2810 		return B_NO_MEMORY;
2811 
2812 	if (vnode)
2813 		descriptor->u.vnode = vnode;
2814 	else
2815 		descriptor->u.mount = mount;
2816 	descriptor->cookie = cookie;
2817 
2818 	switch (type) {
2819 		// vnode types
2820 		case FDTYPE_FILE:
2821 			descriptor->ops = &sFileOps;
2822 			break;
2823 		case FDTYPE_DIR:
2824 			descriptor->ops = &sDirectoryOps;
2825 			break;
2826 		case FDTYPE_ATTR:
2827 			descriptor->ops = &sAttributeOps;
2828 			break;
2829 		case FDTYPE_ATTR_DIR:
2830 			descriptor->ops = &sAttributeDirectoryOps;
2831 			break;
2832 
2833 		// mount types
2834 		case FDTYPE_INDEX_DIR:
2835 			descriptor->ops = &sIndexDirectoryOps;
2836 			break;
2837 		case FDTYPE_QUERY:
2838 			descriptor->ops = &sQueryOps;
2839 			break;
2840 
2841 		default:
2842 			panic("get_new_fd() called with unknown type %d\n", type);
2843 			break;
2844 	}
2845 	descriptor->type = type;
2846 	descriptor->open_mode = openMode;
2847 
2848 	io_context* context = get_current_io_context(kernel);
2849 	fd = new_fd(context, descriptor);
2850 	if (fd < 0) {
2851 		free(descriptor);
2852 		return B_NO_MORE_FDS;
2853 	}
2854 
2855 	mutex_lock(&context->io_mutex);
2856 	fd_set_close_on_exec(context, fd, (openMode & O_CLOEXEC) != 0);
2857 	mutex_unlock(&context->io_mutex);
2858 
2859 	return fd;
2860 }
2861 
2862 
2863 /*!	In-place normalizes \a path. It's otherwise semantically equivalent to
2864 	vfs_normalize_path(). See there for more documentation.
2865 */
2866 static status_t
2867 normalize_path(char* path, size_t pathSize, bool traverseLink, bool kernel)
2868 {
2869 	VNodePutter dirPutter;
2870 	struct vnode* dir = NULL;
2871 	status_t error;
2872 
2873 	for (int i = 0; i < B_MAX_SYMLINKS; i++) {
2874 		// get dir vnode + leaf name
2875 		struct vnode* nextDir;
2876 		char leaf[B_FILE_NAME_LENGTH];
2877 		error = vnode_and_path_to_dir_vnode(dir, path, &nextDir, leaf, kernel);
2878 		if (error != B_OK)
2879 			return error;
2880 
2881 		dir = nextDir;
2882 		strcpy(path, leaf);
2883 		dirPutter.SetTo(dir);
2884 
2885 		// get file vnode, if we shall resolve links
2886 		bool fileExists = false;
2887 		struct vnode* fileVnode;
2888 		VNodePutter fileVnodePutter;
2889 		if (traverseLink) {
2890 			inc_vnode_ref_count(dir);
2891 			if (vnode_path_to_vnode(dir, path, false, 0, kernel, &fileVnode,
2892 					NULL) == B_OK) {
2893 				fileVnodePutter.SetTo(fileVnode);
2894 				fileExists = true;
2895 			}
2896 		}
2897 
2898 		if (!fileExists || !traverseLink || !S_ISLNK(fileVnode->Type())) {
2899 			// we're done -- construct the path
2900 			bool hasLeaf = true;
2901 			if (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0) {
2902 				// special cases "." and ".." -- get the dir, forget the leaf
2903 				inc_vnode_ref_count(dir);
2904 				error = vnode_path_to_vnode(dir, leaf, false, 0, kernel,
2905 					&nextDir, NULL);
2906 				if (error != B_OK)
2907 					return error;
2908 				dir = nextDir;
2909 				dirPutter.SetTo(dir);
2910 				hasLeaf = false;
2911 			}
2912 
2913 			// get the directory path
2914 			error = dir_vnode_to_path(dir, path, B_PATH_NAME_LENGTH, kernel);
2915 			if (error != B_OK)
2916 				return error;
2917 
2918 			// append the leaf name
2919 			if (hasLeaf) {
2920 				// insert a directory separator if this is not the file system
2921 				// root
2922 				if ((strcmp(path, "/") != 0
2923 					&& strlcat(path, "/", pathSize) >= pathSize)
2924 					|| strlcat(path, leaf, pathSize) >= pathSize) {
2925 					return B_NAME_TOO_LONG;
2926 				}
2927 			}
2928 
2929 			return B_OK;
2930 		}
2931 
2932 		// read link
2933 		if (HAS_FS_CALL(fileVnode, read_symlink)) {
2934 			size_t bufferSize = B_PATH_NAME_LENGTH - 1;
2935 			error = FS_CALL(fileVnode, read_symlink, path, &bufferSize);
2936 			if (error != B_OK)
2937 				return error;
2938 			path[bufferSize] = '\0';
2939 		} else
2940 			return B_BAD_VALUE;
2941 	}
2942 
2943 	return B_LINK_LIMIT;
2944 }
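
// Worked example (illustrative only, assuming /boot/home/link is a symlink
// to /boot/home/Desktop): normalizing "/boot/home/link/.." with
// traverseLink set follows the link, applies the "..", and rewrites the
// buffer to "/boot/home".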
2945 
2946 
2947 static status_t
2948 resolve_covered_parent(struct vnode* parent, dev_t* _device, ino_t* _node,
2949 	struct io_context* ioContext)
2950 {
2951 	// Make sure the IO context root is not bypassed.
2952 	if (parent == ioContext->root) {
2953 		*_device = parent->device;
2954 		*_node = parent->id;
2955 		return B_OK;
2956 	}
2957 
2958 	inc_vnode_ref_count(parent);
2959 		// vnode_path_to_vnode() puts the node
2960 
2961 	// ".." is guaranteed not to be clobbered by this call
2962 	struct vnode* vnode;
2963 	status_t status = vnode_path_to_vnode(parent, (char*)"..", false, 0,
2964 		ioContext, &vnode, NULL);
2965 	if (status == B_OK) {
2966 		*_device = vnode->device;
2967 		*_node = vnode->id;
2968 		put_vnode(vnode);
2969 	}
2970 
2971 	return status;
2972 }
2973 
2974 
2975 #ifdef ADD_DEBUGGER_COMMANDS
2976 
2977 
2978 static void
2979 _dump_advisory_locking(advisory_locking* locking)
2980 {
2981 	if (locking == NULL)
2982 		return;
2983 
2984 	kprintf("   lock:        %" B_PRId32, locking->lock);
2985 	kprintf("   wait_sem:    %" B_PRId32, locking->wait_sem);
2986 
2987 	int32 index = 0;
2988 	LockList::Iterator iterator = locking->locks.GetIterator();
2989 	while (iterator.HasNext()) {
2990 		struct advisory_lock* lock = iterator.Next();
2991 
2992 		kprintf("   [%2" B_PRId32 "] team:   %" B_PRId32 "\n", index++, lock->team);
2993 		kprintf("        start:  %" B_PRIdOFF "\n", lock->start);
2994 		kprintf("        end:    %" B_PRIdOFF "\n", lock->end);
2995 		kprintf("        shared? %s\n", lock->shared ? "yes" : "no");
2996 	}
2997 }
2998 
2999 
3000 static void
3001 _dump_mount(struct fs_mount* mount)
3002 {
3003 	kprintf("MOUNT: %p\n", mount);
3004 	kprintf(" id:            %" B_PRIdDEV "\n", mount->id);
3005 	kprintf(" device_name:   %s\n", mount->device_name);
3006 	kprintf(" root_vnode:    %p\n", mount->root_vnode);
3007 	kprintf(" covers:        %p\n", mount->root_vnode->covers);
3008 	kprintf(" partition:     %p\n", mount->partition);
3009 	kprintf(" lock:          %p\n", &mount->rlock);
3010 	kprintf(" flags:        %s%s\n", mount->unmounting ? " unmounting" : "",
3011 		mount->owns_file_device ? " owns_file_device" : "");
3012 
3013 	fs_volume* volume = mount->volume;
3014 	while (volume != NULL) {
3015 		kprintf(" volume %p:\n", volume);
3016 		kprintf("  layer:            %" B_PRId32 "\n", volume->layer);
3017 		kprintf("  private_volume:   %p\n", volume->private_volume);
3018 		kprintf("  ops:              %p\n", volume->ops);
3019 		kprintf("  file_system:      %p\n", volume->file_system);
3020 		kprintf("  file_system_name: %s\n", volume->file_system_name);
3021 		volume = volume->super_volume;
3022 	}
3023 
3024 	set_debug_variable("_volume", (addr_t)mount->volume->private_volume);
3025 	set_debug_variable("_root", (addr_t)mount->root_vnode);
3026 	set_debug_variable("_covers", (addr_t)mount->root_vnode->covers);
3027 	set_debug_variable("_partition", (addr_t)mount->partition);
3028 }
3029 
3030 
3031 static bool
3032 debug_prepend_vnode_name_to_path(char* buffer, size_t& bufferSize,
3033 	const char* name)
3034 {
3035 	bool insertSlash = buffer[bufferSize] != '\0';
3036 	size_t nameLength = strlen(name);
3037 
3038 	if (bufferSize < nameLength + (insertSlash ? 1 : 0))
3039 		return false;
3040 
3041 	if (insertSlash)
3042 		buffer[--bufferSize] = '/';
3043 
3044 	bufferSize -= nameLength;
3045 	memcpy(buffer + bufferSize, name, nameLength);
3046 
3047 	return true;
3048 }
3049 
3050 
3051 static bool
3052 debug_prepend_vnode_id_to_path(char* buffer, size_t& bufferSize, dev_t devID,
3053 	ino_t nodeID)
3054 {
3055 	if (bufferSize == 0)
3056 		return false;
3057 
3058 	bool insertSlash = buffer[bufferSize] != '\0';
3059 	if (insertSlash)
3060 		buffer[--bufferSize] = '/';
3061 
3062 	size_t size = snprintf(buffer, bufferSize,
3063 		"<%" B_PRIdDEV ",%" B_PRIdINO ">", devID, nodeID);
3064 	if (size > bufferSize) {
3065 		if (insertSlash)
3066 			bufferSize++;
3067 		return false;
3068 	}
3069 
3070 	if (size < bufferSize)
3071 		memmove(buffer + bufferSize - size, buffer, size);
3072 
3073 	bufferSize -= size;
3074 	return true;
3075 }
3076 
3077 
3078 static char*
3079 debug_resolve_vnode_path(struct vnode* vnode, char* buffer, size_t bufferSize,
3080 	bool& _truncated)
3081 {
3082 	// null-terminate the path
3083 	buffer[--bufferSize] = '\0';
3084 
3085 	while (true) {
3086 		while (vnode->covers != NULL)
3087 			vnode = vnode->covers;
3088 
3089 		if (vnode == sRoot) {
3090 			_truncated = bufferSize == 0;
3091 			if (!_truncated)
3092 				buffer[--bufferSize] = '/';
3093 			return buffer + bufferSize;
3094 		}
3095 
3096 		// resolve the name
3097 		ino_t dirID;
3098 		const char* name = vnode->mount->entry_cache.DebugReverseLookup(
3099 			vnode->id, dirID);
3100 		if (name == NULL) {
3101 			// Failed to resolve the name -- prepend "<dev,node>/".
3102 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3103 				vnode->mount->id, vnode->id);
3104 			return buffer + bufferSize;
3105 		}
3106 
3107 		// prepend the name
3108 		if (!debug_prepend_vnode_name_to_path(buffer, bufferSize, name)) {
3109 			_truncated = true;
3110 			return buffer + bufferSize;
3111 		}
3112 
3113 		// resolve the directory node
3114 		struct vnode* nextVnode = lookup_vnode(vnode->mount->id, dirID);
3115 		if (nextVnode == NULL) {
3116 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3117 				vnode->mount->id, dirID);
3118 			return buffer + bufferSize;
3119 		}
3120 
3121 		vnode = nextVnode;
3122 	}
3123 }
3124 
3125 
3126 static void
3127 _dump_vnode(struct vnode* vnode, bool printPath)
3128 {
3129 	kprintf("VNODE: %p\n", vnode);
3130 	kprintf(" device:        %" B_PRIdDEV "\n", vnode->device);
3131 	kprintf(" id:            %" B_PRIdINO "\n", vnode->id);
3132 	kprintf(" ref_count:     %" B_PRId32 "\n", vnode->ref_count);
3133 	kprintf(" private_node:  %p\n", vnode->private_node);
3134 	kprintf(" mount:         %p\n", vnode->mount);
3135 	kprintf(" covered_by:    %p\n", vnode->covered_by);
3136 	kprintf(" covers:        %p\n", vnode->covers);
3137 	kprintf(" cache:         %p\n", vnode->cache);
3138 	kprintf(" type:          %#" B_PRIx32 "\n", vnode->Type());
3139 	kprintf(" flags:         %s%s%s\n", vnode->IsRemoved() ? "r" : "-",
3140 		vnode->IsBusy() ? "b" : "-", vnode->IsUnpublished() ? "u" : "-");
3141 	kprintf(" advisory_lock: %p\n", vnode->advisory_locking);
3142 
3143 	_dump_advisory_locking(vnode->advisory_locking);
3144 
3145 	if (printPath) {
3146 		void* buffer = debug_malloc(B_PATH_NAME_LENGTH);
3147 		if (buffer != NULL) {
3148 			bool truncated;
3149 			char* path = debug_resolve_vnode_path(vnode, (char*)buffer,
3150 				B_PATH_NAME_LENGTH, truncated);
3151 			if (path != NULL) {
3152 				kprintf(" path:          ");
3153 				if (truncated)
3154 					kputs("<truncated>/");
3155 				kputs(path);
3156 				kputs("\n");
3157 			} else
3158 				kprintf("Failed to resolve vnode path.\n");
3159 
3160 			debug_free(buffer);
3161 		} else
3162 			kprintf("Failed to allocate memory for constructing the path.\n");
3163 	}
3164 
3165 	set_debug_variable("_node", (addr_t)vnode->private_node);
3166 	set_debug_variable("_mount", (addr_t)vnode->mount);
3167 	set_debug_variable("_covered_by", (addr_t)vnode->covered_by);
3168 	set_debug_variable("_covers", (addr_t)vnode->covers);
3169 	set_debug_variable("_adv_lock", (addr_t)vnode->advisory_locking);
3170 }
3171 
3172 
3173 static int
3174 dump_mount(int argc, char** argv)
3175 {
3176 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3177 		kprintf("usage: %s [id|address]\n", argv[0]);
3178 		return 0;
3179 	}
3180 
3181 	ulong val = parse_expression(argv[1]);
3182 	uint32 id = val;
3183 
3184 	struct fs_mount* mount = sMountsTable->Lookup(id);
3185 	if (mount == NULL) {
3186 		if (IS_USER_ADDRESS(id)) {
3187 			kprintf("fs_mount not found\n");
3188 			return 0;
3189 		}
3190 		mount = (fs_mount*)val;
3191 	}
3192 
3193 	_dump_mount(mount);
3194 	return 0;
3195 }
3196 
3197 
3198 static int
3199 dump_mounts(int argc, char** argv)
3200 {
3201 	if (argc != 1) {
3202 		kprintf("usage: %s\n", argv[0]);
3203 		return 0;
3204 	}
3205 
3206 	kprintf("%-*s    id %-*s   %-*s   %-*s   fs_name\n",
3207 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "root",
3208 		B_PRINTF_POINTER_WIDTH, "covers", B_PRINTF_POINTER_WIDTH, "cookie");
3209 
3210 	struct fs_mount* mount;
3211 
3212 	MountTable::Iterator iterator(sMountsTable);
3213 	while (iterator.HasNext()) {
3214 		mount = iterator.Next();
3215 		kprintf("%p%4" B_PRIdDEV " %p %p %p %s\n", mount, mount->id, mount->root_vnode,
3216 			mount->root_vnode->covers, mount->volume->private_volume,
3217 			mount->volume->file_system_name);
3218 
3219 		fs_volume* volume = mount->volume;
3220 		while (volume->super_volume != NULL) {
3221 			volume = volume->super_volume;
3222 			kprintf("                                     %p %s\n",
3223 				volume->private_volume, volume->file_system_name);
3224 		}
3225 	}
3226 
3227 	return 0;
3228 }
3229 
3230 
3231 static int
3232 dump_vnode(int argc, char** argv)
3233 {
3234 	bool printPath = false;
3235 	int argi = 1;
3236 	if (argc >= 2 && strcmp(argv[argi], "-p") == 0) {
3237 		printPath = true;
3238 		argi++;
3239 	}
3240 
3241 	if (argi >= argc || argi + 2 < argc) {
3242 		print_debugger_command_usage(argv[0]);
3243 		return 0;
3244 	}
3245 
3246 	struct vnode* vnode = NULL;
3247 
3248 	if (argi + 1 == argc) {
3249 		vnode = (struct vnode*)parse_expression(argv[argi]);
3250 		if (IS_USER_ADDRESS(vnode)) {
3251 			kprintf("invalid vnode address\n");
3252 			return 0;
3253 		}
3254 		_dump_vnode(vnode, printPath);
3255 		return 0;
3256 	}
3257 
3258 	dev_t device = parse_expression(argv[argi]);
3259 	ino_t id = parse_expression(argv[argi + 1]);
3260 
3261 	VnodeTable::Iterator iterator(sVnodeTable);
3262 	while (iterator.HasNext()) {
3263 		vnode = iterator.Next();
3264 		if (vnode->id != id || vnode->device != device)
3265 			continue;
3266 
3267 		_dump_vnode(vnode, printPath);
3268 	}
3269 
3270 	return 0;
3271 }
3272 
3273 
3274 static int
3275 dump_vnodes(int argc, char** argv)
3276 {
3277 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3278 		kprintf("usage: %s [device]\n", argv[0]);
3279 		return 0;
3280 	}
3281 
3282 	// restrict dumped nodes to a certain device if requested
3283 	dev_t device = parse_expression(argv[1]);
3284 
3285 	struct vnode* vnode;
3286 
3287 	kprintf("%-*s   dev     inode  ref %-*s   %-*s   %-*s   flags\n",
3288 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache",
3289 		B_PRINTF_POINTER_WIDTH, "fs-node", B_PRINTF_POINTER_WIDTH, "locking");
3290 
3291 	VnodeTable::Iterator iterator(sVnodeTable);
3292 	while (iterator.HasNext()) {
3293 		vnode = iterator.Next();
3294 		if (vnode->device != device)
3295 			continue;
3296 
3297 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO "%5" B_PRId32 " %p %p %p %s%s%s\n",
3298 			vnode, vnode->device, vnode->id, vnode->ref_count, vnode->cache,
3299 			vnode->private_node, vnode->advisory_locking,
3300 			vnode->IsRemoved() ? "r" : "-", vnode->IsBusy() ? "b" : "-",
3301 			vnode->IsUnpublished() ? "u" : "-");
3302 	}
3303 
3304 	return 0;
3305 }
3306 
3307 
3308 static int
3309 dump_vnode_caches(int argc, char** argv)
3310 {
3311 	struct vnode* vnode;
3312 
3313 	if (argc > 2 || (argc == 2 && !strcmp(argv[1], "--help"))) {
3314 		kprintf("usage: %s [device]\n", argv[0]);
3315 		return 0;
3316 	}
3317 
3318 	// restrict dumped nodes to a certain device if requested
3319 	dev_t device = -1;
3320 	if (argc > 1)
3321 		device = parse_expression(argv[1]);
3322 
3323 	kprintf("%-*s   dev     inode %-*s       size   pages\n",
3324 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache");
3325 
3326 	VnodeTable::Iterator iterator(sVnodeTable);
3327 	while (iterator.HasNext()) {
3328 		vnode = iterator.Next();
3329 		if (vnode->cache == NULL)
3330 			continue;
3331 		if (device != -1 && vnode->device != device)
3332 			continue;
3333 
3334 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO " %p %8" B_PRIdOFF "%8" B_PRId32 "\n",
3335 			vnode, vnode->device, vnode->id, vnode->cache,
3336 			(vnode->cache->virtual_end + B_PAGE_SIZE - 1) / B_PAGE_SIZE,
3337 			vnode->cache->page_count);
3338 	}
3339 
3340 	return 0;
3341 }
3342 
3343 
3344 int
3345 dump_io_context(int argc, char** argv)
3346 {
3347 	if (argc > 2 || (argc == 2 && !strcmp(argv[1], "--help"))) {
3348 		kprintf("usage: %s [team-id|address]\n", argv[0]);
3349 		return 0;
3350 	}
3351 
3352 	struct io_context* context = NULL;
3353 
3354 	if (argc > 1) {
3355 		ulong num = parse_expression(argv[1]);
3356 		if (IS_KERNEL_ADDRESS(num))
3357 			context = (struct io_context*)num;
3358 		else {
3359 			Team* team = team_get_team_struct_locked(num);
3360 			if (team == NULL) {
3361 				kprintf("could not find team with ID %lu\n", num);
3362 				return 0;
3363 			}
3364 			context = (struct io_context*)team->io_context;
3365 		}
3366 	} else
3367 		context = get_current_io_context(true);
3368 
3369 	kprintf("I/O CONTEXT: %p\n", context);
3370 	kprintf(" root vnode:\t%p\n", context->root);
3371 	kprintf(" cwd vnode:\t%p\n", context->cwd);
3372 	kprintf(" used fds:\t%" B_PRIu32 "\n", context->num_used_fds);
3373 	kprintf(" max fds:\t%" B_PRIu32 "\n", context->table_size);
3374 
3375 	if (context->num_used_fds) {
3376 		kprintf("   no.  type    %*s  ref  open  mode         pos    %*s\n",
3377 			B_PRINTF_POINTER_WIDTH, "ops", B_PRINTF_POINTER_WIDTH, "cookie");
3378 	}
3379 
3380 	for (uint32 i = 0; i < context->table_size; i++) {
3381 		struct file_descriptor* fd = context->fds[i];
3382 		if (fd == NULL)
3383 			continue;
3384 
3385 		kprintf("  %3" B_PRIu32 ":  %4" B_PRId32 "  %p  %3" B_PRId32 "  %4"
3386 			B_PRIu32 "  %4" B_PRIx32 "  %10" B_PRIdOFF "  %p  %s %p\n", i,
3387 			fd->type, fd->ops, fd->ref_count, fd->open_count, fd->open_mode,
3388 			fd->pos, fd->cookie,
3389 			fd->type >= FDTYPE_INDEX && fd->type <= FDTYPE_QUERY
3390 				? "mount" : "vnode",
3391 			fd->u.vnode);
3392 	}
3393 
3394 	kprintf(" used monitors:\t%" B_PRIu32 "\n", context->num_monitors);
3395 	kprintf(" max monitors:\t%" B_PRIu32 "\n", context->max_monitors);
3396 
3397 	set_debug_variable("_cwd", (addr_t)context->cwd);
3398 
3399 	return 0;
3400 }
3401 
3402 
3403 int
3404 dump_vnode_usage(int argc, char** argv)
3405 {
3406 	if (argc != 1) {
3407 		kprintf("usage: %s\n", argv[0]);
3408 		return 0;
3409 	}
3410 
3411 	kprintf("Unused vnodes: %" B_PRIu32 " (max unused %" B_PRIu32 ")\n",
3412 		sUnusedVnodes, kMaxUnusedVnodes);
3413 
3414 	uint32 count = sVnodeTable->CountElements();
3415 
3416 	kprintf("%" B_PRIu32 " vnodes total (%" B_PRIu32 " in use).\n", count,
3417 		count - sUnusedVnodes);
3418 	return 0;
3419 }
3420 
3421 #endif	// ADD_DEBUGGER_COMMANDS
3422 
3423 
3424 /*!	Clears memory specified by an iovec array.
3425 */
3426 static void
3427 zero_iovecs(const iovec* vecs, size_t vecCount, size_t bytes)
3428 {
3429 	for (size_t i = 0; i < vecCount && bytes > 0; i++) {
3430 		size_t length = std::min(vecs[i].iov_len, bytes);
3431 		memset(vecs[i].iov_base, 0, length);
3432 		bytes -= length;
3433 	}
3434 }
3435 
3436 
3437 /*!	Does the dirty work of combining the file_io_vecs with the iovecs
3438 	and calls the file system hooks to read/write the request to disk.
3439 */
3440 static status_t
3441 common_file_io_vec_pages(struct vnode* vnode, void* cookie,
3442 	const file_io_vec* fileVecs, size_t fileVecCount, const iovec* vecs,
3443 	size_t vecCount, uint32* _vecIndex, size_t* _vecOffset, size_t* _numBytes,
3444 	bool doWrite)
3445 {
3446 	if (fileVecCount == 0) {
3447 		// There are no file vecs at this offset, so we're obviously trying
3448 		// to access the file outside of its bounds
3449 		return B_BAD_VALUE;
3450 	}
3451 
3452 	size_t numBytes = *_numBytes;
3453 	uint32 fileVecIndex;
3454 	size_t vecOffset = *_vecOffset;
3455 	uint32 vecIndex = *_vecIndex;
3456 	status_t status;
3457 	size_t size;
3458 
3459 	if (!doWrite && vecOffset == 0) {
3460 		// now directly read the data from the device
3461 		// the first file_io_vec can be read directly
3462 
3463 		if (fileVecs[0].length < (off_t)numBytes)
3464 			size = fileVecs[0].length;
3465 		else
3466 			size = numBytes;
3467 
3468 		if (fileVecs[0].offset >= 0) {
3469 			status = FS_CALL(vnode, read_pages, cookie, fileVecs[0].offset,
3470 				&vecs[vecIndex], vecCount - vecIndex, &size);
3471 		} else {
3472 			// sparse read
3473 			zero_iovecs(&vecs[vecIndex], vecCount - vecIndex, size);
3474 			status = B_OK;
3475 		}
3476 		if (status != B_OK)
3477 			return status;
3478 
3479 		// TODO: this is a work-around for buggy device drivers!
3480 		//	When our own drivers honour the length, we can:
3481 		//	a) also use this direct I/O for writes (otherwise, it would
3482 		//	   overwrite precious data)
3483 		//	b) panic if the term below is true (at least for writes)
3484 		if ((off_t)size > fileVecs[0].length) {
3485 			//dprintf("warning: device driver %p doesn't respect total length "
3486 			//	"in read_pages() call!\n", ref->device);
3487 			size = fileVecs[0].length;
3488 		}
3489 
3490 		ASSERT((off_t)size <= fileVecs[0].length);
3491 
3492 		// If the file portion was contiguous, we're already done now
3493 		if (size == numBytes)
3494 			return B_OK;
3495 
3496 		// if we reached the end of the file, we can return as well
3497 		if ((off_t)size != fileVecs[0].length) {
3498 			*_numBytes = size;
3499 			return B_OK;
3500 		}
3501 
3502 		fileVecIndex = 1;
3503 
3504 		// first, find out where we have to continue in our iovecs
3505 		for (; vecIndex < vecCount; vecIndex++) {
3506 			if (size < vecs[vecIndex].iov_len)
3507 				break;
3508 
3509 			size -= vecs[vecIndex].iov_len;
3510 		}
3511 
3512 		vecOffset = size;
3513 	} else {
3514 		fileVecIndex = 0;
3515 		size = 0;
3516 	}
3517 
3518 	// Too bad, let's process the rest of the file_io_vecs
3519 
3520 	size_t totalSize = size;
3521 	size_t bytesLeft = numBytes - size;
3522 
3523 	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
3524 		const file_io_vec &fileVec = fileVecs[fileVecIndex];
3525 		off_t fileOffset = fileVec.offset;
3526 		off_t fileLeft = min_c(fileVec.length, (off_t)bytesLeft);
3527 
3528 		TRACE(("FILE VEC [%" B_PRIu32 "] length %" B_PRIdOFF "\n", fileVecIndex,
3529 			fileLeft));
3530 
3531 		// process the complete fileVec
3532 		while (fileLeft > 0) {
3533 			iovec tempVecs[MAX_TEMP_IO_VECS];
3534 			uint32 tempCount = 0;
3535 
3536 			// size tracks how much of what is left of the current fileVec
3537 			// (fileLeft) has been assigned to tempVecs
3538 			size = 0;
3539 
3540 			// assign what is left of the current fileVec to the tempVecs
3541 			for (size = 0; (off_t)size < fileLeft && vecIndex < vecCount
3542 					&& tempCount < MAX_TEMP_IO_VECS;) {
3543 				// try to satisfy one iovec per iteration (or as much as
3544 				// possible)
3545 
3546 				// bytes left of the current iovec
3547 				size_t vecLeft = vecs[vecIndex].iov_len - vecOffset;
3548 				if (vecLeft == 0) {
3549 					vecOffset = 0;
3550 					vecIndex++;
3551 					continue;
3552 				}
3553 
3554 				TRACE(("fill vec %" B_PRIu32 ", offset = %lu, size = %lu\n",
3555 					vecIndex, vecOffset, size));
3556 
3557 				// actually available bytes
3558 				size_t tempVecSize = min_c(vecLeft, fileLeft - size);
3559 
3560 				tempVecs[tempCount].iov_base
3561 					= (void*)((addr_t)vecs[vecIndex].iov_base + vecOffset);
3562 				tempVecs[tempCount].iov_len = tempVecSize;
3563 				tempCount++;
3564 
3565 				size += tempVecSize;
3566 				vecOffset += tempVecSize;
3567 			}
3568 
3569 			size_t bytes = size;
3570 
3571 			if (fileOffset == -1) {
3572 				if (doWrite) {
3573 					panic("sparse write attempt: vnode %p", vnode);
3574 					status = B_IO_ERROR;
3575 				} else {
3576 					// sparse read
3577 					zero_iovecs(tempVecs, tempCount, bytes);
3578 					status = B_OK;
3579 				}
3580 			} else if (doWrite) {
3581 				status = FS_CALL(vnode, write_pages, cookie, fileOffset,
3582 					tempVecs, tempCount, &bytes);
3583 			} else {
3584 				status = FS_CALL(vnode, read_pages, cookie, fileOffset,
3585 					tempVecs, tempCount, &bytes);
3586 			}
3587 			if (status != B_OK)
3588 				return status;
3589 
3590 			totalSize += bytes;
3591 			bytesLeft -= size;
3592 			if (fileOffset >= 0)
3593 				fileOffset += size;
3594 			fileLeft -= size;
3595 			//dprintf("-> file left = %Lu\n", fileLeft);
3596 
3597 			if (size != bytes || vecIndex >= vecCount) {
3598 				// there are no more bytes or iovecs, let's bail out
3599 				*_numBytes = totalSize;
3600 				return B_OK;
3601 			}
3602 		}
3603 	}
3604 
3605 	*_vecIndex = vecIndex;
3606 	*_vecOffset = vecOffset;
3607 	*_numBytes = totalSize;
3608 	return B_OK;
3609 }
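
// In the function above, each file_io_vec describes where a run of the file
// lives on disk (offset/length, with a negative offset denoting a sparse
// run), while the iovecs describe the memory side. Both arrays are walked
// in lockstep, carving each file run into at most MAX_TEMP_IO_VECS
// temporary iovecs per read_pages()/write_pages() call.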
3610 
3611 
3612 static bool
3613 is_user_in_group(gid_t gid)
3614 {
3615 	if (gid == getegid())
3616 		return true;
3617 
3618 	gid_t groups[NGROUPS_MAX];
3619 	int groupCount = getgroups(NGROUPS_MAX, groups);
3620 	for (int i = 0; i < groupCount; i++) {
3621 		if (gid == groups[i])
3622 			return true;
3623 	}
3624 
3625 	return false;
3626 }
3627 
3628 
3629 static status_t
3630 free_io_context(io_context* context)
3631 {
3632 	uint32 i;
3633 
3634 	TIOC(FreeIOContext(context));
3635 
3636 	if (context->root)
3637 		put_vnode(context->root);
3638 
3639 	if (context->cwd)
3640 		put_vnode(context->cwd);
3641 
3642 	mutex_lock(&context->io_mutex);
3643 
3644 	for (i = 0; i < context->table_size; i++) {
3645 		if (struct file_descriptor* descriptor = context->fds[i]) {
3646 			close_fd(descriptor);
3647 			put_fd(descriptor);
3648 		}
3649 	}
3650 
3651 	mutex_destroy(&context->io_mutex);
3652 
3653 	remove_node_monitors(context);
3654 	free(context->fds);
3655 	free(context);
3656 
3657 	return B_OK;
3658 }
3659 
3660 
3661 static status_t
3662 resize_monitor_table(struct io_context* context, const int newSize)
3663 {
3664 	status_t status = B_OK;
3665 
3666 	if (newSize <= 0 || newSize > MAX_NODE_MONITORS)
3667 		return B_BAD_VALUE;
3668 
3669 	mutex_lock(&context->io_mutex);
3670 
3671 	if ((size_t)newSize < context->num_monitors) {
3672 		status = B_BUSY;
3673 		goto out;
3674 	}
3675 	context->max_monitors = newSize;
3676 
3677 out:
3678 	mutex_unlock(&context->io_mutex);
3679 	return status;
3680 }
3681 
3682 
3683 //	#pragma mark - public API for file systems
3684 
3685 
3686 extern "C" status_t
3687 new_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3688 	fs_vnode_ops* ops)
3689 {
3690 	FUNCTION(("new_vnode(volume = %p (%" B_PRId32 "), vnodeID = %" B_PRId64
3691 		", node = %p)\n", volume, volume->id, vnodeID, privateNode));
3692 
3693 	if (privateNode == NULL)
3694 		return B_BAD_VALUE;
3695 
3696 	int32 tries = BUSY_VNODE_RETRIES;
3697 restart:
3698 	// create the node
3699 	bool nodeCreated;
3700 	struct vnode* vnode;
3701 	status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3702 		nodeCreated);
3703 	if (status != B_OK)
3704 		return status;
3705 
3706 	WriteLocker nodeLocker(sVnodeLock, true);
3707 		// create_new_vnode_and_lock() has locked for us
3708 
3709 	if (!nodeCreated && vnode->IsBusy()) {
3710 		nodeLocker.Unlock();
3711 		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3712 			return B_BUSY;
3713 		goto restart;
3714 	}
3715 
3716 	// file system integrity check:
3717 	// test if the vnode already exists and bail out if this is the case!
3718 	if (!nodeCreated) {
3719 		panic("vnode %" B_PRIdDEV ":%" B_PRIdINO " already exists (node = %p, "
3720 			"vnode->node = %p)!", volume->id, vnodeID, privateNode,
3721 			vnode->private_node);
3722 		return B_ERROR;
3723 	}
3724 
3725 	vnode->private_node = privateNode;
3726 	vnode->ops = ops;
3727 	vnode->SetUnpublished(true);
3728 
3729 	TRACE(("returns: %s\n", strerror(status)));
3730 
3731 	return status;
3732 }
3733 
3734 
3735 extern "C" status_t
3736 publish_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3737 	fs_vnode_ops* ops, int type, uint32 flags)
3738 {
3739 	FUNCTION(("publish_vnode()\n"));
3740 
3741 	int32 tries = BUSY_VNODE_RETRIES;
3742 restart:
3743 	WriteLocker locker(sVnodeLock);
3744 
3745 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3746 
3747 	bool nodeCreated = false;
3748 	if (vnode == NULL) {
3749 		if (privateNode == NULL)
3750 			return B_BAD_VALUE;
3751 
3752 		// create the node
3753 		locker.Unlock();
3754 			// create_new_vnode_and_lock() will re-lock for us on success
3755 		status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3756 			nodeCreated);
3757 		if (status != B_OK)
3758 			return status;
3759 
3760 		locker.SetTo(sVnodeLock, true);
3761 	}
3762 
3763 	if (nodeCreated) {
3764 		vnode->private_node = privateNode;
3765 		vnode->ops = ops;
3766 		vnode->SetUnpublished(true);
3767 	} else if (vnode->IsBusy() && vnode->IsUnpublished()
3768 		&& vnode->private_node == privateNode && vnode->ops == ops) {
3769 		// already known, but not published
3770 	} else if (vnode->IsBusy()) {
3771 		locker.Unlock();
3772 		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3773 			return B_BUSY;
3774 		goto restart;
3775 	} else
3776 		return B_BAD_VALUE;
3777 
3778 	bool publishSpecialSubNode = false;
3779 
3780 	vnode->SetType(type);
3781 	vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
3782 	publishSpecialSubNode = is_special_node_type(type)
3783 		&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
3784 
3785 	status_t status = B_OK;
3786 
3787 	// create sub vnodes, if necessary
3788 	if (volume->sub_volume != NULL || publishSpecialSubNode) {
3789 		locker.Unlock();
3790 
3791 		fs_volume* subVolume = volume;
3792 		if (volume->sub_volume != NULL) {
3793 			while (status == B_OK && subVolume->sub_volume != NULL) {
3794 				subVolume = subVolume->sub_volume;
3795 				status = subVolume->ops->create_sub_vnode(subVolume, vnodeID,
3796 					vnode);
3797 			}
3798 		}
3799 
3800 		if (status == B_OK && publishSpecialSubNode)
3801 			status = create_special_sub_node(vnode, flags);
3802 
3803 		if (status != B_OK) {
3804 			// error -- clean up the created sub vnodes
3805 			while (subVolume->super_volume != volume) {
3806 				subVolume = subVolume->super_volume;
3807 				subVolume->ops->delete_sub_vnode(subVolume, vnode);
3808 			}
3809 		}
3810 
3811 		if (status == B_OK) {
3812 			ReadLocker vnodesReadLocker(sVnodeLock);
3813 			AutoLocker<Vnode> nodeLocker(vnode);
3814 			vnode->SetBusy(false);
3815 			vnode->SetUnpublished(false);
3816 		} else {
3817 			locker.Lock();
3818 			sVnodeTable->Remove(vnode);
3819 			remove_vnode_from_mount_list(vnode, vnode->mount);
3820 			free(vnode);
3821 		}
3822 	} else {
3823 		// we still hold the write lock -- mark the node unbusy and published
3824 		vnode->SetBusy(false);
3825 		vnode->SetUnpublished(false);
3826 	}
3827 
3828 	TRACE(("returns: %s\n", strerror(status)));
3829 
3830 	return status;
3831 }
3832 
3833 
3834 extern "C" status_t
3835 get_vnode(fs_volume* volume, ino_t vnodeID, void** _privateNode)
3836 {
3837 	struct vnode* vnode;
3838 
3839 	if (volume == NULL)
3840 		return B_BAD_VALUE;
3841 
3842 	status_t status = get_vnode(volume->id, vnodeID, &vnode, true, true);
3843 	if (status != B_OK)
3844 		return status;
3845 
3846 	// If this is a layered FS, we need to get the node cookie for the requested
3847 	// layer.
3848 	if (HAS_FS_CALL(vnode, get_super_vnode)) {
3849 		fs_vnode resolvedNode;
3850 		status_t status = FS_CALL(vnode, get_super_vnode, volume,
3851 			&resolvedNode);
3852 		if (status != B_OK) {
3853 			panic("get_vnode(): Failed to get super node for vnode %p, "
3854 				"volume: %p", vnode, volume);
3855 			put_vnode(vnode);
3856 			return status;
3857 		}
3858 
3859 		if (_privateNode != NULL)
3860 			*_privateNode = resolvedNode.private_node;
3861 	} else if (_privateNode != NULL)
3862 		*_privateNode = vnode->private_node;
3863 
3864 	return B_OK;
3865 }
3866 
3867 
3868 extern "C" status_t
3869 acquire_vnode(fs_volume* volume, ino_t vnodeID)
3870 {
3871 	struct vnode* vnode;
3872 
3873 	rw_lock_read_lock(&sVnodeLock);
3874 	vnode = lookup_vnode(volume->id, vnodeID);
3875 	rw_lock_read_unlock(&sVnodeLock);
3876 
3877 	if (vnode == NULL)
3878 		return B_BAD_VALUE;
3879 
3880 	inc_vnode_ref_count(vnode);
3881 	return B_OK;
3882 }
3883 
3884 
3885 extern "C" status_t
3886 put_vnode(fs_volume* volume, ino_t vnodeID)
3887 {
3888 	struct vnode* vnode;
3889 
3890 	rw_lock_read_lock(&sVnodeLock);
3891 	vnode = lookup_vnode(volume->id, vnodeID);
3892 	rw_lock_read_unlock(&sVnodeLock);
3893 
3894 	if (vnode == NULL)
3895 		return B_BAD_VALUE;
3896 
3897 	dec_vnode_ref_count(vnode, false, true);
3898 	return B_OK;
3899 }
3900 
3901 
3902 extern "C" status_t
3903 remove_vnode(fs_volume* volume, ino_t vnodeID)
3904 {
3905 	ReadLocker locker(sVnodeLock);
3906 
3907 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3908 	if (vnode == NULL)
3909 		return B_ENTRY_NOT_FOUND;
3910 
3911 	if (vnode->covered_by != NULL || vnode->covers != NULL) {
3912 		// this vnode is in use
3913 		return B_BUSY;
3914 	}
3915 
3916 	vnode->Lock();
3917 
3918 	vnode->SetRemoved(true);
3919 	bool removeUnpublished = false;
3920 
3921 	if (vnode->IsUnpublished()) {
3922 		// prepare the vnode for deletion
3923 		removeUnpublished = true;
3924 		vnode->SetBusy(true);
3925 	}
3926 
3927 	vnode->Unlock();
3928 	locker.Unlock();
3929 
3930 	if (removeUnpublished) {
3931 		// If the vnode hasn't been published yet, we delete it here
3932 		atomic_add(&vnode->ref_count, -1);
3933 		free_vnode(vnode, true);
3934 	}
3935 
3936 	return B_OK;
3937 }
3938 
3939 
3940 extern "C" status_t
3941 unremove_vnode(fs_volume* volume, ino_t vnodeID)
3942 {
3943 	struct vnode* vnode;
3944 
3945 	rw_lock_read_lock(&sVnodeLock);
3946 
3947 	vnode = lookup_vnode(volume->id, vnodeID);
3948 	if (vnode) {
3949 		AutoLocker<Vnode> nodeLocker(vnode);
3950 		vnode->SetRemoved(false);
3951 	}
3952 
3953 	rw_lock_read_unlock(&sVnodeLock);
3954 	return B_OK;
3955 }
3956 
3957 
3958 extern "C" status_t
3959 get_vnode_removed(fs_volume* volume, ino_t vnodeID, bool* _removed)
3960 {
3961 	ReadLocker _(sVnodeLock);
3962 
3963 	if (struct vnode* vnode = lookup_vnode(volume->id, vnodeID)) {
3964 		if (_removed != NULL)
3965 			*_removed = vnode->IsRemoved();
3966 		return B_OK;
3967 	}
3968 
3969 	return B_BAD_VALUE;
3970 }
3971 
3972 
3973 extern "C" fs_volume*
3974 volume_for_vnode(fs_vnode* _vnode)
3975 {
3976 	if (_vnode == NULL)
3977 		return NULL;
3978 
3979 	struct vnode* vnode = static_cast<struct vnode*>(_vnode);
3980 	return vnode->mount->volume;
3981 }
3982 
3983 
3984 extern "C" status_t
3985 check_access_permissions(int accessMode, mode_t mode, gid_t nodeGroupID,
3986 	uid_t nodeUserID)
3987 {
3988 	// get node permissions
3989 	int userPermissions = (mode & S_IRWXU) >> 6;
3990 	int groupPermissions = (mode & S_IRWXG) >> 3;
3991 	int otherPermissions = mode & S_IRWXO;
3992 
3993 	// get the node permissions for this uid/gid
3994 	int permissions = 0;
3995 	uid_t uid = geteuid();
3996 
3997 	if (uid == 0) {
3998 		// user is root
3999 		// root has always read/write permission, but at least one of the
4000 		// X bits must be set for execute permission
4001 		permissions = userPermissions | groupPermissions | otherPermissions
4002 			| S_IROTH | S_IWOTH;
4003 		if (S_ISDIR(mode))
4004 			permissions |= S_IXOTH;
4005 	} else if (uid == nodeUserID) {
4006 		// user is node owner
4007 		permissions = userPermissions;
4008 	} else if (is_user_in_group(nodeGroupID)) {
4009 		// user is in owning group
4010 		permissions = groupPermissions;
4011 	} else {
4012 		// user is one of the others
4013 		permissions = otherPermissions;
4014 	}
4015 
4016 	return (accessMode & ~permissions) == 0 ? B_OK : B_PERMISSION_DENIED;
4017 }
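

// A minimal sketch (disabled) of how check_access_permissions() evaluates a
// request: for a hypothetical regular file with mode 0644, the owner may
// write, while a mere group member may not, since W_OK is not contained in
// the group permission bits.
#if 0
static status_t
example_check_write_access(gid_t nodeGroupID, uid_t nodeUserID)
{
	// userPermissions == 06, groupPermissions == 04, otherPermissions == 04;
	// (W_OK & ~permissions) == 0 holds for the owner only
	return check_access_permissions(W_OK, S_IFREG | 0644, nodeGroupID,
		nodeUserID);
}
#endif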
4018 
4019 
4020 #if 0
4021 extern "C" status_t
4022 read_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4023 	size_t* _numBytes)
4024 {
4025 	struct file_descriptor* descriptor;
4026 	struct vnode* vnode;
4027 
4028 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4029 	if (descriptor == NULL)
4030 		return B_FILE_ERROR;
4031 
4032 	status_t status = vfs_read_pages(vnode, descriptor->cookie, pos, vecs,
4033 		count, 0, _numBytes);
4034 
4035 	put_fd(descriptor);
4036 	return status;
4037 }
4038 
4039 
4040 extern "C" status_t
4041 write_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4042 	size_t* _numBytes)
4043 {
4044 	struct file_descriptor* descriptor;
4045 	struct vnode* vnode;
4046 
4047 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4048 	if (descriptor == NULL)
4049 		return B_FILE_ERROR;
4050 
4051 	status_t status = vfs_write_pages(vnode, descriptor->cookie, pos, vecs,
4052 		count, 0, _numBytes);
4053 
4054 	put_fd(descriptor);
4055 	return status;
4056 }
4057 #endif
4058 
4059 
4060 extern "C" status_t
4061 read_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4062 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4063 	size_t* _bytes)
4064 {
4065 	struct file_descriptor* descriptor;
4066 	struct vnode* vnode;
4067 
4068 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4069 	if (descriptor == NULL)
4070 		return B_FILE_ERROR;
4071 
4072 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4073 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4074 		false);
4075 
4076 	put_fd(descriptor);
4077 	return status;
4078 }
4079 
4080 
4081 extern "C" status_t
4082 write_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4083 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4084 	size_t* _bytes)
4085 {
4086 	struct file_descriptor* descriptor;
4087 	struct vnode* vnode;
4088 
4089 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4090 	if (descriptor == NULL)
4091 		return B_FILE_ERROR;
4092 
4093 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4094 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4095 		true);
4096 
4097 	put_fd(descriptor);
4098 	return status;
4099 }
4100 
4101 
4102 extern "C" status_t
4103 entry_cache_add(dev_t mountID, ino_t dirID, const char* name, ino_t nodeID)
4104 {
4105 	// lookup mount -- the caller is required to make sure that the mount
4106 	// won't go away
4107 	MutexLocker locker(sMountMutex);
4108 	struct fs_mount* mount = find_mount(mountID);
4109 	if (mount == NULL)
4110 		return B_BAD_VALUE;
4111 	locker.Unlock();
4112 
4113 	return mount->entry_cache.Add(dirID, name, nodeID, false);
4114 }
4115 
4116 
4117 extern "C" status_t
4118 entry_cache_add_missing(dev_t mountID, ino_t dirID, const char* name)
4119 {
4120 	// lookup mount -- the caller is required to make sure that the mount
4121 	// won't go away
4122 	MutexLocker locker(sMountMutex);
4123 	struct fs_mount* mount = find_mount(mountID);
4124 	if (mount == NULL)
4125 		return B_BAD_VALUE;
4126 	locker.Unlock();
4127 
4128 	return mount->entry_cache.Add(dirID, name, -1, true);
4129 }
4130 
4131 
4132 extern "C" status_t
4133 entry_cache_remove(dev_t mountID, ino_t dirID, const char* name)
4134 {
4135 	// lookup mount -- the caller is required to make sure that the mount
4136 	// won't go away
4137 	MutexLocker locker(sMountMutex);
4138 	struct fs_mount* mount = find_mount(mountID);
4139 	if (mount == NULL)
4140 		return B_BAD_VALUE;
4141 	locker.Unlock();
4142 
4143 	return mount->entry_cache.Remove(dirID, name);
4144 }
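

// A minimal sketch (disabled) showing how a file system might keep the entry
// cache coherent across a rename; the hook name and IDs are hypothetical.
#if 0
static void
example_entry_cache_on_rename(dev_t mountID, ino_t fromDirID,
	const char* fromName, ino_t toDirID, const char* toName, ino_t nodeID)
{
	// drop the old entry, publish the new one, and remember that the old
	// name is now missing
	entry_cache_remove(mountID, fromDirID, fromName);
	entry_cache_add(mountID, toDirID, toName, nodeID);
	entry_cache_add_missing(mountID, fromDirID, fromName);
}
#endif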
4145 
4146 
4147 //	#pragma mark - private VFS API
4148 //	Functions the VFS exports for other parts of the kernel
4149 
4150 
4151 /*! Acquires another reference to the vnode that has to be released
4152 	by calling vfs_put_vnode().
4153 */
4154 void
4155 vfs_acquire_vnode(struct vnode* vnode)
4156 {
4157 	inc_vnode_ref_count(vnode);
4158 }
4159 
4160 
4161 /*! This is currently called from file_cache_create() only.
4162 	It's probably a temporary solution as long as devfs requires that
4163 	fs_read_pages()/fs_write_pages() are called with the standard
4164 	open cookie and not with a device cookie.
4165 	If that's done differently, remove this call; it has no other
4166 	purpose.
4167 */
4168 extern "C" status_t
4169 vfs_get_cookie_from_fd(int fd, void** _cookie)
4170 {
4171 	struct file_descriptor* descriptor;
4172 
4173 	descriptor = get_fd(get_current_io_context(true), fd);
4174 	if (descriptor == NULL)
4175 		return B_FILE_ERROR;
4176 
4177 	*_cookie = descriptor->cookie;
4178 	return B_OK;
4179 }
4180 
4181 
4182 extern "C" status_t
4183 vfs_get_vnode_from_fd(int fd, bool kernel, struct vnode** vnode)
4184 {
4185 	*vnode = get_vnode_from_fd(fd, kernel);
4186 
4187 	if (*vnode == NULL)
4188 		return B_FILE_ERROR;
4189 
4190 	return B_OK;
4191 }
4192 
4193 
4194 extern "C" status_t
4195 vfs_get_vnode_from_path(const char* path, bool kernel, struct vnode** _vnode)
4196 {
4197 	TRACE(("vfs_get_vnode_from_path: entry. path = '%s', kernel %d\n",
4198 		path, kernel));
4199 
4200 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
4201 	if (pathBuffer.InitCheck() != B_OK)
4202 		return B_NO_MEMORY;
4203 
4204 	char* buffer = pathBuffer.LockBuffer();
4205 	strlcpy(buffer, path, pathBuffer.BufferSize());
4206 
4207 	struct vnode* vnode;
4208 	status_t status = path_to_vnode(buffer, true, &vnode, NULL, kernel);
4209 	if (status != B_OK)
4210 		return status;
4211 
4212 	*_vnode = vnode;
4213 	return B_OK;
4214 }
4215 
4216 
4217 extern "C" status_t
4218 vfs_get_vnode(dev_t mountID, ino_t vnodeID, bool canWait, struct vnode** _vnode)
4219 {
4220 	struct vnode* vnode = NULL;
4221 
4222 	status_t status = get_vnode(mountID, vnodeID, &vnode, canWait, false);
4223 	if (status != B_OK)
4224 		return status;
4225 
4226 	*_vnode = vnode;
4227 	return B_OK;
4228 }
4229 
4230 
4231 extern "C" status_t
4232 vfs_entry_ref_to_vnode(dev_t mountID, ino_t directoryID,
4233 	const char* name, struct vnode** _vnode)
4234 {
4235 	return entry_ref_to_vnode(mountID, directoryID, name, false, true, _vnode);
4236 }
4237 
4238 
4239 extern "C" void
4240 vfs_vnode_to_node_ref(struct vnode* vnode, dev_t* _mountID, ino_t* _vnodeID)
4241 {
4242 	*_mountID = vnode->device;
4243 	*_vnodeID = vnode->id;
4244 }
4245 
4246 
4247 /*!
4248 	Helper function abstracting the process of "converting" a given
4249 	vnode-pointer to a fs_vnode-pointer.
4250 	Currently only used in bindfs.
4251 */
4252 extern "C" fs_vnode*
4253 vfs_fsnode_for_vnode(struct vnode* vnode)
4254 {
4255 	return vnode;
4256 }
4257 
4258 
4259 /*!
4260 	Calls fs_open() on the given vnode and returns a new
4261 	file descriptor for it
4262 */
4263 int
4264 vfs_open_vnode(struct vnode* vnode, int openMode, bool kernel)
4265 {
4266 	return open_vnode(vnode, openMode, kernel);
4267 }
4268 
4269 
4270 /*!	Looks up a vnode with the given mount and vnode ID.
4271 	Must only be used with "in-use" vnodes as it doesn't grab a reference
4272 	to the node.
4273 	It's currently only used by file_cache_create().
4274 */
4275 extern "C" status_t
4276 vfs_lookup_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode)
4277 {
4278 	rw_lock_read_lock(&sVnodeLock);
4279 	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
4280 	rw_lock_read_unlock(&sVnodeLock);
4281 
4282 	if (vnode == NULL)
4283 		return B_ERROR;
4284 
4285 	*_vnode = vnode;
4286 	return B_OK;
4287 }
4288 
4289 
4290 extern "C" status_t
4291 vfs_get_fs_node_from_path(fs_volume* volume, const char* path,
4292 	bool traverseLeafLink, bool kernel, void** _node)
4293 {
4294 	TRACE(("vfs_get_fs_node_from_path(volume = %p, path = \"%s\", kernel %d)\n",
4295 		volume, path, kernel));
4296 
4297 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
4298 	if (pathBuffer.InitCheck() != B_OK)
4299 		return B_NO_MEMORY;
4300 
4301 	fs_mount* mount;
4302 	status_t status = get_mount(volume->id, &mount);
4303 	if (status != B_OK)
4304 		return status;
4305 
4306 	char* buffer = pathBuffer.LockBuffer();
4307 	strlcpy(buffer, path, pathBuffer.BufferSize());
4308 
4309 	struct vnode* vnode = mount->root_vnode;
4310 
4311 	if (buffer[0] == '/')
4312 		status = path_to_vnode(buffer, traverseLeafLink, &vnode, NULL, kernel);
4313 	else {
4314 		inc_vnode_ref_count(vnode);
4315 			// vnode_path_to_vnode() releases a reference to the starting vnode
4316 		status = vnode_path_to_vnode(vnode, buffer, traverseLeafLink, 0,
4317 			kernel, &vnode, NULL);
4318 	}
4319 
4320 	put_mount(mount);
4321 
4322 	if (status != B_OK)
4323 		return status;
4324 
4325 	if (vnode->device != volume->id) {
4326 		// wrong mount ID - must not gain access on foreign file system nodes
4327 		put_vnode(vnode);
4328 		return B_BAD_VALUE;
4329 	}
4330 
4331 	// Use get_vnode() to resolve the cookie for the right layer.
4332 	status = get_vnode(volume, vnode->id, _node);
4333 	put_vnode(vnode);
4334 
4335 	return status;
4336 }
4337 
4338 
4339 status_t
4340 vfs_read_stat(int fd, const char* path, bool traverseLeafLink,
4341 	struct stat* stat, bool kernel)
4342 {
4343 	status_t status;
4344 
4345 	if (path != NULL) {
4346 		// path given: get the stat of the node referred to by (fd, path)
4347 		KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
4348 		if (pathBuffer.InitCheck() != B_OK)
4349 			return B_NO_MEMORY;
4350 
4351 		status = common_path_read_stat(fd, pathBuffer.LockBuffer(),
4352 			traverseLeafLink, stat, kernel);
4353 	} else {
4354 		// no path given: get the FD and use the FD operation
4355 		struct file_descriptor* descriptor
4356 			= get_fd(get_current_io_context(kernel), fd);
4357 		if (descriptor == NULL)
4358 			return B_FILE_ERROR;
4359 
4360 		if (descriptor->ops->fd_read_stat)
4361 			status = descriptor->ops->fd_read_stat(descriptor, stat);
4362 		else
4363 			status = B_UNSUPPORTED;
4364 
4365 		put_fd(descriptor);
4366 	}
4367 
4368 	return status;
4369 }
4370 
4371 
4372 /*!	Finds the full path to the file that contains the module \a moduleName,
4373 	puts it into \a pathBuffer, and returns \c B_OK for success.
4374 	If \a pathBuffer was too small, it returns \c B_BUFFER_OVERFLOW, or
4375 	\c B_ENTRY_NOT_FOUND if no file could be found.
4376 	\a pathBuffer is clobbered in any case and must not be relied on if this
4377 	function returns unsuccessfully.
4378 	\a basePath and \a pathBuffer must not point to the same space.
4379 */
4380 status_t
4381 vfs_get_module_path(const char* basePath, const char* moduleName,
4382 	char* pathBuffer, size_t bufferSize)
4383 {
4384 	struct vnode* dir;
4385 	struct vnode* file;
4386 	status_t status;
4387 	size_t length;
4388 	char* path;
4389 
4390 	if (bufferSize == 0
4391 		|| strlcpy(pathBuffer, basePath, bufferSize) >= bufferSize)
4392 		return B_BUFFER_OVERFLOW;
4393 
4394 	status = path_to_vnode(pathBuffer, true, &dir, NULL, true);
4395 	if (status != B_OK)
4396 		return status;
4397 
4398 	// the path buffer had been clobbered by the above call
4399 	length = strlcpy(pathBuffer, basePath, bufferSize);
4400 	if (pathBuffer[length - 1] != '/')
4401 		pathBuffer[length++] = '/';
4402 
4403 	path = pathBuffer + length;
4404 	bufferSize -= length;
4405 
4406 	while (moduleName) {
4407 		char* nextPath = strchr(moduleName, '/');
4408 		if (nextPath == NULL)
4409 			length = strlen(moduleName);
4410 		else {
4411 			length = nextPath - moduleName;
4412 			nextPath++;
4413 		}
4414 
4415 		if (length + 1 >= bufferSize) {
4416 			status = B_BUFFER_OVERFLOW;
4417 			goto err;
4418 		}
4419 
4420 		memcpy(path, moduleName, length);
4421 		path[length] = '\0';
4422 		moduleName = nextPath;
4423 
4424 		status = vnode_path_to_vnode(dir, path, true, 0, true, &file, NULL);
4425 		if (status != B_OK) {
4426 			// vnode_path_to_vnode() has already released the reference to dir
4427 			return status;
4428 		}
4429 
4430 		if (S_ISDIR(file->Type())) {
4431 			// descend into the next directory
4432 			path[length] = '/';
4433 			path[length + 1] = '\0';
4434 			path += length + 1;
4435 			bufferSize -= length + 1;
4436 
4437 			dir = file;
4438 		} else if (S_ISREG(file->Type())) {
4439 			// it's a file so it should be what we've searched for
4440 			put_vnode(file);
4441 
4442 			return B_OK;
4443 		} else {
4444 			TRACE(("vfs_get_module_path(): something is strange here: "
4445 				"0x%08" B_PRIx32 "...\n", file->Type()));
4446 			status = B_ERROR;
4447 			dir = file;
4448 			goto err;
4449 		}
4450 	}
4451 
4452 	// if we got here, the moduleName just pointed to a directory, not to
4453 	// a real module - what should we do in this case?
4454 	status = B_ENTRY_NOT_FOUND;
4455 
4456 err:
4457 	put_vnode(dir);
4458 	return status;
4459 }
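

// A minimal sketch (disabled) of a vfs_get_module_path() call; the base path
// and module name here are hypothetical examples, not fixed by this
// interface.
#if 0
static status_t
example_find_module_file(char* pathBuffer, size_t bufferSize)
{
	// walks "bus_managers/pci/v1" component by component below the base
	// path until a regular file is found
	return vfs_get_module_path("/boot/system/add-ons/kernel",
		"bus_managers/pci/v1", pathBuffer, bufferSize);
}
#endif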
4460 
4461 
4462 /*!	\brief Normalizes a given path.
4463 
4464 	The path must refer to an existing or non-existing entry in an existing
4465 	directory, that is, after chopping off the leaf component the remaining
4466 	path must refer to an existing directory.
4467 
4468 	The returned path will be canonical in that it will be absolute, will not
4469 	contain any "." or ".." components or duplicate occurrences of '/'s,
4470 	and none of the directory components will be symbolic links.
4471 
4472 	Any two paths referring to the same entry will result in the same
4473 	normalized path (well, that is pretty much the definition of `normalized',
4474 	isn't it :-).
4475 
4476 	\param path The path to be normalized.
4477 	\param buffer The buffer into which the normalized path will be written.
4478 		   May be the same one as \a path.
4479 	\param bufferSize The size of \a buffer.
4480 	\param traverseLink If \c true, the function also resolves leaf symlinks.
4481 	\param kernel \c true, if the IO context of the kernel shall be used,
4482 		   otherwise that of the team this thread belongs to. Only relevant,
4483 		   if the path is relative (to get the CWD).
4484 	\return \c B_OK if everything went fine, another error code otherwise.
4485 */
4486 status_t
4487 vfs_normalize_path(const char* path, char* buffer, size_t bufferSize,
4488 	bool traverseLink, bool kernel)
4489 {
4490 	if (!path || !buffer || bufferSize < 1)
4491 		return B_BAD_VALUE;
4492 
4493 	if (path != buffer) {
4494 		if (strlcpy(buffer, path, bufferSize) >= bufferSize)
4495 			return B_BUFFER_OVERFLOW;
4496 	}
4497 
4498 	return normalize_path(buffer, bufferSize, traverseLink, kernel);
4499 }
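

// A minimal sketch (disabled): the example path is hypothetical, but its
// "."/".." components and duplicate slashes would be resolved as described
// above, yielding e.g. "/boot/system/bin".
#if 0
static status_t
example_normalize(char* buffer, size_t bufferSize)
{
	return vfs_normalize_path("/boot/./system//lib/../bin", buffer,
		bufferSize, true, true);
}
#endif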
4500 
4501 
4502 /*!	\brief Gets the parent of the passed in node.
4503 
4504 	Gets the parent of the passed in node, and correctly resolves covered
4505 	nodes.
4506 */
4507 extern "C" status_t
4508 vfs_resolve_parent(struct vnode* parent, dev_t* device, ino_t* node)
4509 {
4510 	return resolve_covered_parent(parent, device, node,
4511 		get_current_io_context(true));
4512 }
4513 
4514 
4515 /*!	\brief Creates a special node in the file system.
4516 
4517 	The caller gets a reference to the newly created node (which is passed
4518 	back through \a _createdVnode) and is responsible for releasing it.
4519 
4520 	\param path The path where to create the entry for the node. Can be \c NULL,
4521 		in which case the node is created without an entry in the root FS -- it
4522 		will automatically be deleted when the last reference has been released.
4523 	\param subVnode The definition of the subnode. Can be \c NULL, in which case
4524 		the target file system will just create the node with its standard
4525 		operations. Depending on the type of the node a subnode might be created
4526 		automatically, though.
4527 	\param mode The type and permissions for the node to be created.
4528 	\param flags Flags to be passed to the creating FS.
4529 	\param kernel \c true, if called in the kernel context (relevant only if
4530 		\a path is not \c NULL and not absolute).
4531 	\param _superVnode Pointer to a pre-allocated structure to be filled by the
4532 		file system creating the node, with the private data pointer and
4533 		operations for the super node. Can be \c NULL.
4534 	\param _createdVnode Pointer to pre-allocated storage where to store the
4535 		pointer to the newly created node.
4536 	\return \c B_OK, if everything went fine, another error code otherwise.
4537 */
4538 status_t
4539 vfs_create_special_node(const char* path, fs_vnode* subVnode, mode_t mode,
4540 	uint32 flags, bool kernel, fs_vnode* _superVnode,
4541 	struct vnode** _createdVnode)
4542 {
4543 	struct vnode* dirNode;
4544 	char _leaf[B_FILE_NAME_LENGTH];
4545 	char* leaf = NULL;
4546 
4547 	if (path) {
4548 		// We've got a path. Get the dir vnode and the leaf name.
4549 		KPath tmpPathBuffer(B_PATH_NAME_LENGTH + 1);
4550 		if (tmpPathBuffer.InitCheck() != B_OK)
4551 			return B_NO_MEMORY;
4552 
4553 		char* tmpPath = tmpPathBuffer.LockBuffer();
4554 		if (strlcpy(tmpPath, path, B_PATH_NAME_LENGTH) >= B_PATH_NAME_LENGTH)
4555 			return B_NAME_TOO_LONG;
4556 
4557 		// get the dir vnode and the leaf name
4558 		leaf = _leaf;
4559 		status_t error = path_to_dir_vnode(tmpPath, &dirNode, leaf, kernel);
4560 		if (error != B_OK)
4561 			return error;
4562 	} else {
4563 		// No path. Create the node in the root FS.
4564 		dirNode = sRoot;
4565 		inc_vnode_ref_count(dirNode);
4566 	}
4567 
4568 	VNodePutter _(dirNode);
4569 
4570 	// check support for creating special nodes
4571 	if (!HAS_FS_CALL(dirNode, create_special_node))
4572 		return B_UNSUPPORTED;
4573 
4574 	// create the node
4575 	fs_vnode superVnode;
4576 	ino_t nodeID;
4577 	status_t status = FS_CALL(dirNode, create_special_node, leaf, subVnode,
4578 		mode, flags, _superVnode != NULL ? _superVnode : &superVnode, &nodeID);
4579 	if (status != B_OK)
4580 		return status;
4581 
4582 	// lookup the node
4583 	rw_lock_read_lock(&sVnodeLock);
4584 	*_createdVnode = lookup_vnode(dirNode->mount->id, nodeID);
4585 	rw_lock_read_unlock(&sVnodeLock);
4586 
4587 	if (*_createdVnode == NULL) {
4588 		panic("vfs_create_special_node(): lookup of node failed");
4589 		return B_ERROR;
4590 	}
4591 
4592 	return B_OK;
4593 }
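

// A minimal sketch (disabled) creating a FIFO via vfs_create_special_node();
// the path is hypothetical. The returned vnode reference would have to be
// released with vfs_put_vnode() eventually.
#if 0
static status_t
example_create_fifo(struct vnode** _vnode)
{
	return vfs_create_special_node("/var/example_fifo", NULL,
		S_IFIFO | 0666, 0, true, NULL, _vnode);
}
#endif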
4594 
4595 
4596 extern "C" void
4597 vfs_put_vnode(struct vnode* vnode)
4598 {
4599 	put_vnode(vnode);
4600 }
4601 
4602 
4603 extern "C" status_t
4604 vfs_get_cwd(dev_t* _mountID, ino_t* _vnodeID)
4605 {
4606 	// Get current working directory from io context
4607 	struct io_context* context = get_current_io_context(false);
4608 	status_t status = B_OK;
4609 
4610 	mutex_lock(&context->io_mutex);
4611 
4612 	if (context->cwd != NULL) {
4613 		*_mountID = context->cwd->device;
4614 		*_vnodeID = context->cwd->id;
4615 	} else
4616 		status = B_ERROR;
4617 
4618 	mutex_unlock(&context->io_mutex);
4619 	return status;
4620 }
4621 
4622 
4623 status_t
4624 vfs_unmount(dev_t mountID, uint32 flags)
4625 {
4626 	return fs_unmount(NULL, mountID, flags, true);
4627 }
4628 
4629 
4630 extern "C" status_t
4631 vfs_disconnect_vnode(dev_t mountID, ino_t vnodeID)
4632 {
4633 	struct vnode* vnode;
4634 
4635 	status_t status = get_vnode(mountID, vnodeID, &vnode, true, true);
4636 	if (status != B_OK)
4637 		return status;
4638 
4639 	disconnect_mount_or_vnode_fds(vnode->mount, vnode);
4640 	put_vnode(vnode);
4641 	return B_OK;
4642 }
4643 
4644 
4645 extern "C" void
4646 vfs_free_unused_vnodes(int32 level)
4647 {
4648 	vnode_low_resource_handler(NULL,
4649 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
4650 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
4651 		level);
4652 }
4653 
4654 
4655 extern "C" bool
4656 vfs_can_page(struct vnode* vnode, void* cookie)
4657 {
4658 	FUNCTION(("vfs_canpage: vnode %p\n", vnode));
4659 
4660 	if (HAS_FS_CALL(vnode, can_page))
4661 		return FS_CALL(vnode, can_page, cookie);
4662 	return false;
4663 }
4664 
4665 
4666 extern "C" status_t
4667 vfs_read_pages(struct vnode* vnode, void* cookie, off_t pos,
4668 	const generic_io_vec* vecs, size_t count, uint32 flags,
4669 	generic_size_t* _numBytes)
4670 {
4671 	FUNCTION(("vfs_read_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4672 		vecs, pos));
4673 
4674 #if VFS_PAGES_IO_TRACING
4675 	generic_size_t bytesRequested = *_numBytes;
4676 #endif
4677 
4678 	IORequest request;
4679 	status_t status = request.Init(pos, vecs, count, *_numBytes, false, flags);
4680 	if (status == B_OK) {
4681 		status = vfs_vnode_io(vnode, cookie, &request);
4682 		if (status == B_OK)
4683 			status = request.Wait();
4684 		*_numBytes = request.TransferredBytes();
4685 	}
4686 
4687 	TPIO(ReadPages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4688 		status, *_numBytes));
4689 
4690 	return status;
4691 }
4692 
4693 
4694 extern "C" status_t
4695 vfs_write_pages(struct vnode* vnode, void* cookie, off_t pos,
4696 	const generic_io_vec* vecs, size_t count, uint32 flags,
4697 	generic_size_t* _numBytes)
4698 {
4699 	FUNCTION(("vfs_write_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4700 		vecs, pos));
4701 
4702 #if VFS_PAGES_IO_TRACING
4703 	generic_size_t bytesRequested = *_numBytes;
4704 #endif
4705 
4706 	IORequest request;
4707 	status_t status = request.Init(pos, vecs, count, *_numBytes, true, flags);
4708 	if (status == B_OK) {
4709 		status = vfs_vnode_io(vnode, cookie, &request);
4710 		if (status == B_OK)
4711 			status = request.Wait();
4712 		*_numBytes = request.TransferredBytes();
4713 	}
4714 
4715 	TPIO(WritePages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4716 		status, *_numBytes));
4717 
4718 	return status;
4719 }
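

// A minimal sketch (disabled) of a single-vec vfs_write_pages() call; the
// page address parameter is hypothetical. On return numBytes holds the
// number of bytes actually transferred, which may be less than requested.
#if 0
static status_t
example_write_one_page(struct vnode* vnode, void* cookie, off_t pos,
	generic_addr_t pageAddress)
{
	generic_io_vec vec = { pageAddress, B_PAGE_SIZE };
	generic_size_t numBytes = B_PAGE_SIZE;
	return vfs_write_pages(vnode, cookie, pos, &vec, 1, 0, &numBytes);
}
#endif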
4720 
4721 
4722 /*!	Gets the vnode's VMCache object. If the vnode doesn't have one yet,
4723 	it will be created if \a allocate is \c true.
4724 	On success, the function also acquires a reference to the cache it
4725 	returns.
4726 */
4727 extern "C" status_t
4728 vfs_get_vnode_cache(struct vnode* vnode, VMCache** _cache, bool allocate)
4729 {
4730 	if (vnode->cache != NULL) {
4731 		vnode->cache->AcquireRef();
4732 		*_cache = vnode->cache;
4733 		return B_OK;
4734 	}
4735 
4736 	rw_lock_read_lock(&sVnodeLock);
4737 	vnode->Lock();
4738 
4739 	status_t status = B_OK;
4740 
4741 	// The cache could have been created in the meantime
4742 	if (vnode->cache == NULL) {
4743 		if (allocate) {
4744 			// TODO: actually the vnode needs to be busy already here, or
4745 			//	else this won't work...
4746 			bool wasBusy = vnode->IsBusy();
4747 			vnode->SetBusy(true);
4748 
4749 			vnode->Unlock();
4750 			rw_lock_read_unlock(&sVnodeLock);
4751 
4752 			status = vm_create_vnode_cache(vnode, &vnode->cache);
4753 
4754 			rw_lock_read_lock(&sVnodeLock);
4755 			vnode->Lock();
4756 			vnode->SetBusy(wasBusy);
4757 		} else
4758 			status = B_BAD_VALUE;
4759 	}
4760 
4761 	vnode->Unlock();
4762 	rw_lock_read_unlock(&sVnodeLock);
4763 
4764 	if (status == B_OK) {
4765 		vnode->cache->AcquireRef();
4766 		*_cache = vnode->cache;
4767 	}
4768 
4769 	return status;
4770 }
4771 
4772 
4773 status_t
4774 vfs_get_file_map(struct vnode* vnode, off_t offset, size_t size,
4775 	file_io_vec* vecs, size_t* _count)
4776 {
4777 	FUNCTION(("vfs_get_file_map: vnode %p, vecs %p, offset %" B_PRIdOFF
4778 		", size = %" B_PRIuSIZE "\n", vnode, vecs, offset, size));
4779 
4780 	return FS_CALL(vnode, get_file_map, offset, size, vecs, _count);
4781 }
4782 
4783 
4784 status_t
4785 vfs_stat_vnode(struct vnode* vnode, struct stat* stat)
4786 {
4787 	status_t status = FS_CALL(vnode, read_stat, stat);
4788 
4789 	// fill in the st_dev and st_ino fields
4790 	if (status == B_OK) {
4791 		stat->st_dev = vnode->device;
4792 		stat->st_ino = vnode->id;
4793 		// the rdev field must stay unset for non-special files
4794 		if (!S_ISBLK(stat->st_mode) && !S_ISCHR(stat->st_mode))
4795 			stat->st_rdev = -1;
4796 	}
4797 
4798 	return status;
4799 }
4800 
4801 
4802 status_t
4803 vfs_stat_node_ref(dev_t device, ino_t inode, struct stat* stat)
4804 {
4805 	struct vnode* vnode;
4806 	status_t status = get_vnode(device, inode, &vnode, true, false);
4807 	if (status != B_OK)
4808 		return status;
4809 
4810 	status = vfs_stat_vnode(vnode, stat);
4811 
4812 	put_vnode(vnode);
4813 	return status;
4814 }
4815 
4816 
4817 status_t
4818 vfs_get_vnode_name(struct vnode* vnode, char* name, size_t nameSize)
4819 {
4820 	return get_vnode_name(vnode, NULL, name, nameSize, true);
4821 }
4822 
4823 
4824 status_t
4825 vfs_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
4826 	bool kernel, char* path, size_t pathLength)
4827 {
4828 	struct vnode* vnode;
4829 	status_t status;
4830 
4831 	// filter invalid leaf names
4832 	if (leaf != NULL && (leaf[0] == '\0' || strchr(leaf, '/')))
4833 		return B_BAD_VALUE;
4834 
4835 	// get the vnode matching the dir's node_ref
4836 	if (leaf && (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0)) {
4837 		// special cases "." and "..": we can directly get the vnode of the
4838 		// referenced directory
4839 		status = entry_ref_to_vnode(device, inode, leaf, false, kernel, &vnode);
4840 		leaf = NULL;
4841 	} else
4842 		status = get_vnode(device, inode, &vnode, true, false);
4843 	if (status != B_OK)
4844 		return status;
4845 
4846 	// get the directory path
4847 	status = dir_vnode_to_path(vnode, path, pathLength, kernel);
4848 	put_vnode(vnode);
4849 		// we don't need the vnode anymore
4850 	if (status != B_OK)
4851 		return status;
4852 
4853 	// append the leaf name
4854 	if (leaf) {
4855 		// insert a directory separator if this is not the file system root
4856 		if ((strcmp(path, "/") && strlcat(path, "/", pathLength)
4857 				>= pathLength)
4858 			|| strlcat(path, leaf, pathLength) >= pathLength) {
4859 			return B_NAME_TOO_LONG;
4860 		}
4861 	}
4862 
4863 	return B_OK;
4864 }
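

// A minimal sketch (disabled) turning an entry ref back into an absolute
// path, as e.g. node monitoring consumers might do; all IDs are
// hypothetical.
#if 0
static status_t
example_entry_ref_to_path(dev_t device, ino_t directory, const char* name)
{
	char path[B_PATH_NAME_LENGTH];
	return vfs_entry_ref_to_path(device, directory, name, true, path,
		sizeof(path));
}
#endif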
4865 
4866 
4867 /*!	If the given descriptor locked its vnode, that lock will be released. */
4868 void
4869 vfs_unlock_vnode_if_locked(struct file_descriptor* descriptor)
4870 {
4871 	struct vnode* vnode = fd_vnode(descriptor);
4872 
4873 	if (vnode != NULL && vnode->mandatory_locked_by == descriptor)
4874 		vnode->mandatory_locked_by = NULL;
4875 }
4876 
4877 
4878 /*!	Closes all file descriptors of the specified I/O context that
4879 	have the O_CLOEXEC flag set.
4880 */
4881 void
4882 vfs_exec_io_context(io_context* context)
4883 {
4884 	uint32 i;
4885 
4886 	for (i = 0; i < context->table_size; i++) {
4887 		mutex_lock(&context->io_mutex);
4888 
4889 		struct file_descriptor* descriptor = context->fds[i];
4890 		bool remove = false;
4891 
4892 		if (descriptor != NULL && fd_close_on_exec(context, i)) {
4893 			context->fds[i] = NULL;
4894 			context->num_used_fds--;
4895 
4896 			remove = true;
4897 		}
4898 
4899 		mutex_unlock(&context->io_mutex);
4900 
4901 		if (remove) {
4902 			close_fd(descriptor);
4903 			put_fd(descriptor);
4904 		}
4905 	}
4906 }
4907 
4908 
4909 /*! Sets up a new io_context structure, and inherits the properties
4910 	of the parent io_context if it is given.
4911 */
4912 io_context*
4913 vfs_new_io_context(io_context* parentContext, bool purgeCloseOnExec)
4914 {
4915 	io_context* context = (io_context*)malloc(sizeof(io_context));
4916 	if (context == NULL)
4917 		return NULL;
4918 
4919 	TIOC(NewIOContext(context, parentContext));
4920 
4921 	memset(context, 0, sizeof(io_context));
4922 	context->ref_count = 1;
4923 
4924 	MutexLocker parentLocker;
4925 
4926 	size_t tableSize;
4927 	if (parentContext != NULL) {
4928 		parentLocker.SetTo(parentContext->io_mutex, false);
4929 		tableSize = parentContext->table_size;
4930 	} else
4931 		tableSize = DEFAULT_FD_TABLE_SIZE;
4932 
4933 	// allocate space for the FDs, their select infos, and close-on-exec flags
4934 	context->fds = (file_descriptor**)malloc(
4935 		sizeof(struct file_descriptor*) * tableSize
4936 		+ sizeof(struct select_sync*) * tableSize
4937 		+ (tableSize + 7) / 8);
4938 	if (context->fds == NULL) {
4939 		free(context);
4940 		return NULL;
4941 	}
4942 
4943 	context->select_infos = (select_info**)(context->fds + tableSize);
4944 	context->fds_close_on_exec = (uint8*)(context->select_infos + tableSize);
4945 
4946 	memset(context->fds, 0, sizeof(struct file_descriptor*) * tableSize
4947 		+ sizeof(struct select_sync*) * tableSize
4948 		+ (tableSize + 7) / 8);
4949 
4950 	mutex_init(&context->io_mutex, "I/O context");
4951 
4952 	// Copy all parent file descriptors
4953 
4954 	if (parentContext != NULL) {
4955 		size_t i;
4956 
4957 		mutex_lock(&sIOContextRootLock);
4958 		context->root = parentContext->root;
4959 		if (context->root)
4960 			inc_vnode_ref_count(context->root);
4961 		mutex_unlock(&sIOContextRootLock);
4962 
4963 		context->cwd = parentContext->cwd;
4964 		if (context->cwd)
4965 			inc_vnode_ref_count(context->cwd);
4966 
4967 		if (parentContext->inherit_fds) {
4968 			for (i = 0; i < tableSize; i++) {
4969 				struct file_descriptor* descriptor = parentContext->fds[i];
4970 
4971 				if (descriptor != NULL
4972 					&& (descriptor->open_mode & O_DISCONNECTED) == 0) {
4973 					bool closeOnExec = fd_close_on_exec(parentContext, i);
4974 					if (closeOnExec && purgeCloseOnExec)
4975 						continue;
4976 
4977 					TFD(InheritFD(context, i, descriptor, parentContext));
4978 
4979 					context->fds[i] = descriptor;
4980 					context->num_used_fds++;
4981 					atomic_add(&descriptor->ref_count, 1);
4982 					atomic_add(&descriptor->open_count, 1);
4983 
4984 					if (closeOnExec)
4985 						fd_set_close_on_exec(context, i, true);
4986 				}
4987 			}
4988 		}
4989 
4990 		parentLocker.Unlock();
4991 	} else {
4992 		context->root = sRoot;
4993 		context->cwd = sRoot;
4994 
4995 		if (context->root)
4996 			inc_vnode_ref_count(context->root);
4997 
4998 		if (context->cwd)
4999 			inc_vnode_ref_count(context->cwd);
5000 	}
5001 
5002 	context->table_size = tableSize;
5003 	context->inherit_fds = parentContext != NULL;
5004 
5005 	list_init(&context->node_monitors);
5006 	context->max_monitors = DEFAULT_NODE_MONITORS;
5007 
5008 	return context;
5009 }
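

// A minimal sketch (disabled) mirroring the FD table layout used above: the
// descriptor pointers, the select infos, and a close-on-exec bitmap with one
// bit per descriptor share a single allocation.
#if 0
static size_t
example_fd_table_allocation_size(size_t tableSize)
{
	return sizeof(struct file_descriptor*) * tableSize
		+ sizeof(struct select_sync*) * tableSize
		+ (tableSize + 7) / 8;
}
#endif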
5010 
5011 
5012 void
5013 vfs_get_io_context(io_context* context)
5014 {
5015 	atomic_add(&context->ref_count, 1);
5016 }
5017 
5018 
5019 void
5020 vfs_put_io_context(io_context* context)
5021 {
5022 	if (atomic_add(&context->ref_count, -1) == 1)
5023 		free_io_context(context);
5024 }
5025 
5026 
5027 status_t
5028 vfs_resize_fd_table(struct io_context* context, uint32 newSize)
5029 {
5030 	if (newSize == 0 || newSize > MAX_FD_TABLE_SIZE)
5031 		return B_BAD_VALUE;
5032 
5033 	TIOC(ResizeIOContext(context, newSize));
5034 
5035 	MutexLocker _(context->io_mutex);
5036 
5037 	uint32 oldSize = context->table_size;
5038 	int oldCloseOnExecBitmapSize = (oldSize + 7) / 8;
5039 	int newCloseOnExecBitmapSize = (newSize + 7) / 8;
5040 
5041 	// If the tables shrink, make sure none of the fds being dropped are in use.
5042 	if (newSize < oldSize) {
5043 		for (uint32 i = oldSize; i-- > newSize;) {
5044 			if (context->fds[i])
5045 				return B_BUSY;
5046 		}
5047 	}
5048 
5049 	// store pointers to the old tables
5050 	file_descriptor** oldFDs = context->fds;
5051 	select_info** oldSelectInfos = context->select_infos;
5052 	uint8* oldCloseOnExecTable = context->fds_close_on_exec;
5053 
5054 	// allocate new tables
5055 	file_descriptor** newFDs = (file_descriptor**)malloc(
5056 		sizeof(struct file_descriptor*) * newSize
5057 		+ sizeof(struct select_sync*) * newSize
5058 		+ newCloseOnExecBitmapSize);
5059 	if (newFDs == NULL)
5060 		return B_NO_MEMORY;
5061 
5062 	context->fds = newFDs;
5063 	context->select_infos = (select_info**)(context->fds + newSize);
5064 	context->fds_close_on_exec = (uint8*)(context->select_infos + newSize);
5065 	context->table_size = newSize;
5066 
5067 	// copy entries from old tables
5068 	uint32 toCopy = min_c(oldSize, newSize);
5069 
5070 	memcpy(context->fds, oldFDs, sizeof(void*) * toCopy);
5071 	memcpy(context->select_infos, oldSelectInfos, sizeof(void*) * toCopy);
5072 	memcpy(context->fds_close_on_exec, oldCloseOnExecTable,
5073 		min_c(oldCloseOnExecBitmapSize, newCloseOnExecBitmapSize));
5074 
5075 	// clear additional entries, if the tables grow
5076 	if (newSize > oldSize) {
5077 		memset(context->fds + oldSize, 0, sizeof(void*) * (newSize - oldSize));
5078 		memset(context->select_infos + oldSize, 0,
5079 			sizeof(void*) * (newSize - oldSize));
5080 		memset(context->fds_close_on_exec + oldCloseOnExecBitmapSize, 0,
5081 			newCloseOnExecBitmapSize - oldCloseOnExecBitmapSize);
5082 	}
5083 
5084 	free(oldFDs);
5085 
5086 	return B_OK;
5087 }
5088 
5089 
5090 /*!	\brief Resolves a vnode to the vnode it is covered by, if any.
5091 
5092 	Given an arbitrary vnode (identified by mount and node ID), the function
5093 	checks whether the vnode is covered by another vnode. If it is, the
5094 	function returns the mount and node ID of the covering vnode. Otherwise
5095 	it simply returns the supplied mount and node ID.
5096 
5097 	In case of error (e.g. the supplied node could not be found) the variables
5098 	for storing the resolved mount and node ID remain untouched and an error
5099 	code is returned.
5100 
5101 	\param mountID The mount ID of the vnode in question.
5102 	\param nodeID The node ID of the vnode in question.
5103 	\param resolvedMountID Pointer to storage for the resolved mount ID.
5104 	\param resolvedNodeID Pointer to storage for the resolved node ID.
5105 	\return
5106 	- \c B_OK, if everything went fine,
5107 	- another error code, if something went wrong.
5108 */
5109 status_t
5110 vfs_resolve_vnode_to_covering_vnode(dev_t mountID, ino_t nodeID,
5111 	dev_t* resolvedMountID, ino_t* resolvedNodeID)
5112 {
5113 	// get the node
5114 	struct vnode* node;
5115 	status_t error = get_vnode(mountID, nodeID, &node, true, false);
5116 	if (error != B_OK)
5117 		return error;
5118 
5119 	// resolve the node
5120 	if (Vnode* coveringNode = get_covering_vnode(node)) {
5121 		put_vnode(node);
5122 		node = coveringNode;
5123 	}
5124 
5125 	// set the return values
5126 	*resolvedMountID = node->device;
5127 	*resolvedNodeID = node->id;
5128 
5129 	put_vnode(node);
5130 
5131 	return B_OK;
5132 }
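

// A minimal sketch (disabled): for the directory a volume is mounted on,
// the resolution yields the root of the mounted volume rather than the
// covered directory itself.
#if 0
static void
example_resolve_mount_point(dev_t device, ino_t node)
{
	dev_t resolvedDevice;
	ino_t resolvedNode;
	if (vfs_resolve_vnode_to_covering_vnode(device, node, &resolvedDevice,
			&resolvedNode) == B_OK) {
		dprintf("resolved to %" B_PRIdDEV ":%" B_PRIdINO "\n",
			resolvedDevice, resolvedNode);
	}
}
#endif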
5133 
5134 
5135 status_t
5136 vfs_get_mount_point(dev_t mountID, dev_t* _mountPointMountID,
5137 	ino_t* _mountPointNodeID)
5138 {
5139 	ReadLocker nodeLocker(sVnodeLock);
5140 	MutexLocker mountLocker(sMountMutex);
5141 
5142 	struct fs_mount* mount = find_mount(mountID);
5143 	if (mount == NULL)
5144 		return B_BAD_VALUE;
5145 
5146 	Vnode* mountPoint = mount->covers_vnode;
5147 
5148 	*_mountPointMountID = mountPoint->device;
5149 	*_mountPointNodeID = mountPoint->id;
5150 
5151 	return B_OK;
5152 }
5153 
5154 
5155 status_t
5156 vfs_bind_mount_directory(dev_t mountID, ino_t nodeID, dev_t coveredMountID,
5157 	ino_t coveredNodeID)
5158 {
5159 	// get the vnodes
5160 	Vnode* vnode;
5161 	status_t error = get_vnode(mountID, nodeID, &vnode, true, false);
5162 	if (error != B_OK)
5163 		return B_BAD_VALUE;
5164 	VNodePutter vnodePutter(vnode);
5165 
5166 	Vnode* coveredVnode;
5167 	error = get_vnode(coveredMountID, coveredNodeID, &coveredVnode, true,
5168 		false);
5169 	if (error != B_OK)
5170 		return B_BAD_VALUE;
5171 	VNodePutter coveredVnodePutter(coveredVnode);
5172 
5173 	// establish the covered/covering links
5174 	WriteLocker locker(sVnodeLock);
5175 
5176 	if (vnode->covers != NULL || coveredVnode->covered_by != NULL
5177 		|| vnode->mount->unmounting || coveredVnode->mount->unmounting) {
5178 		return B_BUSY;
5179 	}
5180 
5181 	vnode->covers = coveredVnode;
5182 	vnode->SetCovering(true);
5183 
5184 	coveredVnode->covered_by = vnode;
5185 	coveredVnode->SetCovered(true);
5186 
5187 	// the vnodes do now reference each other
5188 	inc_vnode_ref_count(vnode);
5189 	inc_vnode_ref_count(coveredVnode);
5190 
5191 	return B_OK;
5192 }
5193 
5194 
5195 int
5196 vfs_getrlimit(int resource, struct rlimit* rlp)
5197 {
5198 	if (!rlp)
5199 		return B_BAD_ADDRESS;
5200 
5201 	switch (resource) {
5202 		case RLIMIT_NOFILE:
5203 		{
5204 			struct io_context* context = get_current_io_context(false);
5205 			MutexLocker _(context->io_mutex);
5206 
5207 			rlp->rlim_cur = context->table_size;
5208 			rlp->rlim_max = MAX_FD_TABLE_SIZE;
5209 			return 0;
5210 		}
5211 
5212 		case RLIMIT_NOVMON:
5213 		{
5214 			struct io_context* context = get_current_io_context(false);
5215 			MutexLocker _(context->io_mutex);
5216 
5217 			rlp->rlim_cur = context->max_monitors;
5218 			rlp->rlim_max = MAX_NODE_MONITORS;
5219 			return 0;
5220 		}
5221 
5222 		default:
5223 			return B_BAD_VALUE;
5224 	}
5225 }
5226 
5227 
5228 int
5229 vfs_setrlimit(int resource, const struct rlimit* rlp)
5230 {
5231 	if (!rlp)
5232 		return B_BAD_ADDRESS;
5233 
5234 	switch (resource) {
5235 		case RLIMIT_NOFILE:
5236 			/* TODO: check getuid() */
5237 			if (rlp->rlim_max != RLIM_SAVED_MAX
5238 				&& rlp->rlim_max != MAX_FD_TABLE_SIZE)
5239 				return B_NOT_ALLOWED;
5240 
5241 			return vfs_resize_fd_table(get_current_io_context(false),
5242 				rlp->rlim_cur);
5243 
5244 		case RLIMIT_NOVMON:
5245 			/* TODO: check getuid() */
5246 			if (rlp->rlim_max != RLIM_SAVED_MAX
5247 				&& rlp->rlim_max != MAX_NODE_MONITORS)
5248 				return B_NOT_ALLOWED;
5249 
5250 			return resize_monitor_table(get_current_io_context(false),
5251 				rlp->rlim_cur);
5252 
5253 		default:
5254 			return B_BAD_VALUE;
5255 	}
5256 }
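

// A minimal sketch (disabled) resizing the calling team's FD table through
// the rlimit interface; shrinking fails with B_BUSY while any of the FDs to
// be dropped are still in use.
#if 0
static int
example_set_fd_limit(uint32 newSize)
{
	struct rlimit limit;
	limit.rlim_cur = newSize;
	limit.rlim_max = MAX_FD_TABLE_SIZE;
	return vfs_setrlimit(RLIMIT_NOFILE, &limit);
}
#endif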
5257 
5258 
5259 status_t
5260 vfs_init(kernel_args* args)
5261 {
5262 	vnode::StaticInit();
5263 
5264 	sVnodeTable = new(std::nothrow) VnodeTable();
5265 	if (sVnodeTable == NULL || sVnodeTable->Init(VNODE_HASH_TABLE_SIZE) != B_OK)
5266 		panic("vfs_init: error creating vnode hash table\n");
5267 
5268 	struct vnode dummy_vnode;
5269 	list_init_etc(&sUnusedVnodeList, offset_of_member(dummy_vnode, unused_link));
5270 
5272 	sMountsTable = new(std::nothrow) MountTable();
5273 	if (sMountsTable == NULL
5274 			|| sMountsTable->Init(MOUNTS_HASH_TABLE_SIZE) != B_OK)
5275 		panic("vfs_init: error creating mounts hash table\n");
5276 
5277 	node_monitor_init();
5278 
5279 	sRoot = NULL;
5280 
5281 	recursive_lock_init(&sMountOpLock, "vfs_mount_op_lock");
5282 
5283 	if (block_cache_init() != B_OK)
5284 		return B_ERROR;
5285 
5286 #ifdef ADD_DEBUGGER_COMMANDS
5287 	// add some debugger commands
5288 	add_debugger_command_etc("vnode", &dump_vnode,
5289 		"Print info about the specified vnode",
5290 		"[ \"-p\" ] ( <vnode> | <devID> <nodeID> )\n"
5291 		"Prints information about the vnode specified by address <vnode> or\n"
5292 		"<devID>, <vnodeID> pair. If \"-p\" is given, a path of the vnode is\n"
5293 		"constructed and printed. It might not be possible to construct a\n"
5294 		"complete path, though.\n",
5295 		0);
5296 	add_debugger_command("vnodes", &dump_vnodes,
5297 		"list all vnodes (from the specified device)");
5298 	add_debugger_command("vnode_caches", &dump_vnode_caches,
5299 		"list all vnode caches");
5300 	add_debugger_command("mount", &dump_mount,
5301 		"info about the specified fs_mount");
5302 	add_debugger_command("mounts", &dump_mounts, "list all fs_mounts");
5303 	add_debugger_command("io_context", &dump_io_context,
5304 		"info about the I/O context");
5305 	add_debugger_command("vnode_usage", &dump_vnode_usage,
5306 		"info about vnode usage");
5307 #endif
5308 
5309 	register_low_resource_handler(&vnode_low_resource_handler, NULL,
5310 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
5311 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
5312 		0);
5313 
5314 	fifo_init();
5315 	file_map_init();
5316 
5317 	return file_cache_init();
5318 }
5319 
5320 
5321 //	#pragma mark - fd_ops implementations
5322 
5323 
5324 /*!
5325 	Calls fs_open() on the given vnode and returns a new
5326 	file descriptor for it
5327 */
5328 static int
5329 open_vnode(struct vnode* vnode, int openMode, bool kernel)
5330 {
5331 	void* cookie;
5332 	status_t status = FS_CALL(vnode, open, openMode, &cookie);
5333 	if (status != B_OK)
5334 		return status;
5335 
5336 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5337 	if (fd < 0) {
5338 		FS_CALL(vnode, close, cookie);
5339 		FS_CALL(vnode, free_cookie, cookie);
5340 	}
5341 	return fd;
5342 }
5343 
5344 
5345 /*!
5346 	Creates a new entry in the given directory, or opens an existing one
5347 	(unless O_EXCL is given), and returns a new file descriptor for it
5348 */
5349 static int
5350 create_vnode(struct vnode* directory, const char* name, int openMode,
5351 	int perms, bool kernel)
5352 {
5353 	bool traverse = ((openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0);
5354 	status_t status = B_ERROR;
5355 	struct vnode* vnode;
5356 	void* cookie;
5357 	ino_t newID;
5358 
5359 	// This is somewhat tricky: If the entry already exists, the FS responsible
5360 	// for the directory might not necessarily also be the one responsible for
5361 	// the node the entry refers to (e.g. in case of mount points or FIFOs). So
5362 	// we can actually never call the create() hook without O_EXCL. Instead we
5363 	// try to look the entry up first. If it already exists, we just open the
5364 	// node (unless O_EXCL), otherwise we call create() with O_EXCL. This
5365 	// introduces a race condition, since someone else might have created the
5366 	// entry in the meantime. If the respective FS returns the correct error
5367 	// code, we simply retry (up to 3 times).
5368 
5369 	for (int i = 0; i < 3 && status != B_OK; i++) {
5370 		// look the node up
5371 		status = lookup_dir_entry(directory, name, &vnode);
5372 		if (status == B_OK) {
5373 			VNodePutter putter(vnode);
5374 
5375 			if ((openMode & O_EXCL) != 0)
5376 				return B_FILE_EXISTS;
5377 
5378 			// If the node is a symlink, we have to follow it, unless
5379 			// O_NOTRAVERSE is set.
5380 			if (S_ISLNK(vnode->Type()) && traverse) {
5381 				putter.Put();
5382 				char clonedName[B_FILE_NAME_LENGTH + 1];
5383 				if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH)
5384 						>= B_FILE_NAME_LENGTH) {
5385 					return B_NAME_TOO_LONG;
5386 				}
5387 
5388 				inc_vnode_ref_count(directory);
5389 				status = vnode_path_to_vnode(directory, clonedName, true, 0,
5390 					kernel, &vnode, NULL);
5391 				if (status != B_OK)
5392 					return status;
5393 
5394 				putter.SetTo(vnode);
5395 			}
5396 
5397 			if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5398 				return B_LINK_LIMIT;
5399 
5400 			int fd = open_vnode(vnode, openMode & ~O_CREAT, kernel);
5401 			// on success keep the vnode reference for the FD
5402 			if (fd >= 0)
5403 				putter.Detach();
5404 
5405 			return fd;
5406 		}
5407 
5408 		// it doesn't exist yet -- try to create it
5409 
5410 		if (!HAS_FS_CALL(directory, create))
5411 			return B_READ_ONLY_DEVICE;
5412 
5413 		status = FS_CALL(directory, create, name, openMode | O_EXCL, perms,
5414 			&cookie, &newID);
5415 		if (status != B_OK
5416 			&& ((openMode & O_EXCL) != 0 || status != B_FILE_EXISTS)) {
5417 			return status;
5418 		}
5419 	}
5420 
5421 	if (status != B_OK)
5422 		return status;
5423 
5424 	// the node has been created successfully
5425 
5426 	rw_lock_read_lock(&sVnodeLock);
5427 	vnode = lookup_vnode(directory->device, newID);
5428 	rw_lock_read_unlock(&sVnodeLock);
5429 
5430 	if (vnode == NULL) {
5431 		panic("vfs: fs_create() returned success but there is no vnode, "
5432 			"mount ID %" B_PRIdDEV "!\n", directory->device);
5433 		return B_BAD_VALUE;
5434 	}
5435 
5436 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5437 	if (fd >= 0)
5438 		return fd;
5439 
5440 	status = fd;
5441 
5442 	// something went wrong, clean up
5443 
5444 	FS_CALL(vnode, close, cookie);
5445 	FS_CALL(vnode, free_cookie, cookie);
5446 	put_vnode(vnode);
5447 
5448 	FS_CALL(directory, unlink, name);
5449 
5450 	return status;
5451 }
5452 
5453 
5454 /*! Calls fs open_dir() on the given vnode and returns a new
5455 	file descriptor for it
5456 */
5457 static int
5458 open_dir_vnode(struct vnode* vnode, bool kernel)
5459 {
5460 	void* cookie;
5461 	status_t status = FS_CALL(vnode, open_dir, &cookie);
5462 	if (status != B_OK)
5463 		return status;
5464 
5465 	// directory is opened, create a fd
5466 	status = get_new_fd(FDTYPE_DIR, NULL, vnode, cookie, O_CLOEXEC, kernel);
5467 	if (status >= 0)
5468 		return status;
5469 
5470 	FS_CALL(vnode, close_dir, cookie);
5471 	FS_CALL(vnode, free_dir_cookie, cookie);
5472 
5473 	return status;
5474 }
5475 
5476 
5477 /*! Calls fs open_attr_dir() on the given vnode and returns a new
5478 	file descriptor for it.
5479 	Used by attr_dir_open(), and attr_dir_open_fd().
5480 */
5481 static int
5482 open_attr_dir_vnode(struct vnode* vnode, bool kernel)
5483 {
5484 	if (!HAS_FS_CALL(vnode, open_attr_dir))
5485 		return B_UNSUPPORTED;
5486 
5487 	void* cookie;
5488 	status_t status = FS_CALL(vnode, open_attr_dir, &cookie);
5489 	if (status != B_OK)
5490 		return status;
5491 
5492 	// directory is opened, create a fd
5493 	status = get_new_fd(FDTYPE_ATTR_DIR, NULL, vnode, cookie, O_CLOEXEC,
5494 		kernel);
5495 	if (status >= 0)
5496 		return status;
5497 
5498 	FS_CALL(vnode, close_attr_dir, cookie);
5499 	FS_CALL(vnode, free_attr_dir_cookie, cookie);
5500 
5501 	return status;
5502 }
5503 
5504 
5505 static int
5506 file_create_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5507 	int openMode, int perms, bool kernel)
5508 {
5509 	FUNCTION(("file_create_entry_ref: name = '%s', omode %x, perms %d, "
5510 		"kernel %d\n", name, openMode, perms, kernel));
5511 
5512 	// get directory to put the new file in
5513 	struct vnode* directory;
5514 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
5515 	if (status != B_OK)
5516 		return status;
5517 
5518 	status = create_vnode(directory, name, openMode, perms, kernel);
5519 	put_vnode(directory);
5520 
5521 	return status;
5522 }
5523 
5524 
5525 static int
5526 file_create(int fd, char* path, int openMode, int perms, bool kernel)
5527 {
5528 	FUNCTION(("file_create: path '%s', omode %x, perms %d, kernel %d\n", path,
5529 		openMode, perms, kernel));
5530 
5531 	// get directory to put the new file in
5532 	char name[B_FILE_NAME_LENGTH];
5533 	struct vnode* directory;
5534 	status_t status = fd_and_path_to_dir_vnode(fd, path, &directory, name,
5535 		kernel);
5536 	if (status < 0)
5537 		return status;
5538 
5539 	status = create_vnode(directory, name, openMode, perms, kernel);
5540 
5541 	put_vnode(directory);
5542 	return status;
5543 }
5544 
5545 
5546 static int
5547 file_open_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5548 	int openMode, bool kernel)
5549 {
5550 	if (name == NULL || *name == '\0')
5551 		return B_BAD_VALUE;
5552 
5553 	FUNCTION(("file_open_entry_ref(ref = (%" B_PRId32 ", %" B_PRId64 ", %s), "
5554 		"openMode = %d)\n", mountID, directoryID, name, openMode));
5555 
5556 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5557 
5558 	// get the vnode matching the entry_ref
5559 	struct vnode* vnode;
5560 	status_t status = entry_ref_to_vnode(mountID, directoryID, name, traverse,
5561 		kernel, &vnode);
5562 	if (status != B_OK)
5563 		return status;
5564 
5565 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5566 		put_vnode(vnode);
5567 		return B_LINK_LIMIT;
5568 	}
5569 
5570 	int newFD = open_vnode(vnode, openMode, kernel);
5571 	if (newFD >= 0) {
5572 		// The vnode reference has been transferred to the FD
5573 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache, mountID,
5574 			directoryID, vnode->id, name);
5575 	} else
5576 		put_vnode(vnode);
5577 
5578 	return newFD;
5579 }
5580 
5581 
5582 static int
5583 file_open(int fd, char* path, int openMode, bool kernel)
5584 {
5585 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5586 
5587 	FUNCTION(("file_open: fd: %d, entry path = '%s', omode %d, kernel %d\n",
5588 		fd, path, openMode, kernel));
5589 
5590 	// get the vnode matching the vnode + path combination
5591 	struct vnode* vnode;
5592 	ino_t parentID;
5593 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode,
5594 		&parentID, kernel);
5595 	if (status != B_OK)
5596 		return status;
5597 
5598 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5599 		put_vnode(vnode);
5600 		return B_LINK_LIMIT;
5601 	}
5602 
5603 	// open the vnode
5604 	int newFD = open_vnode(vnode, openMode, kernel);
5605 	if (newFD >= 0) {
5606 		// The vnode reference has been transferred to the FD
5607 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache,
5608 			vnode->device, parentID, vnode->id, NULL);
5609 	} else
5610 		put_vnode(vnode);
5611 
5612 	return newFD;
5613 }
5614 
5615 
5616 static status_t
5617 file_close(struct file_descriptor* descriptor)
5618 {
5619 	struct vnode* vnode = descriptor->u.vnode;
5620 	status_t status = B_OK;
5621 
5622 	FUNCTION(("file_close(descriptor = %p)\n", descriptor));
5623 
5624 	cache_node_closed(vnode, FDTYPE_FILE, vnode->cache, vnode->device,
5625 		vnode->id);
5626 	if (HAS_FS_CALL(vnode, close)) {
5627 		status = FS_CALL(vnode, close, descriptor->cookie);
5628 	}
5629 
5630 	if (status == B_OK) {
5631 		// remove all outstanding locks for this team
5632 		if (HAS_FS_CALL(vnode, release_lock))
5633 			status = FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
5634 		else
5635 			status = release_advisory_lock(vnode, NULL);
5636 	}
5637 	return status;
5638 }
5639 
5640 
5641 static void
5642 file_free_fd(struct file_descriptor* descriptor)
5643 {
5644 	struct vnode* vnode = descriptor->u.vnode;
5645 
5646 	if (vnode != NULL) {
5647 		FS_CALL(vnode, free_cookie, descriptor->cookie);
5648 		put_vnode(vnode);
5649 	}
5650 }
5651 
5652 
5653 static status_t
5654 file_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
5655 	size_t* length)
5656 {
5657 	struct vnode* vnode = descriptor->u.vnode;
5658 	FUNCTION(("file_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
5659 		pos, length, *length));
5660 
5661 	if (S_ISDIR(vnode->Type()))
5662 		return B_IS_A_DIRECTORY;
5663 
5664 	return FS_CALL(vnode, read, descriptor->cookie, pos, buffer, length);
5665 }
5666 
5667 
5668 static status_t
5669 file_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
5670 	size_t* length)
5671 {
5672 	struct vnode* vnode = descriptor->u.vnode;
5673 	FUNCTION(("file_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
5674 		length));
5675 
5676 	if (S_ISDIR(vnode->Type()))
5677 		return B_IS_A_DIRECTORY;
5678 	if (!HAS_FS_CALL(vnode, write))
5679 		return B_READ_ONLY_DEVICE;
5680 
5681 	return FS_CALL(vnode, write, descriptor->cookie, pos, buffer, length);
5682 }
5683 
5684 
5685 static off_t
5686 file_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
5687 {
5688 	struct vnode* vnode = descriptor->u.vnode;
5689 	off_t offset;
5690 	bool isDevice = false;
5691 
5692 	FUNCTION(("file_seek(pos = %" B_PRIdOFF ", seekType = %d)\n", pos,
5693 		seekType));
5694 
5695 	// some kinds of files are not seekable
5696 	switch (vnode->Type() & S_IFMT) {
5697 		case S_IFIFO:
5698 		case S_IFSOCK:
5699 			return ESPIPE;
5700 
5701 		// drivers publish block devices as character devices, so handle both
5702 		case S_IFBLK:
5703 		case S_IFCHR:
5704 			isDevice = true;
5705 			break;
5706 		// The Open Group Base Specs don't single out any file types besides
5707 		// pipes, FIFOs, and sockets as non-seekable, so we allow seeking the rest.
5708 		case S_IFREG:
5709 		case S_IFDIR:
5710 		case S_IFLNK:
5711 			break;
5712 	}
5713 
5714 	switch (seekType) {
5715 		case SEEK_SET:
5716 			offset = 0;
5717 			break;
5718 		case SEEK_CUR:
5719 			offset = descriptor->pos;
5720 			break;
5721 		case SEEK_END:
5722 		{
5723 			// stat() the node
5724 			if (!HAS_FS_CALL(vnode, read_stat))
5725 				return B_UNSUPPORTED;
5726 
5727 			struct stat stat;
5728 			status_t status = FS_CALL(vnode, read_stat, &stat);
5729 			if (status != B_OK)
5730 				return status;
5731 
5732 			offset = stat.st_size;
5733 
5734 			if (offset == 0 && isDevice) {
5735 				// stat() on plain drivers usually doesn't report a size
5736 				device_geometry geometry;
5737 
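				// The raw capacity in bytes is the product of all four
				// geometry fields returned by B_GET_GEOMETRY, as computed
				// below.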
5738 				if (HAS_FS_CALL(vnode, ioctl)) {
5739 					status = FS_CALL(vnode, ioctl, descriptor->cookie,
5740 						B_GET_GEOMETRY, &geometry, sizeof(geometry));
5741 					if (status == B_OK)
5742 						offset = (off_t)geometry.bytes_per_sector
5743 							* geometry.sectors_per_track
5744 							* geometry.cylinder_count
5745 							* geometry.head_count;
5746 				}
5747 			}
5748 
5749 			break;
5750 		}
5751 		default:
5752 			return B_BAD_VALUE;
5753 	}
5754 
5755 	// assumes off_t is 64 bits wide
5756 	if (offset > 0 && LONGLONG_MAX - offset < pos)
5757 		return B_BUFFER_OVERFLOW;
5758 
5759 	pos += offset;
5760 	if (pos < 0)
5761 		return B_BAD_VALUE;
5762 
5763 	return descriptor->pos = pos;
5764 }
5765 
5766 
5767 static status_t
5768 file_select(struct file_descriptor* descriptor, uint8 event,
5769 	struct selectsync* sync)
5770 {
5771 	FUNCTION(("file_select(%p, %u, %p)\n", descriptor, event, sync));
5772 
5773 	struct vnode* vnode = descriptor->u.vnode;
5774 
5775 	// If the FS has no select() hook, notify select() now.
5776 	if (!HAS_FS_CALL(vnode, select))
5777 		return notify_select_event(sync, event);
5778 
5779 	return FS_CALL(vnode, select, descriptor->cookie, event, sync);
5780 }
5781 
5782 
5783 static status_t
5784 file_deselect(struct file_descriptor* descriptor, uint8 event,
5785 	struct selectsync* sync)
5786 {
5787 	struct vnode* vnode = descriptor->u.vnode;
5788 
5789 	if (!HAS_FS_CALL(vnode, deselect))
5790 		return B_OK;
5791 
5792 	return FS_CALL(vnode, deselect, descriptor->cookie, event, sync);
5793 }
5794 
5795 
5796 static status_t
5797 dir_create_entry_ref(dev_t mountID, ino_t parentID, const char* name, int perms,
5798 	bool kernel)
5799 {
5800 	struct vnode* vnode;
5801 	status_t status;
5802 
5803 	if (name == NULL || *name == '\0')
5804 		return B_BAD_VALUE;
5805 
5806 	FUNCTION(("dir_create_entry_ref(dev = %" B_PRId32 ", ino = %" B_PRId64 ", "
5807 		"name = '%s', perms = %d)\n", mountID, parentID, name, perms));
5808 
5809 	status = get_vnode(mountID, parentID, &vnode, true, false);
5810 	if (status != B_OK)
5811 		return status;
5812 
5813 	if (HAS_FS_CALL(vnode, create_dir))
5814 		status = FS_CALL(vnode, create_dir, name, perms);
5815 	else
5816 		status = B_READ_ONLY_DEVICE;
5817 
5818 	put_vnode(vnode);
5819 	return status;
5820 }
5821 
5822 
5823 static status_t
5824 dir_create(int fd, char* path, int perms, bool kernel)
5825 {
5826 	char filename[B_FILE_NAME_LENGTH];
5827 	struct vnode* vnode;
5828 	status_t status;
5829 
5830 	FUNCTION(("dir_create: path '%s', perms %d, kernel %d\n", path, perms,
5831 		kernel));
5832 
5833 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
5834 	if (status < 0)
5835 		return status;
5836 
5837 	if (HAS_FS_CALL(vnode, create_dir)) {
5838 		status = FS_CALL(vnode, create_dir, filename, perms);
5839 	} else
5840 		status = B_READ_ONLY_DEVICE;
5841 
5842 	put_vnode(vnode);
5843 	return status;
5844 }
5845 
5846 
5847 static int
5848 dir_open_entry_ref(dev_t mountID, ino_t parentID, const char* name, bool kernel)
5849 {
5850 	FUNCTION(("dir_open_entry_ref()\n"));
5851 
5852 	if (name && name[0] == '\0')
5853 		return B_BAD_VALUE;
5854 
5855 	// get the vnode matching the entry_ref/node_ref
5856 	struct vnode* vnode;
5857 	status_t status;
5858 	if (name) {
5859 		status = entry_ref_to_vnode(mountID, parentID, name, true, kernel,
5860 			&vnode);
5861 	} else
5862 		status = get_vnode(mountID, parentID, &vnode, true, false);
5863 	if (status != B_OK)
5864 		return status;
5865 
5866 	int newFD = open_dir_vnode(vnode, kernel);
5867 	if (newFD >= 0) {
5868 		// The vnode reference has been transferred to the FD
5869 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, mountID, parentID,
5870 			vnode->id, name);
5871 	} else
5872 		put_vnode(vnode);
5873 
5874 	return newFD;
5875 }
5876 
5877 
5878 static int
5879 dir_open(int fd, char* path, bool kernel)
5880 {
5881 	FUNCTION(("dir_open: fd: %d, entry path = '%s', kernel %d\n", fd, path,
5882 		kernel));
5883 
5884 	// get the vnode matching the vnode + path combination
5885 	struct vnode* vnode = NULL;
5886 	ino_t parentID;
5887 	status_t status = fd_and_path_to_vnode(fd, path, true, &vnode, &parentID,
5888 		kernel);
5889 	if (status != B_OK)
5890 		return status;
5891 
5892 	// open the dir
5893 	int newFD = open_dir_vnode(vnode, kernel);
5894 	if (newFD >= 0) {
5895 		// The vnode reference has been transferred to the FD
5896 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
5897 			parentID, vnode->id, NULL);
5898 	} else
5899 		put_vnode(vnode);
5900 
5901 	return newFD;
5902 }
5903 
5904 
5905 static status_t
5906 dir_close(struct file_descriptor* descriptor)
5907 {
5908 	struct vnode* vnode = descriptor->u.vnode;
5909 
5910 	FUNCTION(("dir_close(descriptor = %p)\n", descriptor));
5911 
5912 	cache_node_closed(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
5913 		vnode->id);
5914 	if (HAS_FS_CALL(vnode, close_dir))
5915 		return FS_CALL(vnode, close_dir, descriptor->cookie);
5916 
5917 	return B_OK;
5918 }
5919 
5920 
5921 static void
5922 dir_free_fd(struct file_descriptor* descriptor)
5923 {
5924 	struct vnode* vnode = descriptor->u.vnode;
5925 
5926 	if (vnode != NULL) {
5927 		FS_CALL(vnode, free_dir_cookie, descriptor->cookie);
5928 		put_vnode(vnode);
5929 	}
5930 }
5931 
5932 
5933 static status_t
5934 dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
5935 	struct dirent* buffer, size_t bufferSize, uint32* _count)
5936 {
5937 	return dir_read(ioContext, descriptor->u.vnode, descriptor->cookie, buffer,
5938 		bufferSize, _count);
5939 }
5940 
5941 
5942 static status_t
5943 fix_dirent(struct vnode* parent, struct dirent* entry,
5944 	struct io_context* ioContext)
5945 {
5946 	// set d_pdev and d_pino
5947 	entry->d_pdev = parent->device;
5948 	entry->d_pino = parent->id;
5949 
5950 	// If this is the ".." entry and the directory is covering another vnode,
5951 	// we need to replace d_dev and d_ino with the actual values.
5952 	if (strcmp(entry->d_name, "..") == 0 && parent->IsCovering()) {
5953 		return resolve_covered_parent(parent, &entry->d_dev, &entry->d_ino,
5954 			ioContext);
5955 	}
5956 
5957 	// resolve covered vnodes
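	// If the entry's vnode is covered by a mounted volume, report the
	// device/inode of the topmost covering vnode instead, so that userland
	// sees the root of the mounted volume rather than the mount point.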
5958 	ReadLocker _(&sVnodeLock);
5959 
5960 	struct vnode* vnode = lookup_vnode(entry->d_dev, entry->d_ino);
5961 	if (vnode != NULL && vnode->covered_by != NULL) {
5962 		do {
5963 			vnode = vnode->covered_by;
5964 		} while (vnode->covered_by != NULL);
5965 
5966 		entry->d_dev = vnode->device;
5967 		entry->d_ino = vnode->id;
5968 	}
5969 
5970 	return B_OK;
5971 }
5972 
5973 
5974 static status_t
5975 dir_read(struct io_context* ioContext, struct vnode* vnode, void* cookie,
5976 	struct dirent* buffer, size_t bufferSize, uint32* _count)
5977 {
5978 	if (!HAS_FS_CALL(vnode, read_dir))
5979 		return B_UNSUPPORTED;
5980 
5981 	status_t error = FS_CALL(vnode, read_dir, cookie, buffer, bufferSize,
5982 		_count);
5983 	if (error != B_OK)
5984 		return error;
5985 
5986 	// we need to adjust the read dirents
5987 	uint32 count = *_count;
5988 	for (uint32 i = 0; i < count; i++) {
5989 		error = fix_dirent(vnode, buffer, ioContext);
5990 		if (error != B_OK)
5991 			return error;
5992 
5993 		buffer = (struct dirent*)((uint8*)buffer + buffer->d_reclen);
5994 	}
5995 
5996 	return error;
5997 }
5998 
5999 
6000 static status_t
6001 dir_rewind(struct file_descriptor* descriptor)
6002 {
6003 	struct vnode* vnode = descriptor->u.vnode;
6004 
6005 	if (HAS_FS_CALL(vnode, rewind_dir)) {
6006 		return FS_CALL(vnode, rewind_dir, descriptor->cookie);
6007 	}
6008 
6009 	return B_UNSUPPORTED;
6010 }
6011 
6012 
6013 static status_t
6014 dir_remove(int fd, char* path, bool kernel)
6015 {
6016 	char name[B_FILE_NAME_LENGTH];
6017 	struct vnode* directory;
6018 	status_t status;
6019 
6020 	if (path != NULL) {
6021 		// we need to make sure our path name doesn't end in "/", ".",
6022 		// or ".."
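		// For example, "a/b/" and "a/b/." are both reduced to "a/b", while
		// "a/b/.." is rejected outright.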
6023 		char* lastSlash;
6024 		while ((lastSlash = strrchr(path, '/')) != NULL) {
6025 			char* leaf = lastSlash + 1;
6026 			if (!strcmp(leaf, ".."))
6027 				return B_NOT_ALLOWED;
6028 
6029 			// omit multiple slashes
6030 			while (lastSlash > path && lastSlash[-1] == '/')
6031 				lastSlash--;
6032 
6033 			if (leaf[0] != '\0' && strcmp(leaf, ".") != 0)
6034 				break;
6037 			// "name/" -> "name", or "name/." -> "name"
6038 			lastSlash[0] = '\0';
6039 		}
6040 
6041 		if (!strcmp(path, ".") || !strcmp(path, ".."))
6042 			return B_NOT_ALLOWED;
6043 	}
6044 
6045 	status = fd_and_path_to_dir_vnode(fd, path, &directory, name, kernel);
6046 	if (status != B_OK)
6047 		return status;
6048 
6049 	if (HAS_FS_CALL(directory, remove_dir))
6050 		status = FS_CALL(directory, remove_dir, name);
6051 	else
6052 		status = B_READ_ONLY_DEVICE;
6053 
6054 	put_vnode(directory);
6055 	return status;
6056 }
6057 
6058 
6059 static status_t
6060 common_ioctl(struct file_descriptor* descriptor, ulong op, void* buffer,
6061 	size_t length)
6062 {
6063 	struct vnode* vnode = descriptor->u.vnode;
6064 
6065 	if (HAS_FS_CALL(vnode, ioctl))
6066 		return FS_CALL(vnode, ioctl, descriptor->cookie, op, buffer, length);
6067 
6068 	return B_DEV_INVALID_IOCTL;
6069 }
6070 
6071 
6072 static status_t
6073 common_fcntl(int fd, int op, size_t argument, bool kernel)
6074 {
6075 	struct flock flock;
6076 
6077 	FUNCTION(("common_fcntl(fd = %d, op = %d, argument = %lx, %s)\n",
6078 		fd, op, argument, kernel ? "kernel" : "user"));
6079 
6080 	struct file_descriptor* descriptor = get_fd(get_current_io_context(kernel),
6081 		fd);
6082 	if (descriptor == NULL)
6083 		return B_FILE_ERROR;
6084 
6085 	struct vnode* vnode = fd_vnode(descriptor);
6086 
6087 	status_t status = B_OK;
6088 
6089 	if (op == F_SETLK || op == F_SETLKW || op == F_GETLK) {
6090 		if (descriptor->type != FDTYPE_FILE)
6091 			status = B_BAD_VALUE;
6092 		else if (user_memcpy(&flock, (struct flock*)argument,
6093 				sizeof(struct flock)) != B_OK)
6094 			status = B_BAD_ADDRESS;
6095 
6096 		if (status != B_OK) {
6097 			put_fd(descriptor);
6098 			return status;
6099 		}
6100 	}
6101 
6102 	switch (op) {
6103 		case F_SETFD:
6104 		{
6105 			struct io_context* context = get_current_io_context(kernel);
6106 			// Set file descriptor flags
6107 
6108 			// O_CLOEXEC is the only flag available at this time
6109 			// FD_CLOEXEC is the only flag available at this time
6110 			fd_set_close_on_exec(context, fd, (argument & FD_CLOEXEC) != 0);
6111 			mutex_unlock(&context->io_mutex);
6112 
6113 			status = B_OK;
6114 			break;
6115 		}
6116 
6117 		case F_GETFD:
6118 		{
6119 			struct io_context* context = get_current_io_context(kernel);
6120 
6121 			// Get file descriptor flags
6122 			mutex_lock(&context->io_mutex);
6123 			status = fd_close_on_exec(context, fd) ? FD_CLOEXEC : 0;
6124 			mutex_unlock(&context->io_mutex);
6125 			break;
6126 		}
6127 
6128 		case F_SETFL:
6129 			// Set file descriptor open mode
6130 
6131 			// we only accept changes to O_APPEND and O_NONBLOCK
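			// (POSIX specifies that the access mode and file creation flags
			// are ignored by F_SETFL, so anything else is silently dropped.)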
6132 			argument &= O_APPEND | O_NONBLOCK;
6133 			if (descriptor->ops->fd_set_flags != NULL) {
6134 				status = descriptor->ops->fd_set_flags(descriptor, argument);
6135 			} else if (vnode != NULL && HAS_FS_CALL(vnode, set_flags)) {
6136 				status = FS_CALL(vnode, set_flags, descriptor->cookie,
6137 					(int)argument);
6138 			} else
6139 				status = B_UNSUPPORTED;
6140 
6141 			if (status == B_OK) {
6142 				// update this descriptor's open_mode field
6143 				descriptor->open_mode = (descriptor->open_mode
6144 					& ~(O_APPEND | O_NONBLOCK)) | argument;
6145 			}
6146 
6147 			break;
6148 
6149 		case F_GETFL:
6150 			// Get file descriptor open mode
6151 			status = descriptor->open_mode;
6152 			break;
6153 
6154 		case F_DUPFD:
6155 		case F_DUPFD_CLOEXEC:
6156 		{
6157 			struct io_context* context = get_current_io_context(kernel);
6158 
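			// new_fd_etc() presumably allocates the lowest free descriptor
			// slot that is >= `argument`, which matches the POSIX F_DUPFD
			// contract.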
6159 			status = new_fd_etc(context, descriptor, (int)argument);
6160 			if (status >= 0) {
6161 				mutex_lock(&context->io_mutex);
6162 				fd_set_close_on_exec(context, fd, op == F_DUPFD_CLOEXEC);
6163 				mutex_unlock(&context->io_mutex);
6164 
6165 				atomic_add(&descriptor->ref_count, 1);
6166 			}
6167 			break;
6168 		}
6169 
6170 		case F_GETLK:
6171 			if (vnode != NULL) {
6172 				struct flock normalizedLock;
6173 
6174 				memcpy(&normalizedLock, &flock, sizeof(struct flock));
6175 				status = normalize_flock(descriptor, &normalizedLock);
6176 				if (status != B_OK)
6177 					break;
6178 
6179 				if (HAS_FS_CALL(vnode, test_lock)) {
6180 					status = FS_CALL(vnode, test_lock, descriptor->cookie,
6181 						&normalizedLock);
6182 				} else
6183 					status = test_advisory_lock(vnode, &normalizedLock);
6184 				if (status == B_OK) {
6185 					if (normalizedLock.l_type == F_UNLCK) {
6186 						// no conflicting lock found, copy back the same struct
6187 						// we were given except change type to F_UNLCK
6188 						flock.l_type = F_UNLCK;
6189 						status = user_memcpy((struct flock*)argument, &flock,
6190 							sizeof(struct flock));
6191 					} else {
6192 						// a conflicting lock was found, copy back its range and
6193 						// type
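						// (l_len == OFF_MAX appears to be the normalized
						// representation of "until end of file"; convert it
						// back to the 0 that POSIX callers expect)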
6194 						if (normalizedLock.l_len == OFF_MAX)
6195 							normalizedLock.l_len = 0;
6196 
6197 						status = user_memcpy((struct flock*)argument,
6198 							&normalizedLock, sizeof(struct flock));
6199 					}
6200 				}
6201 			} else
6202 				status = B_BAD_VALUE;
6203 			break;
6204 
6205 		case F_SETLK:
6206 		case F_SETLKW:
6207 			status = normalize_flock(descriptor, &flock);
6208 			if (status != B_OK)
6209 				break;
6210 
6211 			if (vnode == NULL) {
6212 				status = B_BAD_VALUE;
6213 			} else if (flock.l_type == F_UNLCK) {
6214 				if (HAS_FS_CALL(vnode, release_lock)) {
6215 					status = FS_CALL(vnode, release_lock, descriptor->cookie,
6216 						&flock);
6217 				} else
6218 					status = release_advisory_lock(vnode, &flock);
6219 			} else {
6220 				// the open mode must match the lock type
6221 				if (((descriptor->open_mode & O_RWMASK) == O_RDONLY
6222 						&& flock.l_type == F_WRLCK)
6223 					|| ((descriptor->open_mode & O_RWMASK) == O_WRONLY
6224 						&& flock.l_type == F_RDLCK))
6225 					status = B_FILE_ERROR;
6226 				else {
6227 					if (HAS_FS_CALL(vnode, acquire_lock)) {
6228 						status = FS_CALL(vnode, acquire_lock,
6229 							descriptor->cookie, &flock, op == F_SETLKW);
6230 					} else {
6231 						status = acquire_advisory_lock(vnode, -1,
6232 							&flock, op == F_SETLKW);
6233 					}
6234 				}
6235 			}
6236 			break;
6237 
6238 		// TODO: add support for more ops?
6239 
6240 		default:
6241 			status = B_BAD_VALUE;
6242 	}
6243 
6244 	put_fd(descriptor);
6245 	return status;
6246 }
6247 
6248 
6249 static status_t
6250 common_sync(int fd, bool kernel)
6251 {
6252 	struct file_descriptor* descriptor;
6253 	struct vnode* vnode;
6254 	status_t status;
6255 
6256 	FUNCTION(("common_sync: entry. fd %d kernel %d\n", fd, kernel));
6257 
6258 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6259 	if (descriptor == NULL)
6260 		return B_FILE_ERROR;
6261 
6262 	if (HAS_FS_CALL(vnode, fsync))
6263 		status = FS_CALL_NO_PARAMS(vnode, fsync);
6264 	else
6265 		status = B_UNSUPPORTED;
6266 
6267 	put_fd(descriptor);
6268 	return status;
6269 }
6270 
6271 
6272 static status_t
6273 common_lock_node(int fd, bool kernel)
6274 {
6275 	struct file_descriptor* descriptor;
6276 	struct vnode* vnode;
6277 
6278 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6279 	if (descriptor == NULL)
6280 		return B_FILE_ERROR;
6281 
6282 	status_t status = B_OK;
6283 
6284 	// We need to set the lock atomically - someone else might set one at
6285 	// the same time
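	// atomic_pointer_test_and_set() acts as a compare-and-swap: it stores
	// the new value only if the current value matches the test value, and
	// returns the previous value (an assumption based on its use here).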
6286 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by, descriptor,
6287 			(file_descriptor*)NULL) != NULL)
6288 		status = B_BUSY;
6289 
6290 	put_fd(descriptor);
6291 	return status;
6292 }
6293 
6294 
6295 static status_t
6296 common_unlock_node(int fd, bool kernel)
6297 {
6298 	struct file_descriptor* descriptor;
6299 	struct vnode* vnode;
6300 
6301 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6302 	if (descriptor == NULL)
6303 		return B_FILE_ERROR;
6304 
6305 	status_t status = B_OK;
6306 
6307 	// We need to clear the lock atomically - someone else might set one at
6308 	// the same time
6309 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
6310 			(file_descriptor*)NULL, descriptor) != descriptor)
6311 		status = B_BAD_VALUE;
6312 
6313 	put_fd(descriptor);
6314 	return status;
6315 }
6316 
6317 
6318 static status_t
6319 common_read_link(int fd, char* path, char* buffer, size_t* _bufferSize,
6320 	bool kernel)
6321 {
6322 	struct vnode* vnode;
6323 	status_t status;
6324 
6325 	status = fd_and_path_to_vnode(fd, path, false, &vnode, NULL, kernel);
6326 	if (status != B_OK)
6327 		return status;
6328 
6329 	if (HAS_FS_CALL(vnode, read_symlink)) {
6330 		status = FS_CALL(vnode, read_symlink, buffer, _bufferSize);
6331 	} else
6332 		status = B_BAD_VALUE;
6333 
6334 	put_vnode(vnode);
6335 	return status;
6336 }
6337 
6338 
6339 static status_t
6340 common_create_symlink(int fd, char* path, const char* toPath, int mode,
6341 	bool kernel)
6342 {
6343 	// path validity checks have to be in the calling function!
6344 	char name[B_FILE_NAME_LENGTH];
6345 	struct vnode* vnode;
6346 	status_t status;
6347 
6348 	FUNCTION(("common_create_symlink(fd = %d, path = %s, toPath = %s, "
6349 		"mode = %d, kernel = %d)\n", fd, path, toPath, mode, kernel));
6350 
6351 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, name, kernel);
6352 	if (status != B_OK)
6353 		return status;
6354 
6355 	if (HAS_FS_CALL(vnode, create_symlink))
6356 		status = FS_CALL(vnode, create_symlink, name, toPath, mode);
6357 	else {
6358 		status = HAS_FS_CALL(vnode, write)
6359 			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6360 	}
6361 
6362 	put_vnode(vnode);
6363 
6364 	return status;
6365 }
6366 
6367 
6368 static status_t
6369 common_create_link(int pathFD, char* path, int toFD, char* toPath,
6370 	bool traverseLeafLink, bool kernel)
6371 {
6372 	// path validity checks have to be in the calling function!
6373 
6374 	FUNCTION(("common_create_link(path = %s, toPath = %s, kernel = %d)\n", path,
6375 		toPath, kernel));
6376 
6377 	char name[B_FILE_NAME_LENGTH];
6378 	struct vnode* directory;
6379 	status_t status = fd_and_path_to_dir_vnode(pathFD, path, &directory, name,
6380 		kernel);
6381 	if (status != B_OK)
6382 		return status;
6383 
6384 	struct vnode* vnode;
6385 	status = fd_and_path_to_vnode(toFD, toPath, traverseLeafLink, &vnode, NULL,
6386 		kernel);
6387 	if (status != B_OK)
6388 		goto err;
6389 
6390 	if (directory->mount != vnode->mount) {
6391 		status = B_CROSS_DEVICE_LINK;
6392 		goto err1;
6393 	}
6394 
6395 	if (HAS_FS_CALL(directory, link))
6396 		status = FS_CALL(directory, link, name, vnode);
6397 	else
6398 		status = B_READ_ONLY_DEVICE;
6399 
6400 err1:
6401 	put_vnode(vnode);
6402 err:
6403 	put_vnode(directory);
6404 
6405 	return status;
6406 }
6407 
6408 
6409 static status_t
6410 common_unlink(int fd, char* path, bool kernel)
6411 {
6412 	char filename[B_FILE_NAME_LENGTH];
6413 	struct vnode* vnode;
6414 	status_t status;
6415 
6416 	FUNCTION(("common_unlink: fd: %d, path '%s', kernel %d\n", fd, path,
6417 		kernel));
6418 
6419 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
6420 	if (status < 0)
6421 		return status;
6422 
6423 	if (HAS_FS_CALL(vnode, unlink))
6424 		status = FS_CALL(vnode, unlink, filename);
6425 	else
6426 		status = B_READ_ONLY_DEVICE;
6427 
6428 	put_vnode(vnode);
6429 
6430 	return status;
6431 }
6432 
6433 
6434 static status_t
6435 common_access(int fd, char* path, int mode, bool effectiveUserGroup, bool kernel)
6436 {
6437 	struct vnode* vnode;
6438 	status_t status;
6439 
6440 	// TODO: honor effectiveUserGroup argument
6441 
6442 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
6443 	if (status != B_OK)
6444 		return status;
6445 
6446 	if (HAS_FS_CALL(vnode, access))
6447 		status = FS_CALL(vnode, access, mode);
6448 	else
6449 		status = B_OK;
6450 
6451 	put_vnode(vnode);
6452 
6453 	return status;
6454 }
6455 
6456 
6457 static status_t
6458 common_rename(int fd, char* path, int newFD, char* newPath, bool kernel)
6459 {
6460 	struct vnode* fromVnode;
6461 	struct vnode* toVnode;
6462 	char fromName[B_FILE_NAME_LENGTH];
6463 	char toName[B_FILE_NAME_LENGTH];
6464 	status_t status;
6465 
6466 	FUNCTION(("common_rename(fd = %d, path = %s, newFD = %d, newPath = %s, "
6467 		"kernel = %d)\n", fd, path, newFD, newPath, kernel));
6468 
6469 	status = fd_and_path_to_dir_vnode(fd, path, &fromVnode, fromName, kernel);
6470 	if (status != B_OK)
6471 		return status;
6472 
6473 	status = fd_and_path_to_dir_vnode(newFD, newPath, &toVnode, toName, kernel);
6474 	if (status != B_OK)
6475 		goto err1;
6476 
6477 	if (fromVnode->device != toVnode->device) {
6478 		status = B_CROSS_DEVICE_LINK;
6479 		goto err2;
6480 	}
6481 
6482 	if (fromName[0] == '\0' || toName[0] == '\0'
6483 		|| !strcmp(fromName, ".") || !strcmp(fromName, "..")
6484 		|| !strcmp(toName, ".") || !strcmp(toName, "..")
6485 		|| (fromVnode == toVnode && !strcmp(fromName, toName))) {
6486 		status = B_BAD_VALUE;
6487 		goto err2;
6488 	}
6489 
6490 	if (HAS_FS_CALL(fromVnode, rename))
6491 		status = FS_CALL(fromVnode, rename, fromName, toVnode, toName);
6492 	else
6493 		status = B_READ_ONLY_DEVICE;
6494 
6495 err2:
6496 	put_vnode(toVnode);
6497 err1:
6498 	put_vnode(fromVnode);
6499 
6500 	return status;
6501 }
6502 
6503 
6504 static status_t
6505 common_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6506 {
6507 	struct vnode* vnode = descriptor->u.vnode;
6508 
6509 	FUNCTION(("common_read_stat: stat %p\n", stat));
6510 
6511 	// TODO: remove this once all file systems properly set them!
6512 	stat->st_crtim.tv_nsec = 0;
6513 	stat->st_ctim.tv_nsec = 0;
6514 	stat->st_mtim.tv_nsec = 0;
6515 	stat->st_atim.tv_nsec = 0;
6516 
6517 	return vfs_stat_vnode(vnode, stat);
6518 }
6519 
6520 
6521 static status_t
6522 common_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6523 	int statMask)
6524 {
6525 	struct vnode* vnode = descriptor->u.vnode;
6526 
6527 	FUNCTION(("common_write_stat(vnode = %p, stat = %p, statMask = %d)\n",
6528 		vnode, stat, statMask));
6529 
6530 	if (!HAS_FS_CALL(vnode, write_stat))
6531 		return B_READ_ONLY_DEVICE;
6532 
6533 	return FS_CALL(vnode, write_stat, stat, statMask);
6534 }
6535 
6536 
6537 static status_t
6538 common_path_read_stat(int fd, char* path, bool traverseLeafLink,
6539 	struct stat* stat, bool kernel)
6540 {
6541 	FUNCTION(("common_path_read_stat: fd: %d, path '%s', stat %p\n", fd, path,
6542 		stat));
6543 
6544 	struct vnode* vnode;
6545 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6546 		NULL, kernel);
6547 	if (status != B_OK)
6548 		return status;
6549 
6550 	status = vfs_stat_vnode(vnode, stat);
6551 
6552 	put_vnode(vnode);
6553 	return status;
6554 }
6555 
6556 
6557 static status_t
6558 common_path_write_stat(int fd, char* path, bool traverseLeafLink,
6559 	const struct stat* stat, int statMask, bool kernel)
6560 {
6561 	FUNCTION(("common_path_write_stat: fd: %d, path '%s', stat %p, stat_mask %d, "
6562 		"kernel %d\n", fd, path, stat, statMask, kernel));
6563 
6564 	struct vnode* vnode;
6565 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6566 		NULL, kernel);
6567 	if (status != B_OK)
6568 		return status;
6569 
6570 	if (HAS_FS_CALL(vnode, write_stat))
6571 		status = FS_CALL(vnode, write_stat, stat, statMask);
6572 	else
6573 		status = B_READ_ONLY_DEVICE;
6574 
6575 	put_vnode(vnode);
6576 
6577 	return status;
6578 }
6579 
6580 
6581 static int
6582 attr_dir_open(int fd, char* path, bool traverseLeafLink, bool kernel)
6583 {
6584 	FUNCTION(("attr_dir_open(fd = %d, path = '%s', kernel = %d)\n", fd, path,
6585 		kernel));
6586 
6587 	struct vnode* vnode;
6588 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6589 		NULL, kernel);
6590 	if (status != B_OK)
6591 		return status;
6592 
6593 	status = open_attr_dir_vnode(vnode, kernel);
6594 	if (status < 0)
6595 		put_vnode(vnode);
6596 
6597 	return status;
6598 }
6599 
6600 
6601 static status_t
6602 attr_dir_close(struct file_descriptor* descriptor)
6603 {
6604 	struct vnode* vnode = descriptor->u.vnode;
6605 
6606 	FUNCTION(("attr_dir_close(descriptor = %p)\n", descriptor));
6607 
6608 	if (HAS_FS_CALL(vnode, close_attr_dir))
6609 		return FS_CALL(vnode, close_attr_dir, descriptor->cookie);
6610 
6611 	return B_OK;
6612 }
6613 
6614 
6615 static void
6616 attr_dir_free_fd(struct file_descriptor* descriptor)
6617 {
6618 	struct vnode* vnode = descriptor->u.vnode;
6619 
6620 	if (vnode != NULL) {
6621 		FS_CALL(vnode, free_attr_dir_cookie, descriptor->cookie);
6622 		put_vnode(vnode);
6623 	}
6624 }
6625 
6626 
6627 static status_t
6628 attr_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6629 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6630 {
6631 	struct vnode* vnode = descriptor->u.vnode;
6632 
6633 	FUNCTION(("attr_dir_read(descriptor = %p)\n", descriptor));
6634 
6635 	if (HAS_FS_CALL(vnode, read_attr_dir))
6636 		return FS_CALL(vnode, read_attr_dir, descriptor->cookie, buffer,
6637 			bufferSize, _count);
6638 
6639 	return B_UNSUPPORTED;
6640 }
6641 
6642 
6643 static status_t
6644 attr_dir_rewind(struct file_descriptor* descriptor)
6645 {
6646 	struct vnode* vnode = descriptor->u.vnode;
6647 
6648 	FUNCTION(("attr_dir_rewind(descriptor = %p)\n", descriptor));
6649 
6650 	if (HAS_FS_CALL(vnode, rewind_attr_dir))
6651 		return FS_CALL(vnode, rewind_attr_dir, descriptor->cookie);
6652 
6653 	return B_UNSUPPORTED;
6654 }
6655 
6656 
6657 static int
6658 attr_create(int fd, char* path, const char* name, uint32 type,
6659 	int openMode, bool kernel)
6660 {
6661 	if (name == NULL || *name == '\0')
6662 		return B_BAD_VALUE;
6663 
6664 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6665 	struct vnode* vnode;
6666 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6667 		kernel);
6668 	if (status != B_OK)
6669 		return status;
6670 
6671 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6672 		status = B_LINK_LIMIT;
6673 		goto err;
6674 	}
6675 
6676 	if (!HAS_FS_CALL(vnode, create_attr)) {
6677 		status = B_READ_ONLY_DEVICE;
6678 		goto err;
6679 	}
6680 
6681 	void* cookie;
6682 	status = FS_CALL(vnode, create_attr, name, type, openMode, &cookie);
6683 	if (status != B_OK)
6684 		goto err;
6685 
6686 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6687 	if (fd >= 0)
6688 		return fd;
6689 
6690 	status = fd;
6691 
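	// get_new_fd() failed, so undo the creation: close and free the cookie,
	// then remove the attribute we just created.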
6692 	FS_CALL(vnode, close_attr, cookie);
6693 	FS_CALL(vnode, free_attr_cookie, cookie);
6694 
6695 	FS_CALL(vnode, remove_attr, name);
6696 
6697 err:
6698 	put_vnode(vnode);
6699 
6700 	return status;
6701 }
6702 
6703 
6704 static int
6705 attr_open(int fd, char* path, const char* name, int openMode, bool kernel)
6706 {
6707 	if (name == NULL || *name == '\0')
6708 		return B_BAD_VALUE;
6709 
6710 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6711 	struct vnode* vnode;
6712 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6713 		kernel);
6714 	if (status != B_OK)
6715 		return status;
6716 
6717 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6718 		status = B_LINK_LIMIT;
6719 		goto err;
6720 	}
6721 
6722 	if (!HAS_FS_CALL(vnode, open_attr)) {
6723 		status = B_UNSUPPORTED;
6724 		goto err;
6725 	}
6726 
6727 	void* cookie;
6728 	status = FS_CALL(vnode, open_attr, name, openMode, &cookie);
6729 	if (status != B_OK)
6730 		goto err;
6731 
6732 	// now we only need a file descriptor for this attribute and we're done
6733 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6734 	if (fd >= 0)
6735 		return fd;
6736 
6737 	status = fd;
6738 
6739 	FS_CALL(vnode, close_attr, cookie);
6740 	FS_CALL(vnode, free_attr_cookie, cookie);
6741 
6742 err:
6743 	put_vnode(vnode);
6744 
6745 	return status;
6746 }
6747 
6748 
6749 static status_t
6750 attr_close(struct file_descriptor* descriptor)
6751 {
6752 	struct vnode* vnode = descriptor->u.vnode;
6753 
6754 	FUNCTION(("attr_close(descriptor = %p)\n", descriptor));
6755 
6756 	if (HAS_FS_CALL(vnode, close_attr))
6757 		return FS_CALL(vnode, close_attr, descriptor->cookie);
6758 
6759 	return B_OK;
6760 }
6761 
6762 
6763 static void
6764 attr_free_fd(struct file_descriptor* descriptor)
6765 {
6766 	struct vnode* vnode = descriptor->u.vnode;
6767 
6768 	if (vnode != NULL) {
6769 		FS_CALL(vnode, free_attr_cookie, descriptor->cookie);
6770 		put_vnode(vnode);
6771 	}
6772 }
6773 
6774 
6775 static status_t
6776 attr_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
6777 	size_t* length)
6778 {
6779 	struct vnode* vnode = descriptor->u.vnode;
6780 
6781 	FUNCTION(("attr_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
6782 		pos, length, *length));
6783 
6784 	if (!HAS_FS_CALL(vnode, read_attr))
6785 		return B_UNSUPPORTED;
6786 
6787 	return FS_CALL(vnode, read_attr, descriptor->cookie, pos, buffer, length);
6788 }
6789 
6790 
6791 static status_t
6792 attr_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
6793 	size_t* length)
6794 {
6795 	struct vnode* vnode = descriptor->u.vnode;
6796 
6797 	FUNCTION(("attr_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
6798 		length));
6799 
6800 	if (!HAS_FS_CALL(vnode, write_attr))
6801 		return B_UNSUPPORTED;
6802 
6803 	return FS_CALL(vnode, write_attr, descriptor->cookie, pos, buffer, length);
6804 }
6805 
6806 
6807 static off_t
6808 attr_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
6809 {
6810 	off_t offset;
6811 
6812 	switch (seekType) {
6813 		case SEEK_SET:
6814 			offset = 0;
6815 			break;
6816 		case SEEK_CUR:
6817 			offset = descriptor->pos;
6818 			break;
6819 		case SEEK_END:
6820 		{
6821 			struct vnode* vnode = descriptor->u.vnode;
6822 			if (!HAS_FS_CALL(vnode, read_attr_stat))
6823 				return B_UNSUPPORTED;
6824 
6825 			struct stat stat;
6826 			status_t status = FS_CALL(vnode, read_attr_stat, descriptor->cookie,
6827 				&stat);
6828 			if (status != B_OK)
6829 				return status;
6830 
6831 			offset = stat.st_size;
6832 			break;
6833 		}
6834 		default:
6835 			return B_BAD_VALUE;
6836 	}
6837 
6838 	// assumes off_t is 64 bits wide
6839 	if (offset > 0 && LONGLONG_MAX - offset < pos)
6840 		return B_BUFFER_OVERFLOW;
6841 
6842 	pos += offset;
6843 	if (pos < 0)
6844 		return B_BAD_VALUE;
6845 
6846 	return descriptor->pos = pos;
6847 }
6848 
6849 
6850 static status_t
6851 attr_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6852 {
6853 	struct vnode* vnode = descriptor->u.vnode;
6854 
6855 	FUNCTION(("attr_read_stat: stat 0x%p\n", stat));
6856 
6857 	if (!HAS_FS_CALL(vnode, read_attr_stat))
6858 		return B_UNSUPPORTED;
6859 
6860 	return FS_CALL(vnode, read_attr_stat, descriptor->cookie, stat);
6861 }
6862 
6863 
6864 static status_t
6865 attr_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6866 	int statMask)
6867 {
6868 	struct vnode* vnode = descriptor->u.vnode;
6869 
6870 	FUNCTION(("attr_write_stat: stat = %p, statMask %d\n", stat, statMask));
6871 
6872 	if (!HAS_FS_CALL(vnode, write_attr_stat))
6873 		return B_READ_ONLY_DEVICE;
6874 
6875 	return FS_CALL(vnode, write_attr_stat, descriptor->cookie, stat, statMask);
6876 }
6877 
6878 
6879 static status_t
6880 attr_remove(int fd, const char* name, bool kernel)
6881 {
6882 	struct file_descriptor* descriptor;
6883 	struct vnode* vnode;
6884 	status_t status;
6885 
6886 	if (name == NULL || *name == '\0')
6887 		return B_BAD_VALUE;
6888 
6889 	FUNCTION(("attr_remove: fd = %d, name = \"%s\", kernel %d\n", fd, name,
6890 		kernel));
6891 
6892 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6893 	if (descriptor == NULL)
6894 		return B_FILE_ERROR;
6895 
6896 	if (HAS_FS_CALL(vnode, remove_attr))
6897 		status = FS_CALL(vnode, remove_attr, name);
6898 	else
6899 		status = B_READ_ONLY_DEVICE;
6900 
6901 	put_fd(descriptor);
6902 
6903 	return status;
6904 }
6905 
6906 
6907 static status_t
6908 attr_rename(int fromFD, const char* fromName, int toFD, const char* toName,
6909 	bool kernel)
6910 {
6911 	struct file_descriptor* fromDescriptor;
6912 	struct file_descriptor* toDescriptor;
6913 	struct vnode* fromVnode;
6914 	struct vnode* toVnode;
6915 	status_t status;
6916 
6917 	if (fromName == NULL || *fromName == '\0' || toName == NULL
6918 		|| *toName == '\0')
6919 		return B_BAD_VALUE;
6920 
6921 	FUNCTION(("attr_rename: from fd = %d, from name = \"%s\", to fd = %d, to "
6922 		"name = \"%s\", kernel %d\n", fromFD, fromName, toFD, toName, kernel));
6923 
6924 	fromDescriptor = get_fd_and_vnode(fromFD, &fromVnode, kernel);
6925 	if (fromDescriptor == NULL)
6926 		return B_FILE_ERROR;
6927 
6928 	toDescriptor = get_fd_and_vnode(toFD, &toVnode, kernel);
6929 	if (toDescriptor == NULL) {
6930 		status = B_FILE_ERROR;
6931 		goto err;
6932 	}
6933 
6934 	// are the files on the same volume?
6935 	if (fromVnode->device != toVnode->device) {
6936 		status = B_CROSS_DEVICE_LINK;
6937 		goto err1;
6938 	}
6939 
6940 	if (HAS_FS_CALL(fromVnode, rename_attr)) {
6941 		status = FS_CALL(fromVnode, rename_attr, fromName, toVnode, toName);
6942 	} else
6943 		status = B_READ_ONLY_DEVICE;
6944 
6945 err1:
6946 	put_fd(toDescriptor);
6947 err:
6948 	put_fd(fromDescriptor);
6949 
6950 	return status;
6951 }
6952 
6953 
6954 static int
6955 index_dir_open(dev_t mountID, bool kernel)
6956 {
6957 	struct fs_mount* mount;
6958 	void* cookie;
6959 
6960 	FUNCTION(("index_dir_open(mountID = %" B_PRId32 ", kernel = %d)\n", mountID,
6961 		kernel));
6962 
6963 	status_t status = get_mount(mountID, &mount);
6964 	if (status != B_OK)
6965 		return status;
6966 
6967 	if (!HAS_FS_MOUNT_CALL(mount, open_index_dir)) {
6968 		status = B_UNSUPPORTED;
6969 		goto error;
6970 	}
6971 
6972 	status = FS_MOUNT_CALL(mount, open_index_dir, &cookie);
6973 	if (status != B_OK)
6974 		goto error;
6975 
6976 	// get fd for the index directory
6977 	int fd;
6978 	fd = get_new_fd(FDTYPE_INDEX_DIR, mount, NULL, cookie, O_CLOEXEC, kernel);
6979 	if (fd >= 0)
6980 		return fd;
6981 
6982 	// something went wrong
6983 	FS_MOUNT_CALL(mount, close_index_dir, cookie);
6984 	FS_MOUNT_CALL(mount, free_index_dir_cookie, cookie);
6985 
6986 	status = fd;
6987 
6988 error:
6989 	put_mount(mount);
6990 	return status;
6991 }
6992 
6993 
6994 static status_t
6995 index_dir_close(struct file_descriptor* descriptor)
6996 {
6997 	struct fs_mount* mount = descriptor->u.mount;
6998 
6999 	FUNCTION(("index_dir_close(descriptor = %p)\n", descriptor));
7000 
7001 	if (HAS_FS_MOUNT_CALL(mount, close_index_dir))
7002 		return FS_MOUNT_CALL(mount, close_index_dir, descriptor->cookie);
7003 
7004 	return B_OK;
7005 }
7006 
7007 
7008 static void
7009 index_dir_free_fd(struct file_descriptor* descriptor)
7010 {
7011 	struct fs_mount* mount = descriptor->u.mount;
7012 
7013 	if (mount != NULL) {
7014 		FS_MOUNT_CALL(mount, free_index_dir_cookie, descriptor->cookie);
7015 		put_mount(mount);
7016 	}
7017 }
7018 
7019 
7020 static status_t
7021 index_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7022 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7023 {
7024 	struct fs_mount* mount = descriptor->u.mount;
7025 
7026 	if (HAS_FS_MOUNT_CALL(mount, read_index_dir)) {
7027 		return FS_MOUNT_CALL(mount, read_index_dir, descriptor->cookie, buffer,
7028 			bufferSize, _count);
7029 	}
7030 
7031 	return B_UNSUPPORTED;
7032 }
7033 
7034 
7035 static status_t
7036 index_dir_rewind(struct file_descriptor* descriptor)
7037 {
7038 	struct fs_mount* mount = descriptor->u.mount;
7039 
7040 	if (HAS_FS_MOUNT_CALL(mount, rewind_index_dir))
7041 		return FS_MOUNT_CALL(mount, rewind_index_dir, descriptor->cookie);
7042 
7043 	return B_UNSUPPORTED;
7044 }
7045 
7046 
7047 static status_t
7048 index_create(dev_t mountID, const char* name, uint32 type, uint32 flags,
7049 	bool kernel)
7050 {
7051 	FUNCTION(("index_create(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7052 		mountID, name, kernel));
7053 
7054 	struct fs_mount* mount;
7055 	status_t status = get_mount(mountID, &mount);
7056 	if (status != B_OK)
7057 		return status;
7058 
7059 	if (!HAS_FS_MOUNT_CALL(mount, create_index)) {
7060 		status = B_READ_ONLY_DEVICE;
7061 		goto out;
7062 	}
7063 
7064 	status = FS_MOUNT_CALL(mount, create_index, name, type, flags);
7065 
7066 out:
7067 	put_mount(mount);
7068 	return status;
7069 }
7070 
7071 
7072 #if 0
7073 static status_t
7074 index_read_stat(struct file_descriptor* descriptor, struct stat* stat)
7075 {
7076 	struct vnode* vnode = descriptor->u.vnode;
7077 
7078 	// TODO: currently unused!
7079 	FUNCTION(("index_read_stat: stat 0x%p\n", stat));
7080 	if (!HAS_FS_CALL(vnode, read_index_stat))
7081 		return B_UNSUPPORTED;
7082 
7083 	return B_UNSUPPORTED;
7084 	//return FS_CALL(vnode, read_index_stat, descriptor->cookie, stat);
7085 }
7086 
7087 
7088 static void
7089 index_free_fd(struct file_descriptor* descriptor)
7090 {
7091 	struct vnode* vnode = descriptor->u.vnode;
7092 
7093 	if (vnode != NULL) {
7094 		FS_CALL(vnode, free_index_cookie, descriptor->cookie);
7095 		put_vnode(vnode);
7096 	}
7097 }
7098 #endif
7099 
7100 
7101 static status_t
7102 index_name_read_stat(dev_t mountID, const char* name, struct stat* stat,
7103 	bool kernel)
7104 {
7105 	FUNCTION(("index_name_read_stat(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7106 		mountID, name, kernel));
7107 
7108 	struct fs_mount* mount;
7109 	status_t status = get_mount(mountID, &mount);
7110 	if (status != B_OK)
7111 		return status;
7112 
7113 	if (!HAS_FS_MOUNT_CALL(mount, read_index_stat)) {
7114 		status = B_UNSUPPORTED;
7115 		goto out;
7116 	}
7117 
7118 	status = FS_MOUNT_CALL(mount, read_index_stat, name, stat);
7119 
7120 out:
7121 	put_mount(mount);
7122 	return status;
7123 }
7124 
7125 
7126 static status_t
7127 index_remove(dev_t mountID, const char* name, bool kernel)
7128 {
7129 	FUNCTION(("index_remove(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7130 		mountID, name, kernel));
7131 
7132 	struct fs_mount* mount;
7133 	status_t status = get_mount(mountID, &mount);
7134 	if (status != B_OK)
7135 		return status;
7136 
7137 	if (!HAS_FS_MOUNT_CALL(mount, remove_index)) {
7138 		status = B_READ_ONLY_DEVICE;
7139 		goto out;
7140 	}
7141 
7142 	status = FS_MOUNT_CALL(mount, remove_index, name);
7143 
7144 out:
7145 	put_mount(mount);
7146 	return status;
7147 }
7148 
7149 
7150 /*!	TODO: the query FS API is still pretty much the same as in R5.
7151 		It would be nice if queries got some more kernel support;
7152 		for example, query parsing should be moved into the kernel.
7153 */
7155 static int
7156 query_open(dev_t device, const char* query, uint32 flags, port_id port,
7157 	int32 token, bool kernel)
7158 {
7159 	struct fs_mount* mount;
7160 	void* cookie;
7161 
7162 	FUNCTION(("query_open(device = %" B_PRId32 ", query = \"%s\", kernel = %d)\n",
7163 		device, query, kernel));
7164 
7165 	status_t status = get_mount(device, &mount);
7166 	if (status != B_OK)
7167 		return status;
7168 
7169 	if (!HAS_FS_MOUNT_CALL(mount, open_query)) {
7170 		status = B_UNSUPPORTED;
7171 		goto error;
7172 	}
7173 
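	// The port/token pair is presumably the target for live query update
	// notifications, as in the BeOS query API; its interpretation is left
	// to the file system.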
7174 	status = FS_MOUNT_CALL(mount, open_query, query, flags, port, token,
7175 		&cookie);
7176 	if (status != B_OK)
7177 		goto error;
7178 
7179 	// get fd for the query
7180 	int fd;
7181 	fd = get_new_fd(FDTYPE_QUERY, mount, NULL, cookie, O_CLOEXEC, kernel);
7182 	if (fd >= 0)
7183 		return fd;
7184 
7185 	status = fd;
7186 
7187 	// something went wrong
7188 	FS_MOUNT_CALL(mount, close_query, cookie);
7189 	FS_MOUNT_CALL(mount, free_query_cookie, cookie);
7190 
7191 error:
7192 	put_mount(mount);
7193 	return status;
7194 }
7195 
7196 
7197 static status_t
7198 query_close(struct file_descriptor* descriptor)
7199 {
7200 	struct fs_mount* mount = descriptor->u.mount;
7201 
7202 	FUNCTION(("query_close(descriptor = %p)\n", descriptor));
7203 
7204 	if (HAS_FS_MOUNT_CALL(mount, close_query))
7205 		return FS_MOUNT_CALL(mount, close_query, descriptor->cookie);
7206 
7207 	return B_OK;
7208 }
7209 
7210 
7211 static void
7212 query_free_fd(struct file_descriptor* descriptor)
7213 {
7214 	struct fs_mount* mount = descriptor->u.mount;
7215 
7216 	if (mount != NULL) {
7217 		FS_MOUNT_CALL(mount, free_query_cookie, descriptor->cookie);
7218 		put_mount(mount);
7219 	}
7220 }
7221 
7222 
7223 static status_t
7224 query_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7225 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7226 {
7227 	struct fs_mount* mount = descriptor->u.mount;
7228 
7229 	if (HAS_FS_MOUNT_CALL(mount, read_query)) {
7230 		return FS_MOUNT_CALL(mount, read_query, descriptor->cookie, buffer,
7231 			bufferSize, _count);
7232 	}
7233 
7234 	return B_UNSUPPORTED;
7235 }
7236 
7237 
7238 static status_t
7239 query_rewind(struct file_descriptor* descriptor)
7240 {
7241 	struct fs_mount* mount = descriptor->u.mount;
7242 
7243 	if (HAS_FS_MOUNT_CALL(mount, rewind_query))
7244 		return FS_MOUNT_CALL(mount, rewind_query, descriptor->cookie);
7245 
7246 	return B_UNSUPPORTED;
7247 }
7248 
7249 
7250 //	#pragma mark - General File System functions
7251 
7252 
7253 static dev_t
7254 fs_mount(char* path, const char* device, const char* fsName, uint32 flags,
7255 	const char* args, bool kernel)
7256 {
7257 	struct ::fs_mount* mount;
7258 	status_t status = B_OK;
7259 	fs_volume* volume = NULL;
7260 	int32 layer = 0;
7261 	Vnode* coveredNode = NULL;
7262 
7263 	FUNCTION(("fs_mount: path = '%s', device = '%s', fs_name = '%s', flags = %#"
7264 		B_PRIx32 ", args = '%s'\n", path, device, fsName, flags, args));
7265 
7266 	// The path is always safe; we just have to make sure that fsName is
7267 	// reasonably valid - we can't make any assumptions about args, though.
7268 	// A NULL fsName is OK if a device was given and the FS is not virtual.
7269 	// We'll get it from the DDM later.
7270 	if (fsName == NULL) {
7271 		if (device == NULL || (flags & B_MOUNT_VIRTUAL_DEVICE) != 0)
7272 			return B_BAD_VALUE;
7273 	} else if (fsName[0] == '\0')
7274 		return B_BAD_VALUE;
7275 
7276 	RecursiveLocker mountOpLocker(sMountOpLock);
7277 
7278 	// Helper to delete a newly created file device on failure.
7279 	// Not exactly beautiful, but helps to keep the code below cleaner.
7280 	struct FileDeviceDeleter {
7281 		FileDeviceDeleter() : id(-1) {}
7282 		~FileDeviceDeleter()
7283 		{
7284 			KDiskDeviceManager::Default()->DeleteFileDevice(id);
7285 		}
7286 
7287 		partition_id id;
7288 	} fileDeviceDeleter;
7289 
7290 	// If the file system is not a "virtual" one, the device argument should
7291 	// point to a real file/device (if given at all).
7292 	// get the partition
7293 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7294 	KPartition* partition = NULL;
7295 	KPath normalizedDevice;
7296 	bool newlyCreatedFileDevice = false;
7297 
7298 	if (!(flags & B_MOUNT_VIRTUAL_DEVICE) && device != NULL) {
7299 		// normalize the device path
7300 		status = normalizedDevice.SetTo(device, true);
7301 		if (status != B_OK)
7302 			return status;
7303 
7304 		// get a corresponding partition from the DDM
7305 		partition = ddm->RegisterPartition(normalizedDevice.Path());
7306 		if (partition == NULL) {
7307 			// Partition not found: This either means the user supplied
7308 			// an invalid path, or the path refers to an image file. We try
7309 			// to let the DDM create a file device for the path.
7310 			partition_id deviceID = ddm->CreateFileDevice(
7311 				normalizedDevice.Path(), &newlyCreatedFileDevice);
7312 			if (deviceID >= 0) {
7313 				partition = ddm->RegisterPartition(deviceID);
7314 				if (newlyCreatedFileDevice)
7315 					fileDeviceDeleter.id = deviceID;
7316 			}
7317 		}
7318 
7319 		if (!partition) {
7320 			TRACE(("fs_mount(): Partition `%s' not found.\n",
7321 				normalizedDevice.Path()));
7322 			return B_ENTRY_NOT_FOUND;
7323 		}
7324 
7325 		device = normalizedDevice.Path();
7326 			// correct path to file device
7327 	}
7328 	PartitionRegistrar partitionRegistrar(partition, true);
7329 
7330 	// Write lock the partition's device. For the time being, we keep the lock
7331 	// until we're done mounting -- not nice, but it ensures that no one is
7332 	// interfering.
7333 	// TODO: Just mark the partition busy while mounting!
7334 	KDiskDevice* diskDevice = NULL;
7335 	if (partition) {
7336 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7337 		if (!diskDevice) {
7338 			TRACE(("fs_mount(): Failed to lock disk device!\n"));
7339 			return B_ERROR;
7340 		}
7341 	}
7342 
7343 	DeviceWriteLocker writeLocker(diskDevice, true);
7344 		// this takes over the write lock acquired before
7345 
7346 	if (partition != NULL) {
7347 		// make sure that the partition is not busy
7348 		if (partition->IsBusy()) {
7349 			TRACE(("fs_mount(): Partition is busy.\n"));
7350 			return B_BUSY;
7351 		}
7352 
7353 		// if no FS name had been supplied, we get it from the partition
7354 		if (fsName == NULL) {
7355 			KDiskSystem* diskSystem = partition->DiskSystem();
7356 			if (!diskSystem) {
7357 				TRACE(("fs_mount(): No FS name was given, and the DDM didn't "
7358 					"recognize it.\n"));
7359 				return B_BAD_VALUE;
7360 			}
7361 
7362 			if (!diskSystem->IsFileSystem()) {
7363 				TRACE(("fs_mount(): No FS name was given, and the DDM found a "
7364 					"partitioning system.\n"));
7365 				return B_BAD_VALUE;
7366 			}
7367 
7368 			// The disk system name will not change, and the KDiskSystem
7369 			// object will not go away while the disk device is locked (and
7370 			// the partition has a reference to it), so this is safe.
7371 			fsName = diskSystem->Name();
7372 		}
7373 	}
7374 
7375 	mount = new(std::nothrow) (struct ::fs_mount);
7376 	if (mount == NULL)
7377 		return B_NO_MEMORY;
7378 
7379 	mount->device_name = strdup(device);
7380 		// "device" can be NULL
7381 
7382 	status = mount->entry_cache.Init();
7383 	if (status != B_OK)
7384 		goto err1;
7385 
7386 	// initialize structure
7387 	mount->id = sNextMountID++;
7388 	mount->partition = NULL;
7389 	mount->root_vnode = NULL;
7390 	mount->covers_vnode = NULL;
7391 	mount->unmounting = false;
7392 	mount->owns_file_device = false;
7393 	mount->volume = NULL;
7394 
7395 	// build up the volume(s)
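	// A file system name may specify a stack of layered file systems. Each
	// layer gets its own fs_volume; the volumes are chained via
	// super_volume/sub_volume, with mount->volume pointing at the topmost
	// layer. get_file_system_name_for_layer() returning NULL past layer 0
	// signals that there are no more layers.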
7396 	while (true) {
7397 		char* layerFSName = get_file_system_name_for_layer(fsName, layer);
7398 		if (layerFSName == NULL) {
7399 			if (layer == 0) {
7400 				status = B_NO_MEMORY;
7401 				goto err1;
7402 			}
7403 
7404 			break;
7405 		}
7406 		MemoryDeleter layerFSNameDeleter(layerFSName);
7407 
7408 		volume = (fs_volume*)malloc(sizeof(fs_volume));
7409 		if (volume == NULL) {
7410 			status = B_NO_MEMORY;
7411 			goto err1;
7412 		}
7413 
7414 		volume->id = mount->id;
7415 		volume->partition = partition != NULL ? partition->ID() : -1;
7416 		volume->layer = layer++;
7417 		volume->private_volume = NULL;
7418 		volume->ops = NULL;
7419 		volume->sub_volume = NULL;
7420 		volume->super_volume = NULL;
7421 		volume->file_system = NULL;
7422 		volume->file_system_name = NULL;
7423 
7424 		volume->file_system_name = get_file_system_name(layerFSName);
7425 		if (volume->file_system_name == NULL) {
7426 			status = B_NO_MEMORY;
7427 			free(volume);
7428 			goto err1;
7429 		}
7430 
7431 		volume->file_system = get_file_system(layerFSName);
7432 		if (volume->file_system == NULL) {
7433 			status = B_DEVICE_NOT_FOUND;
7434 			free(volume->file_system_name);
7435 			free(volume);
7436 			goto err1;
7437 		}
7438 
7439 		if (mount->volume == NULL)
7440 			mount->volume = volume;
7441 		else {
7442 			volume->super_volume = mount->volume;
7443 			mount->volume->sub_volume = volume;
7444 			mount->volume = volume;
7445 		}
7446 	}
7447 
7448 	// insert mount struct into list before we call FS's mount() function
7449 	// so that vnodes can be created for this mount
7450 	mutex_lock(&sMountMutex);
7451 	sMountsTable->Insert(mount);
7452 	mutex_unlock(&sMountMutex);
7453 
7454 	ino_t rootID;
7455 
7456 	if (!sRoot) {
7457 		// we haven't mounted anything yet
7458 		if (strcmp(path, "/") != 0) {
7459 			status = B_ERROR;
7460 			goto err2;
7461 		}
7462 
7463 		status = mount->volume->file_system->mount(mount->volume, device, flags,
7464 			args, &rootID);
7465 		if (status != B_OK)
7466 			goto err2;
7467 	} else {
7468 		status = path_to_vnode(path, true, &coveredNode, NULL, kernel);
7469 		if (status != B_OK)
7470 			goto err2;
7471 
7472 		mount->covers_vnode = coveredNode;
7473 
7474 		// make sure coveredNode is a directory
7475 		if (!S_ISDIR(coveredNode->Type())) {
7476 			status = B_NOT_A_DIRECTORY;
7477 			goto err3;
7478 		}
7479 
7480 		if (coveredNode->IsCovered()) {
7481 			// this is already a covered vnode
7482 			status = B_BUSY;
7483 			goto err3;
7484 		}
7485 
7486 		// mount it/them
7487 		fs_volume* volume = mount->volume;
7488 		while (volume) {
7489 			status = volume->file_system->mount(volume, device, flags, args,
7490 				&rootID);
7491 			if (status != B_OK) {
7492 				if (volume->sub_volume)
7493 					goto err4;
7494 				goto err3;
7495 			}
7496 
7497 			volume = volume->super_volume;
7498 		}
7499 
7500 		volume = mount->volume;
7501 		while (volume) {
7502 			if (volume->ops->all_layers_mounted != NULL)
7503 				volume->ops->all_layers_mounted(volume);
7504 			volume = volume->super_volume;
7505 		}
7506 	}
7507 
7508 	// the root node is supposed to be owned by the file system - it must
7509 	// exist at this point
7510 	mount->root_vnode = lookup_vnode(mount->id, rootID);
7511 	if (mount->root_vnode == NULL || mount->root_vnode->ref_count != 1) {
7512 		panic("fs_mount: file system does not own its root node!\n");
7513 		status = B_ERROR;
7514 		goto err4;
7515 	}
7516 
7517 	// set up the links between the root vnode and the vnode it covers
7518 	rw_lock_write_lock(&sVnodeLock);
7519 	if (coveredNode != NULL) {
7520 		if (coveredNode->IsCovered()) {
7521 			// the vnode is covered now
7522 			status = B_BUSY;
7523 			rw_lock_write_unlock(&sVnodeLock);
7524 			goto err4;
7525 		}
7526 
7527 		mount->root_vnode->covers = coveredNode;
7528 		mount->root_vnode->SetCovering(true);
7529 
7530 		coveredNode->covered_by = mount->root_vnode;
7531 		coveredNode->SetCovered(true);
7532 	}
7533 	rw_lock_write_unlock(&sVnodeLock);
7534 
7535 	if (!sRoot) {
7536 		sRoot = mount->root_vnode;
7537 		mutex_lock(&sIOContextRootLock);
7538 		get_current_io_context(true)->root = sRoot;
7539 		mutex_unlock(&sIOContextRootLock);
7540 		inc_vnode_ref_count(sRoot);
7541 	}
7542 
7543 	// supply the partition (if any) with the mount cookie and mark it mounted
7544 	if (partition) {
7545 		partition->SetMountCookie(mount->volume->private_volume);
7546 		partition->SetVolumeID(mount->id);
7547 
7548 		// keep a partition reference as long as the partition is mounted
7549 		partitionRegistrar.Detach();
7550 		mount->partition = partition;
7551 		mount->owns_file_device = newlyCreatedFileDevice;
7552 		fileDeviceDeleter.id = -1;
7553 	}
7554 
7555 	notify_mount(mount->id,
7556 		coveredNode != NULL ? coveredNode->device : -1,
7557 		coveredNode != NULL ? coveredNode->id : -1);
7558 
7559 	return mount->id;
7560 
7561 err4:
7562 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7563 err3:
7564 	if (coveredNode != NULL)
7565 		put_vnode(coveredNode);
7566 err2:
7567 	mutex_lock(&sMountMutex);
7568 	sMountsTable->Remove(mount);
7569 	mutex_unlock(&sMountMutex);
7570 err1:
7571 	delete mount;
7572 
7573 	return status;
7574 }
7575 
7576 
7577 static status_t
7578 fs_unmount(char* path, dev_t mountID, uint32 flags, bool kernel)
7579 {
7580 	struct fs_mount* mount;
7581 	status_t err;
7582 
7583 	FUNCTION(("fs_unmount(path '%s', dev %" B_PRId32 ", kernel %d\n", path,
7584 		mountID, kernel));
7585 
7586 	struct vnode* pathVnode = NULL;
7587 	if (path != NULL) {
7588 		err = path_to_vnode(path, true, &pathVnode, NULL, kernel);
7589 		if (err != B_OK)
7590 			return B_ENTRY_NOT_FOUND;
7591 	}
7592 
7593 	RecursiveLocker mountOpLocker(sMountOpLock);
7594 
7595 	// This lock is not strictly necessary, but is held in the KDEBUG case
7596 	// to keep the ASSERT in find_mount() working.
7597 	KDEBUG_ONLY(mutex_lock(&sMountMutex));
7598 	mount = find_mount(path != NULL ? pathVnode->device : mountID);
7599 	KDEBUG_ONLY(mutex_unlock(&sMountMutex));
7600 	if (mount == NULL) {
7601 		panic("fs_unmount: find_mount() failed on root vnode @%p of mount\n",
7602 			pathVnode);
7603 	}
7604 
7605 	if (path != NULL) {
7606 		put_vnode(pathVnode);
7607 
7608 		if (mount->root_vnode != pathVnode) {
7609 			// not mountpoint
7610 			return B_BAD_VALUE;
7611 		}
7612 	}
7613 
7614 	// if the volume is associated with a partition, lock the device of the
7615 	// partition as long as we are unmounting
7616 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7617 	KPartition* partition = mount->partition;
7618 	KDiskDevice* diskDevice = NULL;
7619 	if (partition != NULL) {
7620 		if (partition->Device() == NULL) {
7621 			dprintf("fs_unmount(): There is no device!\n");
7622 			return B_ERROR;
7623 		}
7624 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7625 		if (!diskDevice) {
7626 			TRACE(("fs_unmount(): Failed to lock disk device!\n"));
7627 			return B_ERROR;
7628 		}
7629 	}
7630 	DeviceWriteLocker writeLocker(diskDevice, true);
7631 
7632 	// make sure that the partition is not busy
7633 	if (partition != NULL) {
7634 		if ((flags & B_UNMOUNT_BUSY_PARTITION) == 0 && partition->IsBusy()) {
7635 			TRACE(("fs_unmount(): Partition is busy.\n"));
7636 			return B_BUSY;
7637 		}
7638 	}
7639 
7640 	// grab the vnode write lock to keep someone from creating
7641 	// a vnode while we're figuring out if we can continue
7642 	WriteLocker vnodesWriteLocker(&sVnodeLock);
7643 
7644 	bool disconnectedDescriptors = false;
7645 
7646 	while (true) {
7647 		bool busy = false;
7648 
7649 		// cycle through the list of vnodes associated with this mount and
7650 		// make sure none of them is busy or still has references on it
7651 		VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7652 		while (struct vnode* vnode = iterator.Next()) {
7653 			if (vnode->IsBusy()) {
7654 				busy = true;
7655 				break;
7656 			}
7657 
7658 			// check the vnode's ref count -- subtract additional references for
7659 			// covering
7660 			int32 refCount = vnode->ref_count;
7661 			if (vnode->covers != NULL)
7662 				refCount--;
7663 			if (vnode->covered_by != NULL)
7664 				refCount--;
7665 
7666 			if (refCount != 0) {
7667 				// there are still vnodes in use on this mount, so we cannot
7668 				// unmount yet
7669 				busy = true;
7670 				break;
7671 			}
7672 		}
7673 
7674 		if (!busy)
7675 			break;
7676 
7677 		if ((flags & B_FORCE_UNMOUNT) == 0)
7678 			return B_BUSY;
7679 
7680 		if (disconnectedDescriptors) {
7681 			// wait a bit until the last access is finished, and then try again
7682 			vnodesWriteLocker.Unlock();
7683 			snooze(100000);
7684 			// TODO: if there is some kind of bug that prevents the ref counts
7685 			// from getting back to zero, this will fall into an endless loop...
7686 			vnodesWriteLocker.Lock();
7687 			continue;
7688 		}
7689 
7690 		// the file system is still busy - but we're forced to unmount it,
7691 		// so let's disconnect all open file descriptors
7692 
7693 		mount->unmounting = true;
7694 			// prevent new vnodes from being created
7695 
7696 		vnodesWriteLocker.Unlock();
7697 
7698 		disconnect_mount_or_vnode_fds(mount, NULL);
7699 		disconnectedDescriptors = true;
7700 
7701 		vnodesWriteLocker.Lock();
7702 	}
7703 
7704 	// We can safely continue. Mark all of the vnodes busy and put this
7705 	// mount structure into unmounting state. Also undo the vnode
7706 	// covers/covered_by links.
7707 	mount->unmounting = true;
7708 
7709 	VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7710 	while (struct vnode* vnode = iterator.Next()) {
7711 		// Remove all covers/covered_by links from other mounts' nodes to this
7712 		// vnode and adjust the node ref count accordingly. We will release the
7713 		// references to the external vnodes below.
7714 		if (Vnode* coveredNode = vnode->covers) {
7715 			if (Vnode* coveringNode = vnode->covered_by) {
7716 				// We have both covered and covering vnodes, so just remove us
7717 				// from the chain.
7718 				coveredNode->covered_by = coveringNode;
7719 				coveringNode->covers = coveredNode;
7720 				vnode->ref_count -= 2;
7721 
7722 				vnode->covered_by = NULL;
7723 				vnode->covers = NULL;
7724 				vnode->SetCovering(false);
7725 				vnode->SetCovered(false);
7726 			} else {
7727 				// We only have a covered vnode. Remove its link to us.
7728 				coveredNode->covered_by = NULL;
7729 				coveredNode->SetCovered(false);
7730 				vnode->ref_count--;
7731 
7732 				// If the other node is an external vnode, we keep its link
7733 				// around so we can put the reference later on. Otherwise we
7734 				// get rid of it right now.
7735 				if (coveredNode->mount == mount) {
7736 					vnode->covers = NULL;
7737 					coveredNode->ref_count--;
7738 				}
7739 			}
7740 		} else if (Vnode* coveringNode = vnode->covered_by) {
7741 			// We only have a covering vnode. Remove its link to us.
7742 			coveringNode->covers = NULL;
7743 			coveringNode->SetCovering(false);
7744 			vnode->ref_count--;
7745 
7746 			// If the other node is an external vnode, we keep its link
7747 			// around so we can put the reference later on. Otherwise we
7748 			// get rid of it right now.
7749 			if (coveringNode->mount == mount) {
7750 				vnode->covered_by = NULL;
7751 				coveringNode->ref_count--;
7752 			}
7753 		}
7754 
7755 		vnode->SetBusy(true);
7756 		vnode_to_be_freed(vnode);
7757 	}
7758 
7759 	vnodesWriteLocker.Unlock();
7760 
7761 	// Free all vnodes associated with this mount.
7762 	// They will be removed from the mount list by free_vnode(), so
7763 	// we don't have to do that ourselves.
7764 	while (struct vnode* vnode = mount->vnodes.Head()) {
7765 		// Put the references to external covered/covering vnodes we kept above.
7766 		if (Vnode* coveredNode = vnode->covers)
7767 			put_vnode(coveredNode);
7768 		if (Vnode* coveringNode = vnode->covered_by)
7769 			put_vnode(coveringNode);
7770 
7771 		free_vnode(vnode, false);
7772 	}
7773 
7774 	// remove the mount structure from the hash table
7775 	mutex_lock(&sMountMutex);
7776 	sMountsTable->Remove(mount);
7777 	mutex_unlock(&sMountMutex);
7778 
7779 	mountOpLocker.Unlock();
7780 
7781 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7782 	notify_unmount(mount->id);
7783 
7784 	// dereference the partition and mark it unmounted
7785 	if (partition) {
7786 		partition->SetVolumeID(-1);
7787 		partition->SetMountCookie(NULL);
7788 
7789 		if (mount->owns_file_device)
7790 			KDiskDeviceManager::Default()->DeleteFileDevice(partition->ID());
7791 		partition->Unregister();
7792 	}
7793 
7794 	delete mount;
7795 	return B_OK;
7796 }
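
// Example (sketch): a forced unmount as it could be issued from kernel code.
// With B_FORCE_UNMOUNT set, the loop above disconnects all open file
// descriptors instead of failing with B_BUSY:
//
//	status_t status = _kern_unmount("/mount/point", B_FORCE_UNMOUNT);
//	if (status != B_OK)
//		dprintf("unmount failed: %s\n", strerror(status));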
7797 
7798 
7799 static status_t
7800 fs_sync(dev_t device)
7801 {
7802 	struct fs_mount* mount;
7803 	status_t status = get_mount(device, &mount);
7804 	if (status != B_OK)
7805 		return status;
7806 
7807 	struct vnode marker;
7808 	memset(&marker, 0, sizeof(marker));
7809 	marker.SetBusy(true);
7810 	marker.SetRemoved(true);
7811 
7812 	// First, synchronize all file caches
7813 
7814 	while (true) {
7815 		WriteLocker locker(sVnodeLock);
7816 			// Note: That's the easy way, which is probably OK for sync(),
7817 			// since it's a relatively rare call and doesn't need to allow for
7818 			// a lot of concurrency. Using a read lock would be possible, but
7819 			// also more involved, since we would have to lock the individual
7820 			// nodes and take care of the locking order, which we might not
7821 			// want to do while holding fs_mount::rlock.
7822 
7823 		// synchronize access to vnode list
7824 		recursive_lock_lock(&mount->rlock);
7825 
7826 		struct vnode* vnode;
7827 		if (!marker.IsRemoved()) {
7828 			vnode = mount->vnodes.GetNext(&marker);
7829 			mount->vnodes.Remove(&marker);
7830 			marker.SetRemoved(true);
7831 		} else
7832 			vnode = mount->vnodes.First();
7833 
7834 		while (vnode != NULL && (vnode->cache == NULL
7835 			|| vnode->IsRemoved() || vnode->IsBusy())) {
7836 			// TODO: we could track writes (and writable mapped vnodes)
7837 			//	and have a simple flag that we could test for here
7838 			vnode = mount->vnodes.GetNext(vnode);
7839 		}
7840 
7841 		if (vnode != NULL) {
7842 			// insert marker vnode again
7843 			mount->vnodes.Insert(mount->vnodes.GetNext(vnode), &marker);
7844 			marker.SetRemoved(false);
7845 		}
7846 
7847 		recursive_lock_unlock(&mount->rlock);
7848 
7849 		if (vnode == NULL)
7850 			break;
7851 
7852 		vnode = lookup_vnode(mount->id, vnode->id);
7853 		if (vnode == NULL || vnode->IsBusy())
7854 			continue;
7855 
7856 		if (vnode->ref_count == 0) {
7857 			// this vnode has been unused before
7858 			vnode_used(vnode);
7859 		}
7860 		inc_vnode_ref_count(vnode);
7861 
7862 		locker.Unlock();
7863 
7864 		if (vnode->cache != NULL && !vnode->IsRemoved())
7865 			vnode->cache->WriteModified();
7866 
7867 		put_vnode(vnode);
7868 	}
7869 
7870 	// And then, let the file systems do their synchronizing work
7871 
7872 	if (HAS_FS_MOUNT_CALL(mount, sync))
7873 		status = FS_MOUNT_CALL_NO_PARAMS(mount, sync);
7874 
7875 	put_mount(mount);
7876 	return status;
7877 }
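
// The marker vnode above is a recurring pattern for iterating a list whose
// lock has to be dropped while an element is being processed: a dummy node is
// inserted after the current position, the locks are released while the node
// is worked on, and the next pass resumes behind the marker. A minimal sketch
// of the idea (hypothetical list and lock names, error handling omitted):
//
//	list.Insert(list.GetNext(current), &marker);
//	unlock();
//	process(current);
//	lock();
//	current = list.GetNext(&marker);
//	list.Remove(&marker);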
7878 
7879 
7880 static status_t
7881 fs_read_info(dev_t device, struct fs_info* info)
7882 {
7883 	struct fs_mount* mount;
7884 	status_t status = get_mount(device, &mount);
7885 	if (status != B_OK)
7886 		return status;
7887 
7888 	memset(info, 0, sizeof(struct fs_info));
7889 
7890 	if (HAS_FS_MOUNT_CALL(mount, read_fs_info))
7891 		status = FS_MOUNT_CALL(mount, read_fs_info, info);
7892 
7893 	// fill in info the file system doesn't (have to) know about
7894 	if (status == B_OK) {
7895 		info->dev = mount->id;
7896 		info->root = mount->root_vnode->id;
7897 
7898 		fs_volume* volume = mount->volume;
7899 		while (volume->super_volume != NULL)
7900 			volume = volume->super_volume;
7901 
7902 		strlcpy(info->fsh_name, volume->file_system_name,
7903 			sizeof(info->fsh_name));
7904 		if (mount->device_name != NULL) {
7905 			strlcpy(info->device_name, mount->device_name,
7906 				sizeof(info->device_name));
7907 		}
7908 	}
7909 
7910 	// even if the call is not supported by the file system, we still
7911 	// return the parts that we filled in ourselves above
7912 
7913 	put_mount(mount);
7914 	return status;
7915 }
7916 
7917 
7918 static status_t
7919 fs_write_info(dev_t device, const struct fs_info* info, int mask)
7920 {
7921 	struct fs_mount* mount;
7922 	status_t status = get_mount(device, &mount);
7923 	if (status != B_OK)
7924 		return status;
7925 
7926 	if (HAS_FS_MOUNT_CALL(mount, write_fs_info))
7927 		status = FS_MOUNT_CALL(mount, write_fs_info, info, mask);
7928 	else
7929 		status = B_READ_ONLY_DEVICE;
7930 
7931 	put_mount(mount);
7932 	return status;
7933 }
7934 
7935 
7936 static dev_t
7937 fs_next_device(int32* _cookie)
7938 {
7939 	struct fs_mount* mount = NULL;
7940 	dev_t device = *_cookie;
7941 
7942 	mutex_lock(&sMountMutex);
7943 
7944 	// Since device IDs are assigned sequentially, this algorithm
7945 	// works well enough. It makes sure that the device list
7946 	// returned is sorted, and that no device is skipped when an
7947 	// already visited device has been unmounted.
7948 
7949 	while (device < sNextMountID) {
7950 		mount = find_mount(device++);
7951 		if (mount != NULL && mount->volume->private_volume != NULL)
7952 			break;
7953 	}
7954 
7955 	*_cookie = device;
7956 
7957 	if (mount != NULL)
7958 		device = mount->id;
7959 	else
7960 		device = B_BAD_VALUE;
7961 
7962 	mutex_unlock(&sMountMutex);
7963 
7964 	return device;
7965 }
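
// Example (sketch): iterating all mounted volumes with the cookie interface;
// this is what _kern_sync() below does via next_dev():
//
//	int32 cookie = 0;
//	dev_t device;
//	while ((device = fs_next_device(&cookie)) >= 0) {
//		// use the device ID
//	}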
7966 
7967 
7968 ssize_t
7969 fs_read_attr(int fd, const char *attribute, uint32 type, off_t pos,
7970 	void *buffer, size_t readBytes)
7971 {
7972 	int attrFD = attr_open(fd, NULL, attribute, O_RDONLY, true);
7973 	if (attrFD < 0)
7974 		return attrFD;
7975 
7976 	ssize_t bytesRead = _kern_read(attrFD, pos, buffer, readBytes);
7977 
7978 	_kern_close(attrFD);
7979 
7980 	return bytesRead;
7981 }
7982 
7983 
7984 static status_t
7985 get_cwd(char* buffer, size_t size, bool kernel)
7986 {
7987 	// Get current working directory from io context
7988 	struct io_context* context = get_current_io_context(kernel);
7989 	status_t status;
7990 
7991 	FUNCTION(("get_cwd: buf %p, size %ld\n", buffer, size));
7992 
7993 	mutex_lock(&context->io_mutex);
7994 
7995 	struct vnode* vnode = context->cwd;
7996 	if (vnode)
7997 		inc_vnode_ref_count(vnode);
7998 
7999 	mutex_unlock(&context->io_mutex);
8000 
8001 	if (vnode) {
8002 		status = dir_vnode_to_path(vnode, buffer, size, kernel);
8003 		put_vnode(vnode);
8004 	} else
8005 		status = B_ERROR;
8006 
8007 	return status;
8008 }
8009 
8010 
8011 static status_t
8012 set_cwd(int fd, char* path, bool kernel)
8013 {
8014 	struct io_context* context;
8015 	struct vnode* vnode = NULL;
8016 	struct vnode* oldDirectory;
8017 	status_t status;
8018 
8019 	FUNCTION(("set_cwd: path = \'%s\'\n", path));
8020 
8021 	// Get vnode for passed path, and bail if it failed
8022 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
8023 	if (status < 0)
8024 		return status;
8025 
8026 	if (!S_ISDIR(vnode->Type())) {
8027 		// nope, can't cwd to here
8028 		status = B_NOT_A_DIRECTORY;
8029 		goto err;
8030 	}
8031 
8032 	// We need to have the permission to enter the directory, too
8033 	if (HAS_FS_CALL(vnode, access)) {
8034 		status = FS_CALL(vnode, access, X_OK);
8035 		if (status != B_OK)
8036 			goto err;
8037 	}
8038 
8039 	// Get current io context and lock
8040 	context = get_current_io_context(kernel);
8041 	mutex_lock(&context->io_mutex);
8042 
8043 	// save the old current working directory first
8044 	oldDirectory = context->cwd;
8045 	context->cwd = vnode;
8046 
8047 	mutex_unlock(&context->io_mutex);
8048 
8049 	if (oldDirectory)
8050 		put_vnode(oldDirectory);
8051 
8052 	return B_NO_ERROR;
8053 
8054 err:
8055 	put_vnode(vnode);
8056 	return status;
8057 }
8058 
8059 
8060 //	#pragma mark - kernel mirrored syscalls
8061 
8062 
8063 dev_t
8064 _kern_mount(const char* path, const char* device, const char* fsName,
8065 	uint32 flags, const char* args, size_t argsLength)
8066 {
8067 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8068 	if (pathBuffer.InitCheck() != B_OK)
8069 		return B_NO_MEMORY;
8070 
8071 	return fs_mount(pathBuffer.LockBuffer(), device, fsName, flags, args, true);
8072 }
8073 
8074 
8075 status_t
8076 _kern_unmount(const char* path, uint32 flags)
8077 {
8078 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8079 	if (pathBuffer.InitCheck() != B_OK)
8080 		return B_NO_MEMORY;
8081 
8082 	return fs_unmount(pathBuffer.LockBuffer(), -1, flags, true);
8083 }
8084 
8085 
8086 status_t
8087 _kern_read_fs_info(dev_t device, struct fs_info* info)
8088 {
8089 	if (info == NULL)
8090 		return B_BAD_VALUE;
8091 
8092 	return fs_read_info(device, info);
8093 }
8094 
8095 
8096 status_t
8097 _kern_write_fs_info(dev_t device, const struct fs_info* info, int mask)
8098 {
8099 	if (info == NULL)
8100 		return B_BAD_VALUE;
8101 
8102 	return fs_write_info(device, info, mask);
8103 }
8104 
8105 
8106 status_t
8107 _kern_sync(void)
8108 {
8109 	// Note: _kern_sync() is also called from _user_sync()
8110 	int32 cookie = 0;
8111 	dev_t device;
8112 	while ((device = next_dev(&cookie)) >= 0) {
8113 		status_t status = fs_sync(device);
8114 		if (status != B_OK && status != B_BAD_VALUE) {
8115 			dprintf("sync: device %" B_PRIdDEV " couldn't sync: %s\n", device,
8116 				strerror(status));
8117 		}
8118 	}
8119 
8120 	return B_OK;
8121 }
8122 
8123 
8124 dev_t
8125 _kern_next_device(int32* _cookie)
8126 {
8127 	return fs_next_device(_cookie);
8128 }
8129 
8130 
8131 status_t
8132 _kern_get_next_fd_info(team_id teamID, uint32* _cookie, fd_info* info,
8133 	size_t infoSize)
8134 {
8135 	if (infoSize != sizeof(fd_info))
8136 		return B_BAD_VALUE;
8137 
8138 	// get the team
8139 	Team* team = Team::Get(teamID);
8140 	if (team == NULL)
8141 		return B_BAD_TEAM_ID;
8142 	BReference<Team> teamReference(team, true);
8143 
8144 	// now that we have a team reference, its I/O context won't go away
8145 	io_context* context = team->io_context;
8146 	MutexLocker contextLocker(context->io_mutex);
8147 
8148 	uint32 slot = *_cookie;
8149 
8150 	struct file_descriptor* descriptor;
8151 	while (slot < context->table_size
8152 		&& (descriptor = context->fds[slot]) == NULL) {
8153 		slot++;
8154 	}
8155 
8156 	if (slot >= context->table_size)
8157 		return B_ENTRY_NOT_FOUND;
8158 
8159 	info->number = slot;
8160 	info->open_mode = descriptor->open_mode;
8161 
8162 	struct vnode* vnode = fd_vnode(descriptor);
8163 	if (vnode != NULL) {
8164 		info->device = vnode->device;
8165 		info->node = vnode->id;
8166 	} else if (descriptor->u.mount != NULL) {
8167 		info->device = descriptor->u.mount->id;
8168 		info->node = -1;
8169 	}
8170 
8171 	*_cookie = slot + 1;
8172 	return B_OK;
8173 }
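
// Example (sketch, "teamID" being any valid team): enumerating the open file
// descriptors of a team, e.g. for diagnostic output:
//
//	uint32 cookie = 0;
//	fd_info info;
//	while (_kern_get_next_fd_info(teamID, &cookie, &info, sizeof(info))
//			== B_OK) {
//		dprintf("fd %" B_PRId32 ": device %" B_PRIdDEV ", node %" B_PRIdINO
//			"\n", info.number, info.device, info.node);
//	}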
8174 
8175 
8176 int
8177 _kern_open_entry_ref(dev_t device, ino_t inode, const char* name, int openMode,
8178 	int perms)
8179 {
8180 	if ((openMode & O_CREAT) != 0) {
8181 		return file_create_entry_ref(device, inode, name, openMode, perms,
8182 			true);
8183 	}
8184 
8185 	return file_open_entry_ref(device, inode, name, openMode, true);
8186 }
8187 
8188 
8189 /*!	\brief Opens a node specified by a FD + path pair.
8190 
8191 	At least one of \a fd and \a path must be specified.
8192 	If only \a fd is given, the function opens the node identified by this
8193 	FD. If only a path is given, this path is opened. If both are given and
8194 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8195 	of the directory (!) identified by \a fd.
8196 
8197 	\param fd The FD. May be < 0.
8198 	\param path The absolute or relative path. May be \c NULL.
8199 	\param openMode The open mode.
8200 	\return A FD referring to the newly opened node, or an error code,
8201 			if an error occurs.
8202 */
8203 int
8204 _kern_open(int fd, const char* path, int openMode, int perms)
8205 {
8206 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8207 	if (pathBuffer.InitCheck() != B_OK)
8208 		return B_NO_MEMORY;
8209 
8210 	if ((openMode & O_CREAT) != 0)
8211 		return file_create(fd, pathBuffer.LockBuffer(), openMode, perms, true);
8212 
8213 	return file_open(fd, pathBuffer.LockBuffer(), openMode, true);
8214 }
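
// Example (sketch) of the FD + path pair semantics described above: with a
// relative path the FD names the base directory, with an absolute path the
// FD is ignored:
//
//	int dirFD = _kern_open_dir(-1, "/boot/home");
//	int fd = _kern_open(dirFD, "file.txt", O_RDWR | O_CREAT, 0644);
//		// opens (or creates) /boot/home/file.txt
//	int fd2 = _kern_open(dirFD, "/tmp/other.txt", O_RDONLY, 0);
//		// absolute path: dirFD is ignored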
8215 
8216 
8217 /*!	\brief Opens a directory specified by entry_ref or node_ref.
8218 
8219 	The supplied name may be \c NULL, in which case the directory identified
8220 	by \a device and \a inode will be opened. Otherwise \a device and
8221 	\a inode identify the parent directory of the directory to be opened
8222 	and \a name its entry name.
8223 
8224 	\param device If \a name is specified the ID of the device the parent
8225 		   directory of the directory to be opened resides on, otherwise
8226 		   the device of the directory itself.
8227 	\param inode If \a name is specified the node ID of the parent
8228 		   directory of the directory to be opened, otherwise node ID of the
8229 		   directory itself.
8230 	\param name The entry name of the directory to be opened. If \c NULL,
8231 		   the \a device + \a inode pair identify the node to be opened.
8232 	\return The FD of the newly opened directory or an error code, if
8233 			something went wrong.
8234 */
8235 int
8236 _kern_open_dir_entry_ref(dev_t device, ino_t inode, const char* name)
8237 {
8238 	return dir_open_entry_ref(device, inode, name, true);
8239 }
8240 
8241 
8242 /*!	\brief Opens a directory specified by a FD + path pair.
8243 
8244 	At least one of \a fd and \a path must be specified.
8245 	If only \a fd is given, the function opens the directory identified by this
8246 	FD. If only a path is given, this path is opened. If both are given and
8247 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8248 	of the directory (!) identified by \a fd.
8249 
8250 	\param fd The FD. May be < 0.
8251 	\param path The absolute or relative path. May be \c NULL.
8252 	\return A FD referring to the newly opened directory, or an error code,
8253 			if an error occurs.
8254 */
8255 int
8256 _kern_open_dir(int fd, const char* path)
8257 {
8258 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8259 	if (pathBuffer.InitCheck() != B_OK)
8260 		return B_NO_MEMORY;
8261 
8262 	return dir_open(fd, pathBuffer.LockBuffer(), true);
8263 }
8264 
8265 
8266 status_t
8267 _kern_fcntl(int fd, int op, size_t argument)
8268 {
8269 	return common_fcntl(fd, op, argument, true);
8270 }
8271 
8272 
8273 status_t
8274 _kern_fsync(int fd)
8275 {
8276 	return common_sync(fd, true);
8277 }
8278 
8279 
8280 status_t
8281 _kern_lock_node(int fd)
8282 {
8283 	return common_lock_node(fd, true);
8284 }
8285 
8286 
8287 status_t
8288 _kern_unlock_node(int fd)
8289 {
8290 	return common_unlock_node(fd, true);
8291 }
8292 
8293 
8294 status_t
8295 _kern_create_dir_entry_ref(dev_t device, ino_t inode, const char* name,
8296 	int perms)
8297 {
8298 	return dir_create_entry_ref(device, inode, name, perms, true);
8299 }
8300 
8301 
8302 /*!	\brief Creates a directory specified by a FD + path pair.
8303 
8304 	\a path must always be specified (it contains the name of the new directory
8305 	at least). If only a path is given, this path identifies the location at
8306 	which the directory shall be created. If both \a fd and \a path are given
8307 	and the path is absolute, \a fd is ignored; a relative path is reckoned off
8308 	of the directory (!) identified by \a fd.
8309 
8310 	\param fd The FD. May be < 0.
8311 	\param path The absolute or relative path. Must not be \c NULL.
8312 	\param perms The access permissions the new directory shall have.
8313 	\return \c B_OK, if the directory has been created successfully, another
8314 			error code otherwise.
8315 */
8316 status_t
8317 _kern_create_dir(int fd, const char* path, int perms)
8318 {
8319 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8320 	if (pathBuffer.InitCheck() != B_OK)
8321 		return B_NO_MEMORY;
8322 
8323 	return dir_create(fd, pathBuffer.LockBuffer(), perms, true);
8324 }
8325 
8326 
8327 status_t
8328 _kern_remove_dir(int fd, const char* path)
8329 {
8330 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8331 	if (pathBuffer.InitCheck() != B_OK)
8332 		return B_NO_MEMORY;
8333 
8334 	return dir_remove(fd, pathBuffer.LockBuffer(), true);
8335 }
8336 
8337 
8338 /*!	\brief Reads the contents of a symlink referred to by a FD + path pair.
8339 
8340 	At least one of \a fd and \a path must be specified.
8341 	If only \a fd is given, the symlink to be read is the node identified
8342 	by this FD. If only a path is given, this path identifies the
8343 	symlink to be read. If both are given and the path is absolute, \a fd is
8344 	ignored; a relative path is reckoned off of the directory (!) identified
8345 	by \a fd.
8346 	If this function fails with B_BUFFER_OVERFLOW, the \a _bufferSize pointer
8347 	will still be updated to reflect the required buffer size.
8348 
8349 	\param fd The FD. May be < 0.
8350 	\param path The absolute or relative path. May be \c NULL.
8351 	\param buffer The buffer into which the contents of the symlink shall be
8352 		   written.
8353 	\param _bufferSize A pointer to the size of the supplied buffer.
8354 	\return The length of the link on success or an appropriate error code
8355 */
8356 status_t
8357 _kern_read_link(int fd, const char* path, char* buffer, size_t* _bufferSize)
8358 {
8359 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8360 	if (pathBuffer.InitCheck() != B_OK)
8361 		return B_NO_MEMORY;
8362 
8363 	return common_read_link(fd, pathBuffer.LockBuffer(),
8364 		buffer, _bufferSize, true);
8365 }
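
// Example (sketch): reading a symlink, relying on the fact noted above that
// *_bufferSize is updated even when B_BUFFER_OVERFLOW is returned:
//
//	char buffer[B_PATH_NAME_LENGTH];
//	size_t size = sizeof(buffer);
//	status_t status = _kern_read_link(-1, "/boot/home/link", buffer, &size);
//	if (status == B_BUFFER_OVERFLOW) {
//		// "size" now holds the length that would have been needed
//	}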
8366 
8367 
8368 /*!	\brief Creates a symlink specified by a FD + path pair.
8369 
8370 	\a path must always be specified (it contains the name of the new symlink
8371 	at least). If only a path is given, this path identifies the location at
8372 	which the symlink shall be created. If both \a fd and \a path are given and
8373 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8374 	of the directory (!) identified by \a fd.
8375 
8376 	\param fd The FD. May be < 0.
8377 	\param path The absolute or relative path. Must not be \c NULL.
8377 	\param toPath The path the new symlink shall point to.
8378 	\param mode The access permissions the new symlink shall have.
8379 	\return \c B_OK, if the symlink has been created successfully, another
8380 			error code otherwise.
8381 */
8382 status_t
8383 _kern_create_symlink(int fd, const char* path, const char* toPath, int mode)
8384 {
8385 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8386 	if (pathBuffer.InitCheck() != B_OK)
8387 		return B_NO_MEMORY;
8388 
8389 	return common_create_symlink(fd, pathBuffer.LockBuffer(),
8390 		toPath, mode, true);
8391 }
8392 
8393 
8394 status_t
8395 _kern_create_link(int pathFD, const char* path, int toFD, const char* toPath,
8396 	bool traverseLeafLink)
8397 {
8398 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8399 	KPath toPathBuffer(toPath, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8400 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
8401 		return B_NO_MEMORY;
8402 
8403 	return common_create_link(pathFD, pathBuffer.LockBuffer(), toFD,
8404 		toPathBuffer.LockBuffer(), traverseLeafLink, true);
8405 }
8406 
8407 
8408 /*!	\brief Removes an entry specified by a FD + path pair from its directory.
8409 
8410 	\a path must always be specified (it contains at least the name of the entry
8411 	to be deleted). If only a path is given, this path identifies the entry
8412 	directly. If both \a fd and \a path are given and the path is absolute,
8413 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8414 	identified by \a fd.
8415 
8416 	\param fd The FD. May be < 0.
8417 	\param path The absolute or relative path. Must not be \c NULL.
8418 	\return \c B_OK, if the entry has been removed successfully, another
8419 			error code otherwise.
8420 */
8421 status_t
8422 _kern_unlink(int fd, const char* path)
8423 {
8424 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8425 	if (pathBuffer.InitCheck() != B_OK)
8426 		return B_NO_MEMORY;
8427 
8428 	return common_unlink(fd, pathBuffer.LockBuffer(), true);
8429 }
8430 
8431 
8432 /*!	\brief Moves an entry specified by a FD + path pair to an entry specified
8433 		   by another FD + path pair.
8434 
8435 	\a oldPath and \a newPath must always be specified (they contain at least
8436 	the name of the entry). If only a path is given, this path identifies the
8437 	entry directly. If both a FD and a path are given and the path is absolute,
8438 	the FD is ignored; a relative path is reckoned off of the directory (!)
8439 	identified by the respective FD.
8440 
8441 	\param oldFD The FD of the old location. May be < 0.
8442 	\param oldPath The absolute or relative path of the old location. Must not
8443 		   be \c NULL.
8444 	\param newFD The FD of the new location. May be < 0.
8445 	\param newPath The absolute or relative path of the new location. Must not
8446 		   be \c NULL.
8447 	\return \c B_OK, if the entry has been moved successfully, another
8448 			error code otherwise.
8449 */
8450 status_t
8451 _kern_rename(int oldFD, const char* oldPath, int newFD, const char* newPath)
8452 {
8453 	KPath oldPathBuffer(oldPath, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8454 	KPath newPathBuffer(newPath, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8455 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
8456 		return B_NO_MEMORY;
8457 
8458 	return common_rename(oldFD, oldPathBuffer.LockBuffer(),
8459 		newFD, newPathBuffer.LockBuffer(), true);
8460 }
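
// Example (sketch): moving an entry between two directories that are given by
// FDs, using relative paths for both locations:
//
//	int oldDir = _kern_open_dir(-1, "/boot/home");
//	int newDir = _kern_open_dir(-1, "/boot/home/Desktop");
//	status_t status = _kern_rename(oldDir, "notes.txt", newDir, "notes.txt");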
8461 
8462 
8463 status_t
8464 _kern_access(int fd, const char* path, int mode, bool effectiveUserGroup)
8465 {
8466 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8467 	if (pathBuffer.InitCheck() != B_OK)
8468 		return B_NO_MEMORY;
8469 
8470 	return common_access(fd, pathBuffer.LockBuffer(), mode, effectiveUserGroup,
8471 		true);
8472 }
8473 
8474 
8475 /*!	\brief Reads stat data of an entity specified by a FD + path pair.
8476 
8477 	If only \a fd is given, the stat operation associated with the type
8478 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8479 	given, this path identifies the entry for whose node to retrieve the
8480 	stat data. If both \a fd and \a path are given and the path is absolute,
8481 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8482 	identified by \a fd and specifies the entry whose stat data shall be
8483 	retrieved.
8484 
8485 	\param fd The FD. May be < 0.
8486 	\param path The absolute or relative path. May be \c NULL.
8487 	\param traverseLeafLink If \a path is given, \c true specifies that the
8488 		   function shall not stick to symlinks, but traverse them.
8489 	\param stat The buffer the stat data shall be written into.
8490 	\param statSize The size of the supplied stat buffer.
8491 	\return \c B_OK, if the stat data have been read successfully, another
8492 			error code otherwise.
8493 */
8494 status_t
8495 _kern_read_stat(int fd, const char* path, bool traverseLeafLink,
8496 	struct stat* stat, size_t statSize)
8497 {
8498 	struct stat completeStat;
8499 	struct stat* originalStat = NULL;
8500 	status_t status;
8501 
8502 	if (statSize > sizeof(struct stat))
8503 		return B_BAD_VALUE;
8504 
8505 	// this supports different stat extensions
8506 	if (statSize < sizeof(struct stat)) {
8507 		originalStat = stat;
8508 		stat = &completeStat;
8509 	}
8510 
8511 	status = vfs_read_stat(fd, path, traverseLeafLink, stat, true);
8512 
8513 	if (status == B_OK && originalStat != NULL)
8514 		memcpy(originalStat, stat, statSize);
8515 
8516 	return status;
8517 }
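
// Example (sketch): the statSize mechanism above allows callers compiled
// against an older, smaller struct stat to keep working -- the kernel fills
// in a complete structure and copies back only the first statSize bytes:
//
//	struct stat st;
//	status_t status = _kern_read_stat(-1, "/boot/home", true, &st,
//		sizeof(st));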
8518 
8519 
8520 /*!	\brief Writes stat data of an entity specified by a FD + path pair.
8521 
8522 	If only \a fd is given, the stat operation associated with the type
8523 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8524 	given, this path identifies the entry for whose node to write the
8525 	stat data. If both \a fd and \a path are given and the path is absolute,
8526 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8527 	identified by \a fd and specifies the entry whose stat data shall be
8528 	written.
8529 
8530 	\param fd The FD. May be < 0.
8531 	\param path The absolute or relative path. May be \c NULL.
8532 	\param traverseLeafLink If \a path is given, \c true specifies that the
8533 		   function shall not stick to symlinks, but traverse them.
8534 	\param stat The buffer containing the stat data to be written.
8535 	\param statSize The size of the supplied stat buffer.
8536 	\param statMask A mask specifying which parts of the stat data shall be
8537 		   written.
8538 	\return \c B_OK, if the stat data have been written successfully,
8539 			another error code otherwise.
8540 */
8541 status_t
8542 _kern_write_stat(int fd, const char* path, bool traverseLeafLink,
8543 	const struct stat* stat, size_t statSize, int statMask)
8544 {
8545 	struct stat completeStat;
8546 
8547 	if (statSize > sizeof(struct stat))
8548 		return B_BAD_VALUE;
8549 
8550 	// this supports different stat extensions
8551 	if (statSize < sizeof(struct stat)) {
8552 		memset((uint8*)&completeStat + statSize, 0,
8553 			sizeof(struct stat) - statSize);
8554 		memcpy(&completeStat, stat, statSize);
8555 		stat = &completeStat;
8556 	}
8557 
8558 	status_t status;
8559 
8560 	if (path != NULL) {
8561 		// path given: write the stat of the node referred to by (fd, path)
8562 		KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8563 		if (pathBuffer.InitCheck() != B_OK)
8564 			return B_NO_MEMORY;
8565 
8566 		status = common_path_write_stat(fd, pathBuffer.LockBuffer(),
8567 			traverseLeafLink, stat, statMask, true);
8568 	} else {
8569 		// no path given: get the FD and use the FD operation
8570 		struct file_descriptor* descriptor
8571 			= get_fd(get_current_io_context(true), fd);
8572 		if (descriptor == NULL)
8573 			return B_FILE_ERROR;
8574 
8575 		if (descriptor->ops->fd_write_stat)
8576 			status = descriptor->ops->fd_write_stat(descriptor, stat, statMask);
8577 		else
8578 			status = B_UNSUPPORTED;
8579 
8580 		put_fd(descriptor);
8581 	}
8582 
8583 	return status;
8584 }
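
// Example (sketch; assumes the B_STAT_* mask constants from <NodeMonitor.h>):
// only the fields selected by statMask are written, so a chmod()-style
// operation just fills in st_mode and sets the corresponding mask bit:
//
//	struct stat st;
//	st.st_mode = 0644;
//	status_t status = _kern_write_stat(fd, NULL, false, &st, sizeof(st),
//		B_STAT_MODE);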
8585 
8586 
8587 int
8588 _kern_open_attr_dir(int fd, const char* path, bool traverseLeafLink)
8589 {
8590 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8591 	if (pathBuffer.InitCheck() != B_OK)
8592 		return B_NO_MEMORY;
8593 
8594 	return attr_dir_open(fd, pathBuffer.LockBuffer(), traverseLeafLink, true);
8595 }
8596 
8597 
8598 int
8599 _kern_open_attr(int fd, const char* path, const char* name, uint32 type,
8600 	int openMode)
8601 {
8602 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8603 	if (pathBuffer.InitCheck() != B_OK)
8604 		return B_NO_MEMORY;
8605 
8606 	if ((openMode & O_CREAT) != 0) {
8607 		return attr_create(fd, pathBuffer.LockBuffer(), name, type, openMode,
8608 			true);
8609 	}
8610 
8611 	return attr_open(fd, pathBuffer.LockBuffer(), name, openMode, true);
8612 }
8613 
8614 
8615 status_t
8616 _kern_remove_attr(int fd, const char* name)
8617 {
8618 	return attr_remove(fd, name, true);
8619 }
8620 
8621 
8622 status_t
8623 _kern_rename_attr(int fromFile, const char* fromName, int toFile,
8624 	const char* toName)
8625 {
8626 	return attr_rename(fromFile, fromName, toFile, toName, true);
8627 }
8628 
8629 
8630 int
8631 _kern_open_index_dir(dev_t device)
8632 {
8633 	return index_dir_open(device, true);
8634 }
8635 
8636 
8637 status_t
8638 _kern_create_index(dev_t device, const char* name, uint32 type, uint32 flags)
8639 {
8640 	return index_create(device, name, type, flags, true);
8641 }
8642 
8643 
8644 status_t
8645 _kern_read_index_stat(dev_t device, const char* name, struct stat* stat)
8646 {
8647 	return index_name_read_stat(device, name, stat, true);
8648 }
8649 
8650 
8651 status_t
8652 _kern_remove_index(dev_t device, const char* name)
8653 {
8654 	return index_remove(device, name, true);
8655 }
8656 
8657 
8658 status_t
8659 _kern_getcwd(char* buffer, size_t size)
8660 {
8661 	TRACE(("_kern_getcwd: buf %p, %ld\n", buffer, size));
8662 
8663 	// Call vfs to get current working directory
8664 	return get_cwd(buffer, size, true);
8665 }
8666 
8667 
8668 status_t
8669 _kern_setcwd(int fd, const char* path)
8670 {
8671 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8672 	if (pathBuffer.InitCheck() != B_OK)
8673 		return B_NO_MEMORY;
8674 
8675 	return set_cwd(fd, pathBuffer.LockBuffer(), true);
8676 }
8677 
8678 
8679 //	#pragma mark - userland syscalls
8680 
8681 
8682 dev_t
8683 _user_mount(const char* userPath, const char* userDevice,
8684 	const char* userFileSystem, uint32 flags, const char* userArgs,
8685 	size_t argsLength)
8686 {
8687 	char fileSystem[B_FILE_NAME_LENGTH];
8688 	KPath path, device;
8689 	char* args = NULL;
8690 	status_t status;
8691 
8692 	if (!IS_USER_ADDRESS(userPath)
8693 		|| !IS_USER_ADDRESS(userFileSystem)
8694 		|| !IS_USER_ADDRESS(userDevice))
8695 		return B_BAD_ADDRESS;
8696 
8697 	if (path.InitCheck() != B_OK || device.InitCheck() != B_OK)
8698 		return B_NO_MEMORY;
8699 
8700 	if (user_strlcpy(path.LockBuffer(), userPath, B_PATH_NAME_LENGTH) < B_OK)
8701 		return B_BAD_ADDRESS;
8702 
8703 	if (userFileSystem != NULL
8704 		&& user_strlcpy(fileSystem, userFileSystem, sizeof(fileSystem)) < B_OK)
8705 		return B_BAD_ADDRESS;
8706 
8707 	if (userDevice != NULL
8708 		&& user_strlcpy(device.LockBuffer(), userDevice, B_PATH_NAME_LENGTH)
8709 			< B_OK)
8710 		return B_BAD_ADDRESS;
8711 
8712 	if (userArgs != NULL && argsLength > 0) {
8713 		if (!IS_USER_ADDRESS(userArgs))
8714 			return B_BAD_ADDRESS;
8715 
8716 		// this is a safety restriction
8717 		if (argsLength >= 65536)
8718 			return B_NAME_TOO_LONG;
8719 
8720 		args = (char*)malloc(argsLength + 1);
8721 		if (args == NULL)
8722 			return B_NO_MEMORY;
8723 
8724 		if (user_strlcpy(args, userArgs, argsLength + 1) < B_OK) {
8725 			free(args);
8726 			return B_BAD_ADDRESS;
8727 		}
8728 	}
8729 	path.UnlockBuffer();
8730 	device.UnlockBuffer();
8731 
8732 	status = fs_mount(path.LockBuffer(),
8733 		userDevice != NULL ? device.Path() : NULL,
8734 		userFileSystem ? fileSystem : NULL, flags, args, false);
8735 
8736 	free(args);
8737 	return status;
8738 }
8739 
8740 
8741 status_t
8742 _user_unmount(const char* userPath, uint32 flags)
8743 {
8744 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
8745 
8746 	if (!IS_USER_ADDRESS(userPath))
8747 		return B_BAD_ADDRESS;
8748 
8749 	if (pathBuffer.InitCheck() != B_OK)
8750 		return B_NO_MEMORY;
8751 
8752 	char* path = pathBuffer.LockBuffer();
8753 
8754 	if (user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
8755 		return B_BAD_ADDRESS;
8756 
8757 	return fs_unmount(path, -1, flags & ~B_UNMOUNT_BUSY_PARTITION, false);
8758 }
8759 
8760 
8761 status_t
8762 _user_read_fs_info(dev_t device, struct fs_info* userInfo)
8763 {
8764 	struct fs_info info;
8765 	status_t status;
8766 
8767 	if (userInfo == NULL)
8768 		return B_BAD_VALUE;
8769 
8770 	if (!IS_USER_ADDRESS(userInfo))
8771 		return B_BAD_ADDRESS;
8772 
8773 	status = fs_read_info(device, &info);
8774 	if (status != B_OK)
8775 		return status;
8776 
8777 	if (user_memcpy(userInfo, &info, sizeof(struct fs_info)) != B_OK)
8778 		return B_BAD_ADDRESS;
8779 
8780 	return B_OK;
8781 }
8782 
8783 
8784 status_t
8785 _user_write_fs_info(dev_t device, const struct fs_info* userInfo, int mask)
8786 {
8787 	struct fs_info info;
8788 
8789 	if (userInfo == NULL)
8790 		return B_BAD_VALUE;
8791 
8792 	if (!IS_USER_ADDRESS(userInfo)
8793 		|| user_memcpy(&info, userInfo, sizeof(struct fs_info)) != B_OK)
8794 		return B_BAD_ADDRESS;
8795 
8796 	return fs_write_info(device, &info, mask);
8797 }
8798 
8799 
8800 dev_t
8801 _user_next_device(int32* _userCookie)
8802 {
8803 	int32 cookie;
8804 	dev_t device;
8805 
8806 	if (!IS_USER_ADDRESS(_userCookie)
8807 		|| user_memcpy(&cookie, _userCookie, sizeof(int32)) != B_OK)
8808 		return B_BAD_ADDRESS;
8809 
8810 	device = fs_next_device(&cookie);
8811 
8812 	if (device >= B_OK) {
8813 		// update user cookie
8814 		if (user_memcpy(_userCookie, &cookie, sizeof(int32)) != B_OK)
8815 			return B_BAD_ADDRESS;
8816 	}
8817 
8818 	return device;
8819 }
8820 
8821 
8822 status_t
8823 _user_sync(void)
8824 {
8825 	return _kern_sync();
8826 }
8827 
8828 
8829 status_t
8830 _user_get_next_fd_info(team_id team, uint32* userCookie, fd_info* userInfo,
8831 	size_t infoSize)
8832 {
8833 	struct fd_info info;
8834 	uint32 cookie;
8835 
8836 	// only root can do this (or should root's group be enough?)
8837 	if (geteuid() != 0)
8838 		return B_NOT_ALLOWED;
8839 
8840 	if (infoSize != sizeof(fd_info))
8841 		return B_BAD_VALUE;
8842 
8843 	if (!IS_USER_ADDRESS(userCookie) || !IS_USER_ADDRESS(userInfo)
8844 		|| user_memcpy(&cookie, userCookie, sizeof(uint32)) != B_OK)
8845 		return B_BAD_ADDRESS;
8846 
8847 	status_t status = _kern_get_next_fd_info(team, &cookie, &info, infoSize);
8848 	if (status != B_OK)
8849 		return status;
8850 
8851 	if (user_memcpy(userCookie, &cookie, sizeof(uint32)) != B_OK
8852 		|| user_memcpy(userInfo, &info, infoSize) != B_OK)
8853 		return B_BAD_ADDRESS;
8854 
8855 	return status;
8856 }
8857 
8858 
8859 status_t
8860 _user_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
8861 	char* userPath, size_t pathLength)
8862 {
8863 	if (!IS_USER_ADDRESS(userPath))
8864 		return B_BAD_ADDRESS;
8865 
8866 	KPath path(B_PATH_NAME_LENGTH + 1);
8867 	if (path.InitCheck() != B_OK)
8868 		return B_NO_MEMORY;
8869 
8870 	// copy the leaf name onto the stack
8871 	char stackLeaf[B_FILE_NAME_LENGTH];
8872 	if (leaf != NULL) {
8873 		if (!IS_USER_ADDRESS(leaf))
8874 			return B_BAD_ADDRESS;
8875 
8876 		int length = user_strlcpy(stackLeaf, leaf, B_FILE_NAME_LENGTH);
8877 		if (length < 0)
8878 			return length;
8879 		if (length >= B_FILE_NAME_LENGTH)
8880 			return B_NAME_TOO_LONG;
8881 
8882 		leaf = stackLeaf;
8883 	}
8884 
8885 	status_t status = vfs_entry_ref_to_path(device, inode, leaf,
8886 		false, path.LockBuffer(), path.BufferSize());
8887 	if (status != B_OK)
8888 		return status;
8889 
8890 	path.UnlockBuffer();
8891 
8892 	int length = user_strlcpy(userPath, path.Path(), pathLength);
8893 	if (length < 0)
8894 		return length;
8895 	if (length >= (int)pathLength)
8896 		return B_BUFFER_OVERFLOW;
8897 
8898 	return B_OK;
8899 }
8900 
8901 
8902 status_t
8903 _user_normalize_path(const char* userPath, bool traverseLink, char* buffer)
8904 {
8905 	if (userPath == NULL || buffer == NULL)
8906 		return B_BAD_VALUE;
8907 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(buffer))
8908 		return B_BAD_ADDRESS;
8909 
8910 	// copy path from userland
8911 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
8912 	if (pathBuffer.InitCheck() != B_OK)
8913 		return B_NO_MEMORY;
8914 	char* path = pathBuffer.LockBuffer();
8915 
8916 	if (user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
8917 		return B_BAD_ADDRESS;
8918 
8919 	status_t error = normalize_path(path, pathBuffer.BufferSize(), traverseLink,
8920 		false);
8921 	if (error != B_OK)
8922 		return error;
8923 
8924 	// copy back to userland
8925 	int len = user_strlcpy(buffer, path, B_PATH_NAME_LENGTH);
8926 	if (len < 0)
8927 		return len;
8928 	if (len >= B_PATH_NAME_LENGTH)
8929 		return B_BUFFER_OVERFLOW;
8930 
8931 	return B_OK;
8932 }
8933 
8934 
8935 int
8936 _user_open_entry_ref(dev_t device, ino_t inode, const char* userName,
8937 	int openMode, int perms)
8938 {
8939 	char name[B_FILE_NAME_LENGTH];
8940 
8941 	if (userName == NULL || device < 0 || inode < 0)
8942 		return B_BAD_VALUE;
8943 	if (!IS_USER_ADDRESS(userName)
8944 		|| user_strlcpy(name, userName, sizeof(name)) < B_OK)
8945 		return B_BAD_ADDRESS;
8946 
8947 	if ((openMode & O_CREAT) != 0) {
8948 		return file_create_entry_ref(device, inode, name, openMode, perms,
8949 			false);
8950 	}
8951 
8952 	return file_open_entry_ref(device, inode, name, openMode, false);
8953 }
8954 
8955 
8956 int
8957 _user_open(int fd, const char* userPath, int openMode, int perms)
8958 {
8959 	KPath path(B_PATH_NAME_LENGTH + 1);
8960 	if (path.InitCheck() != B_OK)
8961 		return B_NO_MEMORY;
8962 
8963 	char* buffer = path.LockBuffer();
8964 
8965 	if (!IS_USER_ADDRESS(userPath)
8966 		|| user_strlcpy(buffer, userPath, B_PATH_NAME_LENGTH) < B_OK)
8967 		return B_BAD_ADDRESS;
8968 
8969 	if ((openMode & O_CREAT) != 0)
8970 		return file_create(fd, buffer, openMode, perms, false);
8971 
8972 	return file_open(fd, buffer, openMode, false);
8973 }
8974 
8975 
8976 int
8977 _user_open_dir_entry_ref(dev_t device, ino_t inode, const char* userName)
8978 {
8979 	if (userName != NULL) {
8980 		char name[B_FILE_NAME_LENGTH];
8981 
8982 		if (!IS_USER_ADDRESS(userName)
8983 			|| user_strlcpy(name, userName, sizeof(name)) < B_OK)
8984 			return B_BAD_ADDRESS;
8985 
8986 		return dir_open_entry_ref(device, inode, name, false);
8987 	}
8988 	return dir_open_entry_ref(device, inode, NULL, false);
8989 }
8990 
8991 
8992 int
8993 _user_open_dir(int fd, const char* userPath)
8994 {
8995 	if (userPath == NULL)
8996 		return dir_open(fd, NULL, false);
8997 
8998 	KPath path(B_PATH_NAME_LENGTH + 1);
8999 	if (path.InitCheck() != B_OK)
9000 		return B_NO_MEMORY;
9001 
9002 	char* buffer = path.LockBuffer();
9003 
9004 	if (!IS_USER_ADDRESS(userPath)
9005 		|| user_strlcpy(buffer, userPath, B_PATH_NAME_LENGTH) < B_OK)
9006 		return B_BAD_ADDRESS;
9007 
9008 	return dir_open(fd, buffer, false);
9009 }
9010 
9011 
9012 /*!	\brief Opens a directory's parent directory and returns the entry name
9013 		   of the former.
9014 
9015 	Aside from the fact that it returns the directory's entry name, this
9016 	method is equivalent to \code _user_open_dir(fd, "..") \endcode. It
9017 	really is equivalent if \a userName is \c NULL.
9018 
9019 	If a name buffer is supplied and the name does not fit the buffer, the
9020 	function fails. A buffer of size \c B_FILE_NAME_LENGTH should be safe.
9021 
9022 	\param fd A FD referring to a directory.
9023 	\param userName Buffer the directory's entry name shall be written into.
9024 		   May be \c NULL.
9025 	\param nameLength Size of the name buffer.
9026 	\return The file descriptor of the opened parent directory, if everything
9027 			went fine, an error code otherwise.
9028 */
9029 int
9030 _user_open_parent_dir(int fd, char* userName, size_t nameLength)
9031 {
9032 	bool kernel = false;
9033 
9034 	if (userName && !IS_USER_ADDRESS(userName))
9035 		return B_BAD_ADDRESS;
9036 
9037 	// open the parent dir
9038 	int parentFD = dir_open(fd, (char*)"..", kernel);
9039 	if (parentFD < 0)
9040 		return parentFD;
9041 	FDCloser fdCloser(parentFD, kernel);
9042 
9043 	if (userName) {
9044 		// get the vnodes
9045 		struct vnode* parentVNode = get_vnode_from_fd(parentFD, kernel);
9046 		struct vnode* dirVNode = get_vnode_from_fd(fd, kernel);
9047 		VNodePutter parentVNodePutter(parentVNode);
9048 		VNodePutter dirVNodePutter(dirVNode);
9049 		if (!parentVNode || !dirVNode)
9050 			return B_FILE_ERROR;
9051 
9052 		// get the vnode name
9053 		char _buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
9054 		struct dirent* buffer = (struct dirent*)_buffer;
9055 		status_t status = get_vnode_name(dirVNode, parentVNode, buffer,
9056 			sizeof(_buffer), get_current_io_context(false));
9057 		if (status != B_OK)
9058 			return status;
9059 
9060 		// copy the name to the userland buffer
9061 		int len = user_strlcpy(userName, buffer->d_name, nameLength);
9062 		if (len < 0)
9063 			return len;
9064 		if (len >= (int)nameLength)
9065 			return B_BUFFER_OVERFLOW;
9066 	}
9067 
9068 	return fdCloser.Detach();
9069 }
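
// Example (sketch, from the point of view of a userland caller invoking this
// syscall): climbing one level up from a directory FD while learning the name
// of the directory we came from:
//
//	char name[B_FILE_NAME_LENGTH];
//	int parentFD = _user_open_parent_dir(fd, name, sizeof(name));
//	// on success, "name" holds the entry name of "fd" inside its parent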
9070 
9071 
9072 status_t
9073 _user_fcntl(int fd, int op, size_t argument)
9074 {
9075 	status_t status = common_fcntl(fd, op, argument, false);
9076 	if (op == F_SETLKW)
9077 		syscall_restart_handle_post(status);
9078 
9079 	return status;
9080 }
9081 
9082 
9083 status_t
9084 _user_fsync(int fd)
9085 {
9086 	return common_sync(fd, false);
9087 }
9088 
9089 
9090 status_t
9091 _user_flock(int fd, int operation)
9092 {
9093 	FUNCTION(("_user_fcntl(fd = %d, op = %d)\n", fd, operation));
9094 
9095 	// Check if the operation is valid
9096 	switch (operation & ~LOCK_NB) {
9097 		case LOCK_UN:
9098 		case LOCK_SH:
9099 		case LOCK_EX:
9100 			break;
9101 
9102 		default:
9103 			return B_BAD_VALUE;
9104 	}
9105 
9106 	struct file_descriptor* descriptor;
9107 	struct vnode* vnode;
9108 	descriptor = get_fd_and_vnode(fd, &vnode, false);
9109 	if (descriptor == NULL)
9110 		return B_FILE_ERROR;
9111 
9112 	if (descriptor->type != FDTYPE_FILE) {
9113 		put_fd(descriptor);
9114 		return B_BAD_VALUE;
9115 	}
9116 
9117 	struct flock flock;
9118 	flock.l_start = 0;
9119 	flock.l_len = OFF_MAX;
9120 	flock.l_whence = 0;
9121 	flock.l_type = (operation & LOCK_SH) != 0 ? F_RDLCK : F_WRLCK;
9122 
9123 	status_t status;
9124 	if ((operation & LOCK_UN) != 0)
9125 		status = release_advisory_lock(vnode, &flock);
9126 	else {
9127 		status = acquire_advisory_lock(vnode,
9128 			thread_get_current_thread()->team->session_id, &flock,
9129 			(operation & LOCK_NB) == 0);
9130 	}
9131 
9132 	syscall_restart_handle_post(status);
9133 
9134 	put_fd(descriptor);
9135 	return status;
9136 }
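
// The mapping above expresses flock()'s whole-file semantics in terms of the
// advisory byte-range locks also used for fcntl(F_SETLK): LOCK_SH becomes a
// shared read lock and LOCK_EX an exclusive write lock covering the range
// [0, OFF_MAX]. From userland this could look like (sketch):
//
//	if (flock(fd, LOCK_SH | LOCK_NB) != 0) {
//		// somebody else holds an exclusive lock
//	}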
9137 
9138 
9139 status_t
9140 _user_lock_node(int fd)
9141 {
9142 	return common_lock_node(fd, false);
9143 }
9144 
9145 
9146 status_t
9147 _user_unlock_node(int fd)
9148 {
9149 	return common_unlock_node(fd, false);
9150 }
9151 
9152 
9153 status_t
9154 _user_create_dir_entry_ref(dev_t device, ino_t inode, const char* userName,
9155 	int perms)
9156 {
9157 	char name[B_FILE_NAME_LENGTH];
9158 	status_t status;
9159 
9160 	if (!IS_USER_ADDRESS(userName))
9161 		return B_BAD_ADDRESS;
9162 
9163 	status = user_strlcpy(name, userName, sizeof(name));
9164 	if (status < 0)
9165 		return status;
9166 
9167 	return dir_create_entry_ref(device, inode, name, perms, false);
9168 }
9169 
9170 
9171 status_t
9172 _user_create_dir(int fd, const char* userPath, int perms)
9173 {
9174 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9175 	if (pathBuffer.InitCheck() != B_OK)
9176 		return B_NO_MEMORY;
9177 
9178 	char* path = pathBuffer.LockBuffer();
9179 
9180 	if (!IS_USER_ADDRESS(userPath)
9181 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9182 		return B_BAD_ADDRESS;
9183 
9184 	return dir_create(fd, path, perms, false);
9185 }
9186 
9187 
9188 status_t
9189 _user_remove_dir(int fd, const char* userPath)
9190 {
9191 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9192 	if (pathBuffer.InitCheck() != B_OK)
9193 		return B_NO_MEMORY;
9194 
9195 	char* path = pathBuffer.LockBuffer();
9196 
9197 	if (userPath != NULL) {
9198 		if (!IS_USER_ADDRESS(userPath)
9199 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9200 			return B_BAD_ADDRESS;
9201 	}
9202 
9203 	return dir_remove(fd, userPath ? path : NULL, false);
9204 }
9205 
9206 
9207 status_t
9208 _user_read_link(int fd, const char* userPath, char* userBuffer,
9209 	size_t* userBufferSize)
9210 {
9211 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1), linkBuffer;
9212 	if (pathBuffer.InitCheck() != B_OK || linkBuffer.InitCheck() != B_OK)
9213 		return B_NO_MEMORY;
9214 
9215 	size_t bufferSize;
9216 
9217 	if (!IS_USER_ADDRESS(userBuffer) || !IS_USER_ADDRESS(userBufferSize)
9218 		|| user_memcpy(&bufferSize, userBufferSize, sizeof(size_t)) != B_OK)
9219 		return B_BAD_ADDRESS;
9220 
9221 	char* path = pathBuffer.LockBuffer();
9222 	char* buffer = linkBuffer.LockBuffer();
9223 
9224 	if (userPath) {
9225 		if (!IS_USER_ADDRESS(userPath)
9226 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9227 			return B_BAD_ADDRESS;
9228 
9229 		if (bufferSize > B_PATH_NAME_LENGTH)
9230 			bufferSize = B_PATH_NAME_LENGTH;
9231 	}
9232 
9233 	status_t status = common_read_link(fd, userPath ? path : NULL, buffer,
9234 		&bufferSize, false);
9235 
9236 	// we also update the bufferSize in case of errors
9237 	// (the real length will be returned in case of B_BUFFER_OVERFLOW)
9238 	if (user_memcpy(userBufferSize, &bufferSize, sizeof(size_t)) != B_OK)
9239 		return B_BAD_ADDRESS;
9240 
9241 	if (status != B_OK)
9242 		return status;
9243 
9244 	if (user_memcpy(userBuffer, buffer, bufferSize) != B_OK)
9245 		return B_BAD_ADDRESS;
9246 
9247 	return B_OK;
9248 }
9249 
9250 
9251 status_t
9252 _user_create_symlink(int fd, const char* userPath, const char* userToPath,
9253 	int mode)
9254 {
9255 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9256 	KPath toPathBuffer(B_PATH_NAME_LENGTH + 1);
9257 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9258 		return B_NO_MEMORY;
9259 
9260 	char* path = pathBuffer.LockBuffer();
9261 	char* toPath = toPathBuffer.LockBuffer();
9262 
9263 	if (!IS_USER_ADDRESS(userPath)
9264 		|| !IS_USER_ADDRESS(userToPath)
9265 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK
9266 		|| user_strlcpy(toPath, userToPath, B_PATH_NAME_LENGTH) < B_OK)
9267 		return B_BAD_ADDRESS;
9268 
9269 	return common_create_symlink(fd, path, toPath, mode, false);
9270 }
9271 
9272 
9273 status_t
9274 _user_create_link(int pathFD, const char* userPath, int toFD,
9275 	const char* userToPath, bool traverseLeafLink)
9276 {
9277 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9278 	KPath toPathBuffer(B_PATH_NAME_LENGTH + 1);
9279 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9280 		return B_NO_MEMORY;
9281 
9282 	char* path = pathBuffer.LockBuffer();
9283 	char* toPath = toPathBuffer.LockBuffer();
9284 
9285 	if (!IS_USER_ADDRESS(userPath)
9286 		|| !IS_USER_ADDRESS(userToPath)
9287 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK
9288 		|| user_strlcpy(toPath, userToPath, B_PATH_NAME_LENGTH) < B_OK)
9289 		return B_BAD_ADDRESS;
9290 
9291 	status_t status = check_path(toPath);
9292 	if (status != B_OK)
9293 		return status;
9294 
9295 	return common_create_link(pathFD, path, toFD, toPath, traverseLeafLink,
9296 		false);
9297 }
9298 
9299 
9300 status_t
9301 _user_unlink(int fd, const char* userPath)
9302 {
9303 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9304 	if (pathBuffer.InitCheck() != B_OK)
9305 		return B_NO_MEMORY;
9306 
9307 	char* path = pathBuffer.LockBuffer();
9308 
9309 	if (!IS_USER_ADDRESS(userPath)
9310 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9311 		return B_BAD_ADDRESS;
9312 
9313 	return common_unlink(fd, path, false);
9314 }
9315 
9316 
9317 status_t
9318 _user_rename(int oldFD, const char* userOldPath, int newFD,
9319 	const char* userNewPath)
9320 {
9321 	KPath oldPathBuffer(B_PATH_NAME_LENGTH + 1);
9322 	KPath newPathBuffer(B_PATH_NAME_LENGTH + 1);
9323 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
9324 		return B_NO_MEMORY;
9325 
9326 	char* oldPath = oldPathBuffer.LockBuffer();
9327 	char* newPath = newPathBuffer.LockBuffer();
9328 
9329 	if (!IS_USER_ADDRESS(userOldPath) || !IS_USER_ADDRESS(userNewPath)
9330 		|| user_strlcpy(oldPath, userOldPath, B_PATH_NAME_LENGTH) < B_OK
9331 		|| user_strlcpy(newPath, userNewPath, B_PATH_NAME_LENGTH) < B_OK)
9332 		return B_BAD_ADDRESS;
9333 
9334 	return common_rename(oldFD, oldPath, newFD, newPath, false);
9335 }
9336 
9337 
9338 status_t
9339 _user_create_fifo(int fd, const char* userPath, mode_t perms)
9340 {
9341 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9342 	if (pathBuffer.InitCheck() != B_OK)
9343 		return B_NO_MEMORY;
9344 
9345 	char* path = pathBuffer.LockBuffer();
9346 
9347 	if (!IS_USER_ADDRESS(userPath)
9348 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK) {
9349 		return B_BAD_ADDRESS;
9350 	}
9351 
9352 	// split into directory vnode and filename path
9353 	char filename[B_FILE_NAME_LENGTH];
9354 	struct vnode* dir;
9355 	status_t status = fd_and_path_to_dir_vnode(fd, path, &dir, filename, false);
9356 	if (status != B_OK)
9357 		return status;
9358 
9359 	VNodePutter _(dir);
9360 
9361 	// the underlying FS needs to support creating FIFOs
9362 	if (!HAS_FS_CALL(dir, create_special_node))
9363 		return B_UNSUPPORTED;
9364 
9365 	// create the entry	-- the FIFO sub node is set up automatically
9366 	fs_vnode superVnode;
9367 	ino_t nodeID;
9368 	status = FS_CALL(dir, create_special_node, filename, NULL,
9369 		S_IFIFO | (perms & S_IUMSK), 0, &superVnode, &nodeID);
9370 
9371 	// create_special_node() acquired a reference for us that we don't need.
9372 	if (status == B_OK)
9373 		put_vnode(dir->mount->volume, nodeID);
9374 
9375 	return status;
9376 }
9377 
9378 
9379 status_t
9380 _user_create_pipe(int* userFDs)
9381 {
9382 	// rootfs should support creating FIFOs, but let's be sure
9383 	if (!HAS_FS_CALL(sRoot, create_special_node))
9384 		return B_UNSUPPORTED;
9385 
9386 	// create the node	-- the FIFO sub node is set up automatically
9387 	fs_vnode superVnode;
9388 	ino_t nodeID;
9389 	status_t status = FS_CALL(sRoot, create_special_node, NULL, NULL,
9390 		S_IFIFO | S_IRUSR | S_IWUSR, 0, &superVnode, &nodeID);
9391 	if (status != B_OK)
9392 		return status;
9393 
9394 	// We've got one reference to the node and need another one.
9395 	struct vnode* vnode;
9396 	status = get_vnode(sRoot->mount->id, nodeID, &vnode, true, false);
9397 	if (status != B_OK) {
9398 		// that should not happen
9399 		dprintf("_user_create_pipe(): Failed to lookup vnode (%" B_PRIdDEV ", "
9400 			"%" B_PRIdINO ")\n", sRoot->mount->id, sRoot->id);
9401 		return status;
9402 	}
9403 
9404 	// Everything looks good so far. Open two FDs, one for reading and one
9405 	// for writing.
9406 	int fds[2];
9407 	fds[0] = open_vnode(vnode, O_RDONLY, false);
9408 	fds[1] = open_vnode(vnode, O_WRONLY, false);
9409 
9410 	FDCloser closer0(fds[0], false);
9411 	FDCloser closer1(fds[1], false);
9412 
9413 	status = (fds[0] >= 0 ? (fds[1] >= 0 ? B_OK : fds[1]) : fds[0]);
9414 
9415 	// copy FDs to userland
9416 	if (status == B_OK) {
9417 		if (!IS_USER_ADDRESS(userFDs)
9418 			|| user_memcpy(userFDs, fds, sizeof(fds)) != B_OK) {
9419 			status = B_BAD_ADDRESS;
9420 		}
9421 	}
9422 
9423 	// keep FDs, if everything went fine
9424 	if (status == B_OK) {
9425 		closer0.Detach();
9426 		closer1.Detach();
9427 	}
9428 
9429 	return status;
9430 }
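
// The FDCloser objects above follow the RAII pattern used throughout this
// file: each newly opened FD is owned by a closer that closes it again on
// every early return, and only once both FDs have safely been copied to
// userland are the closers disarmed via Detach().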
9431 
9432 
9433 status_t
9434 _user_access(int fd, const char* userPath, int mode, bool effectiveUserGroup)
9435 {
9436 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9437 	if (pathBuffer.InitCheck() != B_OK)
9438 		return B_NO_MEMORY;
9439 
9440 	char* path = pathBuffer.LockBuffer();
9441 
9442 	if (!IS_USER_ADDRESS(userPath)
9443 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9444 		return B_BAD_ADDRESS;
9445 
9446 	return common_access(fd, path, mode, effectiveUserGroup, false);
9447 }
9448 
9449 
9450 status_t
9451 _user_read_stat(int fd, const char* userPath, bool traverseLink,
9452 	struct stat* userStat, size_t statSize)
9453 {
9454 	struct stat stat;
9455 	status_t status;
9456 
9457 	if (statSize > sizeof(struct stat))
9458 		return B_BAD_VALUE;
9459 
9460 	if (!IS_USER_ADDRESS(userStat))
9461 		return B_BAD_ADDRESS;
9462 
9463 	if (userPath != NULL) {
9464 		// path given: get the stat of the node referred to by (fd, path)
9465 		if (!IS_USER_ADDRESS(userPath))
9466 			return B_BAD_ADDRESS;
9467 
9468 		KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9469 		if (pathBuffer.InitCheck() != B_OK)
9470 			return B_NO_MEMORY;
9471 
9472 		char* path = pathBuffer.LockBuffer();
9473 
9474 		ssize_t length = user_strlcpy(path, userPath, B_PATH_NAME_LENGTH);
9475 		if (length < B_OK)
9476 			return length;
9477 		if (length >= B_PATH_NAME_LENGTH)
9478 			return B_NAME_TOO_LONG;
9479 
9480 		status = common_path_read_stat(fd, path, traverseLink, &stat, false);
9481 	} else {
9482 		// no path given: get the FD and use the FD operation
9483 		struct file_descriptor* descriptor
9484 			= get_fd(get_current_io_context(false), fd);
9485 		if (descriptor == NULL)
9486 			return B_FILE_ERROR;
9487 
9488 		if (descriptor->ops->fd_read_stat)
9489 			status = descriptor->ops->fd_read_stat(descriptor, &stat);
9490 		else
9491 			status = B_UNSUPPORTED;
9492 
9493 		put_fd(descriptor);
9494 	}
9495 
9496 	if (status != B_OK)
9497 		return status;
9498 
9499 	return user_memcpy(userStat, &stat, statSize);
9500 }
9501 
9502 
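/*!	Writes only the stat fields selected by \a statMask, which lets chmod(),
	chown(), truncate() and friends share a single syscall. A rough userland
	sketch (illustrative only; the real libroot wrappers may differ):
	\code
	struct stat st;
	st.st_mode = 0644;
	status_t error = _kern_write_stat(-1, path, true, &st, sizeof(st),
		B_STAT_MODE);	// only st_mode is taken from st
	\endcode
*/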
9503 status_t
9504 _user_write_stat(int fd, const char* userPath, bool traverseLeafLink,
9505 	const struct stat* userStat, size_t statSize, int statMask)
9506 {
9507 	if (statSize > sizeof(struct stat))
9508 		return B_BAD_VALUE;
9509 
9510 	struct stat stat;
9511 
9512 	if (!IS_USER_ADDRESS(userStat)
9513 		|| user_memcpy(&stat, userStat, statSize) < B_OK)
9514 		return B_BAD_ADDRESS;
9515 
9516 	// clear additional stat fields
9517 	if (statSize < sizeof(struct stat))
9518 		memset((uint8*)&stat + statSize, 0, sizeof(struct stat) - statSize);
9519 
9520 	status_t status;
9521 
9522 	if (userPath != NULL) {
9523 		// path given: write the stat of the node referred to by (fd, path)
9524 		if (!IS_USER_ADDRESS(userPath))
9525 			return B_BAD_ADDRESS;
9526 
9527 		KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9528 		if (pathBuffer.InitCheck() != B_OK)
9529 			return B_NO_MEMORY;
9530 
9531 		char* path = pathBuffer.LockBuffer();
9532 
9533 		ssize_t length = user_strlcpy(path, userPath, B_PATH_NAME_LENGTH);
9534 		if (length < B_OK)
9535 			return length;
9536 		if (length >= B_PATH_NAME_LENGTH)
9537 			return B_NAME_TOO_LONG;
9538 
9539 		status = common_path_write_stat(fd, path, traverseLeafLink, &stat,
9540 			statMask, false);
9541 	} else {
9542 		// no path given: get the FD and use the FD operation
9543 		struct file_descriptor* descriptor
9544 			= get_fd(get_current_io_context(false), fd);
9545 		if (descriptor == NULL)
9546 			return B_FILE_ERROR;
9547 
9548 		if (descriptor->ops->fd_write_stat) {
9549 			status = descriptor->ops->fd_write_stat(descriptor, &stat,
9550 				statMask);
9551 		} else
9552 			status = B_UNSUPPORTED;
9553 
9554 		put_fd(descriptor);
9555 	}
9556 
9557 	return status;
9558 }
9559 
9560 
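/*!	Opens the attribute directory of a node, which lists its extended
	attributes much like a directory lists entries. Typical userland use via
	the public API (the path is made up for illustration):
	\code
	DIR* dir = fs_open_attr_dir("/boot/home/mail/inbox/letter");
	if (dir != NULL) {
		while (struct dirent* entry = fs_read_attr_dir(dir))
			printf("attribute: %s\n", entry->d_name);
		fs_close_attr_dir(dir);
	}
	\endcode
*/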
9561 int
9562 _user_open_attr_dir(int fd, const char* userPath, bool traverseLeafLink)
9563 {
9564 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9565 	if (pathBuffer.InitCheck() != B_OK)
9566 		return B_NO_MEMORY;
9567 
9568 	char* path = pathBuffer.LockBuffer();
9569 
9570 	if (userPath != NULL) {
9571 		if (!IS_USER_ADDRESS(userPath)
9572 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9573 			return B_BAD_ADDRESS;
9574 	}
9575 
9576 	return attr_dir_open(fd, userPath ? path : NULL, traverseLeafLink, false);
9577 }
9578 
9579 
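/*!	Reads an attribute by opening it like a file, reading, and closing it
	again. From userland one would usually go through the public wrapper
	(the attribute name is an example only):
	\code
	char buffer[256];
	ssize_t bytesRead = fs_read_attr(fd, "MAIL:subject", B_STRING_TYPE, 0,
		buffer, sizeof(buffer));
	\endcode
*/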
9580 ssize_t
9581 _user_read_attr(int fd, const char* userAttribute, off_t pos, void* userBuffer,
9582 	size_t readBytes)
9583 {
9584 	char attribute[B_FILE_NAME_LENGTH];
9585 
9586 	if (userAttribute == NULL)
9587 		return B_BAD_VALUE;
9588 	if (!IS_USER_ADDRESS(userAttribute)
9589 		|| user_strlcpy(attribute, userAttribute, sizeof(attribute)) < B_OK) {
9590 		return B_BAD_ADDRESS;
9591 	}
9592 
9593 	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9594 	if (attr < 0)
9595 		return attr;
9596 
9597 	ssize_t bytes = _user_read(attr, pos, userBuffer, readBytes);
9598 	_user_close(attr);
9599 
9600 	return bytes;
9601 }
9602 
9603 
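/*!	Writes an attribute, following the BeOS convention that a write at
	position 0 replaces (truncates) the previous contents, while a non-zero
	\a pos writes in place. Userland sketch via the public wrapper (the
	attribute name is an example only):
	\code
	const char* subject = "Hello";
	ssize_t bytesWritten = fs_write_attr(fd, "MAIL:subject", B_STRING_TYPE,
		0, subject, strlen(subject) + 1);
	\endcode
*/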
9604 ssize_t
9605 _user_write_attr(int fd, const char* userAttribute, uint32 type, off_t pos,
9606 	const void* buffer, size_t writeBytes)
9607 {
9608 	char attribute[B_FILE_NAME_LENGTH];
9609 
9610 	if (userAttribute == NULL)
9611 		return B_BAD_VALUE;
9612 	if (!IS_USER_ADDRESS(userAttribute)
9613 		|| user_strlcpy(attribute, userAttribute, sizeof(attribute)) < B_OK) {
9614 		return B_BAD_ADDRESS;
9615 	}
9616 
9617 	// Try to support the BeOS-typical truncation semantics as well as the
9618 	// position argument: only truncate when writing from the very start.
9619 	int attr = attr_create(fd, NULL, attribute, type,
9620 		O_CREAT | O_WRONLY | (pos != 0 ? 0 : O_TRUNC), false);
9621 	if (attr < 0)
9622 		return attr;
9623 
9624 	ssize_t bytes = _user_write(attr, pos, buffer, writeBytes);
9625 	_user_close(attr);
9626 
9627 	return bytes;
9628 }
9629 
9630 
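/*!	Retrieves type and size of an attribute by opening it and reading its
	stat. The corresponding public wrapper fills in an attr_info (the
	attribute name is an example only):
	\code
	attr_info info;
	if (fs_stat_attr(fd, "MAIL:subject", &info) == 0) {
		printf("type %#" B_PRIx32 ", size %" B_PRIdOFF "\n", info.type,
			info.size);
	}
	\endcode
*/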
9631 status_t
9632 _user_stat_attr(int fd, const char* userAttribute,
9633 	struct attr_info* userAttrInfo)
9634 {
9635 	char attribute[B_FILE_NAME_LENGTH];
9636 
9637 	if (userAttribute == NULL || userAttrInfo == NULL)
9638 		return B_BAD_VALUE;
9639 	if (!IS_USER_ADDRESS(userAttribute) || !IS_USER_ADDRESS(userAttrInfo)
9640 		|| user_strlcpy(attribute, userAttribute, sizeof(attribute)) < B_OK) {
9641 		return B_BAD_ADDRESS;
9642 	}
9643 
9644 	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9645 	if (attr < 0)
9646 		return attr;
9647 
9648 	struct file_descriptor* descriptor
9649 		= get_fd(get_current_io_context(false), attr);
9650 	if (descriptor == NULL) {
9651 		_user_close(attr);
9652 		return B_FILE_ERROR;
9653 	}
9654 
9655 	struct stat stat;
9656 	status_t status;
9657 	if (descriptor->ops->fd_read_stat)
9658 		status = descriptor->ops->fd_read_stat(descriptor, &stat);
9659 	else
9660 		status = B_UNSUPPORTED;
9661 
9662 	put_fd(descriptor);
9663 	_user_close(attr);
9664 
9665 	if (status == B_OK) {
9666 		attr_info info;
9667 		info.type = stat.st_type;
9668 		info.size = stat.st_size;
9669 
9670 		if (user_memcpy(userAttrInfo, &info, sizeof(struct attr_info)) != B_OK)
9671 			return B_BAD_ADDRESS;
9672 	}
9673 
9674 	return status;
9675 }
9676 
9677 
9678 int
9679 _user_open_attr(int fd, const char* userPath, const char* userName,
9680 	uint32 type, int openMode)
9681 {
9682 	char name[B_FILE_NAME_LENGTH];
9683 
9684 	if (!IS_USER_ADDRESS(userName)
9685 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9686 		return B_BAD_ADDRESS;
9687 
9688 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9689 	if (pathBuffer.InitCheck() != B_OK)
9690 		return B_NO_MEMORY;
9691 
9692 	char* path = pathBuffer.LockBuffer();
9693 
9694 	if (userPath != NULL) {
9695 		if (!IS_USER_ADDRESS(userPath)
9696 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9697 			return B_BAD_ADDRESS;
9698 	}
9699 
9700 	if ((openMode & O_CREAT) != 0) {
9701 		return attr_create(fd, userPath ? path : NULL, name, type, openMode,
9702 			false);
9703 	}
9704 
9705 	return attr_open(fd, userPath ? path : NULL, name, openMode, false);
9706 }
9707 
9708 
9709 status_t
9710 _user_remove_attr(int fd, const char* userName)
9711 {
9712 	char name[B_FILE_NAME_LENGTH];
9713 
9714 	if (!IS_USER_ADDRESS(userName)
9715 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9716 		return B_BAD_ADDRESS;
9717 
9718 	return attr_remove(fd, name, false);
9719 }
9720 
9721 
9722 status_t
9723 _user_rename_attr(int fromFile, const char* userFromName, int toFile,
9724 	const char* userToName)
9725 {
9726 	if (!IS_USER_ADDRESS(userFromName)
9727 		|| !IS_USER_ADDRESS(userToName))
9728 		return B_BAD_ADDRESS;
9729 
9730 	KPath fromNameBuffer(B_FILE_NAME_LENGTH);
9731 	KPath toNameBuffer(B_FILE_NAME_LENGTH);
9732 	if (fromNameBuffer.InitCheck() != B_OK || toNameBuffer.InitCheck() != B_OK)
9733 		return B_NO_MEMORY;
9734 
9735 	char* fromName = fromNameBuffer.LockBuffer();
9736 	char* toName = toNameBuffer.LockBuffer();
9737 
9738 	if (user_strlcpy(fromName, userFromName, B_FILE_NAME_LENGTH) < B_OK
9739 		|| user_strlcpy(toName, userToName, B_FILE_NAME_LENGTH) < B_OK)
9740 		return B_BAD_ADDRESS;
9741 
9742 	return attr_rename(fromFile, fromName, toFile, toName, false);
9743 }
9744 
9745 
9746 int
9747 _user_open_index_dir(dev_t device)
9748 {
9749 	return index_dir_open(device, false);
9750 }
9751 
9752 
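/*!	Creates an index on a volume, making queries on the given attribute
	possible. Userland sketch via the public API (index name and path chosen
	for illustration):
	\code
	dev_t device = dev_for_path("/boot/home");
	fs_create_index(device, "MAIL:subject", B_STRING_TYPE, 0);
	\endcode
*/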
9753 status_t
9754 _user_create_index(dev_t device, const char* userName, uint32 type,
9755 	uint32 flags)
9756 {
9757 	char name[B_FILE_NAME_LENGTH];
9758 
9759 	if (!IS_USER_ADDRESS(userName)
9760 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9761 		return B_BAD_ADDRESS;
9762 
9763 	return index_create(device, name, type, flags, false);
9764 }
9765 
9766 
9767 status_t
9768 _user_read_index_stat(dev_t device, const char* userName, struct stat* userStat)
9769 {
9770 	char name[B_FILE_NAME_LENGTH];
9771 	struct stat stat;
9772 	status_t status;
9773 
9774 	if (!IS_USER_ADDRESS(userName)
9775 		|| !IS_USER_ADDRESS(userStat)
9776 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9777 		return B_BAD_ADDRESS;
9778 
9779 	status = index_name_read_stat(device, name, &stat, false);
9780 	if (status == B_OK) {
9781 		if (user_memcpy(userStat, &stat, sizeof(stat)) != B_OK)
9782 			return B_BAD_ADDRESS;
9783 	}
9784 
9785 	return status;
9786 }
9787 
9788 
9789 status_t
9790 _user_remove_index(dev_t device, const char* userName)
9791 {
9792 	char name[B_FILE_NAME_LENGTH];
9793 
9794 	if (!IS_USER_ADDRESS(userName)
9795 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9796 		return B_BAD_ADDRESS;
9797 
9798 	return index_remove(device, name, false);
9799 }
9800 
9801 
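/*!	Returns the current working directory. \a size is clamped to
	kMaxPathLength, so even oversized requests stay bounded. Userland use is
	plain POSIX:
	\code
	char buffer[B_PATH_NAME_LENGTH];
	if (getcwd(buffer, sizeof(buffer)) != NULL)
		printf("cwd: %s\n", buffer);
	\endcode
*/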
9802 status_t
9803 _user_getcwd(char* userBuffer, size_t size)
9804 {
9805 	if (size == 0)
9806 		return B_BAD_VALUE;
9807 	if (!IS_USER_ADDRESS(userBuffer))
9808 		return B_BAD_ADDRESS;
9809 
9810 	if (size > kMaxPathLength)
9811 		size = kMaxPathLength;
9812 
9813 	KPath pathBuffer(size);
9814 	if (pathBuffer.InitCheck() != B_OK)
9815 		return B_NO_MEMORY;
9816 
9817 	TRACE(("user_getcwd: buf %p, %" B_PRIuSIZE "\n", userBuffer, size));
9818 
9819 	char* path = pathBuffer.LockBuffer();
9820 
9821 	status_t status = get_cwd(path, size, false);
9822 	if (status != B_OK)
9823 		return status;
9824 
9825 	// Copy back the result
9826 	if (user_strlcpy(userBuffer, path, size) < B_OK)
9827 		return B_BAD_ADDRESS;
9828 
9829 	return status;
9830 }
9831 
9832 
9833 status_t
9834 _user_setcwd(int fd, const char* userPath)
9835 {
9836 	TRACE(("user_setcwd: path = %p\n", userPath));
9837 
9838 	KPath pathBuffer(B_PATH_NAME_LENGTH);
9839 	if (pathBuffer.InitCheck() != B_OK)
9840 		return B_NO_MEMORY;
9841 
9842 	char* path = pathBuffer.LockBuffer();
9843 
9844 	if (userPath != NULL) {
9845 		if (!IS_USER_ADDRESS(userPath)
9846 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9847 			return B_BAD_ADDRESS;
9848 	}
9849 
9850 	return set_cwd(fd, userPath != NULL ? path : NULL, false);
9851 }
9852 
9853 
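/*!	Implements chroot(): swaps the I/O context's root vnode under
	sIOContextRootLock, so concurrent path resolution always sees a
	consistent root. As with POSIX chroot(), the working directory is not
	changed; callers typically follow up with chdir() (the jail path below
	is illustrative only):
	\code
	if (chroot("/boot/home/jail") == 0)
		chdir("/");
	\endcode
*/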
9854 status_t
9855 _user_change_root(const char* userPath)
9856 {
9857 	// only root is allowed to chroot()
9858 	if (geteuid() != 0)
9859 		return B_NOT_ALLOWED;
9860 
9861 	// allocate the path buffer
9862 	KPath pathBuffer(B_PATH_NAME_LENGTH);
9863 	if (pathBuffer.InitCheck() != B_OK)
9864 		return B_NO_MEMORY;
9865 
9866 	// copy userland path to kernel
9867 	char* path = pathBuffer.LockBuffer();
9868 	if (userPath != NULL) {
9869 		if (!IS_USER_ADDRESS(userPath)
9870 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9871 			return B_BAD_ADDRESS;
9872 	}
9873 
9874 	// get the vnode
9875 	struct vnode* vnode;
9876 	status_t status = path_to_vnode(path, true, &vnode, NULL, false);
9877 	if (status != B_OK)
9878 		return status;
9879 
9880 	// set the new root
9881 	struct io_context* context = get_current_io_context(false);
9882 	mutex_lock(&sIOContextRootLock);
9883 	struct vnode* oldRoot = context->root;
9884 	context->root = vnode;
9885 	mutex_unlock(&sIOContextRootLock);
9886 
9887 	put_vnode(oldRoot);
9888 
9889 	return B_OK;
9890 }
9891 
9892 
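/*!	Opens a (live) query on a volume. The query string uses the usual
	BeOS/Haiku query syntax; with a suitable \a port, \a token, and \a flags
	the query stays live and sends updates to that port. Userland sketch via
	the public wrappers (the query string is an example only):
	\code
	DIR* query = fs_open_query(dev_for_path("/boot/home"),
		"name == \"*.cpp\"", 0);
	if (query != NULL) {
		while (struct dirent* entry = fs_read_query(query))
			printf("match: %s\n", entry->d_name);
		fs_close_query(query);
	}
	\endcode
*/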
9893 int
9894 _user_open_query(dev_t device, const char* userQuery, size_t queryLength,
9895 	uint32 flags, port_id port, int32 token)
9896 {
9897 	char* query;
9898 
9899 	if (device < 0 || userQuery == NULL || queryLength == 0)
9900 		return B_BAD_VALUE;
9901 
9902 	if (!IS_USER_ADDRESS(userQuery))
9903 		return B_BAD_ADDRESS;
9904 
9905 	// this is a safety restriction: reject overlong query strings
9906 	if (queryLength >= 65536)
9907 		return B_NAME_TOO_LONG;
9908 
9909 	query = (char*)malloc(queryLength + 1);
9910 	if (query == NULL)
9911 		return B_NO_MEMORY;
9912 	if (user_strlcpy(query, userQuery, queryLength + 1) < B_OK) {
9913 		free(query);
9914 		return B_BAD_ADDRESS;
9915 	}
9916 
9917 	int fd = query_open(device, query, flags, port, token, false);
9918 
9919 	free(query);
9920 	return fd;
9921 }
9922 
9923 
9924 #include "vfs_request_io.cpp"
9925