xref: /haiku/src/system/kernel/fs/vfs.cpp (revision adcf5b05a8ca9e17407aa4640675c3873c9f0a6c)
1 /*
2  * Copyright 2005-2013, Ingo Weinhold, ingo_weinhold@gmx.de.
3  * Copyright 2002-2018, Axel Dörfler, axeld@pinc-software.de.
4  * Distributed under the terms of the MIT License.
5  *
6  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7  * Distributed under the terms of the NewOS License.
8  */
9 
10 
11 /*! Virtual File System and File System Interface Layer */
12 
13 
14 #include <ctype.h>
15 #include <fcntl.h>
16 #include <limits.h>
17 #include <stddef.h>
18 #include <stdio.h>
19 #include <string.h>
20 #include <sys/file.h>
21 #include <sys/resource.h>
22 #include <sys/stat.h>
23 #include <unistd.h>
24 
25 #include <fs_attr.h>
26 #include <fs_info.h>
27 #include <fs_interface.h>
28 #include <fs_volume.h>
29 #include <NodeMonitor.h>
30 #include <OS.h>
31 #include <StorageDefs.h>
32 
33 #include <AutoDeleter.h>
34 #include <block_cache.h>
35 #include <boot/kernel_args.h>
36 #include <debug_heap.h>
37 #include <disk_device_manager/KDiskDevice.h>
38 #include <disk_device_manager/KDiskDeviceManager.h>
39 #include <disk_device_manager/KDiskDeviceUtils.h>
40 #include <disk_device_manager/KDiskSystem.h>
41 #include <fd.h>
42 #include <file_cache.h>
43 #include <fs/node_monitor.h>
44 #include <KPath.h>
45 #include <lock.h>
46 #include <low_resource_manager.h>
47 #include <slab/Slab.h>
48 #include <StackOrHeapArray.h>
49 #include <syscalls.h>
50 #include <syscall_restart.h>
51 #include <tracing.h>
52 #include <util/atomic.h>
53 #include <util/AutoLock.h>
54 #include <util/DoublyLinkedList.h>
55 #include <vfs.h>
56 #include <vm/vm.h>
57 #include <vm/VMCache.h>
58 #include <wait_for_objects.h>
59 
60 #include "EntryCache.h"
61 #include "fifo.h"
62 #include "IORequest.h"
63 #include "unused_vnodes.h"
64 #include "vfs_tracing.h"
65 #include "Vnode.h"
66 #include "../cache/vnode_store.h"
67 
68 
69 //#define TRACE_VFS
70 #ifdef TRACE_VFS
71 #	define TRACE(x) dprintf x
72 #	define FUNCTION(x) dprintf x
73 #else
74 #	define TRACE(x) ;
75 #	define FUNCTION(x) ;
76 #endif
77 
78 #define ADD_DEBUGGER_COMMANDS
79 
80 
81 #define HAS_FS_CALL(vnode, op)			(vnode->ops->op != NULL)
82 #define HAS_FS_MOUNT_CALL(mount, op)	(mount->volume->ops->op != NULL)
83 
84 #if KDEBUG
85 #	define FS_CALL(vnode, op, params...) \
86 		( HAS_FS_CALL(vnode, op) ? \
87 			vnode->ops->op(vnode->mount->volume, vnode, params) \
88 			: (panic("FS_CALL op " #op " is NULL"), 0))
89 #	define FS_CALL_NO_PARAMS(vnode, op) \
90 		( HAS_FS_CALL(vnode, op) ? \
91 			vnode->ops->op(vnode->mount->volume, vnode) \
92 			: (panic("FS_CALL_NO_PARAMS op " #op " is NULL"), 0))
93 #	define FS_MOUNT_CALL(mount, op, params...) \
94 		( HAS_FS_MOUNT_CALL(mount, op) ? \
95 			mount->volume->ops->op(mount->volume, params) \
96 			: (panic("FS_MOUNT_CALL op " #op " is NULL"), 0))
97 #	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
98 		( HAS_FS_MOUNT_CALL(mount, op) ? \
99 			mount->volume->ops->op(mount->volume) \
100 			: (panic("FS_MOUNT_CALL_NO_PARAMS op " #op " is NULL"), 0))
101 #else
102 #	define FS_CALL(vnode, op, params...) \
103 			vnode->ops->op(vnode->mount->volume, vnode, params)
104 #	define FS_CALL_NO_PARAMS(vnode, op) \
105 			vnode->ops->op(vnode->mount->volume, vnode)
106 #	define FS_MOUNT_CALL(mount, op, params...) \
107 			mount->volume->ops->op(mount->volume, params)
108 #	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
109 			mount->volume->ops->op(mount->volume)
110 #endif
111 
112 
113 const static size_t kMaxPathLength = 65536;
	// The absolute maximum path length (for getcwd()); this does not depend
	// on PATH_MAX.
116 
117 
118 typedef DoublyLinkedList<vnode> VnodeList;
119 
/*!	\brief Structure to manage a mounted file system

	Note: The root_vnode and root_vnode->covers fields (what others?) are
	initialized in fs_mount() and not changed afterwards. That is, as soon
	as the mount is mounted and it is ensured that it won't be unmounted
	(e.g. by holding a reference to a vnode of that mount), (read) access
	to those fields is always safe, even without additional locking. Moreover,
	while mounted, the mount holds a reference to the root_vnode->covers vnode,
	thus making the access path vnode->mount->root_vnode->covers->mount->...
	safe if a reference to vnode is held (note that for the root mount
	root_vnode->covers is NULL, though). A sketch follows the structure.
*/
132 struct fs_mount {
133 	fs_mount()
134 		:
135 		volume(NULL),
136 		device_name(NULL)
137 	{
138 		mutex_init(&lock, "mount lock");
139 	}
140 
141 	~fs_mount()
142 	{
143 		mutex_destroy(&lock);
144 		free(device_name);
145 
146 		while (volume) {
147 			fs_volume* superVolume = volume->super_volume;
148 
149 			if (volume->file_system != NULL)
150 				put_module(volume->file_system->info.name);
151 
152 			free(volume->file_system_name);
153 			free(volume);
154 			volume = superVolume;
155 		}
156 	}
157 
158 	struct fs_mount* next;
159 	dev_t			id;
160 	fs_volume*		volume;
161 	char*			device_name;
162 	mutex			lock;	// guards the vnodes list
163 	struct vnode*	root_vnode;
164 	struct vnode*	covers_vnode;	// immutable
165 	KPartition*		partition;
166 	VnodeList		vnodes;
167 	EntryCache		entry_cache;
168 	bool			unmounting;
169 	bool			owns_file_device;
170 };
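
// A minimal sketch of why the access path described above is safe: holding a
// reference to any vnode of a mount pins the mount, whose root_vnode and
// root_vnode->covers fields are immutable while mounted. Illustrative only
// (the helper name is hypothetical); compiled out via #if 0.
#if 0
static dev_t
example_covered_device(struct vnode* vnode)
{
	// safe without additional locking, as long as the caller holds a
	// reference to `vnode`
	struct vnode* covers = vnode->mount->root_vnode->covers;
	if (covers == NULL)
		return -1;	// the root mount covers no vnode
	return covers->mount->id;
}
#endif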
171 
172 
173 namespace {
174 
175 struct advisory_lock : public DoublyLinkedListLinkImpl<advisory_lock> {
176 	list_link		link;
177 	void*			bound_to;
178 	team_id			team;
179 	pid_t			session;
180 	off_t			start;
181 	off_t			end;
182 	bool			shared;
183 };
184 
185 typedef DoublyLinkedList<advisory_lock> LockList;
186 
187 } // namespace
188 
189 
190 struct advisory_locking {
191 	sem_id			lock;
192 	sem_id			wait_sem;
193 	LockList		locks;
194 
195 	advisory_locking()
196 		:
197 		lock(-1),
198 		wait_sem(-1)
199 	{
200 	}
201 
202 	~advisory_locking()
203 	{
204 		if (lock >= 0)
205 			delete_sem(lock);
206 		if (wait_sem >= 0)
207 			delete_sem(wait_sem);
208 	}
209 };
210 
211 /*!	\brief Guards sMountsTable.
212 
	The holder is allowed read/write access to sMountsTable.
	Manipulation of the fs_mount structures themselves
	(and their destruction) requires different locks, though.
216 */
217 static mutex sMountMutex = MUTEX_INITIALIZER("vfs_mount_lock");
218 
219 /*!	\brief Guards mount/unmount operations.
220 
	fs_mount() and fs_unmount() hold the lock during their entire operation.
	That is, locking the lock ensures that no FS is mounted/unmounted. In
	particular this means that
	- sMountsTable will not be modified,
	- the fields of the fs_mount structures in sMountsTable that are immutable
	  after initialization will not be modified.

228 	The thread trying to lock the lock must not hold sVnodeLock or
229 	sMountMutex.
230 */
231 static recursive_lock sMountOpLock;
232 
233 /*!	\brief Guards sVnodeTable.
234 
	The holder is allowed read/write access to sVnodeTable and to
	any unbusy vnode in that table, save for the immutable fields (device, id,
	private_node, mount), to which only read-only access is allowed.
	The mutable fields advisory_locking, mandatory_locked_by, and ref_count, as
	well as the busy, removed, and unused flags, and the vnode's type can also
	be written while holding a read lock on sVnodeLock *and* having the vnode
	locked. Write access to covered_by and covers requires write locking
	sVnodeLock (a sketch follows the definition below).
243 
244 	The thread trying to acquire the lock must not hold sMountMutex.
245 	You must not hold this lock when calling create_sem(), as this might call
246 	vfs_free_unused_vnodes() and thus cause a deadlock.
247 */
248 static rw_lock sVnodeLock = RW_LOCK_INITIALIZER("vfs_vnode_lock");
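
// A minimal sketch of the locking protocol described above: flag writes need
// only a read lock on sVnodeLock plus the vnode lock, while re-pointing
// covered_by/covers needs the write lock. Illustrative only (the helper
// names are hypothetical); compiled out via #if 0.
#if 0
static void
example_mark_removed(struct vnode* vnode)
{
	ReadLocker locker(sVnodeLock);
	AutoLocker<Vnode> nodeLocker(vnode);
	vnode->SetRemoved(true);
		// a flag write: read lock plus vnode lock suffice
}

static void
example_cover(struct vnode* covered, struct vnode* covering)
{
	WriteLocker locker(sVnodeLock);
	covered->covered_by = covering;
	covering->covers = covered;
		// link writes: require sVnodeLock to be write locked
}
#endif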
249 
250 /*!	\brief Guards io_context::root.
251 
252 	Must be held when setting or getting the io_context::root field.
253 	The only operation allowed while holding this lock besides getting or
254 	setting the field is inc_vnode_ref_count() on io_context::root.
255 */
256 static mutex sIOContextRootLock = MUTEX_INITIALIZER("io_context::root lock");
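
// A minimal sketch of the only access pattern this lock permits: fetching
// the root and taking a reference. Illustrative only (the helper name is
// hypothetical); compiled out via #if 0.
#if 0
static struct vnode*
example_get_root(io_context* context)
{
	MutexLocker locker(sIOContextRootLock);
	struct vnode* root = context->root;
	inc_vnode_ref_count(root);
		// the only other operation permitted while holding the lock
	return root;
}
#endif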
257 
258 
259 namespace {
260 
261 struct vnode_hash_key {
262 	dev_t	device;
263 	ino_t	vnode;
264 };
265 
266 struct VnodeHash {
267 	typedef vnode_hash_key	KeyType;
268 	typedef	struct vnode	ValueType;
269 
270 #define VHASH(mountid, vnodeid) \
271 	(((uint32)((vnodeid) >> 32) + (uint32)(vnodeid)) ^ (uint32)(mountid))
272 
273 	size_t HashKey(KeyType key) const
274 	{
275 		return VHASH(key.device, key.vnode);
276 	}
277 
278 	size_t Hash(ValueType* vnode) const
279 	{
280 		return VHASH(vnode->device, vnode->id);
281 	}
282 
283 #undef VHASH
284 
285 	bool Compare(KeyType key, ValueType* vnode) const
286 	{
287 		return vnode->device == key.device && vnode->id == key.vnode;
288 	}
289 
290 	ValueType*& GetLink(ValueType* value) const
291 	{
292 		return value->next;
293 	}
294 };
295 
296 typedef BOpenHashTable<VnodeHash> VnodeTable;
297 
298 
299 struct MountHash {
300 	typedef dev_t			KeyType;
301 	typedef	struct fs_mount	ValueType;
302 
303 	size_t HashKey(KeyType key) const
304 	{
305 		return key;
306 	}
307 
308 	size_t Hash(ValueType* mount) const
309 	{
310 		return mount->id;
311 	}
312 
313 	bool Compare(KeyType key, ValueType* mount) const
314 	{
315 		return mount->id == key;
316 	}
317 
318 	ValueType*& GetLink(ValueType* value) const
319 	{
320 		return value->next;
321 	}
322 };
323 
324 typedef BOpenHashTable<MountHash> MountTable;
325 
326 } // namespace
327 
328 
329 object_cache* sPathNameCache;
330 object_cache* sFileDescriptorCache;
331 
332 #define VNODE_HASH_TABLE_SIZE 1024
333 static VnodeTable* sVnodeTable;
334 static struct vnode* sRoot;
335 
336 #define MOUNTS_HASH_TABLE_SIZE 16
337 static MountTable* sMountsTable;
338 static dev_t sNextMountID = 1;
339 
340 #define MAX_TEMP_IO_VECS 8
341 
// How long to wait for busy vnodes (2000 retries * 5000 µs delay = 10 s)
343 #define BUSY_VNODE_RETRIES 2000
344 #define BUSY_VNODE_DELAY 5000
345 
346 mode_t __gUmask = 022;
347 
348 /* function declarations */
349 
350 static void free_unused_vnodes();
351 
352 // file descriptor operation prototypes
353 static status_t file_read(struct file_descriptor* descriptor, off_t pos,
354 	void* buffer, size_t* _bytes);
355 static status_t file_write(struct file_descriptor* descriptor, off_t pos,
356 	const void* buffer, size_t* _bytes);
357 static off_t file_seek(struct file_descriptor* descriptor, off_t pos,
358 	int seekType);
359 static void file_free_fd(struct file_descriptor* descriptor);
360 static status_t file_close(struct file_descriptor* descriptor);
361 static status_t file_select(struct file_descriptor* descriptor, uint8 event,
362 	struct selectsync* sync);
363 static status_t file_deselect(struct file_descriptor* descriptor, uint8 event,
364 	struct selectsync* sync);
365 static status_t dir_read(struct io_context* context,
366 	struct file_descriptor* descriptor, struct dirent* buffer,
367 	size_t bufferSize, uint32* _count);
368 static status_t dir_read(struct io_context* ioContext, struct vnode* vnode,
369 	void* cookie, struct dirent* buffer, size_t bufferSize, uint32* _count);
370 static status_t dir_rewind(struct file_descriptor* descriptor);
371 static void dir_free_fd(struct file_descriptor* descriptor);
372 static status_t dir_close(struct file_descriptor* descriptor);
373 static status_t attr_dir_read(struct io_context* context,
374 	struct file_descriptor* descriptor, struct dirent* buffer,
375 	size_t bufferSize, uint32* _count);
376 static status_t attr_dir_rewind(struct file_descriptor* descriptor);
377 static void attr_dir_free_fd(struct file_descriptor* descriptor);
378 static status_t attr_dir_close(struct file_descriptor* descriptor);
379 static status_t attr_read(struct file_descriptor* descriptor, off_t pos,
380 	void* buffer, size_t* _bytes);
381 static status_t attr_write(struct file_descriptor* descriptor, off_t pos,
382 	const void* buffer, size_t* _bytes);
383 static off_t attr_seek(struct file_descriptor* descriptor, off_t pos,
384 	int seekType);
385 static void attr_free_fd(struct file_descriptor* descriptor);
386 static status_t attr_close(struct file_descriptor* descriptor);
387 static status_t attr_read_stat(struct file_descriptor* descriptor,
388 	struct stat* statData);
389 static status_t attr_write_stat(struct file_descriptor* descriptor,
390 	const struct stat* stat, int statMask);
391 static status_t index_dir_read(struct io_context* context,
392 	struct file_descriptor* descriptor, struct dirent* buffer,
393 	size_t bufferSize, uint32* _count);
394 static status_t index_dir_rewind(struct file_descriptor* descriptor);
395 static void index_dir_free_fd(struct file_descriptor* descriptor);
396 static status_t index_dir_close(struct file_descriptor* descriptor);
397 static status_t query_read(struct io_context* context,
398 	struct file_descriptor* descriptor, struct dirent* buffer,
399 	size_t bufferSize, uint32* _count);
400 static status_t query_rewind(struct file_descriptor* descriptor);
401 static void query_free_fd(struct file_descriptor* descriptor);
402 static status_t query_close(struct file_descriptor* descriptor);
403 
404 static status_t common_ioctl(struct file_descriptor* descriptor, ulong op,
405 	void* buffer, size_t length);
406 static status_t common_read_stat(struct file_descriptor* descriptor,
407 	struct stat* statData);
408 static status_t common_write_stat(struct file_descriptor* descriptor,
409 	const struct stat* statData, int statMask);
410 static status_t common_path_read_stat(int fd, char* path, bool traverseLeafLink,
411 	struct stat* stat, bool kernel);
412 
413 static status_t vnode_path_to_vnode(struct vnode* vnode, char* path,
414 	bool traverseLeafLink, int count, bool kernel,
415 	struct vnode** _vnode, ino_t* _parentID);
416 static status_t dir_vnode_to_path(struct vnode* vnode, char* buffer,
417 	size_t bufferSize, bool kernel);
418 static status_t fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
419 	struct vnode** _vnode, ino_t* _parentID, bool kernel);
420 static void inc_vnode_ref_count(struct vnode* vnode);
421 static status_t dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree,
422 	bool reenter);
423 static inline void put_vnode(struct vnode* vnode);
424 static status_t fs_unmount(char* path, dev_t mountID, uint32 flags,
425 	bool kernel);
426 static int open_vnode(struct vnode* vnode, int openMode, bool kernel);
427 
428 
429 static struct fd_ops sFileOps = {
430 	file_read,
431 	file_write,
432 	file_seek,
433 	common_ioctl,
434 	NULL,		// set_flags
435 	file_select,
436 	file_deselect,
437 	NULL,		// read_dir()
438 	NULL,		// rewind_dir()
439 	common_read_stat,
440 	common_write_stat,
441 	file_close,
442 	file_free_fd
443 };
444 
445 static struct fd_ops sDirectoryOps = {
446 	NULL,		// read()
447 	NULL,		// write()
448 	NULL,		// seek()
449 	common_ioctl,
450 	NULL,		// set_flags
451 	NULL,		// select()
452 	NULL,		// deselect()
453 	dir_read,
454 	dir_rewind,
455 	common_read_stat,
456 	common_write_stat,
457 	dir_close,
458 	dir_free_fd
459 };
460 
461 static struct fd_ops sAttributeDirectoryOps = {
462 	NULL,		// read()
463 	NULL,		// write()
464 	NULL,		// seek()
465 	common_ioctl,
466 	NULL,		// set_flags
467 	NULL,		// select()
468 	NULL,		// deselect()
469 	attr_dir_read,
470 	attr_dir_rewind,
471 	common_read_stat,
472 	common_write_stat,
473 	attr_dir_close,
474 	attr_dir_free_fd
475 };
476 
477 static struct fd_ops sAttributeOps = {
478 	attr_read,
479 	attr_write,
480 	attr_seek,
481 	common_ioctl,
482 	NULL,		// set_flags
483 	NULL,		// select()
484 	NULL,		// deselect()
485 	NULL,		// read_dir()
486 	NULL,		// rewind_dir()
487 	attr_read_stat,
488 	attr_write_stat,
489 	attr_close,
490 	attr_free_fd
491 };
492 
493 static struct fd_ops sIndexDirectoryOps = {
494 	NULL,		// read()
495 	NULL,		// write()
496 	NULL,		// seek()
497 	NULL,		// ioctl()
498 	NULL,		// set_flags
499 	NULL,		// select()
500 	NULL,		// deselect()
501 	index_dir_read,
502 	index_dir_rewind,
503 	NULL,		// read_stat()
504 	NULL,		// write_stat()
505 	index_dir_close,
506 	index_dir_free_fd
507 };
508 
509 #if 0
510 static struct fd_ops sIndexOps = {
511 	NULL,		// read()
512 	NULL,		// write()
513 	NULL,		// seek()
514 	NULL,		// ioctl()
515 	NULL,		// set_flags
516 	NULL,		// select()
517 	NULL,		// deselect()
518 	NULL,		// dir_read()
519 	NULL,		// dir_rewind()
520 	index_read_stat,	// read_stat()
521 	NULL,		// write_stat()
522 	NULL,		// dir_close()
523 	NULL		// free_fd()
524 };
525 #endif
526 
527 static struct fd_ops sQueryOps = {
528 	NULL,		// read()
529 	NULL,		// write()
530 	NULL,		// seek()
531 	NULL,		// ioctl()
532 	NULL,		// set_flags
533 	NULL,		// select()
534 	NULL,		// deselect()
535 	query_read,
536 	query_rewind,
537 	NULL,		// read_stat()
538 	NULL,		// write_stat()
539 	query_close,
540 	query_free_fd
541 };
542 
543 
544 namespace {
545 
546 class VNodePutter {
547 public:
548 	VNodePutter(struct vnode* vnode = NULL) : fVNode(vnode) {}
549 
550 	~VNodePutter()
551 	{
552 		Put();
553 	}
554 
555 	void SetTo(struct vnode* vnode)
556 	{
557 		Put();
558 		fVNode = vnode;
559 	}
560 
561 	void Put()
562 	{
563 		if (fVNode) {
564 			put_vnode(fVNode);
565 			fVNode = NULL;
566 		}
567 	}
568 
569 	struct vnode* Detach()
570 	{
571 		struct vnode* vnode = fVNode;
572 		fVNode = NULL;
573 		return vnode;
574 	}
575 
576 private:
577 	struct vnode* fVNode;
578 };
579 
580 
581 class FDCloser {
582 public:
583 	FDCloser() : fFD(-1), fKernel(true) {}
584 
585 	FDCloser(int fd, bool kernel) : fFD(fd), fKernel(kernel) {}
586 
587 	~FDCloser()
588 	{
589 		Close();
590 	}
591 
592 	void SetTo(int fd, bool kernel)
593 	{
594 		Close();
595 		fFD = fd;
596 		fKernel = kernel;
597 	}
598 
599 	void Close()
600 	{
601 		if (fFD >= 0) {
602 			if (fKernel)
603 				_kern_close(fFD);
604 			else
605 				_user_close(fFD);
606 			fFD = -1;
607 		}
608 	}
609 
610 	int Detach()
611 	{
612 		int fd = fFD;
613 		fFD = -1;
614 		return fd;
615 	}
616 
617 private:
618 	int		fFD;
619 	bool	fKernel;
620 };
621 
622 } // namespace
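
// A minimal sketch of how the RAII helpers above are used: the vnode
// reference is released automatically on every return path. Illustrative
// only (the helper name is hypothetical); compiled out via #if 0.
#if 0
static status_t
example_stat_node(dev_t device, ino_t node, struct stat* stat)
{
	struct vnode* vnode;
	status_t status = get_vnode(device, node, &vnode, true, false);
	if (status != B_OK)
		return status;

	VNodePutter putter(vnode);
		// puts the vnode reference on every return below
	if (!HAS_FS_CALL(vnode, read_stat))
		return B_UNSUPPORTED;
	return FS_CALL(vnode, read_stat, stat);
}
#endif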
623 
624 
625 #if VFS_PAGES_IO_TRACING
626 
627 namespace VFSPagesIOTracing {
628 
629 class PagesIOTraceEntry : public AbstractTraceEntry {
630 protected:
631 	PagesIOTraceEntry(struct vnode* vnode, void* cookie, off_t pos,
632 		const generic_io_vec* vecs, uint32 count, uint32 flags,
633 		generic_size_t bytesRequested, status_t status,
634 		generic_size_t bytesTransferred)
635 		:
636 		fVnode(vnode),
637 		fMountID(vnode->mount->id),
638 		fNodeID(vnode->id),
639 		fCookie(cookie),
640 		fPos(pos),
641 		fCount(count),
642 		fFlags(flags),
643 		fBytesRequested(bytesRequested),
644 		fStatus(status),
645 		fBytesTransferred(bytesTransferred)
646 	{
647 		fVecs = (generic_io_vec*)alloc_tracing_buffer_memcpy(vecs,
648 			sizeof(generic_io_vec) * count, false);
649 	}
650 
651 	void AddDump(TraceOutput& out, const char* mode)
652 	{
653 		out.Print("vfs pages io %5s: vnode: %p (%" B_PRId32 ", %" B_PRId64 "), "
654 			"cookie: %p, pos: %" B_PRIdOFF ", size: %" B_PRIu64 ", vecs: {",
655 			mode, fVnode, fMountID, fNodeID, fCookie, fPos,
656 			(uint64)fBytesRequested);
657 
658 		if (fVecs != NULL) {
659 			for (uint32 i = 0; i < fCount; i++) {
660 				if (i > 0)
661 					out.Print(", ");
662 				out.Print("(%" B_PRIx64 ", %" B_PRIu64 ")", (uint64)fVecs[i].base,
663 					(uint64)fVecs[i].length);
664 			}
665 		}
666 
667 		out.Print("}, flags: %#" B_PRIx32 " -> status: %#" B_PRIx32 ", "
668 			"transferred: %" B_PRIu64, fFlags, fStatus,
669 			(uint64)fBytesTransferred);
670 	}
671 
672 protected:
673 	struct vnode*	fVnode;
674 	dev_t			fMountID;
675 	ino_t			fNodeID;
676 	void*			fCookie;
677 	off_t			fPos;
678 	generic_io_vec*	fVecs;
679 	uint32			fCount;
680 	uint32			fFlags;
681 	generic_size_t	fBytesRequested;
682 	status_t		fStatus;
683 	generic_size_t	fBytesTransferred;
684 };
685 
686 
687 class ReadPages : public PagesIOTraceEntry {
688 public:
689 	ReadPages(struct vnode* vnode, void* cookie, off_t pos,
690 		const generic_io_vec* vecs, uint32 count, uint32 flags,
691 		generic_size_t bytesRequested, status_t status,
692 		generic_size_t bytesTransferred)
693 		:
694 		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
695 			bytesRequested, status, bytesTransferred)
696 	{
697 		Initialized();
698 	}
699 
700 	virtual void AddDump(TraceOutput& out)
701 	{
702 		PagesIOTraceEntry::AddDump(out, "read");
703 	}
704 };
705 
706 
707 class WritePages : public PagesIOTraceEntry {
708 public:
709 	WritePages(struct vnode* vnode, void* cookie, off_t pos,
710 		const generic_io_vec* vecs, uint32 count, uint32 flags,
711 		generic_size_t bytesRequested, status_t status,
712 		generic_size_t bytesTransferred)
713 		:
714 		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
715 			bytesRequested, status, bytesTransferred)
716 	{
717 		Initialized();
718 	}
719 
720 	virtual void AddDump(TraceOutput& out)
721 	{
722 		PagesIOTraceEntry::AddDump(out, "write");
723 	}
724 };
725 
726 }	// namespace VFSPagesIOTracing
727 
728 #	define TPIO(x) new(std::nothrow) VFSPagesIOTracing::x;
729 #else
730 #	define TPIO(x) ;
731 #endif	// VFS_PAGES_IO_TRACING
732 
733 
734 /*! Finds the mounted device (the fs_mount structure) with the given ID.
735 	Note, you must hold the gMountMutex lock when you call this function.
736 */
737 static struct fs_mount*
738 find_mount(dev_t id)
739 {
740 	ASSERT_LOCKED_MUTEX(&sMountMutex);
741 
742 	return sMountsTable->Lookup(id);
743 }
744 
745 
746 static status_t
747 get_mount(dev_t id, struct fs_mount** _mount)
748 {
749 	struct fs_mount* mount;
750 
751 	ReadLocker nodeLocker(sVnodeLock);
752 	MutexLocker mountLocker(sMountMutex);
753 
754 	mount = find_mount(id);
755 	if (mount == NULL)
756 		return B_BAD_VALUE;
757 
758 	struct vnode* rootNode = mount->root_vnode;
759 	if (mount->unmounting || rootNode == NULL || rootNode->IsBusy()
760 		|| rootNode->ref_count == 0) {
761 		// might have been called during a mount/unmount operation
762 		return B_BUSY;
763 	}
764 
765 	inc_vnode_ref_count(rootNode);
766 	*_mount = mount;
767 	return B_OK;
768 }
769 
770 
771 static void
772 put_mount(struct fs_mount* mount)
773 {
774 	if (mount)
775 		put_vnode(mount->root_vnode);
776 }
777 
778 
779 /*!	Tries to open the specified file system module.
780 	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1".
	Returns a pointer to the file system module interface, or NULL if it
	could not open the module.
783 */
784 static file_system_module_info*
785 get_file_system(const char* fsName)
786 {
787 	char name[B_FILE_NAME_LENGTH];
788 	if (strncmp(fsName, "file_systems/", strlen("file_systems/"))) {
789 		// construct module name if we didn't get one
790 		// (we currently support only one API)
791 		snprintf(name, sizeof(name), "file_systems/%s/v1", fsName);
792 		fsName = NULL;
793 	}
794 
795 	file_system_module_info* info;
796 	if (get_module(fsName ? fsName : name, (module_info**)&info) != B_OK)
797 		return NULL;
798 
799 	return info;
800 }
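
// Both accepted name forms resolve to the same module; a minimal sketch
// (illustrative only; compiled out via #if 0). Each successfully loaded
// module must eventually be released again with put_module().
#if 0
static void
example_load_bfs()
{
	// short form: expanded to "file_systems/bfs/v1" internally
	file_system_module_info* byShortName = get_file_system("bfs");
	// full form: passed to get_module() unchanged
	file_system_module_info* byFullName
		= get_file_system("file_systems/bfs/v1");
	(void)byShortName;
	(void)byFullName;
}
#endif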
801 
802 
803 /*!	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1"
	and returns the compatible fs_info.fsh_name ("bfs" in both cases).
805 	The name is allocated for you, and you have to free() it when you're
806 	done with it.
807 	Returns NULL if the required memory is not available.
808 */
809 static char*
810 get_file_system_name(const char* fsName)
811 {
812 	const size_t length = strlen("file_systems/");
813 
814 	if (strncmp(fsName, "file_systems/", length)) {
815 		// the name already seems to be the module's file name
816 		return strdup(fsName);
817 	}
818 
819 	fsName += length;
820 	const char* end = strchr(fsName, '/');
821 	if (end == NULL) {
822 		// this doesn't seem to be a valid name, but well...
823 		return strdup(fsName);
824 	}
825 
826 	// cut off the trailing /v1
827 
828 	char* name = (char*)malloc(end + 1 - fsName);
829 	if (name == NULL)
830 		return NULL;
831 
832 	strlcpy(name, fsName, end + 1 - fsName);
833 	return name;
834 }
835 
836 
837 /*!	Accepts a list of file system names separated by a colon, one for each
838 	layer and returns the file system name for the specified layer.
839 	The name is allocated for you, and you have to free() it when you're
840 	done with it.
841 	Returns NULL if the required memory is not available or if there is no
842 	name for the specified layer.
843 */
844 static char*
845 get_file_system_name_for_layer(const char* fsNames, int32 layer)
846 {
847 	while (layer >= 0) {
848 		const char* end = strchr(fsNames, ':');
849 		if (end == NULL) {
850 			if (layer == 0)
851 				return strdup(fsNames);
852 			return NULL;
853 		}
854 
855 		if (layer == 0) {
856 			size_t length = end - fsNames + 1;
			char* result = (char*)malloc(length);
			if (result == NULL)
				return NULL;
			strlcpy(result, fsNames, length);
859 			return result;
860 		}
861 
862 		fsNames = end + 1;
863 		layer--;
864 	}
865 
866 	return NULL;
867 }
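
// An example of the layered name convention, assuming a hypothetical layered
// mount string "bfs:write_overlay":
//   get_file_system_name_for_layer("bfs:write_overlay", 0) -> "bfs"
//   get_file_system_name_for_layer("bfs:write_overlay", 1) -> "write_overlay"
//   get_file_system_name_for_layer("bfs:write_overlay", 2) -> NULL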
868 
869 
870 static void
871 add_vnode_to_mount_list(struct vnode* vnode, struct fs_mount* mount)
872 {
873 	MutexLocker _(mount->lock);
874 	mount->vnodes.Add(vnode);
875 }
876 
877 
878 static void
879 remove_vnode_from_mount_list(struct vnode* vnode, struct fs_mount* mount)
880 {
881 	MutexLocker _(mount->lock);
882 	mount->vnodes.Remove(vnode);
883 }
884 
885 
886 /*!	\brief Looks up a vnode by mount and node ID in the sVnodeTable.
887 
888 	The caller must hold the sVnodeLock (read lock at least).
889 
890 	\param mountID the mount ID.
891 	\param vnodeID the node ID.
892 
893 	\return The vnode structure, if it was found in the hash table, \c NULL
894 			otherwise.
895 */
896 static struct vnode*
897 lookup_vnode(dev_t mountID, ino_t vnodeID)
898 {
899 	struct vnode_hash_key key;
900 
901 	key.device = mountID;
902 	key.vnode = vnodeID;
903 
904 	return sVnodeTable->Lookup(key);
905 }
906 
907 
908 /*!	\brief Checks whether or not a busy vnode should be waited for (again).
909 
	If the caller should keep waiting for the vnode to become unbusy, this
	will also sleep for BUSY_VNODE_DELAY before returning (a sketch of the
	intended retry loop follows the function).
912 
913 	\return \c true if one should retry, \c false if not.
914 */
915 static bool
916 retry_busy_vnode(int32& tries, dev_t mountID, ino_t vnodeID)
917 {
918 	if (--tries < 0) {
919 		// vnode doesn't seem to become unbusy
920 		dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO
921 			" is not becoming unbusy!\n", mountID, vnodeID);
922 		return false;
923 	}
924 	snooze(BUSY_VNODE_DELAY);
925 	return true;
926 }
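
// A minimal sketch of the retry loop retry_busy_vnode() supports;
// get_vnode() below uses the same pattern. Illustrative only (the helper
// name is hypothetical); compiled out via #if 0.
#if 0
static status_t
example_wait_until_unbusy(dev_t mountID, ino_t vnodeID)
{
	int32 tries = BUSY_VNODE_RETRIES;
	while (true) {
		ReadLocker locker(sVnodeLock);
		struct vnode* vnode = lookup_vnode(mountID, vnodeID);
		if (vnode == NULL)
			return B_ENTRY_NOT_FOUND;
		if (!vnode->IsBusy())
			return B_OK;
		locker.Unlock();

		if (!retry_busy_vnode(tries, mountID, vnodeID))
			return B_BUSY;	// gave up after roughly 10 seconds
	}
}
#endif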
927 
928 
929 /*!	Creates a new vnode with the given mount and node ID.
	If the node already exists, it is returned instead and no new node is
	created. In either case -- but not if an error occurs -- the function write
	locks \c sVnodeLock and keeps it locked for the caller when returning. On
	error the lock is not held on return.
934 
935 	\param mountID The mount ID.
936 	\param vnodeID The vnode ID.
937 	\param _vnode Will be set to the new vnode on success.
938 	\param _nodeCreated Will be set to \c true when the returned vnode has
939 		been newly created, \c false when it already existed. Will not be
940 		changed on error.
941 	\return \c B_OK, when the vnode was successfully created and inserted or
942 		a node with the given ID was found, \c B_NO_MEMORY or
943 		\c B_ENTRY_NOT_FOUND on error.
944 */
945 static status_t
946 create_new_vnode_and_lock(dev_t mountID, ino_t vnodeID, struct vnode*& _vnode,
947 	bool& _nodeCreated)
948 {
949 	FUNCTION(("create_new_vnode_and_lock()\n"));
950 
951 	struct vnode* vnode = (struct vnode*)malloc(sizeof(struct vnode));
952 	if (vnode == NULL)
953 		return B_NO_MEMORY;
954 
955 	// initialize basic values
956 	memset(vnode, 0, sizeof(struct vnode));
957 	vnode->device = mountID;
958 	vnode->id = vnodeID;
959 	vnode->ref_count = 1;
960 	vnode->SetBusy(true);
961 
962 	// look up the node -- it might have been added by someone else in the
963 	// meantime
964 	rw_lock_write_lock(&sVnodeLock);
965 	struct vnode* existingVnode = lookup_vnode(mountID, vnodeID);
966 	if (existingVnode != NULL) {
967 		free(vnode);
968 		_vnode = existingVnode;
969 		_nodeCreated = false;
970 		return B_OK;
971 	}
972 
973 	// get the mount structure
974 	mutex_lock(&sMountMutex);
975 	vnode->mount = find_mount(mountID);
976 	if (!vnode->mount || vnode->mount->unmounting) {
977 		mutex_unlock(&sMountMutex);
978 		rw_lock_write_unlock(&sVnodeLock);
979 		free(vnode);
980 		return B_ENTRY_NOT_FOUND;
981 	}
982 
983 	// add the vnode to the mount's node list and the hash table
984 	sVnodeTable->Insert(vnode);
985 	add_vnode_to_mount_list(vnode, vnode->mount);
986 
987 	mutex_unlock(&sMountMutex);
988 
989 	_vnode = vnode;
990 	_nodeCreated = true;
991 
992 	// keep the vnode lock locked
993 	return B_OK;
994 }
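
// A minimal sketch of the calling convention: on success sVnodeLock is left
// write locked and the caller must unlock it. Illustrative only (the helper
// name is hypothetical); compiled out via #if 0.
#if 0
static status_t
example_insert_node(dev_t mountID, ino_t vnodeID)
{
	struct vnode* vnode;
	bool nodeCreated;
	status_t status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
		nodeCreated);
	if (status != B_OK)
		return status;	// the lock is not held on error

	if (nodeCreated) {
		// ... initialize the still busy vnode here ...
	}

	rw_lock_write_unlock(&sVnodeLock);
		// the caller owns the write lock on success
	return B_OK;
}
#endif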
995 
996 
997 /*!	Frees the vnode and all resources it has acquired, and removes
998 	it from the vnode hash as well as from its mount structure.
999 	Will also make sure that any cache modifications are written back.
1000 */
1001 static void
1002 free_vnode(struct vnode* vnode, bool reenter)
1003 {
1004 	ASSERT_PRINT(vnode->ref_count == 0 && vnode->IsBusy(), "vnode: %p\n",
1005 		vnode);
1006 	ASSERT_PRINT(vnode->advisory_locking == NULL, "vnode: %p\n", vnode);
1007 
1008 	// write back any changes in this vnode's cache -- but only
1009 	// if the vnode won't be deleted, in which case the changes
1010 	// will be discarded
1011 
1012 	if (!vnode->IsRemoved() && HAS_FS_CALL(vnode, fsync))
1013 		FS_CALL_NO_PARAMS(vnode, fsync);
1014 
	// Note: If this vnode has a cache attached, there will still be two
	// references to that cache at this point. The last one belongs to the vnode
	// itself (cf. vfs_get_vnode_cache()) and one belongs to the node's file
	// cache. Each reference to a cache but the last also includes a reference
	// to the vnode. The file cache, however, released its reference (cf.
	// file_cache_create()), so that this vnode's ref count ever had the chance
	// to drop to 0. Deleting the file cache now will cause the next-to-last
	// cache reference to be released, which will also release a (no longer
	// existing) vnode reference. To avoid problems, we set the vnode's ref
	// count, so that it will neither become negative nor 0.
1025 	vnode->ref_count = 2;
1026 
1027 	if (!vnode->IsUnpublished()) {
1028 		if (vnode->IsRemoved())
1029 			FS_CALL(vnode, remove_vnode, reenter);
1030 		else
1031 			FS_CALL(vnode, put_vnode, reenter);
1032 	}
1033 
1034 	// If the vnode has a VMCache attached, make sure that it won't try to get
1035 	// another reference via VMVnodeCache::AcquireUnreferencedStoreRef(). As
1036 	// long as the vnode is busy and in the hash, that won't happen, but as
1037 	// soon as we've removed it from the hash, it could reload the vnode -- with
1038 	// a new cache attached!
1039 	if (vnode->cache != NULL && vnode->cache->type == CACHE_TYPE_VNODE)
1040 		((VMVnodeCache*)vnode->cache)->VnodeDeleted();
1041 
1042 	// The file system has removed the resources of the vnode now, so we can
1043 	// make it available again (by removing the busy vnode from the hash).
1044 	rw_lock_write_lock(&sVnodeLock);
1045 	sVnodeTable->Remove(vnode);
1046 	rw_lock_write_unlock(&sVnodeLock);
1047 
1048 	// if we have a VMCache attached, remove it
1049 	if (vnode->cache)
1050 		vnode->cache->ReleaseRef();
1051 
1052 	vnode->cache = NULL;
1053 
1054 	remove_vnode_from_mount_list(vnode, vnode->mount);
1055 
1056 	free(vnode);
1057 }
1058 
1059 
1060 /*!	\brief Decrements the reference counter of the given vnode and deletes it,
1061 	if the counter dropped to 0.
1062 
1063 	The caller must, of course, own a reference to the vnode to call this
1064 	function.
1065 	The caller must not hold the sVnodeLock or the sMountMutex.
1066 
1067 	\param vnode the vnode.
1068 	\param alwaysFree don't move this vnode into the unused list, but really
1069 		   delete it if possible.
1070 	\param reenter \c true, if this function is called (indirectly) from within
1071 		   a file system. This will be passed to file system hooks only.
1072 	\return \c B_OK, if everything went fine, an error code otherwise.
1073 */
1074 static status_t
1075 dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree, bool reenter)
1076 {
1077 	ReadLocker locker(sVnodeLock);
1078 	AutoLocker<Vnode> nodeLocker(vnode);
1079 
1080 	int32 oldRefCount = atomic_add(&vnode->ref_count, -1);
1081 
1082 	ASSERT_PRINT(oldRefCount > 0, "vnode %p\n", vnode);
1083 
1084 	TRACE(("dec_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
1085 		vnode->ref_count));
1086 
1087 	if (oldRefCount != 1)
1088 		return B_OK;
1089 
1090 	if (vnode->IsBusy())
1091 		panic("dec_vnode_ref_count: called on busy vnode %p\n", vnode);
1092 
1093 	bool freeNode = false;
1094 	bool freeUnusedNodes = false;
1095 
1096 	// Just insert the vnode into an unused list if we don't need
1097 	// to delete it
1098 	if (vnode->IsRemoved() || alwaysFree) {
1099 		vnode_to_be_freed(vnode);
1100 		vnode->SetBusy(true);
1101 		freeNode = true;
1102 	} else
1103 		freeUnusedNodes = vnode_unused(vnode);
1104 
1105 	nodeLocker.Unlock();
1106 	locker.Unlock();
1107 
1108 	if (freeNode)
1109 		free_vnode(vnode, reenter);
1110 	else if (freeUnusedNodes)
1111 		free_unused_vnodes();
1112 
1113 	return B_OK;
1114 }
1115 
1116 
1117 /*!	\brief Increments the reference counter of the given vnode.
1118 
1119 	The caller must make sure that the node isn't deleted while this function
1120 	is called. This can be done either:
1121 	- by ensuring that a reference to the node exists and remains in existence,
1122 	  or
1123 	- by holding the vnode's lock (which also requires read locking sVnodeLock)
1124 	  or by holding sVnodeLock write locked.
1125 
	In the second case the caller is responsible for dealing with the ref count
	0 -> 1 transition. That is, 1. this function must not be invoked when the
	node is busy in the first place, and 2. vnode_used() must be called for the
	node (a sketch follows the function).
1130 
1131 	\param vnode the vnode.
1132 */
1133 static void
1134 inc_vnode_ref_count(struct vnode* vnode)
1135 {
1136 	atomic_add(&vnode->ref_count, 1);
1137 	TRACE(("inc_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
1138 		vnode->ref_count));
1139 }
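
// A minimal sketch of the second case described above: resurrecting an
// unused vnode found in the table, holding sVnodeLock (read locked) and the
// vnode lock to own the 0 -> 1 transition. Illustrative only (the helper
// name is hypothetical); compiled out via #if 0.
#if 0
static status_t
example_resurrect(dev_t mountID, ino_t vnodeID)
{
	ReadLocker locker(sVnodeLock);
	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
	if (vnode == NULL)
		return B_ENTRY_NOT_FOUND;

	AutoLocker<Vnode> nodeLocker(vnode);
	if (vnode->IsBusy())
		return B_BUSY;

	if (vnode->ref_count == 0)
		vnode_used(vnode);	// take the node off the unused list
	inc_vnode_ref_count(vnode);
	return B_OK;
}
#endif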
1140 
1141 
1142 static bool
1143 is_special_node_type(int type)
1144 {
1145 	// at the moment only FIFOs are supported
1146 	return S_ISFIFO(type);
1147 }
1148 
1149 
1150 static status_t
1151 create_special_sub_node(struct vnode* vnode, uint32 flags)
1152 {
1153 	if (S_ISFIFO(vnode->Type()))
1154 		return create_fifo_vnode(vnode->mount->volume, vnode);
1155 
1156 	return B_BAD_VALUE;
1157 }
1158 
1159 
1160 /*!	\brief Retrieves a vnode for a given mount ID, node ID pair.
1161 
1162 	If the node is not yet in memory, it will be loaded.
1163 
1164 	The caller must not hold the sVnodeLock or the sMountMutex.
1165 
1166 	\param mountID the mount ID.
1167 	\param vnodeID the node ID.
1168 	\param _vnode Pointer to a vnode* variable into which the pointer to the
1169 		   retrieved vnode structure shall be written.
1170 	\param reenter \c true, if this function is called (indirectly) from within
1171 		   a file system.
	\return \c B_OK, if everything went fine, an error code otherwise.
1173 */
1174 static status_t
1175 get_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode, bool canWait,
1176 	int reenter)
1177 {
1178 	FUNCTION(("get_vnode: mountid %" B_PRId32 " vnid 0x%" B_PRIx64 " %p\n",
1179 		mountID, vnodeID, _vnode));
1180 
1181 	rw_lock_read_lock(&sVnodeLock);
1182 
1183 	int32 tries = BUSY_VNODE_RETRIES;
1184 restart:
1185 	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
1186 	AutoLocker<Vnode> nodeLocker(vnode);
1187 
1188 	if (vnode && vnode->IsBusy()) {
1189 		nodeLocker.Unlock();
1190 		rw_lock_read_unlock(&sVnodeLock);
1191 		if (!canWait) {
1192 			dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO " is busy!\n",
1193 				mountID, vnodeID);
1194 			return B_BUSY;
1195 		}
1196 		if (!retry_busy_vnode(tries, mountID, vnodeID))
1197 			return B_BUSY;
1198 
1199 		rw_lock_read_lock(&sVnodeLock);
1200 		goto restart;
1201 	}
1202 
1203 	TRACE(("get_vnode: tried to lookup vnode, got %p\n", vnode));
1204 
1205 	status_t status;
1206 
1207 	if (vnode) {
1208 		if (vnode->ref_count == 0) {
1209 			// this vnode has been unused before
1210 			vnode_used(vnode);
1211 		}
1212 		inc_vnode_ref_count(vnode);
1213 
1214 		nodeLocker.Unlock();
1215 		rw_lock_read_unlock(&sVnodeLock);
1216 	} else {
1217 		// we need to create a new vnode and read it in
1218 		rw_lock_read_unlock(&sVnodeLock);
1219 			// unlock -- create_new_vnode_and_lock() write-locks on success
1220 		bool nodeCreated;
1221 		status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
1222 			nodeCreated);
1223 		if (status != B_OK)
1224 			return status;
1225 
1226 		if (!nodeCreated) {
1227 			rw_lock_read_lock(&sVnodeLock);
1228 			rw_lock_write_unlock(&sVnodeLock);
1229 			goto restart;
1230 		}
1231 
1232 		rw_lock_write_unlock(&sVnodeLock);
1233 
1234 		int type;
1235 		uint32 flags;
1236 		status = FS_MOUNT_CALL(vnode->mount, get_vnode, vnodeID, vnode, &type,
1237 			&flags, reenter);
1238 		if (status == B_OK && vnode->private_node == NULL)
1239 			status = B_BAD_VALUE;
1240 
1241 		bool gotNode = status == B_OK;
1242 		bool publishSpecialSubNode = false;
1243 		if (gotNode) {
1244 			vnode->SetType(type);
1245 			publishSpecialSubNode = is_special_node_type(type)
1246 				&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
1247 		}
1248 
1249 		if (gotNode && publishSpecialSubNode)
1250 			status = create_special_sub_node(vnode, flags);
1251 
1252 		if (status != B_OK) {
1253 			if (gotNode)
1254 				FS_CALL(vnode, put_vnode, reenter);
1255 
1256 			rw_lock_write_lock(&sVnodeLock);
1257 			sVnodeTable->Remove(vnode);
1258 			remove_vnode_from_mount_list(vnode, vnode->mount);
1259 			rw_lock_write_unlock(&sVnodeLock);
1260 
1261 			free(vnode);
1262 			return status;
1263 		}
1264 
1265 		rw_lock_read_lock(&sVnodeLock);
1266 		vnode->Lock();
1267 
1268 		vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
1269 		vnode->SetBusy(false);
1270 
1271 		vnode->Unlock();
1272 		rw_lock_read_unlock(&sVnodeLock);
1273 	}
1274 
1275 	TRACE(("get_vnode: returning %p\n", vnode));
1276 
1277 	*_vnode = vnode;
1278 	return B_OK;
1279 }
1280 
1281 
1282 /*!	\brief Decrements the reference counter of the given vnode and deletes it,
1283 	if the counter dropped to 0.
1284 
1285 	The caller must, of course, own a reference to the vnode to call this
1286 	function.
1287 	The caller must not hold the sVnodeLock or the sMountMutex.
1288 
1289 	\param vnode the vnode.
1290 */
1291 static inline void
1292 put_vnode(struct vnode* vnode)
1293 {
1294 	dec_vnode_ref_count(vnode, false, false);
1295 }
1296 
1297 
1298 static void
1299 free_unused_vnodes(int32 level)
1300 {
1301 	unused_vnodes_check_started();
1302 
1303 	if (level == B_NO_LOW_RESOURCE) {
1304 		unused_vnodes_check_done();
1305 		return;
1306 	}
1307 
1308 	flush_hot_vnodes();
1309 
1310 	// determine how many nodes to free
1311 	uint32 count = 1;
1312 	{
1313 		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
1314 
1315 		switch (level) {
1316 			case B_LOW_RESOURCE_NOTE:
1317 				count = sUnusedVnodes / 100;
1318 				break;
1319 			case B_LOW_RESOURCE_WARNING:
1320 				count = sUnusedVnodes / 10;
1321 				break;
1322 			case B_LOW_RESOURCE_CRITICAL:
1323 				count = sUnusedVnodes;
1324 				break;
1325 		}
1326 
1327 		if (count > sUnusedVnodes)
1328 			count = sUnusedVnodes;
1329 	}
1330 
1331 	// Write back the modified pages of some unused vnodes and free them.
1332 
1333 	for (uint32 i = 0; i < count; i++) {
1334 		ReadLocker vnodesReadLocker(sVnodeLock);
1335 
1336 		// get the first node
1337 		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
1338 		struct vnode* vnode = (struct vnode*)list_get_first_item(
1339 			&sUnusedVnodeList);
1340 		unusedVnodesLocker.Unlock();
1341 
1342 		if (vnode == NULL)
1343 			break;
1344 
1345 		// lock the node
1346 		AutoLocker<Vnode> nodeLocker(vnode);
1347 
1348 		// Check whether the node is still unused -- since we only append to the
1349 		// tail of the unused queue, the vnode should still be at its head.
1350 		// Alternatively we could check its ref count for 0 and its busy flag,
1351 		// but if the node is no longer at the head of the queue, it means it
1352 		// has been touched in the meantime, i.e. it is no longer the least
		// recently used unused vnode and we'd rather not free it.
1354 		unusedVnodesLocker.Lock();
1355 		if (vnode != list_get_first_item(&sUnusedVnodeList))
1356 			continue;
1357 		unusedVnodesLocker.Unlock();
1358 
1359 		ASSERT(!vnode->IsBusy());
1360 
1361 		// grab a reference
1362 		inc_vnode_ref_count(vnode);
1363 		vnode_used(vnode);
1364 
1365 		// write back changes and free the node
1366 		nodeLocker.Unlock();
1367 		vnodesReadLocker.Unlock();
1368 
1369 		if (vnode->cache != NULL)
1370 			vnode->cache->WriteModified();
1371 
1372 		dec_vnode_ref_count(vnode, true, false);
1373 			// this should free the vnode when it's still unused
1374 	}
1375 
1376 	unused_vnodes_check_done();
1377 }
1378 
1379 
1380 /*!	Gets the vnode the given vnode is covering.
1381 
1382 	The caller must have \c sVnodeLock read-locked at least.
1383 
	The function returns a reference to the retrieved vnode (if any), which the
	caller is responsible for releasing.
1386 
1387 	\param vnode The vnode whose covered node shall be returned.
1388 	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
1389 		vnode.
1390 */
1391 static inline Vnode*
1392 get_covered_vnode_locked(Vnode* vnode)
1393 {
1394 	if (Vnode* coveredNode = vnode->covers) {
1395 		while (coveredNode->covers != NULL)
1396 			coveredNode = coveredNode->covers;
1397 
1398 		inc_vnode_ref_count(coveredNode);
1399 		return coveredNode;
1400 	}
1401 
1402 	return NULL;
1403 }
1404 
1405 
1406 /*!	Gets the vnode the given vnode is covering.
1407 
1408 	The caller must not hold \c sVnodeLock. Note that this implies a race
1409 	condition, since the situation can change at any time.
1410 
	The function returns a reference to the retrieved vnode (if any), which the
	caller is responsible for releasing.
1413 
1414 	\param vnode The vnode whose covered node shall be returned.
1415 	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
1416 		vnode.
1417 */
1418 static inline Vnode*
1419 get_covered_vnode(Vnode* vnode)
1420 {
1421 	if (!vnode->IsCovering())
1422 		return NULL;
1423 
1424 	ReadLocker vnodeReadLocker(sVnodeLock);
1425 	return get_covered_vnode_locked(vnode);
1426 }
1427 
1428 
1429 /*!	Gets the vnode the given vnode is covered by.
1430 
1431 	The caller must have \c sVnodeLock read-locked at least.
1432 
	The function returns a reference to the retrieved vnode (if any), which the
	caller is responsible for releasing.
1435 
1436 	\param vnode The vnode whose covering node shall be returned.
1437 	\return The covering vnode, or \c NULL if the given vnode isn't covered by
1438 		any vnode.
1439 */
1440 static Vnode*
1441 get_covering_vnode_locked(Vnode* vnode)
1442 {
1443 	if (Vnode* coveringNode = vnode->covered_by) {
1444 		while (coveringNode->covered_by != NULL)
1445 			coveringNode = coveringNode->covered_by;
1446 
1447 		inc_vnode_ref_count(coveringNode);
1448 		return coveringNode;
1449 	}
1450 
1451 	return NULL;
1452 }
1453 
1454 
1455 /*!	Gets the vnode the given vnode is covered by.
1456 
1457 	The caller must not hold \c sVnodeLock. Note that this implies a race
1458 	condition, since the situation can change at any time.
1459 
	The function returns a reference to the retrieved vnode (if any), which the
	caller is responsible for releasing.
1462 
1463 	\param vnode The vnode whose covering node shall be returned.
1464 	\return The covering vnode, or \c NULL if the given vnode isn't covered by
1465 		any vnode.
1466 */
1467 static inline Vnode*
1468 get_covering_vnode(Vnode* vnode)
1469 {
1470 	if (!vnode->IsCovered())
1471 		return NULL;
1472 
1473 	ReadLocker vnodeReadLocker(sVnodeLock);
1474 	return get_covering_vnode_locked(vnode);
1475 }
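
// A minimal sketch of the typical use of the covered/covering chain: when a
// directory serves as a mount point, path resolution follows covered_by to
// the root of the topmost mount. Illustrative only (the helper name is
// hypothetical); compiled out via #if 0.
#if 0
static struct vnode*
example_resolve_mount_point(struct vnode* vnode)
{
	// returns a referenced vnode: either the covering node of the topmost
	// mount, or the given vnode itself if nothing covers it
	if (Vnode* covering = get_covering_vnode(vnode))
		return covering;

	inc_vnode_ref_count(vnode);
	return vnode;
}
#endif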
1476 
1477 
1478 static void
1479 free_unused_vnodes()
1480 {
1481 	free_unused_vnodes(
1482 		low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1483 			| B_KERNEL_RESOURCE_ADDRESS_SPACE));
1484 }
1485 
1486 
1487 static void
1488 vnode_low_resource_handler(void* /*data*/, uint32 resources, int32 level)
1489 {
1490 	TRACE(("vnode_low_resource_handler(level = %" B_PRId32 ")\n", level));
1491 
1492 	free_unused_vnodes(level);
1493 }
1494 
1495 
1496 static inline void
1497 put_advisory_locking(struct advisory_locking* locking)
1498 {
1499 	release_sem(locking->lock);
1500 }
1501 
1502 
1503 /*!	Returns the advisory_locking object of the \a vnode in case it
1504 	has one, and locks it.
1505 	You have to call put_advisory_locking() when you're done with
1506 	it.
	Note: you must not have the vnode mutex locked when calling
1508 	this function.
1509 */
1510 static struct advisory_locking*
1511 get_advisory_locking(struct vnode* vnode)
1512 {
1513 	rw_lock_read_lock(&sVnodeLock);
1514 	vnode->Lock();
1515 
1516 	struct advisory_locking* locking = vnode->advisory_locking;
1517 	sem_id lock = locking != NULL ? locking->lock : B_ERROR;
1518 
1519 	vnode->Unlock();
1520 	rw_lock_read_unlock(&sVnodeLock);
1521 
1522 	if (lock >= 0)
1523 		lock = acquire_sem(lock);
1524 	if (lock < 0) {
1525 		// This means the locking has been deleted in the mean time
1526 		// or had never existed in the first place - otherwise, we
1527 		// would get the lock at some point.
1528 		return NULL;
1529 	}
1530 
1531 	return locking;
1532 }
1533 
1534 
1535 /*!	Creates a locked advisory_locking object, and attaches it to the
1536 	given \a vnode.
	Returns B_OK in case of success -- also if the vnode got such an
	object from someone else in the meantime; you'll still get that
	one locked then.
1540 */
1541 static status_t
1542 create_advisory_locking(struct vnode* vnode)
1543 {
1544 	if (vnode == NULL)
1545 		return B_FILE_ERROR;
1546 
1547 	ObjectDeleter<advisory_locking> lockingDeleter;
1548 	struct advisory_locking* locking = NULL;
1549 
1550 	while (get_advisory_locking(vnode) == NULL) {
1551 		// no locking object set on the vnode yet, create one
1552 		if (locking == NULL) {
1553 			locking = new(std::nothrow) advisory_locking;
1554 			if (locking == NULL)
1555 				return B_NO_MEMORY;
1556 			lockingDeleter.SetTo(locking);
1557 
1558 			locking->wait_sem = create_sem(0, "advisory lock");
1559 			if (locking->wait_sem < 0)
1560 				return locking->wait_sem;
1561 
1562 			locking->lock = create_sem(0, "advisory locking");
1563 			if (locking->lock < 0)
1564 				return locking->lock;
1565 		}
1566 
1567 		// set our newly created locking object
1568 		ReadLocker _(sVnodeLock);
1569 		AutoLocker<Vnode> nodeLocker(vnode);
1570 		if (vnode->advisory_locking == NULL) {
1571 			vnode->advisory_locking = locking;
1572 			lockingDeleter.Detach();
1573 			return B_OK;
1574 		}
1575 	}
1576 
1577 	// The vnode already had a locking object. That's just as well.
1578 
1579 	return B_OK;
1580 }
1581 
1582 
1583 /*! Returns \c true when either \a flock is \c NULL or the \a flock intersects
1584 	with the advisory_lock \a lock.
1585 */
1586 static bool
1587 advisory_lock_intersects(struct advisory_lock* lock, struct flock* flock)
1588 {
1589 	if (flock == NULL)
1590 		return true;
1591 
1592 	return lock->start <= flock->l_start - 1 + flock->l_len
1593 		&& lock->end >= flock->l_start;
1594 }
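
// A worked example of the intersection test, assuming a normalized flock:
// a flock with l_start = 100 and l_len = 50 covers bytes [100, 149]. A lock
// spanning [140, 200] intersects it, since 140 <= 149 and 200 >= 100; a lock
// spanning [150, 200] does not, since 150 > 149.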
1595 
1596 
1597 /*!	Tests whether acquiring a lock would block.
1598 */
1599 static status_t
1600 test_advisory_lock(struct vnode* vnode, struct flock* flock)
1601 {
1602 	flock->l_type = F_UNLCK;
1603 
1604 	struct advisory_locking* locking = get_advisory_locking(vnode);
1605 	if (locking == NULL)
1606 		return B_OK;
1607 
1608 	team_id team = team_get_current_team_id();
1609 
1610 	LockList::Iterator iterator = locking->locks.GetIterator();
1611 	while (iterator.HasNext()) {
1612 		struct advisory_lock* lock = iterator.Next();
1613 
		if (lock->team != team && advisory_lock_intersects(lock, flock)) {
1615 			// locks do overlap
1616 			if (flock->l_type != F_RDLCK || !lock->shared) {
1617 				// collision
1618 				flock->l_type = lock->shared ? F_RDLCK : F_WRLCK;
1619 				flock->l_whence = SEEK_SET;
1620 				flock->l_start = lock->start;
1621 				flock->l_len = lock->end - lock->start + 1;
1622 				flock->l_pid = lock->team;
1623 				break;
1624 			}
1625 		}
1626 	}
1627 
1628 	put_advisory_locking(locking);
1629 	return B_OK;
1630 }
1631 
1632 
1633 /*!	Removes the specified lock, or all locks of the calling team
1634 	if \a flock is NULL.
1635 */
1636 static status_t
1637 release_advisory_lock(struct vnode* vnode, struct io_context* context,
1638 	struct file_descriptor* descriptor, struct flock* flock)
1639 {
1640 	FUNCTION(("release_advisory_lock(vnode = %p, flock = %p)\n", vnode, flock));
1641 
1642 	struct advisory_locking* locking = get_advisory_locking(vnode);
1643 	if (locking == NULL)
1644 		return B_OK;
1645 
1646 	// find matching lock entries
1647 
1648 	LockList::Iterator iterator = locking->locks.GetIterator();
1649 	while (iterator.HasNext()) {
1650 		struct advisory_lock* lock = iterator.Next();
1651 		bool removeLock = false;
1652 
1653 		if (descriptor != NULL && lock->bound_to == descriptor) {
1654 			// Remove flock() locks
1655 			removeLock = true;
1656 		} else if (lock->bound_to == context
1657 				&& advisory_lock_intersects(lock, flock)) {
1658 			// Remove POSIX locks
1659 			bool endsBeyond = false;
1660 			bool startsBefore = false;
1661 			if (flock != NULL) {
1662 				startsBefore = lock->start < flock->l_start;
1663 				endsBeyond = lock->end > flock->l_start - 1 + flock->l_len;
1664 			}
1665 
1666 			if (!startsBefore && !endsBeyond) {
1667 				// lock is completely contained in flock
1668 				removeLock = true;
1669 			} else if (startsBefore && !endsBeyond) {
1670 				// cut the end of the lock
1671 				lock->end = flock->l_start - 1;
1672 			} else if (!startsBefore && endsBeyond) {
1673 				// cut the start of the lock
1674 				lock->start = flock->l_start + flock->l_len;
1675 			} else {
1676 				// divide the lock into two locks
				// allocated with malloc() to match acquire_advisory_lock()
				// and the free() in the removal path below
				struct advisory_lock* secondLock = (struct advisory_lock*)
					malloc(sizeof(struct advisory_lock));
				if (secondLock == NULL) {
					// TODO: we should probably revert the locks we already
					// changed... (ie. allocate upfront)
					put_advisory_locking(locking);
					return B_NO_MEMORY;
				}

				off_t originalEnd = lock->end;
				lock->end = flock->l_start - 1;

				secondLock->bound_to = context;
				secondLock->team = lock->team;
				secondLock->session = lock->session;
				// values must already be normalized when getting here
				secondLock->start = flock->l_start + flock->l_len;
				secondLock->end = originalEnd;
					// keep the original end; lock->end was truncated above
				secondLock->shared = lock->shared;
1694 
1695 				locking->locks.Add(secondLock);
1696 			}
1697 		}
1698 
1699 		if (removeLock) {
1700 			// this lock is no longer used
1701 			iterator.Remove();
1702 			free(lock);
1703 		}
1704 	}
1705 
1706 	bool removeLocking = locking->locks.IsEmpty();
1707 	release_sem_etc(locking->wait_sem, 1, B_RELEASE_ALL);
1708 
1709 	put_advisory_locking(locking);
1710 
1711 	if (removeLocking) {
1712 		// We can remove the whole advisory locking structure; it's no
1713 		// longer used
1714 		locking = get_advisory_locking(vnode);
1715 		if (locking != NULL) {
1716 			ReadLocker locker(sVnodeLock);
1717 			AutoLocker<Vnode> nodeLocker(vnode);
1718 
1719 			// the locking could have been changed in the mean time
1720 			if (locking->locks.IsEmpty()) {
1721 				vnode->advisory_locking = NULL;
1722 				nodeLocker.Unlock();
1723 				locker.Unlock();
1724 
1725 				// we've detached the locking from the vnode, so we can
1726 				// safely delete it
1727 				delete locking;
1728 			} else {
1729 				// the locking is in use again
1730 				nodeLocker.Unlock();
1731 				locker.Unlock();
1732 				release_sem_etc(locking->lock, 1, B_DO_NOT_RESCHEDULE);
1733 			}
1734 		}
1735 	}
1736 
1737 	return B_OK;
1738 }
1739 
1740 
/*!	Acquires an advisory lock for the \a vnode. If \a wait is \c true, it
	will wait for the lock to become available if there are any collisions
	(if \a wait is \c false, it returns B_PERMISSION_DENIED, or B_WOULD_BLOCK
	for flock() style locks, in this case).

	If \a descriptor is NULL, POSIX semantics are used for this lock. Otherwise,
	BSD flock() semantics are used, that is, all children can unlock the file
	in question (we even allow parents to remove the lock, but that
	seems to be in line with what the BSDs are doing).
1749 */
1750 static status_t
1751 acquire_advisory_lock(struct vnode* vnode, io_context* context,
1752 	struct file_descriptor* descriptor, struct flock* flock, bool wait)
1753 {
1754 	FUNCTION(("acquire_advisory_lock(vnode = %p, flock = %p, wait = %s)\n",
1755 		vnode, flock, wait ? "yes" : "no"));
1758 
1759 	bool shared = flock->l_type == F_RDLCK;
1760 	void* boundTo = descriptor != NULL ? (void*)descriptor : (void*)context;
1761 	status_t status = B_OK;
1762 
1763 	// TODO: do deadlock detection!
1764 
1765 	struct advisory_locking* locking;
1766 
1767 	while (true) {
1768 		// if this vnode has an advisory_locking structure attached,
1769 		// lock that one and search for any colliding file lock
1770 		status = create_advisory_locking(vnode);
1771 		if (status != B_OK)
1772 			return status;
1773 
1774 		locking = vnode->advisory_locking;
1775 		team_id team = team_get_current_team_id();
1776 		sem_id waitForLock = -1;
1777 
1778 		// test for collisions
1779 		LockList::Iterator iterator = locking->locks.GetIterator();
1780 		while (iterator.HasNext()) {
1781 			struct advisory_lock* lock = iterator.Next();
1782 
1783 			// TODO: locks from the same team might be joinable!
1784 			if ((lock->team != team || lock->bound_to != boundTo)
1785 					&& advisory_lock_intersects(lock, flock)) {
1786 				// locks do overlap
1787 				if (!shared || !lock->shared) {
1788 					// we need to wait
1789 					waitForLock = locking->wait_sem;
1790 					break;
1791 				}
1792 			}
1793 		}
1794 
1795 		if (waitForLock < 0)
1796 			break;
1797 
1798 		// We need to wait. Do that or fail now, if we've been asked not to.
1799 
1800 		if (!wait) {
1801 			put_advisory_locking(locking);
1802 			return descriptor != NULL ? B_WOULD_BLOCK : B_PERMISSION_DENIED;
1803 		}
1804 
1805 		status = switch_sem_etc(locking->lock, waitForLock, 1,
1806 			B_CAN_INTERRUPT, 0);
1807 		if (status != B_OK && status != B_BAD_SEM_ID)
1808 			return status;
1809 
1810 		// We have been notified, but we need to re-lock the locking object. So
1811 		// go another round...
1812 	}
1813 
1814 	// install new lock
1815 
1816 	struct advisory_lock* lock = (struct advisory_lock*)malloc(
1817 		sizeof(struct advisory_lock));
1818 	if (lock == NULL) {
1819 		put_advisory_locking(locking);
1820 		return B_NO_MEMORY;
1821 	}
1822 
1823 	lock->bound_to = boundTo;
1824 	lock->team = team_get_current_team_id();
1825 	lock->session = thread_get_current_thread()->team->session_id;
1826 	// values must already be normalized when getting here
1827 	lock->start = flock->l_start;
1828 	lock->end = flock->l_start - 1 + flock->l_len;
1829 	lock->shared = shared;
1830 
1831 	locking->locks.Add(lock);
1832 	put_advisory_locking(locking);
1833 
1834 	return status;
1835 }
1836 
1837 
1838 /*!	Normalizes the \a flock structure to make it easier to compare the
	structure with others. The l_start and l_len fields are set to absolute
	values according to the l_whence field (a worked example follows the
	function).
1841 */
1842 static status_t
1843 normalize_flock(struct file_descriptor* descriptor, struct flock* flock)
1844 {
1845 	switch (flock->l_whence) {
1846 		case SEEK_SET:
1847 			break;
1848 		case SEEK_CUR:
1849 			flock->l_start += descriptor->pos;
1850 			break;
1851 		case SEEK_END:
1852 		{
1853 			struct vnode* vnode = descriptor->u.vnode;
1854 			struct stat stat;
1855 			status_t status;
1856 
1857 			if (!HAS_FS_CALL(vnode, read_stat))
1858 				return B_UNSUPPORTED;
1859 
1860 			status = FS_CALL(vnode, read_stat, &stat);
1861 			if (status != B_OK)
1862 				return status;
1863 
1864 			flock->l_start += stat.st_size;
1865 			break;
1866 		}
1867 		default:
1868 			return B_BAD_VALUE;
1869 	}
1870 
1871 	if (flock->l_start < 0)
1872 		flock->l_start = 0;
1873 	if (flock->l_len == 0)
1874 		flock->l_len = OFF_MAX;
1875 
1876 	// don't let the offset and length overflow
1877 	if (flock->l_start > 0 && OFF_MAX - flock->l_start < flock->l_len)
1878 		flock->l_len = OFF_MAX - flock->l_start;
1879 
1880 	if (flock->l_len < 0) {
1881 		// a negative length reverses the region
1882 		flock->l_start += flock->l_len;
1883 		flock->l_len = -flock->l_len;
1884 	}
1885 
1886 	return B_OK;
1887 }
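

/*	Worked example (illustrative): normalize_flock() applied to a descriptor
	whose current position is 1000:

		input:   l_whence = SEEK_CUR, l_start = -200, l_len = 100
		step 1:  l_start += descriptor->pos  ->  l_start = 800
		result:  the region covers offsets [800, 899]

	A negative length reverses the region: l_start = 800, l_len = -100
	becomes l_start = 700, l_len = 100. An l_len of 0 means "up to the end
	of the file" and is represented as l_len = OFF_MAX.
*/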
1888 
1889 
1890 static void
1891 replace_vnode_if_disconnected(struct fs_mount* mount,
1892 	struct vnode* vnodeToDisconnect, struct vnode*& vnode,
1893 	struct vnode* fallBack, bool lockRootLock)
1894 {
1895 	struct vnode* givenVnode = vnode;
1896 	bool vnodeReplaced = false;
1897 
1898 	ReadLocker vnodeReadLocker(sVnodeLock);
1899 
1900 	if (lockRootLock)
1901 		mutex_lock(&sIOContextRootLock);
1902 
1903 	while (vnode != NULL && vnode->mount == mount
1904 		&& (vnodeToDisconnect == NULL || vnodeToDisconnect == vnode)) {
1905 		if (vnode->covers != NULL) {
1906 			// redirect the vnode to the covered vnode
1907 			vnode = vnode->covers;
1908 		} else
1909 			vnode = fallBack;
1910 
1911 		vnodeReplaced = true;
1912 	}
1913 
1914 	// If we've replaced the node, grab a reference for the new one.
1915 	if (vnodeReplaced && vnode != NULL)
1916 		inc_vnode_ref_count(vnode);
1917 
1918 	if (lockRootLock)
1919 		mutex_unlock(&sIOContextRootLock);
1920 
1921 	vnodeReadLocker.Unlock();
1922 
1923 	if (vnodeReplaced)
1924 		put_vnode(givenVnode);
1925 }
1926 
1927 
1928 /*!	Disconnects all file descriptors that are associated with the
1929 	\a vnodeToDisconnect, or if this is NULL, all vnodes of the specified
1930 	\a mount object.
1931 
1932 	Note that after you've called this function there might still be ongoing
1933 	accesses - those that were already in progress won't be interrupted.
1934 	However, any subsequent access will fail.
1935 
1936 	This is not a cheap function and should be used with care and rarely.
1937 	TODO: there is currently no means to stop a blocking read/write!
1938 */
1939 static void
1940 disconnect_mount_or_vnode_fds(struct fs_mount* mount,
1941 	struct vnode* vnodeToDisconnect)
1942 {
1943 	// iterate over all teams and peek into their file descriptors
1944 	TeamListIterator teamIterator;
1945 	while (Team* team = teamIterator.Next()) {
1946 		BReference<Team> teamReference(team, true);
1947 		TeamLocker teamLocker(team);
1948 
1949 		// lock the I/O context
1950 		io_context* context = team->io_context;
1951 		if (context == NULL)
1952 			continue;
1953 		MutexLocker contextLocker(context->io_mutex);
1954 
1955 		teamLocker.Unlock();
1956 
1957 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->root,
1958 			sRoot, true);
1959 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->cwd,
1960 			sRoot, false);
1961 
1962 		for (uint32 i = 0; i < context->table_size; i++) {
1963 			struct file_descriptor* descriptor = context->fds[i];
1964 			if (descriptor == NULL || (descriptor->open_mode & O_DISCONNECTED) != 0)
1965 				continue;
1966 
1967 			inc_fd_ref_count(descriptor);
1968 
1969 			// if this descriptor points at this mount, we
1970 			// need to disconnect it to be able to unmount
1971 			struct vnode* vnode = fd_vnode(descriptor);
1972 			if (vnodeToDisconnect != NULL) {
1973 				if (vnode == vnodeToDisconnect)
1974 					disconnect_fd(descriptor);
1975 			} else if ((vnode != NULL && vnode->mount == mount)
1976 				|| (vnode == NULL && descriptor->u.mount == mount))
1977 				disconnect_fd(descriptor);
1978 
1979 			put_fd(descriptor);
1980 		}
1981 	}
1982 }
1983 
1984 
1985 /*!	\brief Gets the root node of the current IO context.
1986 	If \a kernel is \c true, the kernel IO context will be used.
1987 	The caller obtains a reference to the returned node.
1988 */
1989 struct vnode*
1990 get_root_vnode(bool kernel)
1991 {
1992 	if (!kernel) {
1993 		// Get the root from the current IO context
1994 		struct io_context* context = get_current_io_context(kernel);
1995 
1996 		mutex_lock(&sIOContextRootLock);
1997 
1998 		struct vnode* root = context->root;
1999 		if (root != NULL)
2000 			inc_vnode_ref_count(root);
2001 
2002 		mutex_unlock(&sIOContextRootLock);
2003 
2004 		if (root != NULL)
2005 			return root;
2006 
2007 		// That should never happen.
2008 		dprintf("get_root_vnode(): IO context for team %" B_PRId32 " doesn't "
2009 			"have a root\n", team_get_current_team_id());
2010 	}
2011 
2012 	inc_vnode_ref_count(sRoot);
2013 	return sRoot;
2014 }
2015 
2016 
2017 /*!	\brief Gets the directory path and leaf name for a given path.
2018 
2019 	The supplied \a path is transformed to refer to the directory part of
2020 	the entry identified by the original path, and the leaf name of the
2021 	original entry is written into the buffer \a filename.
2022 	Neither the returned path nor the leaf name can be expected to be
2023 	canonical.
2024 
2025 	\param path The path to be analyzed. Must be able to store at least one
2026 		   additional character.
2027 	\param filename The buffer into which the leaf name will be written.
2028 		   Must be of size B_FILE_NAME_LENGTH at least.
2029 	\return \c B_OK, if everything went fine, \c B_NAME_TOO_LONG, if the leaf
2030 		   name is longer than \c B_FILE_NAME_LENGTH, or \c B_ENTRY_NOT_FOUND,
2031 		   if the given path name is empty.
2032 */
2033 static status_t
2034 get_dir_path_and_leaf(char* path, char* filename)
2035 {
2036 	if (*path == '\0')
2037 		return B_ENTRY_NOT_FOUND;
2038 
2039 	char* last = strrchr(path, '/');
2040 		// '/' are not allowed in file names!
2041 
2042 	FUNCTION(("get_dir_path_and_leaf(path = %s)\n", path));
2043 
2044 	if (last == NULL) {
2045 		// this path is a single segment with no '/' in it,
2046 		// e.g. "foo"
2047 		if (strlcpy(filename, path, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2048 			return B_NAME_TOO_LONG;
2049 
2050 		strcpy(path, ".");
2051 	} else {
2052 		last++;
2053 		if (last[0] == '\0') {
2054 			// special case: the path ends in one or more '/' - remove them
2055 			while (*--last == '/' && last != path);
2056 			last[1] = '\0';
2057 
2058 			if (last == path && last[0] == '/') {
2059 				// This path points to the root of the file system
2060 				strcpy(filename, ".");
2061 				return B_OK;
2062 			}
2063 			for (; last != path && *(last - 1) != '/'; last--);
2064 				// rewind to the start of the leaf before the '/'
2065 		}
2066 
2067 		// normal leaf: replace the leaf portion of the path with a '.'
2068 		if (strlcpy(filename, last, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2069 			return B_NAME_TOO_LONG;
2070 
2071 		last[0] = '.';
2072 		last[1] = '\0';
2073 	}
2074 	return B_OK;
2075 }
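

/*	Examples (illustrative) of how get_dir_path_and_leaf() transforms its
	arguments in place:

		path "foo"     ->  path ".",      filename "foo"
		path "/a/b/c"  ->  path "/a/b/.", filename "c"
		path "/a/b/"   ->  path "/a/.",   filename "b"
		path "/"       ->  path "/",      filename "."
*/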
2076 
2077 
2078 static status_t
2079 entry_ref_to_vnode(dev_t mountID, ino_t directoryID, const char* name,
2080 	bool traverse, bool kernel, struct vnode** _vnode)
2081 {
2082 	char clonedName[B_FILE_NAME_LENGTH + 1];
2083 	if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2084 		return B_NAME_TOO_LONG;
2085 
2086 	// get the directory vnode and let vnode_path_to_vnode() do the rest
2087 	struct vnode* directory;
2088 
2089 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
2090 	if (status < 0)
2091 		return status;
2092 
2093 	return vnode_path_to_vnode(directory, clonedName, traverse, 0, kernel,
2094 		_vnode, NULL);
2095 }
2096 
2097 
2098 /*!	Looks up the entry with name \a name in the directory represented by \a dir
2099 	and returns the respective vnode.
2100 	On success a reference to the vnode is acquired for the caller.
2101 */
2102 static status_t
2103 lookup_dir_entry(struct vnode* dir, const char* name, struct vnode** _vnode)
2104 {
2105 	ino_t id;
2106 	bool missing;
2107 
2108 	if (dir->mount->entry_cache.Lookup(dir->id, name, id, missing)) {
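	// Note that the cache may also hold negative entries: a hit with
	// missing == true means the file system has already reported that this
	// entry does not exist, so we can answer without calling into the FS.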
2109 		return missing ? B_ENTRY_NOT_FOUND
2110 			: get_vnode(dir->device, id, _vnode, true, false);
2111 	}
2112 
2113 	status_t status = FS_CALL(dir, lookup, name, &id);
2114 	if (status != B_OK)
2115 		return status;
2116 
2117 	// The lookup() hook calls get_vnode() or publish_vnode(), so we do already
2118 	// have a reference and just need to look the node up.
2119 	rw_lock_read_lock(&sVnodeLock);
2120 	*_vnode = lookup_vnode(dir->device, id);
2121 	rw_lock_read_unlock(&sVnodeLock);
2122 
2123 	if (*_vnode == NULL) {
2124 		panic("lookup_dir_entry(): could not lookup vnode (mountid 0x%" B_PRIx32
2125 			" vnid 0x%" B_PRIx64 ")\n", dir->device, id);
2126 		return B_ENTRY_NOT_FOUND;
2127 	}
2128 
2129 //	ktrace_printf("lookup_dir_entry(): dir: %p (%ld, %lld), name: \"%s\" -> "
2130 //		"%p (%ld, %lld)", dir, dir->mount->id, dir->id, name, *_vnode,
2131 //		(*_vnode)->mount->id, (*_vnode)->id);
2132 
2133 	return B_OK;
2134 }
2135 
2136 
2137 /*!	Returns the vnode for the relative path starting at the specified \a vnode.
2138 	\a path must not be NULL.
2139 	If it returns successfully, \a path contains the name of the last path
2140 	component. This function clobbers the buffer pointed to by \a path only
2141 	if it does contain more than one component.
2142 	Note that this reduces the ref_count of the starting \a vnode, whether
2143 	it succeeds or not!
2144 */
2145 static status_t
2146 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2147 	int count, struct io_context* ioContext, struct vnode** _vnode,
2148 	ino_t* _parentID)
2149 {
2150 	status_t status = B_OK;
2151 	ino_t lastParentID = vnode->id;
2152 
2153 	FUNCTION(("vnode_path_to_vnode(vnode = %p, path = %s)\n", vnode, path));
2154 
2155 	if (path == NULL) {
2156 		put_vnode(vnode);
2157 		return B_BAD_VALUE;
2158 	}
2159 
2160 	if (*path == '\0') {
2161 		put_vnode(vnode);
2162 		return B_ENTRY_NOT_FOUND;
2163 	}
2164 
2165 	while (true) {
2166 		struct vnode* nextVnode;
2167 		char* nextPath;
2168 
2169 		TRACE(("vnode_path_to_vnode: top of loop. p = %p, p = '%s'\n", path,
2170 			path));
2171 
2172 		// done?
2173 		if (path[0] == '\0')
2174 			break;
2175 
2176 		// walk to find the next path component ("path" will point to a single
2177 		// path component), and filter out multiple slashes
2178 		for (nextPath = path + 1; *nextPath != '\0' && *nextPath != '/';
2179 				nextPath++);
2180 
2181 		if (*nextPath == '/') {
2182 			*nextPath = '\0';
2183 			do
2184 				nextPath++;
2185 			while (*nextPath == '/');
2186 		}
2187 
2188 		// If the '..' is at a covering vnode, move to the covered
2189 		// vnode so we pass the '..' path to the underlying file system.
2190 		// Also prevent breaking out of the root of the IO context.
2191 		if (strcmp("..", path) == 0) {
2192 			if (vnode == ioContext->root) {
2193 				// Attempted prison break! Keep it contained.
2194 				path = nextPath;
2195 				continue;
2196 			}
2197 
2198 			if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2199 				nextVnode = coveredVnode;
2200 				put_vnode(vnode);
2201 				vnode = nextVnode;
2202 			}
2203 		}
2204 
2205 		// check if vnode is really a directory
2206 		if (status == B_OK && !S_ISDIR(vnode->Type()))
2207 			status = B_NOT_A_DIRECTORY;
2208 
2209 		// Check if we have the right to search the current directory vnode.
2210 		// If a file system doesn't have the access() function, we assume that
2211 		// searching a directory is always allowed
2212 		if (status == B_OK && HAS_FS_CALL(vnode, access))
2213 			status = FS_CALL(vnode, access, X_OK);
2214 
2215 		// Tell the filesystem to get the vnode of this path component (if we
2216 		// got the permission from the call above)
2217 		if (status == B_OK)
2218 			status = lookup_dir_entry(vnode, path, &nextVnode);
2219 
2220 		if (status != B_OK) {
2221 			put_vnode(vnode);
2222 			return status;
2223 		}
2224 
2225 		// If the new node is a symbolic link, resolve it (if we've been told
2226 		// to do it)
2227 		if (S_ISLNK(nextVnode->Type())
2228 			&& (traverseLeafLink || nextPath[0] != '\0')) {
2229 			size_t bufferSize;
2230 			char* buffer;
2231 
2232 			TRACE(("traverse link\n"));
2233 
2234 			// it's not exactly nice style using goto in this way, but hey,
2235 			// it works :-/
2236 			if (count + 1 > B_MAX_SYMLINKS) {
2237 				status = B_LINK_LIMIT;
2238 				goto resolve_link_error;
2239 			}
2240 
2241 			bufferSize = B_PATH_NAME_LENGTH;
2242 			buffer = (char*)object_cache_alloc(sPathNameCache, 0);
2243 			if (buffer == NULL) {
2244 				status = B_NO_MEMORY;
2245 				goto resolve_link_error;
2246 			}
2247 
2248 			if (HAS_FS_CALL(nextVnode, read_symlink)) {
2249 				bufferSize--;
2250 				status = FS_CALL(nextVnode, read_symlink, buffer, &bufferSize);
2251 				// null-terminate
2252 				if (status >= 0)
2253 					buffer[bufferSize] = '\0';
2254 			} else
2255 				status = B_BAD_VALUE;
2256 
2257 			if (status != B_OK) {
2258 				free(buffer);
2259 
2260 		resolve_link_error:
2261 				put_vnode(vnode);
2262 				put_vnode(nextVnode);
2263 
2264 				return status;
2265 			}
2266 			put_vnode(nextVnode);
2267 
2268 			// Check if we start from the root directory or the current
2269 			// directory ("vnode" still points to that one).
2270 			// Cut off all leading slashes if it's the root directory
2271 			path = buffer;
2272 			bool absoluteSymlink = false;
2273 			if (path[0] == '/') {
2274 				// we don't need the old directory anymore
2275 				put_vnode(vnode);
2276 
2277 				while (*++path == '/')
2278 					;
2279 
2280 				mutex_lock(&sIOContextRootLock);
2281 				vnode = ioContext->root;
2282 				inc_vnode_ref_count(vnode);
2283 				mutex_unlock(&sIOContextRootLock);
2284 
2285 				absoluteSymlink = true;
2286 			}
2287 
2288 			inc_vnode_ref_count(vnode);
2289 				// balance the next recursion - we will decrement the
2290 				// ref_count of the vnode, no matter if we succeeded or not
2291 
2292 			if (absoluteSymlink && *path == '\0') {
2293 				// symlink was just "/"
2294 				nextVnode = vnode;
2295 			} else {
2296 				status = vnode_path_to_vnode(vnode, path, true, count + 1,
2297 					ioContext, &nextVnode, &lastParentID);
2298 			}
2299 
2300 			object_cache_free(sPathNameCache, buffer, 0);
2301 
2302 			if (status != B_OK) {
2303 				put_vnode(vnode);
2304 				return status;
2305 			}
2306 		} else
2307 			lastParentID = vnode->id;
2308 
2309 		// decrease the ref count on the old dir we just looked up into
2310 		put_vnode(vnode);
2311 
2312 		path = nextPath;
2313 		vnode = nextVnode;
2314 
2315 		// see if we hit a covered node
2316 		if (Vnode* coveringNode = get_covering_vnode(vnode)) {
2317 			put_vnode(vnode);
2318 			vnode = coveringNode;
2319 		}
2320 	}
2321 
2322 	*_vnode = vnode;
2323 	if (_parentID)
2324 		*_parentID = lastParentID;
2325 
2326 	return B_OK;
2327 }
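

/*	Usage sketch (illustrative): vnode_path_to_vnode() consumes one reference
	to the starting vnode, so a caller that wants to keep its own reference
	must acquire an extra one first -- the same pattern used by
	vnode_and_path_to_dir_vnode() and normalize_path() below:

		inc_vnode_ref_count(dir);
			// balances the reference vnode_path_to_vnode() will release
		struct vnode* resolved;
		status_t status = vnode_path_to_vnode(dir, mutablePath, true, 0,
			kernel, &resolved, NULL);
		if (status == B_OK)
			put_vnode(resolved);	// release the reference handed back

	Here mutablePath is a hypothetical writable buffer: the buffer may be
	clobbered during the walk.
*/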
2328 
2329 
2330 static status_t
2331 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2332 	int count, bool kernel, struct vnode** _vnode, ino_t* _parentID)
2333 {
2334 	return vnode_path_to_vnode(vnode, path, traverseLeafLink, count,
2335 		get_current_io_context(kernel), _vnode, _parentID);
2336 }
2337 
2338 
2339 static status_t
2340 path_to_vnode(char* path, bool traverseLink, struct vnode** _vnode,
2341 	ino_t* _parentID, bool kernel)
2342 {
2343 	struct vnode* start = NULL;
2344 
2345 	FUNCTION(("path_to_vnode(path = \"%s\")\n", path));
2346 
2347 	if (!path)
2348 		return B_BAD_VALUE;
2349 
2350 	if (*path == '\0')
2351 		return B_ENTRY_NOT_FOUND;
2352 
2353 	// figure out if we need to start at root or at cwd
2354 	if (*path == '/') {
2355 		if (sRoot == NULL) {
2356 			// we're a bit early, aren't we?
2357 			return B_ERROR;
2358 		}
2359 
2360 		while (*++path == '/')
2361 			;
2362 		start = get_root_vnode(kernel);
2363 
2364 		if (*path == '\0') {
2365 			*_vnode = start;
2366 			return B_OK;
2367 		}
2368 
2369 	} else {
2370 		struct io_context* context = get_current_io_context(kernel);
2371 
2372 		mutex_lock(&context->io_mutex);
2373 		start = context->cwd;
2374 		if (start != NULL)
2375 			inc_vnode_ref_count(start);
2376 		mutex_unlock(&context->io_mutex);
2377 
2378 		if (start == NULL)
2379 			return B_ERROR;
2380 	}
2381 
2382 	return vnode_path_to_vnode(start, path, traverseLink, 0, kernel, _vnode,
2383 		_parentID);
2384 }
2385 
2386 
2387 /*! Returns the vnode for the next to last segment of the path, and writes
2388 	the last portion into \a filename.
2389 	The path buffer must be able to store at least one additional character.
2390 */
2391 static status_t
2392 path_to_dir_vnode(char* path, struct vnode** _vnode, char* filename,
2393 	bool kernel)
2394 {
2395 	status_t status = get_dir_path_and_leaf(path, filename);
2396 	if (status != B_OK)
2397 		return status;
2398 
2399 	return path_to_vnode(path, true, _vnode, NULL, kernel);
2400 }
2401 
2402 
2403 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2404 		   to by a FD + path pair.
2405 
2406 	\a path must be given in either case. \a fd might be omitted, in which
2407 	case \a path is either an absolute path or one relative to the current
2408 	directory. If both are supplied and \a path is relative, it is reckoned
2409 	off of the directory referred to by \a fd. If \a path is absolute, \a fd
2410 	is ignored.
2411 
2412 	The caller has the responsibility to call put_vnode() on the returned
2413 	directory vnode.
2414 
2415 	\param fd The FD. May be < 0.
2416 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2417 	       is modified by this function. It must have at least room for a
2418 	       string one character longer than the path it contains.
2419 	\param _vnode A pointer to a variable the directory vnode shall be written
2420 		   into.
2421 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2422 		   the leaf name of the specified entry will be written.
2423 	\param kernel \c true, if invoked from inside the kernel, \c false if
2424 		   invoked from userland.
2425 	\return \c B_OK, if everything went fine, another error code otherwise.
2426 */
2427 static status_t
2428 fd_and_path_to_dir_vnode(int fd, char* path, struct vnode** _vnode,
2429 	char* filename, bool kernel)
2430 {
2431 	if (!path)
2432 		return B_BAD_VALUE;
2433 	if (*path == '\0')
2434 		return B_ENTRY_NOT_FOUND;
2435 	if (fd < 0)
2436 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2437 
2438 	status_t status = get_dir_path_and_leaf(path, filename);
2439 	if (status != B_OK)
2440 		return status;
2441 
2442 	return fd_and_path_to_vnode(fd, path, true, _vnode, NULL, kernel);
2443 }
2444 
2445 
2446 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2447 		   to by a vnode + path pair.
2448 
2449 	\a path must be given in either case. \a vnode might be omitted, in which
2450 	case \a path is either an absolute path or one relative to the current
2451 	directory. If both are supplied and \a path is relative, it is reckoned
2452 	off of the directory referred to by \a vnode. If \a path is absolute,
2453 	\a vnode is ignored.
2454 
2455 	The caller has the responsibility to call put_vnode() on the returned
2456 	directory vnode.
2457 
2458 	\param vnode The vnode. May be \c NULL.
2459 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2460 	       is modified by this function. It must have at least room for a
2461 	       string one character longer than the path it contains.
2462 	\param _vnode A pointer to a variable the directory vnode shall be written
2463 		   into.
2464 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2465 		   the leaf name of the specified entry will be written.
2466 	\param kernel \c true, if invoked from inside the kernel, \c false if
2467 		   invoked from userland.
2468 	\return \c B_OK, if everything went fine, another error code otherwise.
2469 */
2470 static status_t
2471 vnode_and_path_to_dir_vnode(struct vnode* vnode, char* path,
2472 	struct vnode** _vnode, char* filename, bool kernel)
2473 {
2474 	if (!path)
2475 		return B_BAD_VALUE;
2476 	if (*path == '\0')
2477 		return B_ENTRY_NOT_FOUND;
2478 	if (vnode == NULL || path[0] == '/')
2479 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2480 
2481 	status_t status = get_dir_path_and_leaf(path, filename);
2482 	if (status != B_OK)
2483 		return status;
2484 
2485 	inc_vnode_ref_count(vnode);
2486 		// vnode_path_to_vnode() always decrements the ref count
2487 
2488 	return vnode_path_to_vnode(vnode, path, true, 0, kernel, _vnode, NULL);
2489 }
2490 
2491 
2492 /*! Returns a vnode's name in the d_name field of a supplied dirent buffer.
2493 */
2494 static status_t
2495 get_vnode_name(struct vnode* vnode, struct vnode* parent, struct dirent* buffer,
2496 	size_t bufferSize, struct io_context* ioContext)
2497 {
2498 	if (bufferSize < sizeof(struct dirent))
2499 		return B_BAD_VALUE;
2500 
2501 	// See if the vnode is covering another vnode and move to the covered
2502 	// vnode so we get the underlying file system
2503 	VNodePutter vnodePutter;
2504 	if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2505 		vnode = coveredVnode;
2506 		vnodePutter.SetTo(vnode);
2507 	}
2508 
2509 	if (HAS_FS_CALL(vnode, get_vnode_name)) {
2510 		// The FS supports getting the name of a vnode.
2511 		if (FS_CALL(vnode, get_vnode_name, buffer->d_name,
2512 			(char*)buffer + bufferSize - buffer->d_name) == B_OK)
2513 			return B_OK;
2514 	}
2515 
2516 	// The FS doesn't support getting the name of a vnode. So we search the
2517 	// parent directory for the vnode, if the caller let us.
2518 
2519 	if (parent == NULL || !HAS_FS_CALL(parent, read_dir))
2520 		return B_UNSUPPORTED;
2521 
2522 	void* cookie;
2523 
2524 	status_t status = FS_CALL(parent, open_dir, &cookie);
2525 	if (status >= B_OK) {
2526 		while (true) {
2527 			uint32 num = 1;
2528 			// We use the FS hook directly instead of dir_read(), since we don't
2529 			// want the entries to be fixed. We have already resolved vnode to
2530 			// the covered node.
2531 			status = FS_CALL(parent, read_dir, cookie, buffer, bufferSize,
2532 				&num);
2533 			if (status != B_OK)
2534 				break;
2535 			if (num == 0) {
2536 				status = B_ENTRY_NOT_FOUND;
2537 				break;
2538 			}
2539 
2540 			if (vnode->id == buffer->d_ino) {
2541 				// found correct entry!
2542 				break;
2543 			}
2544 		}
2545 
2546 		FS_CALL(parent, close_dir, cookie);
2547 		FS_CALL(parent, free_dir_cookie, cookie);
2548 	}
2549 	return status;
2550 }
2551 
2552 
2553 static status_t
2554 get_vnode_name(struct vnode* vnode, struct vnode* parent, char* name,
2555 	size_t nameSize, bool kernel)
2556 {
2557 	char buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2558 	struct dirent* dirent = (struct dirent*)buffer;
2559 
2560 	status_t status = get_vnode_name(vnode, parent, dirent, sizeof(buffer),
2561 		get_current_io_context(kernel));
2562 	if (status != B_OK)
2563 		return status;
2564 
2565 	if (strlcpy(name, dirent->d_name, nameSize) >= nameSize)
2566 		return B_BUFFER_OVERFLOW;
2567 
2568 	return B_OK;
2569 }
2570 
2571 
2572 /*!	Gets the full path to a given directory vnode.
2573 	It uses the fs_get_vnode_name() call to get the name of a vnode; if a
2574 	file system doesn't support this call, it will fall back to iterating
2575 	through the parent directory to get the name of the child.
2576 
2577 	To protect against circular loops, it supports a maximum tree depth
2578 	of 256 levels.
2579 
2580 	Note that the path may no longer be correct by the time this function
2581 	returns! It doesn't use any locking to guarantee that, as paths aren't
2582 	safe anyway: the path to a file can change at any time.
2583 
2584 	It might be a good idea, though, for the calling function to check
2585 	whether the returned path exists (it's not done here for efficiency)
2586 */
2587 static status_t
2588 dir_vnode_to_path(struct vnode* vnode, char* buffer, size_t bufferSize,
2589 	bool kernel)
2590 {
2591 	FUNCTION(("dir_vnode_to_path(%p, %p, %lu)\n", vnode, buffer, bufferSize));
2592 
2593 	if (vnode == NULL || buffer == NULL || bufferSize == 0)
2594 		return B_BAD_VALUE;
2595 
2596 	if (!S_ISDIR(vnode->Type()))
2597 		return B_NOT_A_DIRECTORY;
2598 
2599 	char* path = buffer;
2600 	int32 insert = bufferSize;
2601 	int32 maxLevel = 256;
2602 	int32 length;
2603 	status_t status = B_OK;
2604 	struct io_context* ioContext = get_current_io_context(kernel);
2605 
2606 	// we don't use get_vnode() here because this call is more
2607 	// efficient and does all we need from get_vnode()
2608 	inc_vnode_ref_count(vnode);
2609 
2610 	path[--insert] = '\0';
2611 		// the path is filled right to left
2612 
2613 	while (true) {
2614 		// If the node is the context's root, bail out. Otherwise resolve mount
2615 		// points.
2616 		if (vnode == ioContext->root)
2617 			break;
2618 
2619 		if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2620 			put_vnode(vnode);
2621 			vnode = coveredVnode;
2622 		}
2623 
2624 		// lookup the parent vnode
2625 		struct vnode* parentVnode;
2626 		status = lookup_dir_entry(vnode, "..", &parentVnode);
2627 		if (status != B_OK)
2628 			goto out;
2629 
2630 		if (parentVnode == vnode) {
2631 			// The caller apparently got their hands on a node outside of their
2632 			// context's root. Now we've hit the global root.
2633 			put_vnode(parentVnode);
2634 			break;
2635 		}
2636 
2637 		// get the node's name
2638 		char nameBuffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2639 			// also used for fs_read_dir()
2640 		char* name = &((struct dirent*)nameBuffer)->d_name[0];
2641 		status = get_vnode_name(vnode, parentVnode, (struct dirent*)nameBuffer,
2642 			sizeof(nameBuffer), ioContext);
2643 
2644 		// release the current vnode, we only need its parent from now on
2645 		put_vnode(vnode);
2646 		vnode = parentVnode;
2647 
2648 		if (status != B_OK)
2649 			goto out;
2650 
2651 		// TODO: add an explicit check for loops in about 10 levels to do
2652 		// real loop detection
2653 
2654 		// don't go deeper than 'maxLevel' to prevent circular loops
2655 		if (maxLevel-- < 0) {
2656 			status = B_LINK_LIMIT;
2657 			goto out;
2658 		}
2659 
2660 		// add the name in front of the current path
2661 		name[B_FILE_NAME_LENGTH - 1] = '\0';
2662 		length = strlen(name);
2663 		insert -= length;
2664 		if (insert <= 0) {
2665 			status = B_RESULT_NOT_REPRESENTABLE;
2666 			goto out;
2667 		}
2668 		memcpy(path + insert, name, length);
2669 		path[--insert] = '/';
2670 	}
2671 
2672 	// the root dir will result in an empty path: fix it
2673 	if (path[insert] == '\0')
2674 		path[--insert] = '/';
2675 
2676 	TRACE(("  path is: %s\n", path + insert));
2677 
2678 	// move the path to the start of the buffer
2679 	length = bufferSize - insert;
2680 	memmove(buffer, path + insert, length);
2681 
2682 out:
2683 	put_vnode(vnode);
2684 	return status;
2685 }
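

/*	Illustrative sketch (not part of the VFS) of the right-to-left technique
	used above: names are prepended at a falling insert index, and the
	finished path is moved to the front of the buffer with one memmove().
	The component list here is hypothetical:

		char buffer[64];
		int32 insert = sizeof(buffer);
		buffer[--insert] = '\0';

		const char* components[] = { "home", "boot" };	// leaf to root
		for (const char* name : components) {
			size_t length = strlen(name);
			insert -= length;
			memcpy(buffer + insert, name, length);
			buffer[--insert] = '/';
		}
		memmove(buffer, buffer + insert, sizeof(buffer) - insert);
		// buffer now holds "/boot/home"
*/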
2686 
2687 
2688 /*!	Checks the length of every path component, and adds a '.'
2689 	if the path ends in a slash.
2690 	The given path buffer must be able to store at least one
2691 	additional character.
2692 */
2693 static status_t
2694 check_path(char* to)
2695 {
2696 	int32 length = 0;
2697 
2698 	// check length of every path component
2699 
2700 	while (*to) {
2701 		char* begin;
2702 		if (*to == '/')
2703 			to++, length++;
2704 
2705 		begin = to;
2706 		while (*to != '/' && *to)
2707 			to++, length++;
2708 
2709 		if (to - begin > B_FILE_NAME_LENGTH)
2710 			return B_NAME_TOO_LONG;
2711 	}
2712 
2713 	if (length == 0)
2714 		return B_ENTRY_NOT_FOUND;
2715 
2716 	// complete path if there is a slash at the end
2717 
2718 	if (*(to - 1) == '/') {
2719 		if (length > B_PATH_NAME_LENGTH - 2)
2720 			return B_NAME_TOO_LONG;
2721 
2722 		to[0] = '.';
2723 		to[1] = '\0';
2724 	}
2725 
2726 	return B_OK;
2727 }
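

/*	Examples (illustrative) of check_path() results:

		"/foo/bar"   ->  buffer unchanged, returns B_OK
		"/foo/bar/"  ->  buffer becomes "/foo/bar/.", returns B_OK
		""           ->  returns B_ENTRY_NOT_FOUND
		a component longer than B_FILE_NAME_LENGTH -> returns B_NAME_TOO_LONG
*/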
2728 
2729 
2730 static struct file_descriptor*
2731 get_fd_and_vnode(int fd, struct vnode** _vnode, bool kernel)
2732 {
2733 	struct file_descriptor* descriptor
2734 		= get_fd(get_current_io_context(kernel), fd);
2735 	if (descriptor == NULL)
2736 		return NULL;
2737 
2738 	struct vnode* vnode = fd_vnode(descriptor);
2739 	if (vnode == NULL) {
2740 		put_fd(descriptor);
2741 		return NULL;
2742 	}
2743 
2744 	// ToDo: when we can close a file descriptor at any point, investigate
2745 	//	if this is still valid to do (accessing the vnode without ref_count
2746 	//	or locking)
2747 	*_vnode = vnode;
2748 	return descriptor;
2749 }
2750 
2751 
2752 static struct vnode*
2753 get_vnode_from_fd(int fd, bool kernel)
2754 {
2755 	struct file_descriptor* descriptor;
2756 	struct vnode* vnode;
2757 
2758 	descriptor = get_fd(get_current_io_context(kernel), fd);
2759 	if (descriptor == NULL)
2760 		return NULL;
2761 
2762 	vnode = fd_vnode(descriptor);
2763 	if (vnode != NULL)
2764 		inc_vnode_ref_count(vnode);
2765 
2766 	put_fd(descriptor);
2767 	return vnode;
2768 }
2769 
2770 
2771 /*!	Gets the vnode from an FD + path combination. If \a fd is lower than zero,
2772 	only the path will be considered. In this case, the \a path must not be
2773 	NULL.
2774 	If \a fd is a valid file descriptor, \a path may be NULL for directories,
2775 	and should be NULL for files.
2776 */
2777 static status_t
2778 fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
2779 	struct vnode** _vnode, ino_t* _parentID, bool kernel)
2780 {
2781 	if (fd < 0 && !path)
2782 		return B_BAD_VALUE;
2783 
2784 	if (path != NULL && *path == '\0')
2785 		return B_ENTRY_NOT_FOUND;
2786 
2787 	if (fd < 0 || (path != NULL && path[0] == '/')) {
2788 		// no FD or absolute path
2789 		return path_to_vnode(path, traverseLeafLink, _vnode, _parentID, kernel);
2790 	}
2791 
2792 	// FD only, or FD + relative path
2793 	struct vnode* vnode = get_vnode_from_fd(fd, kernel);
2794 	if (vnode == NULL)
2795 		return B_FILE_ERROR;
2796 
2797 	if (path != NULL) {
2798 		return vnode_path_to_vnode(vnode, path, traverseLeafLink, 0, kernel,
2799 			_vnode, _parentID);
2800 	}
2801 
2802 	// there is no relative path to take into account
2803 
2804 	*_vnode = vnode;
2805 	if (_parentID)
2806 		*_parentID = -1;
2807 
2808 	return B_OK;
2809 }
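

/*	Summary (illustrative) of the FD/path combinations accepted by
	fd_and_path_to_vnode():

		fd < 0,  path "/x/y"  ->  absolute lookup via path_to_vnode()
		fd < 0,  path "x/y"   ->  relative to the current working directory
		fd >= 0, path NULL    ->  the vnode behind the FD itself
		fd >= 0, path "x/y"   ->  relative to the directory behind the FD
		fd >= 0, path "/x/y"  ->  absolute; the FD is ignored
		fd < 0,  path NULL    ->  B_BAD_VALUE
*/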
2810 
2811 
2812 static int
2813 get_new_fd(int type, struct fs_mount* mount, struct vnode* vnode,
2814 	void* cookie, int openMode, bool kernel)
2815 {
2816 	struct file_descriptor* descriptor;
2817 	int fd;
2818 
2819 	// If the vnode is locked, we don't allow creating a new file/directory
2820 	// file_descriptor for it
2821 	if (vnode && vnode->mandatory_locked_by != NULL
2822 		&& (type == FDTYPE_FILE || type == FDTYPE_DIR))
2823 		return B_BUSY;
2824 
2825 	descriptor = alloc_fd();
2826 	if (!descriptor)
2827 		return B_NO_MEMORY;
2828 
2829 	if (vnode)
2830 		descriptor->u.vnode = vnode;
2831 	else
2832 		descriptor->u.mount = mount;
2833 	descriptor->cookie = cookie;
2834 
2835 	switch (type) {
2836 		// vnode types
2837 		case FDTYPE_FILE:
2838 			descriptor->ops = &sFileOps;
2839 			break;
2840 		case FDTYPE_DIR:
2841 			descriptor->ops = &sDirectoryOps;
2842 			break;
2843 		case FDTYPE_ATTR:
2844 			descriptor->ops = &sAttributeOps;
2845 			break;
2846 		case FDTYPE_ATTR_DIR:
2847 			descriptor->ops = &sAttributeDirectoryOps;
2848 			break;
2849 
2850 		// mount types
2851 		case FDTYPE_INDEX_DIR:
2852 			descriptor->ops = &sIndexDirectoryOps;
2853 			break;
2854 		case FDTYPE_QUERY:
2855 			descriptor->ops = &sQueryOps;
2856 			break;
2857 
2858 		default:
2859 			panic("get_new_fd() called with unknown type %d\n", type);
2860 			break;
2861 	}
2862 	descriptor->type = type;
2863 	descriptor->open_mode = openMode;
2864 
2865 	io_context* context = get_current_io_context(kernel);
2866 	fd = new_fd(context, descriptor);
2867 	if (fd < 0) {
2868 		descriptor->ops = NULL;
2869 		put_fd(descriptor);
2870 		return B_NO_MORE_FDS;
2871 	}
2872 
2873 	mutex_lock(&context->io_mutex);
2874 	fd_set_close_on_exec(context, fd, (openMode & O_CLOEXEC) != 0);
2875 	mutex_unlock(&context->io_mutex);
2876 
2877 	return fd;
2878 }
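

/*	Usage sketch (illustrative, simplified from the file open path): once a
	file system hook has produced a cookie for a vnode, get_new_fd() wires it
	into the caller's FD table. If no FD slot is available, the caller has to
	undo the open itself, since no descriptor ever owned the cookie:

		void* cookie;
		status_t status = FS_CALL(vnode, open, openMode, &cookie);
		if (status != B_OK)
			return status;

		int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode,
			kernel);
		if (fd < 0) {
			FS_CALL(vnode, close, cookie);
			FS_CALL(vnode, free_cookie, cookie);
		}
		return fd;
*/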
2879 
2880 
2881 /*!	In-place normalizes \a path. It's otherwise semantically equivalent to
2882 	vfs_normalize_path(). See there for more documentation.
2883 */
2884 static status_t
2885 normalize_path(char* path, size_t pathSize, bool traverseLink, bool kernel)
2886 {
2887 	VNodePutter dirPutter;
2888 	struct vnode* dir = NULL;
2889 	status_t error;
2890 
2891 	for (int i = 0; i < B_MAX_SYMLINKS; i++) {
2892 		// get dir vnode + leaf name
2893 		struct vnode* nextDir;
2894 		char leaf[B_FILE_NAME_LENGTH];
2895 		error = vnode_and_path_to_dir_vnode(dir, path, &nextDir, leaf, kernel);
2896 		if (error != B_OK)
2897 			return error;
2898 
2899 		dir = nextDir;
2900 		strcpy(path, leaf);
2901 		dirPutter.SetTo(dir);
2902 
2903 		// get file vnode, if we shall resolve links
2904 		bool fileExists = false;
2905 		struct vnode* fileVnode;
2906 		VNodePutter fileVnodePutter;
2907 		if (traverseLink) {
2908 			inc_vnode_ref_count(dir);
2909 			if (vnode_path_to_vnode(dir, path, false, 0, kernel, &fileVnode,
2910 					NULL) == B_OK) {
2911 				fileVnodePutter.SetTo(fileVnode);
2912 				fileExists = true;
2913 			}
2914 		}
2915 
2916 		if (!fileExists || !traverseLink || !S_ISLNK(fileVnode->Type())) {
2917 			// we're done -- construct the path
2918 			bool hasLeaf = true;
2919 			if (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0) {
2920 				// special cases "." and ".." -- get the dir, forget the leaf
2921 				inc_vnode_ref_count(dir);
2922 				error = vnode_path_to_vnode(dir, leaf, false, 0, kernel,
2923 					&nextDir, NULL);
2924 				if (error != B_OK)
2925 					return error;
2926 				dir = nextDir;
2927 				dirPutter.SetTo(dir);
2928 				hasLeaf = false;
2929 			}
2930 
2931 			// get the directory path
2932 			error = dir_vnode_to_path(dir, path, B_PATH_NAME_LENGTH, kernel);
2933 			if (error != B_OK)
2934 				return error;
2935 
2936 			// append the leaf name
2937 			if (hasLeaf) {
2938 				// insert a directory separator if this is not the file system
2939 				// root
2940 				if ((strcmp(path, "/") != 0
2941 					&& strlcat(path, "/", pathSize) >= pathSize)
2942 					|| strlcat(path, leaf, pathSize) >= pathSize) {
2943 					return B_NAME_TOO_LONG;
2944 				}
2945 			}
2946 
2947 			return B_OK;
2948 		}
2949 
2950 		// read link
2951 		if (HAS_FS_CALL(fileVnode, read_symlink)) {
2952 			size_t bufferSize = B_PATH_NAME_LENGTH - 1;
2953 			error = FS_CALL(fileVnode, read_symlink, path, &bufferSize);
2954 			if (error != B_OK)
2955 				return error;
2956 			path[bufferSize] = '\0';
2957 		} else
2958 			return B_BAD_VALUE;
2959 	}
2960 
2961 	return B_LINK_LIMIT;
2962 }
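

/*	Example (illustrative): with a current working directory of /boot/home,
	normalize_path() would rewrite

		"Desktop//../mail/"  ->  "/boot/home/mail"

	collapsing duplicate slashes, "." and "..", resolving the directory part
	to an absolute path, and -- with traverseLink == true -- also following a
	symlink in the leaf (up to B_MAX_SYMLINKS times).
*/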
2963 
2964 
2965 static status_t
2966 resolve_covered_parent(struct vnode* parent, dev_t* _device, ino_t* _node,
2967 	struct io_context* ioContext)
2968 {
2969 	// Make sure the IO context root is not bypassed.
2970 	if (parent == ioContext->root) {
2971 		*_device = parent->device;
2972 		*_node = parent->id;
2973 		return B_OK;
2974 	}
2975 
2976 	inc_vnode_ref_count(parent);
2977 		// vnode_path_to_vnode() puts the node
2978 
2979 	// ".." is guaranteed not to be clobbered by this call
2980 	struct vnode* vnode;
2981 	status_t status = vnode_path_to_vnode(parent, (char*)"..", false, 0,
2982 		ioContext, &vnode, NULL);
2983 	if (status == B_OK) {
2984 		*_device = vnode->device;
2985 		*_node = vnode->id;
2986 		put_vnode(vnode);
2987 	}
2988 
2989 	return status;
2990 }
2991 
2992 
2993 #ifdef ADD_DEBUGGER_COMMANDS
2994 
2995 
2996 static void
2997 _dump_advisory_locking(advisory_locking* locking)
2998 {
2999 	if (locking == NULL)
3000 		return;
3001 
3002 	kprintf("   lock:        %" B_PRId32, locking->lock);
3003 	kprintf("   wait_sem:    %" B_PRId32, locking->wait_sem);
3004 
3005 	int32 index = 0;
3006 	LockList::Iterator iterator = locking->locks.GetIterator();
3007 	while (iterator.HasNext()) {
3008 		struct advisory_lock* lock = iterator.Next();
3009 
3010 		kprintf("   [%2" B_PRId32 "] team:   %" B_PRId32 "\n", index++, lock->team);
3011 		kprintf("        start:  %" B_PRIdOFF "\n", lock->start);
3012 		kprintf("        end:    %" B_PRIdOFF "\n", lock->end);
3013 		kprintf("        shared? %s\n", lock->shared ? "yes" : "no");
3014 	}
3015 }
3016 
3017 
3018 static void
3019 _dump_mount(struct fs_mount* mount)
3020 {
3021 	kprintf("MOUNT: %p\n", mount);
3022 	kprintf(" id:            %" B_PRIdDEV "\n", mount->id);
3023 	kprintf(" device_name:   %s\n", mount->device_name);
3024 	kprintf(" root_vnode:    %p\n", mount->root_vnode);
3025 	kprintf(" covers:        %p\n", mount->root_vnode->covers);
3026 	kprintf(" partition:     %p\n", mount->partition);
3027 	kprintf(" lock:          %p\n", &mount->lock);
3028 	kprintf(" flags:        %s%s\n", mount->unmounting ? " unmounting" : "",
3029 		mount->owns_file_device ? " owns_file_device" : "");
3030 
3031 	fs_volume* volume = mount->volume;
3032 	while (volume != NULL) {
3033 		kprintf(" volume %p:\n", volume);
3034 		kprintf("  layer:            %" B_PRId32 "\n", volume->layer);
3035 		kprintf("  private_volume:   %p\n", volume->private_volume);
3036 		kprintf("  ops:              %p\n", volume->ops);
3037 		kprintf("  file_system:      %p\n", volume->file_system);
3038 		kprintf("  file_system_name: %s\n", volume->file_system_name);
3039 		volume = volume->super_volume;
3040 	}
3041 
3042 	set_debug_variable("_volume", (addr_t)mount->volume->private_volume);
3043 	set_debug_variable("_root", (addr_t)mount->root_vnode);
3044 	set_debug_variable("_covers", (addr_t)mount->root_vnode->covers);
3045 	set_debug_variable("_partition", (addr_t)mount->partition);
3046 }
3047 
3048 
3049 static bool
3050 debug_prepend_vnode_name_to_path(char* buffer, size_t& bufferSize,
3051 	const char* name)
3052 {
3053 	bool insertSlash = buffer[bufferSize] != '\0';
3054 	size_t nameLength = strlen(name);
3055 
3056 	if (bufferSize < nameLength + (insertSlash ? 1 : 0))
3057 		return false;
3058 
3059 	if (insertSlash)
3060 		buffer[--bufferSize] = '/';
3061 
3062 	bufferSize -= nameLength;
3063 	memcpy(buffer + bufferSize, name, nameLength);
3064 
3065 	return true;
3066 }
3067 
3068 
3069 static bool
3070 debug_prepend_vnode_id_to_path(char* buffer, size_t& bufferSize, dev_t devID,
3071 	ino_t nodeID)
3072 {
3073 	if (bufferSize == 0)
3074 		return false;
3075 
3076 	bool insertSlash = buffer[bufferSize] != '\0';
3077 	if (insertSlash)
3078 		buffer[--bufferSize] = '/';
3079 
3080 	size_t size = snprintf(buffer, bufferSize,
3081 		"<%" B_PRIdDEV ",%" B_PRIdINO ">", devID, nodeID);
3082 	if (size > bufferSize) {
3083 		if (insertSlash)
3084 			bufferSize++;
3085 		return false;
3086 	}
3087 
3088 	if (size < bufferSize)
3089 		memmove(buffer + bufferSize - size, buffer, size);
3090 
3091 	bufferSize -= size;
3092 	return true;
3093 }
3094 
3095 
3096 static char*
3097 debug_resolve_vnode_path(struct vnode* vnode, char* buffer, size_t bufferSize,
3098 	bool& _truncated)
3099 {
3100 	// null-terminate the path
3101 	buffer[--bufferSize] = '\0';
3102 
3103 	while (true) {
3104 		while (vnode->covers != NULL)
3105 			vnode = vnode->covers;
3106 
3107 		if (vnode == sRoot) {
3108 			_truncated = bufferSize == 0;
3109 			if (!_truncated)
3110 				buffer[--bufferSize] = '/';
3111 			return buffer + bufferSize;
3112 		}
3113 
3114 		// resolve the name
3115 		ino_t dirID;
3116 		const char* name = vnode->mount->entry_cache.DebugReverseLookup(
3117 			vnode->id, dirID);
3118 		if (name == NULL) {
3119 			// Failed to resolve the name -- prepend "<dev,node>/".
3120 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3121 				vnode->mount->id, vnode->id);
3122 			return buffer + bufferSize;
3123 		}
3124 
3125 		// prepend the name
3126 		if (!debug_prepend_vnode_name_to_path(buffer, bufferSize, name)) {
3127 			_truncated = true;
3128 			return buffer + bufferSize;
3129 		}
3130 
3131 		// resolve the directory node
3132 		struct vnode* nextVnode = lookup_vnode(vnode->mount->id, dirID);
3133 		if (nextVnode == NULL) {
3134 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3135 				vnode->mount->id, dirID);
3136 			return buffer + bufferSize;
3137 		}
3138 
3139 		vnode = nextVnode;
3140 	}
3141 }
3142 
3143 
3144 static void
3145 _dump_vnode(struct vnode* vnode, bool printPath)
3146 {
3147 	kprintf("VNODE: %p\n", vnode);
3148 	kprintf(" device:        %" B_PRIdDEV "\n", vnode->device);
3149 	kprintf(" id:            %" B_PRIdINO "\n", vnode->id);
3150 	kprintf(" ref_count:     %" B_PRId32 "\n", vnode->ref_count);
3151 	kprintf(" private_node:  %p\n", vnode->private_node);
3152 	kprintf(" mount:         %p\n", vnode->mount);
3153 	kprintf(" covered_by:    %p\n", vnode->covered_by);
3154 	kprintf(" covers:        %p\n", vnode->covers);
3155 	kprintf(" cache:         %p\n", vnode->cache);
3156 	kprintf(" type:          %#" B_PRIx32 "\n", vnode->Type());
3157 	kprintf(" flags:         %s%s%s\n", vnode->IsRemoved() ? "r" : "-",
3158 		vnode->IsBusy() ? "b" : "-", vnode->IsUnpublished() ? "u" : "-");
3159 	kprintf(" advisory_lock: %p\n", vnode->advisory_locking);
3160 
3161 	_dump_advisory_locking(vnode->advisory_locking);
3162 
3163 	if (printPath) {
3164 		void* buffer = debug_malloc(B_PATH_NAME_LENGTH);
3165 		if (buffer != NULL) {
3166 			bool truncated;
3167 			char* path = debug_resolve_vnode_path(vnode, (char*)buffer,
3168 				B_PATH_NAME_LENGTH, truncated);
3169 			if (path != NULL) {
3170 				kprintf(" path:          ");
3171 				if (truncated)
3172 					kputs("<truncated>/");
3173 				kputs(path);
3174 				kputs("\n");
3175 			} else
3176 				kprintf("Failed to resolve vnode path.\n");
3177 
3178 			debug_free(buffer);
3179 		} else
3180 			kprintf("Failed to allocate memory for constructing the path.\n");
3181 	}
3182 
3183 	set_debug_variable("_node", (addr_t)vnode->private_node);
3184 	set_debug_variable("_mount", (addr_t)vnode->mount);
3185 	set_debug_variable("_covered_by", (addr_t)vnode->covered_by);
3186 	set_debug_variable("_covers", (addr_t)vnode->covers);
3187 	set_debug_variable("_adv_lock", (addr_t)vnode->advisory_locking);
3188 }
3189 
3190 
3191 static int
3192 dump_mount(int argc, char** argv)
3193 {
3194 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3195 		kprintf("usage: %s [id|address]\n", argv[0]);
3196 		return 0;
3197 	}
3198 
3199 	ulong val = parse_expression(argv[1]);
3200 	uint32 id = val;
3201 
3202 	struct fs_mount* mount = sMountsTable->Lookup(id);
3203 	if (mount == NULL) {
3204 		if (IS_USER_ADDRESS(id)) {
3205 			kprintf("fs_mount not found\n");
3206 			return 0;
3207 		}
3208 		mount = (fs_mount*)val;
3209 	}
3210 
3211 	_dump_mount(mount);
3212 	return 0;
3213 }
3214 
3215 
3216 static int
3217 dump_mounts(int argc, char** argv)
3218 {
3219 	if (argc != 1) {
3220 		kprintf("usage: %s\n", argv[0]);
3221 		return 0;
3222 	}
3223 
3224 	kprintf("%-*s    id %-*s   %-*s   %-*s   fs_name\n",
3225 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "root",
3226 		B_PRINTF_POINTER_WIDTH, "covers", B_PRINTF_POINTER_WIDTH, "cookie");
3227 
3228 	struct fs_mount* mount;
3229 
3230 	MountTable::Iterator iterator(sMountsTable);
3231 	while (iterator.HasNext()) {
3232 		mount = iterator.Next();
3233 		kprintf("%p%4" B_PRIdDEV " %p %p %p %s\n", mount, mount->id, mount->root_vnode,
3234 			mount->root_vnode->covers, mount->volume->private_volume,
3235 			mount->volume->file_system_name);
3236 
3237 		fs_volume* volume = mount->volume;
3238 		while (volume->super_volume != NULL) {
3239 			volume = volume->super_volume;
3240 			kprintf("                                     %p %s\n",
3241 				volume->private_volume, volume->file_system_name);
3242 		}
3243 	}
3244 
3245 	return 0;
3246 }
3247 
3248 
3249 static int
3250 dump_vnode(int argc, char** argv)
3251 {
3252 	bool printPath = false;
3253 	int argi = 1;
3254 	if (argc >= 2 && strcmp(argv[argi], "-p") == 0) {
3255 		printPath = true;
3256 		argi++;
3257 	}
3258 
3259 	if (argi >= argc || argi + 2 < argc) {
3260 		print_debugger_command_usage(argv[0]);
3261 		return 0;
3262 	}
3263 
3264 	struct vnode* vnode = NULL;
3265 
3266 	if (argi + 1 == argc) {
3267 		vnode = (struct vnode*)parse_expression(argv[argi]);
3268 		if (IS_USER_ADDRESS(vnode)) {
3269 			kprintf("invalid vnode address\n");
3270 			return 0;
3271 		}
3272 		_dump_vnode(vnode, printPath);
3273 		return 0;
3274 	}
3275 
3276 	dev_t device = parse_expression(argv[argi]);
3277 	ino_t id = parse_expression(argv[argi + 1]);
3278 
3279 	VnodeTable::Iterator iterator(sVnodeTable);
3280 	while (iterator.HasNext()) {
3281 		vnode = iterator.Next();
3282 		if (vnode->id != id || vnode->device != device)
3283 			continue;
3284 
3285 		_dump_vnode(vnode, printPath);
3286 	}
3287 
3288 	return 0;
3289 }
3290 
3291 
3292 static int
3293 dump_vnodes(int argc, char** argv)
3294 {
3295 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3296 		kprintf("usage: %s [device]\n", argv[0]);
3297 		return 0;
3298 	}
3299 
3300 	// restrict dumped nodes to a certain device if requested
3301 	dev_t device = parse_expression(argv[1]);
3302 
3303 	struct vnode* vnode;
3304 
3305 	kprintf("%-*s   dev     inode  ref %-*s   %-*s   %-*s   flags\n",
3306 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache",
3307 		B_PRINTF_POINTER_WIDTH, "fs-node", B_PRINTF_POINTER_WIDTH, "locking");
3308 
3309 	VnodeTable::Iterator iterator(sVnodeTable);
3310 	while (iterator.HasNext()) {
3311 		vnode = iterator.Next();
3312 		if (vnode->device != device)
3313 			continue;
3314 
3315 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO "%5" B_PRId32 " %p %p %p %s%s%s\n",
3316 			vnode, vnode->device, vnode->id, vnode->ref_count, vnode->cache,
3317 			vnode->private_node, vnode->advisory_locking,
3318 			vnode->IsRemoved() ? "r" : "-", vnode->IsBusy() ? "b" : "-",
3319 			vnode->IsUnpublished() ? "u" : "-");
3320 	}
3321 
3322 	return 0;
3323 }
3324 
3325 
3326 static int
3327 dump_vnode_caches(int argc, char** argv)
3328 {
3329 	struct vnode* vnode;
3330 
3331 	if (argc > 2 || (argc == 2 && !strcmp(argv[1], "--help"))) {
3332 		kprintf("usage: %s [device]\n", argv[0]);
3333 		return 0;
3334 	}
3335 
3336 	// restrict dumped nodes to a certain device if requested
3337 	dev_t device = -1;
3338 	if (argc > 1)
3339 		device = parse_expression(argv[1]);
3340 
3341 	kprintf("%-*s   dev     inode %-*s       size   pages\n",
3342 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache");
3343 
3344 	VnodeTable::Iterator iterator(sVnodeTable);
3345 	while (iterator.HasNext()) {
3346 		vnode = iterator.Next();
3347 		if (vnode->cache == NULL)
3348 			continue;
3349 		if (device != -1 && vnode->device != device)
3350 			continue;
3351 
3352 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO " %p %8" B_PRIdOFF "%8" B_PRId32 "\n",
3353 			vnode, vnode->device, vnode->id, vnode->cache,
3354 			(vnode->cache->virtual_end + B_PAGE_SIZE - 1) / B_PAGE_SIZE,
3355 			vnode->cache->page_count);
3356 	}
3357 
3358 	return 0;
3359 }
3360 
3361 
3362 int
3363 dump_io_context(int argc, char** argv)
3364 {
3365 	if (argc > 2 || (argc == 2 && !strcmp(argv[1], "--help"))) {
3366 		kprintf("usage: %s [team-id|address]\n", argv[0]);
3367 		return 0;
3368 	}
3369 
3370 	struct io_context* context = NULL;
3371 
3372 	if (argc > 1) {
3373 		ulong num = parse_expression(argv[1]);
3374 		if (IS_KERNEL_ADDRESS(num))
3375 			context = (struct io_context*)num;
3376 		else {
3377 			Team* team = team_get_team_struct_locked(num);
3378 			if (team == NULL) {
3379 				kprintf("could not find team with ID %lu\n", num);
3380 				return 0;
3381 			}
3382 			context = (struct io_context*)team->io_context;
3383 		}
3384 	} else
3385 		context = get_current_io_context(true);
3386 
3387 	kprintf("I/O CONTEXT: %p\n", context);
3388 	kprintf(" root vnode:\t%p\n", context->root);
3389 	kprintf(" cwd vnode:\t%p\n", context->cwd);
3390 	kprintf(" used fds:\t%" B_PRIu32 "\n", context->num_used_fds);
3391 	kprintf(" max fds:\t%" B_PRIu32 "\n", context->table_size);
3392 
3393 	if (context->num_used_fds) {
3394 		kprintf("   no.  type    %*s  ref  open  mode         pos    %*s\n",
3395 			B_PRINTF_POINTER_WIDTH, "ops", B_PRINTF_POINTER_WIDTH, "cookie");
3396 	}
3397 
3398 	for (uint32 i = 0; i < context->table_size; i++) {
3399 		struct file_descriptor* fd = context->fds[i];
3400 		if (fd == NULL)
3401 			continue;
3402 
3403 		kprintf("  %3" B_PRIu32 ":  %4" B_PRId32 "  %p  %3" B_PRId32 "  %4"
3404 			B_PRIu32 "  %4" B_PRIx32 "  %10" B_PRIdOFF "  %p  %s %p\n", i,
3405 			fd->type, fd->ops, fd->ref_count, fd->open_count, fd->open_mode,
3406 			fd->pos, fd->cookie,
3407 			fd->type >= FDTYPE_INDEX && fd->type <= FDTYPE_QUERY
3408 				? "mount" : "vnode",
3409 			fd->u.vnode);
3410 	}
3411 
3412 	kprintf(" used monitors:\t%" B_PRIu32 "\n", context->num_monitors);
3413 	kprintf(" max monitors:\t%" B_PRIu32 "\n", context->max_monitors);
3414 
3415 	set_debug_variable("_cwd", (addr_t)context->cwd);
3416 
3417 	return 0;
3418 }
3419 
3420 
3421 int
3422 dump_vnode_usage(int argc, char** argv)
3423 {
3424 	if (argc != 1) {
3425 		kprintf("usage: %s\n", argv[0]);
3426 		return 0;
3427 	}
3428 
3429 	kprintf("Unused vnodes: %" B_PRIu32 " (max unused %" B_PRIu32 ")\n",
3430 		sUnusedVnodes, kMaxUnusedVnodes);
3431 
3432 	uint32 count = sVnodeTable->CountElements();
3433 
3434 	kprintf("%" B_PRIu32 " vnodes total (%" B_PRIu32 " in use).\n", count,
3435 		count - sUnusedVnodes);
3436 	return 0;
3437 }
3438 
3439 #endif	// ADD_DEBUGGER_COMMANDS
3440 
3441 
3442 /*!	Clears memory specified by an iovec array.
3443 */
3444 static void
3445 zero_iovecs(const iovec* vecs, size_t vecCount, size_t bytes)
3446 {
3447 	for (size_t i = 0; i < vecCount && bytes > 0; i++) {
3448 		size_t length = std::min(vecs[i].iov_len, bytes);
3449 		memset(vecs[i].iov_base, 0, length);
3450 		bytes -= length;
3451 	}
3452 }
3453 
3454 
3455 /*!	Does the dirty work of combining the file_io_vecs with the iovecs
3456 	and calls the file system hooks to read/write the request to disk.
3457 */
3458 static status_t
3459 common_file_io_vec_pages(struct vnode* vnode, void* cookie,
3460 	const file_io_vec* fileVecs, size_t fileVecCount, const iovec* vecs,
3461 	size_t vecCount, uint32* _vecIndex, size_t* _vecOffset, size_t* _numBytes,
3462 	bool doWrite)
3463 {
3464 	if (fileVecCount == 0) {
3465 		// There are no file vecs at this offset, so we're obviously trying
3466 		// to access the file outside of its bounds
3467 		return B_BAD_VALUE;
3468 	}
3469 
3470 	size_t numBytes = *_numBytes;
3471 	uint32 fileVecIndex;
3472 	size_t vecOffset = *_vecOffset;
3473 	uint32 vecIndex = *_vecIndex;
3474 	status_t status;
3475 	size_t size;
3476 
3477 	if (!doWrite && vecOffset == 0) {
3478 		// now directly read the data from the device
3479 		// the first file_io_vec can be read directly
3480 
3481 		if (fileVecs[0].length < (off_t)numBytes)
3482 			size = fileVecs[0].length;
3483 		else
3484 			size = numBytes;
3485 
3486 		if (fileVecs[0].offset >= 0) {
3487 			status = FS_CALL(vnode, read_pages, cookie, fileVecs[0].offset,
3488 				&vecs[vecIndex], vecCount - vecIndex, &size);
3489 		} else {
3490 			// sparse read
3491 			zero_iovecs(&vecs[vecIndex], vecCount - vecIndex, size);
3492 			status = B_OK;
3493 		}
3494 		if (status != B_OK)
3495 			return status;
3496 
3497 		// TODO: this is a work-around for buggy device drivers!
3498 		//	When our own drivers honour the length, we can:
3499 		//	a) also use this direct I/O for writes (otherwise, it would
3500 		//	   overwrite precious data)
3501 		//	b) panic if the term below is true (at least for writes)
3502 		if ((off_t)size > fileVecs[0].length) {
3503 			//dprintf("warning: device driver %p doesn't respect total length "
3504 			//	"in read_pages() call!\n", ref->device);
3505 			size = fileVecs[0].length;
3506 		}
3507 
3508 		ASSERT((off_t)size <= fileVecs[0].length);
3509 
3510 		// If the file portion was contiguous, we're already done now
3511 		if (size == numBytes)
3512 			return B_OK;
3513 
3514 		// if we reached the end of the file, we can return as well
3515 		if ((off_t)size != fileVecs[0].length) {
3516 			*_numBytes = size;
3517 			return B_OK;
3518 		}
3519 
3520 		fileVecIndex = 1;
3521 
3522 		// first, find out where we have to continue in our iovecs
3523 		for (; vecIndex < vecCount; vecIndex++) {
3524 			if (size < vecs[vecIndex].iov_len)
3525 				break;
3526 
3527 			size -= vecs[vecIndex].iov_len;
3528 		}
3529 
3530 		vecOffset = size;
3531 	} else {
3532 		fileVecIndex = 0;
3533 		size = 0;
3534 	}
3535 
3536 	// Too bad, let's process the rest of the file_io_vecs
3537 
3538 	size_t totalSize = size;
3539 	size_t bytesLeft = numBytes - size;
3540 
3541 	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
3542 		const file_io_vec &fileVec = fileVecs[fileVecIndex];
3543 		off_t fileOffset = fileVec.offset;
3544 		off_t fileLeft = min_c(fileVec.length, (off_t)bytesLeft);
3545 
3546 		TRACE(("FILE VEC [%" B_PRIu32 "] length %" B_PRIdOFF "\n", fileVecIndex,
3547 			fileLeft));
3548 
3549 		// process the complete fileVec
3550 		while (fileLeft > 0) {
3551 			iovec tempVecs[MAX_TEMP_IO_VECS];
3552 			uint32 tempCount = 0;
3553 
3554 			// size tracks how much of what is left of the current fileVec
3555 			// (fileLeft) has been assigned to tempVecs
3556 			size = 0;
3557 
3558 			// assign what is left of the current fileVec to the tempVecs
3559 			for (size = 0; (off_t)size < fileLeft && vecIndex < vecCount
3560 					&& tempCount < MAX_TEMP_IO_VECS;) {
3561 				// try to satisfy one iovec per iteration (or as much as
3562 				// possible)
3563 
3564 				// bytes left of the current iovec
3565 				size_t vecLeft = vecs[vecIndex].iov_len - vecOffset;
3566 				if (vecLeft == 0) {
3567 					vecOffset = 0;
3568 					vecIndex++;
3569 					continue;
3570 				}
3571 
3572 				TRACE(("fill vec %" B_PRIu32 ", offset = %lu, size = %lu\n",
3573 					vecIndex, vecOffset, size));
3574 
3575 				// actually available bytes
3576 				size_t tempVecSize = min_c(vecLeft, fileLeft - size);
3577 
3578 				tempVecs[tempCount].iov_base
3579 					= (void*)((addr_t)vecs[vecIndex].iov_base + vecOffset);
3580 				tempVecs[tempCount].iov_len = tempVecSize;
3581 				tempCount++;
3582 
3583 				size += tempVecSize;
3584 				vecOffset += tempVecSize;
3585 			}
3586 
3587 			size_t bytes = size;
3588 
3589 			if (fileOffset == -1) {
3590 				if (doWrite) {
3591 					panic("sparse write attempt: vnode %p", vnode);
3592 					status = B_IO_ERROR;
3593 				} else {
3594 					// sparse read
3595 					zero_iovecs(tempVecs, tempCount, bytes);
3596 					status = B_OK;
3597 				}
3598 			} else if (doWrite) {
3599 				status = FS_CALL(vnode, write_pages, cookie, fileOffset,
3600 					tempVecs, tempCount, &bytes);
3601 			} else {
3602 				status = FS_CALL(vnode, read_pages, cookie, fileOffset,
3603 					tempVecs, tempCount, &bytes);
3604 			}
3605 			if (status != B_OK)
3606 				return status;
3607 
3608 			totalSize += bytes;
3609 			bytesLeft -= size;
3610 			if (fileOffset >= 0)
3611 				fileOffset += size;
3612 			fileLeft -= size;
3613 			//dprintf("-> file left = %Lu\n", fileLeft);
3614 
3615 			if (size != bytes || vecIndex >= vecCount) {
3616 				// there are no more bytes or iovecs, let's bail out
3617 				*_numBytes = totalSize;
3618 				return B_OK;
3619 			}
3620 		}
3621 	}
3622 
3623 	*_vecIndex = vecIndex;
3624 	*_vecOffset = vecOffset;
3625 	*_numBytes = totalSize;
3626 	return B_OK;
3627 }
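

/*	Worked example (illustrative): reading 12 KiB that is stored on disk in
	two extents into three 4 KiB iovecs:

		fileVecs:  { offset 100 KiB, length 8 KiB },
		           { offset 200 KiB, length 4 KiB }
		vecs:      three iovecs of 4 KiB each

	The first extent is read directly into vecs[0] and vecs[1] by the fast
	path at the top. The main loop then builds tempVecs = { vecs[2] } for the
	second extent and issues one more read_pages() call at offset 200 KiB.
	A fileVec offset of -1 denotes a sparse region, which is zeroed for reads
	and rejected (panic) for writes.
*/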
3628 
3629 
3630 static bool
3631 is_user_in_group(gid_t gid)
3632 {
3633 	if (gid == getegid())
3634 		return true;
3635 
3636 	gid_t groups[NGROUPS_MAX];
3637 	int groupCount = getgroups(NGROUPS_MAX, groups);
3638 	for (int i = 0; i < groupCount; i++) {
3639 		if (gid == groups[i])
3640 			return true;
3641 	}
3642 
3643 	return false;
3644 }
3645 
3646 
3647 static status_t
3648 free_io_context(io_context* context)
3649 {
3650 	uint32 i;
3651 
3652 	TIOC(FreeIOContext(context));
3653 
3654 	if (context->root)
3655 		put_vnode(context->root);
3656 
3657 	if (context->cwd)
3658 		put_vnode(context->cwd);
3659 
3660 	mutex_lock(&context->io_mutex);
3661 
3662 	for (i = 0; i < context->table_size; i++) {
3663 		if (struct file_descriptor* descriptor = context->fds[i]) {
3664 			close_fd(context, descriptor);
3665 			put_fd(descriptor);
3666 		}
3667 	}
3668 
3669 	mutex_destroy(&context->io_mutex);
3670 
3671 	remove_node_monitors(context);
3672 	free(context->fds);
3673 	free(context);
3674 
3675 	return B_OK;
3676 }
3677 
3678 
3679 static status_t
3680 resize_monitor_table(struct io_context* context, const int newSize)
3681 {
3682 	int	status = B_OK;
3683 
3684 	if (newSize <= 0 || newSize > MAX_NODE_MONITORS)
3685 		return B_BAD_VALUE;
3686 
3687 	mutex_lock(&context->io_mutex);
3688 
3689 	if ((size_t)newSize < context->num_monitors) {
3690 		status = B_BUSY;
3691 		goto out;
3692 	}
3693 	context->max_monitors = newSize;
3694 
3695 out:
3696 	mutex_unlock(&context->io_mutex);
3697 	return status;
3698 }
3699 
3700 
3701 //	#pragma mark - public API for file systems
3702 
3703 
3704 extern "C" status_t
3705 new_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3706 	fs_vnode_ops* ops)
3707 {
3708 	FUNCTION(("new_vnode(volume = %p (%" B_PRId32 "), vnodeID = %" B_PRId64
3709 		", node = %p)\n", volume, volume->id, vnodeID, privateNode));
3710 
3711 	if (privateNode == NULL)
3712 		return B_BAD_VALUE;
3713 
3714 	int32 tries = BUSY_VNODE_RETRIES;
3715 restart:
3716 	// create the node
3717 	bool nodeCreated;
3718 	struct vnode* vnode;
3719 	status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3720 		nodeCreated);
3721 	if (status != B_OK)
3722 		return status;
3723 
3724 	WriteLocker nodeLocker(sVnodeLock, true);
3725 		// create_new_vnode_and_lock() has locked for us
3726 
3727 	if (!nodeCreated && vnode->IsBusy()) {
3728 		nodeLocker.Unlock();
3729 		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3730 			return B_BUSY;
3731 		goto restart;
3732 	}
3733 
3734 	// file system integrity check:
3735 	// test if the vnode already exists and bail out if this is the case!
3736 	if (!nodeCreated) {
3737 		panic("vnode %" B_PRIdDEV ":%" B_PRIdINO " already exists (node = %p, "
3738 			"vnode->node = %p)!", volume->id, vnodeID, privateNode,
3739 			vnode->private_node);
3740 		return B_ERROR;
3741 	}
3742 
3743 	vnode->private_node = privateNode;
3744 	vnode->ops = ops;
3745 	vnode->SetUnpublished(true);
3746 
3747 	TRACE(("returns: %s\n", strerror(status)));
3748 
3749 	return status;
3750 }
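
/* Below: a hedged sketch (not part of this file) of the usual
   create-and-publish sequence as seen from a file system; "MyNode" and
   "gMyVnodeOps" are assumed names:

	MyNode* node = new(std::nothrow) MyNode(...);
	status_t status = new_vnode(volume, node->ID(), node, &gMyVnodeOps);
	if (status == B_OK) {
		// the vnode stays busy and unpublished until publish_vnode()
		status = publish_vnode(volume, node->ID(), node, &gMyVnodeOps,
			S_IFREG, 0);
	}
*/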
3751 
3752 
3753 extern "C" status_t
3754 publish_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3755 	fs_vnode_ops* ops, int type, uint32 flags)
3756 {
3757 	FUNCTION(("publish_vnode()\n"));
3758 
3759 	int32 tries = BUSY_VNODE_RETRIES;
3760 restart:
3761 	WriteLocker locker(sVnodeLock);
3762 
3763 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3764 
3765 	bool nodeCreated = false;
3766 	if (vnode == NULL) {
3767 		if (privateNode == NULL)
3768 			return B_BAD_VALUE;
3769 
3770 		// create the node
3771 		locker.Unlock();
3772 			// create_new_vnode_and_lock() will re-lock for us on success
3773 		status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3774 			nodeCreated);
3775 		if (status != B_OK)
3776 			return status;
3777 
3778 		locker.SetTo(sVnodeLock, true);
3779 	}
3780 
3781 	if (nodeCreated) {
3782 		vnode->private_node = privateNode;
3783 		vnode->ops = ops;
3784 		vnode->SetUnpublished(true);
3785 	} else if (vnode->IsBusy() && vnode->IsUnpublished()
3786 		&& vnode->private_node == privateNode && vnode->ops == ops) {
3787 		// already known, but not published
3788 	} else if (vnode->IsBusy()) {
3789 		locker.Unlock();
3790 		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3791 			return B_BUSY;
3792 		goto restart;
3793 	} else
3794 		return B_BAD_VALUE;
3795 
3796 	bool publishSpecialSubNode = false;
3797 
3798 	vnode->SetType(type);
3799 	vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
3800 	publishSpecialSubNode = is_special_node_type(type)
3801 		&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
3802 
3803 	status_t status = B_OK;
3804 
3805 	// create sub vnodes, if necessary
3806 	if (volume->sub_volume != NULL || publishSpecialSubNode) {
3807 		locker.Unlock();
3808 
3809 		fs_volume* subVolume = volume;
3810 		if (volume->sub_volume != NULL) {
3811 			while (status == B_OK && subVolume->sub_volume != NULL) {
3812 				subVolume = subVolume->sub_volume;
3813 				status = subVolume->ops->create_sub_vnode(subVolume, vnodeID,
3814 					vnode);
3815 			}
3816 		}
3817 
3818 		if (status == B_OK && publishSpecialSubNode)
3819 			status = create_special_sub_node(vnode, flags);
3820 
3821 		if (status != B_OK) {
3822 			// error -- clean up the created sub vnodes
3823 			while (subVolume->super_volume != volume) {
3824 				subVolume = subVolume->super_volume;
3825 				subVolume->ops->delete_sub_vnode(subVolume, vnode);
3826 			}
3827 		}
3828 
3829 		if (status == B_OK) {
3830 			ReadLocker vnodesReadLocker(sVnodeLock);
3831 			AutoLocker<Vnode> nodeLocker(vnode);
3832 			vnode->SetBusy(false);
3833 			vnode->SetUnpublished(false);
3834 		} else {
3835 			locker.Lock();
3836 			sVnodeTable->Remove(vnode);
3837 			remove_vnode_from_mount_list(vnode, vnode->mount);
3838 			free(vnode);
3839 		}
3840 	} else {
3841 		// we still hold the write lock -- mark the node unbusy and published
3842 		vnode->SetBusy(false);
3843 		vnode->SetUnpublished(false);
3844 	}
3845 
3846 	TRACE(("returns: %s\n", strerror(status)));
3847 
3848 	return status;
3849 }
3850 
3851 
3852 extern "C" status_t
3853 get_vnode(fs_volume* volume, ino_t vnodeID, void** _privateNode)
3854 {
3855 	struct vnode* vnode;
3856 
3857 	if (volume == NULL)
3858 		return B_BAD_VALUE;
3859 
3860 	status_t status = get_vnode(volume->id, vnodeID, &vnode, true, true);
3861 	if (status != B_OK)
3862 		return status;
3863 
3864 	// If this is a layered FS, we need to get the node cookie for the requested
3865 	// layer.
3866 	if (HAS_FS_CALL(vnode, get_super_vnode)) {
3867 		fs_vnode resolvedNode;
3868 		status_t status = FS_CALL(vnode, get_super_vnode, volume,
3869 			&resolvedNode);
3870 		if (status != B_OK) {
3871 			panic("get_vnode(): Failed to get super node for vnode %p, "
3872 				"volume: %p", vnode, volume);
3873 			put_vnode(vnode);
3874 			return status;
3875 		}
3876 
3877 		if (_privateNode != NULL)
3878 			*_privateNode = resolvedNode.private_node;
3879 	} else if (_privateNode != NULL)
3880 		*_privateNode = vnode->private_node;
3881 
3882 	return B_OK;
3883 }
3884 
3885 
3886 extern "C" status_t
3887 acquire_vnode(fs_volume* volume, ino_t vnodeID)
3888 {
3889 	struct vnode* vnode;
3890 
3891 	rw_lock_read_lock(&sVnodeLock);
3892 	vnode = lookup_vnode(volume->id, vnodeID);
3893 	rw_lock_read_unlock(&sVnodeLock);
3894 
3895 	if (vnode == NULL)
3896 		return B_BAD_VALUE;
3897 
3898 	inc_vnode_ref_count(vnode);
3899 	return B_OK;
3900 }
3901 
3902 
3903 extern "C" status_t
3904 put_vnode(fs_volume* volume, ino_t vnodeID)
3905 {
3906 	struct vnode* vnode;
3907 
3908 	rw_lock_read_lock(&sVnodeLock);
3909 	vnode = lookup_vnode(volume->id, vnodeID);
3910 	rw_lock_read_unlock(&sVnodeLock);
3911 
3912 	if (vnode == NULL)
3913 		return B_BAD_VALUE;
3914 
3915 	dec_vnode_ref_count(vnode, false, true);
3916 	return B_OK;
3917 }
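
/* Sketch of the usual reference pairing from a file system's point of
   view ("childID" is a placeholder):

	void* privateNode;
	if (get_vnode(volume, childID, &privateNode) == B_OK) {
		// ... work with privateNode ...
		put_vnode(volume, childID);
	}
*/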
3918 
3919 
3920 extern "C" status_t
3921 remove_vnode(fs_volume* volume, ino_t vnodeID)
3922 {
3923 	ReadLocker locker(sVnodeLock);
3924 
3925 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3926 	if (vnode == NULL)
3927 		return B_ENTRY_NOT_FOUND;
3928 
3929 	if (vnode->covered_by != NULL || vnode->covers != NULL) {
3930 		// this vnode is in use
3931 		return B_BUSY;
3932 	}
3933 
3934 	vnode->Lock();
3935 
3936 	vnode->SetRemoved(true);
3937 	bool removeUnpublished = false;
3938 
3939 	if (vnode->IsUnpublished()) {
3940 		// prepare the vnode for deletion
3941 		removeUnpublished = true;
3942 		vnode->SetBusy(true);
3943 	}
3944 
3945 	vnode->Unlock();
3946 	locker.Unlock();
3947 
3948 	if (removeUnpublished) {
3949 		// If the vnode hasn't been published yet, we delete it here
3950 		atomic_add(&vnode->ref_count, -1);
3951 		free_vnode(vnode, true);
3952 	}
3953 
3954 	return B_OK;
3955 }
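
/* Hedged sketch of a typical caller: a file system's unlink hook detaches
   the directory entry first and then marks the node for deletion (the hook
   name and the helper are assumptions, not part of this file):

	static status_t
	my_fs_unlink(fs_volume* volume, fs_vnode* dir, const char* name)
	{
		ino_t id;
		status_t status = my_fs_remove_entry(volume, dir, name, &id);
		if (status == B_OK)
			status = remove_vnode(volume, id);
		return status;
	}
*/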
3956 
3957 
3958 extern "C" status_t
3959 unremove_vnode(fs_volume* volume, ino_t vnodeID)
3960 {
3961 	struct vnode* vnode;
3962 
3963 	rw_lock_read_lock(&sVnodeLock);
3964 
3965 	vnode = lookup_vnode(volume->id, vnodeID);
3966 	if (vnode) {
3967 		AutoLocker<Vnode> nodeLocker(vnode);
3968 		vnode->SetRemoved(false);
3969 	}
3970 
3971 	rw_lock_read_unlock(&sVnodeLock);
3972 	return B_OK;
3973 }
3974 
3975 
3976 extern "C" status_t
3977 get_vnode_removed(fs_volume* volume, ino_t vnodeID, bool* _removed)
3978 {
3979 	ReadLocker _(sVnodeLock);
3980 
3981 	if (struct vnode* vnode = lookup_vnode(volume->id, vnodeID)) {
3982 		if (_removed != NULL)
3983 			*_removed = vnode->IsRemoved();
3984 		return B_OK;
3985 	}
3986 
3987 	return B_BAD_VALUE;
3988 }
3989 
3990 
3991 extern "C" status_t
3992 mark_vnode_busy(fs_volume* volume, ino_t vnodeID, bool busy)
3993 {
3994 	ReadLocker locker(sVnodeLock);
3995 
3996 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3997 	if (vnode == NULL)
3998 		return B_ENTRY_NOT_FOUND;
3999 
4000 	// are we trying to mark an already busy node busy again?
4001 	if (busy && vnode->IsBusy())
4002 		return B_BUSY;
4003 
4004 	vnode->Lock();
4005 	vnode->SetBusy(busy);
4006 	vnode->Unlock();
4007 
4008 	return B_OK;
4009 }
4010 
4011 
4012 extern "C" status_t
4013 change_vnode_id(fs_volume* volume, ino_t vnodeID, ino_t newID)
4014 {
4015 	WriteLocker locker(sVnodeLock);
4016 
4017 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
4018 	if (vnode == NULL)
4019 		return B_ENTRY_NOT_FOUND;
4020 
4021 	sVnodeTable->Remove(vnode);
4022 	vnode->id = newID;
4023 	sVnodeTable->Insert(vnode);
4024 
4025 	if (vnode->cache != NULL && vnode->cache->type == CACHE_TYPE_VNODE)
4026 		((VMVnodeCache*)vnode->cache)->SetVnodeID(newID);
4027 
4028 	return B_OK;
4029 }
4030 
4031 
4032 extern "C" fs_volume*
4033 volume_for_vnode(fs_vnode* _vnode)
4034 {
4035 	if (_vnode == NULL)
4036 		return NULL;
4037 
4038 	struct vnode* vnode = static_cast<struct vnode*>(_vnode);
4039 	return vnode->mount->volume;
4040 }
4041 
4042 
4043 extern "C" status_t
4044 check_access_permissions(int accessMode, mode_t mode, gid_t nodeGroupID,
4045 	uid_t nodeUserID)
4046 {
4047 	// get node permissions
4048 	int userPermissions = (mode & S_IRWXU) >> 6;
4049 	int groupPermissions = (mode & S_IRWXG) >> 3;
4050 	int otherPermissions = mode & S_IRWXO;
4051 
4052 	// get the node permissions for this uid/gid
4053 	int permissions = 0;
4054 	uid_t uid = geteuid();
4055 
4056 	if (uid == 0) {
4057 		// user is root
4058 		// root always has read/write permission, but at least one of the
4059 		// X bits must be set for execute permission
4060 		permissions = userPermissions | groupPermissions | otherPermissions
4061 			| S_IROTH | S_IWOTH;
4062 		if (S_ISDIR(mode))
4063 			permissions |= S_IXOTH;
4064 	} else if (uid == nodeUserID) {
4065 		// user is node owner
4066 		permissions = userPermissions;
4067 	} else if (is_user_in_group(nodeGroupID)) {
4068 		// user is in owning group
4069 		permissions = groupPermissions;
4070 	} else {
4071 		// user is one of the others
4072 		permissions = otherPermissions;
4073 	}
4074 
4075 	return (accessMode & ~permissions) == 0 ? B_OK : B_PERMISSION_DENIED;
4076 }
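
/* Minimal usage sketch, assuming a stat already read from the node
   (R_OK/W_OK/X_OK correspond to the rwx permission bits tested above):

	struct stat st;
	if (vfs_stat_vnode(vnode, &st) == B_OK) {
		status_t status = check_access_permissions(R_OK | W_OK,
			st.st_mode, st.st_gid, st.st_uid);
		// B_OK if the effective uid/gid may read and write the node
	}
*/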
4077 
4078 
4079 #if 0
4080 extern "C" status_t
4081 read_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4082 	size_t* _numBytes)
4083 {
4084 	struct file_descriptor* descriptor;
4085 	struct vnode* vnode;
4086 
4087 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4088 	if (descriptor == NULL)
4089 		return B_FILE_ERROR;
4090 
4091 	status_t status = vfs_read_pages(vnode, descriptor->cookie, pos, vecs,
4092 		count, 0, _numBytes);
4093 
4094 	put_fd(descriptor);
4095 	return status;
4096 }
4097 
4098 
4099 extern "C" status_t
4100 write_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4101 	size_t* _numBytes)
4102 {
4103 	struct file_descriptor* descriptor;
4104 	struct vnode* vnode;
4105 
4106 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4107 	if (descriptor == NULL)
4108 		return B_FILE_ERROR;
4109 
4110 	status_t status = vfs_write_pages(vnode, descriptor->cookie, pos, vecs,
4111 		count, 0, _numBytes);
4112 
4113 	put_fd(descriptor);
4114 	return status;
4115 }
4116 #endif
4117 
4118 
4119 extern "C" status_t
4120 read_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4121 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4122 	size_t* _bytes)
4123 {
4124 	struct file_descriptor* descriptor;
4125 	struct vnode* vnode;
4126 
4127 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4128 	if (descriptor == NULL)
4129 		return B_FILE_ERROR;
4130 
4131 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4132 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4133 		false);
4134 
4135 	put_fd(descriptor);
4136 	return status;
4137 }
4138 
4139 
4140 extern "C" status_t
4141 write_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4142 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4143 	size_t* _bytes)
4144 {
4145 	struct file_descriptor* descriptor;
4146 	struct vnode* vnode;
4147 
4148 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4149 	if (descriptor == NULL)
4150 		return B_FILE_ERROR;
4151 
4152 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4153 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4154 		true);
4155 
4156 	put_fd(descriptor);
4157 	return status;
4158 }
4159 
4160 
4161 extern "C" status_t
4162 entry_cache_add(dev_t mountID, ino_t dirID, const char* name, ino_t nodeID)
4163 {
4164 	// lookup mount -- the caller is required to make sure that the mount
4165 	// won't go away
4166 	MutexLocker locker(sMountMutex);
4167 	struct fs_mount* mount = find_mount(mountID);
4168 	if (mount == NULL)
4169 		return B_BAD_VALUE;
4170 	locker.Unlock();
4171 
4172 	return mount->entry_cache.Add(dirID, name, nodeID, false);
4173 }
4174 
4175 
4176 extern "C" status_t
4177 entry_cache_add_missing(dev_t mountID, ino_t dirID, const char* name)
4178 {
4179 	// lookup mount -- the caller is required to make sure that the mount
4180 	// won't go away
4181 	MutexLocker locker(sMountMutex);
4182 	struct fs_mount* mount = find_mount(mountID);
4183 	if (mount == NULL)
4184 		return B_BAD_VALUE;
4185 	locker.Unlock();
4186 
4187 	return mount->entry_cache.Add(dirID, name, -1, true);
4188 }
4189 
4190 
4191 extern "C" status_t
4192 entry_cache_remove(dev_t mountID, ino_t dirID, const char* name)
4193 {
4194 	// lookup mount -- the caller is required to make sure that the mount
4195 	// won't go away
4196 	MutexLocker locker(sMountMutex);
4197 	struct fs_mount* mount = find_mount(mountID);
4198 	if (mount == NULL)
4199 		return B_BAD_VALUE;
4200 	locker.Unlock();
4201 
4202 	return mount->entry_cache.Remove(dirID, name);
4203 }
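
/* Hedged sketch: a file system keeps the entry cache coherent from its
   directory-modifying hooks (the surrounding hook bodies are assumptions):

	// after creating entry "name" with node ID nodeID in directory dirID:
	entry_cache_add(volume->id, dirID, name, nodeID);

	// after removing an entry:
	entry_cache_remove(volume->id, dirID, name);
*/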
4204 
4205 
4206 //	#pragma mark - private VFS API
4207 //	Functions the VFS exports for other parts of the kernel
4208 
4209 
4210 /*! Acquires another reference to the vnode that has to be released
4211 	by calling vfs_put_vnode().
4212 */
4213 void
4214 vfs_acquire_vnode(struct vnode* vnode)
4215 {
4216 	inc_vnode_ref_count(vnode);
4217 }
4218 
4219 
4220 /*! This is currently called from file_cache_create() only.
4221 	It's probably a temporary solution as long as devfs requires that
4222 	fs_read_pages()/fs_write_pages() are called with the standard
4223 	open cookie and not with a device cookie.
4224 	If that's done differently, remove this call; it has no other
4225 	purpose.
4226 */
4227 extern "C" status_t
4228 vfs_get_cookie_from_fd(int fd, void** _cookie)
4229 {
4230 	struct file_descriptor* descriptor;
4231 
4232 	descriptor = get_fd(get_current_io_context(true), fd);
4233 	if (descriptor == NULL)
4234 		return B_FILE_ERROR;
4235 
4236 	*_cookie = descriptor->cookie;
4237 	return B_OK;
4238 }
4239 
4240 
4241 extern "C" status_t
4242 vfs_get_vnode_from_fd(int fd, bool kernel, struct vnode** vnode)
4243 {
4244 	*vnode = get_vnode_from_fd(fd, kernel);
4245 
4246 	if (*vnode == NULL)
4247 		return B_FILE_ERROR;
4248 
4249 	return B_NO_ERROR;
4250 }
4251 
4252 
4253 extern "C" status_t
4254 vfs_get_vnode_from_path(const char* path, bool kernel, struct vnode** _vnode)
4255 {
4256 	TRACE(("vfs_get_vnode_from_path: entry. path = '%s', kernel %d\n",
4257 		path, kernel));
4258 
4259 	KPath pathBuffer;
4260 	if (pathBuffer.InitCheck() != B_OK)
4261 		return B_NO_MEMORY;
4262 
4263 	char* buffer = pathBuffer.LockBuffer();
4264 	strlcpy(buffer, path, pathBuffer.BufferSize());
4265 
4266 	struct vnode* vnode;
4267 	status_t status = path_to_vnode(buffer, true, &vnode, NULL, kernel);
4268 	if (status != B_OK)
4269 		return status;
4270 
4271 	*_vnode = vnode;
4272 	return B_OK;
4273 }
4274 
4275 
4276 extern "C" status_t
4277 vfs_get_vnode(dev_t mountID, ino_t vnodeID, bool canWait, struct vnode** _vnode)
4278 {
4279 	struct vnode* vnode = NULL;
4280 
4281 	status_t status = get_vnode(mountID, vnodeID, &vnode, canWait, false);
4282 	if (status != B_OK)
4283 		return status;
4284 
4285 	*_vnode = vnode;
4286 	return B_OK;
4287 }
4288 
4289 
4290 extern "C" status_t
4291 vfs_entry_ref_to_vnode(dev_t mountID, ino_t directoryID,
4292 	const char* name, struct vnode** _vnode)
4293 {
4294 	return entry_ref_to_vnode(mountID, directoryID, name, false, true, _vnode);
4295 }
4296 
4297 
4298 extern "C" void
4299 vfs_vnode_to_node_ref(struct vnode* vnode, dev_t* _mountID, ino_t* _vnodeID)
4300 {
4301 	*_mountID = vnode->device;
4302 	*_vnodeID = vnode->id;
4303 }
4304 
4305 
4306 /*!
4307 	Helper function abstracting the process of "converting" a given
4308 	vnode-pointer to a fs_vnode-pointer.
4309 	Currently only used in bindfs.
4310 */
4311 extern "C" fs_vnode*
4312 vfs_fsnode_for_vnode(struct vnode* vnode)
4313 {
4314 	return vnode;
4315 }
4316 
4317 
4318 /*!
4319 	Calls fs_open() on the given vnode and returns a new
4320 	file descriptor for it
4321 */
4322 int
4323 vfs_open_vnode(struct vnode* vnode, int openMode, bool kernel)
4324 {
4325 	return open_vnode(vnode, openMode, kernel);
4326 }
4327 
4328 
4329 /*!	Looks up a vnode with the given mount and vnode ID.
4330 	Must only be used with "in-use" vnodes as it doesn't grab a reference
4331 	to the node.
4332 	It's currently only used by file_cache_create().
4333 */
4334 extern "C" status_t
4335 vfs_lookup_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode)
4336 {
4337 	rw_lock_read_lock(&sVnodeLock);
4338 	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
4339 	rw_lock_read_unlock(&sVnodeLock);
4340 
4341 	if (vnode == NULL)
4342 		return B_ERROR;
4343 
4344 	*_vnode = vnode;
4345 	return B_OK;
4346 }
4347 
4348 
4349 extern "C" status_t
4350 vfs_get_fs_node_from_path(fs_volume* volume, const char* path,
4351 	bool traverseLeafLink, bool kernel, void** _node)
4352 {
4353 	TRACE(("vfs_get_fs_node_from_path(volume = %p, path = \"%s\", kernel %d)\n",
4354 		volume, path, kernel));
4355 
4356 	KPath pathBuffer;
4357 	if (pathBuffer.InitCheck() != B_OK)
4358 		return B_NO_MEMORY;
4359 
4360 	fs_mount* mount;
4361 	status_t status = get_mount(volume->id, &mount);
4362 	if (status != B_OK)
4363 		return status;
4364 
4365 	char* buffer = pathBuffer.LockBuffer();
4366 	strlcpy(buffer, path, pathBuffer.BufferSize());
4367 
4368 	struct vnode* vnode = mount->root_vnode;
4369 
4370 	if (buffer[0] == '/')
4371 		status = path_to_vnode(buffer, traverseLeafLink, &vnode, NULL, kernel);
4372 	else {
4373 		inc_vnode_ref_count(vnode);
4374 			// vnode_path_to_vnode() releases a reference to the starting vnode
4375 		status = vnode_path_to_vnode(vnode, buffer, traverseLeafLink, 0,
4376 			kernel, &vnode, NULL);
4377 	}
4378 
4379 	put_mount(mount);
4380 
4381 	if (status != B_OK)
4382 		return status;
4383 
4384 	if (vnode->device != volume->id) {
4385 		// wrong mount ID - must not gain access on foreign file system nodes
4386 		put_vnode(vnode);
4387 		return B_BAD_VALUE;
4388 	}
4389 
4390 	// Use get_vnode() to resolve the cookie for the right layer.
4391 	status = get_vnode(volume, vnode->id, _node);
4392 	put_vnode(vnode);
4393 
4394 	return status;
4395 }
4396 
4397 
4398 status_t
4399 vfs_read_stat(int fd, const char* path, bool traverseLeafLink,
4400 	struct stat* stat, bool kernel)
4401 {
4402 	status_t status;
4403 
4404 	if (path != NULL) {
4405 		// path given: get the stat of the node referred to by (fd, path)
4406 		KPath pathBuffer(path);
4407 		if (pathBuffer.InitCheck() != B_OK)
4408 			return B_NO_MEMORY;
4409 
4410 		status = common_path_read_stat(fd, pathBuffer.LockBuffer(),
4411 			traverseLeafLink, stat, kernel);
4412 	} else {
4413 		// no path given: get the FD and use the FD operation
4414 		struct file_descriptor* descriptor
4415 			= get_fd(get_current_io_context(kernel), fd);
4416 		if (descriptor == NULL)
4417 			return B_FILE_ERROR;
4418 
4419 		if (descriptor->ops->fd_read_stat)
4420 			status = descriptor->ops->fd_read_stat(descriptor, stat);
4421 		else
4422 			status = B_UNSUPPORTED;
4423 
4424 		put_fd(descriptor);
4425 	}
4426 
4427 	return status;
4428 }
4429 
4430 
4431 /*!	Finds the full path to the file that contains the module \a moduleName,
4432 	puts it into \a pathBuffer, and returns B_OK for success.
4433 	If \a pathBuffer was too small, it returns \c B_BUFFER_OVERFLOW,
4434 	or \c B_ENTRY_NOT_FOUND if no file could be found.
4435 	\a pathBuffer is clobbered in any case and must not be relied on if this
4436 	function returns unsuccessfully.
4437 	\a basePath and \a pathBuffer must not point to the same space.
4438 */
4439 status_t
4440 vfs_get_module_path(const char* basePath, const char* moduleName,
4441 	char* pathBuffer, size_t bufferSize)
4442 {
4443 	struct vnode* dir;
4444 	struct vnode* file;
4445 	status_t status;
4446 	size_t length;
4447 	char* path;
4448 
4449 	if (bufferSize == 0
4450 		|| strlcpy(pathBuffer, basePath, bufferSize) >= bufferSize)
4451 		return B_BUFFER_OVERFLOW;
4452 
4453 	status = path_to_vnode(pathBuffer, true, &dir, NULL, true);
4454 	if (status != B_OK)
4455 		return status;
4456 
4457 	// the path buffer had been clobbered by the above call
4458 	length = strlcpy(pathBuffer, basePath, bufferSize);
4459 	if (pathBuffer[length - 1] != '/')
4460 		pathBuffer[length++] = '/';
4461 
4462 	path = pathBuffer + length;
4463 	bufferSize -= length;
4464 
4465 	while (moduleName) {
4466 		char* nextPath = strchr(moduleName, '/');
4467 		if (nextPath == NULL)
4468 			length = strlen(moduleName);
4469 		else {
4470 			length = nextPath - moduleName;
4471 			nextPath++;
4472 		}
4473 
4474 		if (length + 1 >= bufferSize) {
4475 			status = B_BUFFER_OVERFLOW;
4476 			goto err;
4477 		}
4478 
4479 		memcpy(path, moduleName, length);
4480 		path[length] = '\0';
4481 		moduleName = nextPath;
4482 
4483 		status = vnode_path_to_vnode(dir, path, true, 0, true, &file, NULL);
4484 		if (status != B_OK) {
4485 			// vnode_path_to_vnode() has already released the reference to dir
4486 			return status;
4487 		}
4488 
4489 		if (S_ISDIR(file->Type())) {
4490 			// go to the next directory
4491 			path[length] = '/';
4492 			path[length + 1] = '\0';
4493 			path += length + 1;
4494 			bufferSize -= length + 1;
4495 
4496 			dir = file;
4497 		} else if (S_ISREG(file->Type())) {
4498 			// it's a file so it should be what we've searched for
4499 			put_vnode(file);
4500 
4501 			return B_OK;
4502 		} else {
4503 			TRACE(("vfs_get_module_path(): something is strange here: "
4504 				"0x%08" B_PRIx32 "...\n", file->Type()));
4505 			status = B_ERROR;
4506 			dir = file;
4507 			goto err;
4508 		}
4509 	}
4510 
4511 	// if we got here, the moduleName just pointed to a directory, not to
4512 	// a real module - what should we do in this case?
4513 	status = B_ENTRY_NOT_FOUND;
4514 
4515 err:
4516 	put_vnode(dir);
4517 	return status;
4518 }
4519 
4520 
4521 /*!	\brief Normalizes a given path.
4522 
4523 	The path must refer to an existing or non-existing entry in an existing
4524 	directory, that is, after chopping off the leaf component, the remaining
4525 	path must refer to an existing directory.
4526 
4527 	The returned path will be canonical in that it will be absolute, will
4528 	not contain any "." or ".." components or duplicate occurrences of
4529 	'/'s, and none of the directory components will be symbolic links.
4530 
4531 	Any two paths referring to the same entry will result in the same
4532 	normalized path (well, that is pretty much the definition of `normalized',
4533 	isn't it :-).
4534 
4535 	\param path The path to be normalized.
4536 	\param buffer The buffer into which the normalized path will be written.
4537 		   May be the same one as \a path.
4538 	\param bufferSize The size of \a buffer.
4539 	\param traverseLink If \c true, the function also resolves leaf symlinks.
4540 	\param kernel \c true, if the IO context of the kernel shall be used,
4541 		   otherwise that of the team this thread belongs to. Only relevant,
4542 		   if the path is relative (to get the CWD).
4543 	\return \c B_OK if everything went fine, another error code otherwise.
4544 */
4545 status_t
4546 vfs_normalize_path(const char* path, char* buffer, size_t bufferSize,
4547 	bool traverseLink, bool kernel)
4548 {
4549 	if (!path || !buffer || bufferSize < 1)
4550 		return B_BAD_VALUE;
4551 
4552 	if (path != buffer) {
4553 		if (strlcpy(buffer, path, bufferSize) >= bufferSize)
4554 			return B_BUFFER_OVERFLOW;
4555 	}
4556 
4557 	return normalize_path(buffer, bufferSize, traverseLink, kernel);
4558 }
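
/* A minimal usage sketch (kernel IO context; the example path is made up):

	char normalized[B_PATH_NAME_LENGTH];
	status_t status = vfs_normalize_path("/boot/home//config/../Desktop",
		normalized, sizeof(normalized), true, true);
	// on success, "normalized" holds the canonical absolute path
*/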
4559 
4560 
4561 /*!	\brief Gets the parent of the passed in node.
4562 
4563 	Gets the parent of the passed in node, and correctly resolves covered
4564 	nodes.
4565 */
4566 extern "C" status_t
4567 vfs_resolve_parent(struct vnode* parent, dev_t* device, ino_t* node)
4568 {
4569 	return resolve_covered_parent(parent, device, node,
4570 		get_current_io_context(true));
4571 }
4572 
4573 
4574 /*!	\brief Creates a special node in the file system.
4575 
4576 	The caller gets a reference to the newly created node (which is passed
4577 	back through \a _createdVnode) and is responsible for releasing it.
4578 
4579 	\param path The path where to create the entry for the node. Can be \c NULL,
4580 		in which case the node is created without an entry in the root FS -- it
4581 		will automatically be deleted when the last reference has been released.
4582 	\param subVnode The definition of the subnode. Can be \c NULL, in which case
4583 		the target file system will just create the node with its standard
4584 		operations. Depending on the type of the node a subnode might be created
4585 		automatically, though.
4586 	\param mode The type and permissions for the node to be created.
4587 	\param flags Flags to be passed to the creating FS.
4588 	\param kernel \c true, if called in the kernel context (relevant only if
4589 		\a path is not \c NULL and not absolute).
4590 	\param _superVnode Pointer to a pre-allocated structure to be filled by the
4591 		file system creating the node, with the private data pointer and
4592 		operations for the super node. Can be \c NULL.
4593 	\param _createdVnode Pointer to pre-allocated storage where to store the
4594 		pointer to the newly created node.
4595 	\return \c B_OK, if everything went fine, another error code otherwise.
4596 */
4597 status_t
4598 vfs_create_special_node(const char* path, fs_vnode* subVnode, mode_t mode,
4599 	uint32 flags, bool kernel, fs_vnode* _superVnode,
4600 	struct vnode** _createdVnode)
4601 {
4602 	struct vnode* dirNode;
4603 	char _leaf[B_FILE_NAME_LENGTH];
4604 	char* leaf = NULL;
4605 
4606 	if (path) {
4607 		// We've got a path. Get the dir vnode and the leaf name.
4608 		KPath tmpPathBuffer;
4609 		if (tmpPathBuffer.InitCheck() != B_OK)
4610 			return B_NO_MEMORY;
4611 
4612 		char* tmpPath = tmpPathBuffer.LockBuffer();
4613 		if (strlcpy(tmpPath, path, B_PATH_NAME_LENGTH) >= B_PATH_NAME_LENGTH)
4614 			return B_NAME_TOO_LONG;
4615 
4616 		// get the dir vnode and the leaf name
4617 		leaf = _leaf;
4618 		status_t error = path_to_dir_vnode(tmpPath, &dirNode, leaf, kernel);
4619 		if (error != B_OK)
4620 			return error;
4621 	} else {
4622 		// No path. Create the node in the root FS.
4623 		dirNode = sRoot;
4624 		inc_vnode_ref_count(dirNode);
4625 	}
4626 
4627 	VNodePutter _(dirNode);
4628 
4629 	// check support for creating special nodes
4630 	if (!HAS_FS_CALL(dirNode, create_special_node))
4631 		return B_UNSUPPORTED;
4632 
4633 	// create the node
4634 	fs_vnode superVnode;
4635 	ino_t nodeID;
4636 	status_t status = FS_CALL(dirNode, create_special_node, leaf, subVnode,
4637 		mode, flags, _superVnode != NULL ? _superVnode : &superVnode, &nodeID);
4638 	if (status != B_OK)
4639 		return status;
4640 
4641 	// lookup the node
4642 	rw_lock_read_lock(&sVnodeLock);
4643 	*_createdVnode = lookup_vnode(dirNode->mount->id, nodeID);
4644 	rw_lock_read_unlock(&sVnodeLock);
4645 
4646 	if (*_createdVnode == NULL) {
4647 		panic("vfs_create_special_node(): lookup of node failed");
4648 		return B_ERROR;
4649 	}
4650 
4651 	return B_OK;
4652 }
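
/* Hedged sketch: creating a FIFO with the target FS's default operations
   (hypothetical path; the returned reference must be released):

	struct vnode* createdVnode;
	status_t status = vfs_create_special_node("/tmp/my_fifo", NULL,
		S_IFIFO | 0666, 0, true, NULL, &createdVnode);
	if (status == B_OK)
		vfs_put_vnode(createdVnode);
*/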
4653 
4654 
4655 extern "C" void
4656 vfs_put_vnode(struct vnode* vnode)
4657 {
4658 	put_vnode(vnode);
4659 }
4660 
4661 
4662 extern "C" status_t
4663 vfs_get_cwd(dev_t* _mountID, ino_t* _vnodeID)
4664 {
4665 	// Get current working directory from io context
4666 	struct io_context* context = get_current_io_context(false);
4667 	status_t status = B_OK;
4668 
4669 	mutex_lock(&context->io_mutex);
4670 
4671 	if (context->cwd != NULL) {
4672 		*_mountID = context->cwd->device;
4673 		*_vnodeID = context->cwd->id;
4674 	} else
4675 		status = B_ERROR;
4676 
4677 	mutex_unlock(&context->io_mutex);
4678 	return status;
4679 }
4680 
4681 
4682 status_t
4683 vfs_unmount(dev_t mountID, uint32 flags)
4684 {
4685 	return fs_unmount(NULL, mountID, flags, true);
4686 }
4687 
4688 
4689 extern "C" status_t
4690 vfs_disconnect_vnode(dev_t mountID, ino_t vnodeID)
4691 {
4692 	struct vnode* vnode;
4693 
4694 	status_t status = get_vnode(mountID, vnodeID, &vnode, true, true);
4695 	if (status != B_OK)
4696 		return status;
4697 
4698 	disconnect_mount_or_vnode_fds(vnode->mount, vnode);
4699 	put_vnode(vnode);
4700 	return B_OK;
4701 }
4702 
4703 
4704 extern "C" void
4705 vfs_free_unused_vnodes(int32 level)
4706 {
4707 	vnode_low_resource_handler(NULL,
4708 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
4709 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
4710 		level);
4711 }
4712 
4713 
4714 extern "C" bool
4715 vfs_can_page(struct vnode* vnode, void* cookie)
4716 {
4717 	FUNCTION(("vfs_canpage: vnode %p\n", vnode));
4718 
4719 	if (HAS_FS_CALL(vnode, can_page))
4720 		return FS_CALL(vnode, can_page, cookie);
4721 	return false;
4722 }
4723 
4724 
4725 extern "C" status_t
4726 vfs_read_pages(struct vnode* vnode, void* cookie, off_t pos,
4727 	const generic_io_vec* vecs, size_t count, uint32 flags,
4728 	generic_size_t* _numBytes)
4729 {
4730 	FUNCTION(("vfs_read_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4731 		vecs, pos));
4732 
4733 #if VFS_PAGES_IO_TRACING
4734 	generic_size_t bytesRequested = *_numBytes;
4735 #endif
4736 
4737 	IORequest request;
4738 	status_t status = request.Init(pos, vecs, count, *_numBytes, false, flags);
4739 	if (status == B_OK) {
4740 		status = vfs_vnode_io(vnode, cookie, &request);
4741 		if (status == B_OK)
4742 			status = request.Wait();
4743 		*_numBytes = request.TransferredBytes();
4744 	}
4745 
4746 	TPIO(ReadPages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4747 		status, *_numBytes));
4748 
4749 	return status;
4750 }
4751 
4752 
4753 extern "C" status_t
4754 vfs_write_pages(struct vnode* vnode, void* cookie, off_t pos,
4755 	const generic_io_vec* vecs, size_t count, uint32 flags,
4756 	generic_size_t* _numBytes)
4757 {
4758 	FUNCTION(("vfs_write_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4759 		vecs, pos));
4760 
4761 #if VFS_PAGES_IO_TRACING
4762 	generic_size_t bytesRequested = *_numBytes;
4763 #endif
4764 
4765 	IORequest request;
4766 	status_t status = request.Init(pos, vecs, count, *_numBytes, true, flags);
4767 	if (status == B_OK) {
4768 		status = vfs_vnode_io(vnode, cookie, &request);
4769 		if (status == B_OK)
4770 			status = request.Wait();
4771 		*_numBytes = request.TransferredBytes();
4772 	}
4773 
4774 	TPIO(WritePages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4775 		status, *_numBytes));
4776 
4777 	return status;
4778 }
4779 
4780 
4781 /*!	Gets the vnode's VMCache object. If it doesn't have one yet, it will be
4782 	created if \a allocate is \c true.
4783 	In case it's successful, it will also grab a reference to the cache
4784 	it returns.
4785 */
4786 extern "C" status_t
4787 vfs_get_vnode_cache(struct vnode* vnode, VMCache** _cache, bool allocate)
4788 {
4789 	if (vnode->cache != NULL) {
4790 		vnode->cache->AcquireRef();
4791 		*_cache = vnode->cache;
4792 		return B_OK;
4793 	}
4794 
4795 	rw_lock_read_lock(&sVnodeLock);
4796 	vnode->Lock();
4797 
4798 	status_t status = B_OK;
4799 
4800 	// The cache could have been created in the meantime
4801 	if (vnode->cache == NULL) {
4802 		if (allocate) {
4803 			// TODO: actually the vnode needs to be busy already here, or
4804 			//	else this won't work...
4805 			bool wasBusy = vnode->IsBusy();
4806 			vnode->SetBusy(true);
4807 
4808 			vnode->Unlock();
4809 			rw_lock_read_unlock(&sVnodeLock);
4810 
4811 			status = vm_create_vnode_cache(vnode, &vnode->cache);
4812 
4813 			rw_lock_read_lock(&sVnodeLock);
4814 			vnode->Lock();
4815 			vnode->SetBusy(wasBusy);
4816 		} else
4817 			status = B_BAD_VALUE;
4818 	}
4819 
4820 	vnode->Unlock();
4821 	rw_lock_read_unlock(&sVnodeLock);
4822 
4823 	if (status == B_OK) {
4824 		vnode->cache->AcquireRef();
4825 		*_cache = vnode->cache;
4826 	}
4827 
4828 	return status;
4829 }
4830 
4831 
4832 /*!	Sets the vnode's VMCache object, for subsystems that want to manage
4833 	their own.
4834 	In case it's successful, it will also acquire a reference to the
4835 	given cache.
4836 */
4837 extern "C" status_t
4838 vfs_set_vnode_cache(struct vnode* vnode, VMCache* _cache)
4839 {
4840 	rw_lock_read_lock(&sVnodeLock);
4841 	vnode->Lock();
4842 
4843 	status_t status = B_OK;
4844 	if (vnode->cache != NULL) {
4845 		status = B_NOT_ALLOWED;
4846 	} else {
4847 		vnode->cache = _cache;
4848 		_cache->AcquireRef();
4849 	}
4850 
4851 	vnode->Unlock();
4852 	rw_lock_read_unlock(&sVnodeLock);
4853 	return status;
4854 }
4855 
4856 
4857 status_t
4858 vfs_get_file_map(struct vnode* vnode, off_t offset, size_t size,
4859 	file_io_vec* vecs, size_t* _count)
4860 {
4861 	FUNCTION(("vfs_get_file_map: vnode %p, vecs %p, offset %" B_PRIdOFF
4862 		", size = %" B_PRIuSIZE "\n", vnode, vecs, offset, size));
4863 
4864 	return FS_CALL(vnode, get_file_map, offset, size, vecs, _count);
4865 }
4866 
4867 
4868 status_t
4869 vfs_stat_vnode(struct vnode* vnode, struct stat* stat)
4870 {
4871 	status_t status = FS_CALL(vnode, read_stat, stat);
4872 
4873 	// fill in the st_dev and st_ino fields
4874 	if (status == B_OK) {
4875 		stat->st_dev = vnode->device;
4876 		stat->st_ino = vnode->id;
4877 		// the rdev field must stay unset for non-special files
4878 		if (!S_ISBLK(stat->st_mode) && !S_ISCHR(stat->st_mode))
4879 			stat->st_rdev = -1;
4880 	}
4881 
4882 	return status;
4883 }
4884 
4885 
4886 status_t
4887 vfs_stat_node_ref(dev_t device, ino_t inode, struct stat* stat)
4888 {
4889 	struct vnode* vnode;
4890 	status_t status = get_vnode(device, inode, &vnode, true, false);
4891 	if (status != B_OK)
4892 		return status;
4893 
4894 	status = vfs_stat_vnode(vnode, stat);
4895 
4896 	put_vnode(vnode);
4897 	return status;
4898 }
4899 
4900 
4901 status_t
4902 vfs_get_vnode_name(struct vnode* vnode, char* name, size_t nameSize)
4903 {
4904 	return get_vnode_name(vnode, NULL, name, nameSize, true);
4905 }
4906 
4907 
4908 status_t
4909 vfs_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
4910 	bool kernel, char* path, size_t pathLength)
4911 {
4912 	struct vnode* vnode;
4913 	status_t status;
4914 
4915 	// filter invalid leaf names
4916 	if (leaf != NULL && (leaf[0] == '\0' || strchr(leaf, '/')))
4917 		return B_BAD_VALUE;
4918 
4919 	// get the vnode matching the dir's node_ref
4920 	if (leaf && (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0)) {
4921 		// special cases "." and "..": we can directly get the vnode of the
4922 		// referenced directory
4923 		status = entry_ref_to_vnode(device, inode, leaf, false, kernel, &vnode);
4924 		leaf = NULL;
4925 	} else
4926 		status = get_vnode(device, inode, &vnode, true, false);
4927 	if (status != B_OK)
4928 		return status;
4929 
4930 	// get the directory path
4931 	status = dir_vnode_to_path(vnode, path, pathLength, kernel);
4932 	put_vnode(vnode);
4933 		// we don't need the vnode anymore
4934 	if (status != B_OK)
4935 		return status;
4936 
4937 	// append the leaf name
4938 	if (leaf) {
4939 		// insert a directory separator if this is not the file system root
4940 		if ((strcmp(path, "/") && strlcat(path, "/", pathLength)
4941 				>= pathLength)
4942 			|| strlcat(path, leaf, pathLength) >= pathLength) {
4943 			return B_NAME_TOO_LONG;
4944 		}
4945 	}
4946 
4947 	return B_OK;
4948 }
4949 
4950 
4951 /*!	If the given descriptor locked its vnode, that lock will be released. */
4952 void
4953 vfs_unlock_vnode_if_locked(struct file_descriptor* descriptor)
4954 {
4955 	struct vnode* vnode = fd_vnode(descriptor);
4956 
4957 	if (vnode != NULL && vnode->mandatory_locked_by == descriptor)
4958 		vnode->mandatory_locked_by = NULL;
4959 }
4960 
4961 
4962 /*!	Releases any POSIX locks on the file descriptor. */
4963 status_t
4964 vfs_release_posix_lock(io_context* context, struct file_descriptor* descriptor)
4965 {
4966 	struct vnode* vnode = descriptor->u.vnode;
4967 	if (vnode == NULL)
4968 		return B_OK;
4969 
4970 	if (HAS_FS_CALL(vnode, release_lock))
4971 		return FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
4972 
4973 	return release_advisory_lock(vnode, context, NULL, NULL);
4974 }
4975 
4976 
4977 /*!	Closes all file descriptors of the specified I/O context that
4978 	have the O_CLOEXEC flag set.
4979 */
4980 void
4981 vfs_exec_io_context(io_context* context)
4982 {
4983 	uint32 i;
4984 
4985 	for (i = 0; i < context->table_size; i++) {
4986 		mutex_lock(&context->io_mutex);
4987 
4988 		struct file_descriptor* descriptor = context->fds[i];
4989 		bool remove = false;
4990 
4991 		if (descriptor != NULL && fd_close_on_exec(context, i)) {
4992 			context->fds[i] = NULL;
4993 			context->num_used_fds--;
4994 
4995 			remove = true;
4996 		}
4997 
4998 		mutex_unlock(&context->io_mutex);
4999 
5000 		if (remove) {
5001 			close_fd(context, descriptor);
5002 			put_fd(descriptor);
5003 		}
5004 	}
5005 }
5006 
5007 
5008 /*! Sets up a new io_context structure, and inherits the properties
5009 	of the parent io_context if it is given.
5010 */
5011 io_context*
5012 vfs_new_io_context(io_context* parentContext, bool purgeCloseOnExec)
5013 {
5014 	io_context* context = (io_context*)malloc(sizeof(io_context));
5015 	if (context == NULL)
5016 		return NULL;
5017 
5018 	TIOC(NewIOContext(context, parentContext));
5019 
5020 	memset(context, 0, sizeof(io_context));
5021 	context->ref_count = 1;
5022 
5023 	MutexLocker parentLocker;
5024 
5025 	size_t tableSize;
5026 	if (parentContext != NULL) {
5027 		parentLocker.SetTo(parentContext->io_mutex, false);
5028 		tableSize = parentContext->table_size;
5029 	} else
5030 		tableSize = DEFAULT_FD_TABLE_SIZE;
5031 
5032 	// allocate space for FDs and their close-on-exec flag
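	// (layout: table_size file_descriptor pointers, then table_size
	// select_info pointers, then one close-on-exec bit per FD, rounded up
	// to whole bytes -- e.g. a 256-entry table adds a 32 byte bitmap)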
5033 	context->fds = (file_descriptor**)malloc(
5034 		sizeof(struct file_descriptor*) * tableSize
5035 		+ sizeof(struct select_sync*) * tableSize
5036 		+ (tableSize + 7) / 8);
5037 	if (context->fds == NULL) {
5038 		free(context);
5039 		return NULL;
5040 	}
5041 
5042 	context->select_infos = (select_info**)(context->fds + tableSize);
5043 	context->fds_close_on_exec = (uint8*)(context->select_infos + tableSize);
5044 
5045 	memset(context->fds, 0, sizeof(struct file_descriptor*) * tableSize
5046 		+ sizeof(struct select_sync*) * tableSize
5047 		+ (tableSize + 7) / 8);
5048 
5049 	mutex_init(&context->io_mutex, "I/O context");
5050 
5051 	// Copy all parent file descriptors
5052 
5053 	if (parentContext != NULL) {
5054 		size_t i;
5055 
5056 		mutex_lock(&sIOContextRootLock);
5057 		context->root = parentContext->root;
5058 		if (context->root)
5059 			inc_vnode_ref_count(context->root);
5060 		mutex_unlock(&sIOContextRootLock);
5061 
5062 		context->cwd = parentContext->cwd;
5063 		if (context->cwd)
5064 			inc_vnode_ref_count(context->cwd);
5065 
5066 		if (parentContext->inherit_fds) {
5067 			for (i = 0; i < tableSize; i++) {
5068 				struct file_descriptor* descriptor = parentContext->fds[i];
5069 
5070 				if (descriptor != NULL
5071 					&& (descriptor->open_mode & O_DISCONNECTED) == 0) {
5072 					bool closeOnExec = fd_close_on_exec(parentContext, i);
5073 					if (closeOnExec && purgeCloseOnExec)
5074 						continue;
5075 
5076 					TFD(InheritFD(context, i, descriptor, parentContext));
5077 
5078 					context->fds[i] = descriptor;
5079 					context->num_used_fds++;
5080 					atomic_add(&descriptor->ref_count, 1);
5081 					atomic_add(&descriptor->open_count, 1);
5082 
5083 					if (closeOnExec)
5084 						fd_set_close_on_exec(context, i, true);
5085 				}
5086 			}
5087 		}
5088 
5089 		parentLocker.Unlock();
5090 	} else {
5091 		context->root = sRoot;
5092 		context->cwd = sRoot;
5093 
5094 		if (context->root)
5095 			inc_vnode_ref_count(context->root);
5096 
5097 		if (context->cwd)
5098 			inc_vnode_ref_count(context->cwd);
5099 	}
5100 
5101 	context->table_size = tableSize;
5102 	context->inherit_fds = parentContext != NULL;
5103 
5104 	list_init(&context->node_monitors);
5105 	context->max_monitors = DEFAULT_NODE_MONITORS;
5106 
5107 	return context;
5108 }
5109 
5110 
5111 void
5112 vfs_get_io_context(io_context* context)
5113 {
5114 	atomic_add(&context->ref_count, 1);
5115 }
5116 
5117 
5118 void
5119 vfs_put_io_context(io_context* context)
5120 {
5121 	if (atomic_add(&context->ref_count, -1) == 1)
5122 		free_io_context(context);
5123 }
5124 
5125 
5126 status_t
5127 vfs_resize_fd_table(struct io_context* context, uint32 newSize)
5128 {
5129 	if (newSize == 0 || newSize > MAX_FD_TABLE_SIZE)
5130 		return B_BAD_VALUE;
5131 
5132 	TIOC(ResizeIOContext(context, newSize));
5133 
5134 	MutexLocker _(context->io_mutex);
5135 
5136 	uint32 oldSize = context->table_size;
5137 	int oldCloseOnExitBitmapSize = (oldSize + 7) / 8;
5138 	int newCloseOnExitBitmapSize = (newSize + 7) / 8;
5139 
5140 	// If the tables shrink, make sure none of the fds being dropped are in use.
5141 	if (newSize < oldSize) {
5142 		for (uint32 i = oldSize; i-- > newSize;) {
5143 			if (context->fds[i])
5144 				return B_BUSY;
5145 		}
5146 	}
5147 
5148 	// store pointers to the old tables
5149 	file_descriptor** oldFDs = context->fds;
5150 	select_info** oldSelectInfos = context->select_infos;
5151 	uint8* oldCloseOnExecTable = context->fds_close_on_exec;
5152 
5153 	// allocate new tables
5154 	file_descriptor** newFDs = (file_descriptor**)malloc(
5155 		sizeof(struct file_descriptor*) * newSize
5156 		+ sizeof(struct select_sync*) * newSize
5157 		+ newCloseOnExitBitmapSize);
5158 	if (newFDs == NULL)
5159 		return B_NO_MEMORY;
5160 
5161 	context->fds = newFDs;
5162 	context->select_infos = (select_info**)(context->fds + newSize);
5163 	context->fds_close_on_exec = (uint8*)(context->select_infos + newSize);
5164 	context->table_size = newSize;
5165 
5166 	// copy entries from old tables
5167 	uint32 toCopy = min_c(oldSize, newSize);
5168 
5169 	memcpy(context->fds, oldFDs, sizeof(void*) * toCopy);
5170 	memcpy(context->select_infos, oldSelectInfos, sizeof(void*) * toCopy);
5171 	memcpy(context->fds_close_on_exec, oldCloseOnExecTable,
5172 		min_c(oldCloseOnExitBitmapSize, newCloseOnExitBitmapSize));
5173 
5174 	// clear additional entries, if the tables grow
5175 	if (newSize > oldSize) {
5176 		memset(context->fds + oldSize, 0, sizeof(void*) * (newSize - oldSize));
5177 		memset(context->select_infos + oldSize, 0,
5178 			sizeof(void*) * (newSize - oldSize));
5179 		memset(context->fds_close_on_exec + oldCloseOnExitBitmapSize, 0,
5180 			newCloseOnExitBitmapSize - oldCloseOnExitBitmapSize);
5181 	}
5182 
5183 	free(oldFDs);
5184 
5185 	return B_OK;
5186 }
5187 
5188 
5189 /*!	\brief Resolves a vnode to the vnode it is covered by, if any.
5190 
5191 	Given an arbitrary vnode (identified by mount and node ID), the function
5192 	checks whether the vnode is covered by another vnode. If it is, the
5193 	function returns the mount and node ID of the covering vnode. Otherwise
5194 	it simply returns the supplied mount and node ID.
5195 
5196 	In case of error (e.g. the supplied node could not be found) the variables
5197 	for storing the resolved mount and node ID remain untouched and an error
5198 	code is returned.
5199 
5200 	\param mountID The mount ID of the vnode in question.
5201 	\param nodeID The node ID of the vnode in question.
5202 	\param resolvedMountID Pointer to storage for the resolved mount ID.
5203 	\param resolvedNodeID Pointer to storage for the resolved node ID.
5204 	\return
5205 	- \c B_OK, if everything went fine,
5206 	- another error code, if something went wrong.
5207 */
5208 status_t
5209 vfs_resolve_vnode_to_covering_vnode(dev_t mountID, ino_t nodeID,
5210 	dev_t* resolvedMountID, ino_t* resolvedNodeID)
5211 {
5212 	// get the node
5213 	struct vnode* node;
5214 	status_t error = get_vnode(mountID, nodeID, &node, true, false);
5215 	if (error != B_OK)
5216 		return error;
5217 
5218 	// resolve the node
5219 	if (Vnode* coveringNode = get_covering_vnode(node)) {
5220 		put_vnode(node);
5221 		node = coveringNode;
5222 	}
5223 
5224 	// set the return values
5225 	*resolvedMountID = node->device;
5226 	*resolvedNodeID = node->id;
5227 
5228 	put_vnode(node);
5229 
5230 	return B_OK;
5231 }
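
/* Usage sketch (IDs are placeholders): resolving the node a lookup
   returned for a mount point yields the root of the mounted volume:

	dev_t coveringDevice;
	ino_t coveringNode;
	if (vfs_resolve_vnode_to_covering_vnode(device, node, &coveringDevice,
			&coveringNode) == B_OK) {
		// (coveringDevice, coveringNode) is what userland should see
	}
*/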
5232 
5233 
5234 status_t
5235 vfs_get_mount_point(dev_t mountID, dev_t* _mountPointMountID,
5236 	ino_t* _mountPointNodeID)
5237 {
5238 	ReadLocker nodeLocker(sVnodeLock);
5239 	MutexLocker mountLocker(sMountMutex);
5240 
5241 	struct fs_mount* mount = find_mount(mountID);
5242 	if (mount == NULL)
5243 		return B_BAD_VALUE;
5244 
5245 	Vnode* mountPoint = mount->covers_vnode;
5246 
5247 	*_mountPointMountID = mountPoint->device;
5248 	*_mountPointNodeID = mountPoint->id;
5249 
5250 	return B_OK;
5251 }
5252 
5253 
5254 status_t
5255 vfs_bind_mount_directory(dev_t mountID, ino_t nodeID, dev_t coveredMountID,
5256 	ino_t coveredNodeID)
5257 {
5258 	// get the vnodes
5259 	Vnode* vnode;
5260 	status_t error = get_vnode(mountID, nodeID, &vnode, true, false);
5261 	if (error != B_OK)
5262 		return B_BAD_VALUE;
5263 	VNodePutter vnodePutter(vnode);
5264 
5265 	Vnode* coveredVnode;
5266 	error = get_vnode(coveredMountID, coveredNodeID, &coveredVnode, true,
5267 		false);
5268 	if (error != B_OK)
5269 		return B_BAD_VALUE;
5270 	VNodePutter coveredVnodePutter(coveredVnode);
5271 
5272 	// establish the covered/covering links
5273 	WriteLocker locker(sVnodeLock);
5274 
5275 	if (vnode->covers != NULL || coveredVnode->covered_by != NULL
5276 		|| vnode->mount->unmounting || coveredVnode->mount->unmounting) {
5277 		return B_BUSY;
5278 	}
5279 
5280 	vnode->covers = coveredVnode;
5281 	vnode->SetCovering(true);
5282 
5283 	coveredVnode->covered_by = vnode;
5284 	coveredVnode->SetCovered(true);
5285 
5286 	// the vnodes do now reference each other
5287 	inc_vnode_ref_count(vnode);
5288 	inc_vnode_ref_count(coveredVnode);
5289 
5290 	return B_OK;
5291 }
5292 
5293 
5294 int
5295 vfs_getrlimit(int resource, struct rlimit* rlp)
5296 {
5297 	if (!rlp)
5298 		return B_BAD_ADDRESS;
5299 
5300 	switch (resource) {
5301 		case RLIMIT_NOFILE:
5302 		{
5303 			struct io_context* context = get_current_io_context(false);
5304 			MutexLocker _(context->io_mutex);
5305 
5306 			rlp->rlim_cur = context->table_size;
5307 			rlp->rlim_max = MAX_FD_TABLE_SIZE;
5308 			return 0;
5309 		}
5310 
5311 		case RLIMIT_NOVMON:
5312 		{
5313 			struct io_context* context = get_current_io_context(false);
5314 			MutexLocker _(context->io_mutex);
5315 
5316 			rlp->rlim_cur = context->max_monitors;
5317 			rlp->rlim_max = MAX_NODE_MONITORS;
5318 			return 0;
5319 		}
5320 
5321 		default:
5322 			return B_BAD_VALUE;
5323 	}
5324 }
5325 
5326 
5327 int
5328 vfs_setrlimit(int resource, const struct rlimit* rlp)
5329 {
5330 	if (!rlp)
5331 		return B_BAD_ADDRESS;
5332 
5333 	switch (resource) {
5334 		case RLIMIT_NOFILE:
5335 			/* TODO: check getuid() */
5336 			if (rlp->rlim_max != RLIM_SAVED_MAX
5337 				&& rlp->rlim_max != MAX_FD_TABLE_SIZE)
5338 				return B_NOT_ALLOWED;
5339 
5340 			return vfs_resize_fd_table(get_current_io_context(false),
5341 				rlp->rlim_cur);
5342 
5343 		case RLIMIT_NOVMON:
5344 			/* TODO: check getuid() */
5345 			if (rlp->rlim_max != RLIM_SAVED_MAX
5346 				&& rlp->rlim_max != MAX_NODE_MONITORS)
5347 				return B_NOT_ALLOWED;
5348 
5349 			return resize_monitor_table(get_current_io_context(false),
5350 				rlp->rlim_cur);
5351 
5352 		default:
5353 			return B_BAD_VALUE;
5354 	}
5355 }
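
/* Userland-side sketch of how this is reached (standard POSIX calls; note
   the rlim_max restrictions checked above):

	struct rlimit rl;
	getrlimit(RLIMIT_NOFILE, &rl);
	rl.rlim_cur = 1024;             // grow the FD table
	setrlimit(RLIMIT_NOFILE, &rl);  // ends up in vfs_resize_fd_table()
*/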
5356 
5357 
5358 status_t
5359 vfs_init(kernel_args* args)
5360 {
5361 	vnode::StaticInit();
5362 
5363 	sVnodeTable = new(std::nothrow) VnodeTable();
5364 	if (sVnodeTable == NULL || sVnodeTable->Init(VNODE_HASH_TABLE_SIZE) != B_OK)
5365 		panic("vfs_init: error creating vnode hash table\n");
5366 
5367 	struct vnode dummy_vnode;
5368 	list_init_etc(&sUnusedVnodeList, offset_of_member(dummy_vnode, unused_link));
5369 
5370 	struct fs_mount dummyMount;
5371 	sMountsTable = new(std::nothrow) MountTable();
5372 	if (sMountsTable == NULL
5373 			|| sMountsTable->Init(MOUNTS_HASH_TABLE_SIZE) != B_OK)
5374 		panic("vfs_init: error creating mounts hash table\n");
5375 
5376 	sPathNameCache = create_object_cache("vfs path names",
5377 		B_PATH_NAME_LENGTH + 1, 8, NULL, NULL, NULL);
5378 	if (sPathNameCache == NULL)
5379 		panic("vfs_init: error creating path name object_cache\n");
5380 
5381 	sFileDescriptorCache = create_object_cache("vfs fds",
5382 		sizeof(file_descriptor), 8, NULL, NULL, NULL);
5383 	if (sFileDescriptorCache == NULL)
5384 		panic("vfs_init: error creating file descriptor object_cache\n");
5385 
5386 	node_monitor_init();
5387 
5388 	sRoot = NULL;
5389 
5390 	recursive_lock_init(&sMountOpLock, "vfs_mount_op_lock");
5391 
5392 	if (block_cache_init() != B_OK)
5393 		return B_ERROR;
5394 
5395 #ifdef ADD_DEBUGGER_COMMANDS
5396 	// add some debugger commands
5397 	add_debugger_command_etc("vnode", &dump_vnode,
5398 		"Print info about the specified vnode",
5399 		"[ \"-p\" ] ( <vnode> | <devID> <nodeID> )\n"
5400 		"Prints information about the vnode specified by address <vnode> or\n"
5401 		"<devID>, <nodeID> pair. If \"-p\" is given, a path of the vnode is\n"
5402 		"constructed and printed. It might not be possible to construct a\n"
5403 		"complete path, though.\n",
5404 		0);
5405 	add_debugger_command("vnodes", &dump_vnodes,
5406 		"list all vnodes (from the specified device)");
5407 	add_debugger_command("vnode_caches", &dump_vnode_caches,
5408 		"list all vnode caches");
5409 	add_debugger_command("mount", &dump_mount,
5410 		"info about the specified fs_mount");
5411 	add_debugger_command("mounts", &dump_mounts, "list all fs_mounts");
5412 	add_debugger_command("io_context", &dump_io_context,
5413 		"info about the I/O context");
5414 	add_debugger_command("vnode_usage", &dump_vnode_usage,
5415 		"info about vnode usage");
5416 #endif
5417 
5418 	register_low_resource_handler(&vnode_low_resource_handler, NULL,
5419 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
5420 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
5421 		0);
5422 
5423 	fifo_init();
5424 	file_map_init();
5425 
5426 	return file_cache_init();
5427 }
5428 
5429 
5430 //	#pragma mark - fd_ops implementations
5431 
5432 
5433 /*!
5434 	Calls fs_open() on the given vnode and returns a new
5435 	file descriptor for it
5436 */
5437 static int
5438 open_vnode(struct vnode* vnode, int openMode, bool kernel)
5439 {
5440 	void* cookie;
5441 	status_t status = FS_CALL(vnode, open, openMode, &cookie);
5442 	if (status != B_OK)
5443 		return status;
5444 
5445 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5446 	if (fd < 0) {
5447 		FS_CALL(vnode, close, cookie);
5448 		FS_CALL(vnode, free_cookie, cookie);
5449 	}
5450 	return fd;
5451 }
5452 
5453 
5454 /*!
5455 	Creates the entry \a name in \a directory if necessary, opens the
5456 	resulting node, and returns a new file descriptor for it
5457 */
5458 static int
5459 create_vnode(struct vnode* directory, const char* name, int openMode,
5460 	int perms, bool kernel)
5461 {
5462 	bool traverse = ((openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0);
5463 	status_t status = B_ERROR;
5464 	struct vnode* vnode;
5465 	void* cookie;
5466 	ino_t newID;
5467 
5468 	// This is somewhat tricky: If the entry already exists, the FS responsible
5469 	// for the directory might not necessarily also be the one responsible for
5470 	// the node the entry refers to (e.g. in case of mount points or FIFOs). So
5471 	// we can actually never call the create() hook without O_EXCL. Instead we
5472 	// try to look the entry up first. If it already exists, we just open the
5473 	// node (unless O_EXCL), otherwise we call create() with O_EXCL. This
5474 	// introduces a race condition, since someone else might have created the
5475 	// entry in the meantime. We hope the respective FS returns the correct
5476 	// error code, and we retry (up to 3 times).
5477 
5478 	for (int i = 0; i < 3 && status != B_OK; i++) {
5479 		// look the node up
5480 		status = lookup_dir_entry(directory, name, &vnode);
5481 		if (status == B_OK) {
5482 			VNodePutter putter(vnode);
5483 
5484 			if ((openMode & O_EXCL) != 0)
5485 				return B_FILE_EXISTS;
5486 
5487 			// If the node is a symlink, we have to follow it, unless
5488 			// O_NOTRAVERSE is set.
5489 			if (S_ISLNK(vnode->Type()) && traverse) {
5490 				putter.Put();
5491 				char clonedName[B_FILE_NAME_LENGTH + 1];
5492 				if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH)
5493 						>= B_FILE_NAME_LENGTH) {
5494 					return B_NAME_TOO_LONG;
5495 				}
5496 
5497 				inc_vnode_ref_count(directory);
5498 				status = vnode_path_to_vnode(directory, clonedName, true, 0,
5499 					kernel, &vnode, NULL);
5500 				if (status != B_OK)
5501 					return status;
5502 
5503 				putter.SetTo(vnode);
5504 			}
5505 
5506 			if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5507 				return B_LINK_LIMIT;
5508 
5509 			int fd = open_vnode(vnode, openMode & ~O_CREAT, kernel);
5510 			// on success keep the vnode reference for the FD
5511 			if (fd >= 0)
5512 				putter.Detach();
5513 
5514 			return fd;
5515 		}
5516 
5517 		// it doesn't exist yet -- try to create it
5518 
5519 		if (!HAS_FS_CALL(directory, create))
5520 			return B_READ_ONLY_DEVICE;
5521 
5522 		status = FS_CALL(directory, create, name, openMode | O_EXCL, perms,
5523 			&cookie, &newID);
5524 		if (status != B_OK
5525 			&& ((openMode & O_EXCL) != 0 || status != B_FILE_EXISTS)) {
5526 			return status;
5527 		}
5528 	}
5529 
5530 	if (status != B_OK)
5531 		return status;
5532 
5533 	// the node has been created successfully
5534 
5535 	rw_lock_read_lock(&sVnodeLock);
5536 	vnode = lookup_vnode(directory->device, newID);
5537 	rw_lock_read_unlock(&sVnodeLock);
5538 
5539 	if (vnode == NULL) {
5540 		panic("vfs: fs_create() returned success but there is no vnode, "
5541 			"mount ID %" B_PRIdDEV "!\n", directory->device);
5542 		return B_BAD_VALUE;
5543 	}
5544 
5545 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5546 	if (fd >= 0)
5547 		return fd;
5548 
5549 	status = fd;
5550 
5551 	// something went wrong, clean up
5552 
5553 	FS_CALL(vnode, close, cookie);
5554 	FS_CALL(vnode, free_cookie, cookie);
5555 	put_vnode(vnode);
5556 
5557 	FS_CALL(directory, unlink, name);
5558 
5559 	return status;
5560 }
5561 
5562 
5563 /*! Calls fs open_dir() on the given vnode and returns a new
5564 	file descriptor for it
5565 */
5566 static int
5567 open_dir_vnode(struct vnode* vnode, bool kernel)
5568 {
5569 	void* cookie;
5570 	status_t status = FS_CALL(vnode, open_dir, &cookie);
5571 	if (status != B_OK)
5572 		return status;
5573 
5574 	// directory is opened, create a fd
5575 	status = get_new_fd(FDTYPE_DIR, NULL, vnode, cookie, O_CLOEXEC, kernel);
5576 	if (status >= 0)
5577 		return status;
5578 
5579 	FS_CALL(vnode, close_dir, cookie);
5580 	FS_CALL(vnode, free_dir_cookie, cookie);
5581 
5582 	return status;
5583 }
5584 
5585 
5586 /*! Calls fs open_attr_dir() on the given vnode and returns a new
5587 	file descriptor for it.
5588 	Used by attr_dir_open() and attr_dir_open_fd().
5589 */
5590 static int
5591 open_attr_dir_vnode(struct vnode* vnode, bool kernel)
5592 {
5593 	if (!HAS_FS_CALL(vnode, open_attr_dir))
5594 		return B_UNSUPPORTED;
5595 
5596 	void* cookie;
5597 	status_t status = FS_CALL(vnode, open_attr_dir, &cookie);
5598 	if (status != B_OK)
5599 		return status;
5600 
5601 	// directory is opened, create a fd
5602 	status = get_new_fd(FDTYPE_ATTR_DIR, NULL, vnode, cookie, O_CLOEXEC,
5603 		kernel);
5604 	if (status >= 0)
5605 		return status;
5606 
5607 	FS_CALL(vnode, close_attr_dir, cookie);
5608 	FS_CALL(vnode, free_attr_dir_cookie, cookie);
5609 
5610 	return status;
5611 }
5612 
5613 
5614 static int
5615 file_create_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5616 	int openMode, int perms, bool kernel)
5617 {
5618 	FUNCTION(("file_create_entry_ref: name = '%s', omode %x, perms %d, "
5619 		"kernel %d\n", name, openMode, perms, kernel));
5620 
5621 	// get directory to put the new file in
5622 	struct vnode* directory;
5623 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
5624 	if (status != B_OK)
5625 		return status;
5626 
5627 	status = create_vnode(directory, name, openMode, perms, kernel);
5628 	put_vnode(directory);
5629 
5630 	return status;
5631 }
5632 
5633 
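/*! Creates and opens a file at the location given by the FD + path
	combination.
	Returns the new FD on success, an error code otherwise.
*/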
5634 static int
5635 file_create(int fd, char* path, int openMode, int perms, bool kernel)
5636 {
5637 	FUNCTION(("file_create: path '%s', omode %x, perms %d, kernel %d\n", path,
5638 		openMode, perms, kernel));
5639 
5640 	// get directory to put the new file in
5641 	char name[B_FILE_NAME_LENGTH];
5642 	struct vnode* directory;
5643 	status_t status = fd_and_path_to_dir_vnode(fd, path, &directory, name,
5644 		kernel);
5645 	if (status < 0)
5646 		return status;
5647 
5648 	status = create_vnode(directory, name, openMode, perms, kernel);
5649 
5650 	put_vnode(directory);
5651 	return status;
5652 }
5653 
5654 
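/*! Opens the node specified by the entry_ref (\a mountID, \a directoryID,
	\a name). Symlinks are only traversed if neither O_NOTRAVERSE nor
	O_NOFOLLOW is set; with O_NOFOLLOW, a symlink yields B_LINK_LIMIT.
	Returns the new FD on success, an error code otherwise.
*/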
5655 static int
5656 file_open_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5657 	int openMode, bool kernel)
5658 {
5659 	if (name == NULL || *name == '\0')
5660 		return B_BAD_VALUE;
5661 
5662 	FUNCTION(("file_open_entry_ref(ref = (%" B_PRId32 ", %" B_PRId64 ", %s), "
5663 		"openMode = %d)\n", mountID, directoryID, name, openMode));
5664 
5665 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5666 
5667 	// get the vnode matching the entry_ref
5668 	struct vnode* vnode;
5669 	status_t status = entry_ref_to_vnode(mountID, directoryID, name, traverse,
5670 		kernel, &vnode);
5671 	if (status != B_OK)
5672 		return status;
5673 
5674 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5675 		put_vnode(vnode);
5676 		return B_LINK_LIMIT;
5677 	}
5678 
5679 	int newFD = open_vnode(vnode, openMode, kernel);
5680 	if (newFD >= 0) {
5681 		// The vnode reference has been transferred to the FD
5682 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache, mountID,
5683 			directoryID, vnode->id, name);
5684 	} else
5685 		put_vnode(vnode);
5686 
5687 	return newFD;
5688 }
5689 
5690 
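/*! Opens the node at the location given by the FD + path combination;
	otherwise behaves like file_open_entry_ref().
	Returns the new FD on success, an error code otherwise.
*/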
5691 static int
5692 file_open(int fd, char* path, int openMode, bool kernel)
5693 {
5694 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5695 
5696 	FUNCTION(("file_open: fd: %d, entry path = '%s', omode %d, kernel %d\n",
5697 		fd, path, openMode, kernel));
5698 
5699 	// get the vnode matching the vnode + path combination
5700 	struct vnode* vnode;
5701 	ino_t parentID;
5702 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode,
5703 		&parentID, kernel);
5704 	if (status != B_OK)
5705 		return status;
5706 
5707 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5708 		put_vnode(vnode);
5709 		return B_LINK_LIMIT;
5710 	}
5711 
5712 	// open the vnode
5713 	int newFD = open_vnode(vnode, openMode, kernel);
5714 	if (newFD >= 0) {
5715 		// The vnode reference has been transferred to the FD
5716 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache,
5717 			vnode->device, parentID, vnode->id, NULL);
5718 	} else
5719 		put_vnode(vnode);
5720 
5721 	return newFD;
5722 }
5723 
5724 
5725 static status_t
5726 file_close(struct file_descriptor* descriptor)
5727 {
5728 	struct vnode* vnode = descriptor->u.vnode;
5729 	status_t status = B_OK;
5730 
5731 	FUNCTION(("file_close(descriptor = %p)\n", descriptor));
5732 
5733 	cache_node_closed(vnode, FDTYPE_FILE, vnode->cache, vnode->device,
5734 		vnode->id);
5735 	if (HAS_FS_CALL(vnode, close)) {
5736 		status = FS_CALL(vnode, close, descriptor->cookie);
5737 	}
5738 
5739 	if (status == B_OK) {
5740 		// remove all outstanding locks for this team
5741 		if (HAS_FS_CALL(vnode, release_lock))
5742 			status = FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
5743 		else
5744 			status = release_advisory_lock(vnode, NULL, descriptor, NULL);
5745 	}
5746 	return status;
5747 }
5748 
5749 
5750 static void
5751 file_free_fd(struct file_descriptor* descriptor)
5752 {
5753 	struct vnode* vnode = descriptor->u.vnode;
5754 
5755 	if (vnode != NULL) {
5756 		FS_CALL(vnode, free_cookie, descriptor->cookie);
5757 		put_vnode(vnode);
5758 	}
5759 }
5760 
5761 
5762 static status_t
5763 file_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
5764 	size_t* length)
5765 {
5766 	struct vnode* vnode = descriptor->u.vnode;
5767 	FUNCTION(("file_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
5768 		pos, length, *length));
5769 
5770 	if (S_ISDIR(vnode->Type()))
5771 		return B_IS_A_DIRECTORY;
5772 
5773 	return FS_CALL(vnode, read, descriptor->cookie, pos, buffer, length);
5774 }
5775 
5776 
5777 static status_t
5778 file_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
5779 	size_t* length)
5780 {
5781 	struct vnode* vnode = descriptor->u.vnode;
5782 	FUNCTION(("file_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
5783 		length));
5784 
5785 	if (S_ISDIR(vnode->Type()))
5786 		return B_IS_A_DIRECTORY;
5787 	if (!HAS_FS_CALL(vnode, write))
5788 		return B_READ_ONLY_DEVICE;
5789 
5790 	return FS_CALL(vnode, write, descriptor->cookie, pos, buffer, length);
5791 }
5792 
5793 
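/*! Implements lseek() for files: pipes and sockets are not seekable
	(ESPIPE), all other types are. For SEEK_END the node is stat()ed;
	if a device reports size 0, its size is computed from the
	B_GET_GEOMETRY ioctl instead.
	Returns the new position on success, an error code otherwise.
*/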
5794 static off_t
5795 file_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
5796 {
5797 	struct vnode* vnode = descriptor->u.vnode;
5798 	off_t offset;
5799 	bool isDevice = false;
5800 
5801 	FUNCTION(("file_seek(pos = %" B_PRIdOFF ", seekType = %d)\n", pos,
5802 		seekType));
5803 
5804 	// some kinds of files are not seekable
5805 	switch (vnode->Type() & S_IFMT) {
5806 		case S_IFIFO:
5807 		case S_IFSOCK:
5808 			return ESPIPE;
5809 
5810 		// drivers publish block devices as character devices, so handle both
5811 		case S_IFBLK:
5812 		case S_IFCHR:
5813 			isDevice = true;
5814 			break;
5815 		// The Open Group Base Specs don't single out any file types besides
5816 		// pipes, FIFOs, and sockets, so we allow seeking all other types.
5817 		case S_IFREG:
5818 		case S_IFDIR:
5819 		case S_IFLNK:
5820 			break;
5821 	}
5822 
5823 	switch (seekType) {
5824 		case SEEK_SET:
5825 			offset = 0;
5826 			break;
5827 		case SEEK_CUR:
5828 			offset = descriptor->pos;
5829 			break;
5830 		case SEEK_END:
5831 		{
5832 			// stat() the node
5833 			if (!HAS_FS_CALL(vnode, read_stat))
5834 				return B_UNSUPPORTED;
5835 
5836 			struct stat stat;
5837 			status_t status = FS_CALL(vnode, read_stat, &stat);
5838 			if (status != B_OK)
5839 				return status;
5840 
5841 			offset = stat.st_size;
5842 
5843 			if (offset == 0 && isDevice) {
5844 				// stat() on regular drivers doesn't report size
5845 				device_geometry geometry;
5846 
5847 				if (HAS_FS_CALL(vnode, ioctl)) {
5848 					status = FS_CALL(vnode, ioctl, descriptor->cookie,
5849 						B_GET_GEOMETRY, &geometry, sizeof(geometry));
5850 					if (status == B_OK)
5851 						offset = (off_t)geometry.bytes_per_sector
5852 							* geometry.sectors_per_track
5853 							* geometry.cylinder_count
5854 							* geometry.head_count;
5855 				}
5856 			}
5857 
5858 			break;
5859 		}
5860 		default:
5861 			return B_BAD_VALUE;
5862 	}
5863 
5864 	// assumes off_t is 64 bits wide
5865 	if (offset > 0 && LONGLONG_MAX - offset < pos)
5866 		return B_BUFFER_OVERFLOW;
5867 
5868 	pos += offset;
5869 	if (pos < 0)
5870 		return B_BAD_VALUE;
5871 
5872 	return descriptor->pos = pos;
5873 }
5874 
5875 
5876 static status_t
5877 file_select(struct file_descriptor* descriptor, uint8 event,
5878 	struct selectsync* sync)
5879 {
5880 	FUNCTION(("file_select(%p, %u, %p)\n", descriptor, event, sync));
5881 
5882 	struct vnode* vnode = descriptor->u.vnode;
5883 
5884 	// If the FS has no select() hook, notify select() now.
5885 	if (!HAS_FS_CALL(vnode, select)) {
5886 		if (!SELECT_TYPE_IS_OUTPUT_ONLY(event))
5887 			return notify_select_event(sync, event);
5888 		else
5889 			return B_OK;
5890 	}
5891 
5892 	return FS_CALL(vnode, select, descriptor->cookie, event, sync);
5893 }
5894 
5895 
5896 static status_t
5897 file_deselect(struct file_descriptor* descriptor, uint8 event,
5898 	struct selectsync* sync)
5899 {
5900 	struct vnode* vnode = descriptor->u.vnode;
5901 
5902 	if (!HAS_FS_CALL(vnode, deselect))
5903 		return B_OK;
5904 
5905 	return FS_CALL(vnode, deselect, descriptor->cookie, event, sync);
5906 }
5907 
5908 
5909 static status_t
5910 dir_create_entry_ref(dev_t mountID, ino_t parentID, const char* name, int perms,
5911 	bool kernel)
5912 {
5913 	struct vnode* vnode;
5914 	status_t status;
5915 
5916 	if (name == NULL || *name == '\0')
5917 		return B_BAD_VALUE;
5918 
5919 	FUNCTION(("dir_create_entry_ref(dev = %" B_PRId32 ", ino = %" B_PRId64 ", "
5920 		"name = '%s', perms = %d)\n", mountID, parentID, name, perms));
5921 
5922 	status = get_vnode(mountID, parentID, &vnode, true, false);
5923 	if (status != B_OK)
5924 		return status;
5925 
5926 	if (HAS_FS_CALL(vnode, create_dir))
5927 		status = FS_CALL(vnode, create_dir, name, perms);
5928 	else
5929 		status = B_READ_ONLY_DEVICE;
5930 
5931 	put_vnode(vnode);
5932 	return status;
5933 }
5934 
5935 
5936 static status_t
5937 dir_create(int fd, char* path, int perms, bool kernel)
5938 {
5939 	char filename[B_FILE_NAME_LENGTH];
5940 	struct vnode* vnode;
5941 	status_t status;
5942 
5943 	FUNCTION(("dir_create: path '%s', perms %d, kernel %d\n", path, perms,
5944 		kernel));
5945 
5946 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
5947 	if (status < 0)
5948 		return status;
5949 
5950 	if (HAS_FS_CALL(vnode, create_dir))
5951 		status = FS_CALL(vnode, create_dir, filename, perms);
5952 	else
5953 		status = B_READ_ONLY_DEVICE;
5954 
5955 	put_vnode(vnode);
5956 	return status;
5957 }
5958 
5959 
5960 static int
5961 dir_open_entry_ref(dev_t mountID, ino_t parentID, const char* name, bool kernel)
5962 {
5963 	FUNCTION(("dir_open_entry_ref()\n"));
5964 
5965 	if (name && name[0] == '\0')
5966 		return B_BAD_VALUE;
5967 
5968 	// get the vnode matching the entry_ref/node_ref
5969 	struct vnode* vnode;
5970 	status_t status;
5971 	if (name) {
5972 		status = entry_ref_to_vnode(mountID, parentID, name, true, kernel,
5973 			&vnode);
5974 	} else
5975 		status = get_vnode(mountID, parentID, &vnode, true, false);
5976 	if (status != B_OK)
5977 		return status;
5978 
5979 	int newFD = open_dir_vnode(vnode, kernel);
5980 	if (newFD >= 0) {
5981 		// The vnode reference has been transferred to the FD
5982 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, mountID, parentID,
5983 			vnode->id, name);
5984 	} else
5985 		put_vnode(vnode);
5986 
5987 	return newFD;
5988 }
5989 
5990 
5991 static int
5992 dir_open(int fd, char* path, bool kernel)
5993 {
5994 	FUNCTION(("dir_open: fd: %d, entry path = '%s', kernel %d\n", fd, path,
5995 		kernel));
5996 
5997 	// get the vnode matching the vnode + path combination
5998 	struct vnode* vnode = NULL;
5999 	ino_t parentID;
6000 	status_t status = fd_and_path_to_vnode(fd, path, true, &vnode, &parentID,
6001 		kernel);
6002 	if (status != B_OK)
6003 		return status;
6004 
6005 	// open the dir
6006 	int newFD = open_dir_vnode(vnode, kernel);
6007 	if (newFD >= 0) {
6008 		// The vnode reference has been transferred to the FD
6009 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
6010 			parentID, vnode->id, NULL);
6011 	} else
6012 		put_vnode(vnode);
6013 
6014 	return newFD;
6015 }
6016 
6017 
6018 static status_t
6019 dir_close(struct file_descriptor* descriptor)
6020 {
6021 	struct vnode* vnode = descriptor->u.vnode;
6022 
6023 	FUNCTION(("dir_close(descriptor = %p)\n", descriptor));
6024 
6025 	cache_node_closed(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
6026 		vnode->id);
6027 	if (HAS_FS_CALL(vnode, close_dir))
6028 		return FS_CALL(vnode, close_dir, descriptor->cookie);
6029 
6030 	return B_OK;
6031 }
6032 
6033 
6034 static void
6035 dir_free_fd(struct file_descriptor* descriptor)
6036 {
6037 	struct vnode* vnode = descriptor->u.vnode;
6038 
6039 	if (vnode != NULL) {
6040 		FS_CALL(vnode, free_dir_cookie, descriptor->cookie);
6041 		put_vnode(vnode);
6042 	}
6043 }
6044 
6045 
6046 static status_t
6047 dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6048 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6049 {
6050 	return dir_read(ioContext, descriptor->u.vnode, descriptor->cookie, buffer,
6051 		bufferSize, _count);
6052 }
6053 
6054 
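/*! Fixes up a dirent as read from a file system: sets d_pdev/d_pino from
	\a parent, resolves ".." across a mount point's covered vnode, and
	replaces the dev/ino of covered vnodes with those of the topmost
	covering vnode.
*/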
6055 static status_t
6056 fix_dirent(struct vnode* parent, struct dirent* entry,
6057 	struct io_context* ioContext)
6058 {
6059 	// set d_pdev and d_pino
6060 	entry->d_pdev = parent->device;
6061 	entry->d_pino = parent->id;
6062 
6063 	// If this is the ".." entry and the directory is covering another
6064 	// vnode, we need to replace d_dev and d_ino with the actual values.
6065 	if (strcmp(entry->d_name, "..") == 0 && parent->IsCovering()) {
6066 		return resolve_covered_parent(parent, &entry->d_dev, &entry->d_ino,
6067 			ioContext);
6068 	}
6069 
6070 	// resolve covered vnodes
6071 	ReadLocker _(&sVnodeLock);
6072 
6073 	struct vnode* vnode = lookup_vnode(entry->d_dev, entry->d_ino);
6074 	if (vnode != NULL && vnode->covered_by != NULL) {
6075 		do {
6076 			vnode = vnode->covered_by;
6077 		} while (vnode->covered_by != NULL);
6078 
6079 		entry->d_dev = vnode->device;
6080 		entry->d_ino = vnode->id;
6081 	}
6082 
6083 	return B_OK;
6084 }
6085 
6086 
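/*! Reads the next dirents from the directory \a vnode into \a buffer and
	passes each of them through fix_dirent().
*/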
6087 static status_t
6088 dir_read(struct io_context* ioContext, struct vnode* vnode, void* cookie,
6089 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6090 {
6091 	if (!HAS_FS_CALL(vnode, read_dir))
6092 		return B_UNSUPPORTED;
6093 
6094 	status_t error = FS_CALL(vnode, read_dir, cookie, buffer, bufferSize,
6095 		_count);
6096 	if (error != B_OK)
6097 		return error;
6098 
6099 	// we need to adjust the read dirents
6100 	uint32 count = *_count;
6101 	for (uint32 i = 0; i < count; i++) {
6102 		error = fix_dirent(vnode, buffer, ioContext);
6103 		if (error != B_OK)
6104 			return error;
6105 
6106 		buffer = (struct dirent*)((uint8*)buffer + buffer->d_reclen);
6107 	}
6108 
6109 	return error;
6110 }
6111 
6112 
6113 static status_t
6114 dir_rewind(struct file_descriptor* descriptor)
6115 {
6116 	struct vnode* vnode = descriptor->u.vnode;
6117 
6118 	if (HAS_FS_CALL(vnode, rewind_dir)) {
6119 		return FS_CALL(vnode, rewind_dir, descriptor->cookie);
6120 	}
6121 
6122 	return B_UNSUPPORTED;
6123 }
6124 
6125 
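/*! Removes the directory at the FD + path location. The path is first
	normalized so that it doesn't end in "/" or "/.", and removing "." or
	".." is rejected with B_NOT_ALLOWED.
*/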
6126 static status_t
6127 dir_remove(int fd, char* path, bool kernel)
6128 {
6129 	char name[B_FILE_NAME_LENGTH];
6130 	struct vnode* directory;
6131 	status_t status;
6132 
6133 	if (path != NULL) {
6134 		// we need to make sure our path name doesn't end with "/", ".",
6135 		// or ".."
6136 		char* lastSlash;
6137 		while ((lastSlash = strrchr(path, '/')) != NULL) {
6138 			char* leaf = lastSlash + 1;
6139 			if (!strcmp(leaf, ".."))
6140 				return B_NOT_ALLOWED;
6141 
6142 			// omit multiple slashes
6143 			while (lastSlash > path && lastSlash[-1] == '/')
6144 				lastSlash--;
6145 
6146 			if (leaf[0] != '\0' && strcmp(leaf, ".") != 0) {
6148 				break;
6149 			}
6150 			// "name/" -> "name", or "name/." -> "name"
6151 			lastSlash[0] = '\0';
6152 		}
6153 
6154 		if (!strcmp(path, ".") || !strcmp(path, ".."))
6155 			return B_NOT_ALLOWED;
6156 	}
6157 
6158 	status = fd_and_path_to_dir_vnode(fd, path, &directory, name, kernel);
6159 	if (status != B_OK)
6160 		return status;
6161 
6162 	if (HAS_FS_CALL(directory, remove_dir))
6163 		status = FS_CALL(directory, remove_dir, name);
6164 	else
6165 		status = B_READ_ONLY_DEVICE;
6166 
6167 	put_vnode(directory);
6168 	return status;
6169 }
6170 
6171 
6172 static status_t
6173 common_ioctl(struct file_descriptor* descriptor, ulong op, void* buffer,
6174 	size_t length)
6175 {
6176 	struct vnode* vnode = descriptor->u.vnode;
6177 
6178 	if (HAS_FS_CALL(vnode, ioctl))
6179 		return FS_CALL(vnode, ioctl, descriptor->cookie, op, buffer, length);
6180 
6181 	return B_DEV_INVALID_IOCTL;
6182 }
6183 
6184 
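/*! Implements fcntl(): FD flags (F_SETFD/F_GETFD), the open mode
	(F_SETFL/F_GETFL -- only O_APPEND and O_NONBLOCK may be changed),
	duplication (F_DUPFD/F_DUPFD_CLOEXEC), and advisory file locking
	(F_GETLK/F_SETLK/F_SETLKW).
*/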
6185 static status_t
6186 common_fcntl(int fd, int op, size_t argument, bool kernel)
6187 {
6188 	struct flock flock;
6189 
6190 	FUNCTION(("common_fcntl(fd = %d, op = %d, argument = %lx, %s)\n",
6191 		fd, op, argument, kernel ? "kernel" : "user"));
6192 
6193 	struct io_context* context = get_current_io_context(kernel);
6194 
6195 	struct file_descriptor* descriptor = get_fd(context, fd);
6196 	if (descriptor == NULL)
6197 		return B_FILE_ERROR;
6198 
6199 	struct vnode* vnode = fd_vnode(descriptor);
6200 
6201 	status_t status = B_OK;
6202 
6203 	if (op == F_SETLK || op == F_SETLKW || op == F_GETLK) {
6204 		if (descriptor->type != FDTYPE_FILE)
6205 			status = B_BAD_VALUE;
6206 		else if (kernel)
6207 			memcpy(&flock, (struct flock*)argument, sizeof(struct flock));
6208 		else if (user_memcpy(&flock, (struct flock*)argument,
6209 				sizeof(struct flock)) != B_OK)
6210 			status = B_BAD_ADDRESS;
6211 		if (status != B_OK) {
6212 			put_fd(descriptor);
6213 			return status;
6214 		}
6215 	}
6216 
6217 	switch (op) {
6218 		case F_SETFD:
6219 		{
6220 			// Set file descriptor flags
6221 
6222 			// O_CLOEXEC is the only flag available at this time
6223 			mutex_lock(&context->io_mutex);
6224 			fd_set_close_on_exec(context, fd, (argument & FD_CLOEXEC) != 0);
6225 			mutex_unlock(&context->io_mutex);
6226 
6227 			status = B_OK;
6228 			break;
6229 		}
6230 
6231 		case F_GETFD:
6232 		{
6233 			// Get file descriptor flags
6234 			mutex_lock(&context->io_mutex);
6235 			status = fd_close_on_exec(context, fd) ? FD_CLOEXEC : 0;
6236 			mutex_unlock(&context->io_mutex);
6237 			break;
6238 		}
6239 
6240 		case F_SETFL:
6241 			// Set file descriptor open mode
6242 
6243 			// we only accept changes to O_APPEND and O_NONBLOCK
6244 			argument &= O_APPEND | O_NONBLOCK;
6245 			if (descriptor->ops->fd_set_flags != NULL) {
6246 				status = descriptor->ops->fd_set_flags(descriptor, argument);
6247 			} else if (vnode != NULL && HAS_FS_CALL(vnode, set_flags)) {
6248 				status = FS_CALL(vnode, set_flags, descriptor->cookie,
6249 					(int)argument);
6250 			} else
6251 				status = B_UNSUPPORTED;
6252 
6253 			if (status == B_OK) {
6254 				// update this descriptor's open_mode field
6255 				descriptor->open_mode = (descriptor->open_mode
6256 					& ~(O_APPEND | O_NONBLOCK)) | argument;
6257 			}
6258 
6259 			break;
6260 
6261 		case F_GETFL:
6262 			// Get file descriptor open mode
6263 			status = descriptor->open_mode;
6264 			break;
6265 
6266 		case F_DUPFD:
6267 		case F_DUPFD_CLOEXEC:
6268 		{
6269 			status = new_fd_etc(context, descriptor, (int)argument);
6270 			if (status >= 0) {
6271 				mutex_lock(&context->io_mutex);
6272 				fd_set_close_on_exec(context, status, op == F_DUPFD_CLOEXEC);
6273 				mutex_unlock(&context->io_mutex);
6274 
6275 				atomic_add(&descriptor->ref_count, 1);
6276 			}
6277 			break;
6278 		}
6279 
6280 		case F_GETLK:
6281 			if (vnode != NULL) {
6282 				struct flock normalizedLock;
6283 
6284 				memcpy(&normalizedLock, &flock, sizeof(struct flock));
6285 				status = normalize_flock(descriptor, &normalizedLock);
6286 				if (status != B_OK)
6287 					break;
6288 
6289 				if (HAS_FS_CALL(vnode, test_lock)) {
6290 					status = FS_CALL(vnode, test_lock, descriptor->cookie,
6291 						&normalizedLock);
6292 				} else
6293 					status = test_advisory_lock(vnode, &normalizedLock);
6294 				if (status == B_OK) {
6295 					if (normalizedLock.l_type == F_UNLCK) {
6296 						// no conflicting lock found, copy back the same struct
6297 						// we were given except change type to F_UNLCK
6298 						flock.l_type = F_UNLCK;
6299 						if (kernel) {
6300 							memcpy((struct flock*)argument, &flock,
6301 								sizeof(struct flock));
6302 						} else {
6303 							status = user_memcpy((struct flock*)argument,
6304 								&flock, sizeof(struct flock));
6305 						}
6306 					} else {
6307 						// a conflicting lock was found, copy back its range and
6308 						// type
6309 						if (normalizedLock.l_len == OFF_MAX)
6310 							normalizedLock.l_len = 0;
6311 
6312 						if (kernel) {
6313 							memcpy((struct flock*)argument,
6314 								&normalizedLock, sizeof(struct flock));
6315 						} else {
6316 							status = user_memcpy((struct flock*)argument,
6317 								&normalizedLock, sizeof(struct flock));
6318 						}
6319 					}
6320 				}
6321 			} else
6322 				status = B_BAD_VALUE;
6323 			break;
6324 
6325 		case F_SETLK:
6326 		case F_SETLKW:
6327 			status = normalize_flock(descriptor, &flock);
6328 			if (status != B_OK)
6329 				break;
6330 
6331 			if (vnode == NULL) {
6332 				status = B_BAD_VALUE;
6333 			} else if (flock.l_type == F_UNLCK) {
6334 				if (HAS_FS_CALL(vnode, release_lock)) {
6335 					status = FS_CALL(vnode, release_lock, descriptor->cookie,
6336 						&flock);
6337 				} else {
6338 					status = release_advisory_lock(vnode, context, NULL,
6339 						&flock);
6340 				}
6341 			} else {
6342 				// the open mode must match the lock type
6343 				if (((descriptor->open_mode & O_RWMASK) == O_RDONLY
6344 						&& flock.l_type == F_WRLCK)
6345 					|| ((descriptor->open_mode & O_RWMASK) == O_WRONLY
6346 						&& flock.l_type == F_RDLCK))
6347 					status = B_FILE_ERROR;
6348 				else {
6349 					if (HAS_FS_CALL(vnode, acquire_lock)) {
6350 						status = FS_CALL(vnode, acquire_lock,
6351 							descriptor->cookie, &flock, op == F_SETLKW);
6352 					} else {
6353 						status = acquire_advisory_lock(vnode, context, NULL,
6354 							&flock, op == F_SETLKW);
6355 					}
6356 				}
6357 			}
6358 			break;
6359 
6360 		// ToDo: add support for more ops?
6361 
6362 		default:
6363 			status = B_BAD_VALUE;
6364 	}
6365 
6366 	put_fd(descriptor);
6367 	return status;
6368 }
6369 
6370 
6371 static status_t
6372 common_sync(int fd, bool kernel)
6373 {
6374 	struct file_descriptor* descriptor;
6375 	struct vnode* vnode;
6376 	status_t status;
6377 
6378 	FUNCTION(("common_sync: entry. fd %d kernel %d\n", fd, kernel));
6379 
6380 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6381 	if (descriptor == NULL)
6382 		return B_FILE_ERROR;
6383 
6384 	if (HAS_FS_CALL(vnode, fsync))
6385 		status = FS_CALL_NO_PARAMS(vnode, fsync);
6386 	else
6387 		status = B_UNSUPPORTED;
6388 
6389 	put_fd(descriptor);
6390 	return status;
6391 }
6392 
6393 
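/*! Puts a mandatory lock on the node behind \a fd by atomically storing
	the descriptor in the vnode's mandatory_locked_by field; fails with
	B_BUSY if another descriptor already holds the lock.
*/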
6394 static status_t
6395 common_lock_node(int fd, bool kernel)
6396 {
6397 	struct file_descriptor* descriptor;
6398 	struct vnode* vnode;
6399 
6400 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6401 	if (descriptor == NULL)
6402 		return B_FILE_ERROR;
6403 
6404 	status_t status = B_OK;
6405 
6406 	// We need to set the lock atomically - someone
6407 	// else might set one at the same time
6408 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by, descriptor,
6409 			(file_descriptor*)NULL) != NULL)
6410 		status = B_BUSY;
6411 
6412 	put_fd(descriptor);
6413 	return status;
6414 }
6415 
6416 
6417 static status_t
6418 common_unlock_node(int fd, bool kernel)
6419 {
6420 	struct file_descriptor* descriptor;
6421 	struct vnode* vnode;
6422 
6423 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6424 	if (descriptor == NULL)
6425 		return B_FILE_ERROR;
6426 
6427 	status_t status = B_OK;
6428 
6429 	// We need to clear the lock atomically - someone
6430 	// else might set one at the same time
6431 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
6432 			(file_descriptor*)NULL, descriptor) != descriptor)
6433 		status = B_BAD_VALUE;
6434 
6435 	put_fd(descriptor);
6436 	return status;
6437 }
6438 
6439 
6440 static status_t
6441 common_read_link(int fd, char* path, char* buffer, size_t* _bufferSize,
6442 	bool kernel)
6443 {
6444 	struct vnode* vnode;
6445 	status_t status;
6446 
6447 	status = fd_and_path_to_vnode(fd, path, false, &vnode, NULL, kernel);
6448 	if (status != B_OK)
6449 		return status;
6450 
6451 	if (HAS_FS_CALL(vnode, read_symlink)) {
6452 		status = FS_CALL(vnode, read_symlink, buffer, _bufferSize);
6453 	} else
6454 		status = B_BAD_VALUE;
6455 
6456 	put_vnode(vnode);
6457 	return status;
6458 }
6459 
6460 
6461 static status_t
6462 common_create_symlink(int fd, char* path, const char* toPath, int mode,
6463 	bool kernel)
6464 {
6465 	// path validity checks have to be in the calling function!
6466 	char name[B_FILE_NAME_LENGTH];
6467 	struct vnode* vnode;
6468 	status_t status;
6469 
6470 	FUNCTION(("common_create_symlink(fd = %d, path = %s, toPath = %s, "
6471 		"mode = %d, kernel = %d)\n", fd, path, toPath, mode, kernel));
6472 
6473 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, name, kernel);
6474 	if (status != B_OK)
6475 		return status;
6476 
6477 	if (HAS_FS_CALL(vnode, create_symlink))
6478 		status = FS_CALL(vnode, create_symlink, name, toPath, mode);
6479 	else {
6480 		status = HAS_FS_CALL(vnode, write)
6481 			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6482 	}
6483 
6484 	put_vnode(vnode);
6485 
6486 	return status;
6487 }
6488 
6489 
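/*! Creates a hard link at the (\a pathFD, \a path) location that refers
	to the node given by (\a toFD, \a toPath). Both must live on the same
	mount, or B_CROSS_DEVICE_LINK is returned.
*/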
6490 static status_t
6491 common_create_link(int pathFD, char* path, int toFD, char* toPath,
6492 	bool traverseLeafLink, bool kernel)
6493 {
6494 	// path validity checks have to be in the calling function!
6495 
6496 	FUNCTION(("common_create_link(path = %s, toPath = %s, kernel = %d)\n", path,
6497 		toPath, kernel));
6498 
6499 	char name[B_FILE_NAME_LENGTH];
6500 	struct vnode* directory;
6501 	status_t status = fd_and_path_to_dir_vnode(pathFD, path, &directory, name,
6502 		kernel);
6503 	if (status != B_OK)
6504 		return status;
6505 
6506 	struct vnode* vnode;
6507 	status = fd_and_path_to_vnode(toFD, toPath, traverseLeafLink, &vnode, NULL,
6508 		kernel);
6509 	if (status != B_OK)
6510 		goto err;
6511 
6512 	if (directory->mount != vnode->mount) {
6513 		status = B_CROSS_DEVICE_LINK;
6514 		goto err1;
6515 	}
6516 
6517 	if (HAS_FS_CALL(directory, link))
6518 		status = FS_CALL(directory, link, name, vnode);
6519 	else
6520 		status = B_READ_ONLY_DEVICE;
6521 
6522 err1:
6523 	put_vnode(vnode);
6524 err:
6525 	put_vnode(directory);
6526 
6527 	return status;
6528 }
6529 
6530 
6531 static status_t
6532 common_unlink(int fd, char* path, bool kernel)
6533 {
6534 	char filename[B_FILE_NAME_LENGTH];
6535 	struct vnode* vnode;
6536 	status_t status;
6537 
6538 	FUNCTION(("common_unlink: fd: %d, path '%s', kernel %d\n", fd, path,
6539 		kernel));
6540 
6541 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
6542 	if (status < 0)
6543 		return status;
6544 
6545 	if (HAS_FS_CALL(vnode, unlink))
6546 		status = FS_CALL(vnode, unlink, filename);
6547 	else
6548 		status = B_READ_ONLY_DEVICE;
6549 
6550 	put_vnode(vnode);
6551 
6552 	return status;
6553 }
6554 
6555 
6556 static status_t
6557 common_access(int fd, char* path, int mode, bool effectiveUserGroup, bool kernel)
6558 {
6559 	struct vnode* vnode;
6560 	status_t status;
6561 
6562 	// TODO: honor effectiveUserGroup argument
6563 
6564 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
6565 	if (status != B_OK)
6566 		return status;
6567 
6568 	if (HAS_FS_CALL(vnode, access))
6569 		status = FS_CALL(vnode, access, mode);
6570 	else
6571 		status = B_OK;
6572 
6573 	put_vnode(vnode);
6574 
6575 	return status;
6576 }
6577 
6578 
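/*! Renames the entry at (\a fd, \a path) to (\a newFD, \a newPath). Both
	locations must be on the same volume, and neither name may be empty,
	".", or "..".
*/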
6579 static status_t
6580 common_rename(int fd, char* path, int newFD, char* newPath, bool kernel)
6581 {
6582 	struct vnode* fromVnode;
6583 	struct vnode* toVnode;
6584 	char fromName[B_FILE_NAME_LENGTH];
6585 	char toName[B_FILE_NAME_LENGTH];
6586 	status_t status;
6587 
6588 	FUNCTION(("common_rename(fd = %d, path = %s, newFD = %d, newPath = %s, "
6589 		"kernel = %d)\n", fd, path, newFD, newPath, kernel));
6590 
6591 	status = fd_and_path_to_dir_vnode(fd, path, &fromVnode, fromName, kernel);
6592 	if (status != B_OK)
6593 		return status;
6594 
6595 	status = fd_and_path_to_dir_vnode(newFD, newPath, &toVnode, toName, kernel);
6596 	if (status != B_OK)
6597 		goto err1;
6598 
6599 	if (fromVnode->device != toVnode->device) {
6600 		status = B_CROSS_DEVICE_LINK;
6601 		goto err2;
6602 	}
6603 
6604 	if (fromName[0] == '\0' || toName[0] == '\0'
6605 		|| !strcmp(fromName, ".") || !strcmp(fromName, "..")
6606 		|| !strcmp(toName, ".") || !strcmp(toName, "..")
6607 		|| (fromVnode == toVnode && !strcmp(fromName, toName))) {
6608 		status = B_BAD_VALUE;
6609 		goto err2;
6610 	}
6611 
6612 	if (HAS_FS_CALL(fromVnode, rename))
6613 		status = FS_CALL(fromVnode, rename, fromName, toVnode, toName);
6614 	else
6615 		status = B_READ_ONLY_DEVICE;
6616 
6617 err2:
6618 	put_vnode(toVnode);
6619 err1:
6620 	put_vnode(fromVnode);
6621 
6622 	return status;
6623 }
6624 
6625 
6626 static status_t
6627 common_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6628 {
6629 	struct vnode* vnode = descriptor->u.vnode;
6630 
6631 	FUNCTION(("common_read_stat: stat %p\n", stat));
6632 
6633 	// TODO: remove this once all file systems properly set them!
6634 	stat->st_crtim.tv_nsec = 0;
6635 	stat->st_ctim.tv_nsec = 0;
6636 	stat->st_mtim.tv_nsec = 0;
6637 	stat->st_atim.tv_nsec = 0;
6638 
6639 	return vfs_stat_vnode(vnode, stat);
6640 }
6641 
6642 
6643 static status_t
6644 common_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6645 	int statMask)
6646 {
6647 	struct vnode* vnode = descriptor->u.vnode;
6648 
6649 	FUNCTION(("common_write_stat(vnode = %p, stat = %p, statMask = %d)\n",
6650 		vnode, stat, statMask));
6651 
6652 	if ((descriptor->open_mode & O_RWMASK) == O_RDONLY
6653 		&& (statMask & B_STAT_SIZE) != 0) {
6654 		return B_BAD_VALUE;
6655 	}
6656 
6657 	if (!HAS_FS_CALL(vnode, write_stat))
6658 		return B_READ_ONLY_DEVICE;
6659 
6660 	return FS_CALL(vnode, write_stat, stat, statMask);
6661 }
6662 
6663 
6664 static status_t
6665 common_path_read_stat(int fd, char* path, bool traverseLeafLink,
6666 	struct stat* stat, bool kernel)
6667 {
6668 	FUNCTION(("common_path_read_stat: fd: %d, path '%s', stat %p\n", fd, path,
6669 		stat));
6670 
6671 	struct vnode* vnode;
6672 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6673 		NULL, kernel);
6674 	if (status != B_OK)
6675 		return status;
6676 
6677 	status = vfs_stat_vnode(vnode, stat);
6678 
6679 	put_vnode(vnode);
6680 	return status;
6681 }
6682 
6683 
6684 static status_t
6685 common_path_write_stat(int fd, char* path, bool traverseLeafLink,
6686 	const struct stat* stat, int statMask, bool kernel)
6687 {
6688 	FUNCTION(("common_path_write_stat: fd: %d, path '%s', stat %p, "
6689 		"stat_mask %d, kernel %d\n", fd, path, stat, statMask, kernel));
6690 
6691 	struct vnode* vnode;
6692 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6693 		NULL, kernel);
6694 	if (status != B_OK)
6695 		return status;
6696 
6697 	if (HAS_FS_CALL(vnode, write_stat))
6698 		status = FS_CALL(vnode, write_stat, stat, statMask);
6699 	else
6700 		status = B_READ_ONLY_DEVICE;
6701 
6702 	put_vnode(vnode);
6703 
6704 	return status;
6705 }
6706 
6707 
6708 static int
6709 attr_dir_open(int fd, char* path, bool traverseLeafLink, bool kernel)
6710 {
6711 	FUNCTION(("attr_dir_open(fd = %d, path = '%s', kernel = %d)\n", fd, path,
6712 		kernel));
6713 
6714 	struct vnode* vnode;
6715 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6716 		NULL, kernel);
6717 	if (status != B_OK)
6718 		return status;
6719 
6720 	status = open_attr_dir_vnode(vnode, kernel);
6721 	if (status < 0)
6722 		put_vnode(vnode);
6723 
6724 	return status;
6725 }
6726 
6727 
6728 static status_t
6729 attr_dir_close(struct file_descriptor* descriptor)
6730 {
6731 	struct vnode* vnode = descriptor->u.vnode;
6732 
6733 	FUNCTION(("attr_dir_close(descriptor = %p)\n", descriptor));
6734 
6735 	if (HAS_FS_CALL(vnode, close_attr_dir))
6736 		return FS_CALL(vnode, close_attr_dir, descriptor->cookie);
6737 
6738 	return B_OK;
6739 }
6740 
6741 
6742 static void
6743 attr_dir_free_fd(struct file_descriptor* descriptor)
6744 {
6745 	struct vnode* vnode = descriptor->u.vnode;
6746 
6747 	if (vnode != NULL) {
6748 		FS_CALL(vnode, free_attr_dir_cookie, descriptor->cookie);
6749 		put_vnode(vnode);
6750 	}
6751 }
6752 
6753 
6754 static status_t
6755 attr_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6756 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6757 {
6758 	struct vnode* vnode = descriptor->u.vnode;
6759 
6760 	FUNCTION(("attr_dir_read(descriptor = %p)\n", descriptor));
6761 
6762 	if (HAS_FS_CALL(vnode, read_attr_dir))
6763 		return FS_CALL(vnode, read_attr_dir, descriptor->cookie, buffer,
6764 			bufferSize, _count);
6765 
6766 	return B_UNSUPPORTED;
6767 }
6768 
6769 
6770 static status_t
6771 attr_dir_rewind(struct file_descriptor* descriptor)
6772 {
6773 	struct vnode* vnode = descriptor->u.vnode;
6774 
6775 	FUNCTION(("attr_dir_rewind(descriptor = %p)\n", descriptor));
6776 
6777 	if (HAS_FS_CALL(vnode, rewind_attr_dir))
6778 		return FS_CALL(vnode, rewind_attr_dir, descriptor->cookie);
6779 
6780 	return B_UNSUPPORTED;
6781 }
6782 
6783 
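/*! Creates the attribute \a name of \a type on the node at the FD + path
	location and opens it. If no FD can be allocated afterwards, the just
	created attribute is removed again.
*/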
6784 static int
6785 attr_create(int fd, char* path, const char* name, uint32 type,
6786 	int openMode, bool kernel)
6787 {
6788 	if (name == NULL || *name == '\0')
6789 		return B_BAD_VALUE;
6790 
6791 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6792 	struct vnode* vnode;
6793 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6794 		kernel);
6795 	if (status != B_OK)
6796 		return status;
6797 
6798 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6799 		status = B_LINK_LIMIT;
6800 		goto err;
6801 	}
6802 
6803 	if (!HAS_FS_CALL(vnode, create_attr)) {
6804 		status = B_READ_ONLY_DEVICE;
6805 		goto err;
6806 	}
6807 
6808 	void* cookie;
6809 	status = FS_CALL(vnode, create_attr, name, type, openMode, &cookie);
6810 	if (status != B_OK)
6811 		goto err;
6812 
6813 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6814 	if (fd >= 0)
6815 		return fd;
6816 
6817 	status = fd;
6818 
6819 	FS_CALL(vnode, close_attr, cookie);
6820 	FS_CALL(vnode, free_attr_cookie, cookie);
6821 
6822 	FS_CALL(vnode, remove_attr, name);
6823 
6824 err:
6825 	put_vnode(vnode);
6826 
6827 	return status;
6828 }
6829 
6830 
6831 static int
6832 attr_open(int fd, char* path, const char* name, int openMode, bool kernel)
6833 {
6834 	if (name == NULL || *name == '\0')
6835 		return B_BAD_VALUE;
6836 
6837 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6838 	struct vnode* vnode;
6839 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6840 		kernel);
6841 	if (status != B_OK)
6842 		return status;
6843 
6844 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6845 		status = B_LINK_LIMIT;
6846 		goto err;
6847 	}
6848 
6849 	if (!HAS_FS_CALL(vnode, open_attr)) {
6850 		status = B_UNSUPPORTED;
6851 		goto err;
6852 	}
6853 
6854 	void* cookie;
6855 	status = FS_CALL(vnode, open_attr, name, openMode, &cookie);
6856 	if (status != B_OK)
6857 		goto err;
6858 
6859 	// now we only need a file descriptor for this attribute and we're done
6860 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6861 	if (fd >= 0)
6862 		return fd;
6863 
6864 	status = fd;
6865 
6866 	FS_CALL(vnode, close_attr, cookie);
6867 	FS_CALL(vnode, free_attr_cookie, cookie);
6868 
6869 err:
6870 	put_vnode(vnode);
6871 
6872 	return status;
6873 }
6874 
6875 
6876 static status_t
6877 attr_close(struct file_descriptor* descriptor)
6878 {
6879 	struct vnode* vnode = descriptor->u.vnode;
6880 
6881 	FUNCTION(("attr_close(descriptor = %p)\n", descriptor));
6882 
6883 	if (HAS_FS_CALL(vnode, close_attr))
6884 		return FS_CALL(vnode, close_attr, descriptor->cookie);
6885 
6886 	return B_OK;
6887 }
6888 
6889 
6890 static void
6891 attr_free_fd(struct file_descriptor* descriptor)
6892 {
6893 	struct vnode* vnode = descriptor->u.vnode;
6894 
6895 	if (vnode != NULL) {
6896 		FS_CALL(vnode, free_attr_cookie, descriptor->cookie);
6897 		put_vnode(vnode);
6898 	}
6899 }
6900 
6901 
6902 static status_t
6903 attr_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
6904 	size_t* length)
6905 {
6906 	struct vnode* vnode = descriptor->u.vnode;
6907 
6908 	FUNCTION(("attr_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
6909 		pos, length, *length));
6910 
6911 	if (!HAS_FS_CALL(vnode, read_attr))
6912 		return B_UNSUPPORTED;
6913 
6914 	return FS_CALL(vnode, read_attr, descriptor->cookie, pos, buffer, length);
6915 }
6916 
6917 
6918 static status_t
6919 attr_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
6920 	size_t* length)
6921 {
6922 	struct vnode* vnode = descriptor->u.vnode;
6923 
6924 	FUNCTION(("attr_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
6925 		length));
6926 
6927 	if (!HAS_FS_CALL(vnode, write_attr))
6928 		return B_UNSUPPORTED;
6929 
6930 	return FS_CALL(vnode, write_attr, descriptor->cookie, pos, buffer, length);
6931 }
6932 
6933 
6934 static off_t
6935 attr_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
6936 {
6937 	off_t offset;
6938 
6939 	switch (seekType) {
6940 		case SEEK_SET:
6941 			offset = 0;
6942 			break;
6943 		case SEEK_CUR:
6944 			offset = descriptor->pos;
6945 			break;
6946 		case SEEK_END:
6947 		{
6948 			struct vnode* vnode = descriptor->u.vnode;
6949 			if (!HAS_FS_CALL(vnode, read_attr_stat))
6950 				return B_UNSUPPORTED;
6951 
6952 			struct stat stat;
6953 			status_t status = FS_CALL(vnode, read_attr_stat, descriptor->cookie,
6954 				&stat);
6955 			if (status != B_OK)
6956 				return status;
6957 
6958 			offset = stat.st_size;
6959 			break;
6960 		}
6961 		default:
6962 			return B_BAD_VALUE;
6963 	}
6964 
6965 	// assumes off_t is 64 bits wide
6966 	if (offset > 0 && LONGLONG_MAX - offset < pos)
6967 		return B_BUFFER_OVERFLOW;
6968 
6969 	pos += offset;
6970 	if (pos < 0)
6971 		return B_BAD_VALUE;
6972 
6973 	return descriptor->pos = pos;
6974 }
6975 
6976 
6977 static status_t
6978 attr_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6979 {
6980 	struct vnode* vnode = descriptor->u.vnode;
6981 
6982 	FUNCTION(("attr_read_stat: stat 0x%p\n", stat));
6983 
6984 	if (!HAS_FS_CALL(vnode, read_attr_stat))
6985 		return B_UNSUPPORTED;
6986 
6987 	return FS_CALL(vnode, read_attr_stat, descriptor->cookie, stat);
6988 }
6989 
6990 
6991 static status_t
6992 attr_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6993 	int statMask)
6994 {
6995 	struct vnode* vnode = descriptor->u.vnode;
6996 
6997 	FUNCTION(("attr_write_stat: stat = %p, statMask %d\n", stat, statMask));
6998 
6999 	if (!HAS_FS_CALL(vnode, write_attr_stat))
7000 		return B_READ_ONLY_DEVICE;
7001 
7002 	return FS_CALL(vnode, write_attr_stat, descriptor->cookie, stat, statMask);
7003 }
7004 
7005 
7006 static status_t
7007 attr_remove(int fd, const char* name, bool kernel)
7008 {
7009 	struct file_descriptor* descriptor;
7010 	struct vnode* vnode;
7011 	status_t status;
7012 
7013 	if (name == NULL || *name == '\0')
7014 		return B_BAD_VALUE;
7015 
7016 	FUNCTION(("attr_remove: fd = %d, name = \"%s\", kernel %d\n", fd, name,
7017 		kernel));
7018 
7019 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
7020 	if (descriptor == NULL)
7021 		return B_FILE_ERROR;
7022 
7023 	if (HAS_FS_CALL(vnode, remove_attr))
7024 		status = FS_CALL(vnode, remove_attr, name);
7025 	else
7026 		status = B_READ_ONLY_DEVICE;
7027 
7028 	put_fd(descriptor);
7029 
7030 	return status;
7031 }
7032 
7033 
7034 static status_t
7035 attr_rename(int fromFD, const char* fromName, int toFD, const char* toName,
7036 	bool kernel)
7037 {
7038 	struct file_descriptor* fromDescriptor;
7039 	struct file_descriptor* toDescriptor;
7040 	struct vnode* fromVnode;
7041 	struct vnode* toVnode;
7042 	status_t status;
7043 
7044 	if (fromName == NULL || *fromName == '\0' || toName == NULL
7045 		|| *toName == '\0')
7046 		return B_BAD_VALUE;
7047 
7048 	FUNCTION(("attr_rename: from fd = %d, from name = \"%s\", to fd = %d, to "
7049 		"name = \"%s\", kernel %d\n", fromFD, fromName, toFD, toName, kernel));
7050 
7051 	fromDescriptor = get_fd_and_vnode(fromFD, &fromVnode, kernel);
7052 	if (fromDescriptor == NULL)
7053 		return B_FILE_ERROR;
7054 
7055 	toDescriptor = get_fd_and_vnode(toFD, &toVnode, kernel);
7056 	if (toDescriptor == NULL) {
7057 		status = B_FILE_ERROR;
7058 		goto err;
7059 	}
7060 
7061 	// are the files on the same volume?
7062 	if (fromVnode->device != toVnode->device) {
7063 		status = B_CROSS_DEVICE_LINK;
7064 		goto err1;
7065 	}
7066 
7067 	if (HAS_FS_CALL(fromVnode, rename_attr)) {
7068 		status = FS_CALL(fromVnode, rename_attr, fromName, toVnode, toName);
7069 	} else
7070 		status = B_READ_ONLY_DEVICE;
7071 
7072 err1:
7073 	put_fd(toDescriptor);
7074 err:
7075 	put_fd(fromDescriptor);
7076 
7077 	return status;
7078 }
7079 
7080 
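/*! Opens the index directory of the volume \a mountID.
	Returns a new FD on success, an error code otherwise.
*/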
7081 static int
7082 index_dir_open(dev_t mountID, bool kernel)
7083 {
7084 	struct fs_mount* mount;
7085 	void* cookie;
7086 
7087 	FUNCTION(("index_dir_open(mountID = %" B_PRId32 ", kernel = %d)\n", mountID,
7088 		kernel));
7089 
7090 	status_t status = get_mount(mountID, &mount);
7091 	if (status != B_OK)
7092 		return status;
7093 
7094 	if (!HAS_FS_MOUNT_CALL(mount, open_index_dir)) {
7095 		status = B_UNSUPPORTED;
7096 		goto error;
7097 	}
7098 
7099 	status = FS_MOUNT_CALL(mount, open_index_dir, &cookie);
7100 	if (status != B_OK)
7101 		goto error;
7102 
7103 	// get fd for the index directory
7104 	int fd;
7105 	fd = get_new_fd(FDTYPE_INDEX_DIR, mount, NULL, cookie, O_CLOEXEC, kernel);
7106 	if (fd >= 0)
7107 		return fd;
7108 
7109 	// something went wrong
7110 	FS_MOUNT_CALL(mount, close_index_dir, cookie);
7111 	FS_MOUNT_CALL(mount, free_index_dir_cookie, cookie);
7112 
7113 	status = fd;
7114 
7115 error:
7116 	put_mount(mount);
7117 	return status;
7118 }
7119 
7120 
7121 static status_t
7122 index_dir_close(struct file_descriptor* descriptor)
7123 {
7124 	struct fs_mount* mount = descriptor->u.mount;
7125 
7126 	FUNCTION(("index_dir_close(descriptor = %p)\n", descriptor));
7127 
7128 	if (HAS_FS_MOUNT_CALL(mount, close_index_dir))
7129 		return FS_MOUNT_CALL(mount, close_index_dir, descriptor->cookie);
7130 
7131 	return B_OK;
7132 }
7133 
7134 
7135 static void
7136 index_dir_free_fd(struct file_descriptor* descriptor)
7137 {
7138 	struct fs_mount* mount = descriptor->u.mount;
7139 
7140 	if (mount != NULL) {
7141 		FS_MOUNT_CALL(mount, free_index_dir_cookie, descriptor->cookie);
7142 		put_mount(mount);
7143 	}
7144 }
7145 
7146 
7147 static status_t
7148 index_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7149 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7150 {
7151 	struct fs_mount* mount = descriptor->u.mount;
7152 
7153 	if (HAS_FS_MOUNT_CALL(mount, read_index_dir)) {
7154 		return FS_MOUNT_CALL(mount, read_index_dir, descriptor->cookie, buffer,
7155 			bufferSize, _count);
7156 	}
7157 
7158 	return B_UNSUPPORTED;
7159 }
7160 
7161 
7162 static status_t
7163 index_dir_rewind(struct file_descriptor* descriptor)
7164 {
7165 	struct fs_mount* mount = descriptor->u.mount;
7166 
7167 	if (HAS_FS_MOUNT_CALL(mount, rewind_index_dir))
7168 		return FS_MOUNT_CALL(mount, rewind_index_dir, descriptor->cookie);
7169 
7170 	return B_UNSUPPORTED;
7171 }
7172 
7173 
7174 static status_t
7175 index_create(dev_t mountID, const char* name, uint32 type, uint32 flags,
7176 	bool kernel)
7177 {
7178 	FUNCTION(("index_create(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7179 		mountID, name, kernel));
7180 
7181 	struct fs_mount* mount;
7182 	status_t status = get_mount(mountID, &mount);
7183 	if (status != B_OK)
7184 		return status;
7185 
7186 	if (!HAS_FS_MOUNT_CALL(mount, create_index)) {
7187 		status = B_READ_ONLY_DEVICE;
7188 		goto out;
7189 	}
7190 
7191 	status = FS_MOUNT_CALL(mount, create_index, name, type, flags);
7192 
7193 out:
7194 	put_mount(mount);
7195 	return status;
7196 }
7197 
7198 
7199 #if 0
7200 static status_t
7201 index_read_stat(struct file_descriptor* descriptor, struct stat* stat)
7202 {
7203 	struct vnode* vnode = descriptor->u.vnode;
7204 
7205 	// ToDo: currently unused!
7206 	FUNCTION(("index_read_stat: stat 0x%p\n", stat));
7207 	if (!HAS_FS_CALL(vnode, read_index_stat))
7208 		return B_UNSUPPORTED;
7209 
7210 	return B_UNSUPPORTED;
7211 	//return FS_CALL(vnode, read_index_stat, descriptor->cookie, stat);
7212 }
7213 
7214 
7215 static void
7216 index_free_fd(struct file_descriptor* descriptor)
7217 {
7218 	struct vnode* vnode = descriptor->u.vnode;
7219 
7220 	if (vnode != NULL) {
7221 		FS_CALL(vnode, free_index_cookie, descriptor->cookie);
7222 		put_vnode(vnode);
7223 	}
7224 }
7225 #endif
7226 
7227 
7228 static status_t
7229 index_name_read_stat(dev_t mountID, const char* name, struct stat* stat,
7230 	bool kernel)
7231 {
7232 	FUNCTION(("index_name_read_stat(mountID = %" B_PRId32 ", name = %s, "
7233 		"kernel = %d)\n", mountID, name, kernel));
7234 
7235 	struct fs_mount* mount;
7236 	status_t status = get_mount(mountID, &mount);
7237 	if (status != B_OK)
7238 		return status;
7239 
7240 	if (!HAS_FS_MOUNT_CALL(mount, read_index_stat)) {
7241 		status = B_UNSUPPORTED;
7242 		goto out;
7243 	}
7244 
7245 	status = FS_MOUNT_CALL(mount, read_index_stat, name, stat);
7246 
7247 out:
7248 	put_mount(mount);
7249 	return status;
7250 }
7251 
7252 
7253 static status_t
7254 index_remove(dev_t mountID, const char* name, bool kernel)
7255 {
7256 	FUNCTION(("index_remove(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7257 		mountID, name, kernel));
7258 
7259 	struct fs_mount* mount;
7260 	status_t status = get_mount(mountID, &mount);
7261 	if (status != B_OK)
7262 		return status;
7263 
7264 	if (!HAS_FS_MOUNT_CALL(mount, remove_index)) {
7265 		status = B_READ_ONLY_DEVICE;
7266 		goto out;
7267 	}
7268 
7269 	status = FS_MOUNT_CALL(mount, remove_index, name);
7270 
7271 out:
7272 	put_mount(mount);
7273 	return status;
7274 }
7275 
7276 
7277 /*!	TODO: the query FS API is still pretty much the same as in R5.
7278 		It would be nice if queries would get some more kernel
7279 		support.
7280 		For example, query parsing should be moved into the kernel.
7281 */
7282 static int
7283 query_open(dev_t device, const char* query, uint32 flags, port_id port,
7284 	int32 token, bool kernel)
7285 {
7286 	struct fs_mount* mount;
7287 	void* cookie;
7288 
7289 	FUNCTION(("query_open(device = %" B_PRId32 ", query = \"%s\", kernel = %d)\n",
7290 		device, query, kernel));
7291 
7292 	status_t status = get_mount(device, &mount);
7293 	if (status != B_OK)
7294 		return status;
7295 
7296 	if (!HAS_FS_MOUNT_CALL(mount, open_query)) {
7297 		status = B_UNSUPPORTED;
7298 		goto error;
7299 	}
7300 
7301 	status = FS_MOUNT_CALL(mount, open_query, query, flags, port, token,
7302 		&cookie);
7303 	if (status != B_OK)
7304 		goto error;
7305 
7306 	// get fd for the query
7307 	int fd;
7308 	fd = get_new_fd(FDTYPE_QUERY, mount, NULL, cookie, O_CLOEXEC, kernel);
7309 	if (fd >= 0)
7310 		return fd;
7311 
7312 	status = fd;
7313 
7314 	// something went wrong
7315 	FS_MOUNT_CALL(mount, close_query, cookie);
7316 	FS_MOUNT_CALL(mount, free_query_cookie, cookie);
7317 
7318 error:
7319 	put_mount(mount);
7320 	return status;
7321 }
7322 
7323 
7324 static status_t
7325 query_close(struct file_descriptor* descriptor)
7326 {
7327 	struct fs_mount* mount = descriptor->u.mount;
7328 
7329 	FUNCTION(("query_close(descriptor = %p)\n", descriptor));
7330 
7331 	if (HAS_FS_MOUNT_CALL(mount, close_query))
7332 		return FS_MOUNT_CALL(mount, close_query, descriptor->cookie);
7333 
7334 	return B_OK;
7335 }
7336 
7337 
7338 static void
7339 query_free_fd(struct file_descriptor* descriptor)
7340 {
7341 	struct fs_mount* mount = descriptor->u.mount;
7342 
7343 	if (mount != NULL) {
7344 		FS_MOUNT_CALL(mount, free_query_cookie, descriptor->cookie);
7345 		put_mount(mount);
7346 	}
7347 }
7348 
7349 
7350 static status_t
7351 query_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7352 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7353 {
7354 	struct fs_mount* mount = descriptor->u.mount;
7355 
7356 	if (HAS_FS_MOUNT_CALL(mount, read_query)) {
7357 		return FS_MOUNT_CALL(mount, read_query, descriptor->cookie, buffer,
7358 			bufferSize, _count);
7359 	}
7360 
7361 	return B_UNSUPPORTED;
7362 }
7363 
7364 
7365 static status_t
7366 query_rewind(struct file_descriptor* descriptor)
7367 {
7368 	struct fs_mount* mount = descriptor->u.mount;
7369 
7370 	if (HAS_FS_MOUNT_CALL(mount, rewind_query))
7371 		return FS_MOUNT_CALL(mount, rewind_query, descriptor->cookie);
7372 
7373 	return B_UNSUPPORTED;
7374 }
7375 
7376 
7377 //	#pragma mark - General File System functions
7378 
7379 
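/*! Mounts the file system \a fsName -- or, for layered file systems, the
	whole stack of them -- at \a path, backed by \a device if one is
	given. This registers the partition with the disk device manager,
	builds the fs_mount with its chain of fs_volumes, calls each layer's
	mount() hook, and finally links the new root vnode with the vnode it
	covers.
	Returns the new mount's ID on success, an error code otherwise.
*/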
7380 static dev_t
7381 fs_mount(char* path, const char* device, const char* fsName, uint32 flags,
7382 	const char* args, bool kernel)
7383 {
7384 	struct ::fs_mount* mount;
7385 	status_t status = B_OK;
7386 	fs_volume* volume = NULL;
7387 	int32 layer = 0;
7388 	Vnode* coveredNode = NULL;
7389 
7390 	FUNCTION(("fs_mount: path = '%s', device = '%s', fs_name = '%s', flags = %#"
7391 		B_PRIx32 ", args = '%s'\n", path, device, fsName, flags, args));
7392 
7393 	// The path is always safe, we just have to make sure that fsName is
7394 	// at least superficially valid -- we can't make any assumptions about
7395 	// args, though. A NULL fsName is OK if a device was given and the FS
7396 	// is not virtual; we'll get the name from the DDM later.
7397 	if (fsName == NULL) {
7398 		if (!device || flags & B_MOUNT_VIRTUAL_DEVICE)
7399 			return B_BAD_VALUE;
7400 	} else if (fsName[0] == '\0')
7401 		return B_BAD_VALUE;
7402 
7403 	RecursiveLocker mountOpLocker(sMountOpLock);
7404 
7405 	// Helper to delete a newly created file device on failure.
7406 	// Not exactly beautiful, but helps to keep the code below cleaner.
7407 	struct FileDeviceDeleter {
7408 		FileDeviceDeleter() : id(-1) {}
7409 		~FileDeviceDeleter()
7410 		{
7411 			KDiskDeviceManager::Default()->DeleteFileDevice(id);
7412 		}
7413 
7414 		partition_id id;
7415 	} fileDeviceDeleter;
7416 
7417 	// If the file system is not a "virtual" one, the device argument should
7418 	// point to a real file/device (if given at all).
7419 	// get the partition
7420 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7421 	KPartition* partition = NULL;
7422 	KPath normalizedDevice;
7423 	bool newlyCreatedFileDevice = false;
7424 
7425 	if (!(flags & B_MOUNT_VIRTUAL_DEVICE) && device != NULL) {
7426 		// normalize the device path
7427 		status = normalizedDevice.SetTo(device, true);
7428 		if (status != B_OK)
7429 			return status;
7430 
7431 		// get a corresponding partition from the DDM
7432 		partition = ddm->RegisterPartition(normalizedDevice.Path());
7433 		if (partition == NULL) {
7434 			// Partition not found: this means either that the user supplied
7435 			// an invalid path, or that the path refers to an image file. We
7436 			// try to let the DDM create a file device for the path.
7437 			partition_id deviceID = ddm->CreateFileDevice(
7438 				normalizedDevice.Path(), &newlyCreatedFileDevice);
7439 			if (deviceID >= 0) {
7440 				partition = ddm->RegisterPartition(deviceID);
7441 				if (newlyCreatedFileDevice)
7442 					fileDeviceDeleter.id = deviceID;
7443 			}
7444 		}
7445 
7446 		if (!partition) {
7447 			TRACE(("fs_mount(): Partition `%s' not found.\n",
7448 				normalizedDevice.Path()));
7449 			return B_ENTRY_NOT_FOUND;
7450 		}
7451 
7452 		device = normalizedDevice.Path();
7453 			// correct path to file device
7454 	}
7455 	PartitionRegistrar partitionRegistrar(partition, true);
7456 
7457 	// Write lock the partition's device. For the time being, we keep the lock
7458 	// until we're done mounting -- not nice, but it ensures that no one
7459 	// interferes.
7460 	// TODO: Just mark the partition busy while mounting!
7461 	KDiskDevice* diskDevice = NULL;
7462 	if (partition) {
7463 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7464 		if (!diskDevice) {
7465 			TRACE(("fs_mount(): Failed to lock disk device!\n"));
7466 			return B_ERROR;
7467 		}
7468 	}
7469 
7470 	DeviceWriteLocker writeLocker(diskDevice, true);
7471 		// this takes over the write lock acquired before
7472 
7473 	if (partition != NULL) {
7474 		// make sure that the partition is not busy
7475 		if (partition->IsBusy()) {
7476 			TRACE(("fs_mount(): Partition is busy.\n"));
7477 			return B_BUSY;
7478 		}
7479 
7480 		// if no FS name had been supplied, we get it from the partition
7481 		if (fsName == NULL) {
7482 			KDiskSystem* diskSystem = partition->DiskSystem();
7483 			if (!diskSystem) {
7484 				TRACE(("fs_mount(): No FS name was given, and the DDM didn't "
7485 					"recognize it.\n"));
7486 				return B_BAD_VALUE;
7487 			}
7488 
7489 			if (!diskSystem->IsFileSystem()) {
7490 				TRACE(("fs_mount(): No FS name was given, and the DDM found a "
7491 					"partitioning system.\n"));
7492 				return B_BAD_VALUE;
7493 			}
7494 
7495 			// The disk system name will not change, and the KDiskSystem
7496 			// object will not go away while the disk device is locked (and
7497 			// the partition has a reference to it), so this is safe.
7498 			fsName = diskSystem->Name();
7499 		}
7500 	}
7501 
7502 	mount = new(std::nothrow) (struct ::fs_mount);
7503 	if (mount == NULL)
7504 		return B_NO_MEMORY;
7505 
7506 	mount->device_name = strdup(device);
7507 		// "device" can be NULL
7508 
7509 	status = mount->entry_cache.Init();
7510 	if (status != B_OK)
7511 		goto err1;
7512 
7513 	// initialize structure
7514 	mount->id = sNextMountID++;
7515 	mount->partition = NULL;
7516 	mount->root_vnode = NULL;
7517 	mount->covers_vnode = NULL;
7518 	mount->unmounting = false;
7519 	mount->owns_file_device = false;
7520 	mount->volume = NULL;
7521 
7522 	// build up the volume(s)
7523 	while (true) {
7524 		char* layerFSName = get_file_system_name_for_layer(fsName, layer);
7525 		if (layerFSName == NULL) {
7526 			if (layer == 0) {
7527 				status = B_NO_MEMORY;
7528 				goto err1;
7529 			}
7530 
7531 			break;
7532 		}
7533 		MemoryDeleter layerFSNameDeleter(layerFSName);
7534 
7535 		volume = (fs_volume*)malloc(sizeof(fs_volume));
7536 		if (volume == NULL) {
7537 			status = B_NO_MEMORY;
7538 			goto err1;
7539 		}
7540 
7541 		volume->id = mount->id;
7542 		volume->partition = partition != NULL ? partition->ID() : -1;
7543 		volume->layer = layer++;
7544 		volume->private_volume = NULL;
7545 		volume->ops = NULL;
7546 		volume->sub_volume = NULL;
7547 		volume->super_volume = NULL;
7548 		volume->file_system = NULL;
7549 		volume->file_system_name = NULL;
7550 
7551 		volume->file_system_name = get_file_system_name(layerFSName);
7552 		if (volume->file_system_name == NULL) {
7553 			status = B_NO_MEMORY;
7554 			free(volume);
7555 			goto err1;
7556 		}
7557 
7558 		volume->file_system = get_file_system(layerFSName);
7559 		if (volume->file_system == NULL) {
7560 			status = B_DEVICE_NOT_FOUND;
7561 			free(volume->file_system_name);
7562 			free(volume);
7563 			goto err1;
7564 		}
7565 
7566 		if (mount->volume == NULL)
7567 			mount->volume = volume;
7568 		else {
7569 			volume->super_volume = mount->volume;
7570 			mount->volume->sub_volume = volume;
7571 			mount->volume = volume;
7572 		}
7573 	}
7574 
7575 	// insert mount struct into list before we call FS's mount() function
7576 	// so that vnodes can be created for this mount
7577 	mutex_lock(&sMountMutex);
7578 	sMountsTable->Insert(mount);
7579 	mutex_unlock(&sMountMutex);
7580 
7581 	ino_t rootID;
7582 
7583 	if (!sRoot) {
7584 		// we haven't mounted anything yet
7585 		if (strcmp(path, "/") != 0) {
7586 			status = B_ERROR;
7587 			goto err2;
7588 		}
7589 
7590 		status = mount->volume->file_system->mount(mount->volume, device, flags,
7591 			args, &rootID);
7592 		if (status != B_OK || mount->volume->ops == NULL)
7593 			goto err2;
7594 	} else {
7595 		status = path_to_vnode(path, true, &coveredNode, NULL, kernel);
7596 		if (status != B_OK)
7597 			goto err2;
7598 
7599 		mount->covers_vnode = coveredNode;
7600 
7601 		// make sure coveredNode is a directory
7602 		if (!S_ISDIR(coveredNode->Type())) {
7603 			status = B_NOT_A_DIRECTORY;
7604 			goto err3;
7605 		}
7606 
7607 		if (coveredNode->IsCovered()) {
7608 			// this is already a covered vnode
7609 			status = B_BUSY;
7610 			goto err3;
7611 		}
7612 
7613 		// mount it/them
7614 		fs_volume* volume = mount->volume;
7615 		while (volume) {
7616 			status = volume->file_system->mount(volume, device, flags, args,
7617 				&rootID);
7618 			if (status != B_OK || volume->ops == NULL) {
7619 				if (status == B_OK && volume->ops == NULL)
7620 					panic("fs_mount: mount() succeeded but ops is NULL!");
7621 				if (volume->sub_volume)
7622 					goto err4;
7623 				goto err3;
7624 			}
7625 
7626 			volume = volume->super_volume;
7627 		}
7628 
7629 		volume = mount->volume;
7630 		while (volume) {
7631 			if (volume->ops->all_layers_mounted != NULL)
7632 				volume->ops->all_layers_mounted(volume);
7633 			volume = volume->super_volume;
7634 		}
7635 	}
7636 
7637 	// the root node is supposed to be owned by the file system - it must
7638 	// exist at this point
7639 	mount->root_vnode = lookup_vnode(mount->id, rootID);
7640 	if (mount->root_vnode == NULL || mount->root_vnode->ref_count != 1) {
7641 		panic("fs_mount: file system does not own its root node!\n");
7642 		status = B_ERROR;
7643 		goto err4;
7644 	}
7645 
7646 	// set up the links between the root vnode and the vnode it covers
7647 	rw_lock_write_lock(&sVnodeLock);
7648 	if (coveredNode != NULL) {
7649 		if (coveredNode->IsCovered()) {
7650 			// the vnode is covered now
7651 			status = B_BUSY;
7652 			rw_lock_write_unlock(&sVnodeLock);
7653 			goto err4;
7654 		}
7655 
7656 		mount->root_vnode->covers = coveredNode;
7657 		mount->root_vnode->SetCovering(true);
7658 
7659 		coveredNode->covered_by = mount->root_vnode;
7660 		coveredNode->SetCovered(true);
7661 	}
7662 	rw_lock_write_unlock(&sVnodeLock);
7663 
7664 	if (!sRoot) {
7665 		sRoot = mount->root_vnode;
7666 		mutex_lock(&sIOContextRootLock);
7667 		get_current_io_context(true)->root = sRoot;
7668 		mutex_unlock(&sIOContextRootLock);
7669 		inc_vnode_ref_count(sRoot);
7670 	}
7671 
7672 	// supply the partition (if any) with the mount cookie and mark it mounted
7673 	if (partition) {
7674 		partition->SetMountCookie(mount->volume->private_volume);
7675 		partition->SetVolumeID(mount->id);
7676 
7677 		// keep a partition reference as long as the partition is mounted
7678 		partitionRegistrar.Detach();
7679 		mount->partition = partition;
7680 		mount->owns_file_device = newlyCreatedFileDevice;
7681 		fileDeviceDeleter.id = -1;
7682 	}
7683 
7684 	notify_mount(mount->id,
7685 		coveredNode != NULL ? coveredNode->device : -1,
7686 		coveredNode ? coveredNode->id : -1);
7687 
7688 	return mount->id;
7689 
7690 err4:
7691 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7692 err3:
7693 	if (coveredNode != NULL)
7694 		put_vnode(coveredNode);
7695 err2:
7696 	mutex_lock(&sMountMutex);
7697 	sMountsTable->Remove(mount);
7698 	mutex_unlock(&sMountMutex);
7699 err1:
7700 	delete mount;
7701 
7702 	return status;
7703 }
7704 
7705 
7706 static status_t
7707 fs_unmount(char* path, dev_t mountID, uint32 flags, bool kernel)
7708 {
7709 	struct fs_mount* mount;
7710 	status_t err;
7711 
7712 	FUNCTION(("fs_unmount(path '%s', dev %" B_PRId32 ", kernel %d)\n", path,
7713 		mountID, kernel));
7714 
7715 	struct vnode* pathVnode = NULL;
7716 	if (path != NULL) {
7717 		err = path_to_vnode(path, true, &pathVnode, NULL, kernel);
7718 		if (err != B_OK)
7719 			return B_ENTRY_NOT_FOUND;
7720 	}
7721 
7722 	RecursiveLocker mountOpLocker(sMountOpLock);
7723 
7724 	// this lock is not strictly necessary, but is taken in KDEBUG builds
7725 	// to keep the ASSERT in find_mount() working.
7726 	KDEBUG_ONLY(mutex_lock(&sMountMutex));
7727 	mount = find_mount(path != NULL ? pathVnode->device : mountID);
7728 	KDEBUG_ONLY(mutex_unlock(&sMountMutex));
7729 	if (mount == NULL) {
7730 		panic("fs_unmount: find_mount() failed on root vnode @%p of mount\n",
7731 			pathVnode);
7732 	}
7733 
7734 	if (path != NULL) {
7735 		put_vnode(pathVnode);
7736 
7737 		if (mount->root_vnode != pathVnode) {
7738 			// not a mount point
7739 			return B_BAD_VALUE;
7740 		}
7741 	}
7742 
7743 	// if the volume is associated with a partition, lock the device of the
7744 	// partition as long as we are unmounting
7745 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7746 	KPartition* partition = mount->partition;
7747 	KDiskDevice* diskDevice = NULL;
7748 	if (partition != NULL) {
7749 		if (partition->Device() == NULL) {
7750 			dprintf("fs_unmount(): There is no device!\n");
7751 			return B_ERROR;
7752 		}
7753 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7754 		if (!diskDevice) {
7755 			TRACE(("fs_unmount(): Failed to lock disk device!\n"));
7756 			return B_ERROR;
7757 		}
7758 	}
7759 	DeviceWriteLocker writeLocker(diskDevice, true);
7760 
7761 	// make sure that the partition is not busy
7762 	if (partition != NULL) {
7763 		if ((flags & B_UNMOUNT_BUSY_PARTITION) == 0 && partition->IsBusy()) {
7764 			TRACE(("fs_unmount(): Partition is busy.\n"));
7765 			return B_BUSY;
7766 		}
7767 	}
7768 
7769 	// grab the vnode master mutex to keep someone from creating
7770 	// a vnode while we're figuring out if we can continue
7771 	WriteLocker vnodesWriteLocker(&sVnodeLock);
7772 
7773 	bool disconnectedDescriptors = false;
7774 
7775 	while (true) {
7776 		bool busy = false;
7777 
7778 		// cycle through the list of vnodes associated with this mount and
7779 		// make sure none of them is busy or still referenced
7780 		VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7781 		while (struct vnode* vnode = iterator.Next()) {
7782 			if (vnode->IsBusy()) {
7783 				busy = true;
7784 				break;
7785 			}
7786 
7787 			// check the vnode's ref count -- subtract additional references for
7788 			// covering
7789 			int32 refCount = vnode->ref_count;
7790 			if (vnode->covers != NULL)
7791 				refCount--;
7792 			if (vnode->covered_by != NULL)
7793 				refCount--;
7794 
7795 			if (refCount != 0) {
7796 				// there are still vnodes in use on this mount, so we cannot
7797 				// unmount yet
7798 				busy = true;
7799 				break;
7800 			}
7801 		}
7802 
7803 		if (!busy)
7804 			break;
7805 
7806 		if ((flags & B_FORCE_UNMOUNT) == 0)
7807 			return B_BUSY;
7808 
7809 		if (disconnectedDescriptors) {
7810 			// wait a bit until the last access is finished, and then try again
7811 			vnodesWriteLocker.Unlock();
7812 			snooze(100000);
7813 			// TODO: if there is some kind of bug that prevents the ref counts
7814 			// from getting back to zero, this will fall into an endless loop...
7815 			vnodesWriteLocker.Lock();
7816 			continue;
7817 		}
7818 
7819 		// the file system is still busy - but we're forced to unmount it,
7820 		// so let's disconnect all open file descriptors
7821 
7822 		mount->unmounting = true;
7823 			// prevent new vnodes from being created
7824 
7825 		vnodesWriteLocker.Unlock();
7826 
7827 		disconnect_mount_or_vnode_fds(mount, NULL);
7828 		disconnectedDescriptors = true;
7829 
7830 		vnodesWriteLocker.Lock();
7831 	}
7832 
7833 	// We can safely continue. Mark all of the vnodes busy and put this
7834 	// mount structure into unmounting state. Also undo the vnode
7835 	// covers/covered_by links.
7836 	mount->unmounting = true;
7837 
7838 	VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7839 	while (struct vnode* vnode = iterator.Next()) {
7840 		// Remove all covers/covered_by links from other mounts' nodes to this
7841 		// vnode and adjust the node ref count accordingly. We will release the
7842 		// references to the external vnodes below.
7843 		if (Vnode* coveredNode = vnode->covers) {
7844 			if (Vnode* coveringNode = vnode->covered_by) {
7845 				// We have both covered and covering vnodes, so just remove us
7846 				// from the chain.
7847 				coveredNode->covered_by = coveringNode;
7848 				coveringNode->covers = coveredNode;
7849 				vnode->ref_count -= 2;
7850 
7851 				vnode->covered_by = NULL;
7852 				vnode->covers = NULL;
7853 				vnode->SetCovering(false);
7854 				vnode->SetCovered(false);
7855 			} else {
7856 				// We only have a covered vnode. Remove its link to us.
7857 				coveredNode->covered_by = NULL;
7858 				coveredNode->SetCovered(false);
7859 				vnode->ref_count--;
7860 
7861 				// If the other node is an external vnode, we keep its link
7862 				// around so we can put the reference later on. Otherwise
7863 				// we get rid of it right now.
7864 				if (coveredNode->mount == mount) {
7865 					vnode->covers = NULL;
7866 					coveredNode->ref_count--;
7867 				}
7868 			}
7869 		} else if (Vnode* coveringNode = vnode->covered_by) {
7870 			// We only have a covering vnode. Remove its link to us.
7871 			coveringNode->covers = NULL;
7872 			coveringNode->SetCovering(false);
7873 			vnode->ref_count--;
7874 
7875 			// If the other node is an external vnode, we keep its link
7876 			// around so we can put the reference later on. Otherwise
7877 			// we get rid of it right now.
7878 			if (coveringNode->mount == mount) {
7879 				vnode->covered_by = NULL;
7880 				coveringNode->ref_count--;
7881 			}
7882 		}
7883 
7884 		vnode->SetBusy(true);
7885 		vnode_to_be_freed(vnode);
7886 	}
7887 
7888 	vnodesWriteLocker.Unlock();
7889 
7890 	// Free all vnodes associated with this mount.
7891 	// They will be removed from the mount list by free_vnode(), so
7892 	// we don't have to do that ourselves.
7893 	while (struct vnode* vnode = mount->vnodes.Head()) {
7894 		// Put the references to external covered/covering vnodes we kept above.
7895 		if (Vnode* coveredNode = vnode->covers)
7896 			put_vnode(coveredNode);
7897 		if (Vnode* coveringNode = vnode->covered_by)
7898 			put_vnode(coveringNode);
7899 
7900 		free_vnode(vnode, false);
7901 	}
7902 
7903 	// remove the mount structure from the hash table
7904 	mutex_lock(&sMountMutex);
7905 	sMountsTable->Remove(mount);
7906 	mutex_unlock(&sMountMutex);
7907 
7908 	mountOpLocker.Unlock();
7909 
7910 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7911 	notify_unmount(mount->id);
7912 
7913 	// dereference the partition and mark it unmounted
7914 	if (partition) {
7915 		partition->SetVolumeID(-1);
7916 		partition->SetMountCookie(NULL);
7917 
7918 		if (mount->owns_file_device)
7919 			KDiskDeviceManager::Default()->DeleteFileDevice(partition->ID());
7920 		partition->Unregister();
7921 	}
7922 
7923 	delete mount;
7924 	return B_OK;
7925 }
7926 
7927 
7928 static status_t
7929 fs_sync(dev_t device)
7930 {
7931 	struct fs_mount* mount;
7932 	status_t status = get_mount(device, &mount);
7933 	if (status != B_OK)
7934 		return status;
7935 
7936 	struct vnode marker;
7937 	memset(&marker, 0, sizeof(marker));
7938 	marker.SetBusy(true);
7939 	marker.SetRemoved(true);
7940 
7941 	// First, synchronize all file caches
7942 
7943 	while (true) {
7944 		WriteLocker locker(sVnodeLock);
7945 			// Note: That's the easy way, which is probably OK for sync(),
7946 			// since it's a relatively rare call and doesn't need to allow for
7947 			// a lot of concurrency. Using a read lock would be possible, but
7948 			// also more involved, since we'd have to lock the individual nodes
7949 			// and take care of the locking order, which we might not want to
7950 			// do while holding fs_mount::lock.
7951 
7952 		// synchronize access to vnode list
7953 		mutex_lock(&mount->lock);
7954 
7955 		struct vnode* vnode;
7956 		if (!marker.IsRemoved()) {
7957 			vnode = mount->vnodes.GetNext(&marker);
7958 			mount->vnodes.Remove(&marker);
7959 			marker.SetRemoved(true);
7960 		} else
7961 			vnode = mount->vnodes.First();
7962 
7963 		while (vnode != NULL && (vnode->cache == NULL
7964 			|| vnode->IsRemoved() || vnode->IsBusy())) {
7965 			// TODO: we could track writes (and writable mapped vnodes)
7966 			//	and have a simple flag that we could test for here
7967 			vnode = mount->vnodes.GetNext(vnode);
7968 		}
7969 
7970 		if (vnode != NULL) {
7971 			// insert marker vnode again
7972 			mount->vnodes.Insert(mount->vnodes.GetNext(vnode), &marker);
7973 			marker.SetRemoved(false);
7974 		}
7975 
7976 		mutex_unlock(&mount->lock);
7977 
7978 		if (vnode == NULL)
7979 			break;
7980 
7981 		vnode = lookup_vnode(mount->id, vnode->id);
7982 		if (vnode == NULL || vnode->IsBusy())
7983 			continue;
7984 
7985 		if (vnode->ref_count == 0) {
7986 			// this vnode has been unused before
7987 			vnode_used(vnode);
7988 		}
7989 		inc_vnode_ref_count(vnode);
7990 
7991 		locker.Unlock();
7992 
7993 		if (vnode->cache != NULL && !vnode->IsRemoved())
7994 			vnode->cache->WriteModified();
7995 
7996 		put_vnode(vnode);
7997 	}
7998 
7999 	// Let the file systems do their synchronizing work
8000 	if (HAS_FS_MOUNT_CALL(mount, sync))
8001 		status = FS_MOUNT_CALL_NO_PARAMS(mount, sync);
8002 
8003 	// Finally, flush the underlying device's write cache (if possible).
8004 	if (mount->partition != NULL && mount->partition->Device() != NULL)
8005 		ioctl(mount->partition->Device()->FD(), B_FLUSH_DRIVE_CACHE);
8006 
8007 	put_mount(mount);
8008 	return status;
8009 }
8010 
8011 
8012 static status_t
8013 fs_read_info(dev_t device, struct fs_info* info)
8014 {
8015 	struct fs_mount* mount;
8016 	status_t status = get_mount(device, &mount);
8017 	if (status != B_OK)
8018 		return status;
8019 
8020 	memset(info, 0, sizeof(struct fs_info));
8021 
8022 	if (HAS_FS_MOUNT_CALL(mount, read_fs_info))
8023 		status = FS_MOUNT_CALL(mount, read_fs_info, info);
8024 
8025 	// fill in info the file system doesn't (have to) know about
8026 	if (status == B_OK) {
8027 		info->dev = mount->id;
8028 		info->root = mount->root_vnode->id;
8029 
8030 		fs_volume* volume = mount->volume;
8031 		while (volume->super_volume != NULL)
8032 			volume = volume->super_volume;
8033 
8034 		strlcpy(info->fsh_name, volume->file_system_name,
8035 			sizeof(info->fsh_name));
8036 		if (mount->device_name != NULL) {
8037 			strlcpy(info->device_name, mount->device_name,
8038 				sizeof(info->device_name));
8039 		}
8040 	}
8041 
8042 	// even if the call is not supported by the file system, we return
8043 	// the parts that we filled out ourselves above
8044 
8045 	put_mount(mount);
8046 	return status;
8047 }
8048 
8049 
8050 static status_t
8051 fs_write_info(dev_t device, const struct fs_info* info, int mask)
8052 {
8053 	struct fs_mount* mount;
8054 	status_t status = get_mount(device, &mount);
8055 	if (status != B_OK)
8056 		return status;
8057 
8058 	if (HAS_FS_MOUNT_CALL(mount, write_fs_info))
8059 		status = FS_MOUNT_CALL(mount, write_fs_info, info, mask);
8060 	else
8061 		status = B_READ_ONLY_DEVICE;
8062 
8063 	put_mount(mount);
8064 	return status;
8065 }
8066 
8067 
8068 static dev_t
8069 fs_next_device(int32* _cookie)
8070 {
8071 	struct fs_mount* mount = NULL;
8072 	dev_t device = *_cookie;
8073 
8074 	mutex_lock(&sMountMutex);
8075 
8076 	// Since device IDs are assigned sequentially, this algorithm
8077 	// works well enough. It makes sure that the device list
8078 	// returned is sorted, and that no device is skipped when an
8079 	// already visited device has been unmounted.
8080 
8081 	while (device < sNextMountID) {
8082 		mount = find_mount(device++);
8083 		if (mount != NULL && mount->volume->private_volume != NULL)
8084 			break;
8085 	}
8086 
8087 	*_cookie = device;
8088 
8089 	if (mount != NULL)
8090 		device = mount->id;
8091 	else
8092 		device = B_BAD_VALUE;
8093 
8094 	mutex_unlock(&sMountMutex);
8095 
8096 	return device;
8097 }
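

/*!	Example: iterating over all mounted volumes with the cookie-based
	interface above (_kern_sync() below does the same via next_dev()).
	A minimal, illustrative sketch; the loop body is just a placeholder.
	\code
	int32 cookie = 0;
	dev_t device;
	while ((device = fs_next_device(&cookie)) >= 0)
		dprintf("mounted volume: %" B_PRIdDEV "\n", device);
	\endcode
*/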
8098 
8099 
8100 ssize_t
8101 fs_read_attr(int fd, const char *attribute, uint32 type, off_t pos,
8102 	void *buffer, size_t readBytes)
8103 {
8104 	int attrFD = attr_open(fd, NULL, attribute, O_RDONLY, true);
8105 	if (attrFD < 0)
8106 		return attrFD;
8107 
8108 	ssize_t bytesRead = _kern_read(attrFD, pos, buffer, readBytes);
8109 
8110 	_kern_close(attrFD);
8111 
8112 	return bytesRead;
8113 }
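

/*!	Example: reading a file's MIME type attribute via the helper above.
	A minimal sketch; the attribute name, type constant, and buffer size
	are merely illustrative, and \a fd is assumed to be an open FD.
	\code
	char mimeType[64];
	ssize_t bytesRead = fs_read_attr(fd, "BEOS:TYPE", B_MIME_STRING_TYPE,
		0, mimeType, sizeof(mimeType));
	if (bytesRead < 0)
		dprintf("reading attribute failed: %s\n", strerror(bytesRead));
	\endcode
*/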
8114 
8115 
8116 static status_t
8117 get_cwd(char* buffer, size_t size, bool kernel)
8118 {
8119 	// Get current working directory from io context
8120 	struct io_context* context = get_current_io_context(kernel);
8121 	status_t status;
8122 
8123 	FUNCTION(("get_cwd: buf %p, size %ld\n", buffer, size));
8124 
8125 	mutex_lock(&context->io_mutex);
8126 
8127 	struct vnode* vnode = context->cwd;
8128 	if (vnode)
8129 		inc_vnode_ref_count(vnode);
8130 
8131 	mutex_unlock(&context->io_mutex);
8132 
8133 	if (vnode) {
8134 		status = dir_vnode_to_path(vnode, buffer, size, kernel);
8135 		put_vnode(vnode);
8136 	} else
8137 		status = B_ERROR;
8138 
8139 	return status;
8140 }
8141 
8142 
8143 static status_t
8144 set_cwd(int fd, char* path, bool kernel)
8145 {
8146 	struct io_context* context;
8147 	struct vnode* vnode = NULL;
8148 	struct vnode* oldDirectory;
8149 	status_t status;
8150 
8151 	FUNCTION(("set_cwd: path = \'%s\'\n", path));
8152 
8153 	// Get vnode for passed path, and bail if it failed
8154 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
8155 	if (status < 0)
8156 		return status;
8157 
8158 	if (!S_ISDIR(vnode->Type())) {
8159 		// nope, can't cwd to here
8160 		status = B_NOT_A_DIRECTORY;
8161 		goto err;
8162 	}
8163 
8164 	// We need to have the permission to enter the directory, too
8165 	if (HAS_FS_CALL(vnode, access)) {
8166 		status = FS_CALL(vnode, access, X_OK);
8167 		if (status != B_OK)
8168 			goto err;
8169 	}
8170 
8171 	// Get current io context and lock
8172 	context = get_current_io_context(kernel);
8173 	mutex_lock(&context->io_mutex);
8174 
8175 	// save the old current working directory first
8176 	oldDirectory = context->cwd;
8177 	context->cwd = vnode;
8178 
8179 	mutex_unlock(&context->io_mutex);
8180 
8181 	if (oldDirectory)
8182 		put_vnode(oldDirectory);
8183 
8184 	return B_NO_ERROR;
8185 
8186 err:
8187 	put_vnode(vnode);
8188 	return status;
8189 }
8190 
8191 
8192 static status_t
8193 user_copy_name(char* to, const char* from, size_t length)
8194 {
8195 	ssize_t len = user_strlcpy(to, from, length);
8196 	if (len < 0)
8197 		return len;
8198 	if (len >= (ssize_t)length)
8199 		return B_NAME_TOO_LONG;
8200 	return B_OK;
8201 }
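

/*!	Example: how the syscalls below use this helper. Unlike a bare
	user_strlcpy(), an over-long source string yields B_NAME_TOO_LONG
	instead of being silently truncated. A minimal, illustrative sketch;
	\a userName stands for a userland pointer.
	\code
	char name[B_FILE_NAME_LENGTH];
	status_t status = user_copy_name(name, userName, sizeof(name));
	if (status == B_NAME_TOO_LONG)
		return status;
			// the user-supplied name did not fit the buffer
	\endcode
*/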
8202 
8203 
8204 //	#pragma mark - kernel mirrored syscalls
8205 
8206 
8207 dev_t
8208 _kern_mount(const char* path, const char* device, const char* fsName,
8209 	uint32 flags, const char* args, size_t argsLength)
8210 {
8211 	KPath pathBuffer(path);
8212 	if (pathBuffer.InitCheck() != B_OK)
8213 		return B_NO_MEMORY;
8214 
8215 	return fs_mount(pathBuffer.LockBuffer(), device, fsName, flags, args, true);
8216 }
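

/*!	Example: mounting a volume from kernel code. A minimal sketch; the
	mount point, device path, and file system name are illustrative only.
	\code
	dev_t volume = _kern_mount("/mnt", "/dev/disk/scsi/0/0/0/raw", "bfs",
		0, NULL, 0);
	if (volume < 0)
		dprintf("mount failed: %s\n", strerror(volume));
	\endcode
*/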
8217 
8218 
8219 status_t
8220 _kern_unmount(const char* path, uint32 flags)
8221 {
8222 	KPath pathBuffer(path);
8223 	if (pathBuffer.InitCheck() != B_OK)
8224 		return B_NO_MEMORY;
8225 
8226 	return fs_unmount(pathBuffer.LockBuffer(), -1, flags, true);
8227 }
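

/*!	Example: force-unmounting a volume. With B_FORCE_UNMOUNT, fs_unmount()
	above disconnects any remaining open descriptors instead of failing
	with B_BUSY. A minimal, illustrative sketch.
	\code
	status_t status = _kern_unmount("/mnt", B_FORCE_UNMOUNT);
	\endcode
*/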
8228 
8229 
8230 status_t
8231 _kern_read_fs_info(dev_t device, struct fs_info* info)
8232 {
8233 	if (info == NULL)
8234 		return B_BAD_VALUE;
8235 
8236 	return fs_read_info(device, info);
8237 }
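

/*!	Example: querying volume information, e.g. for a df-style listing.
	A minimal sketch, assuming \a device refers to a mounted volume.
	\code
	fs_info info;
	if (_kern_read_fs_info(device, &info) == B_OK) {
		dprintf("%s on %s (%s), %" B_PRIdOFF " blocks free\n",
			info.volume_name, info.device_name, info.fsh_name,
			info.free_blocks);
	}
	\endcode
*/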
8238 
8239 
8240 status_t
8241 _kern_write_fs_info(dev_t device, const struct fs_info* info, int mask)
8242 {
8243 	if (info == NULL)
8244 		return B_BAD_VALUE;
8245 
8246 	return fs_write_info(device, info, mask);
8247 }
8248 
8249 
8250 status_t
8251 _kern_sync(void)
8252 {
8253 	// Note: _kern_sync() is also called from _user_sync()
8254 	int32 cookie = 0;
8255 	dev_t device;
8256 	while ((device = next_dev(&cookie)) >= 0) {
8257 		status_t status = fs_sync(device);
8258 		if (status != B_OK && status != B_BAD_VALUE) {
8259 			dprintf("sync: device %" B_PRIdDEV " couldn't sync: %s\n", device,
8260 				strerror(status));
8261 		}
8262 	}
8263 
8264 	return B_OK;
8265 }
8266 
8267 
8268 dev_t
8269 _kern_next_device(int32* _cookie)
8270 {
8271 	return fs_next_device(_cookie);
8272 }
8273 
8274 
8275 status_t
8276 _kern_get_next_fd_info(team_id teamID, uint32* _cookie, fd_info* info,
8277 	size_t infoSize)
8278 {
8279 	if (infoSize != sizeof(fd_info))
8280 		return B_BAD_VALUE;
8281 
8282 	// get the team
8283 	Team* team = Team::Get(teamID);
8284 	if (team == NULL)
8285 		return B_BAD_TEAM_ID;
8286 	BReference<Team> teamReference(team, true);
8287 
8288 	// now that we have a team reference, its I/O context won't go away
8289 	io_context* context = team->io_context;
8290 	MutexLocker contextLocker(context->io_mutex);
8291 
8292 	uint32 slot = *_cookie;
8293 
8294 	struct file_descriptor* descriptor;
8295 	while (slot < context->table_size
8296 		&& (descriptor = context->fds[slot]) == NULL) {
8297 		slot++;
8298 	}
8299 
8300 	if (slot >= context->table_size)
8301 		return B_ENTRY_NOT_FOUND;
8302 
8303 	info->number = slot;
8304 	info->open_mode = descriptor->open_mode;
8305 
8306 	struct vnode* vnode = fd_vnode(descriptor);
8307 	if (vnode != NULL) {
8308 		info->device = vnode->device;
8309 		info->node = vnode->id;
8310 	} else if (descriptor->u.mount != NULL) {
8311 		info->device = descriptor->u.mount->id;
8312 		info->node = -1;
8313 	}
8314 
8315 	*_cookie = slot + 1;
8316 	return B_OK;
8317 }
8318 
8319 
8320 int
8321 _kern_open_entry_ref(dev_t device, ino_t inode, const char* name, int openMode,
8322 	int perms)
8323 {
8324 	if ((openMode & O_CREAT) != 0) {
8325 		return file_create_entry_ref(device, inode, name, openMode, perms,
8326 			true);
8327 	}
8328 
8329 	return file_open_entry_ref(device, inode, name, openMode, true);
8330 }
8331 
8332 
8333 /*!	\brief Opens a node specified by a FD + path pair.
8334 
8335 	At least one of \a fd and \a path must be specified.
8336 	If only \a fd is given, the function opens the node identified by this
8337 	FD. If only a path is given, this path is opened. If both are given and
8338 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8339 	of the directory (!) identified by \a fd.
8340 
8341 	\param fd The FD. May be < 0.
8342 	\param path The absolute or relative path. May be \c NULL.
8343 	\param openMode The open mode.
8344 	\return A FD referring to the newly opened node, or an error code,
8345 			if an error occurs.
8346 */
8347 int
8348 _kern_open(int fd, const char* path, int openMode, int perms)
8349 {
8350 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8351 	if (pathBuffer.InitCheck() != B_OK)
8352 		return B_NO_MEMORY;
8353 
8354 	if ((openMode & O_CREAT) != 0)
8355 		return file_create(fd, pathBuffer.LockBuffer(), openMode, perms, true);
8356 
8357 	return file_open(fd, pathBuffer.LockBuffer(), openMode, true);
8358 }
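

/*!	Example: creating a file relative to an already open directory FD and
	writing to it. A minimal sketch; \a dirFD, the file name, permissions,
	and content are illustrative assumptions.
	\code
	int fd = _kern_open(dirFD, "log.txt", O_CREAT | O_WRONLY | O_TRUNC,
		0644);
	if (fd >= 0) {
		_kern_write(fd, -1, "hello\n", 6);
			// pos -1 writes at the current file position
		_kern_close(fd);
	}
	\endcode
*/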
8359 
8360 
8361 /*!	\brief Opens a directory specified by entry_ref or node_ref.
8362 
8363 	The supplied name may be \c NULL, in which case the directory identified
8364 	by \a device and \a inode will be opened. Otherwise \a device and
8365 	\a inode identify the parent directory of the directory to be opened
8366 	and \a name its entry name.
8367 
8368 	\param device If \a name is specified the ID of the device the parent
8369 		   directory of the directory to be opened resides on, otherwise
8370 		   the device of the directory itself.
8371 	\param inode If \a name is specified the node ID of the parent
8372 		   directory of the directory to be opened, otherwise the node ID of the
8373 		   directory itself.
8374 	\param name The entry name of the directory to be opened. If \c NULL,
8375 		   the \a device + \a inode pair identify the node to be opened.
8376 	\return The FD of the newly opened directory or an error code, if
8377 			something went wrong.
8378 */
8379 int
8380 _kern_open_dir_entry_ref(dev_t device, ino_t inode, const char* name)
8381 {
8382 	return dir_open_entry_ref(device, inode, name, true);
8383 }
8384 
8385 
8386 /*!	\brief Opens a directory specified by a FD + path pair.
8387 
8388 	At least one of \a fd and \a path must be specified.
8389 	If only \a fd is given, the function opens the directory identified by this
8390 	FD. If only a path is given, this path is opened. If both are given and
8391 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8392 	of the directory (!) identified by \a fd.
8393 
8394 	\param fd The FD. May be < 0.
8395 	\param path The absolute or relative path. May be \c NULL.
8396 	\return A FD referring to the newly opened directory, or an error code,
8397 			if an error occurs.
8398 */
8399 int
8400 _kern_open_dir(int fd, const char* path)
8401 {
8402 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8403 	if (pathBuffer.InitCheck() != B_OK)
8404 		return B_NO_MEMORY;
8405 
8406 	return dir_open(fd, pathBuffer.LockBuffer(), true);
8407 }
8408 
8409 
8410 status_t
8411 _kern_fcntl(int fd, int op, size_t argument)
8412 {
8413 	return common_fcntl(fd, op, argument, true);
8414 }
8415 
8416 
8417 status_t
8418 _kern_fsync(int fd)
8419 {
8420 	return common_sync(fd, true);
8421 }
8422 
8423 
8424 status_t
8425 _kern_lock_node(int fd)
8426 {
8427 	return common_lock_node(fd, true);
8428 }
8429 
8430 
8431 status_t
8432 _kern_unlock_node(int fd)
8433 {
8434 	return common_unlock_node(fd, true);
8435 }
8436 
8437 
8438 status_t
8439 _kern_create_dir_entry_ref(dev_t device, ino_t inode, const char* name,
8440 	int perms)
8441 {
8442 	return dir_create_entry_ref(device, inode, name, perms, true);
8443 }
8444 
8445 
8446 /*!	\brief Creates a directory specified by a FD + path pair.
8447 
8448 	\a path must always be specified (it contains the name of the new directory
8449 	at least). If only a path is given, this path identifies the location at
8450 	which the directory shall be created. If both \a fd and \a path are given
8451 	and the path is absolute, \a fd is ignored; a relative path is reckoned off
8452 	of the directory (!) identified by \a fd.
8453 
8454 	\param fd The FD. May be < 0.
8455 	\param path The absolute or relative path. Must not be \c NULL.
8456 	\param perms The access permissions the new directory shall have.
8457 	\return \c B_OK, if the directory has been created successfully, another
8458 			error code otherwise.
8459 */
8460 status_t
8461 _kern_create_dir(int fd, const char* path, int perms)
8462 {
8463 	KPath pathBuffer(path, KPath::DEFAULT);
8464 	if (pathBuffer.InitCheck() != B_OK)
8465 		return B_NO_MEMORY;
8466 
8467 	return dir_create(fd, pathBuffer.LockBuffer(), perms, true);
8468 }
8469 
8470 
8471 status_t
8472 _kern_remove_dir(int fd, const char* path)
8473 {
8474 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8475 	if (pathBuffer.InitCheck() != B_OK)
8476 		return B_NO_MEMORY;
8477 
8478 	return dir_remove(fd, pathBuffer.LockBuffer(), true);
8479 }
8480 
8481 
8482 /*!	\brief Reads the contents of a symlink referred to by a FD + path pair.
8483 
8484 	At least one of \a fd and \a path must be specified.
8485 	If only \a fd is given, the symlink to be read is the node
8486 	identified by this FD. If only a path is given, this path identifies the
8487 	symlink to be read. If both are given and the path is absolute, \a fd is
8488 	ignored; a relative path is reckoned off of the directory (!) identified
8489 	by \a fd.
8490 	If this function fails with B_BUFFER_OVERFLOW, the \a _bufferSize pointer
8491 	will still be updated to reflect the required buffer size.
8492 
8493 	\param fd The FD. May be < 0.
8494 	\param path The absolute or relative path. May be \c NULL.
8495 	\param buffer The buffer into which the contents of the symlink shall be
8496 		   written.
8497 	\param _bufferSize A pointer to the size of the supplied buffer.
8498 	\return The length of the link on success or an appropriate error code.
8499 */
8500 status_t
8501 _kern_read_link(int fd, const char* path, char* buffer, size_t* _bufferSize)
8502 {
8503 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8504 	if (pathBuffer.InitCheck() != B_OK)
8505 		return B_NO_MEMORY;
8506 
8507 	return common_read_link(fd, pathBuffer.LockBuffer(),
8508 		buffer, _bufferSize, true);
8509 }
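

/*!	Example: reading a symlink and reacting to a too-small buffer, as
	described above. A minimal, illustrative sketch.
	\code
	char buffer[B_PATH_NAME_LENGTH];
	size_t bufferSize = sizeof(buffer);
	status_t status = _kern_read_link(-1, "/boot/home/link", buffer,
		&bufferSize);
	if (status == B_BUFFER_OVERFLOW) {
		// bufferSize now holds the size actually required
	}
	\endcode
*/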
8510 
8511 
8512 /*!	\brief Creates a symlink specified by a FD + path pair.
8513 
8514 	\a path must always be specified (it contains the name of the new symlink
8515 	at least). If only a path is given, this path identifies the location at
8516 	which the symlink shall be created. If both \a fd and \a path are given and
8517 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8518 	of the directory (!) identified by \a fd.
8519 
8520 	\param fd The FD. May be < 0.
8521 	\param path The absolute or relative path of the symlink to be created.
		   Must not be \c NULL.
	\param toPath The path the symlink shall point to. Must not be \c NULL.
8522 	\param mode The access permissions the new symlink shall have.
8523 	\return \c B_OK, if the symlink has been created successfully, another
8524 			error code otherwise.
8525 */
8526 status_t
8527 _kern_create_symlink(int fd, const char* path, const char* toPath, int mode)
8528 {
8529 	KPath pathBuffer(path);
8530 	if (pathBuffer.InitCheck() != B_OK)
8531 		return B_NO_MEMORY;
8532 
8533 	return common_create_symlink(fd, pathBuffer.LockBuffer(),
8534 		toPath, mode, true);
8535 }
8536 
8537 
8538 status_t
8539 _kern_create_link(int pathFD, const char* path, int toFD, const char* toPath,
8540 	bool traverseLeafLink)
8541 {
8542 	KPath pathBuffer(path);
8543 	KPath toPathBuffer(toPath);
8544 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
8545 		return B_NO_MEMORY;
8546 
8547 	return common_create_link(pathFD, pathBuffer.LockBuffer(), toFD,
8548 		toPathBuffer.LockBuffer(), traverseLeafLink, true);
8549 }
8550 
8551 
8552 /*!	\brief Removes an entry specified by a FD + path pair from its directory.
8553 
8554 	\a path must always be specified (it contains at least the name of the entry
8555 	to be deleted). If only a path is given, this path identifies the entry
8556 	directly. If both \a fd and \a path are given and the path is absolute,
8557 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8558 	identified by \a fd.
8559 
8560 	\param fd The FD. May be < 0.
8561 	\param path The absolute or relative path. Must not be \c NULL.
8562 	\return \c B_OK, if the entry has been removed successfully, another
8563 			error code otherwise.
8564 */
8565 status_t
8566 _kern_unlink(int fd, const char* path)
8567 {
8568 	KPath pathBuffer(path);
8569 	if (pathBuffer.InitCheck() != B_OK)
8570 		return B_NO_MEMORY;
8571 
8572 	return common_unlink(fd, pathBuffer.LockBuffer(), true);
8573 }
8574 
8575 
8576 /*!	\brief Moves an entry specified by a FD + path pair to an entry specified
8577 		   by another FD + path pair.
8578 
8579 	\a oldPath and \a newPath must always be specified (they contain at least
8580 	the name of the entry). If only a path is given, this path identifies the
8581 	entry directly. If both a FD and a path are given and the path is absolute,
8582 	the FD is ignored; a relative path is reckoned off of the directory (!)
8583 	identified by the respective FD.
8584 
8585 	\param oldFD The FD of the old location. May be < 0.
8586 	\param oldPath The absolute or relative path of the old location. Must not
8587 		   be \c NULL.
8588 	\param newFD The FD of the new location. May be < 0.
8589 	\param newPath The absolute or relative path of the new location. Must not
8590 		   be \c NULL.
8591 	\return \c B_OK, if the entry has been moved successfully, another
8592 			error code otherwise.
8593 */
8594 status_t
8595 _kern_rename(int oldFD, const char* oldPath, int newFD, const char* newPath)
8596 {
8597 	KPath oldPathBuffer(oldPath);
8598 	KPath newPathBuffer(newPath);
8599 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
8600 		return B_NO_MEMORY;
8601 
8602 	return common_rename(oldFD, oldPathBuffer.LockBuffer(),
8603 		newFD, newPathBuffer.LockBuffer(), true);
8604 }
8605 
8606 
8607 status_t
8608 _kern_access(int fd, const char* path, int mode, bool effectiveUserGroup)
8609 {
8610 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8611 	if (pathBuffer.InitCheck() != B_OK)
8612 		return B_NO_MEMORY;
8613 
8614 	return common_access(fd, pathBuffer.LockBuffer(), mode, effectiveUserGroup,
8615 		true);
8616 }
8617 
8618 
8619 /*!	\brief Reads stat data of an entity specified by a FD + path pair.
8620 
8621 	If only \a fd is given, the stat operation associated with the type
8622 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8623 	given, this path identifies the entry for whose node to retrieve the
8624 	stat data. If both \a fd and \a path are given and the path is absolute,
8625 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8626 	identified by \a fd and specifies the entry whose stat data shall be
8627 	retrieved.
8628 
8629 	\param fd The FD. May be < 0.
8630 	\param path The absolute or relative path. May be \c NULL.
8631 	\param traverseLeafLink If \a path is given, \c true specifies that the
8632 		   function shall not stick to symlinks, but traverse them.
8633 	\param stat The buffer the stat data shall be written into.
8634 	\param statSize The size of the supplied stat buffer.
8635 	\return \c B_OK, if the stat data have been read successfully, another
8636 			error code otherwise.
8637 */
8638 status_t
8639 _kern_read_stat(int fd, const char* path, bool traverseLeafLink,
8640 	struct stat* stat, size_t statSize)
8641 {
8642 	struct stat completeStat;
8643 	struct stat* originalStat = NULL;
8644 	status_t status;
8645 
8646 	if (statSize > sizeof(struct stat))
8647 		return B_BAD_VALUE;
8648 
8649 	// this supports different stat extensions
8650 	if (statSize < sizeof(struct stat)) {
8651 		originalStat = stat;
8652 		stat = &completeStat;
8653 	}
8654 
8655 	status = vfs_read_stat(fd, path, traverseLeafLink, stat, true);
8656 
8657 	if (status == B_OK && originalStat != NULL)
8658 		memcpy(originalStat, stat, statSize);
8659 
8660 	return status;
8661 }
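

/*!	Example: the \a statSize parameter lets callers compiled against an
	older, smaller struct stat still get correct (truncated) results.
	A minimal sketch; kOldStatSize merely stands in for such a historical
	size and is purely hypothetical.
	\code
	const size_t kOldStatSize = sizeof(struct stat) - 8;
		// hypothetical older layout
	struct stat oldStat;
	status_t status = _kern_read_stat(-1, "/boot", true, &oldStat,
		kOldStatSize);
		// on success, only the first kOldStatSize bytes are filled in
	\endcode
*/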
8662 
8663 
8664 /*!	\brief Writes stat data of an entity specified by a FD + path pair.
8665 
8666 	If only \a fd is given, the stat operation associated with the type
8667 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8668 	given, this path identifies the entry for whose node to write the
8669 	stat data. If both \a fd and \a path are given and the path is absolute,
8670 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8671 	identified by \a fd and specifies the entry whose stat data shall be
8672 	written.
8673 
8674 	\param fd The FD. May be < 0.
8675 	\param path The absolute or relative path. May be \c NULL.
8676 	\param traverseLeafLink If \a path is given, \c true specifies that the
8677 		   function shall not stick to symlinks, but traverse them.
8678 	\param stat The buffer containing the stat data to be written.
8679 	\param statSize The size of the supplied stat buffer.
8680 	\param statMask A mask specifying which parts of the stat data shall be
8681 		   written.
8682 	\return \c B_OK, if the stat data have been written successfully,
8683 			another error code otherwise.
8684 */
8685 status_t
8686 _kern_write_stat(int fd, const char* path, bool traverseLeafLink,
8687 	const struct stat* stat, size_t statSize, int statMask)
8688 {
8689 	struct stat completeStat;
8690 
8691 	if (statSize > sizeof(struct stat))
8692 		return B_BAD_VALUE;
8693 
8694 	// this supports different stat extensions
8695 	if (statSize < sizeof(struct stat)) {
8696 		memset((uint8*)&completeStat + statSize, 0,
8697 			sizeof(struct stat) - statSize);
8698 		memcpy(&completeStat, stat, statSize);
8699 		stat = &completeStat;
8700 	}
8701 
8702 	status_t status;
8703 
8704 	if (path != NULL) {
8705 		// path given: write the stat of the node referred to by (fd, path)
8706 		KPath pathBuffer(path);
8707 		if (pathBuffer.InitCheck() != B_OK)
8708 			return B_NO_MEMORY;
8709 
8710 		status = common_path_write_stat(fd, pathBuffer.LockBuffer(),
8711 			traverseLeafLink, stat, statMask, true);
8712 	} else {
8713 		// no path given: get the FD and use the FD operation
8714 		struct file_descriptor* descriptor
8715 			= get_fd(get_current_io_context(true), fd);
8716 		if (descriptor == NULL)
8717 			return B_FILE_ERROR;
8718 
8719 		if (descriptor->ops->fd_write_stat)
8720 			status = descriptor->ops->fd_write_stat(descriptor, stat, statMask);
8721 		else
8722 			status = B_UNSUPPORTED;
8723 
8724 		put_fd(descriptor);
8725 	}
8726 
8727 	return status;
8728 }
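

/*!	Example: changing only a node's mode bits; the \a statMask keeps all
	other stat fields untouched. A minimal sketch; the path and mode are
	illustrative, B_STAT_MODE being the mask flag for \c st_mode.
	\code
	struct stat st;
	st.st_mode = 0600;
	status_t status = _kern_write_stat(-1, "/boot/home/secret", false,
		&st, sizeof(st), B_STAT_MODE);
	\endcode
*/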
8729 
8730 
8731 int
8732 _kern_open_attr_dir(int fd, const char* path, bool traverseLeafLink)
8733 {
8734 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8735 	if (pathBuffer.InitCheck() != B_OK)
8736 		return B_NO_MEMORY;
8737 
8738 	return attr_dir_open(fd, pathBuffer.LockBuffer(), traverseLeafLink, true);
8739 }
8740 
8741 
8742 int
8743 _kern_open_attr(int fd, const char* path, const char* name, uint32 type,
8744 	int openMode)
8745 {
8746 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8747 	if (pathBuffer.InitCheck() != B_OK)
8748 		return B_NO_MEMORY;
8749 
8750 	if ((openMode & O_CREAT) != 0) {
8751 		return attr_create(fd, pathBuffer.LockBuffer(), name, type, openMode,
8752 			true);
8753 	}
8754 
8755 	return attr_open(fd, pathBuffer.LockBuffer(), name, openMode, true);
8756 }
8757 
8758 
8759 status_t
8760 _kern_remove_attr(int fd, const char* name)
8761 {
8762 	return attr_remove(fd, name, true);
8763 }
8764 
8765 
8766 status_t
8767 _kern_rename_attr(int fromFile, const char* fromName, int toFile,
8768 	const char* toName)
8769 {
8770 	return attr_rename(fromFile, fromName, toFile, toName, true);
8771 }
8772 
8773 
8774 int
8775 _kern_open_index_dir(dev_t device)
8776 {
8777 	return index_dir_open(device, true);
8778 }
8779 
8780 
8781 status_t
8782 _kern_create_index(dev_t device, const char* name, uint32 type, uint32 flags)
8783 {
8784 	return index_create(device, name, type, flags, true);
8785 }
8786 
8787 
8788 status_t
8789 _kern_read_index_stat(dev_t device, const char* name, struct stat* stat)
8790 {
8791 	return index_name_read_stat(device, name, stat, true);
8792 }
8793 
8794 
8795 status_t
8796 _kern_remove_index(dev_t device, const char* name)
8797 {
8798 	return index_remove(device, name, true);
8799 }
8800 
8801 
8802 status_t
8803 _kern_getcwd(char* buffer, size_t size)
8804 {
8805 	TRACE(("_kern_getcwd: buf %p, %ld\n", buffer, size));
8806 
8807 	// Call vfs to get current working directory
8808 	return get_cwd(buffer, size, true);
8809 }
8810 
8811 
8812 status_t
8813 _kern_setcwd(int fd, const char* path)
8814 {
8815 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8816 	if (pathBuffer.InitCheck() != B_OK)
8817 		return B_NO_MEMORY;
8818 
8819 	return set_cwd(fd, pathBuffer.LockBuffer(), true);
8820 }
8821 
8822 
8823 //	#pragma mark - userland syscalls
8824 
8825 
8826 dev_t
8827 _user_mount(const char* userPath, const char* userDevice,
8828 	const char* userFileSystem, uint32 flags, const char* userArgs,
8829 	size_t argsLength)
8830 {
8831 	char fileSystem[B_FILE_NAME_LENGTH];
8832 	KPath path, device;
8833 	char* args = NULL;
8834 	status_t status;
8835 
8836 	if (!IS_USER_ADDRESS(userPath))
8837 		return B_BAD_ADDRESS;
8838 
8839 	if (path.InitCheck() != B_OK || device.InitCheck() != B_OK)
8840 		return B_NO_MEMORY;
8841 
8842 	status = user_copy_name(path.LockBuffer(), userPath,
8843 		B_PATH_NAME_LENGTH);
8844 	if (status != B_OK)
8845 		return status;
8846 	path.UnlockBuffer();
8847 
8848 	if (userFileSystem != NULL) {
8849 		if (!IS_USER_ADDRESS(userFileSystem))
8850 			return B_BAD_ADDRESS;
8851 
8852 		status = user_copy_name(fileSystem, userFileSystem, sizeof(fileSystem));
8853 		if (status != B_OK)
8854 			return status;
8855 	}
8856 
8857 	if (userDevice != NULL) {
8858 		if (!IS_USER_ADDRESS(userDevice))
8859 			return B_BAD_ADDRESS;
8860 
8861 		status = user_copy_name(device.LockBuffer(), userDevice,
8862 			B_PATH_NAME_LENGTH);
8863 		if (status != B_OK)
8864 			return status;
8865 		device.UnlockBuffer();
8866 	}
8867 
8868 	if (userArgs != NULL && argsLength > 0) {
8869 		if (!IS_USER_ADDRESS(userArgs))
8870 			return B_BAD_ADDRESS;
8871 
8872 		// this is a safety restriction
8873 		if (argsLength >= 65536)
8874 			return B_NAME_TOO_LONG;
8875 
8876 		args = (char*)malloc(argsLength + 1);
8877 		if (args == NULL)
8878 			return B_NO_MEMORY;
8879 
8880 		status = user_copy_name(args, userArgs, argsLength + 1);
8881 		if (status != B_OK) {
8882 			free(args);
8883 			return status;
8884 		}
8885 	}
8886 
8887 	status = fs_mount(path.LockBuffer(),
8888 		userDevice != NULL ? device.Path() : NULL,
8889 		userFileSystem ? fileSystem : NULL, flags, args, false);
8890 
8891 	free(args);
8892 	return status;
8893 }
8894 
8895 
8896 status_t
8897 _user_unmount(const char* userPath, uint32 flags)
8898 {
8899 	if (!IS_USER_ADDRESS(userPath))
8900 		return B_BAD_ADDRESS;
8901 
8902 	KPath pathBuffer;
8903 	if (pathBuffer.InitCheck() != B_OK)
8904 		return B_NO_MEMORY;
8905 
8906 	char* path = pathBuffer.LockBuffer();
8907 
8908 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
8909 	if (status != B_OK)
8910 		return status;
8911 
8912 	return fs_unmount(path, -1, flags & ~B_UNMOUNT_BUSY_PARTITION, false);
8913 }
8914 
8915 
8916 status_t
8917 _user_read_fs_info(dev_t device, struct fs_info* userInfo)
8918 {
8919 	struct fs_info info;
8920 	status_t status;
8921 
8922 	if (userInfo == NULL)
8923 		return B_BAD_VALUE;
8924 
8925 	if (!IS_USER_ADDRESS(userInfo))
8926 		return B_BAD_ADDRESS;
8927 
8928 	status = fs_read_info(device, &info);
8929 	if (status != B_OK)
8930 		return status;
8931 
8932 	if (user_memcpy(userInfo, &info, sizeof(struct fs_info)) != B_OK)
8933 		return B_BAD_ADDRESS;
8934 
8935 	return B_OK;
8936 }
8937 
8938 
8939 status_t
8940 _user_write_fs_info(dev_t device, const struct fs_info* userInfo, int mask)
8941 {
8942 	struct fs_info info;
8943 
8944 	if (userInfo == NULL)
8945 		return B_BAD_VALUE;
8946 
8947 	if (!IS_USER_ADDRESS(userInfo)
8948 		|| user_memcpy(&info, userInfo, sizeof(struct fs_info)) != B_OK)
8949 		return B_BAD_ADDRESS;
8950 
8951 	return fs_write_info(device, &info, mask);
8952 }
8953 
8954 
8955 dev_t
8956 _user_next_device(int32* _userCookie)
8957 {
8958 	int32 cookie;
8959 	dev_t device;
8960 
8961 	if (!IS_USER_ADDRESS(_userCookie)
8962 		|| user_memcpy(&cookie, _userCookie, sizeof(int32)) != B_OK)
8963 		return B_BAD_ADDRESS;
8964 
8965 	device = fs_next_device(&cookie);
8966 
8967 	if (device >= B_OK) {
8968 		// update user cookie
8969 		if (user_memcpy(_userCookie, &cookie, sizeof(int32)) != B_OK)
8970 			return B_BAD_ADDRESS;
8971 	}
8972 
8973 	return device;
8974 }
8975 
8976 
8977 status_t
8978 _user_sync(void)
8979 {
8980 	return _kern_sync();
8981 }
8982 
8983 
8984 status_t
8985 _user_get_next_fd_info(team_id team, uint32* userCookie, fd_info* userInfo,
8986 	size_t infoSize)
8987 {
8988 	struct fd_info info;
8989 	uint32 cookie;
8990 
8991 	// only root can do this
8992 	if (geteuid() != 0)
8993 		return B_NOT_ALLOWED;
8994 
8995 	if (infoSize != sizeof(fd_info))
8996 		return B_BAD_VALUE;
8997 
8998 	if (!IS_USER_ADDRESS(userCookie) || !IS_USER_ADDRESS(userInfo)
8999 		|| user_memcpy(&cookie, userCookie, sizeof(uint32)) != B_OK)
9000 		return B_BAD_ADDRESS;
9001 
9002 	status_t status = _kern_get_next_fd_info(team, &cookie, &info, infoSize);
9003 	if (status != B_OK)
9004 		return status;
9005 
9006 	if (user_memcpy(userCookie, &cookie, sizeof(uint32)) != B_OK
9007 		|| user_memcpy(userInfo, &info, infoSize) != B_OK)
9008 		return B_BAD_ADDRESS;
9009 
9010 	return status;
9011 }
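

/*!	Example: enumerating a team's open file descriptors, the way an
	lsof-style tool would. A minimal, illustrative sketch.
	\code
	uint32 cookie = 0;
	fd_info info;
	while (_kern_get_next_fd_info(teamID, &cookie, &info, sizeof(info))
			== B_OK) {
		dprintf("fd %" B_PRId32 ": device %" B_PRIdDEV ", node %" B_PRIdINO
			"\n", info.number, info.device, info.node);
	}
	\endcode
*/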
9012 
9013 
9014 status_t
9015 _user_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
9016 	char* userPath, size_t pathLength)
9017 {
9018 	if (!IS_USER_ADDRESS(userPath))
9019 		return B_BAD_ADDRESS;
9020 
9021 	KPath path;
9022 	if (path.InitCheck() != B_OK)
9023 		return B_NO_MEMORY;
9024 
9025 	// copy the leaf name onto the stack
9026 	char stackLeaf[B_FILE_NAME_LENGTH];
9027 	if (leaf != NULL) {
9028 		if (!IS_USER_ADDRESS(leaf))
9029 			return B_BAD_ADDRESS;
9030 
9031 		status_t status = user_copy_name(stackLeaf, leaf, B_FILE_NAME_LENGTH);
9032 		if (status != B_OK)
9033 			return status;
9034 
9035 		leaf = stackLeaf;
9036 	}
9037 
9038 	status_t status = vfs_entry_ref_to_path(device, inode, leaf,
9039 		false, path.LockBuffer(), path.BufferSize());
9040 	if (status != B_OK)
9041 		return status;
9042 
9043 	path.UnlockBuffer();
9044 
9045 	int length = user_strlcpy(userPath, path.Path(), pathLength);
9046 	if (length < 0)
9047 		return length;
9048 	if (length >= (int)pathLength)
9049 		return B_BUFFER_OVERFLOW;
9050 
9051 	return B_OK;
9052 }
9053 
9054 
9055 status_t
9056 _user_normalize_path(const char* userPath, bool traverseLink, char* buffer)
9057 {
9058 	if (userPath == NULL || buffer == NULL)
9059 		return B_BAD_VALUE;
9060 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(buffer))
9061 		return B_BAD_ADDRESS;
9062 
9063 	// copy path from userland
9064 	KPath pathBuffer;
9065 	if (pathBuffer.InitCheck() != B_OK)
9066 		return B_NO_MEMORY;
9067 	char* path = pathBuffer.LockBuffer();
9068 
9069 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9070 	if (status != B_OK)
9071 		return status;
9072 
9073 	status_t error = normalize_path(path, pathBuffer.BufferSize(), traverseLink,
9074 		false);
9075 	if (error != B_OK)
9076 		return error;
9077 
9078 	// copy back to userland
9079 	int len = user_strlcpy(buffer, path, B_PATH_NAME_LENGTH);
9080 	if (len < 0)
9081 		return len;
9082 	if (len >= B_PATH_NAME_LENGTH)
9083 		return B_BUFFER_OVERFLOW;
9084 
9085 	return B_OK;
9086 }
9087 
9088 
9089 int
9090 _user_open_entry_ref(dev_t device, ino_t inode, const char* userName,
9091 	int openMode, int perms)
9092 {
9093 	char name[B_FILE_NAME_LENGTH];
9094 
9095 	if (userName == NULL || device < 0 || inode < 0)
9096 		return B_BAD_VALUE;
9097 	if (!IS_USER_ADDRESS(userName))
9098 		return B_BAD_ADDRESS;
9099 	status_t status = user_copy_name(name, userName, sizeof(name));
9100 	if (status != B_OK)
9101 		return status;
9102 
9103 	if ((openMode & O_CREAT) != 0) {
9104 		return file_create_entry_ref(device, inode, name, openMode, perms,
9105 			false);
9106 	}
9107 
9108 	return file_open_entry_ref(device, inode, name, openMode, false);
9109 }
9110 
9111 
9112 int
9113 _user_open(int fd, const char* userPath, int openMode, int perms)
9114 {
9115 	KPath path;
9116 	if (path.InitCheck() != B_OK)
9117 		return B_NO_MEMORY;
9118 
9119 	char* buffer = path.LockBuffer();
9120 
9121 	if (!IS_USER_ADDRESS(userPath))
9122 		return B_BAD_ADDRESS;
9123 	status_t status = user_copy_name(buffer, userPath, B_PATH_NAME_LENGTH);
9124 	if (status != B_OK)
9125 		return status;
9126 
9127 	if ((openMode & O_CREAT) != 0)
9128 		return file_create(fd, buffer, openMode, perms, false);
9129 
9130 	return file_open(fd, buffer, openMode, false);
9131 }
9132 
9133 
9134 int
9135 _user_open_dir_entry_ref(dev_t device, ino_t inode, const char* userName)
9136 {
9137 	if (userName != NULL) {
9138 		char name[B_FILE_NAME_LENGTH];
9139 
9140 		if (!IS_USER_ADDRESS(userName))
9141 			return B_BAD_ADDRESS;
9142 		status_t status = user_copy_name(name, userName, sizeof(name));
9143 		if (status != B_OK)
9144 			return status;
9145 
9146 		return dir_open_entry_ref(device, inode, name, false);
9147 	}
9148 	return dir_open_entry_ref(device, inode, NULL, false);
9149 }
9150 
9151 
9152 int
9153 _user_open_dir(int fd, const char* userPath)
9154 {
9155 	if (userPath == NULL)
9156 		return dir_open(fd, NULL, false);
9157 
9158 	KPath path;
9159 	if (path.InitCheck() != B_OK)
9160 		return B_NO_MEMORY;
9161 
9162 	char* buffer = path.LockBuffer();
9163 
9164 	if (!IS_USER_ADDRESS(userPath))
9165 		return B_BAD_ADDRESS;
9166 	status_t status = user_copy_name(buffer, userPath, B_PATH_NAME_LENGTH);
9167 	if (status != B_OK)
9168 		return status;
9169 
9170 	return dir_open(fd, buffer, false);
9171 }
9172 
9173 
9174 /*!	\brief Opens a directory's parent directory and returns the entry name
9175 		   of the former.
9176 
9177 	Aside from also returning the directory's entry name, this method is
9178 	equivalent to \code _user_open_dir(fd, "..") \endcode. It really is
9179 	equivalent if \a userName is \c NULL.
9180 
9181 	If a name buffer is supplied and the name does not fit the buffer, the
9182 	function fails. A buffer of size \c B_FILE_NAME_LENGTH should be safe.
9183 
9184 	\param fd A FD referring to a directory.
9185 	\param userName Buffer the directory's entry name shall be written into.
9186 		   May be \c NULL.
9187 	\param nameLength Size of the name buffer.
9188 	\return The file descriptor of the opened parent directory, if everything
9189 			went fine, an error code otherwise.
9190 */
9191 int
9192 _user_open_parent_dir(int fd, char* userName, size_t nameLength)
9193 {
9194 	bool kernel = false;
9195 
9196 	if (userName && !IS_USER_ADDRESS(userName))
9197 		return B_BAD_ADDRESS;
9198 
9199 	// open the parent dir
9200 	int parentFD = dir_open(fd, (char*)"..", kernel);
9201 	if (parentFD < 0)
9202 		return parentFD;
9203 	FDCloser fdCloser(parentFD, kernel);
9204 
9205 	if (userName) {
9206 		// get the vnodes
9207 		struct vnode* parentVNode = get_vnode_from_fd(parentFD, kernel);
9208 		struct vnode* dirVNode = get_vnode_from_fd(fd, kernel);
9209 		VNodePutter parentVNodePutter(parentVNode);
9210 		VNodePutter dirVNodePutter(dirVNode);
9211 		if (!parentVNode || !dirVNode)
9212 			return B_FILE_ERROR;
9213 
9214 		// get the vnode name
9215 		char _buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
9216 		struct dirent* buffer = (struct dirent*)_buffer;
9217 		status_t status = get_vnode_name(dirVNode, parentVNode, buffer,
9218 			sizeof(_buffer), get_current_io_context(false));
9219 		if (status != B_OK)
9220 			return status;
9221 
9222 		// copy the name to the userland buffer
9223 		int len = user_strlcpy(userName, buffer->d_name, nameLength);
9224 		if (len < 0)
9225 			return len;
9226 		if (len >= (int)nameLength)
9227 			return B_BUFFER_OVERFLOW;
9228 	}
9229 
9230 	return fdCloser.Detach();
9231 }
9232 
9233 
9234 status_t
9235 _user_fcntl(int fd, int op, size_t argument)
9236 {
9237 	status_t status = common_fcntl(fd, op, argument, false);
9238 	if (op == F_SETLKW)
9239 		syscall_restart_handle_post(status);
9240 
9241 	return status;
9242 }
9243 
9244 
9245 status_t
9246 _user_fsync(int fd)
9247 {
9248 	return common_sync(fd, false);
9249 }
9250 
9251 
9252 status_t
9253 _user_flock(int fd, int operation)
9254 {
9255 	FUNCTION(("_user_flock(fd = %d, op = %d)\n", fd, operation));
9256 
9257 	// Check if the operation is valid
9258 	switch (operation & ~LOCK_NB) {
9259 		case LOCK_UN:
9260 		case LOCK_SH:
9261 		case LOCK_EX:
9262 			break;
9263 
9264 		default:
9265 			return B_BAD_VALUE;
9266 	}
9267 
9268 	struct file_descriptor* descriptor;
9269 	struct vnode* vnode;
9270 	descriptor = get_fd_and_vnode(fd, &vnode, false);
9271 	if (descriptor == NULL)
9272 		return B_FILE_ERROR;
9273 
9274 	if (descriptor->type != FDTYPE_FILE) {
9275 		put_fd(descriptor);
9276 		return B_BAD_VALUE;
9277 	}
9278 
9279 	struct flock flock;
9280 	flock.l_start = 0;
9281 	flock.l_len = OFF_MAX;
9282 	flock.l_whence = SEEK_SET;
9283 	flock.l_type = (operation & LOCK_SH) != 0 ? F_RDLCK : F_WRLCK;
9284 
9285 	status_t status;
9286 	if ((operation & LOCK_UN) != 0) {
9287 		if (HAS_FS_CALL(vnode, release_lock))
9288 			status = FS_CALL(vnode, release_lock, descriptor->cookie, &flock);
9289 		else
9290 			status = release_advisory_lock(vnode, NULL, descriptor, &flock);
9291 	} else {
9292 		if (HAS_FS_CALL(vnode, acquire_lock)) {
9293 			status = FS_CALL(vnode, acquire_lock, descriptor->cookie, &flock,
9294 				(operation & LOCK_NB) == 0);
9295 		} else {
9296 			status = acquire_advisory_lock(vnode, NULL, descriptor, &flock,
9297 				(operation & LOCK_NB) == 0);
9298 		}
9299 	}
9300 
9301 	syscall_restart_handle_post(status);
9302 
9303 	put_fd(descriptor);
9304 	return status;
9305 }
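

/*!	Example: given the mapping above, a userland flock(fd, LOCK_EX) acts
	roughly like an fcntl() write lock covering the whole file; both end
	up in the same advisory-locking machinery. A minimal, illustrative
	userland sketch.
	\code
	flock(fd, LOCK_EX);

	// ... is approximately equivalent to:
	struct flock lock = {};
	lock.l_type = F_WRLCK;
	lock.l_whence = SEEK_SET;
	lock.l_start = 0;
	lock.l_len = 0;
		// 0 means "to end of file" for fcntl(); the code above uses
		// OFF_MAX explicitly
	fcntl(fd, F_SETLKW, &lock);
	\endcode
*/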
9306 
9307 
9308 status_t
9309 _user_lock_node(int fd)
9310 {
9311 	return common_lock_node(fd, false);
9312 }
9313 
9314 
9315 status_t
9316 _user_unlock_node(int fd)
9317 {
9318 	return common_unlock_node(fd, false);
9319 }
9320 
9321 
9322 status_t
9323 _user_create_dir_entry_ref(dev_t device, ino_t inode, const char* userName,
9324 	int perms)
9325 {
9326 	char name[B_FILE_NAME_LENGTH];
9327 	status_t status;
9328 
9329 	if (!IS_USER_ADDRESS(userName))
9330 		return B_BAD_ADDRESS;
9331 
9332 	status = user_copy_name(name, userName, sizeof(name));
9333 	if (status != B_OK)
9334 		return status;
9335 
9336 	return dir_create_entry_ref(device, inode, name, perms, false);
9337 }
9338 
9339 
9340 status_t
9341 _user_create_dir(int fd, const char* userPath, int perms)
9342 {
9343 	KPath pathBuffer;
9344 	if (pathBuffer.InitCheck() != B_OK)
9345 		return B_NO_MEMORY;
9346 
9347 	char* path = pathBuffer.LockBuffer();
9348 
9349 	if (!IS_USER_ADDRESS(userPath))
9350 		return B_BAD_ADDRESS;
9351 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9352 	if (status != B_OK)
9353 		return status;
9354 
9355 	return dir_create(fd, path, perms, false);
9356 }
9357 
9358 
9359 status_t
9360 _user_remove_dir(int fd, const char* userPath)
9361 {
9362 	KPath pathBuffer;
9363 	if (pathBuffer.InitCheck() != B_OK)
9364 		return B_NO_MEMORY;
9365 
9366 	char* path = pathBuffer.LockBuffer();
9367 
9368 	if (userPath != NULL) {
9369 		if (!IS_USER_ADDRESS(userPath))
9370 			return B_BAD_ADDRESS;
9371 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9372 		if (status != B_OK)
9373 			return status;
9374 	}
9375 
9376 	return dir_remove(fd, userPath ? path : NULL, false);
9377 }
9378 
9379 
9380 status_t
9381 _user_read_link(int fd, const char* userPath, char* userBuffer,
9382 	size_t* userBufferSize)
9383 {
9384 	KPath pathBuffer, linkBuffer;
9385 	if (pathBuffer.InitCheck() != B_OK || linkBuffer.InitCheck() != B_OK)
9386 		return B_NO_MEMORY;
9387 
9388 	size_t bufferSize;
9389 
9390 	if (!IS_USER_ADDRESS(userBuffer) || !IS_USER_ADDRESS(userBufferSize)
9391 		|| user_memcpy(&bufferSize, userBufferSize, sizeof(size_t)) != B_OK)
9392 		return B_BAD_ADDRESS;
9393 
9394 	char* path = pathBuffer.LockBuffer();
9395 	char* buffer = linkBuffer.LockBuffer();
9396 
9397 	if (userPath) {
9398 		if (!IS_USER_ADDRESS(userPath))
9399 			return B_BAD_ADDRESS;
9400 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9401 		if (status != B_OK)
9402 			return status;
9403 
9404 		if (bufferSize > B_PATH_NAME_LENGTH)
9405 			bufferSize = B_PATH_NAME_LENGTH;
9406 	}
9407 
9408 	status_t status = common_read_link(fd, userPath ? path : NULL, buffer,
9409 		&bufferSize, false);
9410 
9411 	// we also update the bufferSize in case of errors
9412 	// (the real length will be returned in case of B_BUFFER_OVERFLOW)
9413 	if (user_memcpy(userBufferSize, &bufferSize, sizeof(size_t)) != B_OK)
9414 		return B_BAD_ADDRESS;
9415 
9416 	if (status != B_OK)
9417 		return status;
9418 
9419 	if (user_memcpy(userBuffer, buffer, bufferSize) != B_OK)
9420 		return B_BAD_ADDRESS;
9421 
9422 	return B_OK;
9423 }
9424 
9425 
9426 status_t
9427 _user_create_symlink(int fd, const char* userPath, const char* userToPath,
9428 	int mode)
9429 {
9430 	KPath pathBuffer;
9431 	KPath toPathBuffer;
9432 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9433 		return B_NO_MEMORY;
9434 
9435 	char* path = pathBuffer.LockBuffer();
9436 	char* toPath = toPathBuffer.LockBuffer();
9437 
9438 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(userToPath))
9439 		return B_BAD_ADDRESS;
9440 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9441 	if (status != B_OK)
9442 		return status;
9443 	status = user_copy_name(toPath, userToPath, B_PATH_NAME_LENGTH);
9444 	if (status != B_OK)
9445 		return status;
9446 
9447 	return common_create_symlink(fd, path, toPath, mode, false);
9448 }
9449 
9450 
9451 status_t
9452 _user_create_link(int pathFD, const char* userPath, int toFD,
9453 	const char* userToPath, bool traverseLeafLink)
9454 {
9455 	KPath pathBuffer;
9456 	KPath toPathBuffer;
9457 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9458 		return B_NO_MEMORY;
9459 
9460 	char* path = pathBuffer.LockBuffer();
9461 	char* toPath = toPathBuffer.LockBuffer();
9462 
9463 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(userToPath))
9464 		return B_BAD_ADDRESS;
9465 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9466 	if (status != B_OK)
9467 		return status;
9468 	status = user_copy_name(toPath, userToPath, B_PATH_NAME_LENGTH);
9469 	if (status != B_OK)
9470 		return status;
9471 
9472 	status = check_path(toPath);
9473 	if (status != B_OK)
9474 		return status;
9475 
9476 	return common_create_link(pathFD, path, toFD, toPath, traverseLeafLink,
9477 		false);
9478 }
9479 
9480 
9481 status_t
9482 _user_unlink(int fd, const char* userPath)
9483 {
9484 	KPath pathBuffer;
9485 	if (pathBuffer.InitCheck() != B_OK)
9486 		return B_NO_MEMORY;
9487 
9488 	char* path = pathBuffer.LockBuffer();
9489 
9490 	if (!IS_USER_ADDRESS(userPath))
9491 		return B_BAD_ADDRESS;
9492 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9493 	if (status != B_OK)
9494 		return status;
9495 
9496 	return common_unlink(fd, path, false);
9497 }
9498 
9499 
9500 status_t
9501 _user_rename(int oldFD, const char* userOldPath, int newFD,
9502 	const char* userNewPath)
9503 {
9504 	KPath oldPathBuffer;
9505 	KPath newPathBuffer;
9506 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
9507 		return B_NO_MEMORY;
9508 
9509 	char* oldPath = oldPathBuffer.LockBuffer();
9510 	char* newPath = newPathBuffer.LockBuffer();
9511 
9512 	if (!IS_USER_ADDRESS(userOldPath) || !IS_USER_ADDRESS(userNewPath))
9513 		return B_BAD_ADDRESS;
9514 	status_t status = user_copy_name(oldPath, userOldPath, B_PATH_NAME_LENGTH);
9515 	if (status != B_OK)
9516 		return status;
9517 	status = user_copy_name(newPath, userNewPath, B_PATH_NAME_LENGTH);
9518 	if (status != B_OK)
9519 		return status;
9520 
9521 	return common_rename(oldFD, oldPath, newFD, newPath, false);
9522 }
9523 
9524 
9525 status_t
9526 _user_create_fifo(int fd, const char* userPath, mode_t perms)
9527 {
9528 	KPath pathBuffer;
9529 	if (pathBuffer.InitCheck() != B_OK)
9530 		return B_NO_MEMORY;
9531 
9532 	char* path = pathBuffer.LockBuffer();
9533 
9534 	if (!IS_USER_ADDRESS(userPath))
9535 		return B_BAD_ADDRESS;
9536 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9537 	if (status != B_OK)
9538 		return status;
9539 
9540 	// split into directory vnode and filename path
9541 	char filename[B_FILE_NAME_LENGTH];
9542 	struct vnode* dir;
9543 	status = fd_and_path_to_dir_vnode(fd, path, &dir, filename, false);
9544 	if (status != B_OK)
9545 		return status;
9546 
9547 	VNodePutter _(dir);
9548 
9549 	// the underlying FS needs to support creating FIFOs
9550 	if (!HAS_FS_CALL(dir, create_special_node))
9551 		return B_UNSUPPORTED;
9552 
9553 	// create the entry -- the FIFO sub-node is set up automatically
9554 	fs_vnode superVnode;
9555 	ino_t nodeID;
9556 	status = FS_CALL(dir, create_special_node, filename, NULL,
9557 		S_IFIFO | (perms & S_IUMSK), 0, &superVnode, &nodeID);
9558 
9559 	// create_special_node() acquired a reference for us that we don't need.
9560 	if (status == B_OK)
9561 		put_vnode(dir->mount->volume, nodeID);
9562 
9563 	return status;
9564 }
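
/*	For illustration (hypothetical wrapper): mkfifo() maps directly onto this
	syscall. Only the permission bits of the mode matter, since S_IFIFO is
	forced above:

		int
		my_mkfifo(const char* path, mode_t mode)
		{
			status_t status = _kern_create_fifo(-1, path, mode);
			if (status != B_OK) {
				errno = status;
				return -1;
			}
			return 0;
		}
*/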
9565 
9566 
9567 status_t
9568 _user_create_pipe(int* userFDs)
9569 {
9570 	// rootfs should support creating FIFOs, but let's be sure
9571 	if (!HAS_FS_CALL(sRoot, create_special_node))
9572 		return B_UNSUPPORTED;
9573 
9574 	// create the node -- the FIFO sub-node is set up automatically
9575 	fs_vnode superVnode;
9576 	ino_t nodeID;
9577 	status_t status = FS_CALL(sRoot, create_special_node, NULL, NULL,
9578 		S_IFIFO | S_IRUSR | S_IWUSR, 0, &superVnode, &nodeID);
9579 	if (status != B_OK)
9580 		return status;
9581 
9582 	// We've got one reference to the node and need another one.
9583 	struct vnode* vnode;
9584 	status = get_vnode(sRoot->mount->id, nodeID, &vnode, true, false);
9585 	if (status != B_OK) {
9586 		// this should never happen
9587 		dprintf("_user_create_pipe(): Failed to lookup vnode (%" B_PRIdDEV ", "
9588 			"%" B_PRIdINO ")\n", sRoot->mount->id, nodeID);
9589 		return status;
9590 	}
9591 
9592 	// Everything looks good so far. Open two FDs, one for reading and one
9593 	// for writing.
9594 	int fds[2];
9595 	fds[0] = open_vnode(vnode, O_RDONLY, false);
9596 	fds[1] = open_vnode(vnode, O_WRONLY, false);
9597 
9598 	FDCloser closer0(fds[0], false);
9599 	FDCloser closer1(fds[1], false);
9600 
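	// a negative fds[i] is the error code returned by open_vnode()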
9601 	status = (fds[0] >= 0 ? (fds[1] >= 0 ? B_OK : fds[1]) : fds[0]);
9602 
9603 	// copy FDs to userland
9604 	if (status == B_OK) {
9605 		if (!IS_USER_ADDRESS(userFDs)
9606 			|| user_memcpy(userFDs, fds, sizeof(fds)) != B_OK) {
9607 			status = B_BAD_ADDRESS;
9608 		}
9609 	}
9610 
9611 	// keep FDs, if everything went fine
9612 	// keep the FDs if everything went fine
9613 		closer0.Detach();
9614 		closer1.Detach();
9615 	}
9616 
9617 	return status;
9618 }
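
/*	For illustration (hypothetical wrapper): POSIX pipe() passes the
	two-element array straight through; fds[0] is the read end and fds[1]
	the write end, matching the open modes above:

		int
		my_pipe(int fds[2])
		{
			status_t status = _kern_create_pipe(fds);
			if (status != B_OK) {
				errno = status;
				return -1;
			}
			return 0;
		}
*/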
9619 
9620 
9621 status_t
9622 _user_access(int fd, const char* userPath, int mode, bool effectiveUserGroup)
9623 {
9624 	KPath pathBuffer;
9625 	if (pathBuffer.InitCheck() != B_OK)
9626 		return B_NO_MEMORY;
9627 
9628 	char* path = pathBuffer.LockBuffer();
9629 
9630 	if (!IS_USER_ADDRESS(userPath))
9631 		return B_BAD_ADDRESS;
9632 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9633 	if (status != B_OK)
9634 		return status;
9635 
9636 	return common_access(fd, path, mode, effectiveUserGroup, false);
9637 }
9638 
9639 
9640 status_t
9641 _user_read_stat(int fd, const char* userPath, bool traverseLink,
9642 	struct stat* userStat, size_t statSize)
9643 {
9644 	struct stat stat = {0};
9645 	status_t status;
9646 
9647 	if (statSize > sizeof(struct stat))
9648 		return B_BAD_VALUE;
9649 
9650 	if (!IS_USER_ADDRESS(userStat))
9651 		return B_BAD_ADDRESS;
9652 
9653 	if (userPath != NULL) {
9654 		// path given: get the stat of the node referred to by (fd, path)
9655 		if (!IS_USER_ADDRESS(userPath))
9656 			return B_BAD_ADDRESS;
9657 
9658 		KPath pathBuffer;
9659 		if (pathBuffer.InitCheck() != B_OK)
9660 			return B_NO_MEMORY;
9661 
9662 		char* path = pathBuffer.LockBuffer();
9663 
9664 		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9665 		if (status != B_OK)
9666 			return status;
9667 
9668 		status = common_path_read_stat(fd, path, traverseLink, &stat, false);
9669 	} else {
9670 		// no path given: get the FD and use the FD operation
9671 		struct file_descriptor* descriptor
9672 			= get_fd(get_current_io_context(false), fd);
9673 		if (descriptor == NULL)
9674 			return B_FILE_ERROR;
9675 
9676 		if (descriptor->ops->fd_read_stat)
9677 			status = descriptor->ops->fd_read_stat(descriptor, &stat);
9678 		else
9679 			status = B_UNSUPPORTED;
9680 
9681 		put_fd(descriptor);
9682 	}
9683 
9684 	if (status != B_OK)
9685 		return status;
9686 
9687 	return user_memcpy(userStat, &stat, statSize);
9688 }
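
/*	The statSize parameter exists for binary compatibility: only that many
	bytes are copied back, so binaries built against a smaller struct stat
	keep working. A stat() wrapper (sketch, hypothetical names) passes the
	size it was compiled with:

		int
		my_stat(const char* path, struct stat* stat)
		{
			status_t status = _kern_read_stat(-1, path, true, stat,
				sizeof(struct stat));
			if (status != B_OK) {
				errno = status;
				return -1;
			}
			return 0;
		}
*/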
9689 
9690 
9691 status_t
9692 _user_write_stat(int fd, const char* userPath, bool traverseLeafLink,
9693 	const struct stat* userStat, size_t statSize, int statMask)
9694 {
9695 	if (statSize > sizeof(struct stat))
9696 		return B_BAD_VALUE;
9697 
9698 	struct stat stat;
9699 
9700 	if (!IS_USER_ADDRESS(userStat)
9701 		|| user_memcpy(&stat, userStat, statSize) < B_OK)
9702 		return B_BAD_ADDRESS;
9703 
9704 	// clear additional stat fields
9705 	if (statSize < sizeof(struct stat))
9706 		memset((uint8*)&stat + statSize, 0, sizeof(struct stat) - statSize);
9707 
9708 	status_t status;
9709 
9710 	if (userPath != NULL) {
9711 		// path given: write the stat of the node referred to by (fd, path)
9712 		if (!IS_USER_ADDRESS(userPath))
9713 			return B_BAD_ADDRESS;
9714 
9715 		KPath pathBuffer;
9716 		if (pathBuffer.InitCheck() != B_OK)
9717 			return B_NO_MEMORY;
9718 
9719 		char* path = pathBuffer.LockBuffer();
9720 
9721 		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9722 		if (status != B_OK)
9723 			return status;
9724 
9725 		status = common_path_write_stat(fd, path, traverseLeafLink, &stat,
9726 			statMask, false);
9727 	} else {
9728 		// no path given: get the FD and use the FD operation
9729 		struct file_descriptor* descriptor
9730 			= get_fd(get_current_io_context(false), fd);
9731 		if (descriptor == NULL)
9732 			return B_FILE_ERROR;
9733 
9734 		if (descriptor->ops->fd_write_stat) {
9735 			status = descriptor->ops->fd_write_stat(descriptor, &stat,
9736 				statMask);
9737 		} else
9738 			status = B_UNSUPPORTED;
9739 
9740 		put_fd(descriptor);
9741 	}
9742 
9743 	return status;
9744 }
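
/*	statMask selects which fields of the passed stat are applied (the
	B_STAT_* flags from <NodeMonitor.h>); unmasked fields are ignored.
	chmod(), for example, only needs st_mode (sketch, hypothetical names):

		int
		my_chmod(const char* path, mode_t mode)
		{
			struct stat stat;
			stat.st_mode = mode;
			status_t status = _kern_write_stat(-1, path, true, &stat,
				sizeof(stat), B_STAT_MODE);
			if (status != B_OK) {
				errno = status;
				return -1;
			}
			return 0;
		}
*/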
9745 
9746 
9747 int
9748 _user_open_attr_dir(int fd, const char* userPath, bool traverseLeafLink)
9749 {
9750 	KPath pathBuffer;
9751 	if (pathBuffer.InitCheck() != B_OK)
9752 		return B_NO_MEMORY;
9753 
9754 	char* path = pathBuffer.LockBuffer();
9755 
9756 	if (userPath != NULL) {
9757 		if (!IS_USER_ADDRESS(userPath))
9758 			return B_BAD_ADDRESS;
9759 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9760 		if (status != B_OK)
9761 			return status;
9762 	}
9763 
9764 	return attr_dir_open(fd, userPath ? path : NULL, traverseLeafLink, false);
9765 }
9766 
9767 
9768 ssize_t
9769 _user_read_attr(int fd, const char* userAttribute, off_t pos, void* userBuffer,
9770 	size_t readBytes)
9771 {
9772 	char attribute[B_FILE_NAME_LENGTH];
9773 
9774 	if (userAttribute == NULL)
9775 		return B_BAD_VALUE;
9776 	if (!IS_USER_ADDRESS(userAttribute))
9777 		return B_BAD_ADDRESS;
9778 	status_t status = user_copy_name(attribute, userAttribute,
		sizeof(attribute));
9779 	if (status != B_OK)
9780 		return status;
9781 
9782 	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9783 	if (attr < 0)
9784 		return attr;
9785 
9786 	ssize_t bytes = _user_read(attr, pos, userBuffer, readBytes);
9787 	_user_close(attr);
9788 
9789 	return bytes;
9790 }
9791 
9792 
9793 ssize_t
9794 _user_write_attr(int fd, const char* userAttribute, uint32 type, off_t pos,
9795 	const void* buffer, size_t writeBytes)
9796 {
9797 	char attribute[B_FILE_NAME_LENGTH];
9798 
9799 	if (userAttribute == NULL)
9800 		return B_BAD_VALUE;
9801 	if (!IS_USER_ADDRESS(userAttribute))
9802 		return B_BAD_ADDRESS;
9803 	status_t status = user_copy_name(attribute, userAttribute,
		sizeof(attribute));
9804 	if (status != B_OK)
9805 		return status;
9806 
9807 	// Try to support the BeOS-typical truncation semantics (an offset of 0
9808 	// replaces the attribute) as well as the position argument
9809 	int attr = attr_create(fd, NULL, attribute, type,
9810 		O_CREAT | O_WRONLY | (pos != 0 ? 0 : O_TRUNC), false);
9811 	if (attr < 0)
9812 		return attr;
9813 
9814 	ssize_t bytes = _user_write(attr, pos, buffer, writeBytes);
9815 	_user_close(attr);
9816 
9817 	return bytes;
9818 }
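
/*	The O_TRUNC-unless-pos logic above mirrors the contract of the public
	fs_write_attr() (<fs_attr.h>): a write at offset 0 replaces the
	attribute, a write at a nonzero offset leaves the remainder intact.
	For example, setting a file's MIME type:

		const char mime[] = "text/plain";
		ssize_t written = fs_write_attr(fd, "BEOS:TYPE", B_MIME_STRING_TYPE,
			0, mime, sizeof(mime));
		if (written < 0)
			fprintf(stderr, "failed to write the attribute\n");
*/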
9819 
9820 
9821 status_t
9822 _user_stat_attr(int fd, const char* userAttribute,
9823 	struct attr_info* userAttrInfo)
9824 {
9825 	char attribute[B_FILE_NAME_LENGTH];
9826 
9827 	if (userAttribute == NULL || userAttrInfo == NULL)
9828 		return B_BAD_VALUE;
9829 	if (!IS_USER_ADDRESS(userAttribute) || !IS_USER_ADDRESS(userAttrInfo))
9830 		return B_BAD_ADDRESS;
9831 	status_t status = user_copy_name(attribute, userAttribute,
9832 		sizeof(attribute));
9833 	if (status != B_OK)
9834 		return status;
9835 
9836 	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9837 	if (attr < 0)
9838 		return attr;
9839 
9840 	struct file_descriptor* descriptor
9841 		= get_fd(get_current_io_context(false), attr);
9842 	if (descriptor == NULL) {
9843 		_user_close(attr);
9844 		return B_FILE_ERROR;
9845 	}
9846 
9847 	struct stat stat;
9848 	if (descriptor->ops->fd_read_stat)
9849 		status = descriptor->ops->fd_read_stat(descriptor, &stat);
9850 	else
9851 		status = B_UNSUPPORTED;
9852 
9853 	put_fd(descriptor);
9854 	_user_close(attr);
9855 
9856 	if (status == B_OK) {
9857 		attr_info info;
9858 		info.type = stat.st_type;
9859 		info.size = stat.st_size;
9860 
9861 		if (user_memcpy(userAttrInfo, &info, sizeof(struct attr_info)) != B_OK)
9862 			return B_BAD_ADDRESS;
9863 	}
9864 
9865 	return status;
9866 }
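
/*	The public counterpart is fs_stat_attr() from <fs_attr.h>, which reports
	the same two fields filled in above, e.g.:

		attr_info info;
		if (fs_stat_attr(fd, "BEOS:TYPE", &info) == 0) {
			printf("type 0x%" B_PRIx32 ", %" B_PRIdOFF " bytes\n",
				info.type, info.size);
		}
*/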
9867 
9868 
9869 int
9870 _user_open_attr(int fd, const char* userPath, const char* userName,
9871 	uint32 type, int openMode)
9872 {
9873 	char name[B_FILE_NAME_LENGTH];
9874 
9875 	if (!IS_USER_ADDRESS(userName))
9876 		return B_BAD_ADDRESS;
9877 	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9878 	if (status != B_OK)
9879 		return status;
9880 
9881 	KPath pathBuffer;
9882 	if (pathBuffer.InitCheck() != B_OK)
9883 		return B_NO_MEMORY;
9884 
9885 	char* path = pathBuffer.LockBuffer();
9886 
9887 	if (userPath != NULL) {
9888 		if (!IS_USER_ADDRESS(userPath))
9889 			return B_BAD_ADDRESS;
9890 		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9891 		if (status != B_OK)
9892 			return status;
9893 	}
9894 
9895 	if ((openMode & O_CREAT) != 0) {
9896 		return attr_create(fd, userPath ? path : NULL, name, type, openMode,
9897 			false);
9898 	}
9899 
9900 	return attr_open(fd, userPath ? path : NULL, name, openMode, false);
9901 }
9902 
9903 
9904 status_t
9905 _user_remove_attr(int fd, const char* userName)
9906 {
9907 	char name[B_FILE_NAME_LENGTH];
9908 
9909 	if (!IS_USER_ADDRESS(userName))
9910 		return B_BAD_ADDRESS;
9911 	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9912 	if (status != B_OK)
9913 		return status;
9914 
9915 	return attr_remove(fd, name, false);
9916 }
9917 
9918 
9919 status_t
9920 _user_rename_attr(int fromFile, const char* userFromName, int toFile,
9921 	const char* userToName)
9922 {
9923 	if (!IS_USER_ADDRESS(userFromName)
9924 		|| !IS_USER_ADDRESS(userToName))
9925 		return B_BAD_ADDRESS;
9926 
9927 	KPath fromNameBuffer(B_FILE_NAME_LENGTH);
9928 	KPath toNameBuffer(B_FILE_NAME_LENGTH);
9929 	if (fromNameBuffer.InitCheck() != B_OK || toNameBuffer.InitCheck() != B_OK)
9930 		return B_NO_MEMORY;
9931 
9932 	char* fromName = fromNameBuffer.LockBuffer();
9933 	char* toName = toNameBuffer.LockBuffer();
9934 
9935 	status_t status = user_copy_name(fromName, userFromName,
		B_FILE_NAME_LENGTH);
9936 	if (status != B_OK)
9937 		return status;
9938 	status = user_copy_name(toName, userToName, B_FILE_NAME_LENGTH);
9939 	if (status != B_OK)
9940 		return status;
9941 
9942 	return attr_rename(fromFile, fromName, toFile, toName, false);
9943 }
9944 
9945 
9946 int
9947 _user_open_index_dir(dev_t device)
9948 {
9949 	return index_dir_open(device, false);
9950 }
9951 
9952 
9953 status_t
9954 _user_create_index(dev_t device, const char* userName, uint32 type,
9955 	uint32 flags)
9956 {
9957 	char name[B_FILE_NAME_LENGTH];
9958 
9959 	if (!IS_USER_ADDRESS(userName))
9960 		return B_BAD_ADDRESS;
9961 	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9962 	if (status != B_OK)
9963 		return status;
9964 
9965 	return index_create(device, name, type, flags, false);
9966 }
9967 
9968 
9969 status_t
9970 _user_read_index_stat(dev_t device, const char* userName, struct stat* userStat)
9971 {
9972 	char name[B_FILE_NAME_LENGTH];
9973 	struct stat stat = {0};
9974 	status_t status;
9975 
9976 	if (!IS_USER_ADDRESS(userName) || !IS_USER_ADDRESS(userStat))
9977 		return B_BAD_ADDRESS;
9978 	status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9979 	if (status != B_OK)
9980 		return status;
9981 
9982 	status = index_name_read_stat(device, name, &stat, false);
9983 	if (status == B_OK) {
9984 		if (user_memcpy(userStat, &stat, sizeof(stat)) != B_OK)
9985 			return B_BAD_ADDRESS;
9986 	}
9987 
9988 	return status;
9989 }
9990 
9991 
9992 status_t
9993 _user_remove_index(dev_t device, const char* userName)
9994 {
9995 	char name[B_FILE_NAME_LENGTH];
9996 
9997 	if (!IS_USER_ADDRESS(userName))
9998 		return B_BAD_ADDRESS;
9999 	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
10000 	if (status != B_OK)
10001 		return status;
10002 
10003 	return index_remove(device, name, false);
10004 }
10005 
10006 
10007 status_t
10008 _user_getcwd(char* userBuffer, size_t size)
10009 {
10010 	if (size == 0)
10011 		return B_BAD_VALUE;
10012 	if (!IS_USER_ADDRESS(userBuffer))
10013 		return B_BAD_ADDRESS;
10014 
10015 	if (size > kMaxPathLength)
10016 		size = kMaxPathLength;
10017 
10018 	KPath pathBuffer(size);
10019 	if (pathBuffer.InitCheck() != B_OK)
10020 		return B_NO_MEMORY;
10021 
10022 	TRACE(("user_getcwd: buf %p, %" B_PRIuSIZE "\n", userBuffer, size));
10023 
10024 	char* path = pathBuffer.LockBuffer();
10025 
10026 	status_t status = get_cwd(path, size, false);
10027 	if (status != B_OK)
10028 		return status;
10029 
10030 	// Copy back the result
10031 	if (user_strlcpy(userBuffer, path, size) < B_OK)
10032 		return B_BAD_ADDRESS;
10033 
10034 	return status;
10035 }
10036 
10037 
10038 status_t
10039 _user_setcwd(int fd, const char* userPath)
10040 {
10041 	TRACE(("user_setcwd: path = %p\n", userPath));
10042 
10043 	KPath pathBuffer;
10044 	if (pathBuffer.InitCheck() != B_OK)
10045 		return B_NO_MEMORY;
10046 
10047 	char* path = pathBuffer.LockBuffer();
10048 
10049 	if (userPath != NULL) {
10050 		if (!IS_USER_ADDRESS(userPath))
10051 			return B_BAD_ADDRESS;
10052 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
10053 		if (status != B_OK)
10054 			return status;
10055 	}
10056 
10057 	return set_cwd(fd, userPath != NULL ? path : NULL, false);
10058 }
10059 
10060 
10061 status_t
10062 _user_change_root(const char* userPath)
10063 {
10064 	// only root is allowed to chroot()
10065 	if (geteuid() != 0)
10066 		return B_NOT_ALLOWED;
10067 
10068 	// alloc path buffer
10069 	KPath pathBuffer;
10070 	if (pathBuffer.InitCheck() != B_OK)
10071 		return B_NO_MEMORY;
10072 
10073 	// copy userland path to kernel
10074 	char* path = pathBuffer.LockBuffer();
10075 	if (userPath != NULL) {
10076 		if (!IS_USER_ADDRESS(userPath))
10077 			return B_BAD_ADDRESS;
10078 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
10079 		if (status != B_OK)
10080 			return status;
10081 	}
10082 
10083 	// get the vnode
10084 	struct vnode* vnode;
10085 	status_t status = path_to_vnode(path, true, &vnode, NULL, false);
10086 	if (status != B_OK)
10087 		return status;
10088 
10089 	// set the new root
10090 	struct io_context* context = get_current_io_context(false);
10091 	mutex_lock(&sIOContextRootLock);
10092 	struct vnode* oldRoot = context->root;
10093 	context->root = vnode;
10094 	mutex_unlock(&sIOContextRootLock);
10095 
10096 	put_vnode(oldRoot);
10097 
10098 	return B_OK;
10099 }
10100 
10101 
10102 int
10103 _user_open_query(dev_t device, const char* userQuery, size_t queryLength,
10104 	uint32 flags, port_id port, int32 token)
10105 {
10106 	if (device < 0 || userQuery == NULL || queryLength == 0)
10107 		return B_BAD_VALUE;
10108 
10109 	if (!IS_USER_ADDRESS(userQuery))
10110 		return B_BAD_ADDRESS;
10111 
10112 	// this is a safety restriction
10113 	if (queryLength >= 65536)
10114 		return B_NAME_TOO_LONG;
10115 
10116 	BStackOrHeapArray<char, 128> query(queryLength);
10117 	if (!query.IsValid())
10118 		return B_NO_MEMORY;
10119 
10120 	if (user_strlcpy(query, userQuery, queryLength + 1) < B_OK)
10121 		return B_BAD_ADDRESS;
10122 
10123 	return query_open(device, query, flags, port, token, false);
10124 }
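
/*	Userland normally reaches this through the fs_open_query() family from
	<fs_query.h>; port and token only matter for live queries
	(B_LIVE_QUERY), which deliver updates as messages to that port. For
	example:

		DIR* query = fs_open_query(device, "name == \"*.cpp\"", 0);
		if (query != NULL) {
			while (struct dirent* entry = fs_read_query(query))
				puts(entry->d_name);
			fs_close_query(query);
		}
*/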
10125 
10126 
10127 #include "vfs_request_io.cpp"
10128