xref: /haiku/src/system/kernel/fs/vfs.cpp (revision d12bb8b14803d030b4a8fba91131e4bb96c4f406)
/*
 * Copyright 2005-2013, Ingo Weinhold, ingo_weinhold@gmx.de.
 * Copyright 2002-2018, Axel Dörfler, axeld@pinc-software.de.
 * Distributed under the terms of the MIT License.
 *
 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
 * Distributed under the terms of the NewOS License.
 */


/*! Virtual File System and File System Interface Layer */


#include <ctype.h>
#include <fcntl.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/file.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <unistd.h>

#include <fs_attr.h>
#include <fs_info.h>
#include <fs_interface.h>
#include <fs_volume.h>
#include <NodeMonitor.h>
#include <OS.h>
#include <StorageDefs.h>

#include <AutoDeleter.h>
#include <block_cache.h>
#include <boot/kernel_args.h>
#include <debug_heap.h>
#include <disk_device_manager/KDiskDevice.h>
#include <disk_device_manager/KDiskDeviceManager.h>
#include <disk_device_manager/KDiskDeviceUtils.h>
#include <disk_device_manager/KDiskSystem.h>
#include <fd.h>
#include <file_cache.h>
#include <fs/node_monitor.h>
#include <KPath.h>
#include <lock.h>
#include <low_resource_manager.h>
#include <slab/Slab.h>
#include <StackOrHeapArray.h>
#include <syscalls.h>
#include <syscall_restart.h>
#include <tracing.h>
#include <util/atomic.h>
#include <util/AutoLock.h>
#include <util/DoublyLinkedList.h>
#include <vfs.h>
#include <vm/vm.h>
#include <vm/VMCache.h>
#include <wait_for_objects.h>

#include "EntryCache.h"
#include "fifo.h"
#include "IORequest.h"
#include "unused_vnodes.h"
#include "vfs_tracing.h"
#include "Vnode.h"
#include "../cache/vnode_store.h"


//#define TRACE_VFS
#ifdef TRACE_VFS
#	define TRACE(x) dprintf x
#	define FUNCTION(x) dprintf x
#else
#	define TRACE(x) ;
#	define FUNCTION(x) ;
#endif

#define ADD_DEBUGGER_COMMANDS


#define HAS_FS_CALL(vnode, op)			(vnode->ops->op != NULL)
#define HAS_FS_MOUNT_CALL(mount, op)	(mount->volume->ops->op != NULL)

#if KDEBUG
#	define FS_CALL(vnode, op, params...) \
		( HAS_FS_CALL(vnode, op) ? \
			vnode->ops->op(vnode->mount->volume, vnode, params) \
			: (panic("FS_CALL op " #op " is NULL"), 0))
#	define FS_CALL_NO_PARAMS(vnode, op) \
		( HAS_FS_CALL(vnode, op) ? \
			vnode->ops->op(vnode->mount->volume, vnode) \
			: (panic("FS_CALL_NO_PARAMS op " #op " is NULL"), 0))
#	define FS_MOUNT_CALL(mount, op, params...) \
		( HAS_FS_MOUNT_CALL(mount, op) ? \
			mount->volume->ops->op(mount->volume, params) \
			: (panic("FS_MOUNT_CALL op " #op " is NULL"), 0))
#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
		( HAS_FS_MOUNT_CALL(mount, op) ? \
			mount->volume->ops->op(mount->volume) \
			: (panic("FS_MOUNT_CALL_NO_PARAMS op " #op " is NULL"), 0))
#else
#	define FS_CALL(vnode, op, params...) \
			vnode->ops->op(vnode->mount->volume, vnode, params)
#	define FS_CALL_NO_PARAMS(vnode, op) \
			vnode->ops->op(vnode->mount->volume, vnode)
#	define FS_MOUNT_CALL(mount, op, params...) \
			mount->volume->ops->op(mount->volume, params)
#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
			mount->volume->ops->op(mount->volume)
#endif
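

// For illustration: in the non-KDEBUG case,
// FS_CALL(vnode, read, cookie, pos, buffer, &length) expands to
//
//	vnode->ops->read(vnode->mount->volume, vnode, cookie, pos, buffer,
//		&length)
//
// i.e. the macros merely prepend the volume and vnode arguments that every
// fs_vnode_ops hook expects, while the KDEBUG variants additionally panic
// when a file system does not implement the hook at all.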


const static size_t kMaxPathLength = 65536;
	// The absolute maximum path length (used by getcwd()); this does not
	// depend on PATH_MAX.


typedef DoublyLinkedList<vnode> VnodeList;

/*!	\brief Structure to manage a mounted file system

	Note: The root_vnode and root_vnode->covers fields (what others?) are
	initialized in fs_mount() and not changed afterwards. That is, as soon
	as the mount is mounted and it is ensured that it won't be unmounted
	(e.g. by holding a reference to a vnode of that mount), (read) access
	to those fields is always safe, even without additional locking. Moreover,
	while mounted, the mount holds a reference to the root_vnode->covers vnode,
	thus making the access path vnode->mount->root_vnode->covers->mount->...
	safe if a reference to vnode is held (note that for the root mount
	root_vnode->covers is NULL, though).
*/
struct fs_mount {
	fs_mount()
		:
		volume(NULL),
		device_name(NULL)
	{
		mutex_init(&lock, "mount lock");
	}

	~fs_mount()
	{
		mutex_destroy(&lock);
		free(device_name);

		while (volume) {
			fs_volume* superVolume = volume->super_volume;

			if (volume->file_system != NULL)
				put_module(volume->file_system->info.name);

			free(volume->file_system_name);
			free(volume);
			volume = superVolume;
		}
	}

	struct fs_mount* next;
	dev_t			id;
	fs_volume*		volume;
	char*			device_name;
	mutex			lock;	// guards the vnodes list
	struct vnode*	root_vnode;
	struct vnode*	covers_vnode;	// immutable
	KPartition*		partition;
	VnodeList		vnodes;
	EntryCache		entry_cache;
	bool			unmounting;
	bool			owns_file_device;
};
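

// A minimal sketch of the invariant documented above fs_mount: with a
// reference to some vnode held, following
// vnode->mount->root_vnode->covers is safe without further locking, since
// the mount cannot go away in the meantime. (Illustrative only, not used
// anywhere.)
#if 0
static dev_t
covered_device_of(struct vnode* vnode)
{
	// the mount's root vnode covers a vnode of the mount it was mounted
	// upon; for the root mount, covers is NULL
	struct vnode* covered = vnode->mount->root_vnode->covers;
	return covered != NULL ? covered->device : -1;
}
#endif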


namespace {

struct advisory_lock : public DoublyLinkedListLinkImpl<advisory_lock> {
	list_link		link;
	void*			bound_to;
	team_id			team;
	pid_t			session;
	off_t			start;
	off_t			end;
	bool			shared;
};

typedef DoublyLinkedList<advisory_lock> LockList;

} // namespace


struct advisory_locking {
	sem_id			lock;
	sem_id			wait_sem;
	LockList		locks;

	advisory_locking()
		:
		lock(-1),
		wait_sem(-1)
	{
	}

	~advisory_locking()
	{
		if (lock >= 0)
			delete_sem(lock);
		if (wait_sem >= 0)
			delete_sem(wait_sem);
	}
};

/*!	\brief Guards sMountsTable.

	The holder is allowed read/write access to sMountsTable.
	Manipulating the fs_mount structures themselves (and destroying them)
	requires different locks, though.
*/
static rw_lock sMountLock = RW_LOCK_INITIALIZER("vfs_mount_lock");

/*!	\brief Guards mount/unmount operations.

	fs_mount() and fs_unmount() hold this lock for the duration of their
	operation. Thus, holding the lock ensures that no file system is being
	mounted or unmounted. In particular this means that
	- sMountsTable will not be modified,
	- the fields of the fs_mount structures in sMountsTable that are immutable
	  after initialization will not be modified.

	The thread trying to lock the lock must not hold sVnodeLock or
	sMountLock.
*/
static recursive_lock sMountOpLock;

/*!	\brief Guards sVnodeTable.

	The holder is allowed read/write access to sVnodeTable and to
	any unbusy vnode in that table, except for the immutable fields (device,
	id, private_node, mount), to which only read-only access is allowed.
	The mutable fields advisory_locking, mandatory_locked_by, and ref_count, as
	well as the busy, removed, and unused flags, and the vnode's type can also
	be write accessed when holding a read lock to sVnodeLock *and* having the
	vnode locked. Write access to covered_by and covers requires write locking
	sVnodeLock.

	The thread trying to acquire the lock must not hold sMountLock.
	You must not hold this lock when calling create_sem(), as this might call
	vfs_free_unused_vnodes() and thus cause a deadlock.
*/
static rw_lock sVnodeLock = RW_LOCK_INITIALIZER("vfs_vnode_lock");

/*!	\brief Guards io_context::root.

	Must be held when setting or getting the io_context::root field.
	The only operation allowed while holding this lock besides getting or
	setting the field is inc_vnode_ref_count() on io_context::root.
*/
static mutex sIOContextRootLock = MUTEX_INITIALIZER("io_context::root lock");


namespace {

struct vnode_hash_key {
	dev_t	device;
	ino_t	vnode;
};

struct VnodeHash {
	typedef vnode_hash_key	KeyType;
	typedef	struct vnode	ValueType;

#define VHASH(mountid, vnodeid) \
	(((uint32)((vnodeid) >> 32) + (uint32)(vnodeid)) ^ (uint32)(mountid))

	size_t HashKey(KeyType key) const
	{
		return VHASH(key.device, key.vnode);
	}

	size_t Hash(ValueType* vnode) const
	{
		return VHASH(vnode->device, vnode->id);
	}

#undef VHASH

	bool Compare(KeyType key, ValueType* vnode) const
	{
		return vnode->device == key.device && vnode->id == key.vnode;
	}

	ValueType*& GetLink(ValueType* value) const
	{
		return value->next;
	}
};

typedef BOpenHashTable<VnodeHash> VnodeTable;
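

// VHASH above folds the 64-bit vnode ID into 32 bits and XORs in the mount
// ID; e.g. for device 5 and vnode ID 0x200000003 it yields (2 + 3) ^ 5 == 0.
// Collisions like this are harmless -- Compare() checks both fields -- they
// merely lengthen a hash chain.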


struct MountHash {
	typedef dev_t			KeyType;
	typedef	struct fs_mount	ValueType;

	size_t HashKey(KeyType key) const
	{
		return key;
	}

	size_t Hash(ValueType* mount) const
	{
		return mount->id;
	}

	bool Compare(KeyType key, ValueType* mount) const
	{
		return mount->id == key;
	}

	ValueType*& GetLink(ValueType* value) const
	{
		return value->next;
	}
};

typedef BOpenHashTable<MountHash> MountTable;

} // namespace


object_cache* sPathNameCache;
object_cache* sFileDescriptorCache;

#define VNODE_HASH_TABLE_SIZE 1024
static VnodeTable* sVnodeTable;
static struct vnode* sRoot;

#define MOUNTS_HASH_TABLE_SIZE 16
static MountTable* sMountsTable;
static dev_t sNextMountID = 1;

#define MAX_TEMP_IO_VECS 8

// How long to wait for busy vnodes (10s)
#define BUSY_VNODE_RETRIES 2000
#define BUSY_VNODE_DELAY 5000

mode_t __gUmask = 022;

/* function declarations */

static void free_unused_vnodes();

// file descriptor operation prototypes
static status_t file_read(struct file_descriptor* descriptor, off_t pos,
	void* buffer, size_t* _bytes);
static status_t file_write(struct file_descriptor* descriptor, off_t pos,
	const void* buffer, size_t* _bytes);
static off_t file_seek(struct file_descriptor* descriptor, off_t pos,
	int seekType);
static void file_free_fd(struct file_descriptor* descriptor);
static status_t file_close(struct file_descriptor* descriptor);
static status_t file_select(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync);
static status_t file_deselect(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync);
static status_t dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t dir_read(struct io_context* ioContext, struct vnode* vnode,
	void* cookie, struct dirent* buffer, size_t bufferSize, uint32* _count);
static status_t dir_rewind(struct file_descriptor* descriptor);
static void dir_free_fd(struct file_descriptor* descriptor);
static status_t dir_close(struct file_descriptor* descriptor);
static status_t attr_dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t attr_dir_rewind(struct file_descriptor* descriptor);
static void attr_dir_free_fd(struct file_descriptor* descriptor);
static status_t attr_dir_close(struct file_descriptor* descriptor);
static status_t attr_read(struct file_descriptor* descriptor, off_t pos,
	void* buffer, size_t* _bytes);
static status_t attr_write(struct file_descriptor* descriptor, off_t pos,
	const void* buffer, size_t* _bytes);
static off_t attr_seek(struct file_descriptor* descriptor, off_t pos,
	int seekType);
static void attr_free_fd(struct file_descriptor* descriptor);
static status_t attr_close(struct file_descriptor* descriptor);
static status_t attr_read_stat(struct file_descriptor* descriptor,
	struct stat* statData);
static status_t attr_write_stat(struct file_descriptor* descriptor,
	const struct stat* stat, int statMask);
static status_t index_dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t index_dir_rewind(struct file_descriptor* descriptor);
static void index_dir_free_fd(struct file_descriptor* descriptor);
static status_t index_dir_close(struct file_descriptor* descriptor);
static status_t query_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t query_rewind(struct file_descriptor* descriptor);
static void query_free_fd(struct file_descriptor* descriptor);
static status_t query_close(struct file_descriptor* descriptor);

static status_t common_ioctl(struct file_descriptor* descriptor, ulong op,
	void* buffer, size_t length);
static status_t common_read_stat(struct file_descriptor* descriptor,
	struct stat* statData);
static status_t common_write_stat(struct file_descriptor* descriptor,
	const struct stat* statData, int statMask);
static status_t common_path_read_stat(int fd, char* path, bool traverseLeafLink,
	struct stat* stat, bool kernel);

static status_t vnode_path_to_vnode(struct vnode* vnode, char* path,
	bool traverseLeafLink, int count, bool kernel,
	struct vnode** _vnode, ino_t* _parentID);
static status_t dir_vnode_to_path(struct vnode* vnode, char* buffer,
	size_t bufferSize, bool kernel);
static status_t fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
	struct vnode** _vnode, ino_t* _parentID, bool kernel);
static void inc_vnode_ref_count(struct vnode* vnode);
static status_t dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree,
	bool reenter);
static inline void put_vnode(struct vnode* vnode);
static status_t fs_unmount(char* path, dev_t mountID, uint32 flags,
	bool kernel);
static int open_vnode(struct vnode* vnode, int openMode, bool kernel);


static struct fd_ops sFileOps = {
	file_read,
	file_write,
	file_seek,
	common_ioctl,
	NULL,		// set_flags
	file_select,
	file_deselect,
	NULL,		// read_dir()
	NULL,		// rewind_dir()
	common_read_stat,
	common_write_stat,
	file_close,
	file_free_fd
};

static struct fd_ops sDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	dir_read,
	dir_rewind,
	common_read_stat,
	common_write_stat,
	dir_close,
	dir_free_fd
};

static struct fd_ops sAttributeDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	attr_dir_read,
	attr_dir_rewind,
	common_read_stat,
	common_write_stat,
	attr_dir_close,
	attr_dir_free_fd
};

static struct fd_ops sAttributeOps = {
	attr_read,
	attr_write,
	attr_seek,
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	NULL,		// read_dir()
	NULL,		// rewind_dir()
	attr_read_stat,
	attr_write_stat,
	attr_close,
	attr_free_fd
};

static struct fd_ops sIndexDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	index_dir_read,
	index_dir_rewind,
	NULL,		// read_stat()
	NULL,		// write_stat()
	index_dir_close,
	index_dir_free_fd
};

#if 0
static struct fd_ops sIndexOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	NULL,		// dir_read()
	NULL,		// dir_rewind()
	index_read_stat,	// read_stat()
	NULL,		// write_stat()
	NULL,		// dir_close()
	NULL		// free_fd()
};
#endif

static struct fd_ops sQueryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	query_read,
	query_rewind,
	NULL,		// read_stat()
	NULL,		// write_stat()
	query_close,
	query_free_fd
};


namespace {

class VNodePutter {
public:
	VNodePutter(struct vnode* vnode = NULL) : fVNode(vnode) {}

	~VNodePutter()
	{
		Put();
	}

	void SetTo(struct vnode* vnode)
	{
		Put();
		fVNode = vnode;
	}

	void Put()
	{
		if (fVNode) {
			put_vnode(fVNode);
			fVNode = NULL;
		}
	}

	struct vnode* Detach()
	{
		struct vnode* vnode = fVNode;
		fVNode = NULL;
		return vnode;
	}

private:
	struct vnode* fVNode;
};
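

// Typical VNodePutter usage -- a minimal sketch (locking and error details
// elided):
#if 0
	struct vnode* vnode;
	status_t status = get_vnode(mountID, vnodeID, &vnode, true, 0);
	if (status != B_OK)
		return status;
	VNodePutter vnodePutter(vnode);
	// ... use vnode; the reference is put automatically when the putter
	// goes out of scope. Call vnodePutter.Detach() to hand the reference
	// on instead.
#endif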


class FDCloser {
public:
	FDCloser() : fFD(-1), fKernel(true) {}

	FDCloser(int fd, bool kernel) : fFD(fd), fKernel(kernel) {}

	~FDCloser()
	{
		Close();
	}

	void SetTo(int fd, bool kernel)
	{
		Close();
		fFD = fd;
		fKernel = kernel;
	}

	void Close()
	{
		if (fFD >= 0) {
			if (fKernel)
				_kern_close(fFD);
			else
				_user_close(fFD);
			fFD = -1;
		}
	}

	int Detach()
	{
		int fd = fFD;
		fFD = -1;
		return fd;
	}

private:
	int		fFD;
	bool	fKernel;
};
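

// FDCloser guards a freshly acquired file descriptor until it can be handed
// over to the caller -- a minimal sketch:
#if 0
	int fd = /* some fd-producing call */;
	if (fd < 0)
		return fd;
	FDCloser fdCloser(fd, kernel);
	// ... further setup that may fail; the descriptor is closed
	// automatically on all error paths. On success:
	return fdCloser.Detach();
#endif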

} // namespace


#if VFS_PAGES_IO_TRACING

namespace VFSPagesIOTracing {

class PagesIOTraceEntry : public AbstractTraceEntry {
protected:
	PagesIOTraceEntry(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		fVnode(vnode),
		fMountID(vnode->mount->id),
		fNodeID(vnode->id),
		fCookie(cookie),
		fPos(pos),
		fCount(count),
		fFlags(flags),
		fBytesRequested(bytesRequested),
		fStatus(status),
		fBytesTransferred(bytesTransferred)
	{
		fVecs = (generic_io_vec*)alloc_tracing_buffer_memcpy(vecs,
			sizeof(generic_io_vec) * count, false);
	}

	void AddDump(TraceOutput& out, const char* mode)
	{
		out.Print("vfs pages io %5s: vnode: %p (%" B_PRId32 ", %" B_PRId64 "), "
			"cookie: %p, pos: %" B_PRIdOFF ", size: %" B_PRIu64 ", vecs: {",
			mode, fVnode, fMountID, fNodeID, fCookie, fPos,
			(uint64)fBytesRequested);

		if (fVecs != NULL) {
			for (uint32 i = 0; i < fCount; i++) {
				if (i > 0)
					out.Print(", ");
				out.Print("(%" B_PRIx64 ", %" B_PRIu64 ")", (uint64)fVecs[i].base,
					(uint64)fVecs[i].length);
			}
		}

		out.Print("}, flags: %#" B_PRIx32 " -> status: %#" B_PRIx32 ", "
			"transferred: %" B_PRIu64, fFlags, fStatus,
			(uint64)fBytesTransferred);
	}

protected:
	struct vnode*	fVnode;
	dev_t			fMountID;
	ino_t			fNodeID;
	void*			fCookie;
	off_t			fPos;
	generic_io_vec*	fVecs;
	uint32			fCount;
	uint32			fFlags;
	generic_size_t	fBytesRequested;
	status_t		fStatus;
	generic_size_t	fBytesTransferred;
};


class ReadPages : public PagesIOTraceEntry {
public:
	ReadPages(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
			bytesRequested, status, bytesTransferred)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		PagesIOTraceEntry::AddDump(out, "read");
	}
};


class WritePages : public PagesIOTraceEntry {
public:
	WritePages(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
			bytesRequested, status, bytesTransferred)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		PagesIOTraceEntry::AddDump(out, "write");
	}
};

}	// namespace VFSPagesIOTracing

#	define TPIO(x) new(std::nothrow) VFSPagesIOTracing::x;
#else
#	define TPIO(x) ;
#endif	// VFS_PAGES_IO_TRACING


/*! Finds the mounted device (the fs_mount structure) with the given ID.
	Note, you must hold sMountLock (at least read locked) when you call this
	function.
*/
static struct fs_mount*
find_mount(dev_t id)
{
	ASSERT_READ_LOCKED_RW_LOCK(&sMountLock);

	return sMountsTable->Lookup(id);
}


static status_t
get_mount(dev_t id, struct fs_mount** _mount)
{
	struct fs_mount* mount;

	ReadLocker nodeLocker(sVnodeLock);
	ReadLocker mountLocker(sMountLock);

	mount = find_mount(id);
	if (mount == NULL)
		return B_BAD_VALUE;

	struct vnode* rootNode = mount->root_vnode;
	if (mount->unmounting || rootNode == NULL || rootNode->IsBusy()
		|| rootNode->ref_count == 0) {
		// might have been called during a mount/unmount operation
		return B_BUSY;
	}

	inc_vnode_ref_count(rootNode);
	*_mount = mount;
	return B_OK;
}


static void
put_mount(struct fs_mount* mount)
{
	if (mount)
		put_vnode(mount->root_vnode);
}
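

// get_mount()/put_mount() bracket accesses to a mount -- a minimal sketch:
#if 0
	struct fs_mount* mount;
	status_t status = get_mount(id, &mount);
	if (status != B_OK)
		return status;
	// ... the mount cannot go away here, since we hold a reference to its
	// root vnode ...
	put_mount(mount);
#endif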


/*!	Tries to open the specified file system module.
	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1".
	Returns a pointer to the file system module interface, or NULL if it
	could not open the module.
*/
static file_system_module_info*
get_file_system(const char* fsName)
{
	char name[B_FILE_NAME_LENGTH];
	if (strncmp(fsName, "file_systems/", strlen("file_systems/"))) {
		// construct module name if we didn't get one
		// (we currently support only one API)
		snprintf(name, sizeof(name), "file_systems/%s/v1", fsName);
		fsName = NULL;
	}

	file_system_module_info* info;
	if (get_module(fsName ? fsName : name, (module_info**)&info) != B_OK)
		return NULL;

	return info;
}


/*!	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1"
	and returns a compatible fs_info.fsh_name name ("bfs" in both cases).
	The name is allocated for you, and you have to free() it when you're
	done with it.
	Returns NULL if the required memory is not available.
*/
static char*
get_file_system_name(const char* fsName)
{
	const size_t length = strlen("file_systems/");

	if (strncmp(fsName, "file_systems/", length)) {
		// the name already seems to be the module's file name
		return strdup(fsName);
	}

	fsName += length;
	const char* end = strchr(fsName, '/');
	if (end == NULL) {
		// this doesn't seem to be a valid name, but well...
		return strdup(fsName);
	}

	// cut off the trailing /v1

	char* name = (char*)malloc(end + 1 - fsName);
	if (name == NULL)
		return NULL;

	strlcpy(name, fsName, end + 1 - fsName);
	return name;
}
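

// Worked example for the size computation above: for "file_systems/bfs/v1",
// fsName points at "bfs/v1" after skipping the prefix and end points at
// "/v1", so end + 1 - fsName == 4 -- room for "bfs" plus the terminating
// NUL written by strlcpy().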


/*!	Accepts a list of file system names separated by colons, one for each
	layer, and returns the file system name for the specified layer.
	The name is allocated for you, and you have to free() it when you're
	done with it.
	Returns NULL if the required memory is not available or if there is no
	name for the specified layer.
*/
static char*
get_file_system_name_for_layer(const char* fsNames, int32 layer)
{
	while (layer >= 0) {
		const char* end = strchr(fsNames, ':');
		if (end == NULL) {
			if (layer == 0)
				return strdup(fsNames);
			return NULL;
		}

		if (layer == 0) {
			size_t length = end - fsNames + 1;
			char* result = (char*)malloc(length);
			if (result == NULL)
				return NULL;
			strlcpy(result, fsNames, length);
			return result;
		}

		fsNames = end + 1;
		layer--;
	}

	return NULL;
}
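

// E.g. for fsNames "first:second" (hypothetical layer names), layer 0 yields
// "first", layer 1 yields "second", and layer 2 yields NULL.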


static void
add_vnode_to_mount_list(struct vnode* vnode, struct fs_mount* mount)
{
	MutexLocker _(mount->lock);
	mount->vnodes.Add(vnode);
}


static void
remove_vnode_from_mount_list(struct vnode* vnode, struct fs_mount* mount)
{
	MutexLocker _(mount->lock);
	mount->vnodes.Remove(vnode);
}


/*!	\brief Looks up a vnode by mount and node ID in the sVnodeTable.

	The caller must hold the sVnodeLock (read lock at least).

	\param mountID the mount ID.
	\param vnodeID the node ID.

	\return The vnode structure, if it was found in the hash table, \c NULL
			otherwise.
*/
static struct vnode*
lookup_vnode(dev_t mountID, ino_t vnodeID)
{
	struct vnode_hash_key key;

	key.device = mountID;
	key.vnode = vnodeID;

	return sVnodeTable->Lookup(key);
}


/*!	\brief Checks whether or not a busy vnode should be waited for (again).

	This will also wait for BUSY_VNODE_DELAY before returning, if one should
	still wait for the vnode to become unbusy.

	\return \c true if one should retry, \c false if not.
*/
static bool
retry_busy_vnode(int32& tries, dev_t mountID, ino_t vnodeID)
{
	if (--tries < 0) {
		// vnode doesn't seem to become unbusy
		dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO
			" is not becoming unbusy!\n", mountID, vnodeID);
		return false;
	}
	snooze(BUSY_VNODE_DELAY);
	return true;
}
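

// Callers use retry_busy_vnode() in a loop of the form sketched below; with
// BUSY_VNODE_RETRIES == 2000 and BUSY_VNODE_DELAY == 5000 microseconds it
// gives up after roughly 10 seconds. (Minimal sketch -- see get_vnode() for
// a real use including the necessary locking.)
#if 0
	int32 tries = BUSY_VNODE_RETRIES;
	while (vnode->IsBusy()) {
		if (!retry_busy_vnode(tries, mountID, vnodeID))
			return B_BUSY;
	}
#endif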


/*!	Creates a new vnode with the given mount and node ID.
	If the node already exists, it is returned instead and no new node is
	created. In either case -- but not if an error occurs -- the function write
	locks \c sVnodeLock and keeps it locked for the caller when returning. On
	error the lock is not held on return.

	\param mountID The mount ID.
	\param vnodeID The vnode ID.
	\param _vnode Will be set to the new vnode on success.
	\param _nodeCreated Will be set to \c true when the returned vnode has
		been newly created, \c false when it already existed. Will not be
		changed on error.
	\return \c B_OK, when the vnode was successfully created and inserted or
		a node with the given ID was found, \c B_NO_MEMORY or
		\c B_ENTRY_NOT_FOUND on error.
*/
static status_t
create_new_vnode_and_lock(dev_t mountID, ino_t vnodeID, struct vnode*& _vnode,
	bool& _nodeCreated)
{
	FUNCTION(("create_new_vnode_and_lock()\n"));

	struct vnode* vnode = (struct vnode*)malloc(sizeof(struct vnode));
	if (vnode == NULL)
		return B_NO_MEMORY;

	// initialize basic values
	memset(vnode, 0, sizeof(struct vnode));
	vnode->device = mountID;
	vnode->id = vnodeID;
	vnode->ref_count = 1;
	vnode->SetBusy(true);

	// look up the node -- it might have been added by someone else in the
	// meantime
	rw_lock_write_lock(&sVnodeLock);
	struct vnode* existingVnode = lookup_vnode(mountID, vnodeID);
	if (existingVnode != NULL) {
		free(vnode);
		_vnode = existingVnode;
		_nodeCreated = false;
		return B_OK;
	}

	// get the mount structure
	rw_lock_read_lock(&sMountLock);
	vnode->mount = find_mount(mountID);
	if (!vnode->mount || vnode->mount->unmounting) {
		rw_lock_read_unlock(&sMountLock);
		rw_lock_write_unlock(&sVnodeLock);
		free(vnode);
		return B_ENTRY_NOT_FOUND;
	}

	// add the vnode to the mount's node list and the hash table
	sVnodeTable->Insert(vnode);
	add_vnode_to_mount_list(vnode, vnode->mount);

	rw_lock_read_unlock(&sMountLock);

	_vnode = vnode;
	_nodeCreated = true;

	// keep the vnode lock locked
	return B_OK;
}


/*!	Frees the vnode and all resources it has acquired, and removes
	it from the vnode hash as well as from its mount structure.
	Will also make sure that any cache modifications are written back.
*/
static void
free_vnode(struct vnode* vnode, bool reenter)
{
	ASSERT_PRINT(vnode->ref_count == 0 && vnode->IsBusy(), "vnode: %p\n",
		vnode);
	ASSERT_PRINT(vnode->advisory_locking == NULL, "vnode: %p\n", vnode);

	// write back any changes in this vnode's cache -- but only
	// if the vnode won't be deleted, in which case the changes
	// will be discarded

	if (!vnode->IsRemoved() && HAS_FS_CALL(vnode, fsync))
		FS_CALL_NO_PARAMS(vnode, fsync);

	// Note: If this vnode has a cache attached, there will still be two
	// references to that cache at this point. The last one belongs to the vnode
	// itself (cf. vfs_get_vnode_cache()) and one belongs to the node's file
	// cache. Each but the last reference to a cache also includes a reference
	// to the vnode. The file cache, however, released its reference (cf.
	// file_cache_create()), so that this vnode's ref count gets a chance to
	// drop to 0 at all. Deleting the file cache now will cause the next to last
	// cache reference to be released, which will also release a (no longer
	// existing) vnode reference. To avoid problems, we set the vnode's ref
	// count, so that it will neither become negative nor 0.
	vnode->ref_count = 2;

	if (!vnode->IsUnpublished()) {
		if (vnode->IsRemoved())
			FS_CALL(vnode, remove_vnode, reenter);
		else
			FS_CALL(vnode, put_vnode, reenter);
	}

	// If the vnode has a VMCache attached, make sure that it won't try to get
	// another reference via VMVnodeCache::AcquireUnreferencedStoreRef(). As
	// long as the vnode is busy and in the hash, that won't happen, but as
	// soon as we've removed it from the hash, it could reload the vnode -- with
	// a new cache attached!
	if (vnode->cache != NULL && vnode->cache->type == CACHE_TYPE_VNODE)
		((VMVnodeCache*)vnode->cache)->VnodeDeleted();

	// The file system has removed the resources of the vnode now, so we can
	// make it available again (by removing the busy vnode from the hash).
	rw_lock_write_lock(&sVnodeLock);
	sVnodeTable->Remove(vnode);
	rw_lock_write_unlock(&sVnodeLock);

	// if we have a VMCache attached, remove it
	if (vnode->cache)
		vnode->cache->ReleaseRef();

	vnode->cache = NULL;

	remove_vnode_from_mount_list(vnode, vnode->mount);

	free(vnode);
}


/*!	\brief Decrements the reference counter of the given vnode and deletes it,
	if the counter dropped to 0.

	The caller must, of course, own a reference to the vnode to call this
	function.
	The caller must not hold the sVnodeLock or the sMountLock.

	\param vnode the vnode.
	\param alwaysFree don't move this vnode into the unused list, but really
		   delete it if possible.
	\param reenter \c true, if this function is called (indirectly) from within
		   a file system. This will be passed to file system hooks only.
	\return \c B_OK, if everything went fine, an error code otherwise.
*/
static status_t
dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree, bool reenter)
{
	ReadLocker locker(sVnodeLock);
	AutoLocker<Vnode> nodeLocker(vnode);

	int32 oldRefCount = atomic_add(&vnode->ref_count, -1);

	ASSERT_PRINT(oldRefCount > 0, "vnode %p\n", vnode);

	TRACE(("dec_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
		vnode->ref_count));

	if (oldRefCount != 1)
		return B_OK;

	if (vnode->IsBusy())
		panic("dec_vnode_ref_count: called on busy vnode %p\n", vnode);

	bool freeNode = false;
	bool freeUnusedNodes = false;

	// Just insert the vnode into an unused list if we don't need
	// to delete it
	if (vnode->IsRemoved() || alwaysFree) {
		vnode_to_be_freed(vnode);
		vnode->SetBusy(true);
		freeNode = true;
	} else
		freeUnusedNodes = vnode_unused(vnode);

	nodeLocker.Unlock();
	locker.Unlock();

	if (freeNode)
		free_vnode(vnode, reenter);
	else if (freeUnusedNodes)
		free_unused_vnodes();

	return B_OK;
}


/*!	\brief Increments the reference counter of the given vnode.

	The caller must make sure that the node isn't deleted while this function
	is called. This can be done either:
	- by ensuring that a reference to the node exists and remains in existence,
	  or
	- by holding the vnode's lock (which also requires read locking sVnodeLock)
	  or by holding sVnodeLock write locked.

	In the second case the caller is responsible for dealing with the ref count
	0 -> 1 transition. That is, 1. this function must not be invoked when the
	node is busy in the first place and 2. vnode_used() must be called for the
	node.

	\param vnode the vnode.
*/
static void
inc_vnode_ref_count(struct vnode* vnode)
{
	atomic_add(&vnode->ref_count, 1);
	TRACE(("inc_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
		vnode->ref_count));
}


static bool
is_special_node_type(int type)
{
	// at the moment only FIFOs are supported
	return S_ISFIFO(type);
}


static status_t
create_special_sub_node(struct vnode* vnode, uint32 flags)
{
	if (S_ISFIFO(vnode->Type()))
		return create_fifo_vnode(vnode->mount->volume, vnode);

	return B_BAD_VALUE;
}


/*!	\brief Retrieves a vnode for a given mount ID, node ID pair.

	If the node is not yet in memory, it will be loaded.

	The caller must not hold the sVnodeLock or the sMountLock.

	\param mountID the mount ID.
	\param vnodeID the node ID.
	\param _vnode Pointer to a vnode* variable into which the pointer to the
		   retrieved vnode structure shall be written.
	\param canWait If \c false, the function returns \c B_BUSY right away
		   instead of waiting for a busy vnode to become unbusy.
	\param reenter \c true, if this function is called (indirectly) from within
		   a file system.
	\return \c B_OK, if everything went fine, an error code otherwise.
*/
static status_t
get_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode, bool canWait,
	int reenter)
{
	FUNCTION(("get_vnode: mountid %" B_PRId32 " vnid 0x%" B_PRIx64 " %p\n",
		mountID, vnodeID, _vnode));

	rw_lock_read_lock(&sVnodeLock);

	int32 tries = BUSY_VNODE_RETRIES;
restart:
	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
	AutoLocker<Vnode> nodeLocker(vnode);

	if (vnode && vnode->IsBusy()) {
		nodeLocker.Unlock();
		rw_lock_read_unlock(&sVnodeLock);
		if (!canWait) {
			dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO " is busy!\n",
				mountID, vnodeID);
			return B_BUSY;
		}
		if (!retry_busy_vnode(tries, mountID, vnodeID))
			return B_BUSY;

		rw_lock_read_lock(&sVnodeLock);
		goto restart;
	}

	TRACE(("get_vnode: tried to lookup vnode, got %p\n", vnode));

	status_t status;

	if (vnode) {
		if (vnode->ref_count == 0) {
			// this vnode has been unused before
			vnode_used(vnode);
		}
		inc_vnode_ref_count(vnode);

		nodeLocker.Unlock();
		rw_lock_read_unlock(&sVnodeLock);
	} else {
		// we need to create a new vnode and read it in
		rw_lock_read_unlock(&sVnodeLock);
			// unlock -- create_new_vnode_and_lock() write-locks on success
		bool nodeCreated;
		status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
			nodeCreated);
		if (status != B_OK)
			return status;

		if (!nodeCreated) {
			rw_lock_read_lock(&sVnodeLock);
			rw_lock_write_unlock(&sVnodeLock);
			goto restart;
		}

		rw_lock_write_unlock(&sVnodeLock);

		int type;
		uint32 flags;
		status = FS_MOUNT_CALL(vnode->mount, get_vnode, vnodeID, vnode, &type,
			&flags, reenter);
		if (status == B_OK && vnode->private_node == NULL)
			status = B_BAD_VALUE;

		bool gotNode = status == B_OK;
		bool publishSpecialSubNode = false;
		if (gotNode) {
			vnode->SetType(type);
			publishSpecialSubNode = is_special_node_type(type)
				&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
		}

		if (gotNode && publishSpecialSubNode)
			status = create_special_sub_node(vnode, flags);

		if (status != B_OK) {
			if (gotNode)
				FS_CALL(vnode, put_vnode, reenter);

			rw_lock_write_lock(&sVnodeLock);
			sVnodeTable->Remove(vnode);
			remove_vnode_from_mount_list(vnode, vnode->mount);
			rw_lock_write_unlock(&sVnodeLock);

			free(vnode);
			return status;
		}

		rw_lock_read_lock(&sVnodeLock);
		vnode->Lock();

		vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
		vnode->SetBusy(false);

		vnode->Unlock();
		rw_lock_read_unlock(&sVnodeLock);
	}

	TRACE(("get_vnode: returning %p\n", vnode));

	*_vnode = vnode;
	return B_OK;
}


/*!	\brief Decrements the reference counter of the given vnode and deletes it,
	if the counter dropped to 0.

	The caller must, of course, own a reference to the vnode to call this
	function.
	The caller must not hold the sVnodeLock or the sMountLock.

	\param vnode the vnode.
*/
static inline void
put_vnode(struct vnode* vnode)
{
	dec_vnode_ref_count(vnode, false, false);
}


static void
free_unused_vnodes(int32 level)
{
	unused_vnodes_check_started();

	if (level == B_NO_LOW_RESOURCE) {
		unused_vnodes_check_done();
		return;
	}

	flush_hot_vnodes();

	// determine how many nodes to free
	uint32 count = 1;
	{
		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);

		switch (level) {
			case B_LOW_RESOURCE_NOTE:
				count = sUnusedVnodes / 100;
				break;
			case B_LOW_RESOURCE_WARNING:
				count = sUnusedVnodes / 10;
				break;
			case B_LOW_RESOURCE_CRITICAL:
				count = sUnusedVnodes;
				break;
		}

		if (count > sUnusedVnodes)
			count = sUnusedVnodes;
	}

	// Write back the modified pages of some unused vnodes and free them.

	for (uint32 i = 0; i < count; i++) {
		ReadLocker vnodesReadLocker(sVnodeLock);

		// get the first node
		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
		struct vnode* vnode = (struct vnode*)list_get_first_item(
			&sUnusedVnodeList);
		unusedVnodesLocker.Unlock();

		if (vnode == NULL)
			break;

		// lock the node
		AutoLocker<Vnode> nodeLocker(vnode);

		// Check whether the node is still unused -- since we only append to the
		// tail of the unused queue, the vnode should still be at its head.
		// Alternatively we could check its ref count for 0 and its busy flag,
		// but if the node is no longer at the head of the queue, it means it
		// has been touched in the meantime, i.e. it is no longer the least
		// recently used unused vnode and we'd rather not free it.
		unusedVnodesLocker.Lock();
		if (vnode != list_get_first_item(&sUnusedVnodeList))
			continue;
		unusedVnodesLocker.Unlock();

		ASSERT(!vnode->IsBusy());

		// grab a reference
		inc_vnode_ref_count(vnode);
		vnode_used(vnode);

		// write back changes and free the node
		nodeLocker.Unlock();
		vnodesReadLocker.Unlock();

		if (vnode->cache != NULL)
			vnode->cache->WriteModified();

		dec_vnode_ref_count(vnode, true, false);
			// this should free the vnode when it's still unused
	}

	unused_vnodes_check_done();
}


/*!	Gets the vnode the given vnode is covering.

	The caller must have \c sVnodeLock read-locked at least.

	The function returns a reference to the retrieved vnode (if any), which
	the caller is responsible for releasing.

	\param vnode The vnode whose covered node shall be returned.
	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
		vnode.
*/
static inline Vnode*
get_covered_vnode_locked(Vnode* vnode)
{
	if (Vnode* coveredNode = vnode->covers) {
		while (coveredNode->covers != NULL)
			coveredNode = coveredNode->covers;

		inc_vnode_ref_count(coveredNode);
		return coveredNode;
	}

	return NULL;
}


/*!	Gets the vnode the given vnode is covering.

	The caller must not hold \c sVnodeLock. Note that this implies a race
	condition, since the situation can change at any time.

	The function returns a reference to the retrieved vnode (if any), which
	the caller is responsible for releasing.

	\param vnode The vnode whose covered node shall be returned.
	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
		vnode.
*/
static inline Vnode*
get_covered_vnode(Vnode* vnode)
{
	if (!vnode->IsCovering())
		return NULL;

	ReadLocker vnodeReadLocker(sVnodeLock);
	return get_covered_vnode_locked(vnode);
}


/*!	Gets the vnode the given vnode is covered by.

	The caller must have \c sVnodeLock read-locked at least.

	The function returns a reference to the retrieved vnode (if any), which
	the caller is responsible for releasing.

	\param vnode The vnode whose covering node shall be returned.
	\return The covering vnode, or \c NULL if the given vnode isn't covered by
		any vnode.
*/
static Vnode*
get_covering_vnode_locked(Vnode* vnode)
{
	if (Vnode* coveringNode = vnode->covered_by) {
		while (coveringNode->covered_by != NULL)
			coveringNode = coveringNode->covered_by;

		inc_vnode_ref_count(coveringNode);
		return coveringNode;
	}

	return NULL;
}


/*!	Gets the vnode the given vnode is covered by.

	The caller must not hold \c sVnodeLock. Note that this implies a race
	condition, since the situation can change at any time.

	The function returns a reference to the retrieved vnode (if any), which
	the caller is responsible for releasing.

	\param vnode The vnode whose covering node shall be returned.
	\return The covering vnode, or \c NULL if the given vnode isn't covered by
		any vnode.
*/
static inline Vnode*
get_covering_vnode(Vnode* vnode)
{
	if (!vnode->IsCovered())
		return NULL;

	ReadLocker vnodeReadLocker(sVnodeLock);
	return get_covering_vnode_locked(vnode);
}


static void
free_unused_vnodes()
{
	free_unused_vnodes(
		low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
			| B_KERNEL_RESOURCE_ADDRESS_SPACE));
}


static void
vnode_low_resource_handler(void* /*data*/, uint32 resources, int32 level)
{
	TRACE(("vnode_low_resource_handler(level = %" B_PRId32 ")\n", level));

	free_unused_vnodes(level);
}


static inline void
put_advisory_locking(struct advisory_locking* locking)
{
	release_sem(locking->lock);
}


/*!	Returns the advisory_locking object of the \a vnode in case it
	has one, and locks it.
	You have to call put_advisory_locking() when you're done with
	it.
	Note, you must not have the vnode mutex locked when calling
	this function.
*/
static struct advisory_locking*
get_advisory_locking(struct vnode* vnode)
{
	rw_lock_read_lock(&sVnodeLock);
	vnode->Lock();

	struct advisory_locking* locking = vnode->advisory_locking;
	sem_id lock = locking != NULL ? locking->lock : B_ERROR;

	vnode->Unlock();
	rw_lock_read_unlock(&sVnodeLock);

	if (lock >= 0)
		lock = acquire_sem(lock);
	if (lock < 0) {
		// This means the locking has been deleted in the meantime
		// or had never existed in the first place - otherwise, we
		// would get the lock at some point.
		return NULL;
	}

	return locking;
}


/*!	Creates a locked advisory_locking object, and attaches it to the
	given \a vnode.
	Returns B_OK in case of success -- even if the vnode got such an
	object from someone else in the meantime, you'll still get this
	one locked then.
*/
static status_t
create_advisory_locking(struct vnode* vnode)
{
	if (vnode == NULL)
		return B_FILE_ERROR;

	ObjectDeleter<advisory_locking> lockingDeleter;
	struct advisory_locking* locking = NULL;

	while (get_advisory_locking(vnode) == NULL) {
		// no locking object set on the vnode yet, create one
		if (locking == NULL) {
			locking = new(std::nothrow) advisory_locking;
			if (locking == NULL)
				return B_NO_MEMORY;
			lockingDeleter.SetTo(locking);

			locking->wait_sem = create_sem(0, "advisory lock");
			if (locking->wait_sem < 0)
				return locking->wait_sem;

			locking->lock = create_sem(0, "advisory locking");
			if (locking->lock < 0)
				return locking->lock;
		}

		// set our newly created locking object
		ReadLocker _(sVnodeLock);
		AutoLocker<Vnode> nodeLocker(vnode);
		if (vnode->advisory_locking == NULL) {
			vnode->advisory_locking = locking;
			lockingDeleter.Detach();
			return B_OK;
		}
	}

	// The vnode already had a locking object. That's just as well.

	return B_OK;
}


/*! Returns \c true when either \a flock is \c NULL or \a flock intersects
	with the advisory_lock \a lock.
*/
static bool
advisory_lock_intersects(struct advisory_lock* lock, struct flock* flock)
{
	if (flock == NULL)
		return true;

	return lock->start <= flock->l_start - 1 + flock->l_len
		&& lock->end >= flock->l_start;
}
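

// Worked example: a flock with l_start 100 and l_len 50 covers the inclusive
// range [100, 149], so a lock intersects it iff
// lock->start <= 149 && lock->end >= 100 -- exactly the condition above.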


/*!	Tests whether acquiring a lock would block.
*/
static status_t
test_advisory_lock(struct vnode* vnode, struct flock* flock)
{
	const int requestedType = flock->l_type;
		// save the requested type -- l_type is overwritten below to report
		// the result
	flock->l_type = F_UNLCK;

	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking == NULL)
		return B_OK;

	team_id team = team_get_current_team_id();

	LockList::Iterator iterator = locking->locks.GetIterator();
	while (iterator.HasNext()) {
		struct advisory_lock* lock = iterator.Next();

		if (lock->team != team && advisory_lock_intersects(lock, flock)) {
			// locks do overlap
			if (requestedType != F_RDLCK || !lock->shared) {
				// collision
				flock->l_type = lock->shared ? F_RDLCK : F_WRLCK;
				flock->l_whence = SEEK_SET;
				flock->l_start = lock->start;
				flock->l_len = lock->end - lock->start + 1;
				flock->l_pid = lock->team;
				break;
			}
		}
	}

	put_advisory_locking(locking);
	return B_OK;
}


/*!	Removes the specified lock, or all locks of the calling team
	if \a flock is NULL.
*/
static status_t
release_advisory_lock(struct vnode* vnode, struct io_context* context,
	struct file_descriptor* descriptor, struct flock* flock)
{
	FUNCTION(("release_advisory_lock(vnode = %p, flock = %p)\n", vnode, flock));

	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking == NULL)
		return B_OK;

	// find matching lock entries

	LockList::Iterator iterator = locking->locks.GetIterator();
	while (iterator.HasNext()) {
		struct advisory_lock* lock = iterator.Next();
		bool removeLock = false;

		if (descriptor != NULL && lock->bound_to == descriptor) {
			// Remove flock() locks
			removeLock = true;
		} else if (lock->bound_to == context
				&& advisory_lock_intersects(lock, flock)) {
			// Remove POSIX locks
			bool endsBeyond = false;
			bool startsBefore = false;
			if (flock != NULL) {
				startsBefore = lock->start < flock->l_start;
				endsBeyond = lock->end > flock->l_start - 1 + flock->l_len;
			}

			if (!startsBefore && !endsBeyond) {
				// lock is completely contained in flock
				removeLock = true;
			} else if (startsBefore && !endsBeyond) {
				// cut the end of the lock
				lock->end = flock->l_start - 1;
			} else if (!startsBefore && endsBeyond) {
				// cut the start of the lock
				lock->start = flock->l_start + flock->l_len;
			} else {
				// divide the lock into two locks
				struct advisory_lock* secondLock = (struct advisory_lock*)
					malloc(sizeof(struct advisory_lock));
				if (secondLock == NULL) {
					// TODO: we should probably revert the locks we already
					// changed... (ie. allocate upfront)
					put_advisory_locking(locking);
					return B_NO_MEMORY;
				}

				secondLock->bound_to = context;
				secondLock->team = lock->team;
				secondLock->session = lock->session;
				// values must already be normalized when getting here
				secondLock->start = flock->l_start + flock->l_len;
				secondLock->end = lock->end;
				secondLock->shared = lock->shared;

				// only now cut the end of the first lock, after its original
				// end has been copied into the second lock
				lock->end = flock->l_start - 1;

				locking->locks.Add(secondLock);
			}
		}

		if (removeLock) {
			// this lock is no longer used
			iterator.Remove();
			free(lock);
		}
	}

	bool removeLocking = locking->locks.IsEmpty();
	release_sem_etc(locking->wait_sem, 1, B_RELEASE_ALL);

	put_advisory_locking(locking);

	if (removeLocking) {
		// We can remove the whole advisory locking structure; it's no
		// longer used
		locking = get_advisory_locking(vnode);
		if (locking != NULL) {
			ReadLocker locker(sVnodeLock);
			AutoLocker<Vnode> nodeLocker(vnode);

			// the locking could have been changed in the meantime
			if (locking->locks.IsEmpty()) {
				vnode->advisory_locking = NULL;
				nodeLocker.Unlock();
				locker.Unlock();

				// we've detached the locking from the vnode, so we can
				// safely delete it
				delete locking;
			} else {
				// the locking is in use again
				nodeLocker.Unlock();
				locker.Unlock();
				release_sem_etc(locking->lock, 1, B_DO_NOT_RESCHEDULE);
			}
		}
	}

	return B_OK;
}
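

// Illustration of the splitting case above: releasing the range [400, 599]
// from a held lock [0, 999] truncates the existing lock to [0, 399] and adds
// a second lock [600, 999] with the same owner and mode.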


/*!	Acquires an advisory lock for the \a vnode. If \a wait is \c true, it
	will wait for the lock to become available, if there are any collisions
	(it will return B_PERMISSION_DENIED in this case if \a wait is \c false).

	If \a descriptor is NULL, POSIX semantics are used for this lock. Otherwise,
	BSD flock() semantics are used, that is, all children can unlock the file
	in question (we even allow parents to remove the lock, though, but that
	seems to be in line with what the BSDs are doing).
*/
static status_t
acquire_advisory_lock(struct vnode* vnode, io_context* context,
	struct file_descriptor* descriptor, struct flock* flock, bool wait)
{
	FUNCTION(("acquire_advisory_lock(vnode = %p, flock = %p, wait = %s)\n",
		vnode, flock, wait ? "yes" : "no"));

	bool shared = flock->l_type == F_RDLCK;
	void* boundTo = descriptor != NULL ? (void*)descriptor : (void*)context;
	status_t status = B_OK;

	// TODO: do deadlock detection!

	struct advisory_locking* locking;

	while (true) {
		// if this vnode has an advisory_locking structure attached,
		// lock that one and search for any colliding file lock
		status = create_advisory_locking(vnode);
		if (status != B_OK)
			return status;

		locking = vnode->advisory_locking;
		team_id team = team_get_current_team_id();
		sem_id waitForLock = -1;

		// test for collisions
		LockList::Iterator iterator = locking->locks.GetIterator();
		while (iterator.HasNext()) {
			struct advisory_lock* lock = iterator.Next();

			// TODO: locks from the same team might be joinable!
			if ((lock->team != team || lock->bound_to != boundTo)
					&& advisory_lock_intersects(lock, flock)) {
				// locks do overlap
				if (!shared || !lock->shared) {
					// we need to wait
					waitForLock = locking->wait_sem;
					break;
				}
			}
		}

		if (waitForLock < 0)
			break;

		// We need to wait. Do that or fail now, if we've been asked not to.

		if (!wait) {
			put_advisory_locking(locking);
			return descriptor != NULL ? B_WOULD_BLOCK : B_PERMISSION_DENIED;
		}

		status = switch_sem_etc(locking->lock, waitForLock, 1,
			B_CAN_INTERRUPT, 0);
		if (status != B_OK && status != B_BAD_SEM_ID)
			return status;

		// We have been notified, but we need to re-lock the locking object. So
		// go another round...
	}

	// install new lock

	struct advisory_lock* lock = (struct advisory_lock*)malloc(
		sizeof(struct advisory_lock));
	if (lock == NULL) {
		put_advisory_locking(locking);
		return B_NO_MEMORY;
	}

	lock->bound_to = boundTo;
	lock->team = team_get_current_team_id();
	lock->session = thread_get_current_thread()->team->session_id;
	// values must already be normalized when getting here
	lock->start = flock->l_start;
	lock->end = flock->l_start - 1 + flock->l_len;
	lock->shared = shared;

	locking->locks.Add(lock);
	put_advisory_locking(locking);

	return status;
}
1836 
1837 
1838 /*!	Normalizes the \a flock structure to make it easier to compare the
1839 	structure with others. The l_start and l_len fields are set to absolute
1840 	values according to the l_whence field.
1841 */
1842 static status_t
1843 normalize_flock(struct file_descriptor* descriptor, struct flock* flock)
1844 {
1845 	switch (flock->l_whence) {
1846 		case SEEK_SET:
1847 			break;
1848 		case SEEK_CUR:
1849 			flock->l_start += descriptor->pos;
1850 			break;
1851 		case SEEK_END:
1852 		{
1853 			struct vnode* vnode = descriptor->u.vnode;
1854 			struct stat stat;
1855 			status_t status;
1856 
1857 			if (!HAS_FS_CALL(vnode, read_stat))
1858 				return B_UNSUPPORTED;
1859 
1860 			status = FS_CALL(vnode, read_stat, &stat);
1861 			if (status != B_OK)
1862 				return status;
1863 
1864 			flock->l_start += stat.st_size;
1865 			break;
1866 		}
1867 		default:
1868 			return B_BAD_VALUE;
1869 	}
1870 
1871 	if (flock->l_start < 0)
1872 		flock->l_start = 0;
1873 	if (flock->l_len == 0)
1874 		flock->l_len = OFF_MAX;
1875 
1876 	// don't let the offset and length overflow
1877 	if (flock->l_start > 0 && OFF_MAX - flock->l_start < flock->l_len)
1878 		flock->l_len = OFF_MAX - flock->l_start;
1879 
1880 	if (flock->l_len < 0) {
1881 		// a negative length reverses the region
1882 		flock->l_start += flock->l_len;
1883 		flock->l_len = -flock->l_len;
1884 	}
1885 
1886 	return B_OK;
1887 }
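
/*	Editor's note: a worked example of the normalization above, assuming a
	descriptor positioned at offset 100:

		input:  l_whence = SEEK_CUR, l_start = 20, l_len = -50
		step 1: SEEK_CUR adds the position        -> l_start = 120
		step 2: negative length reverses the region -> l_start = 70, l_len = 50

	The lock thus covers bytes [70, 119]. An l_len of 0 would instead have
	been expanded to OFF_MAX, i.e. "to the end of the file, forever".
*/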
1888 
1889 
1890 static void
1891 replace_vnode_if_disconnected(struct fs_mount* mount,
1892 	struct vnode* vnodeToDisconnect, struct vnode*& vnode,
1893 	struct vnode* fallBack, bool lockRootLock)
1894 {
1895 	struct vnode* givenVnode = vnode;
1896 	bool vnodeReplaced = false;
1897 
1898 	ReadLocker vnodeReadLocker(sVnodeLock);
1899 
1900 	if (lockRootLock)
1901 		mutex_lock(&sIOContextRootLock);
1902 
1903 	while (vnode != NULL && vnode->mount == mount
1904 		&& (vnodeToDisconnect == NULL || vnodeToDisconnect == vnode)) {
1905 		if (vnode->covers != NULL) {
1906 			// redirect the vnode to the covered vnode
1907 			vnode = vnode->covers;
1908 		} else
1909 			vnode = fallBack;
1910 
1911 		vnodeReplaced = true;
1912 	}
1913 
1914 	// If we've replaced the node, grab a reference for the new one.
1915 	if (vnodeReplaced && vnode != NULL)
1916 		inc_vnode_ref_count(vnode);
1917 
1918 	if (lockRootLock)
1919 		mutex_unlock(&sIOContextRootLock);
1920 
1921 	vnodeReadLocker.Unlock();
1922 
1923 	if (vnodeReplaced)
1924 		put_vnode(givenVnode);
1925 }
1926 
1927 
1928 /*!	Disconnects all file descriptors that are associated with the
1929 	\a vnodeToDisconnect, or if this is NULL, all vnodes of the specified
1930 	\a mount object.
1931 
1932 	Note that after you've called this function, there might still be ongoing
1933 	accesses - they won't be interrupted if they were already in progress.
1934 	However, any subsequent access will fail.
1935 
1936 	This is not a cheap function and should be used with care and rarely.
1937 	TODO: there is currently no means to stop a blocking read/write!
1938 */
1939 static void
1940 disconnect_mount_or_vnode_fds(struct fs_mount* mount,
1941 	struct vnode* vnodeToDisconnect)
1942 {
1943 	// iterate over all teams and peek into their file descriptors
1944 	TeamListIterator teamIterator;
1945 	while (Team* team = teamIterator.Next()) {
1946 		BReference<Team> teamReference(team, true);
1947 		TeamLocker teamLocker(team);
1948 
1949 		// lock the I/O context
1950 		io_context* context = team->io_context;
1951 		if (context == NULL)
1952 			continue;
1953 		MutexLocker contextLocker(context->io_mutex);
1954 
1955 		teamLocker.Unlock();
1956 
1957 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->root,
1958 			sRoot, true);
1959 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->cwd,
1960 			sRoot, false);
1961 
1962 		for (uint32 i = 0; i < context->table_size; i++) {
1963 			struct file_descriptor* descriptor = context->fds[i];
1964 			if (descriptor == NULL || (descriptor->open_mode & O_DISCONNECTED) != 0)
1965 				continue;
1966 
1967 			inc_fd_ref_count(descriptor);
1968 
1969 			// if this descriptor points at this mount, we
1970 			// need to disconnect it to be able to unmount
1971 			struct vnode* vnode = fd_vnode(descriptor);
1972 			if (vnodeToDisconnect != NULL) {
1973 				if (vnode == vnodeToDisconnect)
1974 					disconnect_fd(descriptor);
1975 			} else if ((vnode != NULL && vnode->mount == mount)
1976 				|| (vnode == NULL && descriptor->u.mount == mount))
1977 				disconnect_fd(descriptor);
1978 
1979 			put_fd(descriptor);
1980 		}
1981 	}
1982 }
1983 
1984 
1985 /*!	\brief Gets the root node of the current IO context.
1986 	If \a kernel is \c true, the kernel IO context will be used.
1987 	The caller obtains a reference to the returned node.
1988 */
1989 struct vnode*
1990 get_root_vnode(bool kernel)
1991 {
1992 	if (!kernel) {
1993 		// Get the root vnode from the IO context
1994 		struct io_context* context = get_current_io_context(kernel);
1995 
1996 		mutex_lock(&sIOContextRootLock);
1997 
1998 		struct vnode* root = context->root;
1999 		if (root != NULL)
2000 			inc_vnode_ref_count(root);
2001 
2002 		mutex_unlock(&sIOContextRootLock);
2003 
2004 		if (root != NULL)
2005 			return root;
2006 
2007 		// That should never happen.
2008 		dprintf("get_root_vnode(): IO context for team %" B_PRId32 " doesn't "
2009 			"have a root\n", team_get_current_team_id());
2010 	}
2011 
2012 	inc_vnode_ref_count(sRoot);
2013 	return sRoot;
2014 }
2015 
2016 
2017 /*!	\brief Gets the directory path and leaf name for a given path.
2018 
2019 	The supplied \a path is transformed to refer to the directory part of
2020 	the entry identified by the original path, and the leaf name of the
2021 	original entry is written into the buffer \a filename.
2022 	Neither the returned path nor the leaf name can be expected to be
2023 	canonical.
2024 
2025 	\param path The path to be analyzed. Must be able to store at least one
2026 		   additional character.
2027 	\param filename The buffer into which the leaf name will be written.
2028 		   Must be of size B_FILE_NAME_LENGTH at least.
2029 	\return \c B_OK, if everything went fine, \c B_NAME_TOO_LONG, if the leaf
2030 		   name is longer than \c B_FILE_NAME_LENGTH, or \c B_ENTRY_NOT_FOUND,
2031 		   if the given path name is empty.
2032 */
2033 static status_t
2034 get_dir_path_and_leaf(char* path, char* filename)
2035 {
2036 	if (*path == '\0')
2037 		return B_ENTRY_NOT_FOUND;
2038 
2039 	char* last = strrchr(path, '/');
2040 		// '/' is not allowed in file names!
2041 
2042 	FUNCTION(("get_dir_path_and_leaf(path = %s)\n", path));
2043 
2044 	if (last == NULL) {
2045 		// this path is a single segment with no '/' in it,
2046 		// e.g. "foo"
2047 		if (strlcpy(filename, path, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2048 			return B_NAME_TOO_LONG;
2049 
2050 		strcpy(path, ".");
2051 	} else {
2052 		last++;
2053 		if (last[0] == '\0') {
2054 			// special case: the path ends in one or more '/' - remove them
2055 			while (*--last == '/' && last != path);
2056 			last[1] = '\0';
2057 
2058 			if (last == path && last[0] == '/') {
2059 				// This path points to the root of the file system
2060 				strcpy(filename, ".");
2061 				return B_OK;
2062 			}
2063 			for (; last != path && *(last - 1) != '/'; last--);
2064 				// rewind to the start of the leaf before the '/'
2065 		}
2066 
2067 		// normal leaf: replace the leaf portion of the path with a '.'
2068 		if (strlcpy(filename, last, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2069 			return B_NAME_TOO_LONG;
2070 
2071 		last[0] = '.';
2072 		last[1] = '\0';
2073 	}
2074 	return B_OK;
2075 }
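
/*	Editor's note: some worked examples of the transformation above:

		path "/a/b/c"  ->  path "/a/b/.", filename "c"
		path "foo"     ->  path ".",      filename "foo"
		path "/a/b//"  ->  path "/a/.",   filename "b"
		path "/"       ->  path "/",      filename "."
*/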
2076 
2077 
2078 static status_t
2079 entry_ref_to_vnode(dev_t mountID, ino_t directoryID, const char* name,
2080 	bool traverse, bool kernel, struct vnode** _vnode)
2081 {
2082 	char clonedName[B_FILE_NAME_LENGTH + 1];
2083 	if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2084 		return B_NAME_TOO_LONG;
2085 
2086 	// get the directory vnode and let vnode_path_to_vnode() do the rest
2087 	struct vnode* directory;
2088 
2089 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
2090 	if (status < 0)
2091 		return status;
2092 
2093 	return vnode_path_to_vnode(directory, clonedName, traverse, 0, kernel,
2094 		_vnode, NULL);
2095 }
2096 
2097 
2098 /*!	Looks up the entry with name \a name in the directory represented by \a dir
2099 	and returns the respective vnode.
2100 	On success a reference to the vnode is acquired for the caller.
2101 */
2102 static status_t
2103 lookup_dir_entry(struct vnode* dir, const char* name, struct vnode** _vnode)
2104 {
2105 	ino_t id;
2106 	bool missing;
2107 
2108 	if (dir->mount->entry_cache.Lookup(dir->id, name, id, missing)) {
2109 		return missing ? B_ENTRY_NOT_FOUND
2110 			: get_vnode(dir->device, id, _vnode, true, false);
2111 	}
2112 
2113 	status_t status = FS_CALL(dir, lookup, name, &id);
2114 	if (status != B_OK)
2115 		return status;
2116 
2117 	// The lookup() hook calls get_vnode() or publish_vnode(), so we do already
2118 	// have a reference and just need to look the node up.
2119 	rw_lock_read_lock(&sVnodeLock);
2120 	*_vnode = lookup_vnode(dir->device, id);
2121 	rw_lock_read_unlock(&sVnodeLock);
2122 
2123 	if (*_vnode == NULL) {
2124 		panic("lookup_dir_entry(): could not lookup vnode (mountid 0x%" B_PRIx32
2125 			" vnid 0x%" B_PRIx64 ")\n", dir->device, id);
2126 		return B_ENTRY_NOT_FOUND;
2127 	}
2128 
2129 //	ktrace_printf("lookup_dir_entry(): dir: %p (%ld, %lld), name: \"%s\" -> "
2130 //		"%p (%ld, %lld)", dir, dir->mount->id, dir->id, name, *_vnode,
2131 //		(*_vnode)->mount->id, (*_vnode)->id);
2132 
2133 	return B_OK;
2134 }
2135 
2136 
2137 /*!	Returns the vnode for the relative path starting at the specified \a vnode.
2138 	\a path must not be NULL.
2139 	If it returns successfully, \a path contains the name of the last path
2140 	component. This function clobbers the buffer pointed to by \a path only
2141 	if it contains more than one component.
2142 	Note that this reduces the ref_count of the starting \a vnode, whether
2143 	or not it is successful!
2144 */
2145 static status_t
2146 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2147 	int count, struct io_context* ioContext, struct vnode** _vnode,
2148 	ino_t* _parentID)
2149 {
2150 	status_t status = B_OK;
2151 	ino_t lastParentID = vnode->id;
2152 
2153 	FUNCTION(("vnode_path_to_vnode(vnode = %p, path = %s)\n", vnode, path));
2154 
2155 	if (path == NULL) {
2156 		put_vnode(vnode);
2157 		return B_BAD_VALUE;
2158 	}
2159 
2160 	if (*path == '\0') {
2161 		put_vnode(vnode);
2162 		return B_ENTRY_NOT_FOUND;
2163 	}
2164 
2165 	while (true) {
2166 		struct vnode* nextVnode;
2167 		char* nextPath;
2168 
2169 		TRACE(("vnode_path_to_vnode: top of loop. p = %p, p = '%s'\n", path,
2170 			path));
2171 
2172 		// done?
2173 		if (path[0] == '\0')
2174 			break;
2175 
2176 		// walk to find the next path component ("path" will point to a single
2177 		// path component), and filter out multiple slashes
2178 		for (nextPath = path + 1; *nextPath != '\0' && *nextPath != '/';
2179 				nextPath++);
2180 
2181 		if (*nextPath == '/') {
2182 			*nextPath = '\0';
2183 			do
2184 				nextPath++;
2185 			while (*nextPath == '/');
2186 		}
2187 
2188 		// If the '..' is at a covering vnode, move to the covered
2189 		// vnode so we pass the '..' path to the underlying file system.
2190 		// Also prevent escaping the root of the IO context.
2191 		if (strcmp("..", path) == 0) {
2192 			if (vnode == ioContext->root) {
2193 				// Attempted prison break! Keep it contained.
2194 				path = nextPath;
2195 				continue;
2196 			}
2197 
2198 			if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2199 				nextVnode = coveredVnode;
2200 				put_vnode(vnode);
2201 				vnode = nextVnode;
2202 			}
2203 		}
2204 
2205 		// check if vnode is really a directory
2206 		if (status == B_OK && !S_ISDIR(vnode->Type()))
2207 			status = B_NOT_A_DIRECTORY;
2208 
2209 		// Check if we have the right to search the current directory vnode.
2210 		// If a file system doesn't have the access() function, we assume that
2211 		// searching a directory is always allowed
2212 		if (status == B_OK && HAS_FS_CALL(vnode, access))
2213 			status = FS_CALL(vnode, access, X_OK);
2214 
2215 		// Tell the filesystem to get the vnode of this path component (if we
2216 		// got the permission from the call above)
2217 		if (status == B_OK)
2218 			status = lookup_dir_entry(vnode, path, &nextVnode);
2219 
2220 		if (status != B_OK) {
2221 			put_vnode(vnode);
2222 			return status;
2223 		}
2224 
2225 		// If the new node is a symbolic link, resolve it (if we've been told
2226 		// to do so)
2227 		if (S_ISLNK(nextVnode->Type())
2228 			&& (traverseLeafLink || nextPath[0] != '\0')) {
2229 			size_t bufferSize;
2230 			char* buffer;
2231 
2232 			TRACE(("traverse link\n"));
2233 
2234 			// it's not exactly nice style using goto in this way, but hey,
2235 			// it works :-/
2236 			if (count + 1 > B_MAX_SYMLINKS) {
2237 				status = B_LINK_LIMIT;
2238 				goto resolve_link_error;
2239 			}
2240 
2241 			bufferSize = B_PATH_NAME_LENGTH;
2242 			buffer = (char*)object_cache_alloc(sPathNameCache, 0);
2243 			if (buffer == NULL) {
2244 				status = B_NO_MEMORY;
2245 				goto resolve_link_error;
2246 			}
2247 
2248 			if (HAS_FS_CALL(nextVnode, read_symlink)) {
2249 				bufferSize--;
2250 				status = FS_CALL(nextVnode, read_symlink, buffer, &bufferSize);
2251 				// null-terminate
2252 				if (status >= 0)
2253 					buffer[bufferSize] = '\0';
2254 			} else
2255 				status = B_BAD_VALUE;
2256 
2257 			if (status != B_OK) {
2258 				free(buffer);
2259 
2260 		resolve_link_error:
2261 				put_vnode(vnode);
2262 				put_vnode(nextVnode);
2263 
2264 				return status;
2265 			}
2266 			put_vnode(nextVnode);
2267 
2268 			// Check if we start from the root directory or the current
2269 			// directory ("vnode" still points to that one).
2270 			// Cut off all leading slashes if it's the root directory
2271 			path = buffer;
2272 			bool absoluteSymlink = false;
2273 			if (path[0] == '/') {
2274 				// we don't need the old directory anymore
2275 				put_vnode(vnode);
2276 
2277 				while (*++path == '/')
2278 					;
2279 
2280 				mutex_lock(&sIOContextRootLock);
2281 				vnode = ioContext->root;
2282 				inc_vnode_ref_count(vnode);
2283 				mutex_unlock(&sIOContextRootLock);
2284 
2285 				absoluteSymlink = true;
2286 			}
2287 
2288 			inc_vnode_ref_count(vnode);
2289 				// balance the next recursion - we will decrement the
2290 				// ref_count of the vnode, whether we succeed or not
2291 
2292 			if (absoluteSymlink && *path == '\0') {
2293 				// symlink was just "/"
2294 				nextVnode = vnode;
2295 			} else {
2296 				status = vnode_path_to_vnode(vnode, path, true, count + 1,
2297 					ioContext, &nextVnode, &lastParentID);
2298 			}
2299 
2300 			object_cache_free(sPathNameCache, buffer, 0);
2301 
2302 			if (status != B_OK) {
2303 				put_vnode(vnode);
2304 				return status;
2305 			}
2306 		} else
2307 			lastParentID = vnode->id;
2308 
2309 		// decrease the ref count on the old dir we just looked the entry up in
2310 		put_vnode(vnode);
2311 
2312 		path = nextPath;
2313 		vnode = nextVnode;
2314 
2315 		// see if we hit a covered node
2316 		if (Vnode* coveringNode = get_covering_vnode(vnode)) {
2317 			put_vnode(vnode);
2318 			vnode = coveringNode;
2319 		}
2320 	}
2321 
2322 	*_vnode = vnode;
2323 	if (_parentID)
2324 		*_parentID = lastParentID;
2325 
2326 	return B_OK;
2327 }
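
/*	Editor's note: a minimal sketch of the calling convention above,
	assuming "dir" is a directory vnode the caller holds a reference to and
	"kernel" reflects the calling context. The path buffer must be mutable,
	and the reference to the starting vnode is always consumed:

		char path[] = "usr/local/lib";	// may be clobbered
		struct vnode* result;
		inc_vnode_ref_count(dir);		// the call consumes one reference
		status_t status = vnode_path_to_vnode(dir, path, true, 0, kernel,
			&result, NULL);
		if (status == B_OK)
			put_vnode(result);			// release the returned reference
*/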
2328 
2329 
2330 static status_t
2331 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2332 	int count, bool kernel, struct vnode** _vnode, ino_t* _parentID)
2333 {
2334 	return vnode_path_to_vnode(vnode, path, traverseLeafLink, count,
2335 		get_current_io_context(kernel), _vnode, _parentID);
2336 }
2337 
2338 
2339 static status_t
2340 path_to_vnode(char* path, bool traverseLink, struct vnode** _vnode,
2341 	ino_t* _parentID, bool kernel)
2342 {
2343 	struct vnode* start = NULL;
2344 
2345 	FUNCTION(("path_to_vnode(path = \"%s\")\n", path));
2346 
2347 	if (!path)
2348 		return B_BAD_VALUE;
2349 
2350 	if (*path == '\0')
2351 		return B_ENTRY_NOT_FOUND;
2352 
2353 	// figure out if we need to start at root or at cwd
2354 	if (*path == '/') {
2355 		if (sRoot == NULL) {
2356 			// we're a bit early, aren't we?
2357 			return B_ERROR;
2358 		}
2359 
2360 		while (*++path == '/')
2361 			;
2362 		start = get_root_vnode(kernel);
2363 
2364 		if (*path == '\0') {
2365 			*_vnode = start;
2366 			return B_OK;
2367 		}
2368 
2369 	} else {
2370 		struct io_context* context = get_current_io_context(kernel);
2371 
2372 		mutex_lock(&context->io_mutex);
2373 		start = context->cwd;
2374 		if (start != NULL)
2375 			inc_vnode_ref_count(start);
2376 		mutex_unlock(&context->io_mutex);
2377 
2378 		if (start == NULL)
2379 			return B_ERROR;
2380 	}
2381 
2382 	return vnode_path_to_vnode(start, path, traverseLink, 0, kernel, _vnode,
2383 		_parentID);
2384 }
2385 
2386 
2387 /*! Returns the vnode of the next-to-last segment of the path, and returns
2388 	the last portion in \a filename.
2389 	The path buffer must be able to store at least one additional character.
2390 */
2391 static status_t
2392 path_to_dir_vnode(char* path, struct vnode** _vnode, char* filename,
2393 	bool kernel)
2394 {
2395 	status_t status = get_dir_path_and_leaf(path, filename);
2396 	if (status != B_OK)
2397 		return status;
2398 
2399 	return path_to_vnode(path, true, _vnode, NULL, kernel);
2400 }
2401 
2402 
2403 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2404 		   to by a FD + path pair.
2405 
2406 	\a path must be given in either case. \a fd might be omitted, in which
2407 	case \a path is either an absolute path or one relative to the current
2408 	directory. If both are supplied and \a path is relative, it is reckoned
2409 	off of the directory referred to by \a fd. If \a path is absolute, \a fd
2410 	is ignored.
2411 
2412 	The caller has the responsibility to call put_vnode() on the returned
2413 	directory vnode.
2414 
2415 	\param fd The FD. May be < 0.
2416 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2417 	       is modified by this function. It must have at least room for a
2418 	       string one character longer than the path it contains.
2419 	\param _vnode A pointer to a variable the directory vnode shall be written
2420 		   into.
2421 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2422 		   the leaf name of the specified entry will be written.
2423 	\param kernel \c true, if invoked from inside the kernel, \c false if
2424 		   invoked from userland.
2425 	\return \c B_OK, if everything went fine, another error code otherwise.
2426 */
2427 static status_t
2428 fd_and_path_to_dir_vnode(int fd, char* path, struct vnode** _vnode,
2429 	char* filename, bool kernel)
2430 {
2431 	if (!path)
2432 		return B_BAD_VALUE;
2433 	if (*path == '\0')
2434 		return B_ENTRY_NOT_FOUND;
2435 	if (fd < 0)
2436 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2437 
2438 	status_t status = get_dir_path_and_leaf(path, filename);
2439 	if (status != B_OK)
2440 		return status;
2441 
2442 	return fd_and_path_to_vnode(fd, path, true, _vnode, NULL, kernel);
2443 }
2444 
2445 
2446 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2447 		   to by a vnode + path pair.
2448 
2449 	\a path must be given in either case. \a vnode might be omitted, in which
2450 	case \a path is either an absolute path or one relative to the current
2451 	directory. If both are supplied and \a path is relative, it is reckoned
2452 	off of the directory referred to by \a vnode. If \a path is absolute,
2453 	\a vnode is ignored.
2454 
2455 	The caller has the responsibility to call put_vnode() on the returned
2456 	directory vnode.
2457 
2458 	\param vnode The vnode. May be \c NULL.
2459 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2460 	       is modified by this function. It must have at least room for a
2461 	       string one character longer than the path it contains.
2462 	\param _vnode A pointer to a variable the directory vnode shall be written
2463 		   into.
2464 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2465 		   the leaf name of the specified entry will be written.
2466 	\param kernel \c true, if invoked from inside the kernel, \c false if
2467 		   invoked from userland.
2468 	\return \c B_OK, if everything went fine, another error code otherwise.
2469 */
2470 static status_t
2471 vnode_and_path_to_dir_vnode(struct vnode* vnode, char* path,
2472 	struct vnode** _vnode, char* filename, bool kernel)
2473 {
2474 	if (!path)
2475 		return B_BAD_VALUE;
2476 	if (*path == '\0')
2477 		return B_ENTRY_NOT_FOUND;
2478 	if (vnode == NULL || path[0] == '/')
2479 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2480 
2481 	status_t status = get_dir_path_and_leaf(path, filename);
2482 	if (status != B_OK)
2483 		return status;
2484 
2485 	inc_vnode_ref_count(vnode);
2486 		// vnode_path_to_vnode() always decrements the ref count
2487 
2488 	return vnode_path_to_vnode(vnode, path, true, 0, kernel, _vnode, NULL);
2489 }
2490 
2491 
2492 /*! Returns a vnode's name in the d_name field of a supplied dirent buffer.
2493 */
2494 static status_t
2495 get_vnode_name(struct vnode* vnode, struct vnode* parent, struct dirent* buffer,
2496 	size_t bufferSize, struct io_context* ioContext)
2497 {
2498 	if (bufferSize < sizeof(struct dirent))
2499 		return B_BAD_VALUE;
2500 
2501 	// See if the vnode is covering another vnode and move to the covered
2502 	// vnode so we get the underlying file system
2503 	VNodePutter vnodePutter;
2504 	if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2505 		vnode = coveredVnode;
2506 		vnodePutter.SetTo(vnode);
2507 	}
2508 
2509 	if (HAS_FS_CALL(vnode, get_vnode_name)) {
2510 		// The FS supports getting the name of a vnode.
2511 		if (FS_CALL(vnode, get_vnode_name, buffer->d_name,
2512 			(char*)buffer + bufferSize - buffer->d_name) == B_OK)
2513 			return B_OK;
2514 	}
2515 
2516 	// The FS doesn't support getting the name of a vnode. So we search the
2517 	// parent directory for the vnode, if the caller lets us.
2518 
2519 	if (parent == NULL || !HAS_FS_CALL(parent, read_dir))
2520 		return B_UNSUPPORTED;
2521 
2522 	void* cookie;
2523 
2524 	status_t status = FS_CALL(parent, open_dir, &cookie);
2525 	if (status >= B_OK) {
2526 		while (true) {
2527 			uint32 num = 1;
2528 			// We use the FS hook directly instead of dir_read(), since we don't
2529 			// want the entries to be fixed. We have already resolved vnode to
2530 			// the covered node.
2531 			status = FS_CALL(parent, read_dir, cookie, buffer, bufferSize,
2532 				&num);
2533 			if (status != B_OK)
2534 				break;
2535 			if (num == 0) {
2536 				status = B_ENTRY_NOT_FOUND;
2537 				break;
2538 			}
2539 
2540 			if (vnode->id == buffer->d_ino) {
2541 				// found correct entry!
2542 				break;
2543 			}
2544 		}
2545 
2546 		FS_CALL(parent, close_dir, cookie);
2547 		FS_CALL(parent, free_dir_cookie, cookie);
2548 	}
2549 	return status;
2550 }
2551 
2552 
2553 static status_t
2554 get_vnode_name(struct vnode* vnode, struct vnode* parent, char* name,
2555 	size_t nameSize, bool kernel)
2556 {
2557 	char buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2558 	struct dirent* dirent = (struct dirent*)buffer;
2559 
2560 	status_t status = get_vnode_name(vnode, parent, dirent, sizeof(buffer),
2561 		get_current_io_context(kernel));
2562 	if (status != B_OK)
2563 		return status;
2564 
2565 	if (strlcpy(name, dirent->d_name, nameSize) >= nameSize)
2566 		return B_BUFFER_OVERFLOW;
2567 
2568 	return B_OK;
2569 }
2570 
2571 
2572 /*!	Gets the full path to a given directory vnode.
2573 	It uses the fs_get_vnode_name() call to get the name of a vnode; if a
2574 	file system doesn't support this call, it will fall back to iterating
2575 	through the parent directory to get the name of the child.
2576 
2577 	To protect against circular loops, it supports a maximum tree depth
2578 	of 256 levels.
2579 
2580 	Note that the path may no longer be correct by the time this function
2581 	returns! It doesn't use any locking to ensure the returned path is
2582 	correct, as paths aren't safe anyway: they can change at any time.
2583 
2584 	It might be a good idea, though, for the calling function to check
2585 	whether the returned path exists (not done here, for efficiency).
2586 */
2587 static status_t
2588 dir_vnode_to_path(struct vnode* vnode, char* buffer, size_t bufferSize,
2589 	bool kernel)
2590 {
2591 	FUNCTION(("dir_vnode_to_path(%p, %p, %lu)\n", vnode, buffer, bufferSize));
2592 
2593 	if (vnode == NULL || buffer == NULL || bufferSize == 0)
2594 		return B_BAD_VALUE;
2595 
2596 	if (!S_ISDIR(vnode->Type()))
2597 		return B_NOT_A_DIRECTORY;
2598 
2599 	char* path = buffer;
2600 	int32 insert = bufferSize;
2601 	int32 maxLevel = 256;
2602 	int32 length;
2603 	status_t status = B_OK;
2604 	struct io_context* ioContext = get_current_io_context(kernel);
2605 
2606 	// we don't use get_vnode() here because this call is more
2607 	// efficient and does all we need from get_vnode()
2608 	inc_vnode_ref_count(vnode);
2609 
2610 	path[--insert] = '\0';
2611 		// the path is filled right to left
2612 
2613 	while (true) {
2614 		// If the node is the context's root, bail out. Otherwise resolve mount
2615 		// points.
2616 		if (vnode == ioContext->root)
2617 			break;
2618 
2619 		if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2620 			put_vnode(vnode);
2621 			vnode = coveredVnode;
2622 		}
2623 
2624 		// lookup the parent vnode
2625 		struct vnode* parentVnode;
2626 		status = lookup_dir_entry(vnode, "..", &parentVnode);
2627 		if (status != B_OK)
2628 			goto out;
2629 
2630 		if (parentVnode == vnode) {
2631 			// The caller apparently got their hands on a node outside of their
2632 			// context's root. Now we've hit the global root.
2633 			put_vnode(parentVnode);
2634 			break;
2635 		}
2636 
2637 		// get the node's name
2638 		char nameBuffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2639 			// also used for fs_read_dir()
2640 		char* name = &((struct dirent*)nameBuffer)->d_name[0];
2641 		status = get_vnode_name(vnode, parentVnode, (struct dirent*)nameBuffer,
2642 			sizeof(nameBuffer), ioContext);
2643 
2644 		// release the current vnode, we only need its parent from now on
2645 		put_vnode(vnode);
2646 		vnode = parentVnode;
2647 
2648 		if (status != B_OK)
2649 			goto out;
2650 
2651 		// TODO: add an explicit check for loops in about 10 levels to do
2652 		// real loop detection
2653 
2654 		// don't go deeper than 'maxLevel' to prevent circular loops
2655 		if (maxLevel-- < 0) {
2656 			status = B_LINK_LIMIT;
2657 			goto out;
2658 		}
2659 
2660 		// add the name in front of the current path
2661 		name[B_FILE_NAME_LENGTH - 1] = '\0';
2662 		length = strlen(name);
2663 		insert -= length;
2664 		if (insert <= 0) {
2665 			status = B_RESULT_NOT_REPRESENTABLE;
2666 			goto out;
2667 		}
2668 		memcpy(path + insert, name, length);
2669 		path[--insert] = '/';
2670 	}
2671 
2672 	// the root dir will result in an empty path: fix it
2673 	if (path[insert] == '\0')
2674 		path[--insert] = '/';
2675 
2676 	TRACE(("  path is: %s\n", path + insert));
2677 
2678 	// move the path to the start of the buffer
2679 	length = bufferSize - insert;
2680 	memmove(buffer, path + insert, length);
2681 
2682 out:
2683 	put_vnode(vnode);
2684 	return status;
2685 }
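
/*	Editor's note: an illustration of the right-to-left construction above.
	For a vnode at (say) "/boot/home", the tail of the buffer evolves as

		"home\0"  ->  "/home\0"  ->  "boot/home\0"  ->  "/boot/home\0"

	with "insert" marking the left edge; the result is finally memmove()d
	to the start of the buffer.
*/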
2686 
2687 
2688 /*!	Checks the length of every path component, and adds a '.'
2689 	if the path ends in a slash.
2690 	The given path buffer must be able to store at least one
2691 	additional character.
2692 */
2693 static status_t
2694 check_path(char* to)
2695 {
2696 	int32 length = 0;
2697 
2698 	// check length of every path component
2699 
2700 	while (*to) {
2701 		char* begin;
2702 		if (*to == '/')
2703 			to++, length++;
2704 
2705 		begin = to;
2706 		while (*to != '/' && *to)
2707 			to++, length++;
2708 
2709 		if (to - begin > B_FILE_NAME_LENGTH)
2710 			return B_NAME_TOO_LONG;
2711 	}
2712 
2713 	if (length == 0)
2714 		return B_ENTRY_NOT_FOUND;
2715 
2716 	// complete path if there is a slash at the end
2717 
2718 	if (*(to - 1) == '/') {
2719 		if (length > B_PATH_NAME_LENGTH - 2)
2720 			return B_NAME_TOO_LONG;
2721 
2722 		to[0] = '.';
2723 		to[1] = '\0';
2724 	}
2725 
2726 	return B_OK;
2727 }
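
/*	Editor's note: examples of what check_path() accepts and produces:

		"foo/bar"   ->  unchanged, B_OK
		"foo/bar/"  ->  "foo/bar/.", B_OK
		""          ->  B_ENTRY_NOT_FOUND
		a component longer than B_FILE_NAME_LENGTH  ->  B_NAME_TOO_LONG
*/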
2728 
2729 
2730 static struct file_descriptor*
2731 get_fd_and_vnode(int fd, struct vnode** _vnode, bool kernel)
2732 {
2733 	struct file_descriptor* descriptor
2734 		= get_fd(get_current_io_context(kernel), fd);
2735 	if (descriptor == NULL)
2736 		return NULL;
2737 
2738 	struct vnode* vnode = fd_vnode(descriptor);
2739 	if (vnode == NULL) {
2740 		put_fd(descriptor);
2741 		return NULL;
2742 	}
2743 
2744 	// ToDo: when we can close a file descriptor at any point, investigate
2745 	//	if this is still valid to do (accessing the vnode without ref_count
2746 	//	or locking)
2747 	*_vnode = vnode;
2748 	return descriptor;
2749 }
2750 
2751 
2752 static struct vnode*
2753 get_vnode_from_fd(int fd, bool kernel)
2754 {
2755 	struct file_descriptor* descriptor;
2756 	struct vnode* vnode;
2757 
2758 	descriptor = get_fd(get_current_io_context(kernel), fd);
2759 	if (descriptor == NULL)
2760 		return NULL;
2761 
2762 	vnode = fd_vnode(descriptor);
2763 	if (vnode != NULL)
2764 		inc_vnode_ref_count(vnode);
2765 
2766 	put_fd(descriptor);
2767 	return vnode;
2768 }
2769 
2770 
2771 /*!	Gets the vnode from an FD + path combination. If \a fd is lower than zero,
2772 	only the path will be considered. In this case, \a path must not be
2773 	NULL.
2774 	If \a fd is a valid file descriptor, \a path may be NULL for directories,
2775 	and should be NULL for files.
2776 */
2777 static status_t
2778 fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
2779 	struct vnode** _vnode, ino_t* _parentID, bool kernel)
2780 {
2781 	if (fd < 0 && !path)
2782 		return B_BAD_VALUE;
2783 
2784 	if (path != NULL && *path == '\0')
2785 		return B_ENTRY_NOT_FOUND;
2786 
2787 	if (fd < 0 || (path != NULL && path[0] == '/')) {
2788 		// no FD or absolute path
2789 		return path_to_vnode(path, traverseLeafLink, _vnode, _parentID, kernel);
2790 	}
2791 
2792 	// FD only, or FD + relative path
2793 	struct vnode* vnode = get_vnode_from_fd(fd, kernel);
2794 	if (vnode == NULL)
2795 		return B_FILE_ERROR;
2796 
2797 	if (path != NULL) {
2798 		return vnode_path_to_vnode(vnode, path, traverseLeafLink, 0, kernel,
2799 			_vnode, _parentID);
2800 	}
2801 
2802 	// there is no relative path to take into account
2803 
2804 	*_vnode = vnode;
2805 	if (_parentID)
2806 		*_parentID = -1;
2807 
2808 	return B_OK;
2809 }
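
/*	Editor's note: the three dispatch cases above, spelled out (the paths
	and the descriptor are hypothetical; any mutable buffer would do):

		fd_and_path_to_vnode(-1, path, ...)	// absolute or CWD-relative path
		fd_and_path_to_vnode(fd, path, ...)	// relative to the FD's vnode,
											// unless path starts with '/'
		fd_and_path_to_vnode(fd, NULL, ...)	// the FD's own vnode
*/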
2810 
2811 
2812 static int
2813 get_new_fd(int type, struct fs_mount* mount, struct vnode* vnode,
2814 	void* cookie, int openMode, bool kernel)
2815 {
2816 	struct file_descriptor* descriptor;
2817 	int fd;
2818 
2819 	// If the vnode is locked, we don't allow creating a new file/directory
2820 	// file_descriptor for it
2821 	if (vnode && vnode->mandatory_locked_by != NULL
2822 		&& (type == FDTYPE_FILE || type == FDTYPE_DIR))
2823 		return B_BUSY;
2824 
2825 	descriptor = alloc_fd();
2826 	if (!descriptor)
2827 		return B_NO_MEMORY;
2828 
2829 	if (vnode)
2830 		descriptor->u.vnode = vnode;
2831 	else
2832 		descriptor->u.mount = mount;
2833 	descriptor->cookie = cookie;
2834 
2835 	switch (type) {
2836 		// vnode types
2837 		case FDTYPE_FILE:
2838 			descriptor->ops = &sFileOps;
2839 			break;
2840 		case FDTYPE_DIR:
2841 			descriptor->ops = &sDirectoryOps;
2842 			break;
2843 		case FDTYPE_ATTR:
2844 			descriptor->ops = &sAttributeOps;
2845 			break;
2846 		case FDTYPE_ATTR_DIR:
2847 			descriptor->ops = &sAttributeDirectoryOps;
2848 			break;
2849 
2850 		// mount types
2851 		case FDTYPE_INDEX_DIR:
2852 			descriptor->ops = &sIndexDirectoryOps;
2853 			break;
2854 		case FDTYPE_QUERY:
2855 			descriptor->ops = &sQueryOps;
2856 			break;
2857 
2858 		default:
2859 			panic("get_new_fd() called with unknown type %d\n", type);
2860 			break;
2861 	}
2862 	descriptor->type = type;
2863 	descriptor->open_mode = openMode;
2864 
2865 	io_context* context = get_current_io_context(kernel);
2866 	fd = new_fd(context, descriptor);
2867 	if (fd < 0) {
2868 		descriptor->ops = NULL;
2869 		put_fd(descriptor);
2870 		return B_NO_MORE_FDS;
2871 	}
2872 
2873 	mutex_lock(&context->io_mutex);
2874 	fd_set_close_on_exec(context, fd, (openMode & O_CLOEXEC) != 0);
2875 	mutex_unlock(&context->io_mutex);
2876 
2877 	return fd;
2878 }
2879 
2880 
2881 /*!	In-place normalizes \a path. It's otherwise semantically equivalent to
2882 	vfs_normalize_path(). See there for more documentation.
2883 */
2884 static status_t
2885 normalize_path(char* path, size_t pathSize, bool traverseLink, bool kernel)
2886 {
2887 	VNodePutter dirPutter;
2888 	struct vnode* dir = NULL;
2889 	status_t error;
2890 
2891 	for (int i = 0; i < B_MAX_SYMLINKS; i++) {
2892 		// get dir vnode + leaf name
2893 		struct vnode* nextDir;
2894 		char leaf[B_FILE_NAME_LENGTH];
2895 		error = vnode_and_path_to_dir_vnode(dir, path, &nextDir, leaf, kernel);
2896 		if (error != B_OK)
2897 			return error;
2898 
2899 		dir = nextDir;
2900 		strcpy(path, leaf);
2901 		dirPutter.SetTo(dir);
2902 
2903 		// get file vnode, if we shall resolve links
2904 		bool fileExists = false;
2905 		struct vnode* fileVnode;
2906 		VNodePutter fileVnodePutter;
2907 		if (traverseLink) {
2908 			inc_vnode_ref_count(dir);
2909 			if (vnode_path_to_vnode(dir, path, false, 0, kernel, &fileVnode,
2910 					NULL) == B_OK) {
2911 				fileVnodePutter.SetTo(fileVnode);
2912 				fileExists = true;
2913 			}
2914 		}
2915 
2916 		if (!fileExists || !traverseLink || !S_ISLNK(fileVnode->Type())) {
2917 			// we're done -- construct the path
2918 			bool hasLeaf = true;
2919 			if (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0) {
2920 				// special cases "." and ".." -- get the dir, forget the leaf
2921 				inc_vnode_ref_count(dir);
2922 				error = vnode_path_to_vnode(dir, leaf, false, 0, kernel,
2923 					&nextDir, NULL);
2924 				if (error != B_OK)
2925 					return error;
2926 				dir = nextDir;
2927 				dirPutter.SetTo(dir);
2928 				hasLeaf = false;
2929 			}
2930 
2931 			// get the directory path
2932 			error = dir_vnode_to_path(dir, path, B_PATH_NAME_LENGTH, kernel);
2933 			if (error != B_OK)
2934 				return error;
2935 
2936 			// append the leaf name
2937 			if (hasLeaf) {
2938 				// insert a directory separator if this is not the file system
2939 				// root
2940 				if ((strcmp(path, "/") != 0
2941 					&& strlcat(path, "/", pathSize) >= pathSize)
2942 					|| strlcat(path, leaf, pathSize) >= pathSize) {
2943 					return B_NAME_TOO_LONG;
2944 				}
2945 			}
2946 
2947 			return B_OK;
2948 		}
2949 
2950 		// read link
2951 		if (HAS_FS_CALL(fileVnode, read_symlink)) {
2952 			size_t bufferSize = B_PATH_NAME_LENGTH - 1;
2953 			error = FS_CALL(fileVnode, read_symlink, path, &bufferSize);
2954 			if (error != B_OK)
2955 				return error;
2956 			path[bufferSize] = '\0';
2957 		} else
2958 			return B_BAD_VALUE;
2959 	}
2960 
2961 	return B_LINK_LIMIT;
2962 }
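
/*	Editor's note: a sketch of the effect of the loop above. Assuming
	"/boot/home/Desktop" exists and "link" is a symlink to "..", then
	normalizing "/boot/home/Desktop/link/config" yields
	"/boot/home/config" - intermediate symlinks are always followed, while
	\a traverseLink only decides whether a symlink *leaf* is resolved.
	A chain of more than B_MAX_SYMLINKS leaf links fails with B_LINK_LIMIT.
*/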
2963 
2964 
2965 static status_t
2966 resolve_covered_parent(struct vnode* parent, dev_t* _device, ino_t* _node,
2967 	struct io_context* ioContext)
2968 {
2969 	// Make sure the IO context root is not bypassed.
2970 	if (parent == ioContext->root) {
2971 		*_device = parent->device;
2972 		*_node = parent->id;
2973 		return B_OK;
2974 	}
2975 
2976 	inc_vnode_ref_count(parent);
2977 		// vnode_path_to_vnode() puts the node
2978 
2979 	// ".." is guaranteed not to be clobbered by this call
2980 	struct vnode* vnode;
2981 	status_t status = vnode_path_to_vnode(parent, (char*)"..", false, 0,
2982 		ioContext, &vnode, NULL);
2983 	if (status == B_OK) {
2984 		*_device = vnode->device;
2985 		*_node = vnode->id;
2986 		put_vnode(vnode);
2987 	}
2988 
2989 	return status;
2990 }
2991 
2992 
2993 #ifdef ADD_DEBUGGER_COMMANDS
2994 
2995 
2996 static void
2997 _dump_advisory_locking(advisory_locking* locking)
2998 {
2999 	if (locking == NULL)
3000 		return;
3001 
3002 	kprintf("   lock:        %" B_PRId32, locking->lock);
3003 	kprintf("   wait_sem:    %" B_PRId32, locking->wait_sem);
3004 
3005 	int32 index = 0;
3006 	LockList::Iterator iterator = locking->locks.GetIterator();
3007 	while (iterator.HasNext()) {
3008 		struct advisory_lock* lock = iterator.Next();
3009 
3010 		kprintf("   [%2" B_PRId32 "] team:   %" B_PRId32 "\n", index++, lock->team);
3011 		kprintf("        start:  %" B_PRIdOFF "\n", lock->start);
3012 		kprintf("        end:    %" B_PRIdOFF "\n", lock->end);
3013 		kprintf("        shared? %s\n", lock->shared ? "yes" : "no");
3014 	}
3015 }
3016 
3017 
3018 static void
3019 _dump_mount(struct fs_mount* mount)
3020 {
3021 	kprintf("MOUNT: %p\n", mount);
3022 	kprintf(" id:            %" B_PRIdDEV "\n", mount->id);
3023 	kprintf(" device_name:   %s\n", mount->device_name);
3024 	kprintf(" root_vnode:    %p\n", mount->root_vnode);
3025 	kprintf(" covers:        %p\n", mount->root_vnode->covers);
3026 	kprintf(" partition:     %p\n", mount->partition);
3027 	kprintf(" lock:          %p\n", &mount->lock);
3028 	kprintf(" flags:        %s%s\n", mount->unmounting ? " unmounting" : "",
3029 		mount->owns_file_device ? " owns_file_device" : "");
3030 
3031 	fs_volume* volume = mount->volume;
3032 	while (volume != NULL) {
3033 		kprintf(" volume %p:\n", volume);
3034 		kprintf("  layer:            %" B_PRId32 "\n", volume->layer);
3035 		kprintf("  private_volume:   %p\n", volume->private_volume);
3036 		kprintf("  ops:              %p\n", volume->ops);
3037 		kprintf("  file_system:      %p\n", volume->file_system);
3038 		kprintf("  file_system_name: %s\n", volume->file_system_name);
3039 		volume = volume->super_volume;
3040 	}
3041 
3042 	set_debug_variable("_volume", (addr_t)mount->volume->private_volume);
3043 	set_debug_variable("_root", (addr_t)mount->root_vnode);
3044 	set_debug_variable("_covers", (addr_t)mount->root_vnode->covers);
3045 	set_debug_variable("_partition", (addr_t)mount->partition);
3046 }
3047 
3048 
3049 static bool
3050 debug_prepend_vnode_name_to_path(char* buffer, size_t& bufferSize,
3051 	const char* name)
3052 {
3053 	bool insertSlash = buffer[bufferSize] != '\0';
3054 	size_t nameLength = strlen(name);
3055 
3056 	if (bufferSize < nameLength + (insertSlash ? 1 : 0))
3057 		return false;
3058 
3059 	if (insertSlash)
3060 		buffer[--bufferSize] = '/';
3061 
3062 	bufferSize -= nameLength;
3063 	memcpy(buffer + bufferSize, name, nameLength);
3064 
3065 	return true;
3066 }
3067 
3068 
3069 static bool
3070 debug_prepend_vnode_id_to_path(char* buffer, size_t& bufferSize, dev_t devID,
3071 	ino_t nodeID)
3072 {
3073 	if (bufferSize == 0)
3074 		return false;
3075 
3076 	bool insertSlash = buffer[bufferSize] != '\0';
3077 	if (insertSlash)
3078 		buffer[--bufferSize] = '/';
3079 
3080 	size_t size = snprintf(buffer, bufferSize,
3081 		"<%" B_PRIdDEV ",%" B_PRIdINO ">", devID, nodeID);
3082 	if (size > bufferSize) {
3083 		if (insertSlash)
3084 			bufferSize++;
3085 		return false;
3086 	}
3087 
3088 	if (size < bufferSize)
3089 		memmove(buffer + bufferSize - size, buffer, size);
3090 
3091 	bufferSize -= size;
3092 	return true;
3093 }
3094 
3095 
3096 static char*
3097 debug_resolve_vnode_path(struct vnode* vnode, char* buffer, size_t bufferSize,
3098 	bool& _truncated)
3099 {
3100 	// null-terminate the path
3101 	buffer[--bufferSize] = '\0';
3102 
3103 	while (true) {
3104 		while (vnode->covers != NULL)
3105 			vnode = vnode->covers;
3106 
3107 		if (vnode == sRoot) {
3108 			_truncated = bufferSize == 0;
3109 			if (!_truncated)
3110 				buffer[--bufferSize] = '/';
3111 			return buffer + bufferSize;
3112 		}
3113 
3114 		// resolve the name
3115 		ino_t dirID;
3116 		const char* name = vnode->mount->entry_cache.DebugReverseLookup(
3117 			vnode->id, dirID);
3118 		if (name == NULL) {
3119 			// Failed to resolve the name -- prepend "<dev,node>/".
3120 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3121 				vnode->mount->id, vnode->id);
3122 			return buffer + bufferSize;
3123 		}
3124 
3125 		// prepend the name
3126 		if (!debug_prepend_vnode_name_to_path(buffer, bufferSize, name)) {
3127 			_truncated = true;
3128 			return buffer + bufferSize;
3129 		}
3130 
3131 		// resolve the directory node
3132 		struct vnode* nextVnode = lookup_vnode(vnode->mount->id, dirID);
3133 		if (nextVnode == NULL) {
3134 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3135 				vnode->mount->id, dirID);
3136 			return buffer + bufferSize;
3137 		}
3138 
3139 		vnode = nextVnode;
3140 	}
3141 }
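
/*	Editor's note: an example of the output format produced above. When
	every component is found in the entry cache, the result is an ordinary
	path such as "/boot/home"; when a name cannot be resolved, the node is
	spelled as "<dev,node>", e.g. "<2,45367>/cache" (values made up here).
	\a _truncated is set when the buffer was too small for the full path.
*/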
3142 
3143 
3144 static void
3145 _dump_vnode(struct vnode* vnode, bool printPath)
3146 {
3147 	kprintf("VNODE: %p\n", vnode);
3148 	kprintf(" device:        %" B_PRIdDEV "\n", vnode->device);
3149 	kprintf(" id:            %" B_PRIdINO "\n", vnode->id);
3150 	kprintf(" ref_count:     %" B_PRId32 "\n", vnode->ref_count);
3151 	kprintf(" private_node:  %p\n", vnode->private_node);
3152 	kprintf(" mount:         %p\n", vnode->mount);
3153 	kprintf(" covered_by:    %p\n", vnode->covered_by);
3154 	kprintf(" covers:        %p\n", vnode->covers);
3155 	kprintf(" cache:         %p\n", vnode->cache);
3156 	kprintf(" type:          %#" B_PRIx32 "\n", vnode->Type());
3157 	kprintf(" flags:         %s%s%s\n", vnode->IsRemoved() ? "r" : "-",
3158 		vnode->IsBusy() ? "b" : "-", vnode->IsUnpublished() ? "u" : "-");
3159 	kprintf(" advisory_lock: %p\n", vnode->advisory_locking);
3160 
3161 	_dump_advisory_locking(vnode->advisory_locking);
3162 
3163 	if (printPath) {
3164 		void* buffer = debug_malloc(B_PATH_NAME_LENGTH);
3165 		if (buffer != NULL) {
3166 			bool truncated;
3167 			char* path = debug_resolve_vnode_path(vnode, (char*)buffer,
3168 				B_PATH_NAME_LENGTH, truncated);
3169 			if (path != NULL) {
3170 				kprintf(" path:          ");
3171 				if (truncated)
3172 					kputs("<truncated>/");
3173 				kputs(path);
3174 				kputs("\n");
3175 			} else
3176 				kprintf("Failed to resolve vnode path.\n");
3177 
3178 			debug_free(buffer);
3179 		} else
3180 			kprintf("Failed to allocate memory for constructing the path.\n");
3181 	}
3182 
3183 	set_debug_variable("_node", (addr_t)vnode->private_node);
3184 	set_debug_variable("_mount", (addr_t)vnode->mount);
3185 	set_debug_variable("_covered_by", (addr_t)vnode->covered_by);
3186 	set_debug_variable("_covers", (addr_t)vnode->covers);
3187 	set_debug_variable("_adv_lock", (addr_t)vnode->advisory_locking);
3188 }
3189 
3190 
3191 static int
3192 dump_mount(int argc, char** argv)
3193 {
3194 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3195 		kprintf("usage: %s [id|address]\n", argv[0]);
3196 		return 0;
3197 	}
3198 
3199 	ulong val = parse_expression(argv[1]);
3200 	uint32 id = val;
3201 
3202 	struct fs_mount* mount = sMountsTable->Lookup(id);
3203 	if (mount == NULL) {
3204 		if (IS_USER_ADDRESS(id)) {
3205 			kprintf("fs_mount not found\n");
3206 			return 0;
3207 		}
3208 		mount = (fs_mount*)val;
3209 	}
3210 
3211 	_dump_mount(mount);
3212 	return 0;
3213 }
3214 
3215 
3216 static int
3217 dump_mounts(int argc, char** argv)
3218 {
3219 	if (argc != 1) {
3220 		kprintf("usage: %s\n", argv[0]);
3221 		return 0;
3222 	}
3223 
3224 	kprintf("%-*s    id %-*s   %-*s   %-*s   fs_name\n",
3225 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "root",
3226 		B_PRINTF_POINTER_WIDTH, "covers", B_PRINTF_POINTER_WIDTH, "cookie");
3227 
3228 	struct fs_mount* mount;
3229 
3230 	MountTable::Iterator iterator(sMountsTable);
3231 	while (iterator.HasNext()) {
3232 		mount = iterator.Next();
3233 		kprintf("%p%4" B_PRIdDEV " %p %p %p %s\n", mount, mount->id, mount->root_vnode,
3234 			mount->root_vnode->covers, mount->volume->private_volume,
3235 			mount->volume->file_system_name);
3236 
3237 		fs_volume* volume = mount->volume;
3238 		while (volume->super_volume != NULL) {
3239 			volume = volume->super_volume;
3240 			kprintf("                                     %p %s\n",
3241 				volume->private_volume, volume->file_system_name);
3242 		}
3243 	}
3244 
3245 	return 0;
3246 }
3247 
3248 
3249 static int
3250 dump_vnode(int argc, char** argv)
3251 {
3252 	bool printPath = false;
3253 	int argi = 1;
3254 	if (argc >= 2 && strcmp(argv[argi], "-p") == 0) {
3255 		printPath = true;
3256 		argi++;
3257 	}
3258 
3259 	if (argi >= argc || argi + 2 < argc) {
3260 		print_debugger_command_usage(argv[0]);
3261 		return 0;
3262 	}
3263 
3264 	struct vnode* vnode = NULL;
3265 
3266 	if (argi + 1 == argc) {
3267 		vnode = (struct vnode*)parse_expression(argv[argi]);
3268 		if (IS_USER_ADDRESS(vnode)) {
3269 			kprintf("invalid vnode address\n");
3270 			return 0;
3271 		}
3272 		_dump_vnode(vnode, printPath);
3273 		return 0;
3274 	}
3275 
3276 	dev_t device = parse_expression(argv[argi]);
3277 	ino_t id = parse_expression(argv[argi + 1]);
3278 
3279 	VnodeTable::Iterator iterator(sVnodeTable);
3280 	while (iterator.HasNext()) {
3281 		vnode = iterator.Next();
3282 		if (vnode->id != id || vnode->device != device)
3283 			continue;
3284 
3285 		_dump_vnode(vnode, printPath);
3286 	}
3287 
3288 	return 0;
3289 }
3290 
3291 
3292 static int
3293 dump_vnodes(int argc, char** argv)
3294 {
3295 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3296 		kprintf("usage: %s [device]\n", argv[0]);
3297 		return 0;
3298 	}
3299 
3300 	// restrict dumped nodes to a certain device if requested
3301 	dev_t device = parse_expression(argv[1]);
3302 
3303 	struct vnode* vnode;
3304 
3305 	kprintf("%-*s   dev     inode  ref %-*s   %-*s   %-*s   flags\n",
3306 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache",
3307 		B_PRINTF_POINTER_WIDTH, "fs-node", B_PRINTF_POINTER_WIDTH, "locking");
3308 
3309 	VnodeTable::Iterator iterator(sVnodeTable);
3310 	while (iterator.HasNext()) {
3311 		vnode = iterator.Next();
3312 		if (vnode->device != device)
3313 			continue;
3314 
3315 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO "%5" B_PRId32 " %p %p %p %s%s%s\n",
3316 			vnode, vnode->device, vnode->id, vnode->ref_count, vnode->cache,
3317 			vnode->private_node, vnode->advisory_locking,
3318 			vnode->IsRemoved() ? "r" : "-", vnode->IsBusy() ? "b" : "-",
3319 			vnode->IsUnpublished() ? "u" : "-");
3320 	}
3321 
3322 	return 0;
3323 }
3324 
3325 
3326 static int
3327 dump_vnode_caches(int argc, char** argv)
3328 {
3329 	struct vnode* vnode;
3330 
3331 	if (argc > 2 || (argc == 2 && !strcmp(argv[1], "--help"))) {
3332 		kprintf("usage: %s [device]\n", argv[0]);
3333 		return 0;
3334 	}
3335 
3336 	// restrict dumped nodes to a certain device if requested
3337 	dev_t device = -1;
3338 	if (argc > 1)
3339 		device = parse_expression(argv[1]);
3340 
3341 	kprintf("%-*s   dev     inode %-*s       size   pages\n",
3342 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache");
3343 
3344 	VnodeTable::Iterator iterator(sVnodeTable);
3345 	while (iterator.HasNext()) {
3346 		vnode = iterator.Next();
3347 		if (vnode->cache == NULL)
3348 			continue;
3349 		if (device != -1 && vnode->device != device)
3350 			continue;
3351 
3352 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO " %p %8" B_PRIdOFF "%8" B_PRId32 "\n",
3353 			vnode, vnode->device, vnode->id, vnode->cache,
3354 			(vnode->cache->virtual_end + B_PAGE_SIZE - 1) / B_PAGE_SIZE,
3355 			vnode->cache->page_count);
3356 	}
3357 
3358 	return 0;
3359 }
3360 
3361 
3362 int
3363 dump_io_context(int argc, char** argv)
3364 {
3365 	if (argc > 2 || (argc == 2 && !strcmp(argv[1], "--help"))) {
3366 		kprintf("usage: %s [team-id|address]\n", argv[0]);
3367 		return 0;
3368 	}
3369 
3370 	struct io_context* context = NULL;
3371 
3372 	if (argc > 1) {
3373 		ulong num = parse_expression(argv[1]);
3374 		if (IS_KERNEL_ADDRESS(num))
3375 			context = (struct io_context*)num;
3376 		else {
3377 			Team* team = team_get_team_struct_locked(num);
3378 			if (team == NULL) {
3379 				kprintf("could not find team with ID %lu\n", num);
3380 				return 0;
3381 			}
3382 			context = (struct io_context*)team->io_context;
3383 		}
3384 	} else
3385 		context = get_current_io_context(true);
3386 
3387 	kprintf("I/O CONTEXT: %p\n", context);
3388 	kprintf(" root vnode:\t%p\n", context->root);
3389 	kprintf(" cwd vnode:\t%p\n", context->cwd);
3390 	kprintf(" used fds:\t%" B_PRIu32 "\n", context->num_used_fds);
3391 	kprintf(" max fds:\t%" B_PRIu32 "\n", context->table_size);
3392 
3393 	if (context->num_used_fds) {
3394 		kprintf("   no.  type    %*s  ref  open  mode         pos    %*s\n",
3395 			B_PRINTF_POINTER_WIDTH, "ops", B_PRINTF_POINTER_WIDTH, "cookie");
3396 	}
3397 
3398 	for (uint32 i = 0; i < context->table_size; i++) {
3399 		struct file_descriptor* fd = context->fds[i];
3400 		if (fd == NULL)
3401 			continue;
3402 
3403 		kprintf("  %3" B_PRIu32 ":  %4" B_PRId32 "  %p  %3" B_PRId32 "  %4"
3404 			B_PRIu32 "  %4" B_PRIx32 "  %10" B_PRIdOFF "  %p  %s %p\n", i,
3405 			fd->type, fd->ops, fd->ref_count, fd->open_count, fd->open_mode,
3406 			fd->pos, fd->cookie,
3407 			fd->type >= FDTYPE_INDEX && fd->type <= FDTYPE_QUERY
3408 				? "mount" : "vnode",
3409 			fd->u.vnode);
3410 	}
3411 
3412 	kprintf(" used monitors:\t%" B_PRIu32 "\n", context->num_monitors);
3413 	kprintf(" max monitors:\t%" B_PRIu32 "\n", context->max_monitors);
3414 
3415 	set_debug_variable("_cwd", (addr_t)context->cwd);
3416 
3417 	return 0;
3418 }
3419 
3420 
3421 int
3422 dump_vnode_usage(int argc, char** argv)
3423 {
3424 	if (argc != 1) {
3425 		kprintf("usage: %s\n", argv[0]);
3426 		return 0;
3427 	}
3428 
3429 	kprintf("Unused vnodes: %" B_PRIu32 " (max unused %" B_PRIu32 ")\n",
3430 		sUnusedVnodes, kMaxUnusedVnodes);
3431 
3432 	uint32 count = sVnodeTable->CountElements();
3433 
3434 	kprintf("%" B_PRIu32 " vnodes total (%" B_PRIu32 " in use).\n", count,
3435 		count - sUnusedVnodes);
3436 	return 0;
3437 }
3438 
3439 #endif	// ADD_DEBUGGER_COMMANDS
3440 
3441 
3442 /*!	Clears memory specified by an iovec array.
3443 */
3444 static void
3445 zero_iovecs(const iovec* vecs, size_t vecCount, size_t bytes)
3446 {
3447 	for (size_t i = 0; i < vecCount && bytes > 0; i++) {
3448 		size_t length = std::min(vecs[i].iov_len, bytes);
3449 		memset(vecs[i].iov_base, 0, length);
3450 		bytes -= length;
3451 	}
3452 }
3453 
3454 
3455 /*!	Does the dirty work of combining the file_io_vecs with the iovecs
3456 	and calls the file system hooks to read/write the request to disk.
3457 */
3458 static status_t
3459 common_file_io_vec_pages(struct vnode* vnode, void* cookie,
3460 	const file_io_vec* fileVecs, size_t fileVecCount, const iovec* vecs,
3461 	size_t vecCount, uint32* _vecIndex, size_t* _vecOffset, size_t* _numBytes,
3462 	bool doWrite)
3463 {
3464 	if (fileVecCount == 0) {
3465 		// There are no file vecs at this offset, so we're obviously trying
3466 		// to access the file outside of its bounds
3467 		return B_BAD_VALUE;
3468 	}
3469 
3470 	size_t numBytes = *_numBytes;
3471 	uint32 fileVecIndex;
3472 	size_t vecOffset = *_vecOffset;
3473 	uint32 vecIndex = *_vecIndex;
3474 	status_t status;
3475 	size_t size;
3476 
3477 	if (!doWrite && vecOffset == 0) {
3478 		// now directly read the data from the device
3479 		// the first file_io_vec can be read directly
3480 
3481 		if (fileVecs[0].length < (off_t)numBytes)
3482 			size = fileVecs[0].length;
3483 		else
3484 			size = numBytes;
3485 
3486 		if (fileVecs[0].offset >= 0) {
3487 			status = FS_CALL(vnode, read_pages, cookie, fileVecs[0].offset,
3488 				&vecs[vecIndex], vecCount - vecIndex, &size);
3489 		} else {
3490 			// sparse read
3491 			zero_iovecs(&vecs[vecIndex], vecCount - vecIndex, size);
3492 			status = B_OK;
3493 		}
3494 		if (status != B_OK)
3495 			return status;
3496 
3497 		// TODO: this is a work-around for buggy device drivers!
3498 		//	When our own drivers honour the length, we can:
3499 		//	a) also use this direct I/O for writes (otherwise, it would
3500 		//	   overwrite precious data)
3501 		//	b) panic if the term below is true (at least for writes)
3502 		if ((off_t)size > fileVecs[0].length) {
3503 			//dprintf("warning: device driver %p doesn't respect total length "
3504 			//	"in read_pages() call!\n", ref->device);
3505 			size = fileVecs[0].length;
3506 		}
3507 
3508 		ASSERT((off_t)size <= fileVecs[0].length);
3509 
3510 		// If the file portion was contiguous, we're already done now
3511 		if (size == numBytes)
3512 			return B_OK;
3513 
3514 		// if we reached the end of the file, we can return as well
3515 		if ((off_t)size != fileVecs[0].length) {
3516 			*_numBytes = size;
3517 			return B_OK;
3518 		}
3519 
3520 		fileVecIndex = 1;
3521 
3522 		// first, find out where we have to continue in our iovecs
3523 		for (; vecIndex < vecCount; vecIndex++) {
3524 			if (size < vecs[vecIndex].iov_len)
3525 				break;
3526 
3527 			size -= vecs[vecIndex].iov_len;
3528 		}
3529 
3530 		vecOffset = size;
3531 	} else {
3532 		fileVecIndex = 0;
3533 		size = 0;
3534 	}
3535 
3536 	// Too bad, let's process the rest of the file_io_vecs
3537 
3538 	size_t totalSize = size;
3539 	size_t bytesLeft = numBytes - size;
3540 
3541 	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
3542 		const file_io_vec &fileVec = fileVecs[fileVecIndex];
3543 		off_t fileOffset = fileVec.offset;
3544 		off_t fileLeft = min_c(fileVec.length, (off_t)bytesLeft);
3545 
3546 		TRACE(("FILE VEC [%" B_PRIu32 "] length %" B_PRIdOFF "\n", fileVecIndex,
3547 			fileLeft));
3548 
3549 		// process the complete fileVec
3550 		while (fileLeft > 0) {
3551 			iovec tempVecs[MAX_TEMP_IO_VECS];
3552 			uint32 tempCount = 0;
3553 
3554 			// size tracks how much of what is left of the current fileVec
3555 			// (fileLeft) has been assigned to tempVecs
3556 			size = 0;
3557 
3558 			// assign what is left of the current fileVec to the tempVecs
3559 			for (size = 0; (off_t)size < fileLeft && vecIndex < vecCount
3560 					&& tempCount < MAX_TEMP_IO_VECS;) {
3561 				// try to satisfy one iovec per iteration (or as much as
3562 				// possible)
3563 
3564 				// bytes left of the current iovec
3565 				size_t vecLeft = vecs[vecIndex].iov_len - vecOffset;
3566 				if (vecLeft == 0) {
3567 					vecOffset = 0;
3568 					vecIndex++;
3569 					continue;
3570 				}
3571 
3572 				TRACE(("fill vec %" B_PRIu32 ", offset = %lu, size = %lu\n",
3573 					vecIndex, vecOffset, size));
3574 
3575 				// actually available bytes
3576 				size_t tempVecSize = min_c(vecLeft, fileLeft - size);
3577 
3578 				tempVecs[tempCount].iov_base
3579 					= (void*)((addr_t)vecs[vecIndex].iov_base + vecOffset);
3580 				tempVecs[tempCount].iov_len = tempVecSize;
3581 				tempCount++;
3582 
3583 				size += tempVecSize;
3584 				vecOffset += tempVecSize;
3585 			}
3586 
3587 			size_t bytes = size;
3588 
3589 			if (fileOffset == -1) {
3590 				if (doWrite) {
3591 					panic("sparse write attempt: vnode %p", vnode);
3592 					status = B_IO_ERROR;
3593 				} else {
3594 					// sparse read
3595 					zero_iovecs(tempVecs, tempCount, bytes);
3596 					status = B_OK;
3597 				}
3598 			} else if (doWrite) {
3599 				status = FS_CALL(vnode, write_pages, cookie, fileOffset,
3600 					tempVecs, tempCount, &bytes);
3601 			} else {
3602 				status = FS_CALL(vnode, read_pages, cookie, fileOffset,
3603 					tempVecs, tempCount, &bytes);
3604 			}
3605 			if (status != B_OK)
3606 				return status;
3607 
3608 			totalSize += bytes;
3609 			bytesLeft -= size;
3610 			if (fileOffset >= 0)
3611 				fileOffset += size;
3612 			fileLeft -= size;
3613 			//dprintf("-> file left = %Lu\n", fileLeft);
3614 
3615 			if (size != bytes || vecIndex >= vecCount) {
3616 				// there are no more bytes or iovecs, let's bail out
3617 				*_numBytes = totalSize;
3618 				return B_OK;
3619 			}
3620 		}
3621 	}
3622 
3623 	*_vecIndex = vecIndex;
3624 	*_vecOffset = vecOffset;
3625 	*_numBytes = totalSize;
3626 	return B_OK;
3627 }
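
/*	Editor's note: a worked example of the combination logic above (all
	numbers made up). Assume a 1024-byte read described by a single iovec,
	backed by

		fileVecs = { { offset 4096, length 512 }, { offset -1, length 512 } }

	The first 512 bytes are read directly via read_pages() at offset 4096;
	the second file vec is sparse (offset -1), so the remaining 512 bytes
	of the iovec are simply zeroed via zero_iovecs().
*/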
3628 
3629 
3630 static bool
3631 is_user_in_group(gid_t gid)
3632 {
3633 	if (gid == getegid())
3634 		return true;
3635 
3636 	gid_t groups[NGROUPS_MAX];
3637 	int groupCount = getgroups(NGROUPS_MAX, groups);
3638 	for (int i = 0; i < groupCount; i++) {
3639 		if (gid == groups[i])
3640 			return true;
3641 	}
3642 
3643 	return false;
3644 }
3645 
3646 
3647 static status_t
3648 free_io_context(io_context* context)
3649 {
3650 	uint32 i;
3651 
3652 	TIOC(FreeIOContext(context));
3653 
3654 	if (context->root)
3655 		put_vnode(context->root);
3656 
3657 	if (context->cwd)
3658 		put_vnode(context->cwd);
3659 
3660 	mutex_lock(&context->io_mutex);
3661 
3662 	for (i = 0; i < context->table_size; i++) {
3663 		if (struct file_descriptor* descriptor = context->fds[i]) {
3664 			close_fd(context, descriptor);
3665 			put_fd(descriptor);
3666 		}
3667 	}
3668 
3669 	mutex_destroy(&context->io_mutex);
3670 
3671 	remove_node_monitors(context);
3672 	free(context->fds);
3673 	free(context);
3674 
3675 	return B_OK;
3676 }
3677 
3678 
3679 static status_t
3680 resize_monitor_table(struct io_context* context, const int newSize)
3681 {
3682 	int	status = B_OK;
3683 
3684 	if (newSize <= 0 || newSize > MAX_NODE_MONITORS)
3685 		return B_BAD_VALUE;
3686 
3687 	mutex_lock(&context->io_mutex);
3688 
3689 	if ((size_t)newSize < context->num_monitors) {
3690 		status = B_BUSY;
3691 		goto out;
3692 	}
3693 	context->max_monitors = newSize;
3694 
3695 out:
3696 	mutex_unlock(&context->io_mutex);
3697 	return status;
3698 }
3699 
3700 
3701 //	#pragma mark - public API for file systems
3702 
3703 
3704 extern "C" status_t
3705 new_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3706 	fs_vnode_ops* ops)
3707 {
3708 	FUNCTION(("new_vnode(volume = %p (%" B_PRId32 "), vnodeID = %" B_PRId64
3709 		", node = %p)\n", volume, volume->id, vnodeID, privateNode));
3710 
3711 	if (privateNode == NULL)
3712 		return B_BAD_VALUE;
3713 
3714 	int32 tries = BUSY_VNODE_RETRIES;
3715 restart:
3716 	// create the node
3717 	bool nodeCreated;
3718 	struct vnode* vnode;
3719 	status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3720 		nodeCreated);
3721 	if (status != B_OK)
3722 		return status;
3723 
3724 	WriteLocker nodeLocker(sVnodeLock, true);
3725 		// create_new_vnode_and_lock() has locked for us
3726 
3727 	if (!nodeCreated && vnode->IsBusy()) {
3728 		nodeLocker.Unlock();
3729 		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3730 			return B_BUSY;
3731 		goto restart;
3732 	}
3733 
3734 	// file system integrity check:
3735 	// test if the vnode already exists and bail out if this is the case!
3736 	if (!nodeCreated) {
3737 		panic("vnode %" B_PRIdDEV ":%" B_PRIdINO " already exists (node = %p, "
3738 			"vnode->node = %p)!", volume->id, vnodeID, privateNode,
3739 			vnode->private_node);
3740 		return B_ERROR;
3741 	}
3742 
3743 	vnode->private_node = privateNode;
3744 	vnode->ops = ops;
3745 	vnode->SetUnpublished(true);
3746 
3747 	TRACE(("returns: %s\n", strerror(status)));
3748 
3749 	return status;
3750 }
3751 
3752 
3753 extern "C" status_t
3754 publish_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3755 	fs_vnode_ops* ops, int type, uint32 flags)
3756 {
3757 	FUNCTION(("publish_vnode()\n"));
3758 
3759 	int32 tries = BUSY_VNODE_RETRIES;
3760 restart:
3761 	WriteLocker locker(sVnodeLock);
3762 
3763 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3764 
3765 	bool nodeCreated = false;
3766 	if (vnode == NULL) {
3767 		if (privateNode == NULL)
3768 			return B_BAD_VALUE;
3769 
3770 		// create the node
3771 		locker.Unlock();
3772 			// create_new_vnode_and_lock() will re-lock for us on success
3773 		status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3774 			nodeCreated);
3775 		if (status != B_OK)
3776 			return status;
3777 
3778 		locker.SetTo(sVnodeLock, true);
3779 	}
3780 
3781 	if (nodeCreated) {
3782 		vnode->private_node = privateNode;
3783 		vnode->ops = ops;
3784 		vnode->SetUnpublished(true);
3785 	} else if (vnode->IsBusy() && vnode->IsUnpublished()
3786 		&& vnode->private_node == privateNode && vnode->ops == ops) {
3787 		// already known, but not published
3788 	} else if (vnode->IsBusy()) {
3789 		locker.Unlock();
3790 		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3791 			return B_BUSY;
3792 		goto restart;
3793 	} else
3794 		return B_BAD_VALUE;
3795 
3796 	bool publishSpecialSubNode = false;
3797 
3798 	vnode->SetType(type);
3799 	vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
3800 	publishSpecialSubNode = is_special_node_type(type)
3801 		&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
3802 
3803 	status_t status = B_OK;
3804 
3805 	// create sub vnodes, if necessary
3806 	if (volume->sub_volume != NULL || publishSpecialSubNode) {
3807 		locker.Unlock();
3808 
3809 		fs_volume* subVolume = volume;
3810 		if (volume->sub_volume != NULL) {
3811 			while (status == B_OK && subVolume->sub_volume != NULL) {
3812 				subVolume = subVolume->sub_volume;
3813 				status = subVolume->ops->create_sub_vnode(subVolume, vnodeID,
3814 					vnode);
3815 			}
3816 		}
3817 
3818 		if (status == B_OK && publishSpecialSubNode)
3819 			status = create_special_sub_node(vnode, flags);
3820 
3821 		if (status != B_OK) {
3822 			// error -- clean up the created sub vnodes
3823 			while (subVolume->super_volume != volume) {
3824 				subVolume = subVolume->super_volume;
3825 				subVolume->ops->delete_sub_vnode(subVolume, vnode);
3826 			}
3827 		}
3828 
3829 		if (status == B_OK) {
3830 			ReadLocker vnodesReadLocker(sVnodeLock);
3831 			AutoLocker<Vnode> nodeLocker(vnode);
3832 			vnode->SetBusy(false);
3833 			vnode->SetUnpublished(false);
3834 		} else {
3835 			locker.Lock();
3836 			sVnodeTable->Remove(vnode);
3837 			remove_vnode_from_mount_list(vnode, vnode->mount);
3838 			free(vnode);
3839 		}
3840 	} else {
3841 		// we still hold the write lock -- mark the node unbusy and published
3842 		vnode->SetBusy(false);
3843 		vnode->SetUnpublished(false);
3844 	}
3845 
3846 	TRACE(("returns: %s\n", strerror(status)));
3847 
3848 	return status;
3849 }
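

/*	Illustrative sketch (not part of the original source): the typical way a
	file system announces a node to the VFS. The my_fs_announce_node() name
	and gMyFSVnodeOps are hypothetical. new_vnode() leaves the node busy and
	unpublished; publish_vnode() makes it visible to lookups.
*/
#if 0
static status_t
my_fs_announce_node(fs_volume* volume, ino_t id, void* privateNode)
{
	status_t status = new_vnode(volume, id, privateNode, &gMyFSVnodeOps);
	if (status != B_OK)
		return status;

	return publish_vnode(volume, id, privateNode, &gMyFSVnodeOps, S_IFREG, 0);
}
#endif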
3850 
3851 
3852 extern "C" status_t
3853 get_vnode(fs_volume* volume, ino_t vnodeID, void** _privateNode)
3854 {
3855 	struct vnode* vnode;
3856 
3857 	if (volume == NULL)
3858 		return B_BAD_VALUE;
3859 
3860 	status_t status = get_vnode(volume->id, vnodeID, &vnode, true, true);
3861 	if (status != B_OK)
3862 		return status;
3863 
3864 	// If this is a layered FS, we need to get the node cookie for the requested
3865 	// layer.
3866 	if (HAS_FS_CALL(vnode, get_super_vnode)) {
3867 		fs_vnode resolvedNode;
3868 		status_t status = FS_CALL(vnode, get_super_vnode, volume,
3869 			&resolvedNode);
3870 		if (status != B_OK) {
3871 			panic("get_vnode(): Failed to get super node for vnode %p, "
3872 				"volume: %p", vnode, volume);
3873 			put_vnode(vnode);
3874 			return status;
3875 		}
3876 
3877 		if (_privateNode != NULL)
3878 			*_privateNode = resolvedNode.private_node;
3879 	} else if (_privateNode != NULL)
3880 		*_privateNode = vnode->private_node;
3881 
3882 	return B_OK;
3883 }
3884 
3885 
3886 extern "C" status_t
3887 acquire_vnode(fs_volume* volume, ino_t vnodeID)
3888 {
3889 	struct vnode* vnode;
3890 
3891 	rw_lock_read_lock(&sVnodeLock);
3892 	vnode = lookup_vnode(volume->id, vnodeID);
3893 	rw_lock_read_unlock(&sVnodeLock);
3894 
3895 	if (vnode == NULL)
3896 		return B_BAD_VALUE;
3897 
3898 	inc_vnode_ref_count(vnode);
3899 	return B_OK;
3900 }
3901 
3902 
3903 extern "C" status_t
3904 put_vnode(fs_volume* volume, ino_t vnodeID)
3905 {
3906 	struct vnode* vnode;
3907 
3908 	rw_lock_read_lock(&sVnodeLock);
3909 	vnode = lookup_vnode(volume->id, vnodeID);
3910 	rw_lock_read_unlock(&sVnodeLock);
3911 
3912 	if (vnode == NULL)
3913 		return B_BAD_VALUE;
3914 
3915 	dec_vnode_ref_count(vnode, false, true);
3916 	return B_OK;
3917 }
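

/*	Illustrative sketch (not part of the original source): get_vnode() and
	put_vnode() calls must be balanced -- each successful get_vnode() pins
	the node in memory until the matching put_vnode().
*/
#if 0
	void* privateNode;
	if (get_vnode(volume, nodeID, &privateNode) == B_OK) {
		// ... work with the node's private data ...
		put_vnode(volume, nodeID);
	}
#endif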
3918 
3919 
3920 extern "C" status_t
3921 remove_vnode(fs_volume* volume, ino_t vnodeID)
3922 {
3923 	ReadLocker locker(sVnodeLock);
3924 
3925 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3926 	if (vnode == NULL)
3927 		return B_ENTRY_NOT_FOUND;
3928 
3929 	if (vnode->covered_by != NULL || vnode->covers != NULL) {
3930 		// this vnode is in use
3931 		return B_BUSY;
3932 	}
3933 
3934 	vnode->Lock();
3935 
3936 	vnode->SetRemoved(true);
3937 	bool removeUnpublished = false;
3938 
3939 	if (vnode->IsUnpublished()) {
3940 		// prepare the vnode for deletion
3941 		removeUnpublished = true;
3942 		vnode->SetBusy(true);
3943 	}
3944 
3945 	vnode->Unlock();
3946 	locker.Unlock();
3947 
3948 	if (removeUnpublished) {
3949 		// If the vnode hasn't been published yet, we delete it here
3950 		atomic_add(&vnode->ref_count, -1);
3951 		free_vnode(vnode, true);
3952 	}
3953 
3954 	return B_OK;
3955 }
3956 
3957 
3958 extern "C" status_t
3959 unremove_vnode(fs_volume* volume, ino_t vnodeID)
3960 {
3961 	struct vnode* vnode;
3962 
3963 	rw_lock_read_lock(&sVnodeLock);
3964 
3965 	vnode = lookup_vnode(volume->id, vnodeID);
3966 	if (vnode) {
3967 		AutoLocker<Vnode> nodeLocker(vnode);
3968 		vnode->SetRemoved(false);
3969 	}
3970 
3971 	rw_lock_read_unlock(&sVnodeLock);
3972 	return B_OK;
3973 }
3974 
3975 
3976 extern "C" status_t
3977 get_vnode_removed(fs_volume* volume, ino_t vnodeID, bool* _removed)
3978 {
3979 	ReadLocker _(sVnodeLock);
3980 
3981 	if (struct vnode* vnode = lookup_vnode(volume->id, vnodeID)) {
3982 		if (_removed != NULL)
3983 			*_removed = vnode->IsRemoved();
3984 		return B_OK;
3985 	}
3986 
3987 	return B_BAD_VALUE;
3988 }
3989 
3990 
3991 extern "C" status_t
3992 mark_vnode_busy(fs_volume* volume, ino_t vnodeID, bool busy)
3993 {
3994 	ReadLocker locker(sVnodeLock);
3995 
3996 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3997 	if (vnode == NULL)
3998 		return B_ENTRY_NOT_FOUND;
3999 
4000 	// are we trying to mark an already busy node busy again?
4001 	if (busy && vnode->IsBusy())
4002 		return B_BUSY;
4003 
4004 	vnode->Lock();
4005 	vnode->SetBusy(busy);
4006 	vnode->Unlock();
4007 
4008 	return B_OK;
4009 }
4010 
4011 
4012 extern "C" status_t
4013 change_vnode_id(fs_volume* volume, ino_t vnodeID, ino_t newID)
4014 {
4015 	WriteLocker locker(sVnodeLock);
4016 
4017 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
4018 	if (vnode == NULL)
4019 		return B_ENTRY_NOT_FOUND;
4020 
4021 	sVnodeTable->Remove(vnode);
4022 	vnode->id = newID;
4023 	sVnodeTable->Insert(vnode);
4024 
4025 	if (vnode->cache != NULL && vnode->cache->type == CACHE_TYPE_VNODE)
4026 		((VMVnodeCache*)vnode->cache)->SetVnodeID(newID);
4027 
4028 	return B_OK;
4029 }
4030 
4031 
4032 extern "C" fs_volume*
4033 volume_for_vnode(fs_vnode* _vnode)
4034 {
4035 	if (_vnode == NULL)
4036 		return NULL;
4037 
4038 	struct vnode* vnode = static_cast<struct vnode*>(_vnode);
4039 	return vnode->mount->volume;
4040 }
4041 
4042 
4043 extern "C" status_t
4044 check_access_permissions(int accessMode, mode_t mode, gid_t nodeGroupID,
4045 	uid_t nodeUserID)
4046 {
4047 	// get node permissions
4048 	int userPermissions = (mode & S_IRWXU) >> 6;
4049 	int groupPermissions = (mode & S_IRWXG) >> 3;
4050 	int otherPermissions = mode & S_IRWXO;
4051 
4052 	// get the node permissions for this uid/gid
4053 	int permissions = 0;
4054 	uid_t uid = geteuid();
4055 
4056 	if (uid == 0) {
4057 		// user is root
4058 		// root always has read/write permission, but at least one of the
4059 		// X bits must be set for execute permission
4060 		permissions = userPermissions | groupPermissions | otherPermissions
4061 			| S_IROTH | S_IWOTH;
4062 		if (S_ISDIR(mode))
4063 			permissions |= S_IXOTH;
4064 	} else if (uid == nodeUserID) {
4065 		// user is node owner
4066 		permissions = userPermissions;
4067 	} else if (is_user_in_group(nodeGroupID)) {
4068 		// user is in owning group
4069 		permissions = groupPermissions;
4070 	} else {
4071 		// user is one of the others
4072 		permissions = otherPermissions;
4073 	}
4074 
4075 	return (accessMode & ~permissions) == 0 ? B_OK : B_PERMISSION_DENIED;
4076 }
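

/*	Worked example (illustrative, not part of the original source): for a node
	with mode 0640 owned by user 1000/group 100, a caller with euid 1000 gets
	permissions = 6 (rw-), so R_OK | W_OK passes while X_OK fails with
	B_PERMISSION_DENIED; a caller that is merely in group 100 gets
	permissions = 4 (r--), so only R_OK passes.
*/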
4077 
4078 
4079 #if 0
4080 extern "C" status_t
4081 read_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4082 	size_t* _numBytes)
4083 {
4084 	struct file_descriptor* descriptor;
4085 	struct vnode* vnode;
4086 
4087 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4088 	if (descriptor == NULL)
4089 		return B_FILE_ERROR;
4090 
4091 	status_t status = vfs_read_pages(vnode, descriptor->cookie, pos, vecs,
4092 		count, 0, _numBytes);
4093 
4094 	put_fd(descriptor);
4095 	return status;
4096 }
4097 
4098 
4099 extern "C" status_t
4100 write_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4101 	size_t* _numBytes)
4102 {
4103 	struct file_descriptor* descriptor;
4104 	struct vnode* vnode;
4105 
4106 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4107 	if (descriptor == NULL)
4108 		return B_FILE_ERROR;
4109 
4110 	status_t status = vfs_write_pages(vnode, descriptor->cookie, pos, vecs,
4111 		count, 0, _numBytes);
4112 
4113 	put_fd(descriptor);
4114 	return status;
4115 }
4116 #endif
4117 
4118 
4119 extern "C" status_t
4120 read_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4121 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4122 	size_t* _bytes)
4123 {
4124 	struct file_descriptor* descriptor;
4125 	struct vnode* vnode;
4126 
4127 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4128 	if (descriptor == NULL)
4129 		return B_FILE_ERROR;
4130 
4131 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4132 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4133 		false);
4134 
4135 	put_fd(descriptor);
4136 	return status;
4137 }
4138 
4139 
4140 extern "C" status_t
4141 write_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4142 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4143 	size_t* _bytes)
4144 {
4145 	struct file_descriptor* descriptor;
4146 	struct vnode* vnode;
4147 
4148 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4149 	if (descriptor == NULL)
4150 		return B_FILE_ERROR;
4151 
4152 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4153 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4154 		true);
4155 
4156 	put_fd(descriptor);
4157 	return status;
4158 }
4159 
4160 
4161 extern "C" status_t
4162 entry_cache_add(dev_t mountID, ino_t dirID, const char* name, ino_t nodeID)
4163 {
4164 	// lookup mount -- the caller is required to make sure that the mount
4165 	// won't go away
4166 	ReadLocker locker(sMountLock);
4167 	struct fs_mount* mount = find_mount(mountID);
4168 	if (mount == NULL)
4169 		return B_BAD_VALUE;
4170 	locker.Unlock();
4171 
4172 	return mount->entry_cache.Add(dirID, name, nodeID, false);
4173 }
4174 
4175 
4176 extern "C" status_t
4177 entry_cache_add_missing(dev_t mountID, ino_t dirID, const char* name)
4178 {
4179 	// lookup mount -- the caller is required to make sure that the mount
4180 	// won't go away
4181 	ReadLocker locker(sMountLock);
4182 	struct fs_mount* mount = find_mount(mountID);
4183 	if (mount == NULL)
4184 		return B_BAD_VALUE;
4185 	locker.Unlock();
4186 
4187 	return mount->entry_cache.Add(dirID, name, -1, true);
4188 }
4189 
4190 
4191 extern "C" status_t
4192 entry_cache_remove(dev_t mountID, ino_t dirID, const char* name)
4193 {
4194 	// lookup mount -- the caller is required to make sure that the mount
4195 	// won't go away
4196 	ReadLocker locker(sMountLock);
4197 	struct fs_mount* mount = find_mount(mountID);
4198 	if (mount == NULL)
4199 		return B_BAD_VALUE;
4200 	locker.Unlock();
4201 
4202 	return mount->entry_cache.Remove(dirID, name);
4203 }
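

/*	Illustrative sketch (not part of the original source): how a file system
	might keep the entry cache coherent from its hooks. The dirID/name/nodeID
	variables are hypothetical.
*/
#if 0
	// after a successful lookup:
	entry_cache_add(volume->id, dirID, name, nodeID);

	// after a lookup that found no entry:
	entry_cache_add_missing(volume->id, dirID, name);

	// after the entry has been unlinked or renamed away:
	entry_cache_remove(volume->id, dirID, name);
#endif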
4204 
4205 
4206 //	#pragma mark - private VFS API
4207 //	Functions the VFS exports for other parts of the kernel
4208 
4209 
4210 /*! Acquires another reference to the vnode that has to be released
4211 	by calling vfs_put_vnode().
4212 */
4213 void
4214 vfs_acquire_vnode(struct vnode* vnode)
4215 {
4216 	inc_vnode_ref_count(vnode);
4217 }
4218 
4219 
4220 /*! This is currently called from file_cache_create() only.
4221 	It's probably a temporary solution as long as devfs requires that
4222 	fs_read_pages()/fs_write_pages() are called with the standard
4223 	open cookie and not with a device cookie.
4224 	If that's done differently, remove this call; it has no other
4225 	purpose.
4226 */
4227 extern "C" status_t
4228 vfs_get_cookie_from_fd(int fd, void** _cookie)
4229 {
4230 	struct file_descriptor* descriptor;
4231 
4232 	descriptor = get_fd(get_current_io_context(true), fd);
4233 	if (descriptor == NULL)
4234 		return B_FILE_ERROR;
4235 
4236 	*_cookie = descriptor->cookie;
4237 	return B_OK;
4238 }
4239 
4240 
4241 extern "C" status_t
4242 vfs_get_vnode_from_fd(int fd, bool kernel, struct vnode** vnode)
4243 {
4244 	*vnode = get_vnode_from_fd(fd, kernel);
4245 
4246 	if (*vnode == NULL)
4247 		return B_FILE_ERROR;
4248 
4249 	return B_NO_ERROR;
4250 }
4251 
4252 
4253 extern "C" status_t
4254 vfs_get_vnode_from_path(const char* path, bool kernel, struct vnode** _vnode)
4255 {
4256 	TRACE(("vfs_get_vnode_from_path: entry. path = '%s', kernel %d\n",
4257 		path, kernel));
4258 
4259 	KPath pathBuffer;
4260 	if (pathBuffer.InitCheck() != B_OK)
4261 		return B_NO_MEMORY;
4262 
4263 	char* buffer = pathBuffer.LockBuffer();
4264 	strlcpy(buffer, path, pathBuffer.BufferSize());
4265 
4266 	struct vnode* vnode;
4267 	status_t status = path_to_vnode(buffer, true, &vnode, NULL, kernel);
4268 	if (status != B_OK)
4269 		return status;
4270 
4271 	*_vnode = vnode;
4272 	return B_OK;
4273 }
4274 
4275 
4276 extern "C" status_t
4277 vfs_get_vnode(dev_t mountID, ino_t vnodeID, bool canWait, struct vnode** _vnode)
4278 {
4279 	struct vnode* vnode = NULL;
4280 
4281 	status_t status = get_vnode(mountID, vnodeID, &vnode, canWait, false);
4282 	if (status != B_OK)
4283 		return status;
4284 
4285 	*_vnode = vnode;
4286 	return B_OK;
4287 }
4288 
4289 
4290 extern "C" status_t
4291 vfs_entry_ref_to_vnode(dev_t mountID, ino_t directoryID,
4292 	const char* name, struct vnode** _vnode)
4293 {
4294 	return entry_ref_to_vnode(mountID, directoryID, name, false, true, _vnode);
4295 }
4296 
4297 
4298 extern "C" void
4299 vfs_vnode_to_node_ref(struct vnode* vnode, dev_t* _mountID, ino_t* _vnodeID)
4300 {
4301 	*_mountID = vnode->device;
4302 	*_vnodeID = vnode->id;
4303 }
4304 
4305 
4306 /*!
4307 	Helper function abstracting the process of "converting" a given
4308 	vnode-pointer to a fs_vnode-pointer.
4309 	Currently only used in bindfs.
4310 */
4311 extern "C" fs_vnode*
4312 vfs_fsnode_for_vnode(struct vnode* vnode)
4313 {
4314 	return vnode;
4315 }
4316 
4317 
4318 /*!
4319 	Calls fs_open() on the given vnode and returns a new
4320 	file descriptor for it
4321 */
4322 int
4323 vfs_open_vnode(struct vnode* vnode, int openMode, bool kernel)
4324 {
4325 	return open_vnode(vnode, openMode, kernel);
4326 }
4327 
4328 
4329 /*!	Looks up a vnode with the given mount and vnode ID.
4330 	Must only be used with "in-use" vnodes as it doesn't grab a reference
4331 	to the node.
4332 	It's currently only used by file_cache_create().
4333 */
4334 extern "C" status_t
4335 vfs_lookup_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode)
4336 {
4337 	rw_lock_read_lock(&sVnodeLock);
4338 	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
4339 	rw_lock_read_unlock(&sVnodeLock);
4340 
4341 	if (vnode == NULL)
4342 		return B_ERROR;
4343 
4344 	*_vnode = vnode;
4345 	return B_OK;
4346 }
4347 
4348 
4349 extern "C" status_t
4350 vfs_get_fs_node_from_path(fs_volume* volume, const char* path,
4351 	bool traverseLeafLink, bool kernel, void** _node)
4352 {
4353 	TRACE(("vfs_get_fs_node_from_path(volume = %p, path = \"%s\", kernel %d)\n",
4354 		volume, path, kernel));
4355 
4356 	KPath pathBuffer;
4357 	if (pathBuffer.InitCheck() != B_OK)
4358 		return B_NO_MEMORY;
4359 
4360 	fs_mount* mount;
4361 	status_t status = get_mount(volume->id, &mount);
4362 	if (status != B_OK)
4363 		return status;
4364 
4365 	char* buffer = pathBuffer.LockBuffer();
4366 	strlcpy(buffer, path, pathBuffer.BufferSize());
4367 
4368 	struct vnode* vnode = mount->root_vnode;
4369 
4370 	if (buffer[0] == '/')
4371 		status = path_to_vnode(buffer, traverseLeafLink, &vnode, NULL, kernel);
4372 	else {
4373 		inc_vnode_ref_count(vnode);
4374 			// vnode_path_to_vnode() releases a reference to the starting vnode
4375 		status = vnode_path_to_vnode(vnode, buffer, traverseLeafLink, 0,
4376 			kernel, &vnode, NULL);
4377 	}
4378 
4379 	put_mount(mount);
4380 
4381 	if (status != B_OK)
4382 		return status;
4383 
4384 	if (vnode->device != volume->id) {
4385 		// wrong mount ID - must not gain access to foreign file system nodes
4386 		put_vnode(vnode);
4387 		return B_BAD_VALUE;
4388 	}
4389 
4390 	// Use get_vnode() to resolve the cookie for the right layer.
4391 	status = get_vnode(volume, vnode->id, _node);
4392 	put_vnode(vnode);
4393 
4394 	return status;
4395 }
4396 
4397 
4398 status_t
4399 vfs_read_stat(int fd, const char* path, bool traverseLeafLink,
4400 	struct stat* stat, bool kernel)
4401 {
4402 	status_t status;
4403 
4404 	if (path != NULL) {
4405 		// path given: get the stat of the node referred to by (fd, path)
4406 		KPath pathBuffer(path);
4407 		if (pathBuffer.InitCheck() != B_OK)
4408 			return B_NO_MEMORY;
4409 
4410 		status = common_path_read_stat(fd, pathBuffer.LockBuffer(),
4411 			traverseLeafLink, stat, kernel);
4412 	} else {
4413 		// no path given: get the FD and use the FD operation
4414 		struct file_descriptor* descriptor
4415 			= get_fd(get_current_io_context(kernel), fd);
4416 		if (descriptor == NULL)
4417 			return B_FILE_ERROR;
4418 
4419 		if (descriptor->ops->fd_read_stat)
4420 			status = descriptor->ops->fd_read_stat(descriptor, stat);
4421 		else
4422 			status = B_UNSUPPORTED;
4423 
4424 		put_fd(descriptor);
4425 	}
4426 
4427 	return status;
4428 }
4429 
4430 
4431 /*!	Finds the full path to the file that contains the module \a moduleName,
4432 	puts it into \a pathBuffer, and returns B_OK for success.
4433 	If \a pathBuffer was too small, it returns \c B_BUFFER_OVERFLOW,
4434 	\c B_ENTRY_NOT_FOUND if no file could be found.
4435 	\a pathBuffer is clobbered in any case and must not be relied on if this
4436 	function returns unsuccessfully.
4437 	\a basePath and \a pathBuffer must not point to the same space.
4438 */
4439 status_t
4440 vfs_get_module_path(const char* basePath, const char* moduleName,
4441 	char* pathBuffer, size_t bufferSize)
4442 {
4443 	struct vnode* dir;
4444 	struct vnode* file;
4445 	status_t status;
4446 	size_t length;
4447 	char* path;
4448 
4449 	if (bufferSize == 0
4450 		|| strlcpy(pathBuffer, basePath, bufferSize) >= bufferSize)
4451 		return B_BUFFER_OVERFLOW;
4452 
4453 	status = path_to_vnode(pathBuffer, true, &dir, NULL, true);
4454 	if (status != B_OK)
4455 		return status;
4456 
4457 	// the path buffer had been clobbered by the above call
4458 	length = strlcpy(pathBuffer, basePath, bufferSize);
4459 	if (pathBuffer[length - 1] != '/')
4460 		pathBuffer[length++] = '/';
4461 
4462 	path = pathBuffer + length;
4463 	bufferSize -= length;
4464 
4465 	while (moduleName) {
4466 		char* nextPath = strchr(moduleName, '/');
4467 		if (nextPath == NULL)
4468 			length = strlen(moduleName);
4469 		else {
4470 			length = nextPath - moduleName;
4471 			nextPath++;
4472 		}
4473 
4474 		if (length + 1 >= bufferSize) {
4475 			status = B_BUFFER_OVERFLOW;
4476 			goto err;
4477 		}
4478 
4479 		memcpy(path, moduleName, length);
4480 		path[length] = '\0';
4481 		moduleName = nextPath;
4482 
4483 		status = vnode_path_to_vnode(dir, path, true, 0, true, &file, NULL);
4484 		if (status != B_OK) {
4485 			// vnode_path_to_vnode() has already released the reference to dir
4486 			return status;
4487 		}
4488 
4489 		if (S_ISDIR(file->Type())) {
4490 			// go to the next directory
4491 			path[length] = '/';
4492 			path[length + 1] = '\0';
4493 			path += length + 1;
4494 			bufferSize -= length + 1;
4495 
4496 			dir = file;
4497 		} else if (S_ISREG(file->Type())) {
4498 			// it's a file so it should be what we've searched for
4499 			put_vnode(file);
4500 
4501 			return B_OK;
4502 		} else {
4503 			TRACE(("vfs_get_module_path(): something is strange here: "
4504 				"0x%08" B_PRIx32 "...\n", file->Type()));
4505 			status = B_ERROR;
4506 			dir = file;
4507 			goto err;
4508 		}
4509 	}
4510 
4511 	// if we got here, the moduleName just pointed to a directory, not to
4512 	// a real module - what should we do in this case?
4513 	status = B_ENTRY_NOT_FOUND;
4514 
4515 err:
4516 	put_vnode(dir);
4517 	return status;
4518 }
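

/*	Worked example (illustrative, not part of the original source): with
	basePath "/boot/system/add-ons/kernel" and moduleName
	"bus_managers/usb/v3", the loop first appends "bus_managers" and descends
	into it as a directory; if the next component "usb" resolves to a regular
	file, the function stops there and returns B_OK with
	"/boot/system/add-ons/kernel/bus_managers/usb" in pathBuffer -- the
	remaining "v3" suffix would then name a module inside that file.
*/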
4519 
4520 
4521 /*!	\brief Normalizes a given path.
4522 
4523 	The path must refer to an existing or non-existing entry in an existing
4524 	directory; that is, after chopping off the leaf component, the remaining
4525 	path must refer to an existing directory.
4526 
4527 	The returned path will be canonical in that it will be absolute, will not
4528 	contain any "." or ".." components or duplicate occurrences of '/'s,
4529 	and none of the directory components will be symbolic links.
4530 
4531 	Any two paths referring to the same entry will result in the same
4532 	normalized path (well, that is pretty much the definition of `normalized',
4533 	isn't it :-).
4534 
4535 	\param path The path to be normalized.
4536 	\param buffer The buffer into which the normalized path will be written.
4537 		   May be the same one as \a path.
4538 	\param bufferSize The size of \a buffer.
4539 	\param traverseLink If \c true, the function also resolves leaf symlinks.
4540 	\param kernel \c true, if the IO context of the kernel shall be used,
4541 		   otherwise that of the team this thread belongs to. Only relevant,
4542 		   if the path is relative (to get the CWD).
4543 	\return \c B_OK if everything went fine, another error code otherwise.
4544 */
4545 status_t
4546 vfs_normalize_path(const char* path, char* buffer, size_t bufferSize,
4547 	bool traverseLink, bool kernel)
4548 {
4549 	if (!path || !buffer || bufferSize < 1)
4550 		return B_BAD_VALUE;
4551 
4552 	if (path != buffer) {
4553 		if (strlcpy(buffer, path, bufferSize) >= bufferSize)
4554 			return B_BUFFER_OVERFLOW;
4555 	}
4556 
4557 	return normalize_path(buffer, bufferSize, traverseLink, kernel);
4558 }
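

/*	Example (illustrative, not part of the original source): assuming the
	directories exist, "/boot/./system//lib/../bin" normalizes to
	"/boot/system/bin"; a relative path is resolved against the caller's CWD
	first and therefore comes back absolute as well.
*/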
4559 
4560 
4561 /*!	\brief Gets the parent of the passed in node.
4562 
4563 	Gets the parent of the passed in node, and correctly resolves covered
4564 	nodes.
4565 */
4566 extern "C" status_t
4567 vfs_resolve_parent(struct vnode* parent, dev_t* device, ino_t* node)
4568 {
4569 	return resolve_covered_parent(parent, device, node,
4570 		get_current_io_context(true));
4571 }
4572 
4573 
4574 /*!	\brief Creates a special node in the file system.
4575 
4576 	The caller gets a reference to the newly created node (which is passed
4577 	back through \a _createdVnode) and is responsible for releasing it.
4578 
4579 	\param path The path where to create the entry for the node. Can be \c NULL,
4580 		in which case the node is created without an entry in the root FS -- it
4581 		will automatically be deleted when the last reference has been released.
4582 	\param subVnode The definition of the subnode. Can be \c NULL, in which case
4583 		the target file system will just create the node with its standard
4584 		operations. Depending on the type of the node a subnode might be created
4585 		automatically, though.
4586 	\param mode The type and permissions for the node to be created.
4587 	\param flags Flags to be passed to the creating FS.
4588 	\param kernel \c true, if called in the kernel context (relevant only if
4589 		\a path is not \c NULL and not absolute).
4590 	\param _superVnode Pointer to a pre-allocated structure to be filled by the
4591 		file system creating the node, with the private data pointer and
4592 		operations for the super node. Can be \c NULL.
4593 	\param _createdVnode Pointer to pre-allocated storage in which to store
4594 		the pointer to the newly created node.
4595 	\return \c B_OK, if everything went fine, another error code otherwise.
4596 */
4597 status_t
4598 vfs_create_special_node(const char* path, fs_vnode* subVnode, mode_t mode,
4599 	uint32 flags, bool kernel, fs_vnode* _superVnode,
4600 	struct vnode** _createdVnode)
4601 {
4602 	struct vnode* dirNode;
4603 	char _leaf[B_FILE_NAME_LENGTH];
4604 	char* leaf = NULL;
4605 
4606 	if (path) {
4607 		// We've got a path. Get the dir vnode and the leaf name.
4608 		KPath tmpPathBuffer;
4609 		if (tmpPathBuffer.InitCheck() != B_OK)
4610 			return B_NO_MEMORY;
4611 
4612 		char* tmpPath = tmpPathBuffer.LockBuffer();
4613 		if (strlcpy(tmpPath, path, B_PATH_NAME_LENGTH) >= B_PATH_NAME_LENGTH)
4614 			return B_NAME_TOO_LONG;
4615 
4616 		// get the dir vnode and the leaf name
4617 		leaf = _leaf;
4618 		status_t error = path_to_dir_vnode(tmpPath, &dirNode, leaf, kernel);
4619 		if (error != B_OK)
4620 			return error;
4621 	} else {
4622 		// No path. Create the node in the root FS.
4623 		dirNode = sRoot;
4624 		inc_vnode_ref_count(dirNode);
4625 	}
4626 
4627 	VNodePutter _(dirNode);
4628 
4629 	// check support for creating special nodes
4630 	if (!HAS_FS_CALL(dirNode, create_special_node))
4631 		return B_UNSUPPORTED;
4632 
4633 	// create the node
4634 	fs_vnode superVnode;
4635 	ino_t nodeID;
4636 	status_t status = FS_CALL(dirNode, create_special_node, leaf, subVnode,
4637 		mode, flags, _superVnode != NULL ? _superVnode : &superVnode, &nodeID);
4638 	if (status != B_OK)
4639 		return status;
4640 
4641 	// lookup the node
4642 	rw_lock_read_lock(&sVnodeLock);
4643 	*_createdVnode = lookup_vnode(dirNode->mount->id, nodeID);
4644 	rw_lock_read_unlock(&sVnodeLock);
4645 
4646 	if (*_createdVnode == NULL) {
4647 		panic("vfs_create_special_node(): lookup of node failed");
4648 		return B_ERROR;
4649 	}
4650 
4651 	return B_OK;
4652 }
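

/*	Illustrative sketch (not part of the original source): creating an
	anonymous special node, e.g. a FIFO without an entry in any directory;
	it goes away once the last reference is released.
*/
#if 0
	struct vnode* fifoVnode;
	status_t status = vfs_create_special_node(NULL, NULL, S_IFIFO | 0666, 0,
		true, NULL, &fifoVnode);
	if (status == B_OK) {
		// ... hand the node to its users ...
		vfs_put_vnode(fifoVnode);
	}
#endif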
4653 
4654 
4655 extern "C" void
4656 vfs_put_vnode(struct vnode* vnode)
4657 {
4658 	put_vnode(vnode);
4659 }
4660 
4661 
4662 extern "C" status_t
4663 vfs_get_cwd(dev_t* _mountID, ino_t* _vnodeID)
4664 {
4665 	// Get current working directory from io context
4666 	struct io_context* context = get_current_io_context(false);
4667 	status_t status = B_OK;
4668 
4669 	mutex_lock(&context->io_mutex);
4670 
4671 	if (context->cwd != NULL) {
4672 		*_mountID = context->cwd->device;
4673 		*_vnodeID = context->cwd->id;
4674 	} else
4675 		status = B_ERROR;
4676 
4677 	mutex_unlock(&context->io_mutex);
4678 	return status;
4679 }
4680 
4681 
4682 status_t
4683 vfs_unmount(dev_t mountID, uint32 flags)
4684 {
4685 	return fs_unmount(NULL, mountID, flags, true);
4686 }
4687 
4688 
4689 extern "C" status_t
4690 vfs_disconnect_vnode(dev_t mountID, ino_t vnodeID)
4691 {
4692 	struct vnode* vnode;
4693 
4694 	status_t status = get_vnode(mountID, vnodeID, &vnode, true, true);
4695 	if (status != B_OK)
4696 		return status;
4697 
4698 	disconnect_mount_or_vnode_fds(vnode->mount, vnode);
4699 	put_vnode(vnode);
4700 	return B_OK;
4701 }
4702 
4703 
4704 extern "C" void
4705 vfs_free_unused_vnodes(int32 level)
4706 {
4707 	vnode_low_resource_handler(NULL,
4708 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
4709 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
4710 		level);
4711 }
4712 
4713 
4714 extern "C" bool
4715 vfs_can_page(struct vnode* vnode, void* cookie)
4716 {
4717 	FUNCTION(("vfs_can_page: vnode %p\n", vnode));
4718 
4719 	if (HAS_FS_CALL(vnode, can_page))
4720 		return FS_CALL(vnode, can_page, cookie);
4721 	return false;
4722 }
4723 
4724 
4725 extern "C" status_t
4726 vfs_read_pages(struct vnode* vnode, void* cookie, off_t pos,
4727 	const generic_io_vec* vecs, size_t count, uint32 flags,
4728 	generic_size_t* _numBytes)
4729 {
4730 	FUNCTION(("vfs_read_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4731 		vecs, pos));
4732 
4733 #if VFS_PAGES_IO_TRACING
4734 	generic_size_t bytesRequested = *_numBytes;
4735 #endif
4736 
4737 	IORequest request;
4738 	status_t status = request.Init(pos, vecs, count, *_numBytes, false, flags);
4739 	if (status == B_OK) {
4740 		status = vfs_vnode_io(vnode, cookie, &request);
4741 		if (status == B_OK)
4742 			status = request.Wait();
4743 		*_numBytes = request.TransferredBytes();
4744 	}
4745 
4746 	TPIO(ReadPages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4747 		status, *_numBytes));
4748 
4749 	return status;
4750 }
4751 
4752 
4753 extern "C" status_t
4754 vfs_write_pages(struct vnode* vnode, void* cookie, off_t pos,
4755 	const generic_io_vec* vecs, size_t count, uint32 flags,
4756 	generic_size_t* _numBytes)
4757 {
4758 	FUNCTION(("vfs_write_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4759 		vecs, pos));
4760 
4761 #if VFS_PAGES_IO_TRACING
4762 	generic_size_t bytesRequested = *_numBytes;
4763 #endif
4764 
4765 	IORequest request;
4766 	status_t status = request.Init(pos, vecs, count, *_numBytes, true, flags);
4767 	if (status == B_OK) {
4768 		status = vfs_vnode_io(vnode, cookie, &request);
4769 		if (status == B_OK)
4770 			status = request.Wait();
4771 		*_numBytes = request.TransferredBytes();
4772 	}
4773 
4774 	TPIO(WritePages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4775 		status, *_numBytes));
4776 
4777 	return status;
4778 }
4779 
4780 
4781 /*!	Gets the vnode's VMCache object. If the vnode doesn't have one yet, it
4782 	will be created, provided \a allocate is \c true.
4783 	On success, a reference to the returned cache will have been
4784 	acquired as well.
4785 */
4786 extern "C" status_t
4787 vfs_get_vnode_cache(struct vnode* vnode, VMCache** _cache, bool allocate)
4788 {
4789 	if (vnode->cache != NULL) {
4790 		vnode->cache->AcquireRef();
4791 		*_cache = vnode->cache;
4792 		return B_OK;
4793 	}
4794 
4795 	rw_lock_read_lock(&sVnodeLock);
4796 	vnode->Lock();
4797 
4798 	status_t status = B_OK;
4799 
4800 	// The cache could have been created in the meantime
4801 	if (vnode->cache == NULL) {
4802 		if (allocate) {
4803 			// TODO: actually the vnode needs to be busy already here, or
4804 			//	else this won't work...
4805 			bool wasBusy = vnode->IsBusy();
4806 			vnode->SetBusy(true);
4807 
4808 			vnode->Unlock();
4809 			rw_lock_read_unlock(&sVnodeLock);
4810 
4811 			status = vm_create_vnode_cache(vnode, &vnode->cache);
4812 
4813 			rw_lock_read_lock(&sVnodeLock);
4814 			vnode->Lock();
4815 			vnode->SetBusy(wasBusy);
4816 		} else
4817 			status = B_BAD_VALUE;
4818 	}
4819 
4820 	vnode->Unlock();
4821 	rw_lock_read_unlock(&sVnodeLock);
4822 
4823 	if (status == B_OK) {
4824 		vnode->cache->AcquireRef();
4825 		*_cache = vnode->cache;
4826 	}
4827 
4828 	return status;
4829 }
4830 
4831 
4832 /*!	Sets the vnode's VMCache object, for subsystems that want to manage
4833 	their own.
4834 	On success, it will also acquire a reference to the cache
4835 	it was given.
4836 */
4837 extern "C" status_t
4838 vfs_set_vnode_cache(struct vnode* vnode, VMCache* _cache)
4839 {
4840 	rw_lock_read_lock(&sVnodeLock);
4841 	vnode->Lock();
4842 
4843 	status_t status = B_OK;
4844 	if (vnode->cache != NULL) {
4845 		status = B_NOT_ALLOWED;
4846 	} else {
4847 		vnode->cache = _cache;
4848 		_cache->AcquireRef();
4849 	}
4850 
4851 	vnode->Unlock();
4852 	rw_lock_read_unlock(&sVnodeLock);
4853 	return status;
4854 }
4855 
4856 
4857 status_t
4858 vfs_get_file_map(struct vnode* vnode, off_t offset, size_t size,
4859 	file_io_vec* vecs, size_t* _count)
4860 {
4861 	FUNCTION(("vfs_get_file_map: vnode %p, vecs %p, offset %" B_PRIdOFF
4862 		", size = %" B_PRIuSIZE "\n", vnode, vecs, offset, size));
4863 
4864 	return FS_CALL(vnode, get_file_map, offset, size, vecs, _count);
4865 }
4866 
4867 
4868 status_t
4869 vfs_stat_vnode(struct vnode* vnode, struct stat* stat)
4870 {
4871 	status_t status = FS_CALL(vnode, read_stat, stat);
4872 
4873 	// fill in the st_dev and st_ino fields
4874 	if (status == B_OK) {
4875 		stat->st_dev = vnode->device;
4876 		stat->st_ino = vnode->id;
4877 		// the rdev field must stay unset for non-special files
4878 		if (!S_ISBLK(stat->st_mode) && !S_ISCHR(stat->st_mode))
4879 			stat->st_rdev = -1;
4880 	}
4881 
4882 	return status;
4883 }
4884 
4885 
4886 status_t
4887 vfs_stat_node_ref(dev_t device, ino_t inode, struct stat* stat)
4888 {
4889 	struct vnode* vnode;
4890 	status_t status = get_vnode(device, inode, &vnode, true, false);
4891 	if (status != B_OK)
4892 		return status;
4893 
4894 	status = vfs_stat_vnode(vnode, stat);
4895 
4896 	put_vnode(vnode);
4897 	return status;
4898 }
4899 
4900 
4901 status_t
4902 vfs_get_vnode_name(struct vnode* vnode, char* name, size_t nameSize)
4903 {
4904 	return get_vnode_name(vnode, NULL, name, nameSize, true);
4905 }
4906 
4907 
4908 status_t
4909 vfs_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
4910 	bool kernel, char* path, size_t pathLength)
4911 {
4912 	struct vnode* vnode;
4913 	status_t status;
4914 
4915 	// filter invalid leaf names
4916 	if (leaf != NULL && (leaf[0] == '\0' || strchr(leaf, '/')))
4917 		return B_BAD_VALUE;
4918 
4919 	// get the vnode matching the dir's node_ref
4920 	if (leaf && (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0)) {
4921 		// special cases "." and "..": we can directly get the vnode of the
4922 		// referenced directory
4923 		status = entry_ref_to_vnode(device, inode, leaf, false, kernel, &vnode);
4924 		leaf = NULL;
4925 	} else
4926 		status = get_vnode(device, inode, &vnode, true, false);
4927 	if (status != B_OK)
4928 		return status;
4929 
4930 	// get the directory path
4931 	status = dir_vnode_to_path(vnode, path, pathLength, kernel);
4932 	put_vnode(vnode);
4933 		// we don't need the vnode anymore
4934 	if (status != B_OK)
4935 		return status;
4936 
4937 	// append the leaf name
4938 	if (leaf) {
4939 		// insert a directory separator if this is not the file system root
4940 		if ((strcmp(path, "/") && strlcat(path, "/", pathLength)
4941 				>= pathLength)
4942 			|| strlcat(path, leaf, pathLength) >= pathLength) {
4943 			return B_NAME_TOO_LONG;
4944 		}
4945 	}
4946 
4947 	return B_OK;
4948 }
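

/*	Worked example (illustrative, not part of the original source): for an
	entry ref whose directory resolves to "/boot/home" and whose leaf is
	"readme", the function yields "/boot/home/readme". With leaf "." or ".."
	the referenced directory itself is resolved, so its own path is returned
	without a leaf appended.
*/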
4949 
4950 
4951 /*!	If the given descriptor locked its vnode, that lock will be released. */
4952 void
4953 vfs_unlock_vnode_if_locked(struct file_descriptor* descriptor)
4954 {
4955 	struct vnode* vnode = fd_vnode(descriptor);
4956 
4957 	if (vnode != NULL && vnode->mandatory_locked_by == descriptor)
4958 		vnode->mandatory_locked_by = NULL;
4959 }
4960 
4961 
4962 /*!	Releases any POSIX locks on the file descriptor. */
4963 status_t
4964 vfs_release_posix_lock(io_context* context, struct file_descriptor* descriptor)
4965 {
4966 	struct vnode* vnode = descriptor->u.vnode;
4967 	if (vnode == NULL)
4968 		return B_OK;
4969 
4970 	if (HAS_FS_CALL(vnode, release_lock))
4971 		return FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
4972 
4973 	return release_advisory_lock(vnode, context, NULL, NULL);
4974 }
4975 
4976 
4977 /*!	Closes all file descriptors of the specified I/O context that
4978 	have the O_CLOEXEC flag set.
4979 */
4980 void
4981 vfs_exec_io_context(io_context* context)
4982 {
4983 	uint32 i;
4984 
4985 	for (i = 0; i < context->table_size; i++) {
4986 		mutex_lock(&context->io_mutex);
4987 
4988 		struct file_descriptor* descriptor = context->fds[i];
4989 		bool remove = false;
4990 
4991 		if (descriptor != NULL && fd_close_on_exec(context, i)) {
4992 			context->fds[i] = NULL;
4993 			context->num_used_fds--;
4994 
4995 			remove = true;
4996 		}
4997 
4998 		mutex_unlock(&context->io_mutex);
4999 
5000 		if (remove) {
5001 			close_fd(context, descriptor);
5002 			put_fd(descriptor);
5003 		}
5004 	}
5005 }
5006 
5007 
5008 /*! Sets up a new io_context structure, and inherits the properties
5009 	of the parent io_context if one is given.
5010 */
5011 io_context*
5012 vfs_new_io_context(io_context* parentContext, bool purgeCloseOnExec)
5013 {
5014 	io_context* context = (io_context*)malloc(sizeof(io_context));
5015 	if (context == NULL)
5016 		return NULL;
5017 
5018 	TIOC(NewIOContext(context, parentContext));
5019 
5020 	memset(context, 0, sizeof(io_context));
5021 	context->ref_count = 1;
5022 
5023 	MutexLocker parentLocker;
5024 
5025 	size_t tableSize;
5026 	if (parentContext != NULL) {
5027 		parentLocker.SetTo(parentContext->io_mutex, false);
5028 		tableSize = parentContext->table_size;
5029 	} else
5030 		tableSize = DEFAULT_FD_TABLE_SIZE;
5031 
5032 	// allocate space for FDs and their close-on-exec flag
5033 	context->fds = (file_descriptor**)malloc(
5034 		sizeof(struct file_descriptor*) * tableSize
5035 		+ sizeof(struct select_sync*) * tableSize
5036 		+ (tableSize + 7) / 8);
5037 	if (context->fds == NULL) {
5038 		free(context);
5039 		return NULL;
5040 	}
5041 
5042 	context->select_infos = (select_info**)(context->fds + tableSize);
5043 	context->fds_close_on_exec = (uint8*)(context->select_infos + tableSize);
5044 
5045 	memset(context->fds, 0, sizeof(struct file_descriptor*) * tableSize
5046 		+ sizeof(struct select_sync*) * tableSize
5047 		+ (tableSize + 7) / 8);
5048 
5049 	mutex_init(&context->io_mutex, "I/O context");
5050 
5051 	// Copy all parent file descriptors
5052 
5053 	if (parentContext != NULL) {
5054 		size_t i;
5055 
5056 		mutex_lock(&sIOContextRootLock);
5057 		context->root = parentContext->root;
5058 		if (context->root)
5059 			inc_vnode_ref_count(context->root);
5060 		mutex_unlock(&sIOContextRootLock);
5061 
5062 		context->cwd = parentContext->cwd;
5063 		if (context->cwd)
5064 			inc_vnode_ref_count(context->cwd);
5065 
5066 		if (parentContext->inherit_fds) {
5067 			for (i = 0; i < tableSize; i++) {
5068 				struct file_descriptor* descriptor = parentContext->fds[i];
5069 
5070 				if (descriptor != NULL
5071 					&& (descriptor->open_mode & O_DISCONNECTED) == 0) {
5072 					bool closeOnExec = fd_close_on_exec(parentContext, i);
5073 					if (closeOnExec && purgeCloseOnExec)
5074 						continue;
5075 
5076 					TFD(InheritFD(context, i, descriptor, parentContext));
5077 
5078 					context->fds[i] = descriptor;
5079 					context->num_used_fds++;
5080 					atomic_add(&descriptor->ref_count, 1);
5081 					atomic_add(&descriptor->open_count, 1);
5082 
5083 					if (closeOnExec)
5084 						fd_set_close_on_exec(context, i, true);
5085 				}
5086 			}
5087 		}
5088 
5089 		parentLocker.Unlock();
5090 	} else {
5091 		context->root = sRoot;
5092 		context->cwd = sRoot;
5093 
5094 		if (context->root)
5095 			inc_vnode_ref_count(context->root);
5096 
5097 		if (context->cwd)
5098 			inc_vnode_ref_count(context->cwd);
5099 	}
5100 
5101 	context->table_size = tableSize;
5102 	context->inherit_fds = parentContext != NULL;
5103 
5104 	list_init(&context->node_monitors);
5105 	context->max_monitors = DEFAULT_NODE_MONITORS;
5106 
5107 	return context;
5108 }
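

/*	Layout note (illustrative, not part of the original source): the single
	allocation in vfs_new_io_context() packs three tables back to back. With
	tableSize == 128 that is 128 file_descriptor pointers, then 128
	select_info pointers, then (128 + 7) / 8 == 16 bytes of close-on-exec
	flag bits.
*/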
5109 
5110 
5111 void
5112 vfs_get_io_context(io_context* context)
5113 {
5114 	atomic_add(&context->ref_count, 1);
5115 }
5116 
5117 
5118 void
5119 vfs_put_io_context(io_context* context)
5120 {
5121 	if (atomic_add(&context->ref_count, -1) == 1)
5122 		free_io_context(context);
5123 }
5124 
5125 
5126 status_t
5127 vfs_resize_fd_table(struct io_context* context, uint32 newSize)
5128 {
5129 	if (newSize == 0 || newSize > MAX_FD_TABLE_SIZE)
5130 		return B_BAD_VALUE;
5131 
5132 	TIOC(ResizeIOContext(context, newSize));
5133 
5134 	MutexLocker _(context->io_mutex);
5135 
5136 	uint32 oldSize = context->table_size;
5137 	int oldCloseOnExitBitmapSize = (oldSize + 7) / 8;
5138 	int newCloseOnExitBitmapSize = (newSize + 7) / 8;
5139 
5140 	// If the tables shrink, make sure none of the fds being dropped are in use.
5141 	if (newSize < oldSize) {
5142 		for (uint32 i = oldSize; i-- > newSize;) {
5143 			if (context->fds[i])
5144 				return B_BUSY;
5145 		}
5146 	}
5147 
5148 	// store pointers to the old tables
5149 	file_descriptor** oldFDs = context->fds;
5150 	select_info** oldSelectInfos = context->select_infos;
5151 	uint8* oldCloseOnExecTable = context->fds_close_on_exec;
5152 
5153 	// allocate new tables
5154 	file_descriptor** newFDs = (file_descriptor**)malloc(
5155 		sizeof(struct file_descriptor*) * newSize
5156 		+ sizeof(struct select_sync*) * newSize
5157 		+ newCloseOnExitBitmapSize);
5158 	if (newFDs == NULL)
5159 		return B_NO_MEMORY;
5160 
5161 	context->fds = newFDs;
5162 	context->select_infos = (select_info**)(context->fds + newSize);
5163 	context->fds_close_on_exec = (uint8*)(context->select_infos + newSize);
5164 	context->table_size = newSize;
5165 
5166 	// copy entries from old tables
5167 	uint32 toCopy = min_c(oldSize, newSize);
5168 
5169 	memcpy(context->fds, oldFDs, sizeof(void*) * toCopy);
5170 	memcpy(context->select_infos, oldSelectInfos, sizeof(void*) * toCopy);
5171 	memcpy(context->fds_close_on_exec, oldCloseOnExecTable,
5172 		min_c(oldCloseOnExitBitmapSize, newCloseOnExitBitmapSize));
5173 
5174 	// clear additional entries, if the tables grow
5175 	if (newSize > oldSize) {
5176 		memset(context->fds + oldSize, 0, sizeof(void*) * (newSize - oldSize));
5177 		memset(context->select_infos + oldSize, 0,
5178 			sizeof(void*) * (newSize - oldSize));
5179 		memset(context->fds_close_on_exec + oldCloseOnExitBitmapSize, 0,
5180 			newCloseOnExitBitmapSize - oldCloseOnExitBitmapSize);
5181 	}
5182 
5183 	free(oldFDs);
5184 
5185 	return B_OK;
5186 }
5187 
5188 
5189 /*!	\brief Resolves a vnode to the vnode it is covered by, if any.
5190 
5191 	Given an arbitrary vnode (identified by mount and node ID), the function
5192 	checks, whether the vnode is covered by another vnode. If it is, the
5193 	function returns the mount and node ID of the covering vnode. Otherwise
5194 	it simply returns the supplied mount and node ID.
5195 
5196 	In case of error (e.g. the supplied node could not be found) the variables
5197 	for storing the resolved mount and node ID remain untouched and an error
5198 	code is returned.
5199 
5200 	\param mountID The mount ID of the vnode in question.
5201 	\param nodeID The node ID of the vnode in question.
5202 	\param resolvedMountID Pointer to storage for the resolved mount ID.
5203 	\param resolvedNodeID Pointer to storage for the resolved node ID.
5204 	\return
5205 	- \c B_OK, if everything went fine,
5206 	- another error code, if something went wrong.
5207 */
5208 status_t
5209 vfs_resolve_vnode_to_covering_vnode(dev_t mountID, ino_t nodeID,
5210 	dev_t* resolvedMountID, ino_t* resolvedNodeID)
5211 {
5212 	// get the node
5213 	struct vnode* node;
5214 	status_t error = get_vnode(mountID, nodeID, &node, true, false);
5215 	if (error != B_OK)
5216 		return error;
5217 
5218 	// resolve the node
5219 	if (Vnode* coveringNode = get_covering_vnode(node)) {
5220 		put_vnode(node);
5221 		node = coveringNode;
5222 	}
5223 
5224 	// set the return values
5225 	*resolvedMountID = node->device;
5226 	*resolvedNodeID = node->id;
5227 
5228 	put_vnode(node);
5229 
5230 	return B_OK;
5231 }
5232 
5233 
5234 status_t
5235 vfs_get_mount_point(dev_t mountID, dev_t* _mountPointMountID,
5236 	ino_t* _mountPointNodeID)
5237 {
5238 	ReadLocker nodeLocker(sVnodeLock);
5239 	ReadLocker mountLocker(sMountLock);
5240 
5241 	struct fs_mount* mount = find_mount(mountID);
5242 	if (mount == NULL)
5243 		return B_BAD_VALUE;
5244 
5245 	Vnode* mountPoint = mount->covers_vnode;
5246 
5247 	*_mountPointMountID = mountPoint->device;
5248 	*_mountPointNodeID = mountPoint->id;
5249 
5250 	return B_OK;
5251 }
5252 
5253 
5254 status_t
5255 vfs_bind_mount_directory(dev_t mountID, ino_t nodeID, dev_t coveredMountID,
5256 	ino_t coveredNodeID)
5257 {
5258 	// get the vnodes
5259 	Vnode* vnode;
5260 	status_t error = get_vnode(mountID, nodeID, &vnode, true, false);
5261 	if (error != B_OK)
5262 		return B_BAD_VALUE;
5263 	VNodePutter vnodePutter(vnode);
5264 
5265 	Vnode* coveredVnode;
5266 	error = get_vnode(coveredMountID, coveredNodeID, &coveredVnode, true,
5267 		false);
5268 	if (error != B_OK)
5269 		return B_BAD_VALUE;
5270 	VNodePutter coveredVnodePutter(coveredVnode);
5271 
5272 	// establish the covered/covering links
5273 	WriteLocker locker(sVnodeLock);
5274 
5275 	if (vnode->covers != NULL || coveredVnode->covered_by != NULL
5276 		|| vnode->mount->unmounting || coveredVnode->mount->unmounting) {
5277 		return B_BUSY;
5278 	}
5279 
5280 	vnode->covers = coveredVnode;
5281 	vnode->SetCovering(true);
5282 
5283 	coveredVnode->covered_by = vnode;
5284 	coveredVnode->SetCovered(true);
5285 
5286 	// the vnodes do now reference each other
5287 	inc_vnode_ref_count(vnode);
5288 	inc_vnode_ref_count(coveredVnode);
5289 
5290 	return B_OK;
5291 }
5292 
5293 
5294 int
5295 vfs_getrlimit(int resource, struct rlimit* rlp)
5296 {
5297 	if (!rlp)
5298 		return B_BAD_ADDRESS;
5299 
5300 	switch (resource) {
5301 		case RLIMIT_NOFILE:
5302 		{
5303 			struct io_context* context = get_current_io_context(false);
5304 			MutexLocker _(context->io_mutex);
5305 
5306 			rlp->rlim_cur = context->table_size;
5307 			rlp->rlim_max = MAX_FD_TABLE_SIZE;
5308 			return 0;
5309 		}
5310 
5311 		case RLIMIT_NOVMON:
5312 		{
5313 			struct io_context* context = get_current_io_context(false);
5314 			MutexLocker _(context->io_mutex);
5315 
5316 			rlp->rlim_cur = context->max_monitors;
5317 			rlp->rlim_max = MAX_NODE_MONITORS;
5318 			return 0;
5319 		}
5320 
5321 		default:
5322 			return B_BAD_VALUE;
5323 	}
5324 }
5325 
5326 
5327 int
5328 vfs_setrlimit(int resource, const struct rlimit* rlp)
5329 {
5330 	if (!rlp)
5331 		return B_BAD_ADDRESS;
5332 
5333 	switch (resource) {
5334 		case RLIMIT_NOFILE:
5335 			/* TODO: check getuid() */
5336 			if (rlp->rlim_max != RLIM_SAVED_MAX
5337 				&& rlp->rlim_max != MAX_FD_TABLE_SIZE)
5338 				return B_NOT_ALLOWED;
5339 
5340 			return vfs_resize_fd_table(get_current_io_context(false),
5341 				rlp->rlim_cur);
5342 
5343 		case RLIMIT_NOVMON:
5344 			/* TODO: check getuid() */
5345 			if (rlp->rlim_max != RLIM_SAVED_MAX
5346 				&& rlp->rlim_max != MAX_NODE_MONITORS)
5347 				return B_NOT_ALLOWED;
5348 
5349 			return resize_monitor_table(get_current_io_context(false),
5350 				rlp->rlim_cur);
5351 
5352 		default:
5353 			return B_BAD_VALUE;
5354 	}
5355 }
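

/*	Example (illustrative, not part of the original source): growing the FD
	table through the POSIX interface backed by vfs_setrlimit(). Note that
	rlim_max must equal RLIM_SAVED_MAX or MAX_FD_TABLE_SIZE to be accepted.
*/
#if 0
	struct rlimit rl = { 2048, RLIM_SAVED_MAX };
	if (setrlimit(RLIMIT_NOFILE, &rl) == 0) {
		// this team may now use up to 2048 file descriptors
	}
#endif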
5356 
5357 
5358 status_t
5359 vfs_init(kernel_args* args)
5360 {
5361 	vnode::StaticInit();
5362 
5363 	sVnodeTable = new(std::nothrow) VnodeTable();
5364 	if (sVnodeTable == NULL || sVnodeTable->Init(VNODE_HASH_TABLE_SIZE) != B_OK)
5365 		panic("vfs_init: error creating vnode hash table\n");
5366 
5367 	struct vnode dummy_vnode;
5368 	list_init_etc(&sUnusedVnodeList, offset_of_member(dummy_vnode, unused_link));
5369 
5370 	struct fs_mount dummyMount;
5371 	sMountsTable = new(std::nothrow) MountTable();
5372 	if (sMountsTable == NULL
5373 			|| sMountsTable->Init(MOUNTS_HASH_TABLE_SIZE) != B_OK)
5374 		panic("vfs_init: error creating mounts hash table\n");
5375 
5376 	sPathNameCache = create_object_cache("vfs path names",
5377 		B_PATH_NAME_LENGTH + 1, 8, NULL, NULL, NULL);
5378 	if (sPathNameCache == NULL)
5379 		panic("vfs_init: error creating path name object_cache\n");
5380 
5381 	sFileDescriptorCache = create_object_cache("vfs fds",
5382 		sizeof(file_descriptor), 8, NULL, NULL, NULL);
5383 	if (sFileDescriptorCache == NULL)
5384 		panic("vfs_init: error creating file descriptor object_cache\n");
5385 
5386 	node_monitor_init();
5387 
5388 	sRoot = NULL;
5389 
5390 	recursive_lock_init(&sMountOpLock, "vfs_mount_op_lock");
5391 
5392 	if (block_cache_init() != B_OK)
5393 		return B_ERROR;
5394 
5395 #ifdef ADD_DEBUGGER_COMMANDS
5396 	// add some debugger commands
5397 	add_debugger_command_etc("vnode", &dump_vnode,
5398 		"Print info about the specified vnode",
5399 		"[ \"-p\" ] ( <vnode> | <devID> <nodeID> )\n"
5400 		"Prints information about the vnode specified by address <vnode> or\n"
5401 		"<devID>, <vnodeID> pair. If \"-p\" is given, a path of the vnode is\n"
5402 		"constructed and printed. It might not be possible to construct a\n"
5403 		"complete path, though.\n",
5404 		0);
5405 	add_debugger_command("vnodes", &dump_vnodes,
5406 		"list all vnodes (from the specified device)");
5407 	add_debugger_command("vnode_caches", &dump_vnode_caches,
5408 		"list all vnode caches");
5409 	add_debugger_command("mount", &dump_mount,
5410 		"info about the specified fs_mount");
5411 	add_debugger_command("mounts", &dump_mounts, "list all fs_mounts");
5412 	add_debugger_command("io_context", &dump_io_context,
5413 		"info about the I/O context");
5414 	add_debugger_command("vnode_usage", &dump_vnode_usage,
5415 		"info about vnode usage");
5416 #endif
5417 
5418 	register_low_resource_handler(&vnode_low_resource_handler, NULL,
5419 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
5420 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
5421 		0);
5422 
5423 	fifo_init();
5424 	file_map_init();
5425 
5426 	return file_cache_init();
5427 }
5428 
5429 
5430 //	#pragma mark - fd_ops implementations
5431 
5432 
5433 /*!
5434 	Calls fs_open() on the given vnode and returns a new
5435 	file descriptor for it
5436 */
5437 static int
5438 open_vnode(struct vnode* vnode, int openMode, bool kernel)
5439 {
5440 	void* cookie;
5441 	status_t status = FS_CALL(vnode, open, openMode, &cookie);
5442 	if (status != B_OK)
5443 		return status;
5444 
5445 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5446 	if (fd < 0) {
5447 		FS_CALL(vnode, close, cookie);
5448 		FS_CALL(vnode, free_cookie, cookie);
5449 	}
5450 	return fd;
5451 }
5452 
5453 
5454 /*!
5455 	Creates a new entry in the given directory, or opens an already existing
5456 	one, and returns a new file descriptor for it
5457 */
5458 static int
5459 create_vnode(struct vnode* directory, const char* name, int openMode,
5460 	int perms, bool kernel)
5461 {
5462 	bool traverse = ((openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0);
5463 	status_t status = B_ERROR;
5464 	struct vnode* vnode;
5465 	void* cookie;
5466 	ino_t newID;
5467 
5468 	// This is somewhat tricky: If the entry already exists, the FS responsible
5469 	// for the directory might not necessarily also be the one responsible for
5470 	// the node the entry refers to (e.g. in case of mount points or FIFOs). So
5471 	// we can actually never call the create() hook without O_EXCL. Instead we
5472 	// try to look the entry up first. If it already exists, we just open the
5473 	// node (unless O_EXCL), otherwise we call create() with O_EXCL. This
5474 	// introduces a race condition, since someone else might have created the
5475 	// entry in the meantime. We hope the respective FS returns the correct
5476 	// error code, in which case we retry (up to 3 times).
5477 
5478 	for (int i = 0; i < 3 && status != B_OK; i++) {
5479 		// look the node up
5480 		status = lookup_dir_entry(directory, name, &vnode);
5481 		if (status == B_OK) {
5482 			VNodePutter putter(vnode);
5483 
5484 			if ((openMode & O_EXCL) != 0)
5485 				return B_FILE_EXISTS;
5486 
5487 			// If the node is a symlink, we have to follow it, unless
5488 			// O_NOTRAVERSE is set.
5489 			if (S_ISLNK(vnode->Type()) && traverse) {
5490 				putter.Put();
5491 				char clonedName[B_FILE_NAME_LENGTH + 1];
5492 				if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH)
5493 						>= B_FILE_NAME_LENGTH) {
5494 					return B_NAME_TOO_LONG;
5495 				}
5496 
5497 				inc_vnode_ref_count(directory);
5498 				status = vnode_path_to_vnode(directory, clonedName, true, 0,
5499 					kernel, &vnode, NULL);
5500 				if (status != B_OK)
5501 					return status;
5502 
5503 				putter.SetTo(vnode);
5504 			}
5505 
5506 			if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5507 				return B_LINK_LIMIT;
5508 
5509 			int fd = open_vnode(vnode, openMode & ~O_CREAT, kernel);
5510 			// on success keep the vnode reference for the FD
5511 			if (fd >= 0)
5512 				putter.Detach();
5513 
5514 			return fd;
5515 		}
5516 
5517 		// it doesn't exist yet -- try to create it
5518 
5519 		if (!HAS_FS_CALL(directory, create))
5520 			return B_READ_ONLY_DEVICE;
5521 
5522 		status = FS_CALL(directory, create, name, openMode | O_EXCL, perms,
5523 			&cookie, &newID);
5524 		if (status != B_OK
5525 			&& ((openMode & O_EXCL) != 0 || status != B_FILE_EXISTS)) {
5526 			return status;
5527 		}
5528 	}
5529 
5530 	if (status != B_OK)
5531 		return status;
5532 
5533 	// the node has been created successfully
5534 
5535 	rw_lock_read_lock(&sVnodeLock);
5536 	vnode = lookup_vnode(directory->device, newID);
5537 	rw_lock_read_unlock(&sVnodeLock);
5538 
5539 	if (vnode == NULL) {
5540 		panic("vfs: fs_create() returned success but there is no vnode, "
5541 			"mount ID %" B_PRIdDEV "!\n", directory->device);
5542 		return B_BAD_VALUE;
5543 	}
5544 
5545 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5546 	if (fd >= 0)
5547 		return fd;
5548 
5549 	status = fd;
5550 
5551 	// something went wrong, clean up
5552 
5553 	FS_CALL(vnode, close, cookie);
5554 	FS_CALL(vnode, free_cookie, cookie);
5555 	put_vnode(vnode);
5556 
5557 	FS_CALL(directory, unlink, name);
5558 
5559 	return status;
5560 }
5561 
5562 
5563 /*! Calls fs open_dir() on the given vnode and returns a new
5564 	file descriptor for it
5565 */
5566 static int
5567 open_dir_vnode(struct vnode* vnode, bool kernel)
5568 {
5569 	void* cookie;
5570 	status_t status = FS_CALL(vnode, open_dir, &cookie);
5571 	if (status != B_OK)
5572 		return status;
5573 
5574 	// directory is opened, create a fd
5575 	status = get_new_fd(FDTYPE_DIR, NULL, vnode, cookie, O_CLOEXEC, kernel);
5576 	if (status >= 0)
5577 		return status;
5578 
5579 	FS_CALL(vnode, close_dir, cookie);
5580 	FS_CALL(vnode, free_dir_cookie, cookie);
5581 
5582 	return status;
5583 }
5584 
5585 
5586 /*! Calls fs open_attr_dir() on the given vnode and returns a new
5587 	file descriptor for it.
5588 	Used by attr_dir_open(), and attr_dir_open_fd().
5589 */
5590 static int
5591 open_attr_dir_vnode(struct vnode* vnode, bool kernel)
5592 {
5593 	if (!HAS_FS_CALL(vnode, open_attr_dir))
5594 		return B_UNSUPPORTED;
5595 
5596 	void* cookie;
5597 	status_t status = FS_CALL(vnode, open_attr_dir, &cookie);
5598 	if (status != B_OK)
5599 		return status;
5600 
5601 	// directory is opened, create a fd
5602 	status = get_new_fd(FDTYPE_ATTR_DIR, NULL, vnode, cookie, O_CLOEXEC,
5603 		kernel);
5604 	if (status >= 0)
5605 		return status;
5606 
5607 	FS_CALL(vnode, close_attr_dir, cookie);
5608 	FS_CALL(vnode, free_attr_dir_cookie, cookie);
5609 
5610 	return status;
5611 }
5612 
5613 
5614 static int
5615 file_create_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5616 	int openMode, int perms, bool kernel)
5617 {
5618 	FUNCTION(("file_create_entry_ref: name = '%s', omode %x, perms %d, "
5619 		"kernel %d\n", name, openMode, perms, kernel));
5620 
5621 	// get directory to put the new file in
5622 	struct vnode* directory;
5623 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
5624 	if (status != B_OK)
5625 		return status;
5626 
5627 	status = create_vnode(directory, name, openMode, perms, kernel);
5628 	put_vnode(directory);
5629 
5630 	return status;
5631 }
5632 
5633 
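/*!	Resolves the given fd + path combination to the parent directory and
	leaf name, then creates and opens the file via create_vnode().
*/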
5634 static int
5635 file_create(int fd, char* path, int openMode, int perms, bool kernel)
5636 {
5637 	FUNCTION(("file_create: path '%s', omode %x, perms %d, kernel %d\n", path,
5638 		openMode, perms, kernel));
5639 
5640 	// get directory to put the new file in
5641 	char name[B_FILE_NAME_LENGTH];
5642 	struct vnode* directory;
5643 	status_t status = fd_and_path_to_dir_vnode(fd, path, &directory, name,
5644 		kernel);
5645 	if (status < 0)
5646 		return status;
5647 
5648 	status = create_vnode(directory, name, openMode, perms, kernel);
5649 
5650 	put_vnode(directory);
5651 	return status;
5652 }
5653 
5654 
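/*!	Opens the file referred to by the given entry_ref. Symlinks are
	traversed unless O_NOTRAVERSE or O_NOFOLLOW is set in openMode.
*/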
5655 static int
5656 file_open_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5657 	int openMode, bool kernel)
5658 {
5659 	if (name == NULL || *name == '\0')
5660 		return B_BAD_VALUE;
5661 
5662 	FUNCTION(("file_open_entry_ref(ref = (%" B_PRId32 ", %" B_PRId64 ", %s), "
5663 		"openMode = %d)\n", mountID, directoryID, name, openMode));
5664 
5665 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5666 
5667 	// get the vnode matching the entry_ref
5668 	struct vnode* vnode;
5669 	status_t status = entry_ref_to_vnode(mountID, directoryID, name, traverse,
5670 		kernel, &vnode);
5671 	if (status != B_OK)
5672 		return status;
5673 
5674 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5675 		put_vnode(vnode);
5676 		return B_LINK_LIMIT;
5677 	}
5678 
5679 	int newFD = open_vnode(vnode, openMode, kernel);
5680 	if (newFD >= 0) {
5681 		// The vnode reference has been transferred to the FD
5682 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache, mountID,
5683 			directoryID, vnode->id, name);
5684 	} else
5685 		put_vnode(vnode);
5686 
5687 	return newFD;
5688 }
5689 
5690 
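/*!	Opens the file named by the given fd + path combination; like
	file_open_entry_ref(), it only traverses symlinks when neither
	O_NOTRAVERSE nor O_NOFOLLOW is set.
*/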
5691 static int
5692 file_open(int fd, char* path, int openMode, bool kernel)
5693 {
5694 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5695 
5696 	FUNCTION(("file_open: fd: %d, entry path = '%s', omode %d, kernel %d\n",
5697 		fd, path, openMode, kernel));
5698 
5699 	// get the vnode matching the vnode + path combination
5700 	struct vnode* vnode;
5701 	ino_t parentID;
5702 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode,
5703 		&parentID, kernel);
5704 	if (status != B_OK)
5705 		return status;
5706 
5707 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5708 		put_vnode(vnode);
5709 		return B_LINK_LIMIT;
5710 	}
5711 
5712 	// open the vnode
5713 	int newFD = open_vnode(vnode, openMode, kernel);
5714 	if (newFD >= 0) {
5715 		// The vnode reference has been transferred to the FD
5716 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache,
5717 			vnode->device, parentID, vnode->id, NULL);
5718 	} else
5719 		put_vnode(vnode);
5720 
5721 	return newFD;
5722 }
5723 
5724 
5725 static status_t
5726 file_close(struct file_descriptor* descriptor)
5727 {
5728 	struct vnode* vnode = descriptor->u.vnode;
5729 	status_t status = B_OK;
5730 
5731 	FUNCTION(("file_close(descriptor = %p)\n", descriptor));
5732 
5733 	cache_node_closed(vnode, FDTYPE_FILE, vnode->cache, vnode->device,
5734 		vnode->id);
5735 	if (HAS_FS_CALL(vnode, close)) {
5736 		status = FS_CALL(vnode, close, descriptor->cookie);
5737 	}
5738 
5739 	if (status == B_OK) {
5740 		// remove all outstanding locks for this team
5741 		if (HAS_FS_CALL(vnode, release_lock))
5742 			status = FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
5743 		else
5744 			status = release_advisory_lock(vnode, NULL, descriptor, NULL);
5745 	}
5746 	return status;
5747 }
5748 
5749 
5750 static void
5751 file_free_fd(struct file_descriptor* descriptor)
5752 {
5753 	struct vnode* vnode = descriptor->u.vnode;
5754 
5755 	if (vnode != NULL) {
5756 		FS_CALL(vnode, free_cookie, descriptor->cookie);
5757 		put_vnode(vnode);
5758 	}
5759 }
5760 
5761 
5762 static status_t
5763 file_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
5764 	size_t* length)
5765 {
5766 	struct vnode* vnode = descriptor->u.vnode;
5767 	FUNCTION(("file_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
5768 		pos, length, *length));
5769 
5770 	if (S_ISDIR(vnode->Type()))
5771 		return B_IS_A_DIRECTORY;
5772 
5773 	return FS_CALL(vnode, read, descriptor->cookie, pos, buffer, length);
5774 }
5775 
5776 
5777 static status_t
5778 file_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
5779 	size_t* length)
5780 {
5781 	struct vnode* vnode = descriptor->u.vnode;
5782 	FUNCTION(("file_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
5783 		length));
5784 
5785 	if (S_ISDIR(vnode->Type()))
5786 		return B_IS_A_DIRECTORY;
5787 	if (!HAS_FS_CALL(vnode, write))
5788 		return B_READ_ONLY_DEVICE;
5789 
5790 	return FS_CALL(vnode, write, descriptor->cookie, pos, buffer, length);
5791 }
5792 
5793 
5794 static off_t
5795 file_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
5796 {
5797 	struct vnode* vnode = descriptor->u.vnode;
5798 	off_t offset;
5799 	bool isDevice = false;
5800 
5801 	FUNCTION(("file_seek(pos = %" B_PRIdOFF ", seekType = %d)\n", pos,
5802 		seekType));
5803 
5804 	// some kinds of files are not seekable
5805 	switch (vnode->Type() & S_IFMT) {
5806 		case S_IFIFO:
5807 		case S_IFSOCK:
5808 			return ESPIPE;
5809 
		// drivers publish block devices as character specials, so check both
5811 		case S_IFBLK:
5812 		case S_IFCHR:
5813 			isDevice = true;
5814 			break;
		// The Open Group Base Specs don't specially mention any file types
		// besides pipes, FIFOs, and sockets, so we allow seeking all others.
5817 		case S_IFREG:
5818 		case S_IFDIR:
5819 		case S_IFLNK:
5820 			break;
5821 	}
5822 
5823 	switch (seekType) {
5824 		case SEEK_SET:
5825 			offset = 0;
5826 			break;
5827 		case SEEK_CUR:
5828 			offset = descriptor->pos;
5829 			break;
5830 		case SEEK_END:
5831 		{
5832 			// stat() the node
5833 			if (!HAS_FS_CALL(vnode, read_stat))
5834 				return B_UNSUPPORTED;
5835 
5836 			struct stat stat;
5837 			status_t status = FS_CALL(vnode, read_stat, &stat);
5838 			if (status != B_OK)
5839 				return status;
5840 
5841 			offset = stat.st_size;
5842 
5843 			if (offset == 0 && isDevice) {
5844 				// stat() on regular drivers doesn't report size
5845 				device_geometry geometry;
5846 
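				// fall back to the device geometry; the raw capacity is the
				// product of sector size, sectors per track, cylinder count,
				// and head count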
5847 				if (HAS_FS_CALL(vnode, ioctl)) {
5848 					status = FS_CALL(vnode, ioctl, descriptor->cookie,
5849 						B_GET_GEOMETRY, &geometry, sizeof(geometry));
5850 					if (status == B_OK)
5851 						offset = (off_t)geometry.bytes_per_sector
5852 							* geometry.sectors_per_track
5853 							* geometry.cylinder_count
5854 							* geometry.head_count;
5855 				}
5856 			}
5857 
5858 			break;
5859 		}
5860 		default:
5861 			return B_BAD_VALUE;
5862 	}
5863 
5864 	// assumes off_t is 64 bits wide
5865 	if (offset > 0 && LONGLONG_MAX - offset < pos)
5866 		return B_BUFFER_OVERFLOW;
5867 
5868 	pos += offset;
5869 	if (pos < 0)
5870 		return B_BAD_VALUE;
5871 
5872 	return descriptor->pos = pos;
5873 }
5874 
5875 
5876 static status_t
5877 file_select(struct file_descriptor* descriptor, uint8 event,
5878 	struct selectsync* sync)
5879 {
5880 	FUNCTION(("file_select(%p, %u, %p)\n", descriptor, event, sync));
5881 
5882 	struct vnode* vnode = descriptor->u.vnode;
5883 
5884 	// If the FS has no select() hook, notify select() now.
5885 	if (!HAS_FS_CALL(vnode, select)) {
5886 		if (!SELECT_TYPE_IS_OUTPUT_ONLY(event))
5887 			return notify_select_event(sync, event);
5888 		else
5889 			return B_OK;
5890 	}
5891 
5892 	return FS_CALL(vnode, select, descriptor->cookie, event, sync);
5893 }
5894 
5895 
5896 static status_t
5897 file_deselect(struct file_descriptor* descriptor, uint8 event,
5898 	struct selectsync* sync)
5899 {
5900 	struct vnode* vnode = descriptor->u.vnode;
5901 
5902 	if (!HAS_FS_CALL(vnode, deselect))
5903 		return B_OK;
5904 
5905 	return FS_CALL(vnode, deselect, descriptor->cookie, event, sync);
5906 }
5907 
5908 
5909 static status_t
5910 dir_create_entry_ref(dev_t mountID, ino_t parentID, const char* name, int perms,
5911 	bool kernel)
5912 {
5913 	struct vnode* vnode;
5914 	status_t status;
5915 
5916 	if (name == NULL || *name == '\0')
5917 		return B_BAD_VALUE;
5918 
5919 	FUNCTION(("dir_create_entry_ref(dev = %" B_PRId32 ", ino = %" B_PRId64 ", "
5920 		"name = '%s', perms = %d)\n", mountID, parentID, name, perms));
5921 
5922 	status = get_vnode(mountID, parentID, &vnode, true, false);
5923 	if (status != B_OK)
5924 		return status;
5925 
5926 	if (HAS_FS_CALL(vnode, create_dir))
5927 		status = FS_CALL(vnode, create_dir, name, perms);
5928 	else
5929 		status = B_READ_ONLY_DEVICE;
5930 
5931 	put_vnode(vnode);
5932 	return status;
5933 }
5934 
5935 
5936 static status_t
5937 dir_create(int fd, char* path, int perms, bool kernel)
5938 {
5939 	char filename[B_FILE_NAME_LENGTH];
5940 	struct vnode* vnode;
5941 	status_t status;
5942 
5943 	FUNCTION(("dir_create: path '%s', perms %d, kernel %d\n", path, perms,
5944 		kernel));
5945 
5946 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
5947 	if (status < 0)
5948 		return status;
5949 
5950 	if (HAS_FS_CALL(vnode, create_dir)) {
5951 		status = FS_CALL(vnode, create_dir, filename, perms);
5952 	} else
5953 		status = B_READ_ONLY_DEVICE;
5954 
5955 	put_vnode(vnode);
5956 	return status;
5957 }
5958 
5959 
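/*!	Opens the directory specified by the given entry_ref, or, if name is
	NULL, the directory specified by the (mountID, parentID) node_ref.
*/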
5960 static int
5961 dir_open_entry_ref(dev_t mountID, ino_t parentID, const char* name, bool kernel)
5962 {
5963 	FUNCTION(("dir_open_entry_ref()\n"));
5964 
5965 	if (name && name[0] == '\0')
5966 		return B_BAD_VALUE;
5967 
5968 	// get the vnode matching the entry_ref/node_ref
5969 	struct vnode* vnode;
5970 	status_t status;
5971 	if (name) {
5972 		status = entry_ref_to_vnode(mountID, parentID, name, true, kernel,
5973 			&vnode);
5974 	} else
5975 		status = get_vnode(mountID, parentID, &vnode, true, false);
5976 	if (status != B_OK)
5977 		return status;
5978 
5979 	int newFD = open_dir_vnode(vnode, kernel);
5980 	if (newFD >= 0) {
5981 		// The vnode reference has been transferred to the FD
5982 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, mountID, parentID,
5983 			vnode->id, name);
5984 	} else
5985 		put_vnode(vnode);
5986 
5987 	return newFD;
5988 }
5989 
5990 
5991 static int
5992 dir_open(int fd, char* path, bool kernel)
5993 {
5994 	FUNCTION(("dir_open: fd: %d, entry path = '%s', kernel %d\n", fd, path,
5995 		kernel));
5996 
5997 	// get the vnode matching the vnode + path combination
5998 	struct vnode* vnode = NULL;
5999 	ino_t parentID;
6000 	status_t status = fd_and_path_to_vnode(fd, path, true, &vnode, &parentID,
6001 		kernel);
6002 	if (status != B_OK)
6003 		return status;
6004 
6005 	// open the dir
6006 	int newFD = open_dir_vnode(vnode, kernel);
6007 	if (newFD >= 0) {
6008 		// The vnode reference has been transferred to the FD
6009 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
6010 			parentID, vnode->id, NULL);
6011 	} else
6012 		put_vnode(vnode);
6013 
6014 	return newFD;
6015 }
6016 
6017 
6018 static status_t
6019 dir_close(struct file_descriptor* descriptor)
6020 {
6021 	struct vnode* vnode = descriptor->u.vnode;
6022 
6023 	FUNCTION(("dir_close(descriptor = %p)\n", descriptor));
6024 
6025 	cache_node_closed(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
6026 		vnode->id);
6027 	if (HAS_FS_CALL(vnode, close_dir))
6028 		return FS_CALL(vnode, close_dir, descriptor->cookie);
6029 
6030 	return B_OK;
6031 }
6032 
6033 
6034 static void
6035 dir_free_fd(struct file_descriptor* descriptor)
6036 {
6037 	struct vnode* vnode = descriptor->u.vnode;
6038 
6039 	if (vnode != NULL) {
6040 		FS_CALL(vnode, free_dir_cookie, descriptor->cookie);
6041 		put_vnode(vnode);
6042 	}
6043 }
6044 
6045 
6046 static status_t
6047 dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6048 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6049 {
6050 	return dir_read(ioContext, descriptor->u.vnode, descriptor->cookie, buffer,
6051 		bufferSize, _count);
6052 }
6053 
6054 
6055 static status_t
6056 fix_dirent(struct vnode* parent, struct dirent* entry,
6057 	struct io_context* ioContext)
6058 {
6059 	// set d_pdev and d_pino
6060 	entry->d_pdev = parent->device;
6061 	entry->d_pino = parent->id;
6062 
	// If this is the ".." entry and the directory is covering another
	// vnode, we need to replace d_dev and d_ino with the actual values.
6065 	if (strcmp(entry->d_name, "..") == 0 && parent->IsCovering()) {
6066 		return resolve_covered_parent(parent, &entry->d_dev, &entry->d_ino,
6067 			ioContext);
6068 	}
6069 
6070 	// resolve covered vnodes
6071 	ReadLocker _(&sVnodeLock);
6072 
6073 	struct vnode* vnode = lookup_vnode(entry->d_dev, entry->d_ino);
6074 	if (vnode != NULL && vnode->covered_by != NULL) {
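		// walk up the chain to the topmost vnode covering this one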
6075 		do {
6076 			vnode = vnode->covered_by;
6077 		} while (vnode->covered_by != NULL);
6078 
6079 		entry->d_dev = vnode->device;
6080 		entry->d_ino = vnode->id;
6081 	}
6082 
6083 	return B_OK;
6084 }
6085 
6086 
6087 static status_t
6088 dir_read(struct io_context* ioContext, struct vnode* vnode, void* cookie,
6089 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6090 {
6091 	if (!HAS_FS_CALL(vnode, read_dir))
6092 		return B_UNSUPPORTED;
6093 
6094 	status_t error = FS_CALL(vnode, read_dir, cookie, buffer, bufferSize,
6095 		_count);
6096 	if (error != B_OK)
6097 		return error;
6098 
6099 	// we need to adjust the read dirents
6100 	uint32 count = *_count;
6101 	for (uint32 i = 0; i < count; i++) {
6102 		error = fix_dirent(vnode, buffer, ioContext);
6103 		if (error != B_OK)
6104 			return error;
6105 
6106 		buffer = (struct dirent*)((uint8*)buffer + buffer->d_reclen);
6107 	}
6108 
6109 	return error;
6110 }
6111 
6112 
6113 static status_t
6114 dir_rewind(struct file_descriptor* descriptor)
6115 {
6116 	struct vnode* vnode = descriptor->u.vnode;
6117 
6118 	if (HAS_FS_CALL(vnode, rewind_dir)) {
6119 		return FS_CALL(vnode, rewind_dir, descriptor->cookie);
6120 	}
6121 
6122 	return B_UNSUPPORTED;
6123 }
6124 
6125 
6126 static status_t
6127 dir_remove(int fd, char* path, bool kernel)
6128 {
6129 	char name[B_FILE_NAME_LENGTH];
6130 	struct vnode* directory;
6131 	status_t status;
6132 
6133 	if (path != NULL) {
		// we need to make sure our path name doesn't end with "/", ".",
		// or ".."
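		// e.g. "a/b/", "a/b/.", and "a/b/./" all reduce to "a/b", while a
		// path ending in ".." is rejected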
6136 		char* lastSlash;
6137 		while ((lastSlash = strrchr(path, '/')) != NULL) {
6138 			char* leaf = lastSlash + 1;
6139 			if (!strcmp(leaf, ".."))
6140 				return B_NOT_ALLOWED;
6141 
6142 			// omit multiple slashes
6143 			while (lastSlash > path && lastSlash[-1] == '/')
6144 				lastSlash--;
6145 
			if (leaf[0] != '\0' && strcmp(leaf, ".") != 0)
				break;
6150 			// "name/" -> "name", or "name/." -> "name"
6151 			lastSlash[0] = '\0';
6152 		}
6153 
6154 		if (!strcmp(path, ".") || !strcmp(path, ".."))
6155 			return B_NOT_ALLOWED;
6156 	}
6157 
6158 	status = fd_and_path_to_dir_vnode(fd, path, &directory, name, kernel);
6159 	if (status != B_OK)
6160 		return status;
6161 
6162 	if (HAS_FS_CALL(directory, remove_dir))
6163 		status = FS_CALL(directory, remove_dir, name);
6164 	else
6165 		status = B_READ_ONLY_DEVICE;
6166 
6167 	put_vnode(directory);
6168 	return status;
6169 }
6170 
6171 
6172 static status_t
6173 common_ioctl(struct file_descriptor* descriptor, ulong op, void* buffer,
6174 	size_t length)
6175 {
6176 	struct vnode* vnode = descriptor->u.vnode;
6177 
6178 	if (HAS_FS_CALL(vnode, ioctl))
6179 		return FS_CALL(vnode, ioctl, descriptor->cookie, op, buffer, length);
6180 
6181 	return B_DEV_INVALID_IOCTL;
6182 }
6183 
6184 
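/*!	Implements the fcntl() backend: descriptor flags (F_SETFD/F_GETFD),
	open mode changes (F_SETFL/F_GETFL), descriptor duplication (F_DUPFD,
	F_DUPFD_CLOEXEC), and advisory locking (F_GETLK, F_SETLK, F_SETLKW).
	For the lock ops, the flock struct is copied in from userland first
	(or directly, for kernel callers).
*/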
6185 static status_t
6186 common_fcntl(int fd, int op, size_t argument, bool kernel)
6187 {
6188 	struct flock flock;
6189 
6190 	FUNCTION(("common_fcntl(fd = %d, op = %d, argument = %lx, %s)\n",
6191 		fd, op, argument, kernel ? "kernel" : "user"));
6192 
6193 	struct io_context* context = get_current_io_context(kernel);
6194 
6195 	struct file_descriptor* descriptor = get_fd(context, fd);
6196 	if (descriptor == NULL)
6197 		return B_FILE_ERROR;
6198 
6199 	struct vnode* vnode = fd_vnode(descriptor);
6200 
6201 	status_t status = B_OK;
6202 
6203 	if (op == F_SETLK || op == F_SETLKW || op == F_GETLK) {
6204 		if (descriptor->type != FDTYPE_FILE)
6205 			status = B_BAD_VALUE;
6206 		else if (kernel)
6207 			memcpy(&flock, (struct flock*)argument, sizeof(struct flock));
6208 		else if (user_memcpy(&flock, (struct flock*)argument,
6209 				sizeof(struct flock)) != B_OK)
6210 			status = B_BAD_ADDRESS;
6211 		if (status != B_OK) {
6212 			put_fd(descriptor);
6213 			return status;
6214 		}
6215 	}
6216 
6217 	switch (op) {
6218 		case F_SETFD:
6219 		{
6220 			// Set file descriptor flags
6221 
			// FD_CLOEXEC is the only flag available at this time
6223 			mutex_lock(&context->io_mutex);
6224 			fd_set_close_on_exec(context, fd, (argument & FD_CLOEXEC) != 0);
6225 			mutex_unlock(&context->io_mutex);
6226 
6227 			status = B_OK;
6228 			break;
6229 		}
6230 
6231 		case F_GETFD:
6232 		{
6233 			// Get file descriptor flags
6234 			mutex_lock(&context->io_mutex);
6235 			status = fd_close_on_exec(context, fd) ? FD_CLOEXEC : 0;
6236 			mutex_unlock(&context->io_mutex);
6237 			break;
6238 		}
6239 
6240 		case F_SETFL:
6241 			// Set file descriptor open mode
6242 
6243 			// we only accept changes to O_APPEND and O_NONBLOCK
6244 			argument &= O_APPEND | O_NONBLOCK;
6245 			if (descriptor->ops->fd_set_flags != NULL) {
6246 				status = descriptor->ops->fd_set_flags(descriptor, argument);
6247 			} else if (vnode != NULL && HAS_FS_CALL(vnode, set_flags)) {
6248 				status = FS_CALL(vnode, set_flags, descriptor->cookie,
6249 					(int)argument);
6250 			} else
6251 				status = B_UNSUPPORTED;
6252 
6253 			if (status == B_OK) {
6254 				// update this descriptor's open_mode field
6255 				descriptor->open_mode = (descriptor->open_mode
6256 					& ~(O_APPEND | O_NONBLOCK)) | argument;
6257 			}
6258 
6259 			break;
6260 
6261 		case F_GETFL:
6262 			// Get file descriptor open mode
6263 			status = descriptor->open_mode;
6264 			break;
6265 
6266 		case F_DUPFD:
6267 		case F_DUPFD_CLOEXEC:
6268 		{
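			// duplicate onto the lowest free descriptor >= argument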
6269 			status = new_fd_etc(context, descriptor, (int)argument);
6270 			if (status >= 0) {
6271 				mutex_lock(&context->io_mutex);
6272 				fd_set_close_on_exec(context, status, op == F_DUPFD_CLOEXEC);
6273 				mutex_unlock(&context->io_mutex);
6274 
6275 				atomic_add(&descriptor->ref_count, 1);
6276 			}
6277 			break;
6278 		}
6279 
6280 		case F_GETLK:
6281 			if (vnode != NULL) {
6282 				struct flock normalizedLock;
6283 
6284 				memcpy(&normalizedLock, &flock, sizeof(struct flock));
6285 				status = normalize_flock(descriptor, &normalizedLock);
6286 				if (status != B_OK)
6287 					break;
6288 
6289 				if (HAS_FS_CALL(vnode, test_lock)) {
6290 					status = FS_CALL(vnode, test_lock, descriptor->cookie,
6291 						&normalizedLock);
6292 				} else
6293 					status = test_advisory_lock(vnode, &normalizedLock);
6294 				if (status == B_OK) {
6295 					if (normalizedLock.l_type == F_UNLCK) {
6296 						// no conflicting lock found, copy back the same struct
6297 						// we were given except change type to F_UNLCK
6298 						flock.l_type = F_UNLCK;
6299 						if (kernel) {
6300 							memcpy((struct flock*)argument, &flock,
6301 								sizeof(struct flock));
6302 						} else {
6303 							status = user_memcpy((struct flock*)argument,
6304 								&flock, sizeof(struct flock));
6305 						}
6306 					} else {
6307 						// a conflicting lock was found, copy back its range and
6308 						// type
6309 						if (normalizedLock.l_len == OFF_MAX)
6310 							normalizedLock.l_len = 0;
6311 
6312 						if (kernel) {
6313 							memcpy((struct flock*)argument,
6314 								&normalizedLock, sizeof(struct flock));
6315 						} else {
6316 							status = user_memcpy((struct flock*)argument,
6317 								&normalizedLock, sizeof(struct flock));
6318 						}
6319 					}
6320 				}
6321 			} else
6322 				status = B_BAD_VALUE;
6323 			break;
6324 
6325 		case F_SETLK:
6326 		case F_SETLKW:
6327 			status = normalize_flock(descriptor, &flock);
6328 			if (status != B_OK)
6329 				break;
6330 
6331 			if (vnode == NULL) {
6332 				status = B_BAD_VALUE;
6333 			} else if (flock.l_type == F_UNLCK) {
6334 				if (HAS_FS_CALL(vnode, release_lock)) {
6335 					status = FS_CALL(vnode, release_lock, descriptor->cookie,
6336 						&flock);
6337 				} else {
6338 					status = release_advisory_lock(vnode, context, NULL,
6339 						&flock);
6340 				}
6341 			} else {
6342 				// the open mode must match the lock type
6343 				if (((descriptor->open_mode & O_RWMASK) == O_RDONLY
6344 						&& flock.l_type == F_WRLCK)
6345 					|| ((descriptor->open_mode & O_RWMASK) == O_WRONLY
6346 						&& flock.l_type == F_RDLCK))
6347 					status = B_FILE_ERROR;
6348 				else {
6349 					if (HAS_FS_CALL(vnode, acquire_lock)) {
6350 						status = FS_CALL(vnode, acquire_lock,
6351 							descriptor->cookie, &flock, op == F_SETLKW);
6352 					} else {
6353 						status = acquire_advisory_lock(vnode, context, NULL,
6354 							&flock, op == F_SETLKW);
6355 					}
6356 				}
6357 			}
6358 			break;
6359 
6360 		// ToDo: add support for more ops?
6361 
6362 		default:
6363 			status = B_BAD_VALUE;
6364 	}
6365 
6366 	put_fd(descriptor);
6367 	return status;
6368 }
6369 
6370 
6371 static status_t
6372 common_sync(int fd, bool kernel)
6373 {
6374 	struct file_descriptor* descriptor;
6375 	struct vnode* vnode;
6376 	status_t status;
6377 
	FUNCTION(("common_sync: entry. fd %d kernel %d\n", fd, kernel));
6379 
6380 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6381 	if (descriptor == NULL)
6382 		return B_FILE_ERROR;
6383 
6384 	if (HAS_FS_CALL(vnode, fsync))
6385 		status = FS_CALL_NO_PARAMS(vnode, fsync);
6386 	else
6387 		status = B_UNSUPPORTED;
6388 
6389 	put_fd(descriptor);
6390 	return status;
6391 }
6392 
6393 
6394 static status_t
6395 common_lock_node(int fd, bool kernel)
6396 {
6397 	struct file_descriptor* descriptor;
6398 	struct vnode* vnode;
6399 
6400 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6401 	if (descriptor == NULL)
6402 		return B_FILE_ERROR;
6403 
6404 	status_t status = B_OK;
6405 
	// We need to set the lock atomically -- someone else might try to
	// set one at the same time
6408 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by, descriptor,
6409 			(file_descriptor*)NULL) != NULL)
6410 		status = B_BUSY;
6411 
6412 	put_fd(descriptor);
6413 	return status;
6414 }
6415 
6416 
6417 static status_t
6418 common_unlock_node(int fd, bool kernel)
6419 {
6420 	struct file_descriptor* descriptor;
6421 	struct vnode* vnode;
6422 
6423 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6424 	if (descriptor == NULL)
6425 		return B_FILE_ERROR;
6426 
6427 	status_t status = B_OK;
6428 
	// We need to clear the lock atomically -- someone else might set a
	// new one at the same time
6431 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
6432 			(file_descriptor*)NULL, descriptor) != descriptor)
6433 		status = B_BAD_VALUE;
6434 
6435 	put_fd(descriptor);
6436 	return status;
6437 }
6438 
6439 
6440 static status_t
6441 common_read_link(int fd, char* path, char* buffer, size_t* _bufferSize,
6442 	bool kernel)
6443 {
6444 	struct vnode* vnode;
6445 	status_t status;
6446 
6447 	status = fd_and_path_to_vnode(fd, path, false, &vnode, NULL, kernel);
6448 	if (status != B_OK)
6449 		return status;
6450 
6451 	if (HAS_FS_CALL(vnode, read_symlink)) {
6452 		status = FS_CALL(vnode, read_symlink, buffer, _bufferSize);
6453 	} else
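		// not a symlink, or the FS provides no read_symlink hook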
6454 		status = B_BAD_VALUE;
6455 
6456 	put_vnode(vnode);
6457 	return status;
6458 }
6459 
6460 
6461 static status_t
6462 common_create_symlink(int fd, char* path, const char* toPath, int mode,
6463 	bool kernel)
6464 {
6465 	// path validity checks have to be in the calling function!
6466 	char name[B_FILE_NAME_LENGTH];
6467 	struct vnode* vnode;
6468 	status_t status;
6469 
6470 	FUNCTION(("common_create_symlink(fd = %d, path = %s, toPath = %s, "
6471 		"mode = %d, kernel = %d)\n", fd, path, toPath, mode, kernel));
6472 
6473 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, name, kernel);
6474 	if (status != B_OK)
6475 		return status;
6476 
6477 	if (HAS_FS_CALL(vnode, create_symlink))
6478 		status = FS_CALL(vnode, create_symlink, name, toPath, mode);
6479 	else {
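		// distinguish between a read-only FS and one that simply doesn't
		// support symlinks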
6480 		status = HAS_FS_CALL(vnode, write)
6481 			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6482 	}
6483 
6484 	put_vnode(vnode);
6485 
6486 	return status;
6487 }
6488 
6489 
6490 static status_t
6491 common_create_link(int pathFD, char* path, int toFD, char* toPath,
6492 	bool traverseLeafLink, bool kernel)
6493 {
6494 	// path validity checks have to be in the calling function!
6495 
6496 	FUNCTION(("common_create_link(path = %s, toPath = %s, kernel = %d)\n", path,
6497 		toPath, kernel));
6498 
6499 	char name[B_FILE_NAME_LENGTH];
6500 	struct vnode* directory;
6501 	status_t status = fd_and_path_to_dir_vnode(pathFD, path, &directory, name,
6502 		kernel);
6503 	if (status != B_OK)
6504 		return status;
6505 
6506 	struct vnode* vnode;
6507 	status = fd_and_path_to_vnode(toFD, toPath, traverseLeafLink, &vnode, NULL,
6508 		kernel);
6509 	if (status != B_OK)
6510 		goto err;
6511 
6512 	if (directory->mount != vnode->mount) {
6513 		status = B_CROSS_DEVICE_LINK;
6514 		goto err1;
6515 	}
6516 
6517 	if (HAS_FS_CALL(directory, link))
6518 		status = FS_CALL(directory, link, name, vnode);
6519 	else
6520 		status = B_READ_ONLY_DEVICE;
6521 
6522 err1:
6523 	put_vnode(vnode);
6524 err:
6525 	put_vnode(directory);
6526 
6527 	return status;
6528 }
6529 
6530 
6531 static status_t
6532 common_unlink(int fd, char* path, bool kernel)
6533 {
6534 	char filename[B_FILE_NAME_LENGTH];
6535 	struct vnode* vnode;
6536 	status_t status;
6537 
6538 	FUNCTION(("common_unlink: fd: %d, path '%s', kernel %d\n", fd, path,
6539 		kernel));
6540 
6541 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
6542 	if (status < 0)
6543 		return status;
6544 
6545 	if (HAS_FS_CALL(vnode, unlink))
6546 		status = FS_CALL(vnode, unlink, filename);
6547 	else
6548 		status = B_READ_ONLY_DEVICE;
6549 
6550 	put_vnode(vnode);
6551 
6552 	return status;
6553 }
6554 
6555 
6556 static status_t
6557 common_access(int fd, char* path, int mode, bool effectiveUserGroup, bool kernel)
6558 {
6559 	struct vnode* vnode;
6560 	status_t status;
6561 
6562 	// TODO: honor effectiveUserGroup argument
6563 
6564 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
6565 	if (status != B_OK)
6566 		return status;
6567 
6568 	if (HAS_FS_CALL(vnode, access))
6569 		status = FS_CALL(vnode, access, mode);
6570 	else
6571 		status = B_OK;
6572 
6573 	put_vnode(vnode);
6574 
6575 	return status;
6576 }
6577 
6578 
6579 static status_t
6580 common_rename(int fd, char* path, int newFD, char* newPath, bool kernel)
6581 {
6582 	struct vnode* fromVnode;
6583 	struct vnode* toVnode;
6584 	char fromName[B_FILE_NAME_LENGTH];
6585 	char toName[B_FILE_NAME_LENGTH];
6586 	status_t status;
6587 
6588 	FUNCTION(("common_rename(fd = %d, path = %s, newFD = %d, newPath = %s, "
6589 		"kernel = %d)\n", fd, path, newFD, newPath, kernel));
6590 
6591 	status = fd_and_path_to_dir_vnode(fd, path, &fromVnode, fromName, kernel);
6592 	if (status != B_OK)
6593 		return status;
6594 
6595 	status = fd_and_path_to_dir_vnode(newFD, newPath, &toVnode, toName, kernel);
6596 	if (status != B_OK)
6597 		goto err1;
6598 
6599 	if (fromVnode->device != toVnode->device) {
6600 		status = B_CROSS_DEVICE_LINK;
6601 		goto err2;
6602 	}
6603 
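	// reject empty names, "." and "..", and renaming an entry onto itself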
6604 	if (fromName[0] == '\0' || toName[0] == '\0'
6605 		|| !strcmp(fromName, ".") || !strcmp(fromName, "..")
6606 		|| !strcmp(toName, ".") || !strcmp(toName, "..")
6607 		|| (fromVnode == toVnode && !strcmp(fromName, toName))) {
6608 		status = B_BAD_VALUE;
6609 		goto err2;
6610 	}
6611 
6612 	if (HAS_FS_CALL(fromVnode, rename))
6613 		status = FS_CALL(fromVnode, rename, fromName, toVnode, toName);
6614 	else
6615 		status = B_READ_ONLY_DEVICE;
6616 
6617 err2:
6618 	put_vnode(toVnode);
6619 err1:
6620 	put_vnode(fromVnode);
6621 
6622 	return status;
6623 }
6624 
6625 
6626 static status_t
6627 common_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6628 {
6629 	struct vnode* vnode = descriptor->u.vnode;
6630 
6631 	FUNCTION(("common_read_stat: stat %p\n", stat));
6632 
6633 	// TODO: remove this once all file systems properly set them!
6634 	stat->st_crtim.tv_nsec = 0;
6635 	stat->st_ctim.tv_nsec = 0;
6636 	stat->st_mtim.tv_nsec = 0;
6637 	stat->st_atim.tv_nsec = 0;
6638 
6639 	return vfs_stat_vnode(vnode, stat);
6640 }
6641 
6642 
6643 static status_t
6644 common_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6645 	int statMask)
6646 {
6647 	struct vnode* vnode = descriptor->u.vnode;
6648 
6649 	FUNCTION(("common_write_stat(vnode = %p, stat = %p, statMask = %d)\n",
6650 		vnode, stat, statMask));
6651 
6652 	if ((descriptor->open_mode & O_RWMASK) == O_RDONLY
6653 		&& (statMask & B_STAT_SIZE) != 0) {
6654 		return B_BAD_VALUE;
6655 	}
6656 
6657 	if (!HAS_FS_CALL(vnode, write_stat))
6658 		return B_READ_ONLY_DEVICE;
6659 
6660 	return FS_CALL(vnode, write_stat, stat, statMask);
6661 }
6662 
6663 
6664 static status_t
6665 common_path_read_stat(int fd, char* path, bool traverseLeafLink,
6666 	struct stat* stat, bool kernel)
6667 {
6668 	FUNCTION(("common_path_read_stat: fd: %d, path '%s', stat %p,\n", fd, path,
6669 		stat));
6670 
6671 	struct vnode* vnode;
6672 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6673 		NULL, kernel);
6674 	if (status != B_OK)
6675 		return status;
6676 
6677 	status = vfs_stat_vnode(vnode, stat);
6678 
6679 	put_vnode(vnode);
6680 	return status;
6681 }
6682 
6683 
6684 static status_t
6685 common_path_write_stat(int fd, char* path, bool traverseLeafLink,
6686 	const struct stat* stat, int statMask, bool kernel)
6687 {
6688 	FUNCTION(("common_write_stat: fd: %d, path '%s', stat %p, stat_mask %d, "
6689 		"kernel %d\n", fd, path, stat, statMask, kernel));
6690 
6691 	struct vnode* vnode;
6692 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6693 		NULL, kernel);
6694 	if (status != B_OK)
6695 		return status;
6696 
6697 	if (HAS_FS_CALL(vnode, write_stat))
6698 		status = FS_CALL(vnode, write_stat, stat, statMask);
6699 	else
6700 		status = B_READ_ONLY_DEVICE;
6701 
6702 	put_vnode(vnode);
6703 
6704 	return status;
6705 }
6706 
6707 
6708 static int
6709 attr_dir_open(int fd, char* path, bool traverseLeafLink, bool kernel)
6710 {
6711 	FUNCTION(("attr_dir_open(fd = %d, path = '%s', kernel = %d)\n", fd, path,
6712 		kernel));
6713 
6714 	struct vnode* vnode;
6715 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6716 		NULL, kernel);
6717 	if (status != B_OK)
6718 		return status;
6719 
6720 	status = open_attr_dir_vnode(vnode, kernel);
6721 	if (status < 0)
6722 		put_vnode(vnode);
6723 
6724 	return status;
6725 }
6726 
6727 
6728 static status_t
6729 attr_dir_close(struct file_descriptor* descriptor)
6730 {
6731 	struct vnode* vnode = descriptor->u.vnode;
6732 
6733 	FUNCTION(("attr_dir_close(descriptor = %p)\n", descriptor));
6734 
6735 	if (HAS_FS_CALL(vnode, close_attr_dir))
6736 		return FS_CALL(vnode, close_attr_dir, descriptor->cookie);
6737 
6738 	return B_OK;
6739 }
6740 
6741 
6742 static void
6743 attr_dir_free_fd(struct file_descriptor* descriptor)
6744 {
6745 	struct vnode* vnode = descriptor->u.vnode;
6746 
6747 	if (vnode != NULL) {
6748 		FS_CALL(vnode, free_attr_dir_cookie, descriptor->cookie);
6749 		put_vnode(vnode);
6750 	}
6751 }
6752 
6753 
6754 static status_t
6755 attr_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6756 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6757 {
6758 	struct vnode* vnode = descriptor->u.vnode;
6759 
6760 	FUNCTION(("attr_dir_read(descriptor = %p)\n", descriptor));
6761 
6762 	if (HAS_FS_CALL(vnode, read_attr_dir))
6763 		return FS_CALL(vnode, read_attr_dir, descriptor->cookie, buffer,
6764 			bufferSize, _count);
6765 
6766 	return B_UNSUPPORTED;
6767 }
6768 
6769 
6770 static status_t
6771 attr_dir_rewind(struct file_descriptor* descriptor)
6772 {
6773 	struct vnode* vnode = descriptor->u.vnode;
6774 
6775 	FUNCTION(("attr_dir_rewind(descriptor = %p)\n", descriptor));
6776 
6777 	if (HAS_FS_CALL(vnode, rewind_attr_dir))
6778 		return FS_CALL(vnode, rewind_attr_dir, descriptor->cookie);
6779 
6780 	return B_UNSUPPORTED;
6781 }
6782 
6783 
6784 static int
6785 attr_create(int fd, char* path, const char* name, uint32 type,
6786 	int openMode, bool kernel)
6787 {
6788 	if (name == NULL || *name == '\0')
6789 		return B_BAD_VALUE;
6790 
6791 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6792 	struct vnode* vnode;
6793 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6794 		kernel);
6795 	if (status != B_OK)
6796 		return status;
6797 
6798 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6799 		status = B_LINK_LIMIT;
6800 		goto err;
6801 	}
6802 
6803 	if (!HAS_FS_CALL(vnode, create_attr)) {
6804 		status = B_READ_ONLY_DEVICE;
6805 		goto err;
6806 	}
6807 
6808 	void* cookie;
6809 	status = FS_CALL(vnode, create_attr, name, type, openMode, &cookie);
6810 	if (status != B_OK)
6811 		goto err;
6812 
6813 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6814 	if (fd >= 0)
6815 		return fd;
6816 
6817 	status = fd;
6818 
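	// FD allocation failed -- roll back: release the cookie and remove the
	// attribute that was just created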
6819 	FS_CALL(vnode, close_attr, cookie);
6820 	FS_CALL(vnode, free_attr_cookie, cookie);
6821 
6822 	FS_CALL(vnode, remove_attr, name);
6823 
6824 err:
6825 	put_vnode(vnode);
6826 
6827 	return status;
6828 }
6829 
6830 
6831 static int
6832 attr_open(int fd, char* path, const char* name, int openMode, bool kernel)
6833 {
6834 	if (name == NULL || *name == '\0')
6835 		return B_BAD_VALUE;
6836 
6837 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6838 	struct vnode* vnode;
6839 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6840 		kernel);
6841 	if (status != B_OK)
6842 		return status;
6843 
6844 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6845 		status = B_LINK_LIMIT;
6846 		goto err;
6847 	}
6848 
6849 	if (!HAS_FS_CALL(vnode, open_attr)) {
6850 		status = B_UNSUPPORTED;
6851 		goto err;
6852 	}
6853 
6854 	void* cookie;
6855 	status = FS_CALL(vnode, open_attr, name, openMode, &cookie);
6856 	if (status != B_OK)
6857 		goto err;
6858 
6859 	// now we only need a file descriptor for this attribute and we're done
6860 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6861 	if (fd >= 0)
6862 		return fd;
6863 
6864 	status = fd;
6865 
6866 	FS_CALL(vnode, close_attr, cookie);
6867 	FS_CALL(vnode, free_attr_cookie, cookie);
6868 
6869 err:
6870 	put_vnode(vnode);
6871 
6872 	return status;
6873 }
6874 
6875 
6876 static status_t
6877 attr_close(struct file_descriptor* descriptor)
6878 {
6879 	struct vnode* vnode = descriptor->u.vnode;
6880 
6881 	FUNCTION(("attr_close(descriptor = %p)\n", descriptor));
6882 
6883 	if (HAS_FS_CALL(vnode, close_attr))
6884 		return FS_CALL(vnode, close_attr, descriptor->cookie);
6885 
6886 	return B_OK;
6887 }
6888 
6889 
6890 static void
6891 attr_free_fd(struct file_descriptor* descriptor)
6892 {
6893 	struct vnode* vnode = descriptor->u.vnode;
6894 
6895 	if (vnode != NULL) {
6896 		FS_CALL(vnode, free_attr_cookie, descriptor->cookie);
6897 		put_vnode(vnode);
6898 	}
6899 }
6900 
6901 
6902 static status_t
6903 attr_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
6904 	size_t* length)
6905 {
6906 	struct vnode* vnode = descriptor->u.vnode;
6907 
6908 	FUNCTION(("attr_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
6909 		pos, length, *length));
6910 
6911 	if (!HAS_FS_CALL(vnode, read_attr))
6912 		return B_UNSUPPORTED;
6913 
6914 	return FS_CALL(vnode, read_attr, descriptor->cookie, pos, buffer, length);
6915 }
6916 
6917 
6918 static status_t
6919 attr_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
6920 	size_t* length)
6921 {
6922 	struct vnode* vnode = descriptor->u.vnode;
6923 
6924 	FUNCTION(("attr_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
6925 		length));
6926 
6927 	if (!HAS_FS_CALL(vnode, write_attr))
6928 		return B_UNSUPPORTED;
6929 
6930 	return FS_CALL(vnode, write_attr, descriptor->cookie, pos, buffer, length);
6931 }
6932 
6933 
6934 static off_t
6935 attr_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
6936 {
6937 	off_t offset;
6938 
6939 	switch (seekType) {
6940 		case SEEK_SET:
6941 			offset = 0;
6942 			break;
6943 		case SEEK_CUR:
6944 			offset = descriptor->pos;
6945 			break;
6946 		case SEEK_END:
6947 		{
6948 			struct vnode* vnode = descriptor->u.vnode;
			if (!HAS_FS_CALL(vnode, read_attr_stat))
6950 				return B_UNSUPPORTED;
6951 
6952 			struct stat stat;
6953 			status_t status = FS_CALL(vnode, read_attr_stat, descriptor->cookie,
6954 				&stat);
6955 			if (status != B_OK)
6956 				return status;
6957 
6958 			offset = stat.st_size;
6959 			break;
6960 		}
6961 		default:
6962 			return B_BAD_VALUE;
6963 	}
6964 
6965 	// assumes off_t is 64 bits wide
6966 	if (offset > 0 && LONGLONG_MAX - offset < pos)
6967 		return B_BUFFER_OVERFLOW;
6968 
6969 	pos += offset;
6970 	if (pos < 0)
6971 		return B_BAD_VALUE;
6972 
6973 	return descriptor->pos = pos;
6974 }
6975 
6976 
6977 static status_t
6978 attr_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6979 {
6980 	struct vnode* vnode = descriptor->u.vnode;
6981 
6982 	FUNCTION(("attr_read_stat: stat 0x%p\n", stat));
6983 
6984 	if (!HAS_FS_CALL(vnode, read_attr_stat))
6985 		return B_UNSUPPORTED;
6986 
6987 	return FS_CALL(vnode, read_attr_stat, descriptor->cookie, stat);
6988 }
6989 
6990 
6991 static status_t
6992 attr_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6993 	int statMask)
6994 {
6995 	struct vnode* vnode = descriptor->u.vnode;
6996 
6997 	FUNCTION(("attr_write_stat: stat = %p, statMask %d\n", stat, statMask));
6998 
6999 	if (!HAS_FS_CALL(vnode, write_attr_stat))
7000 		return B_READ_ONLY_DEVICE;
7001 
7002 	return FS_CALL(vnode, write_attr_stat, descriptor->cookie, stat, statMask);
7003 }
7004 
7005 
7006 static status_t
7007 attr_remove(int fd, const char* name, bool kernel)
7008 {
7009 	struct file_descriptor* descriptor;
7010 	struct vnode* vnode;
7011 	status_t status;
7012 
7013 	if (name == NULL || *name == '\0')
7014 		return B_BAD_VALUE;
7015 
7016 	FUNCTION(("attr_remove: fd = %d, name = \"%s\", kernel %d\n", fd, name,
7017 		kernel));
7018 
7019 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
7020 	if (descriptor == NULL)
7021 		return B_FILE_ERROR;
7022 
7023 	if (HAS_FS_CALL(vnode, remove_attr))
7024 		status = FS_CALL(vnode, remove_attr, name);
7025 	else
7026 		status = B_READ_ONLY_DEVICE;
7027 
7028 	put_fd(descriptor);
7029 
7030 	return status;
7031 }
7032 
7033 
7034 static status_t
7035 attr_rename(int fromFD, const char* fromName, int toFD, const char* toName,
7036 	bool kernel)
7037 {
7038 	struct file_descriptor* fromDescriptor;
7039 	struct file_descriptor* toDescriptor;
7040 	struct vnode* fromVnode;
7041 	struct vnode* toVnode;
7042 	status_t status;
7043 
7044 	if (fromName == NULL || *fromName == '\0' || toName == NULL
7045 		|| *toName == '\0')
7046 		return B_BAD_VALUE;
7047 
7048 	FUNCTION(("attr_rename: from fd = %d, from name = \"%s\", to fd = %d, to "
7049 		"name = \"%s\", kernel %d\n", fromFD, fromName, toFD, toName, kernel));
7050 
7051 	fromDescriptor = get_fd_and_vnode(fromFD, &fromVnode, kernel);
7052 	if (fromDescriptor == NULL)
7053 		return B_FILE_ERROR;
7054 
7055 	toDescriptor = get_fd_and_vnode(toFD, &toVnode, kernel);
7056 	if (toDescriptor == NULL) {
7057 		status = B_FILE_ERROR;
7058 		goto err;
7059 	}
7060 
7061 	// are the files on the same volume?
7062 	if (fromVnode->device != toVnode->device) {
7063 		status = B_CROSS_DEVICE_LINK;
7064 		goto err1;
7065 	}
7066 
7067 	if (HAS_FS_CALL(fromVnode, rename_attr)) {
7068 		status = FS_CALL(fromVnode, rename_attr, fromName, toVnode, toName);
7069 	} else
7070 		status = B_READ_ONLY_DEVICE;
7071 
7072 err1:
7073 	put_fd(toDescriptor);
7074 err:
7075 	put_fd(fromDescriptor);
7076 
7077 	return status;
7078 }
7079 
7080 
7081 static int
7082 index_dir_open(dev_t mountID, bool kernel)
7083 {
7084 	struct fs_mount* mount;
7085 	void* cookie;
7086 
7087 	FUNCTION(("index_dir_open(mountID = %" B_PRId32 ", kernel = %d)\n", mountID,
7088 		kernel));
7089 
7090 	status_t status = get_mount(mountID, &mount);
7091 	if (status != B_OK)
7092 		return status;
7093 
7094 	if (!HAS_FS_MOUNT_CALL(mount, open_index_dir)) {
7095 		status = B_UNSUPPORTED;
7096 		goto error;
7097 	}
7098 
7099 	status = FS_MOUNT_CALL(mount, open_index_dir, &cookie);
7100 	if (status != B_OK)
7101 		goto error;
7102 
7103 	// get fd for the index directory
7104 	int fd;
7105 	fd = get_new_fd(FDTYPE_INDEX_DIR, mount, NULL, cookie, O_CLOEXEC, kernel);
7106 	if (fd >= 0)
7107 		return fd;
7108 
7109 	// something went wrong
7110 	FS_MOUNT_CALL(mount, close_index_dir, cookie);
7111 	FS_MOUNT_CALL(mount, free_index_dir_cookie, cookie);
7112 
7113 	status = fd;
7114 
7115 error:
7116 	put_mount(mount);
7117 	return status;
7118 }
7119 
7120 
7121 static status_t
7122 index_dir_close(struct file_descriptor* descriptor)
7123 {
7124 	struct fs_mount* mount = descriptor->u.mount;
7125 
7126 	FUNCTION(("index_dir_close(descriptor = %p)\n", descriptor));
7127 
7128 	if (HAS_FS_MOUNT_CALL(mount, close_index_dir))
7129 		return FS_MOUNT_CALL(mount, close_index_dir, descriptor->cookie);
7130 
7131 	return B_OK;
7132 }
7133 
7134 
7135 static void
7136 index_dir_free_fd(struct file_descriptor* descriptor)
7137 {
7138 	struct fs_mount* mount = descriptor->u.mount;
7139 
7140 	if (mount != NULL) {
7141 		FS_MOUNT_CALL(mount, free_index_dir_cookie, descriptor->cookie);
7142 		put_mount(mount);
7143 	}
7144 }
7145 
7146 
7147 static status_t
7148 index_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7149 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7150 {
7151 	struct fs_mount* mount = descriptor->u.mount;
7152 
7153 	if (HAS_FS_MOUNT_CALL(mount, read_index_dir)) {
7154 		return FS_MOUNT_CALL(mount, read_index_dir, descriptor->cookie, buffer,
7155 			bufferSize, _count);
7156 	}
7157 
7158 	return B_UNSUPPORTED;
7159 }
7160 
7161 
7162 static status_t
7163 index_dir_rewind(struct file_descriptor* descriptor)
7164 {
7165 	struct fs_mount* mount = descriptor->u.mount;
7166 
7167 	if (HAS_FS_MOUNT_CALL(mount, rewind_index_dir))
7168 		return FS_MOUNT_CALL(mount, rewind_index_dir, descriptor->cookie);
7169 
7170 	return B_UNSUPPORTED;
7171 }
7172 
7173 
7174 static status_t
7175 index_create(dev_t mountID, const char* name, uint32 type, uint32 flags,
7176 	bool kernel)
7177 {
7178 	FUNCTION(("index_create(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7179 		mountID, name, kernel));
7180 
7181 	struct fs_mount* mount;
7182 	status_t status = get_mount(mountID, &mount);
7183 	if (status != B_OK)
7184 		return status;
7185 
7186 	if (!HAS_FS_MOUNT_CALL(mount, create_index)) {
7187 		status = B_READ_ONLY_DEVICE;
7188 		goto out;
7189 	}
7190 
7191 	status = FS_MOUNT_CALL(mount, create_index, name, type, flags);
7192 
7193 out:
7194 	put_mount(mount);
7195 	return status;
7196 }
7197 
7198 
7199 #if 0
7200 static status_t
7201 index_read_stat(struct file_descriptor* descriptor, struct stat* stat)
7202 {
7203 	struct vnode* vnode = descriptor->u.vnode;
7204 
7205 	// ToDo: currently unused!
7206 	FUNCTION(("index_read_stat: stat 0x%p\n", stat));
7207 	if (!HAS_FS_CALL(vnode, read_index_stat))
7208 		return B_UNSUPPORTED;
7209 
7210 	return B_UNSUPPORTED;
7211 	//return FS_CALL(vnode, read_index_stat, descriptor->cookie, stat);
7212 }
7213 
7214 
7215 static void
7216 index_free_fd(struct file_descriptor* descriptor)
7217 {
7218 	struct vnode* vnode = descriptor->u.vnode;
7219 
7220 	if (vnode != NULL) {
7221 		FS_CALL(vnode, free_index_cookie, descriptor->cookie);
7222 		put_vnode(vnode);
7223 	}
7224 }
7225 #endif
7226 
7227 
7228 static status_t
7229 index_name_read_stat(dev_t mountID, const char* name, struct stat* stat,
7230 	bool kernel)
7231 {
	FUNCTION(("index_name_read_stat(mountID = %" B_PRId32 ", name = %s, "
		"kernel = %d)\n", mountID, name, kernel));
7234 
7235 	struct fs_mount* mount;
7236 	status_t status = get_mount(mountID, &mount);
7237 	if (status != B_OK)
7238 		return status;
7239 
7240 	if (!HAS_FS_MOUNT_CALL(mount, read_index_stat)) {
7241 		status = B_UNSUPPORTED;
7242 		goto out;
7243 	}
7244 
7245 	status = FS_MOUNT_CALL(mount, read_index_stat, name, stat);
7246 
7247 out:
7248 	put_mount(mount);
7249 	return status;
7250 }
7251 
7252 
7253 static status_t
7254 index_remove(dev_t mountID, const char* name, bool kernel)
7255 {
7256 	FUNCTION(("index_remove(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7257 		mountID, name, kernel));
7258 
7259 	struct fs_mount* mount;
7260 	status_t status = get_mount(mountID, &mount);
7261 	if (status != B_OK)
7262 		return status;
7263 
7264 	if (!HAS_FS_MOUNT_CALL(mount, remove_index)) {
7265 		status = B_READ_ONLY_DEVICE;
7266 		goto out;
7267 	}
7268 
7269 	status = FS_MOUNT_CALL(mount, remove_index, name);
7270 
7271 out:
7272 	put_mount(mount);
7273 	return status;
7274 }
7275 
7276 
/*!	TODO: the query FS API is still pretty much the same as in R5.
		It would be nice if queries received some more kernel support;
		for example, query parsing should be moved into the kernel.
7281 */
7282 static int
7283 query_open(dev_t device, const char* query, uint32 flags, port_id port,
7284 	int32 token, bool kernel)
7285 {
7286 	struct fs_mount* mount;
7287 	void* cookie;
7288 
7289 	FUNCTION(("query_open(device = %" B_PRId32 ", query = \"%s\", kernel = %d)\n",
7290 		device, query, kernel));
7291 
7292 	status_t status = get_mount(device, &mount);
7293 	if (status != B_OK)
7294 		return status;
7295 
7296 	if (!HAS_FS_MOUNT_CALL(mount, open_query)) {
7297 		status = B_UNSUPPORTED;
7298 		goto error;
7299 	}
7300 
7301 	status = FS_MOUNT_CALL(mount, open_query, query, flags, port, token,
7302 		&cookie);
7303 	if (status != B_OK)
7304 		goto error;
7305 
	// get fd for the query
7307 	int fd;
7308 	fd = get_new_fd(FDTYPE_QUERY, mount, NULL, cookie, O_CLOEXEC, kernel);
7309 	if (fd >= 0)
7310 		return fd;
7311 
7312 	status = fd;
7313 
7314 	// something went wrong
7315 	FS_MOUNT_CALL(mount, close_query, cookie);
7316 	FS_MOUNT_CALL(mount, free_query_cookie, cookie);
7317 
7318 error:
7319 	put_mount(mount);
7320 	return status;
7321 }
7322 
7323 
7324 static status_t
7325 query_close(struct file_descriptor* descriptor)
7326 {
7327 	struct fs_mount* mount = descriptor->u.mount;
7328 
7329 	FUNCTION(("query_close(descriptor = %p)\n", descriptor));
7330 
7331 	if (HAS_FS_MOUNT_CALL(mount, close_query))
7332 		return FS_MOUNT_CALL(mount, close_query, descriptor->cookie);
7333 
7334 	return B_OK;
7335 }
7336 
7337 
7338 static void
7339 query_free_fd(struct file_descriptor* descriptor)
7340 {
7341 	struct fs_mount* mount = descriptor->u.mount;
7342 
7343 	if (mount != NULL) {
7344 		FS_MOUNT_CALL(mount, free_query_cookie, descriptor->cookie);
7345 		put_mount(mount);
7346 	}
7347 }
7348 
7349 
7350 static status_t
7351 query_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7352 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7353 {
7354 	struct fs_mount* mount = descriptor->u.mount;
7355 
7356 	if (HAS_FS_MOUNT_CALL(mount, read_query)) {
7357 		return FS_MOUNT_CALL(mount, read_query, descriptor->cookie, buffer,
7358 			bufferSize, _count);
7359 	}
7360 
7361 	return B_UNSUPPORTED;
7362 }
7363 
7364 
7365 static status_t
7366 query_rewind(struct file_descriptor* descriptor)
7367 {
7368 	struct fs_mount* mount = descriptor->u.mount;
7369 
7370 	if (HAS_FS_MOUNT_CALL(mount, rewind_query))
7371 		return FS_MOUNT_CALL(mount, rewind_query, descriptor->cookie);
7372 
7373 	return B_UNSUPPORTED;
7374 }
7375 
7376 
7377 //	#pragma mark - General File System functions
7378 
7379 
7380 static dev_t
7381 fs_mount(char* path, const char* device, const char* fsName, uint32 flags,
7382 	const char* args, bool kernel)
7383 {
7384 	struct ::fs_mount* mount;
7385 	status_t status = B_OK;
7386 	fs_volume* volume = NULL;
7387 	int32 layer = 0;
7388 	Vnode* coveredNode = NULL;
7389 
7390 	FUNCTION(("fs_mount: path = '%s', device = '%s', fs_name = '%s', flags = %#"
7391 		B_PRIx32 ", args = '%s'\n", path, device, fsName, flags, args));
7392 
	// The path is always safe, we just have to make sure that fsName is
	// at least superficially valid - we can't make any assumptions about
	// args, though.
7395 	// A NULL fsName is OK, if a device was given and the FS is not virtual.
7396 	// We'll get it from the DDM later.
7397 	if (fsName == NULL) {
7398 		if (!device || flags & B_MOUNT_VIRTUAL_DEVICE)
7399 			return B_BAD_VALUE;
7400 	} else if (fsName[0] == '\0')
7401 		return B_BAD_VALUE;
7402 
7403 	RecursiveLocker mountOpLocker(sMountOpLock);
7404 
7405 	// Helper to delete a newly created file device on failure.
7406 	// Not exactly beautiful, but helps to keep the code below cleaner.
7407 	struct FileDeviceDeleter {
7408 		FileDeviceDeleter() : id(-1) {}
7409 		~FileDeviceDeleter()
7410 		{
7411 			KDiskDeviceManager::Default()->DeleteFileDevice(id);
7412 		}
7413 
7414 		partition_id id;
7415 	} fileDeviceDeleter;
7416 
7417 	// If the file system is not a "virtual" one, the device argument should
7418 	// point to a real file/device (if given at all).
7419 	// get the partition
7420 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7421 	KPartition* partition = NULL;
7422 	KPath normalizedDevice;
7423 	bool newlyCreatedFileDevice = false;
7424 
7425 	if (!(flags & B_MOUNT_VIRTUAL_DEVICE) && device != NULL) {
7426 		// normalize the device path
7427 		status = normalizedDevice.SetTo(device, true);
7428 		if (status != B_OK)
7429 			return status;
7430 
7431 		// get a corresponding partition from the DDM
7432 		partition = ddm->RegisterPartition(normalizedDevice.Path());
7433 		if (partition == NULL) {
			// Partition not found: this either means the user supplied
			// an invalid path, or the path refers to an image file. We try
			// to let the DDM create a file device for the path.
7437 			partition_id deviceID = ddm->CreateFileDevice(
7438 				normalizedDevice.Path(), &newlyCreatedFileDevice);
7439 			if (deviceID >= 0) {
7440 				partition = ddm->RegisterPartition(deviceID);
7441 				if (newlyCreatedFileDevice)
7442 					fileDeviceDeleter.id = deviceID;
7443 			}
7444 		}
7445 
7446 		if (!partition) {
7447 			TRACE(("fs_mount(): Partition `%s' not found.\n",
7448 				normalizedDevice.Path()));
7449 			return B_ENTRY_NOT_FOUND;
7450 		}
7451 
7452 		device = normalizedDevice.Path();
7453 			// correct path to file device
7454 	}
7455 	PartitionRegistrar partitionRegistrar(partition, true);
7456 
	// Write lock the partition's device. For the time being, we keep the
	// lock until we're done mounting -- not nice, but it ensures that no one
	// is interfering.
7460 	// TODO: Just mark the partition busy while mounting!
7461 	KDiskDevice* diskDevice = NULL;
7462 	if (partition) {
7463 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7464 		if (!diskDevice) {
7465 			TRACE(("fs_mount(): Failed to lock disk device!\n"));
7466 			return B_ERROR;
7467 		}
7468 	}
7469 
7470 	DeviceWriteLocker writeLocker(diskDevice, true);
7471 		// this takes over the write lock acquired before
7472 
7473 	if (partition != NULL) {
		// make sure that the partition is not busy
7475 		if (partition->IsBusy()) {
7476 			TRACE(("fs_mount(): Partition is busy.\n"));
7477 			return B_BUSY;
7478 		}
7479 
7480 		// if no FS name had been supplied, we get it from the partition
7481 		if (fsName == NULL) {
7482 			KDiskSystem* diskSystem = partition->DiskSystem();
7483 			if (!diskSystem) {
7484 				TRACE(("fs_mount(): No FS name was given, and the DDM didn't "
7485 					"recognize it.\n"));
7486 				return B_BAD_VALUE;
7487 			}
7488 
7489 			if (!diskSystem->IsFileSystem()) {
7490 				TRACE(("fs_mount(): No FS name was given, and the DDM found a "
7491 					"partitioning system.\n"));
7492 				return B_BAD_VALUE;
7493 			}
7494 
7495 			// The disk system name will not change, and the KDiskSystem
7496 			// object will not go away while the disk device is locked (and
7497 			// the partition has a reference to it), so this is safe.
7498 			fsName = diskSystem->Name();
7499 		}
7500 	}
7501 
7502 	mount = new(std::nothrow) (struct ::fs_mount);
7503 	if (mount == NULL)
7504 		return B_NO_MEMORY;
7505 
7506 	mount->device_name = strdup(device);
7507 		// "device" can be NULL
7508 
7509 	status = mount->entry_cache.Init();
7510 	if (status != B_OK)
7511 		goto err1;
7512 
7513 	// initialize structure
7514 	mount->id = sNextMountID++;
7515 	mount->partition = NULL;
7516 	mount->root_vnode = NULL;
7517 	mount->covers_vnode = NULL;
7518 	mount->unmounting = false;
7519 	mount->owns_file_device = false;
7520 	mount->volume = NULL;
7521 
7522 	// build up the volume(s)
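	// One fs_volume is created per file system layer. Each new volume is
	// pushed on top of the previous one (see the super_volume/sub_volume
	// links below), so mount->volume ends up pointing at the topmost layer.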
7523 	while (true) {
7524 		char* layerFSName = get_file_system_name_for_layer(fsName, layer);
7525 		if (layerFSName == NULL) {
7526 			if (layer == 0) {
7527 				status = B_NO_MEMORY;
7528 				goto err1;
7529 			}
7530 
7531 			break;
7532 		}
7533 		MemoryDeleter layerFSNameDeleter(layerFSName);
7534 
7535 		volume = (fs_volume*)malloc(sizeof(fs_volume));
7536 		if (volume == NULL) {
7537 			status = B_NO_MEMORY;
7538 			goto err1;
7539 		}
7540 
7541 		volume->id = mount->id;
7542 		volume->partition = partition != NULL ? partition->ID() : -1;
7543 		volume->layer = layer++;
7544 		volume->private_volume = NULL;
7545 		volume->ops = NULL;
7546 		volume->sub_volume = NULL;
7547 		volume->super_volume = NULL;
7548 		volume->file_system = NULL;
7549 		volume->file_system_name = NULL;
7550 
7551 		volume->file_system_name = get_file_system_name(layerFSName);
7552 		if (volume->file_system_name == NULL) {
7553 			status = B_NO_MEMORY;
7554 			free(volume);
7555 			goto err1;
7556 		}
7557 
7558 		volume->file_system = get_file_system(layerFSName);
7559 		if (volume->file_system == NULL) {
7560 			status = B_DEVICE_NOT_FOUND;
7561 			free(volume->file_system_name);
7562 			free(volume);
7563 			goto err1;
7564 		}
7565 
7566 		if (mount->volume == NULL)
7567 			mount->volume = volume;
7568 		else {
7569 			volume->super_volume = mount->volume;
7570 			mount->volume->sub_volume = volume;
7571 			mount->volume = volume;
7572 		}
7573 	}
7574 
7575 	// insert mount struct into list before we call FS's mount() function
7576 	// so that vnodes can be created for this mount
7577 	rw_lock_write_lock(&sMountLock);
7578 	sMountsTable->Insert(mount);
7579 	rw_lock_write_unlock(&sMountLock);
7580 
7581 	ino_t rootID;
7582 
7583 	if (!sRoot) {
7584 		// we haven't mounted anything yet
7585 		if (strcmp(path, "/") != 0) {
7586 			status = B_ERROR;
7587 			goto err2;
7588 		}
7589 
7590 		status = mount->volume->file_system->mount(mount->volume, device, flags,
7591 			args, &rootID);
7592 		if (status != B_OK || mount->volume->ops == NULL)
7593 			goto err2;
7594 	} else {
7595 		status = path_to_vnode(path, true, &coveredNode, NULL, kernel);
7596 		if (status != B_OK)
7597 			goto err2;
7598 
7599 		mount->covers_vnode = coveredNode;
7600 
		// make sure coveredNode is a directory
7602 		if (!S_ISDIR(coveredNode->Type())) {
7603 			status = B_NOT_A_DIRECTORY;
7604 			goto err3;
7605 		}
7606 
7607 		if (coveredNode->IsCovered()) {
7608 			// this is already a covered vnode
7609 			status = B_BUSY;
7610 			goto err3;
7611 		}
7612 
		// mount the volume(s)
7614 		fs_volume* volume = mount->volume;
7615 		while (volume) {
7616 			status = volume->file_system->mount(volume, device, flags, args,
7617 				&rootID);
7618 			if (status != B_OK || volume->ops == NULL) {
7619 				if (status == B_OK && volume->ops == NULL)
7620 					panic("fs_mount: mount() succeeded but ops is NULL!");
7621 				if (volume->sub_volume)
7622 					goto err4;
7623 				goto err3;
7624 			}
7625 
7626 			volume = volume->super_volume;
7627 		}
7628 
7629 		volume = mount->volume;
7630 		while (volume) {
7631 			if (volume->ops->all_layers_mounted != NULL)
7632 				volume->ops->all_layers_mounted(volume);
7633 			volume = volume->super_volume;
7634 		}
7635 	}
7636 
7637 	// the root node is supposed to be owned by the file system - it must
7638 	// exist at this point
7639 	mount->root_vnode = lookup_vnode(mount->id, rootID);
7640 	if (mount->root_vnode == NULL || mount->root_vnode->ref_count != 1) {
7641 		panic("fs_mount: file system does not own its root node!\n");
7642 		status = B_ERROR;
7643 		goto err4;
7644 	}
7645 
7646 	// set up the links between the root vnode and the vnode it covers
7647 	rw_lock_write_lock(&sVnodeLock);
7648 	if (coveredNode != NULL) {
7649 		if (coveredNode->IsCovered()) {
7650 			// the vnode is covered now
7651 			status = B_BUSY;
7652 			rw_lock_write_unlock(&sVnodeLock);
7653 			goto err4;
7654 		}
7655 
7656 		mount->root_vnode->covers = coveredNode;
7657 		mount->root_vnode->SetCovering(true);
7658 
7659 		coveredNode->covered_by = mount->root_vnode;
7660 		coveredNode->SetCovered(true);
7661 	}
7662 	rw_lock_write_unlock(&sVnodeLock);
7663 
7664 	if (!sRoot) {
7665 		sRoot = mount->root_vnode;
7666 		mutex_lock(&sIOContextRootLock);
7667 		get_current_io_context(true)->root = sRoot;
7668 		mutex_unlock(&sIOContextRootLock);
7669 		inc_vnode_ref_count(sRoot);
7670 	}
7671 
7672 	// supply the partition (if any) with the mount cookie and mark it mounted
7673 	if (partition) {
7674 		partition->SetMountCookie(mount->volume->private_volume);
7675 		partition->SetVolumeID(mount->id);
7676 
7677 		// keep a partition reference as long as the partition is mounted
7678 		partitionRegistrar.Detach();
7679 		mount->partition = partition;
7680 		mount->owns_file_device = newlyCreatedFileDevice;
7681 		fileDeviceDeleter.id = -1;
7682 	}
7683 
7684 	notify_mount(mount->id,
7685 		coveredNode != NULL ? coveredNode->device : -1,
7686 		coveredNode ? coveredNode->id : -1);
7687 
7688 	return mount->id;
7689 
7690 err4:
7691 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7692 err3:
7693 	if (coveredNode != NULL)
7694 		put_vnode(coveredNode);
7695 err2:
7696 	rw_lock_write_lock(&sMountLock);
7697 	sMountsTable->Remove(mount);
7698 	rw_lock_write_unlock(&sMountLock);
7699 err1:
7700 	delete mount;
7701 
7702 	return status;
7703 }
7704 
7705 
7706 static status_t
7707 fs_unmount(char* path, dev_t mountID, uint32 flags, bool kernel)
7708 {
7709 	struct fs_mount* mount;
7710 	status_t err;
7711 
7712 	FUNCTION(("fs_unmount(path '%s', dev %" B_PRId32 ", kernel %d)\n", path,
7713 		mountID, kernel));
7714 
7715 	struct vnode* pathVnode = NULL;
7716 	if (path != NULL) {
7717 		err = path_to_vnode(path, true, &pathVnode, NULL, kernel);
7718 		if (err != B_OK)
7719 			return B_ENTRY_NOT_FOUND;
7720 	}
7721 
7722 	RecursiveLocker mountOpLocker(sMountOpLock);
7723 	ReadLocker mountLocker(sMountLock);
7724 
7725 	mount = find_mount(path != NULL ? pathVnode->device : mountID);
7726 	if (mount == NULL) {
7727 		panic("fs_unmount: find_mount() failed on root vnode @%p of mount\n",
7728 			pathVnode);
7729 	}
7730 
7731 	mountLocker.Unlock();
7732 
7733 	if (path != NULL) {
7734 		put_vnode(pathVnode);
7735 
7736 		if (mount->root_vnode != pathVnode) {
7737 			// not mountpoint
7738 			return B_BAD_VALUE;
7739 		}
7740 	}
7741 
7742 	// if the volume is associated with a partition, lock the device of the
7743 	// partition as long as we are unmounting
7744 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7745 	KPartition* partition = mount->partition;
7746 	KDiskDevice* diskDevice = NULL;
7747 	if (partition != NULL) {
7748 		if (partition->Device() == NULL) {
7749 			dprintf("fs_unmount(): There is no device!\n");
7750 			return B_ERROR;
7751 		}
7752 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7753 		if (!diskDevice) {
7754 			TRACE(("fs_unmount(): Failed to lock disk device!\n"));
7755 			return B_ERROR;
7756 		}
7757 	}
7758 	DeviceWriteLocker writeLocker(diskDevice, true);
7759 
7760 	// make sure that the partition is not busy
7761 	if (partition != NULL) {
7762 		if ((flags & B_UNMOUNT_BUSY_PARTITION) == 0 && partition->IsBusy()) {
7763 			TRACE(("fs_unmount(): Partition is busy.\n"));
7764 			return B_BUSY;
7765 		}
7766 	}
7767 
7768 	// grab the vnode lock (write access) to keep someone from creating
7769 	// a vnode while we're figuring out if we can continue
7770 	WriteLocker vnodesWriteLocker(&sVnodeLock);
7771 
7772 	bool disconnectedDescriptors = false;
7773 
7774 	while (true) {
7775 		bool busy = false;
7776 
7777 		// cycle through the list of vnodes associated with this mount and
7778 		// make sure none of them is busy or still referenced
7779 		VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7780 		while (struct vnode* vnode = iterator.Next()) {
7781 			if (vnode->IsBusy()) {
7782 				busy = true;
7783 				break;
7784 			}
7785 
7786 			// check the vnode's ref count -- subtract additional references for
7787 			// covering
7788 			int32 refCount = vnode->ref_count;
7789 			if (vnode->covers != NULL)
7790 				refCount--;
7791 			if (vnode->covered_by != NULL)
7792 				refCount--;
7793 
7794 			if (refCount != 0) {
7795 				// there are still vnodes in use on this mount, so we cannot
7796 				// unmount yet
7797 				busy = true;
7798 				break;
7799 			}
7800 		}
7801 
7802 		if (!busy)
7803 			break;
7804 
7805 		if ((flags & B_FORCE_UNMOUNT) == 0)
7806 			return B_BUSY;
7807 
7808 		if (disconnectedDescriptors) {
7809 			// wait a bit until the last access is finished, and then try again
7810 			vnodesWriteLocker.Unlock();
7811 			snooze(100000);
7812 			// TODO: if there is some kind of bug that prevents the ref counts
7813 			// from getting back to zero, this will fall into an endless loop...
7814 			vnodesWriteLocker.Lock();
7815 			continue;
7816 		}
7817 
7818 		// the file system is still busy - but we're forced to unmount it,
7819 		// so let's disconnect all open file descriptors
7820 
7821 		mount->unmounting = true;
7822 			// prevent new vnodes from being created
7823 
7824 		vnodesWriteLocker.Unlock();
7825 
7826 		disconnect_mount_or_vnode_fds(mount, NULL);
7827 		disconnectedDescriptors = true;
7828 
7829 		vnodesWriteLocker.Lock();
7830 	}
7831 
7832 	// We can safely continue. Mark all of the vnodes busy and this mount
7833 	// structure in unmounting state. Also undo the vnode covers/covered_by
7834 	// links.
7835 	mount->unmounting = true;
7836 
7837 	VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7838 	while (struct vnode* vnode = iterator.Next()) {
7839 		// Remove all covers/covered_by links from other mounts' nodes to this
7840 		// vnode and adjust the node ref count accordingly. We will release the
7841 		// references to the external vnodes below.
7842 		if (Vnode* coveredNode = vnode->covers) {
7843 			if (Vnode* coveringNode = vnode->covered_by) {
7844 				// We have both covered and covering vnodes, so just remove us
7845 				// from the chain.
7846 				coveredNode->covered_by = coveringNode;
7847 				coveringNode->covers = coveredNode;
7848 				vnode->ref_count -= 2;
7849 
7850 				vnode->covered_by = NULL;
7851 				vnode->covers = NULL;
7852 				vnode->SetCovering(false);
7853 				vnode->SetCovered(false);
7854 			} else {
7855 				// We only have a covered vnode. Remove its link to us.
7856 				coveredNode->covered_by = NULL;
7857 				coveredNode->SetCovered(false);
7858 				vnode->ref_count--;
7859 
7860 				// If the other node is an external vnode, we keep its link
7861 				// around so we can put the reference later on. Otherwise
7862 				// we get rid of it right now.
7863 				if (coveredNode->mount == mount) {
7864 					vnode->covers = NULL;
7865 					coveredNode->ref_count--;
7866 				}
7867 			}
7868 		} else if (Vnode* coveringNode = vnode->covered_by) {
7869 			// We only have a covering vnode. Remove its link to us.
7870 			coveringNode->covers = NULL;
7871 			coveringNode->SetCovering(false);
7872 			vnode->ref_count--;
7873 
7874 			// If the other node is an external vnode, we keep its link
7875 			// around so we can put the reference later on. Otherwise
7876 			// we get rid of it right now.
7877 			if (coveringNode->mount == mount) {
7878 				vnode->covered_by = NULL;
7879 				coveringNode->ref_count--;
7880 			}
7881 		}
7882 
7883 		vnode->SetBusy(true);
7884 		vnode_to_be_freed(vnode);
7885 	}
7886 
7887 	vnodesWriteLocker.Unlock();
7888 
7889 	// Free all vnodes associated with this mount.
7890 	// They will be removed from the mount list by free_vnode(), so
7891 	// we don't have to do that ourselves.
7892 	while (struct vnode* vnode = mount->vnodes.Head()) {
7893 		// Put the references to external covered/covering vnodes we kept above.
7894 		if (Vnode* coveredNode = vnode->covers)
7895 			put_vnode(coveredNode);
7896 		if (Vnode* coveringNode = vnode->covered_by)
7897 			put_vnode(coveringNode);
7898 
7899 		free_vnode(vnode, false);
7900 	}
7901 
7902 	// remove the mount structure from the hash table
7903 	rw_lock_write_lock(&sMountLock);
7904 	sMountsTable->Remove(mount);
7905 	rw_lock_write_unlock(&sMountLock);
7906 
7907 	mountOpLocker.Unlock();
7908 
7909 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7910 	notify_unmount(mount->id);
7911 
7912 	// dereference the partition and mark it unmounted
7913 	if (partition) {
7914 		partition->SetVolumeID(-1);
7915 		partition->SetMountCookie(NULL);
7916 
7917 		if (mount->owns_file_device)
7918 			KDiskDeviceManager::Default()->DeleteFileDevice(partition->ID());
7919 		partition->Unregister();
7920 	}
7921 
7922 	delete mount;
7923 	return B_OK;
7924 }
7925 
7926 
7927 static status_t
7928 fs_sync(dev_t device)
7929 {
7930 	struct fs_mount* mount;
7931 	status_t status = get_mount(device, &mount);
7932 	if (status != B_OK)
7933 		return status;
7934 
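	// A stack-allocated marker vnode remembers our position in the mount's
	// vnode list, so that the locks can be dropped while a node's cache is
	// flushed and the iteration can be resumed afterwards.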
7935 	struct vnode marker;
7936 	memset(&marker, 0, sizeof(marker));
7937 	marker.SetBusy(true);
7938 	marker.SetRemoved(true);
7939 
7940 	// First, synchronize all file caches
7941 
7942 	while (true) {
7943 		WriteLocker locker(sVnodeLock);
7944 			// Note: That's the easy way. Which is probably OK for sync(),
7945 			// since it's a relatively rare call and doesn't need to allow for
7946 			// a lot of concurrency. Using a read lock would be possible, but
7947 			// also more involved, since we would have to lock the individual
7948 			// nodes and take care of the locking order, which we might not
7949 			// want to do while holding fs_mount::lock.
7950 
7951 		// synchronize access to vnode list
7952 		mutex_lock(&mount->lock);
7953 
7954 		struct vnode* vnode;
7955 		if (!marker.IsRemoved()) {
7956 			vnode = mount->vnodes.GetNext(&marker);
7957 			mount->vnodes.Remove(&marker);
7958 			marker.SetRemoved(true);
7959 		} else
7960 			vnode = mount->vnodes.First();
7961 
7962 		while (vnode != NULL && (vnode->cache == NULL
7963 			|| vnode->IsRemoved() || vnode->IsBusy())) {
7964 			// TODO: we could track writes (and writable mapped vnodes)
7965 			//	and have a simple flag that we could test for here
7966 			vnode = mount->vnodes.GetNext(vnode);
7967 		}
7968 
7969 		if (vnode != NULL) {
7970 			// insert marker vnode again
7971 			mount->vnodes.Insert(mount->vnodes.GetNext(vnode), &marker);
7972 			marker.SetRemoved(false);
7973 		}
7974 
7975 		mutex_unlock(&mount->lock);
7976 
7977 		if (vnode == NULL)
7978 			break;
7979 
7980 		vnode = lookup_vnode(mount->id, vnode->id);
7981 		if (vnode == NULL || vnode->IsBusy())
7982 			continue;
7983 
7984 		if (vnode->ref_count == 0) {
7985 			// this vnode has been unused before
7986 			vnode_used(vnode);
7987 		}
7988 		inc_vnode_ref_count(vnode);
7989 
7990 		locker.Unlock();
7991 
7992 		if (vnode->cache != NULL && !vnode->IsRemoved())
7993 			vnode->cache->WriteModified();
7994 
7995 		put_vnode(vnode);
7996 	}
7997 
7998 	// Let the file systems do their synchronizing work
7999 	if (HAS_FS_MOUNT_CALL(mount, sync))
8000 		status = FS_MOUNT_CALL_NO_PARAMS(mount, sync);
8001 
8002 	// Finally, flush the underlying device's write cache (if possible).
8003 	if (mount->partition != NULL && mount->partition->Device() != NULL)
8004 		ioctl(mount->partition->Device()->FD(), B_FLUSH_DRIVE_CACHE);
8005 
8006 	put_mount(mount);
8007 	return status;
8008 }
8009 
8010 
8011 static status_t
8012 fs_read_info(dev_t device, struct fs_info* info)
8013 {
8014 	struct fs_mount* mount;
8015 	status_t status = get_mount(device, &mount);
8016 	if (status != B_OK)
8017 		return status;
8018 
8019 	memset(info, 0, sizeof(struct fs_info));
8020 
8021 	if (HAS_FS_MOUNT_CALL(mount, read_fs_info))
8022 		status = FS_MOUNT_CALL(mount, read_fs_info, info);
8023 
8024 	// fill in info the file system doesn't (have to) know about
8025 	if (status == B_OK) {
8026 		info->dev = mount->id;
8027 		info->root = mount->root_vnode->id;
8028 
8029 		fs_volume* volume = mount->volume;
8030 		while (volume->super_volume != NULL)
8031 			volume = volume->super_volume;
8032 
8033 		strlcpy(info->fsh_name, volume->file_system_name,
8034 			sizeof(info->fsh_name));
8035 		if (mount->device_name != NULL) {
8036 			strlcpy(info->device_name, mount->device_name,
8037 				sizeof(info->device_name));
8038 		}
8039 	}
8040 
8041 	// even if the call is not supported by the file system, the caller
8042 	// still gets the parts that we filled out ourselves
8043 
8044 	put_mount(mount);
8045 	return status;
8046 }
8047 
8048 
8049 static status_t
8050 fs_write_info(dev_t device, const struct fs_info* info, int mask)
8051 {
8052 	struct fs_mount* mount;
8053 	status_t status = get_mount(device, &mount);
8054 	if (status != B_OK)
8055 		return status;
8056 
8057 	if (HAS_FS_MOUNT_CALL(mount, write_fs_info))
8058 		status = FS_MOUNT_CALL(mount, write_fs_info, info, mask);
8059 	else
8060 		status = B_READ_ONLY_DEVICE;
8061 
8062 	put_mount(mount);
8063 	return status;
8064 }
8065 
8066 
8067 static dev_t
8068 fs_next_device(int32* _cookie)
8069 {
8070 	struct fs_mount* mount = NULL;
8071 	dev_t device = *_cookie;
8072 
8073 	rw_lock_read_lock(&sMountLock);
8074 
8075 	// Since device IDs are assigned sequentially, this algorithm
8076 	// works well enough. It makes sure that the device list
8077 	// returned is sorted, and that no device is skipped when an
8078 	// already visited device has been unmounted.
8079 
8080 	while (device < sNextMountID) {
8081 		mount = find_mount(device++);
8082 		if (mount != NULL && mount->volume->private_volume != NULL)
8083 			break;
8084 	}
8085 
8086 	*_cookie = device;
8087 
8088 	if (mount != NULL)
8089 		device = mount->id;
8090 	else
8091 		device = B_BAD_VALUE;
8092 
8093 	rw_lock_read_unlock(&sMountLock);
8094 
8095 	return device;
8096 }
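

// Iteration sketch (illustrative only): starting from a zeroed cookie, each
// call yields the next mounted device ID in ascending order until the
// function returns B_BAD_VALUE.
//
//	int32 cookie = 0;
//	dev_t device;
//	while ((device = fs_next_device(&cookie)) >= 0)
//		dprintf("mounted volume: %" B_PRIdDEV "\n", device);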
8097 
8098 
8099 ssize_t
8100 fs_read_attr(int fd, const char *attribute, uint32 type, off_t pos,
8101 	void *buffer, size_t readBytes)
8102 {
8103 	int attrFD = attr_open(fd, NULL, attribute, O_RDONLY, true);
8104 	if (attrFD < 0)
8105 		return attrFD;
8106 
8107 	ssize_t bytesRead = _kern_read(attrFD, pos, buffer, readBytes);
8108 
8109 	_kern_close(attrFD);
8110 
8111 	return bytesRead;
8112 }
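

// Usage sketch (illustrative only; the attribute name is just an example,
// and "fd" is assumed to be a valid node FD). Note that the "type"
// parameter is not checked by this helper -- the raw attribute bytes are
// read as-is.
//
//	char buffer[256];
//	ssize_t bytesRead = fs_read_attr(fd, "BEOS:TYPE", 0 /* type unused */, 0,
//		buffer, sizeof(buffer) - 1);
//	if (bytesRead >= 0)
//		buffer[bytesRead] = '\0';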
8113 
8114 
8115 static status_t
8116 get_cwd(char* buffer, size_t size, bool kernel)
8117 {
8118 	// Get current working directory from io context
8119 	struct io_context* context = get_current_io_context(kernel);
8120 	status_t status;
8121 
8122 	FUNCTION(("get_cwd: buf %p, size %ld\n", buffer, size));
8123 
8124 	mutex_lock(&context->io_mutex);
8125 
8126 	struct vnode* vnode = context->cwd;
8127 	if (vnode)
8128 		inc_vnode_ref_count(vnode);
8129 
8130 	mutex_unlock(&context->io_mutex);
8131 
8132 	if (vnode) {
8133 		status = dir_vnode_to_path(vnode, buffer, size, kernel);
8134 		put_vnode(vnode);
8135 	} else
8136 		status = B_ERROR;
8137 
8138 	return status;
8139 }
8140 
8141 
8142 static status_t
8143 set_cwd(int fd, char* path, bool kernel)
8144 {
8145 	struct io_context* context;
8146 	struct vnode* vnode = NULL;
8147 	struct vnode* oldDirectory;
8148 	status_t status;
8149 
8150 	FUNCTION(("set_cwd: path = \'%s\'\n", path));
8151 
8152 	// Get vnode for passed path, and bail if it failed
8153 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
8154 	if (status < 0)
8155 		return status;
8156 
8157 	if (!S_ISDIR(vnode->Type())) {
8158 		// nope, can't cwd to here
8159 		status = B_NOT_A_DIRECTORY;
8160 		goto err;
8161 	}
8162 
8163 	// We need to have the permission to enter the directory, too
8164 	if (HAS_FS_CALL(vnode, access)) {
8165 		status = FS_CALL(vnode, access, X_OK);
8166 		if (status != B_OK)
8167 			goto err;
8168 	}
8169 
8170 	// Get current io context and lock
8171 	context = get_current_io_context(kernel);
8172 	mutex_lock(&context->io_mutex);
8173 
8174 	// save the old current working directory first
8175 	oldDirectory = context->cwd;
8176 	context->cwd = vnode;
8177 
8178 	mutex_unlock(&context->io_mutex);
8179 
8180 	if (oldDirectory)
8181 		put_vnode(oldDirectory);
8182 
8183 	return B_NO_ERROR;
8184 
8185 err:
8186 	put_vnode(vnode);
8187 	return status;
8188 }
8189 
8190 
8191 static status_t
8192 user_copy_name(char* to, const char* from, size_t length)
8193 {
8194 	ssize_t len = user_strlcpy(to, from, length);
8195 	if (len < 0)
8196 		return len;
8197 	if (len >= (ssize_t)length)
8198 		return B_NAME_TOO_LONG;
8199 	return B_OK;
8200 }
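

// Behavior sketch: unlike a plain user_strlcpy(), truncation is treated as
// an error here.
//
//	char name[B_FILE_NAME_LENGTH];
//	status_t status = user_copy_name(name, userName, sizeof(name));
//	// B_OK:            "name" holds the complete, NUL-terminated string
//	// B_NAME_TOO_LONG: the source string did not fit and was rejected
//	// else:            the copy from userland faulted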
8201 
8202 
8203 //	#pragma mark - kernel mirrored syscalls
8204 
8205 
8206 dev_t
8207 _kern_mount(const char* path, const char* device, const char* fsName,
8208 	uint32 flags, const char* args, size_t argsLength)
8209 {
8210 	KPath pathBuffer(path);
8211 	if (pathBuffer.InitCheck() != B_OK)
8212 		return B_NO_MEMORY;
8213 
8214 	return fs_mount(pathBuffer.LockBuffer(), device, fsName, flags, args, true);
8215 }
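

// Usage sketch (kernel context; mount point, device path, and FS name are
// illustrative):
//
//	dev_t volume = _kern_mount("/mnt/usb", "/dev/disk/usb/0/0/raw", "bfs",
//		0, NULL, 0);
//	if (volume < 0)
//		dprintf("mount failed: %s\n", strerror(volume));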
8216 
8217 
8218 status_t
8219 _kern_unmount(const char* path, uint32 flags)
8220 {
8221 	KPath pathBuffer(path);
8222 	if (pathBuffer.InitCheck() != B_OK)
8223 		return B_NO_MEMORY;
8224 
8225 	return fs_unmount(pathBuffer.LockBuffer(), -1, flags, true);
8226 }
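

// Usage sketch (the path is illustrative). With B_FORCE_UNMOUNT,
// fs_unmount() disconnects any remaining open file descriptors instead of
// failing with B_BUSY.
//
//	status_t error = _kern_unmount("/mnt/usb", B_FORCE_UNMOUNT);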
8227 
8228 
8229 status_t
8230 _kern_read_fs_info(dev_t device, struct fs_info* info)
8231 {
8232 	if (info == NULL)
8233 		return B_BAD_VALUE;
8234 
8235 	return fs_read_info(device, info);
8236 }
8237 
8238 
8239 status_t
8240 _kern_write_fs_info(dev_t device, const struct fs_info* info, int mask)
8241 {
8242 	if (info == NULL)
8243 		return B_BAD_VALUE;
8244 
8245 	return fs_write_info(device, info, mask);
8246 }
8247 
8248 
8249 status_t
8250 _kern_sync(void)
8251 {
8252 	// Note: _kern_sync() is also called from _user_sync()
8253 	int32 cookie = 0;
8254 	dev_t device;
8255 	while ((device = next_dev(&cookie)) >= 0) {
8256 		status_t status = fs_sync(device);
8257 		if (status != B_OK && status != B_BAD_VALUE) {
8258 			dprintf("sync: device %" B_PRIdDEV " couldn't sync: %s\n", device,
8259 				strerror(status));
8260 		}
8261 	}
8262 
8263 	return B_OK;
8264 }
8265 
8266 
8267 dev_t
8268 _kern_next_device(int32* _cookie)
8269 {
8270 	return fs_next_device(_cookie);
8271 }
8272 
8273 
8274 status_t
8275 _kern_get_next_fd_info(team_id teamID, uint32* _cookie, fd_info* info,
8276 	size_t infoSize)
8277 {
8278 	if (infoSize != sizeof(fd_info))
8279 		return B_BAD_VALUE;
8280 
8281 	// get the team
8282 	Team* team = Team::Get(teamID);
8283 	if (team == NULL)
8284 		return B_BAD_TEAM_ID;
8285 	BReference<Team> teamReference(team, true);
8286 
8287 	// now that we have a team reference, its I/O context won't go away
8288 	io_context* context = team->io_context;
8289 	MutexLocker contextLocker(context->io_mutex);
8290 
8291 	uint32 slot = *_cookie;
8292 
8293 	struct file_descriptor* descriptor;
8294 	while (slot < context->table_size
8295 		&& (descriptor = context->fds[slot]) == NULL) {
8296 		slot++;
8297 	}
8298 
8299 	if (slot >= context->table_size)
8300 		return B_ENTRY_NOT_FOUND;
8301 
8302 	info->number = slot;
8303 	info->open_mode = descriptor->open_mode;
8304 
8305 	struct vnode* vnode = fd_vnode(descriptor);
8306 	if (vnode != NULL) {
8307 		info->device = vnode->device;
8308 		info->node = vnode->id;
8309 	} else if (descriptor->u.mount != NULL) {
8310 		info->device = descriptor->u.mount->id;
8311 		info->node = -1;
8312 	}
8313 
8314 	*_cookie = slot + 1;
8315 	return B_OK;
8316 }
8317 
8318 
8319 int
8320 _kern_open_entry_ref(dev_t device, ino_t inode, const char* name, int openMode,
8321 	int perms)
8322 {
8323 	if ((openMode & O_CREAT) != 0) {
8324 		return file_create_entry_ref(device, inode, name, openMode, perms,
8325 			true);
8326 	}
8327 
8328 	return file_open_entry_ref(device, inode, name, openMode, true);
8329 }
8330 
8331 
8332 /*!	\brief Opens a node specified by a FD + path pair.
8333 
8334 	At least one of \a fd and \a path must be specified.
8335 	If only \a fd is given, the function opens the node identified by this
8336 	FD. If only a path is given, this path is opened. If both are given and
8337 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8338 	of the directory (!) identified by \a fd.
8339 
8340 	\param fd The FD. May be < 0.
8341 	\param path The absolute or relative path. May be \c NULL.
8342 	\param openMode The open mode.
8343 	\return A FD referring to the newly opened node, or an error code,
8344 			if an error occurs.
8345 */
8346 int
8347 _kern_open(int fd, const char* path, int openMode, int perms)
8348 {
8349 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8350 	if (pathBuffer.InitCheck() != B_OK)
8351 		return B_NO_MEMORY;
8352 
8353 	if ((openMode & O_CREAT) != 0)
8354 		return file_create(fd, pathBuffer.LockBuffer(), openMode, perms, true);
8355 
8356 	return file_open(fd, pathBuffer.LockBuffer(), openMode, true);
8357 }
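

// Usage sketch (kernel context; path, open mode, and permissions are
// illustrative; fd == -1 means the path is used on its own, per the doc
// comment above):
//
//	int fd = _kern_open(-1, "/var/log/example.log",
//		O_WRONLY | O_CREAT | O_APPEND, 0644);
//	if (fd >= 0)
//		_kern_close(fd);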
8358 
8359 
8360 /*!	\brief Opens a directory specified by entry_ref or node_ref.
8361 
8362 	The supplied name may be \c NULL, in which case directory identified
8363 	by \a device and \a inode will be opened. Otherwise \a device and
8364 	\a inode identify the parent directory of the directory to be opened
8365 	and \a name its entry name.
8366 
8367 	\param device If \a name is specified the ID of the device the parent
8368 		   directory of the directory to be opened resides on, otherwise
8369 		   the device of the directory itself.
8370 	\param inode If \a name is specified the node ID of the parent
8371 		   directory of the directory to be opened, otherwise node ID of the
8372 		   directory itself.
8373 	\param name The entry name of the directory to be opened. If \c NULL,
8374 		   the \a device + \a inode pair identify the node to be opened.
8375 	\return The FD of the newly opened directory or an error code, if
8376 			something went wrong.
8377 */
8378 int
8379 _kern_open_dir_entry_ref(dev_t device, ino_t inode, const char* name)
8380 {
8381 	return dir_open_entry_ref(device, inode, name, true);
8382 }
8383 
8384 
8385 /*!	\brief Opens a directory specified by a FD + path pair.
8386 
8387 	At least one of \a fd and \a path must be specified.
8388 	If only \a fd is given, the function opens the directory identified by this
8389 	FD. If only a path is given, this path is opened. If both are given and
8390 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8391 	of the directory (!) identified by \a fd.
8392 
8393 	\param fd The FD. May be < 0.
8394 	\param path The absolute or relative path. May be \c NULL.
8395 	\return A FD referring to the newly opened directory, or an error code,
8396 			if an error occurs.
8397 */
8398 int
8399 _kern_open_dir(int fd, const char* path)
8400 {
8401 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8402 	if (pathBuffer.InitCheck() != B_OK)
8403 		return B_NO_MEMORY;
8404 
8405 	return dir_open(fd, pathBuffer.LockBuffer(), true);
8406 }
8407 
8408 
8409 status_t
8410 _kern_fcntl(int fd, int op, size_t argument)
8411 {
8412 	return common_fcntl(fd, op, argument, true);
8413 }
8414 
8415 
8416 status_t
8417 _kern_fsync(int fd)
8418 {
8419 	return common_sync(fd, true);
8420 }
8421 
8422 
8423 status_t
8424 _kern_lock_node(int fd)
8425 {
8426 	return common_lock_node(fd, true);
8427 }
8428 
8429 
8430 status_t
8431 _kern_unlock_node(int fd)
8432 {
8433 	return common_unlock_node(fd, true);
8434 }
8435 
8436 
8437 status_t
8438 _kern_create_dir_entry_ref(dev_t device, ino_t inode, const char* name,
8439 	int perms)
8440 {
8441 	return dir_create_entry_ref(device, inode, name, perms, true);
8442 }
8443 
8444 
8445 /*!	\brief Creates a directory specified by a FD + path pair.
8446 
8447 	\a path must always be specified (it contains the name of the new directory
8448 	at least). If only a path is given, this path identifies the location at
8449 	which the directory shall be created. If both \a fd and \a path are given
8450 	and the path is absolute, \a fd is ignored; a relative path is reckoned off
8451 	of the directory (!) identified by \a fd.
8452 
8453 	\param fd The FD. May be < 0.
8454 	\param path The absolute or relative path. Must not be \c NULL.
8455 	\param perms The access permissions the new directory shall have.
8456 	\return \c B_OK, if the directory has been created successfully, another
8457 			error code otherwise.
8458 */
8459 status_t
8460 _kern_create_dir(int fd, const char* path, int perms)
8461 {
8462 	KPath pathBuffer(path, KPath::DEFAULT);
8463 	if (pathBuffer.InitCheck() != B_OK)
8464 		return B_NO_MEMORY;
8465 
8466 	return dir_create(fd, pathBuffer.LockBuffer(), perms, true);
8467 }
8468 
8469 
8470 status_t
8471 _kern_remove_dir(int fd, const char* path)
8472 {
8473 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8474 	if (pathBuffer.InitCheck() != B_OK)
8475 		return B_NO_MEMORY;
8476 
8477 	return dir_remove(fd, pathBuffer.LockBuffer(), true);
8478 }
8479 
8480 
8481 /*!	\brief Reads the contents of a symlink referred to by a FD + path pair.
8482 
8483 	At least one of \a fd and \a path must be specified.
8484 	If only \a fd is given, the symlink to be read is the node identified
8485 	by this FD. If only a path is given, this path identifies the
8486 	symlink to be read. If both are given and the path is absolute, \a fd is
8487 	ignored; a relative path is reckoned off of the directory (!) identified
8488 	by \a fd.
8489 	If this function fails with B_BUFFER_OVERFLOW, the \a _bufferSize pointer
8490 	will still be updated to reflect the required buffer size.
8491 
8492 	\param fd The FD. May be < 0.
8493 	\param path The absolute or relative path. May be \c NULL.
8494 	\param buffer The buffer into which the contents of the symlink shall be
8495 		   written.
8496 	\param _bufferSize A pointer to the size of the supplied buffer.
8497 	\return \c B_OK on success, or an appropriate error code otherwise.
8498 */
8499 status_t
8500 _kern_read_link(int fd, const char* path, char* buffer, size_t* _bufferSize)
8501 {
8502 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8503 	if (pathBuffer.InitCheck() != B_OK)
8504 		return B_NO_MEMORY;
8505 
8506 	return common_read_link(fd, pathBuffer.LockBuffer(),
8507 		buffer, _bufferSize, true);
8508 }
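

// Usage sketch; as noted in the doc comment above, *_bufferSize is updated
// even on B_BUFFER_OVERFLOW, so a caller could retry with a sufficiently
// large buffer (the path is illustrative):
//
//	char target[B_PATH_NAME_LENGTH];
//	size_t size = sizeof(target);
//	status_t error = _kern_read_link(-1, "/boot/home/link", target, &size);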
8509 
8510 
8511 /*!	\brief Creates a symlink specified by a FD + path pair.
8512 
8513 	\a path must always be specified (it contains the name of the new symlink
8514 	at least). If only a path is given, this path identifies the location at
8515 	which the symlink shall be created. If both \a fd and \a path are given and
8516 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8517 	of the directory (!) identified by \a fd.
8518 
8519 	\param fd The FD. May be < 0.
8520 	\param path The absolute or relative path. Must not be \c NULL.
	\param toPath The path the new symlink shall point to.
8521 	\param mode The access permissions the new symlink shall have.
8522 	\return \c B_OK, if the symlink has been created successfully, another
8523 			error code otherwise.
8524 */
8525 status_t
8526 _kern_create_symlink(int fd, const char* path, const char* toPath, int mode)
8527 {
8528 	KPath pathBuffer(path);
8529 	if (pathBuffer.InitCheck() != B_OK)
8530 		return B_NO_MEMORY;
8531 
8532 	return common_create_symlink(fd, pathBuffer.LockBuffer(),
8533 		toPath, mode, true);
8534 }
8535 
8536 
8537 status_t
8538 _kern_create_link(int pathFD, const char* path, int toFD, const char* toPath,
8539 	bool traverseLeafLink)
8540 {
8541 	KPath pathBuffer(path);
8542 	KPath toPathBuffer(toPath);
8543 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
8544 		return B_NO_MEMORY;
8545 
8546 	return common_create_link(pathFD, pathBuffer.LockBuffer(), toFD,
8547 		toPathBuffer.LockBuffer(), traverseLeafLink, true);
8548 }
8549 
8550 
8551 /*!	\brief Removes an entry specified by a FD + path pair from its directory.
8552 
8553 	\a path must always be specified (it contains at least the name of the entry
8554 	to be deleted). If only a path is given, this path identifies the entry
8555 	directly. If both \a fd and \a path are given and the path is absolute,
8556 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8557 	identified by \a fd.
8558 
8559 	\param fd The FD. May be < 0.
8560 	\param path The absolute or relative path. Must not be \c NULL.
8561 	\return \c B_OK, if the entry has been removed successfully, another
8562 			error code otherwise.
8563 */
8564 status_t
8565 _kern_unlink(int fd, const char* path)
8566 {
8567 	KPath pathBuffer(path);
8568 	if (pathBuffer.InitCheck() != B_OK)
8569 		return B_NO_MEMORY;
8570 
8571 	return common_unlink(fd, pathBuffer.LockBuffer(), true);
8572 }
8573 
8574 
8575 /*!	\brief Moves an entry specified by a FD + path pair to a an entry specified
8576 		   by another FD + path pair.
8577 
8578 	\a oldPath and \a newPath must always be specified (they contain at least
8579 	the name of the entry). If only a path is given, this path identifies the
8580 	entry directly. If both a FD and a path are given and the path is absolute,
8581 	the FD is ignored; a relative path is reckoned off of the directory (!)
8582 	identified by the respective FD.
8583 
8584 	\param oldFD The FD of the old location. May be < 0.
8585 	\param oldPath The absolute or relative path of the old location. Must not
8586 		   be \c NULL.
8587 	\param newFD The FD of the new location. May be < 0.
8588 	\param newPath The absolute or relative path of the new location. Must not
8589 		   be \c NULL.
8590 	\return \c B_OK, if the entry has been moved successfully, another
8591 			error code otherwise.
8592 */
8593 status_t
8594 _kern_rename(int oldFD, const char* oldPath, int newFD, const char* newPath)
8595 {
8596 	KPath oldPathBuffer(oldPath);
8597 	KPath newPathBuffer(newPath);
8598 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
8599 		return B_NO_MEMORY;
8600 
8601 	return common_rename(oldFD, oldPathBuffer.LockBuffer(),
8602 		newFD, newPathBuffer.LockBuffer(), true);
8603 }
8604 
8605 
8606 status_t
8607 _kern_access(int fd, const char* path, int mode, bool effectiveUserGroup)
8608 {
8609 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8610 	if (pathBuffer.InitCheck() != B_OK)
8611 		return B_NO_MEMORY;
8612 
8613 	return common_access(fd, pathBuffer.LockBuffer(), mode, effectiveUserGroup,
8614 		true);
8615 }
8616 
8617 
8618 /*!	\brief Reads stat data of an entity specified by a FD + path pair.
8619 
8620 	If only \a fd is given, the stat operation associated with the type
8621 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8622 	given, this path identifies the entry for whose node to retrieve the
8623 	stat data. If both \a fd and \a path are given and the path is absolute,
8624 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8625 	identified by \a fd and specifies the entry whose stat data shall be
8626 	retrieved.
8627 
8628 	\param fd The FD. May be < 0.
8629 	\param path The absolute or relative path. May be \c NULL.
8630 	\param traverseLeafLink If \a path is given, \c true specifies that the
8631 		   function shall not stick to symlinks, but traverse them.
8632 	\param stat The buffer the stat data shall be written into.
8633 	\param statSize The size of the supplied stat buffer.
8634 	\return \c B_OK, if the stat data have been read successfully, another
8635 			error code otherwise.
8636 */
8637 status_t
8638 _kern_read_stat(int fd, const char* path, bool traverseLeafLink,
8639 	struct stat* stat, size_t statSize)
8640 {
8641 	struct stat completeStat;
8642 	struct stat* originalStat = NULL;
8643 	status_t status;
8644 
8645 	if (statSize > sizeof(struct stat))
8646 		return B_BAD_VALUE;
8647 
8648 	// this supports different stat extensions
8649 	if (statSize < sizeof(struct stat)) {
8650 		originalStat = stat;
8651 		stat = &completeStat;
8652 	}
8653 
8654 	status = vfs_read_stat(fd, path, traverseLeafLink, stat, true);
8655 
8656 	if (status == B_OK && originalStat != NULL)
8657 		memcpy(originalStat, stat, statSize);
8658 
8659 	return status;
8660 }
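

// Usage sketch: statSize supports older callers with a smaller struct
// stat -- the code above then stats into a complete struct and copies back
// only the first statSize bytes. A current caller ("fd" assumed valid)
// simply passes sizeof():
//
//	struct stat st;
//	status_t error = _kern_read_stat(fd, NULL, false, &st, sizeof(st));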
8661 
8662 
8663 /*!	\brief Writes stat data of an entity specified by a FD + path pair.
8664 
8665 	If only \a fd is given, the stat operation associated with the type
8666 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8667 	given, this path identifies the entry for whose node to write the
8668 	stat data. If both \a fd and \a path are given and the path is absolute,
8669 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8670 	identified by \a fd and specifies the entry whose stat data shall be
8671 	written.
8672 
8673 	\param fd The FD. May be < 0.
8674 	\param path The absolute or relative path. May be \c NULL.
8675 	\param traverseLeafLink If \a path is given, \c true specifies that the
8676 		   function shall not stick to symlinks, but traverse them.
8677 	\param stat The buffer containing the stat data to be written.
8678 	\param statSize The size of the supplied stat buffer.
8679 	\param statMask A mask specifying which parts of the stat data shall be
8680 		   written.
8681 	\return \c B_OK, if the stat data have been written successfully,
8682 			another error code otherwise.
8683 */
8684 status_t
8685 _kern_write_stat(int fd, const char* path, bool traverseLeafLink,
8686 	const struct stat* stat, size_t statSize, int statMask)
8687 {
8688 	struct stat completeStat;
8689 
8690 	if (statSize > sizeof(struct stat))
8691 		return B_BAD_VALUE;
8692 
8693 	// this supports different stat extensions
8694 	if (statSize < sizeof(struct stat)) {
8695 		memset((uint8*)&completeStat + statSize, 0,
8696 			sizeof(struct stat) - statSize);
8697 		memcpy(&completeStat, stat, statSize);
8698 		stat = &completeStat;
8699 	}
8700 
8701 	status_t status;
8702 
8703 	if (path != NULL) {
8704 		// path given: write the stat of the node referred to by (fd, path)
8705 		KPath pathBuffer(path);
8706 		if (pathBuffer.InitCheck() != B_OK)
8707 			return B_NO_MEMORY;
8708 
8709 		status = common_path_write_stat(fd, pathBuffer.LockBuffer(),
8710 			traverseLeafLink, stat, statMask, true);
8711 	} else {
8712 		// no path given: get the FD and use the FD operation
8713 		struct file_descriptor* descriptor
8714 			= get_fd(get_current_io_context(true), fd);
8715 		if (descriptor == NULL)
8716 			return B_FILE_ERROR;
8717 
8718 		if (descriptor->ops->fd_write_stat)
8719 			status = descriptor->ops->fd_write_stat(descriptor, stat, statMask);
8720 		else
8721 			status = B_UNSUPPORTED;
8722 
8723 		put_fd(descriptor);
8724 	}
8725 
8726 	return status;
8727 }
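

// Usage sketch: only the fields selected by statMask are written; here
// just st_mode is applied ("fd" is assumed to be a valid node FD):
//
//	struct stat st;
//	st.st_mode = S_IRUSR | S_IWUSR;
//	status_t error = _kern_write_stat(fd, NULL, false, &st, sizeof(st),
//		B_STAT_MODE);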
8728 
8729 
8730 int
8731 _kern_open_attr_dir(int fd, const char* path, bool traverseLeafLink)
8732 {
8733 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8734 	if (pathBuffer.InitCheck() != B_OK)
8735 		return B_NO_MEMORY;
8736 
8737 	return attr_dir_open(fd, pathBuffer.LockBuffer(), traverseLeafLink, true);
8738 }
8739 
8740 
8741 int
8742 _kern_open_attr(int fd, const char* path, const char* name, uint32 type,
8743 	int openMode)
8744 {
8745 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8746 	if (pathBuffer.InitCheck() != B_OK)
8747 		return B_NO_MEMORY;
8748 
8749 	if ((openMode & O_CREAT) != 0) {
8750 		return attr_create(fd, pathBuffer.LockBuffer(), name, type, openMode,
8751 			true);
8752 	}
8753 
8754 	return attr_open(fd, pathBuffer.LockBuffer(), name, openMode, true);
8755 }
8756 
8757 
8758 status_t
8759 _kern_remove_attr(int fd, const char* name)
8760 {
8761 	return attr_remove(fd, name, true);
8762 }
8763 
8764 
8765 status_t
8766 _kern_rename_attr(int fromFile, const char* fromName, int toFile,
8767 	const char* toName)
8768 {
8769 	return attr_rename(fromFile, fromName, toFile, toName, true);
8770 }
8771 
8772 
8773 int
8774 _kern_open_index_dir(dev_t device)
8775 {
8776 	return index_dir_open(device, true);
8777 }
8778 
8779 
8780 status_t
8781 _kern_create_index(dev_t device, const char* name, uint32 type, uint32 flags)
8782 {
8783 	return index_create(device, name, type, flags, true);
8784 }
8785 
8786 
8787 status_t
8788 _kern_read_index_stat(dev_t device, const char* name, struct stat* stat)
8789 {
8790 	return index_name_read_stat(device, name, stat, true);
8791 }
8792 
8793 
8794 status_t
8795 _kern_remove_index(dev_t device, const char* name)
8796 {
8797 	return index_remove(device, name, true);
8798 }
8799 
8800 
8801 status_t
8802 _kern_getcwd(char* buffer, size_t size)
8803 {
8804 	TRACE(("_kern_getcwd: buf %p, %ld\n", buffer, size));
8805 
8806 	// Call vfs to get current working directory
8807 	return get_cwd(buffer, size, true);
8808 }
8809 
8810 
8811 status_t
8812 _kern_setcwd(int fd, const char* path)
8813 {
8814 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8815 	if (pathBuffer.InitCheck() != B_OK)
8816 		return B_NO_MEMORY;
8817 
8818 	return set_cwd(fd, pathBuffer.LockBuffer(), true);
8819 }
8820 
8821 
8822 //	#pragma mark - userland syscalls
8823 
8824 
8825 dev_t
8826 _user_mount(const char* userPath, const char* userDevice,
8827 	const char* userFileSystem, uint32 flags, const char* userArgs,
8828 	size_t argsLength)
8829 {
8830 	char fileSystem[B_FILE_NAME_LENGTH];
8831 	KPath path, device;
8832 	char* args = NULL;
8833 	status_t status;
8834 
8835 	if (!IS_USER_ADDRESS(userPath))
8836 		return B_BAD_ADDRESS;
8837 
8838 	if (path.InitCheck() != B_OK || device.InitCheck() != B_OK)
8839 		return B_NO_MEMORY;
8840 
8841 	status = user_copy_name(path.LockBuffer(), userPath,
8842 		B_PATH_NAME_LENGTH);
8843 	if (status != B_OK)
8844 		return status;
8845 	path.UnlockBuffer();
8846 
8847 	if (userFileSystem != NULL) {
8848 		if (!IS_USER_ADDRESS(userFileSystem))
8849 			return B_BAD_ADDRESS;
8850 
8851 		status = user_copy_name(fileSystem, userFileSystem, sizeof(fileSystem));
8852 		if (status != B_OK)
8853 			return status;
8854 	}
8855 
8856 	if (userDevice != NULL) {
8857 		if (!IS_USER_ADDRESS(userDevice))
8858 			return B_BAD_ADDRESS;
8859 
8860 		status = user_copy_name(device.LockBuffer(), userDevice,
8861 			B_PATH_NAME_LENGTH);
8862 		if (status != B_OK)
8863 			return status;
8864 		device.UnlockBuffer();
8865 	}
8866 
8867 	if (userArgs != NULL && argsLength > 0) {
8868 		if (!IS_USER_ADDRESS(userArgs))
8869 			return B_BAD_ADDRESS;
8870 
8871 		// this is a safety restriction
8872 		if (argsLength >= 65536)
8873 			return B_NAME_TOO_LONG;
8874 
8875 		args = (char*)malloc(argsLength + 1);
8876 		if (args == NULL)
8877 			return B_NO_MEMORY;
8878 
8879 		status = user_copy_name(args, userArgs, argsLength + 1);
8880 		if (status != B_OK) {
8881 			free(args);
8882 			return status;
8883 		}
8884 	}
8885 
8886 	status = fs_mount(path.LockBuffer(),
8887 		userDevice != NULL ? device.Path() : NULL,
8888 		userFileSystem ? fileSystem : NULL, flags, args, false);
8889 
8890 	free(args);
8891 	return status;
8892 }
8893 
8894 
8895 status_t
8896 _user_unmount(const char* userPath, uint32 flags)
8897 {
8898 	if (!IS_USER_ADDRESS(userPath))
8899 		return B_BAD_ADDRESS;
8900 
8901 	KPath pathBuffer;
8902 	if (pathBuffer.InitCheck() != B_OK)
8903 		return B_NO_MEMORY;
8904 
8905 	char* path = pathBuffer.LockBuffer();
8906 
8907 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
8908 	if (status != B_OK)
8909 		return status;
8910 
8911 	return fs_unmount(path, -1, flags & ~B_UNMOUNT_BUSY_PARTITION, false);
8912 }
8913 
8914 
8915 status_t
8916 _user_read_fs_info(dev_t device, struct fs_info* userInfo)
8917 {
8918 	struct fs_info info;
8919 	status_t status;
8920 
8921 	if (userInfo == NULL)
8922 		return B_BAD_VALUE;
8923 
8924 	if (!IS_USER_ADDRESS(userInfo))
8925 		return B_BAD_ADDRESS;
8926 
8927 	status = fs_read_info(device, &info);
8928 	if (status != B_OK)
8929 		return status;
8930 
8931 	if (user_memcpy(userInfo, &info, sizeof(struct fs_info)) != B_OK)
8932 		return B_BAD_ADDRESS;
8933 
8934 	return B_OK;
8935 }
8936 
8937 
8938 status_t
8939 _user_write_fs_info(dev_t device, const struct fs_info* userInfo, int mask)
8940 {
8941 	struct fs_info info;
8942 
8943 	if (userInfo == NULL)
8944 		return B_BAD_VALUE;
8945 
8946 	if (!IS_USER_ADDRESS(userInfo)
8947 		|| user_memcpy(&info, userInfo, sizeof(struct fs_info)) != B_OK)
8948 		return B_BAD_ADDRESS;
8949 
8950 	return fs_write_info(device, &info, mask);
8951 }
8952 
8953 
8954 dev_t
8955 _user_next_device(int32* _userCookie)
8956 {
8957 	int32 cookie;
8958 	dev_t device;
8959 
8960 	if (!IS_USER_ADDRESS(_userCookie)
8961 		|| user_memcpy(&cookie, _userCookie, sizeof(int32)) != B_OK)
8962 		return B_BAD_ADDRESS;
8963 
8964 	device = fs_next_device(&cookie);
8965 
8966 	if (device >= B_OK) {
8967 		// update user cookie
8968 		if (user_memcpy(_userCookie, &cookie, sizeof(int32)) != B_OK)
8969 			return B_BAD_ADDRESS;
8970 	}
8971 
8972 	return device;
8973 }
8974 
8975 
8976 status_t
8977 _user_sync(void)
8978 {
8979 	return _kern_sync();
8980 }
8981 
8982 
8983 status_t
8984 _user_get_next_fd_info(team_id team, uint32* userCookie, fd_info* userInfo,
8985 	size_t infoSize)
8986 {
8987 	struct fd_info info;
8988 	uint32 cookie;
8989 
8990 	// only root can do this
8991 	if (geteuid() != 0)
8992 		return B_NOT_ALLOWED;
8993 
8994 	if (infoSize != sizeof(fd_info))
8995 		return B_BAD_VALUE;
8996 
8997 	if (!IS_USER_ADDRESS(userCookie) || !IS_USER_ADDRESS(userInfo)
8998 		|| user_memcpy(&cookie, userCookie, sizeof(uint32)) != B_OK)
8999 		return B_BAD_ADDRESS;
9000 
9001 	status_t status = _kern_get_next_fd_info(team, &cookie, &info, infoSize);
9002 	if (status != B_OK)
9003 		return status;
9004 
9005 	if (user_memcpy(userCookie, &cookie, sizeof(uint32)) != B_OK
9006 		|| user_memcpy(userInfo, &info, infoSize) != B_OK)
9007 		return B_BAD_ADDRESS;
9008 
9009 	return status;
9010 }
9011 
9012 
9013 status_t
9014 _user_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
9015 	char* userPath, size_t pathLength)
9016 {
9017 	if (!IS_USER_ADDRESS(userPath))
9018 		return B_BAD_ADDRESS;
9019 
9020 	KPath path;
9021 	if (path.InitCheck() != B_OK)
9022 		return B_NO_MEMORY;
9023 
9024 	// copy the leaf name onto the stack
9025 	char stackLeaf[B_FILE_NAME_LENGTH];
9026 	if (leaf != NULL) {
9027 		if (!IS_USER_ADDRESS(leaf))
9028 			return B_BAD_ADDRESS;
9029 
9030 		status_t status = user_copy_name(stackLeaf, leaf, B_FILE_NAME_LENGTH);
9031 		if (status != B_OK)
9032 			return status;
9033 
9034 		leaf = stackLeaf;
9035 	}
9036 
9037 	status_t status = vfs_entry_ref_to_path(device, inode, leaf,
9038 		false, path.LockBuffer(), path.BufferSize());
9039 	if (status != B_OK)
9040 		return status;
9041 
9042 	path.UnlockBuffer();
9043 
9044 	int length = user_strlcpy(userPath, path.Path(), pathLength);
9045 	if (length < 0)
9046 		return length;
9047 	if (length >= (int)pathLength)
9048 		return B_BUFFER_OVERFLOW;
9049 
9050 	return B_OK;
9051 }
9052 
9053 
9054 status_t
9055 _user_normalize_path(const char* userPath, bool traverseLink, char* buffer)
9056 {
9057 	if (userPath == NULL || buffer == NULL)
9058 		return B_BAD_VALUE;
9059 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(buffer))
9060 		return B_BAD_ADDRESS;
9061 
9062 	// copy path from userland
9063 	KPath pathBuffer;
9064 	if (pathBuffer.InitCheck() != B_OK)
9065 		return B_NO_MEMORY;
9066 	char* path = pathBuffer.LockBuffer();
9067 
9068 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9069 	if (status != B_OK)
9070 		return status;
9071 
9072 	status_t error = normalize_path(path, pathBuffer.BufferSize(), traverseLink,
9073 		false);
9074 	if (error != B_OK)
9075 		return error;
9076 
9077 	// copy back to userland
9078 	int len = user_strlcpy(buffer, path, B_PATH_NAME_LENGTH);
9079 	if (len < 0)
9080 		return len;
9081 	if (len >= B_PATH_NAME_LENGTH)
9082 		return B_BUFFER_OVERFLOW;
9083 
9084 	return B_OK;
9085 }
9086 
9087 
9088 int
9089 _user_open_entry_ref(dev_t device, ino_t inode, const char* userName,
9090 	int openMode, int perms)
9091 {
9092 	char name[B_FILE_NAME_LENGTH];
9093 
9094 	if (userName == NULL || device < 0 || inode < 0)
9095 		return B_BAD_VALUE;
9096 	if (!IS_USER_ADDRESS(userName))
9097 		return B_BAD_ADDRESS;
9098 	status_t status = user_copy_name(name, userName, sizeof(name));
9099 	if (status != B_OK)
9100 		return status;
9101 
9102 	if ((openMode & O_CREAT) != 0) {
9103 		return file_create_entry_ref(device, inode, name, openMode, perms,
9104 			false);
9105 	}
9106 
9107 	return file_open_entry_ref(device, inode, name, openMode, false);
9108 }
9109 
9110 
9111 int
9112 _user_open(int fd, const char* userPath, int openMode, int perms)
9113 {
9114 	KPath path;
9115 	if (path.InitCheck() != B_OK)
9116 		return B_NO_MEMORY;
9117 
9118 	char* buffer = path.LockBuffer();
9119 
9120 	if (!IS_USER_ADDRESS(userPath))
9121 		return B_BAD_ADDRESS;
9122 	status_t status = user_copy_name(buffer, userPath, B_PATH_NAME_LENGTH);
9123 	if (status != B_OK)
9124 		return status;
9125 
9126 	if ((openMode & O_CREAT) != 0)
9127 		return file_create(fd, buffer, openMode, perms, false);
9128 
9129 	return file_open(fd, buffer, openMode, false);
9130 }
9131 
9132 
9133 int
9134 _user_open_dir_entry_ref(dev_t device, ino_t inode, const char* userName)
9135 {
9136 	if (userName != NULL) {
9137 		char name[B_FILE_NAME_LENGTH];
9138 
9139 		if (!IS_USER_ADDRESS(userName))
9140 			return B_BAD_ADDRESS;
9141 		status_t status = user_copy_name(name, userName, sizeof(name));
9142 		if (status != B_OK)
9143 			return status;
9144 
9145 		return dir_open_entry_ref(device, inode, name, false);
9146 	}
9147 	return dir_open_entry_ref(device, inode, NULL, false);
9148 }
9149 
9150 
9151 int
9152 _user_open_dir(int fd, const char* userPath)
9153 {
9154 	if (userPath == NULL)
9155 		return dir_open(fd, NULL, false);
9156 
9157 	KPath path;
9158 	if (path.InitCheck() != B_OK)
9159 		return B_NO_MEMORY;
9160 
9161 	char* buffer = path.LockBuffer();
9162 
9163 	if (!IS_USER_ADDRESS(userPath))
9164 		return B_BAD_ADDRESS;
9165 	status_t status = user_copy_name(buffer, userPath, B_PATH_NAME_LENGTH);
9166 	if (status != B_OK)
9167 		return status;
9168 
9169 	return dir_open(fd, buffer, false);
9170 }
9171 
9172 
9173 /*!	\brief Opens a directory's parent directory and returns the entry name
9174 		   of the former.
9175 
9176 	Apart from also returning the directory's entry name, this method is
9177 	equivalent to \code _user_open_dir(fd, "..") \endcode. It really is
9178 	equivalent if \a userName is \c NULL.
9179 
9180 	If a name buffer is supplied and the name does not fit the buffer, the
9181 	function fails. A buffer of size \c B_FILE_NAME_LENGTH should be safe.
9182 
9183 	\param fd A FD referring to a directory.
9184 	\param userName Buffer the directory's entry name shall be written into.
9185 		   May be \c NULL.
9186 	\param nameLength Size of the name buffer.
9187 	\return The file descriptor of the opened parent directory, if everything
9188 			went fine, an error code otherwise.
9189 */
9190 int
9191 _user_open_parent_dir(int fd, char* userName, size_t nameLength)
9192 {
9193 	bool kernel = false;
9194 
9195 	if (userName && !IS_USER_ADDRESS(userName))
9196 		return B_BAD_ADDRESS;
9197 
9198 	// open the parent dir
9199 	int parentFD = dir_open(fd, (char*)"..", kernel);
9200 	if (parentFD < 0)
9201 		return parentFD;
9202 	FDCloser fdCloser(parentFD, kernel);
9203 
9204 	if (userName) {
9205 		// get the vnodes
9206 		struct vnode* parentVNode = get_vnode_from_fd(parentFD, kernel);
9207 		struct vnode* dirVNode = get_vnode_from_fd(fd, kernel);
9208 		VNodePutter parentVNodePutter(parentVNode);
9209 		VNodePutter dirVNodePutter(dirVNode);
9210 		if (!parentVNode || !dirVNode)
9211 			return B_FILE_ERROR;
9212 
9213 		// get the vnode name
9214 		char _buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
9215 		struct dirent* buffer = (struct dirent*)_buffer;
9216 		status_t status = get_vnode_name(dirVNode, parentVNode, buffer,
9217 			sizeof(_buffer), get_current_io_context(false));
9218 		if (status != B_OK)
9219 			return status;
9220 
9221 		// copy the name to the userland buffer
9222 		int len = user_strlcpy(userName, buffer->d_name, nameLength);
9223 		if (len < 0)
9224 			return len;
9225 		if (len >= (int)nameLength)
9226 			return B_BUFFER_OVERFLOW;
9227 	}
9228 
9229 	return fdCloser.Detach();
9230 }
9231 
9232 
9233 status_t
9234 _user_fcntl(int fd, int op, size_t argument)
9235 {
9236 	status_t status = common_fcntl(fd, op, argument, false);
9237 	if (op == F_SETLKW)
9238 		syscall_restart_handle_post(status);
9239 
9240 	return status;
9241 }
9242 
9243 
9244 status_t
9245 _user_fsync(int fd)
9246 {
9247 	return common_sync(fd, false);
9248 }
9249 
9250 
9251 status_t
9252 _user_flock(int fd, int operation)
9253 {
9254 	FUNCTION(("_user_fcntl(fd = %d, op = %d)\n", fd, operation));
9255 
9256 	// Check if the operation is valid
9257 	switch (operation & ~LOCK_NB) {
9258 		case LOCK_UN:
9259 		case LOCK_SH:
9260 		case LOCK_EX:
9261 			break;
9262 
9263 		default:
9264 			return B_BAD_VALUE;
9265 	}
9266 
9267 	struct file_descriptor* descriptor;
9268 	struct vnode* vnode;
9269 	descriptor = get_fd_and_vnode(fd, &vnode, false);
9270 	if (descriptor == NULL)
9271 		return B_FILE_ERROR;
9272 
9273 	if (descriptor->type != FDTYPE_FILE) {
9274 		put_fd(descriptor);
9275 		return B_BAD_VALUE;
9276 	}
9277 
9278 	struct flock flock;
9279 	flock.l_start = 0;
9280 	flock.l_len = OFF_MAX;
9281 	flock.l_whence = 0;
9282 	flock.l_type = (operation & LOCK_SH) != 0 ? F_RDLCK : F_WRLCK;
9283 
9284 	status_t status;
9285 	if ((operation & LOCK_UN) != 0) {
9286 		if (HAS_FS_CALL(vnode, release_lock))
9287 			status = FS_CALL(vnode, release_lock, descriptor->cookie, &flock);
9288 		else
9289 			status = release_advisory_lock(vnode, NULL, descriptor, &flock);
9290 	} else {
9291 		if (HAS_FS_CALL(vnode, acquire_lock)) {
9292 			status = FS_CALL(vnode, acquire_lock, descriptor->cookie, &flock,
9293 				(operation & LOCK_NB) == 0);
9294 		} else {
9295 			status = acquire_advisory_lock(vnode, NULL, descriptor, &flock,
9296 				(operation & LOCK_NB) == 0);
9297 		}
9298 	}
9299 
9300 	syscall_restart_handle_post(status);
9301 
9302 	put_fd(descriptor);
9303 	return status;
9304 }
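

// Equivalence sketch: flock(fd, LOCK_EX) amounts to a whole-file advisory
// write lock, i.e. the same flock struct that F_SETLKW would use:
//
//	struct flock flock;
//	flock.l_type = F_WRLCK;		// LOCK_SH maps to F_RDLCK instead
//	flock.l_whence = 0;
//	flock.l_start = 0;
//	flock.l_len = OFF_MAX;		// cover the entire file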
9305 
9306 
9307 status_t
9308 _user_lock_node(int fd)
9309 {
9310 	return common_lock_node(fd, false);
9311 }
9312 
9313 
9314 status_t
9315 _user_unlock_node(int fd)
9316 {
9317 	return common_unlock_node(fd, false);
9318 }
9319 
9320 
9321 status_t
9322 _user_create_dir_entry_ref(dev_t device, ino_t inode, const char* userName,
9323 	int perms)
9324 {
9325 	char name[B_FILE_NAME_LENGTH];
9326 	status_t status;
9327 
9328 	if (!IS_USER_ADDRESS(userName))
9329 		return B_BAD_ADDRESS;
9330 
9331 	status = user_copy_name(name, userName, sizeof(name));
9332 	if (status != B_OK)
9333 		return status;
9334 
9335 	return dir_create_entry_ref(device, inode, name, perms, false);
9336 }
9337 
9338 
9339 status_t
9340 _user_create_dir(int fd, const char* userPath, int perms)
9341 {
9342 	KPath pathBuffer;
9343 	if (pathBuffer.InitCheck() != B_OK)
9344 		return B_NO_MEMORY;
9345 
9346 	char* path = pathBuffer.LockBuffer();
9347 
9348 	if (!IS_USER_ADDRESS(userPath))
9349 		return B_BAD_ADDRESS;
9350 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9351 	if (status != B_OK)
9352 		return status;
9353 
9354 	return dir_create(fd, path, perms, false);
9355 }
9356 
9357 
9358 status_t
9359 _user_remove_dir(int fd, const char* userPath)
9360 {
9361 	KPath pathBuffer;
9362 	if (pathBuffer.InitCheck() != B_OK)
9363 		return B_NO_MEMORY;
9364 
9365 	char* path = pathBuffer.LockBuffer();
9366 
9367 	if (userPath != NULL) {
9368 		if (!IS_USER_ADDRESS(userPath))
9369 			return B_BAD_ADDRESS;
9370 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9371 		if (status != B_OK)
9372 			return status;
9373 	}
9374 
9375 	return dir_remove(fd, userPath ? path : NULL, false);
9376 }
9377 
9378 
9379 status_t
9380 _user_read_link(int fd, const char* userPath, char* userBuffer,
9381 	size_t* userBufferSize)
9382 {
9383 	KPath pathBuffer, linkBuffer;
9384 	if (pathBuffer.InitCheck() != B_OK || linkBuffer.InitCheck() != B_OK)
9385 		return B_NO_MEMORY;
9386 
9387 	size_t bufferSize;
9388 
9389 	if (!IS_USER_ADDRESS(userBuffer) || !IS_USER_ADDRESS(userBufferSize)
9390 		|| user_memcpy(&bufferSize, userBufferSize, sizeof(size_t)) != B_OK)
9391 		return B_BAD_ADDRESS;
9392 
9393 	char* path = pathBuffer.LockBuffer();
9394 	char* buffer = linkBuffer.LockBuffer();
9395 
9396 	if (userPath) {
9397 		if (!IS_USER_ADDRESS(userPath))
9398 			return B_BAD_ADDRESS;
9399 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9400 		if (status != B_OK)
9401 			return status;
9402 
9403 		if (bufferSize > B_PATH_NAME_LENGTH)
9404 			bufferSize = B_PATH_NAME_LENGTH;
9405 	}
9406 
9407 	status_t status = common_read_link(fd, userPath ? path : NULL, buffer,
9408 		&bufferSize, false);
9409 
9410 	// we also update the bufferSize in case of errors
9411 	// (the real length will be returned in case of B_BUFFER_OVERFLOW)
9412 	if (user_memcpy(userBufferSize, &bufferSize, sizeof(size_t)) != B_OK)
9413 		return B_BAD_ADDRESS;
9414 
9415 	if (status != B_OK)
9416 		return status;
9417 
9418 	if (user_memcpy(userBuffer, buffer, bufferSize) != B_OK)
9419 		return B_BAD_ADDRESS;
9420 
9421 	return B_OK;
9422 }
9423 
9424 
9425 status_t
9426 _user_create_symlink(int fd, const char* userPath, const char* userToPath,
9427 	int mode)
9428 {
9429 	KPath pathBuffer;
9430 	KPath toPathBuffer;
9431 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9432 		return B_NO_MEMORY;
9433 
9434 	char* path = pathBuffer.LockBuffer();
9435 	char* toPath = toPathBuffer.LockBuffer();
9436 
9437 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(userToPath))
9438 		return B_BAD_ADDRESS;
9439 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9440 	if (status != B_OK)
9441 		return status;
9442 	status = user_copy_name(toPath, userToPath, B_PATH_NAME_LENGTH);
9443 	if (status != B_OK)
9444 		return status;
9445 
9446 	return common_create_symlink(fd, path, toPath, mode, false);
9447 }
9448 
9449 
9450 status_t
9451 _user_create_link(int pathFD, const char* userPath, int toFD,
9452 	const char* userToPath, bool traverseLeafLink)
9453 {
9454 	KPath pathBuffer;
9455 	KPath toPathBuffer;
9456 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9457 		return B_NO_MEMORY;
9458 
9459 	char* path = pathBuffer.LockBuffer();
9460 	char* toPath = toPathBuffer.LockBuffer();
9461 
9462 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(userToPath))
9463 		return B_BAD_ADDRESS;
9464 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9465 	if (status != B_OK)
9466 		return status;
9467 	status = user_copy_name(toPath, userToPath, B_PATH_NAME_LENGTH);
9468 	if (status != B_OK)
9469 		return status;
9470 
9471 	status = check_path(toPath);
9472 	if (status != B_OK)
9473 		return status;
9474 
9475 	return common_create_link(pathFD, path, toFD, toPath, traverseLeafLink,
9476 		false);
9477 }
9478 
9479 
9480 status_t
9481 _user_unlink(int fd, const char* userPath)
9482 {
9483 	KPath pathBuffer;
9484 	if (pathBuffer.InitCheck() != B_OK)
9485 		return B_NO_MEMORY;
9486 
9487 	char* path = pathBuffer.LockBuffer();
9488 
9489 	if (!IS_USER_ADDRESS(userPath))
9490 		return B_BAD_ADDRESS;
9491 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9492 	if (status != B_OK)
9493 		return status;
9494 
9495 	return common_unlink(fd, path, false);
9496 }
9497 
9498 
status_t
_user_rename(int oldFD, const char* userOldPath, int newFD,
	const char* userNewPath)
{
	KPath oldPathBuffer;
	KPath newPathBuffer;
	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* oldPath = oldPathBuffer.LockBuffer();
	char* newPath = newPathBuffer.LockBuffer();

	if (!IS_USER_ADDRESS(userOldPath) || !IS_USER_ADDRESS(userNewPath))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(oldPath, userOldPath, B_PATH_NAME_LENGTH);
	if (status != B_OK)
		return status;
	status = user_copy_name(newPath, userNewPath, B_PATH_NAME_LENGTH);
	if (status != B_OK)
		return status;

	return common_rename(oldFD, oldPath, newFD, newPath, false);
}


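/*!	\brief Creates a FIFO (named pipe) at the location given by the
	FD + path pair.

	The path is split into the containing directory and the leaf name; the
	file system hosting that directory must implement the
	create_special_node() hook, otherwise \c B_UNSUPPORTED is returned.

	\param perms The access permissions for the new FIFO; only the bits
		covered by \c S_IUMSK are used.
	\return \c B_OK on success, another error code otherwise.
*/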
status_t
_user_create_fifo(int fd, const char* userPath, mode_t perms)
{
	KPath pathBuffer;
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();

	if (!IS_USER_ADDRESS(userPath))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
	if (status != B_OK)
		return status;

	// split into directory vnode and filename path
	char filename[B_FILE_NAME_LENGTH];
	struct vnode* dir;
	status = fd_and_path_to_dir_vnode(fd, path, &dir, filename, false);
	if (status != B_OK)
		return status;

	VNodePutter _(dir);

	// the underlying FS needs to support creating FIFOs
	if (!HAS_FS_CALL(dir, create_special_node))
		return B_UNSUPPORTED;

	// create the entry -- the FIFO sub node is set up automatically
	fs_vnode superVnode;
	ino_t nodeID;
	status = FS_CALL(dir, create_special_node, filename, NULL,
		S_IFIFO | (perms & S_IUMSK), 0, &superVnode, &nodeID);

	// create_special_node() acquired a reference for us that we don't need.
	if (status == B_OK)
		put_vnode(dir->mount->volume, nodeID);

	return status;
}


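/*!	\brief Creates an anonymous pipe and returns FDs for its two ends.

	This is what the POSIX pipe() function is typically backed by. The pipe
	node is created in the root file system via its create_special_node()
	hook and then opened twice, once read-only and once write-only. The FDs
	are only kept if both opens succeed and the array could be copied to
	userland; otherwise the FDClosers release them again.

	\param userFDs Userland pointer to an int[2] array; index 0 receives
		the read end, index 1 the write end.
	\return \c B_OK on success, another error code otherwise.
*/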
status_t
_user_create_pipe(int* userFDs)
{
	// rootfs should support creating FIFOs, but let's be sure
	if (!HAS_FS_CALL(sRoot, create_special_node))
		return B_UNSUPPORTED;

	// create the node -- the FIFO sub node is set up automatically
	fs_vnode superVnode;
	ino_t nodeID;
	status_t status = FS_CALL(sRoot, create_special_node, NULL, NULL,
		S_IFIFO | S_IRUSR | S_IWUSR, 0, &superVnode, &nodeID);
	if (status != B_OK)
		return status;

	// We've got one reference to the node and need another one.
	struct vnode* vnode;
	status = get_vnode(sRoot->mount->id, nodeID, &vnode, true, false);
	if (status != B_OK) {
		// that should not happen
		dprintf("_user_create_pipe(): Failed to lookup vnode (%" B_PRIdDEV ", "
			"%" B_PRIdINO ")\n", sRoot->mount->id, nodeID);
		return status;
	}

	// Everything looks good so far. Open one FD for reading and one for
	// writing; if either open fails, its error code becomes our status.
	int fds[2];
	fds[0] = open_vnode(vnode, O_RDONLY, false);
	fds[1] = open_vnode(vnode, O_WRONLY, false);

	FDCloser closer0(fds[0], false);
	FDCloser closer1(fds[1], false);

	status = (fds[0] >= 0 ? (fds[1] >= 0 ? B_OK : fds[1]) : fds[0]);

	// copy FDs to userland
	if (status == B_OK) {
		if (!IS_USER_ADDRESS(userFDs)
			|| user_memcpy(userFDs, fds, sizeof(fds)) != B_OK) {
			status = B_BAD_ADDRESS;
		}
	}

	// keep the FDs only if everything went fine
	if (status == B_OK) {
		closer0.Detach();
		closer1.Detach();
	}

	return status;
}


status_t
_user_access(int fd, const char* userPath, int mode, bool effectiveUserGroup)
{
	KPath pathBuffer;
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();

	if (!IS_USER_ADDRESS(userPath))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
	if (status != B_OK)
		return status;

	return common_access(fd, path, mode, effectiveUserGroup, false);
}


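/*!	\brief Reads stat data of an entity specified by a FD + path pair.

	If \a userPath is \c NULL, the stat operation of the file descriptor
	\a fd itself is used; otherwise the path (absolute, or relative to
	\a fd) names the node to stat. At most \a statSize bytes -- which must
	not exceed \c sizeof(struct stat) -- are copied back to \a userStat.

	\return \c B_OK on success, another error code otherwise.
*/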
status_t
_user_read_stat(int fd, const char* userPath, bool traverseLink,
	struct stat* userStat, size_t statSize)
{
	struct stat stat = {0};
	status_t status;

	if (statSize > sizeof(struct stat))
		return B_BAD_VALUE;

	if (!IS_USER_ADDRESS(userStat))
		return B_BAD_ADDRESS;

	if (userPath != NULL) {
		// path given: get the stat of the node referred to by (fd, path)
		if (!IS_USER_ADDRESS(userPath))
			return B_BAD_ADDRESS;

		KPath pathBuffer;
		if (pathBuffer.InitCheck() != B_OK)
			return B_NO_MEMORY;

		char* path = pathBuffer.LockBuffer();

		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
		if (status != B_OK)
			return status;

		status = common_path_read_stat(fd, path, traverseLink, &stat, false);
	} else {
		// no path given: get the FD and use the FD operation
		struct file_descriptor* descriptor
			= get_fd(get_current_io_context(false), fd);
		if (descriptor == NULL)
			return B_FILE_ERROR;

		if (descriptor->ops->fd_read_stat)
			status = descriptor->ops->fd_read_stat(descriptor, &stat);
		else
			status = B_UNSUPPORTED;

		put_fd(descriptor);
	}

	if (status != B_OK)
		return status;

	return user_memcpy(userStat, &stat, statSize);
}


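/*!	\brief Writes stat data of an entity specified by a FD + path pair.

	The (possibly partial) stat structure is copied in from userland and
	any remaining fields are zeroed; \a statMask specifies which of the
	fields the file system shall actually update.

	\return \c B_OK on success, another error code otherwise.
*/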
status_t
_user_write_stat(int fd, const char* userPath, bool traverseLeafLink,
	const struct stat* userStat, size_t statSize, int statMask)
{
	if (statSize > sizeof(struct stat))
		return B_BAD_VALUE;

	struct stat stat;

	if (!IS_USER_ADDRESS(userStat)
		|| user_memcpy(&stat, userStat, statSize) < B_OK)
		return B_BAD_ADDRESS;

	// clear additional stat fields
	if (statSize < sizeof(struct stat))
		memset((uint8*)&stat + statSize, 0, sizeof(struct stat) - statSize);

	status_t status;

	if (userPath != NULL) {
		// path given: write the stat of the node referred to by (fd, path)
		if (!IS_USER_ADDRESS(userPath))
			return B_BAD_ADDRESS;

		KPath pathBuffer;
		if (pathBuffer.InitCheck() != B_OK)
			return B_NO_MEMORY;

		char* path = pathBuffer.LockBuffer();

		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
		if (status != B_OK)
			return status;

		status = common_path_write_stat(fd, path, traverseLeafLink, &stat,
			statMask, false);
	} else {
		// no path given: get the FD and use the FD operation
		struct file_descriptor* descriptor
			= get_fd(get_current_io_context(false), fd);
		if (descriptor == NULL)
			return B_FILE_ERROR;

		if (descriptor->ops->fd_write_stat) {
			status = descriptor->ops->fd_write_stat(descriptor, &stat,
				statMask);
		} else
			status = B_UNSUPPORTED;

		put_fd(descriptor);
	}

	return status;
}


int
_user_open_attr_dir(int fd, const char* userPath, bool traverseLeafLink)
{
	KPath pathBuffer;
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();

	if (userPath != NULL) {
		if (!IS_USER_ADDRESS(userPath))
			return B_BAD_ADDRESS;
		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
		if (status != B_OK)
			return status;
	}

	return attr_dir_open(fd, userPath ? path : NULL, traverseLeafLink, false);
}


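/*!	\brief Reads data from the attribute \a userAttribute of the node
	referred to by \a fd.

	The attribute is opened read-only as a temporary FD, read via
	_user_read() -- which performs the usual userland buffer checks -- and
	closed again.

	\return The number of bytes read, or a negative error code.
*/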
ssize_t
_user_read_attr(int fd, const char* userAttribute, off_t pos, void* userBuffer,
	size_t readBytes)
{
	char attribute[B_FILE_NAME_LENGTH];

	if (userAttribute == NULL)
		return B_BAD_VALUE;
	if (!IS_USER_ADDRESS(userAttribute))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(attribute, userAttribute,
		sizeof(attribute));
	if (status != B_OK)
		return status;

	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
	if (attr < 0)
		return attr;

	ssize_t bytes = _user_read(attr, pos, userBuffer, readBytes);
	_user_close(attr);

	return bytes;
}


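/*!	\brief Writes data to the attribute \a userAttribute of the node
	referred to by \a fd, creating the attribute if it doesn't exist yet.
	This is what userland's fs_write_attr() typically maps to.

	\param type The type the attribute is created with, if necessary.
	\param pos The offset to write at; a write to position 0 truncates the
		attribute first (the traditional BeOS behavior).
	\return The number of bytes written, or a negative error code.
*/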
ssize_t
_user_write_attr(int fd, const char* userAttribute, uint32 type, off_t pos,
	const void* buffer, size_t writeBytes)
{
	char attribute[B_FILE_NAME_LENGTH];

	if (userAttribute == NULL)
		return B_BAD_VALUE;
	if (!IS_USER_ADDRESS(userAttribute))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(attribute, userAttribute,
		sizeof(attribute));
	if (status != B_OK)
		return status;

	// Support the BeOS-typical truncation semantics as well as the position
	// argument: a write to position 0 replaces the attribute contents.
	int attr = attr_create(fd, NULL, attribute, type,
		O_CREAT | O_WRONLY | (pos != 0 ? 0 : O_TRUNC), false);
	if (attr < 0)
		return attr;

	ssize_t bytes = _user_write(attr, pos, buffer, writeBytes);
	_user_close(attr);

	return bytes;
}


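/*!	\brief Retrieves type and size of the attribute \a userAttribute of
	the node referred to by \a fd.

	The attribute is opened temporarily, its stat data is read, and type
	and size are copied to \a userAttrInfo as an attr_info.

	\return \c B_OK on success, another error code otherwise.
*/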
status_t
_user_stat_attr(int fd, const char* userAttribute,
	struct attr_info* userAttrInfo)
{
	char attribute[B_FILE_NAME_LENGTH];

	if (userAttribute == NULL || userAttrInfo == NULL)
		return B_BAD_VALUE;
	if (!IS_USER_ADDRESS(userAttribute) || !IS_USER_ADDRESS(userAttrInfo))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(attribute, userAttribute,
		sizeof(attribute));
	if (status != B_OK)
		return status;

	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
	if (attr < 0)
		return attr;

	struct file_descriptor* descriptor
		= get_fd(get_current_io_context(false), attr);
	if (descriptor == NULL) {
		_user_close(attr);
		return B_FILE_ERROR;
	}

	struct stat stat;
	if (descriptor->ops->fd_read_stat)
		status = descriptor->ops->fd_read_stat(descriptor, &stat);
	else
		status = B_UNSUPPORTED;

	put_fd(descriptor);
	_user_close(attr);

	if (status == B_OK) {
		attr_info info;
		info.type = stat.st_type;
		info.size = stat.st_size;

		if (user_memcpy(userAttrInfo, &info, sizeof(struct attr_info)) != B_OK)
			return B_BAD_ADDRESS;
	}

	return status;
}


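/*!	\brief Opens or creates the attribute \a userName of a node specified
	by a FD + path pair.

	If \a openMode contains \c O_CREAT, attr_create() is used, which
	creates the attribute with the given \a type if it doesn't exist yet;
	otherwise attr_open() is used and \a type is ignored.

	\return The new attribute FD, or a negative error code.
*/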
int
_user_open_attr(int fd, const char* userPath, const char* userName,
	uint32 type, int openMode)
{
	char name[B_FILE_NAME_LENGTH];

	if (!IS_USER_ADDRESS(userName))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
	if (status != B_OK)
		return status;

	KPath pathBuffer;
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();

	if (userPath != NULL) {
		if (!IS_USER_ADDRESS(userPath))
			return B_BAD_ADDRESS;
		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
		if (status != B_OK)
			return status;
	}

	if ((openMode & O_CREAT) != 0) {
		return attr_create(fd, userPath ? path : NULL, name, type, openMode,
			false);
	}

	return attr_open(fd, userPath ? path : NULL, name, openMode, false);
}


status_t
_user_remove_attr(int fd, const char* userName)
{
	char name[B_FILE_NAME_LENGTH];

	if (!IS_USER_ADDRESS(userName))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
	if (status != B_OK)
		return status;

	return attr_remove(fd, name, false);
}


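/*!	\brief Renames attribute \a userFromName of the node referred to by
	\a fromFile to \a userToName on the node referred to by \a toFile.

	Both attribute names are copied into kernel buffers before
	attr_rename() is called.

	\return \c B_OK on success, another error code otherwise.
*/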
status_t
_user_rename_attr(int fromFile, const char* userFromName, int toFile,
	const char* userToName)
{
	if (!IS_USER_ADDRESS(userFromName)
		|| !IS_USER_ADDRESS(userToName))
		return B_BAD_ADDRESS;

	KPath fromNameBuffer(B_FILE_NAME_LENGTH);
	KPath toNameBuffer(B_FILE_NAME_LENGTH);
	if (fromNameBuffer.InitCheck() != B_OK || toNameBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* fromName = fromNameBuffer.LockBuffer();
	char* toName = toNameBuffer.LockBuffer();

	status_t status = user_copy_name(fromName, userFromName,
		B_FILE_NAME_LENGTH);
	if (status != B_OK)
		return status;
	status = user_copy_name(toName, userToName, B_FILE_NAME_LENGTH);
	if (status != B_OK)
		return status;

	return attr_rename(fromFile, fromName, toFile, toName, false);
}


int
_user_open_index_dir(dev_t device)
{
	return index_dir_open(device, false);
}


status_t
_user_create_index(dev_t device, const char* userName, uint32 type,
	uint32 flags)
{
	char name[B_FILE_NAME_LENGTH];

	if (!IS_USER_ADDRESS(userName))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
	if (status != B_OK)
		return status;

	return index_create(device, name, type, flags, false);
}


status_t
_user_read_index_stat(dev_t device, const char* userName, struct stat* userStat)
{
	char name[B_FILE_NAME_LENGTH];
	struct stat stat = {0};
	status_t status;

	if (!IS_USER_ADDRESS(userName) || !IS_USER_ADDRESS(userStat))
		return B_BAD_ADDRESS;
	status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
	if (status != B_OK)
		return status;

	status = index_name_read_stat(device, name, &stat, false);
	if (status == B_OK) {
		if (user_memcpy(userStat, &stat, sizeof(stat)) != B_OK)
			return B_BAD_ADDRESS;
	}

	return status;
}


status_t
_user_remove_index(dev_t device, const char* userName)
{
	char name[B_FILE_NAME_LENGTH];

	if (!IS_USER_ADDRESS(userName))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
	if (status != B_OK)
		return status;

	return index_remove(device, name, false);
}


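/*!	\brief Copies the current working directory's absolute path into
	\a userBuffer.

	The path is built in a kernel buffer of at most \c kMaxPathLength
	bytes and copied out with user_strlcpy().

	\param size The size of \a userBuffer.
	\return \c B_OK on success, another error code otherwise.
*/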
status_t
_user_getcwd(char* userBuffer, size_t size)
{
	if (size == 0)
		return B_BAD_VALUE;
	if (!IS_USER_ADDRESS(userBuffer))
		return B_BAD_ADDRESS;

	if (size > kMaxPathLength)
		size = kMaxPathLength;

	KPath pathBuffer(size);
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	TRACE(("user_getcwd: buf %p, %" B_PRIuSIZE "\n", userBuffer, size));

	char* path = pathBuffer.LockBuffer();

	status_t status = get_cwd(path, size, false);
	if (status != B_OK)
		return status;

	// copy the result back to userland
	if (user_strlcpy(userBuffer, path, size) < B_OK)
		return B_BAD_ADDRESS;

	return status;
}


status_t
_user_setcwd(int fd, const char* userPath)
{
	TRACE(("user_setcwd: path = %p\n", userPath));

	KPath pathBuffer;
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	char* path = pathBuffer.LockBuffer();

	if (userPath != NULL) {
		if (!IS_USER_ADDRESS(userPath))
			return B_BAD_ADDRESS;
		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
		if (status != B_OK)
			return status;
	}

	return set_cwd(fd, userPath != NULL ? path : NULL, false);
}


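/*!	\brief Sets the calling team's root directory, i.e. implements
	chroot().

	Only the root user (effective UID 0) may call this. The new root vnode
	is swapped into the team's I/O context under \c sIOContextRootLock,
	and the reference to the previous root is released.

	\return \c B_OK on success, another error code otherwise.
*/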
status_t
_user_change_root(const char* userPath)
{
	// only root is allowed to chroot()
	if (geteuid() != 0)
		return B_NOT_ALLOWED;

	// allocate a path buffer
	KPath pathBuffer;
	if (pathBuffer.InitCheck() != B_OK)
		return B_NO_MEMORY;

	// copy the userland path into the kernel
	char* path = pathBuffer.LockBuffer();
	if (userPath != NULL) {
		if (!IS_USER_ADDRESS(userPath))
			return B_BAD_ADDRESS;
		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
		if (status != B_OK)
			return status;
	}

	// get the vnode
	struct vnode* vnode;
	status_t status = path_to_vnode(path, true, &vnode, NULL, false);
	if (status != B_OK)
		return status;

	// set the new root
	struct io_context* context = get_current_io_context(false);
	mutex_lock(&sIOContextRootLock);
	struct vnode* oldRoot = context->root;
	context->root = vnode;
	mutex_unlock(&sIOContextRootLock);

	put_vnode(oldRoot);

	return B_OK;
}


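/*!	\brief Opens a query on the given volume.

	The query string is copied into a kernel buffer (it must be shorter
	than 64 KiB) and handed to query_open(). For live queries (see
	\c B_LIVE_QUERY), \a port and \a token designate where update messages
	are sent.

	\return The new query FD, or a negative error code.
*/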
int
_user_open_query(dev_t device, const char* userQuery, size_t queryLength,
	uint32 flags, port_id port, int32 token)
{
	if (device < 0 || userQuery == NULL || queryLength == 0)
		return B_BAD_VALUE;

	if (!IS_USER_ADDRESS(userQuery))
		return B_BAD_ADDRESS;

	// this is a safety restriction
	if (queryLength >= 65536)
		return B_NAME_TOO_LONG;

	// the buffer needs one extra byte for the terminating NUL
	BStackOrHeapArray<char, 128> query(queryLength + 1);
	if (!query.IsValid())
		return B_NO_MEMORY;

	if (user_strlcpy(query, userQuery, queryLength + 1) < B_OK)
		return B_BAD_ADDRESS;

	return query_open(device, query, flags, port, token, false);
}


#include "vfs_request_io.cpp"