xref: /haiku/src/system/kernel/fs/vfs.cpp (revision db6fcb750a1afb5fdc752322972adf6044d3b4c4)
1 /*
2  * Copyright 2005-2013, Ingo Weinhold, ingo_weinhold@gmx.de.
3  * Copyright 2002-2018, Axel Dörfler, axeld@pinc-software.de.
4  * Distributed under the terms of the MIT License.
5  *
6  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7  * Distributed under the terms of the NewOS License.
8  */
9 
10 
11 /*! Virtual File System and File System Interface Layer */
12 
13 
14 #include <ctype.h>
15 #include <fcntl.h>
16 #include <limits.h>
17 #include <stddef.h>
18 #include <stdio.h>
19 #include <string.h>
20 #include <sys/file.h>
21 #include <sys/resource.h>
22 #include <sys/stat.h>
23 #include <unistd.h>
24 
25 #include <fs_attr.h>
26 #include <fs_info.h>
27 #include <fs_interface.h>
28 #include <fs_volume.h>
29 #include <NodeMonitor.h>
30 #include <OS.h>
31 #include <StorageDefs.h>
32 
33 #include <AutoDeleter.h>
34 #include <block_cache.h>
35 #include <boot/kernel_args.h>
36 #include <debug_heap.h>
37 #include <disk_device_manager/KDiskDevice.h>
38 #include <disk_device_manager/KDiskDeviceManager.h>
39 #include <disk_device_manager/KDiskDeviceUtils.h>
40 #include <disk_device_manager/KDiskSystem.h>
41 #include <fd.h>
42 #include <file_cache.h>
43 #include <fs/node_monitor.h>
44 #include <KPath.h>
45 #include <lock.h>
46 #include <low_resource_manager.h>
47 #include <slab/Slab.h>
48 #include <syscalls.h>
49 #include <syscall_restart.h>
50 #include <tracing.h>
51 #include <util/atomic.h>
52 #include <util/AutoLock.h>
53 #include <util/DoublyLinkedList.h>
54 #include <vfs.h>
55 #include <vm/vm.h>
56 #include <vm/VMCache.h>
57 #include <wait_for_objects.h>
58 
59 #include "EntryCache.h"
60 #include "fifo.h"
61 #include "IORequest.h"
62 #include "unused_vnodes.h"
63 #include "vfs_tracing.h"
64 #include "Vnode.h"
65 #include "../cache/vnode_store.h"
66 
67 
68 //#define TRACE_VFS
69 #ifdef TRACE_VFS
70 #	define TRACE(x) dprintf x
71 #	define FUNCTION(x) dprintf x
72 #else
73 #	define TRACE(x) ;
74 #	define FUNCTION(x) ;
75 #endif
76 
77 #define ADD_DEBUGGER_COMMANDS
78 
79 
80 #define HAS_FS_CALL(vnode, op)			(vnode->ops->op != NULL)
81 #define HAS_FS_MOUNT_CALL(mount, op)	(mount->volume->ops->op != NULL)
82 
83 #if KDEBUG
84 #	define FS_CALL(vnode, op, params...) \
85 		( HAS_FS_CALL(vnode, op) ? \
86 			vnode->ops->op(vnode->mount->volume, vnode, params) \
87 			: (panic("FS_CALL op " #op " is NULL"), 0))
88 #	define FS_CALL_NO_PARAMS(vnode, op) \
89 		( HAS_FS_CALL(vnode, op) ? \
90 			vnode->ops->op(vnode->mount->volume, vnode) \
91 			: (panic("FS_CALL_NO_PARAMS op " #op " is NULL"), 0))
92 #	define FS_MOUNT_CALL(mount, op, params...) \
93 		( HAS_FS_MOUNT_CALL(mount, op) ? \
94 			mount->volume->ops->op(mount->volume, params) \
95 			: (panic("FS_MOUNT_CALL op " #op " is NULL"), 0))
96 #	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
97 		( HAS_FS_MOUNT_CALL(mount, op) ? \
98 			mount->volume->ops->op(mount->volume) \
99 			: (panic("FS_MOUNT_CALL_NO_PARAMS op " #op " is NULL"), 0))
100 #else
101 #	define FS_CALL(vnode, op, params...) \
102 			vnode->ops->op(vnode->mount->volume, vnode, params)
103 #	define FS_CALL_NO_PARAMS(vnode, op) \
104 			vnode->ops->op(vnode->mount->volume, vnode)
105 #	define FS_MOUNT_CALL(mount, op, params...) \
106 			mount->volume->ops->op(mount->volume, params)
107 #	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
108 			mount->volume->ops->op(mount->volume)
109 #endif
110 
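/* An illustrative sketch of how the FS_CALL macros above are used; "vnode"
   and "stat" are hypothetical locals, not definitions from this file:

	struct stat stat;
	status_t status = B_UNSUPPORTED;
	if (HAS_FS_CALL(vnode, read_stat))
		status = FS_CALL(vnode, read_stat, &stat);
			// expands to vnode->ops->read_stat(vnode->mount->volume,
			// vnode, &stat); with KDEBUG, a NULL hook panics instead of
			// crashing through a NULL pointer

   Hooks a file system does not implement are NULL in its ops table, hence
   the HAS_FS_CALL() guard. */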
111 
112 const static size_t kMaxPathLength = 65536;
113 	// The absolute maximum path length (for getcwd()); this does not
114 	// depend on PATH_MAX.
115 
116 
117 typedef DoublyLinkedList<vnode> VnodeList;
118 
119 /*!	\brief Structure to manage a mounted file system
120 
121 	Note: The root_vnode and root_vnode->covers fields (what others?) are
122 	initialized in fs_mount() and not changed afterwards. That is, as soon
123 	as the mount is mounted and it has been ensured that it won't be unmounted
124 	(e.g. by holding a reference to a vnode of that mount), read access
125 	to those fields is always safe, even without additional locking. Moreover,
126 	while mounted the mount holds a reference to the root_vnode->covers vnode,
127 	thus making the access path vnode->mount->root_vnode->covers->mount->...
128 	safe if a reference to vnode is held (note that for the root mount
129 	root_vnode->covers is NULL, though).
130 */
131 struct fs_mount {
132 	fs_mount()
133 		:
134 		volume(NULL),
135 		device_name(NULL)
136 	{
137 		mutex_init(&lock, "mount lock");
138 	}
139 
140 	~fs_mount()
141 	{
142 		mutex_destroy(&lock);
143 		free(device_name);
144 
145 		while (volume) {
146 			fs_volume* superVolume = volume->super_volume;
147 
148 			if (volume->file_system != NULL)
149 				put_module(volume->file_system->info.name);
150 
151 			free(volume->file_system_name);
152 			free(volume);
153 			volume = superVolume;
154 		}
155 	}
156 
157 	struct fs_mount* next;
158 	dev_t			id;
159 	fs_volume*		volume;
160 	char*			device_name;
161 	mutex			lock;	// guards the vnodes list
162 	struct vnode*	root_vnode;
163 	struct vnode*	covers_vnode;	// immutable
164 	KPartition*		partition;
165 	VnodeList		vnodes;
166 	EntryCache		entry_cache;
167 	bool			unmounting;
168 	bool			owns_file_device;
169 };
170 
171 
172 namespace {
173 
174 struct advisory_lock : public DoublyLinkedListLinkImpl<advisory_lock> {
175 	list_link		link;
176 	void*			bound_to;
177 	team_id			team;
178 	pid_t			session;
179 	off_t			start;
180 	off_t			end;
181 	bool			shared;
182 };
183 
184 typedef DoublyLinkedList<advisory_lock> LockList;
185 
186 } // namespace
187 
188 
189 struct advisory_locking {
190 	sem_id			lock;
191 	sem_id			wait_sem;
192 	LockList		locks;
193 
194 	advisory_locking()
195 		:
196 		lock(-1),
197 		wait_sem(-1)
198 	{
199 	}
200 
201 	~advisory_locking()
202 	{
203 		if (lock >= 0)
204 			delete_sem(lock);
205 		if (wait_sem >= 0)
206 			delete_sem(wait_sem);
207 	}
208 };
209 
210 /*!	\brief Guards sMountsTable.
211 
212 	The holder is allowed read/write access to sMountsTable.
213 	Manipulation of the fs_mount structures themselves
214 	(and their destruction) requires different locks though.
215 */
216 static mutex sMountMutex = MUTEX_INITIALIZER("vfs_mount_lock");
217 
218 /*!	\brief Guards mount/unmount operations.
219 
220 	fs_mount() and fs_unmount() hold the lock during their entire operation.
221 	That is, locking the lock ensures that no FS is mounted/unmounted. In
222 	particular this means that
223 	- sMountsTable will not be modified,
224 	- the fields of the fs_mount structures in sMountsTable that are immutable
225 	  after initialization will not be modified.
226 
227 	The thread trying to lock the lock must not hold sVnodeLock or
228 	sMountMutex.
229 */
230 static recursive_lock sMountOpLock;
231 
232 /*!	\brief Guards sVnodeTable.
233 
234 	The holder is allowed read/write access to sVnodeTable and to
235 	any unbusy vnode in that table, save for the immutable fields (device, id,
236 	private_node, mount), to which only read-only access is allowed.
237 	The mutable fields advisory_locking, mandatory_locked_by, and ref_count, as
238 	well as the busy, removed, and unused flags, and the vnode's type can also
239 	be write accessed when holding a read lock to sVnodeLock *and* having the
240 	vnode locked. Write access to covered_by and covers requires write-locking
241 	sVnodeLock.
242 
243 	The thread trying to acquire the lock must not hold sMountMutex.
244 	You must not hold this lock when calling create_sem(), as this might call
245 	vfs_free_unused_vnodes() and thus cause a deadlock.
246 */
247 static rw_lock sVnodeLock = RW_LOCK_INITIALIZER("vfs_vnode_lock");
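
/* A minimal sketch of the flag-update protocol described above ("vnode" is
   a hypothetical, already-referenced node):

	{
		ReadLocker locker(sVnodeLock);
		AutoLocker<Vnode> nodeLocker(vnode);
		vnode->SetRemoved(true);
			// the mutable flags may be written with sVnodeLock read-locked
			// plus the vnode lock; covered_by/covers would require the
			// write lock instead
	}
*/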
248 
249 /*!	\brief Guards io_context::root.
250 
251 	Must be held when setting or getting the io_context::root field.
252 	The only operation allowed while holding this lock besides getting or
253 	setting the field is inc_vnode_ref_count() on io_context::root.
254 */
255 static mutex sIOContextRootLock = MUTEX_INITIALIZER("io_context::root lock");
256 
257 
258 namespace {
259 
260 struct vnode_hash_key {
261 	dev_t	device;
262 	ino_t	vnode;
263 };
264 
265 struct VnodeHash {
266 	typedef vnode_hash_key	KeyType;
267 	typedef	struct vnode	ValueType;
268 
269 #define VHASH(mountid, vnodeid) \
270 	(((uint32)((vnodeid) >> 32) + (uint32)(vnodeid)) ^ (uint32)(mountid))
271 
272 	size_t HashKey(KeyType key) const
273 	{
274 		return VHASH(key.device, key.vnode);
275 	}
276 
277 	size_t Hash(ValueType* vnode) const
278 	{
279 		return VHASH(vnode->device, vnode->id);
280 	}
281 
282 #undef VHASH
283 
284 	bool Compare(KeyType key, ValueType* vnode) const
285 	{
286 		return vnode->device == key.device && vnode->id == key.vnode;
287 	}
288 
289 	ValueType*& GetLink(ValueType* value) const
290 	{
291 		return value->next;
292 	}
293 };
294 
295 typedef BOpenHashTable<VnodeHash> VnodeTable;
296 
297 
298 struct MountHash {
299 	typedef dev_t			KeyType;
300 	typedef	struct fs_mount	ValueType;
301 
302 	size_t HashKey(KeyType key) const
303 	{
304 		return key;
305 	}
306 
307 	size_t Hash(ValueType* mount) const
308 	{
309 		return mount->id;
310 	}
311 
312 	bool Compare(KeyType key, ValueType* mount) const
313 	{
314 		return mount->id == key;
315 	}
316 
317 	ValueType*& GetLink(ValueType* value) const
318 	{
319 		return value->next;
320 	}
321 };
322 
323 typedef BOpenHashTable<MountHash> MountTable;
324 
325 } // namespace
326 
327 
328 object_cache* sPathNameCache;
329 object_cache* sFileDescriptorCache;
330 
331 #define VNODE_HASH_TABLE_SIZE 1024
332 static VnodeTable* sVnodeTable;
333 static struct vnode* sRoot;
334 
335 #define MOUNTS_HASH_TABLE_SIZE 16
336 static MountTable* sMountsTable;
337 static dev_t sNextMountID = 1;
338 
339 #define MAX_TEMP_IO_VECS 8
340 
341 // How long to wait for busy vnodes (10s = 2000 retries * 5000 µs delay)
342 #define BUSY_VNODE_RETRIES 2000
343 #define BUSY_VNODE_DELAY 5000
344 
345 mode_t __gUmask = 022;
346 
347 /* function declarations */
348 
349 static void free_unused_vnodes();
350 
351 // file descriptor operation prototypes
352 static status_t file_read(struct file_descriptor* descriptor, off_t pos,
353 	void* buffer, size_t* _bytes);
354 static status_t file_write(struct file_descriptor* descriptor, off_t pos,
355 	const void* buffer, size_t* _bytes);
356 static off_t file_seek(struct file_descriptor* descriptor, off_t pos,
357 	int seekType);
358 static void file_free_fd(struct file_descriptor* descriptor);
359 static status_t file_close(struct file_descriptor* descriptor);
360 static status_t file_select(struct file_descriptor* descriptor, uint8 event,
361 	struct selectsync* sync);
362 static status_t file_deselect(struct file_descriptor* descriptor, uint8 event,
363 	struct selectsync* sync);
364 static status_t dir_read(struct io_context* context,
365 	struct file_descriptor* descriptor, struct dirent* buffer,
366 	size_t bufferSize, uint32* _count);
367 static status_t dir_read(struct io_context* ioContext, struct vnode* vnode,
368 	void* cookie, struct dirent* buffer, size_t bufferSize, uint32* _count);
369 static status_t dir_rewind(struct file_descriptor* descriptor);
370 static void dir_free_fd(struct file_descriptor* descriptor);
371 static status_t dir_close(struct file_descriptor* descriptor);
372 static status_t attr_dir_read(struct io_context* context,
373 	struct file_descriptor* descriptor, struct dirent* buffer,
374 	size_t bufferSize, uint32* _count);
375 static status_t attr_dir_rewind(struct file_descriptor* descriptor);
376 static void attr_dir_free_fd(struct file_descriptor* descriptor);
377 static status_t attr_dir_close(struct file_descriptor* descriptor);
378 static status_t attr_read(struct file_descriptor* descriptor, off_t pos,
379 	void* buffer, size_t* _bytes);
380 static status_t attr_write(struct file_descriptor* descriptor, off_t pos,
381 	const void* buffer, size_t* _bytes);
382 static off_t attr_seek(struct file_descriptor* descriptor, off_t pos,
383 	int seekType);
384 static void attr_free_fd(struct file_descriptor* descriptor);
385 static status_t attr_close(struct file_descriptor* descriptor);
386 static status_t attr_read_stat(struct file_descriptor* descriptor,
387 	struct stat* statData);
388 static status_t attr_write_stat(struct file_descriptor* descriptor,
389 	const struct stat* stat, int statMask);
390 static status_t index_dir_read(struct io_context* context,
391 	struct file_descriptor* descriptor, struct dirent* buffer,
392 	size_t bufferSize, uint32* _count);
393 static status_t index_dir_rewind(struct file_descriptor* descriptor);
394 static void index_dir_free_fd(struct file_descriptor* descriptor);
395 static status_t index_dir_close(struct file_descriptor* descriptor);
396 static status_t query_read(struct io_context* context,
397 	struct file_descriptor* descriptor, struct dirent* buffer,
398 	size_t bufferSize, uint32* _count);
399 static status_t query_rewind(struct file_descriptor* descriptor);
400 static void query_free_fd(struct file_descriptor* descriptor);
401 static status_t query_close(struct file_descriptor* descriptor);
402 
403 static status_t common_ioctl(struct file_descriptor* descriptor, ulong op,
404 	void* buffer, size_t length);
405 static status_t common_read_stat(struct file_descriptor* descriptor,
406 	struct stat* statData);
407 static status_t common_write_stat(struct file_descriptor* descriptor,
408 	const struct stat* statData, int statMask);
409 static status_t common_path_read_stat(int fd, char* path, bool traverseLeafLink,
410 	struct stat* stat, bool kernel);
411 
412 static status_t vnode_path_to_vnode(struct vnode* vnode, char* path,
413 	bool traverseLeafLink, int count, bool kernel,
414 	struct vnode** _vnode, ino_t* _parentID);
415 static status_t dir_vnode_to_path(struct vnode* vnode, char* buffer,
416 	size_t bufferSize, bool kernel);
417 static status_t fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
418 	struct vnode** _vnode, ino_t* _parentID, bool kernel);
419 static void inc_vnode_ref_count(struct vnode* vnode);
420 static status_t dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree,
421 	bool reenter);
422 static inline void put_vnode(struct vnode* vnode);
423 static status_t fs_unmount(char* path, dev_t mountID, uint32 flags,
424 	bool kernel);
425 static int open_vnode(struct vnode* vnode, int openMode, bool kernel);
426 
427 
428 static struct fd_ops sFileOps = {
429 	file_read,
430 	file_write,
431 	file_seek,
432 	common_ioctl,
433 	NULL,		// set_flags
434 	file_select,
435 	file_deselect,
436 	NULL,		// read_dir()
437 	NULL,		// rewind_dir()
438 	common_read_stat,
439 	common_write_stat,
440 	file_close,
441 	file_free_fd
442 };
443 
444 static struct fd_ops sDirectoryOps = {
445 	NULL,		// read()
446 	NULL,		// write()
447 	NULL,		// seek()
448 	common_ioctl,
449 	NULL,		// set_flags
450 	NULL,		// select()
451 	NULL,		// deselect()
452 	dir_read,
453 	dir_rewind,
454 	common_read_stat,
455 	common_write_stat,
456 	dir_close,
457 	dir_free_fd
458 };
459 
460 static struct fd_ops sAttributeDirectoryOps = {
461 	NULL,		// read()
462 	NULL,		// write()
463 	NULL,		// seek()
464 	common_ioctl,
465 	NULL,		// set_flags
466 	NULL,		// select()
467 	NULL,		// deselect()
468 	attr_dir_read,
469 	attr_dir_rewind,
470 	common_read_stat,
471 	common_write_stat,
472 	attr_dir_close,
473 	attr_dir_free_fd
474 };
475 
476 static struct fd_ops sAttributeOps = {
477 	attr_read,
478 	attr_write,
479 	attr_seek,
480 	common_ioctl,
481 	NULL,		// set_flags
482 	NULL,		// select()
483 	NULL,		// deselect()
484 	NULL,		// read_dir()
485 	NULL,		// rewind_dir()
486 	attr_read_stat,
487 	attr_write_stat,
488 	attr_close,
489 	attr_free_fd
490 };
491 
492 static struct fd_ops sIndexDirectoryOps = {
493 	NULL,		// read()
494 	NULL,		// write()
495 	NULL,		// seek()
496 	NULL,		// ioctl()
497 	NULL,		// set_flags
498 	NULL,		// select()
499 	NULL,		// deselect()
500 	index_dir_read,
501 	index_dir_rewind,
502 	NULL,		// read_stat()
503 	NULL,		// write_stat()
504 	index_dir_close,
505 	index_dir_free_fd
506 };
507 
508 #if 0
509 static struct fd_ops sIndexOps = {
510 	NULL,		// read()
511 	NULL,		// write()
512 	NULL,		// seek()
513 	NULL,		// ioctl()
514 	NULL,		// set_flags
515 	NULL,		// select()
516 	NULL,		// deselect()
517 	NULL,		// dir_read()
518 	NULL,		// dir_rewind()
519 	index_read_stat,	// read_stat()
520 	NULL,		// write_stat()
521 	NULL,		// dir_close()
522 	NULL		// free_fd()
523 };
524 #endif
525 
526 static struct fd_ops sQueryOps = {
527 	NULL,		// read()
528 	NULL,		// write()
529 	NULL,		// seek()
530 	NULL,		// ioctl()
531 	NULL,		// set_flags
532 	NULL,		// select()
533 	NULL,		// deselect()
534 	query_read,
535 	query_rewind,
536 	NULL,		// read_stat()
537 	NULL,		// write_stat()
538 	query_close,
539 	query_free_fd
540 };
541 
542 
543 namespace {
544 
545 class VNodePutter {
546 public:
547 	VNodePutter(struct vnode* vnode = NULL) : fVNode(vnode) {}
548 
549 	~VNodePutter()
550 	{
551 		Put();
552 	}
553 
554 	void SetTo(struct vnode* vnode)
555 	{
556 		Put();
557 		fVNode = vnode;
558 	}
559 
560 	void Put()
561 	{
562 		if (fVNode) {
563 			put_vnode(fVNode);
564 			fVNode = NULL;
565 		}
566 	}
567 
568 	struct vnode* Detach()
569 	{
570 		struct vnode* vnode = fVNode;
571 		fVNode = NULL;
572 		return vnode;
573 	}
574 
575 private:
576 	struct vnode* fVNode;
577 };
578 
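/* Usage sketch for VNodePutter; mountID/vnodeID are hypothetical. The
   reference acquired by get_vnode() is released automatically when the
   putter goes out of scope:

	struct vnode* vnode;
	if (get_vnode(mountID, vnodeID, &vnode, true, 0) == B_OK) {
		VNodePutter vnodePutter(vnode);
		// ... use vnode; put_vnode() also runs on early returns ...
	}
*/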
579 
580 class FDCloser {
581 public:
582 	FDCloser() : fFD(-1), fKernel(true) {}
583 
584 	FDCloser(int fd, bool kernel) : fFD(fd), fKernel(kernel) {}
585 
586 	~FDCloser()
587 	{
588 		Close();
589 	}
590 
591 	void SetTo(int fd, bool kernel)
592 	{
593 		Close();
594 		fFD = fd;
595 		fKernel = kernel;
596 	}
597 
598 	void Close()
599 	{
600 		if (fFD >= 0) {
601 			if (fKernel)
602 				_kern_close(fFD);
603 			else
604 				_user_close(fFD);
605 			fFD = -1;
606 		}
607 	}
608 
609 	int Detach()
610 	{
611 		int fd = fFD;
612 		fFD = -1;
613 		return fd;
614 	}
615 
616 private:
617 	int		fFD;
618 	bool	fKernel;
619 };
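
/* Usage sketch for FDCloser on an error path; "fd", "kernel" and
   "setupFailed" are hypothetical. Detach() hands ownership of the
   descriptor to the caller on success:

	FDCloser fdCloser(fd, kernel);
	if (setupFailed)
		return B_ERROR;
			// a valid fd is closed automatically here
	return fdCloser.Detach();
*/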
620 
621 } // namespace
622 
623 
624 #if VFS_PAGES_IO_TRACING
625 
626 namespace VFSPagesIOTracing {
627 
628 class PagesIOTraceEntry : public AbstractTraceEntry {
629 protected:
630 	PagesIOTraceEntry(struct vnode* vnode, void* cookie, off_t pos,
631 		const generic_io_vec* vecs, uint32 count, uint32 flags,
632 		generic_size_t bytesRequested, status_t status,
633 		generic_size_t bytesTransferred)
634 		:
635 		fVnode(vnode),
636 		fMountID(vnode->mount->id),
637 		fNodeID(vnode->id),
638 		fCookie(cookie),
639 		fPos(pos),
640 		fCount(count),
641 		fFlags(flags),
642 		fBytesRequested(bytesRequested),
643 		fStatus(status),
644 		fBytesTransferred(bytesTransferred)
645 	{
646 		fVecs = (generic_io_vec*)alloc_tracing_buffer_memcpy(vecs,
647 			sizeof(generic_io_vec) * count, false);
648 	}
649 
650 	void AddDump(TraceOutput& out, const char* mode)
651 	{
652 		out.Print("vfs pages io %5s: vnode: %p (%" B_PRId32 ", %" B_PRId64 "), "
653 			"cookie: %p, pos: %" B_PRIdOFF ", size: %" B_PRIu64 ", vecs: {",
654 			mode, fVnode, fMountID, fNodeID, fCookie, fPos,
655 			(uint64)fBytesRequested);
656 
657 		if (fVecs != NULL) {
658 			for (uint32 i = 0; i < fCount; i++) {
659 				if (i > 0)
660 					out.Print(", ");
661 				out.Print("(%" B_PRIx64 ", %" B_PRIu64 ")", (uint64)fVecs[i].base,
662 					(uint64)fVecs[i].length);
663 			}
664 		}
665 
666 		out.Print("}, flags: %#" B_PRIx32 " -> status: %#" B_PRIx32 ", "
667 			"transferred: %" B_PRIu64, fFlags, fStatus,
668 			(uint64)fBytesTransferred);
669 	}
670 
671 protected:
672 	struct vnode*	fVnode;
673 	dev_t			fMountID;
674 	ino_t			fNodeID;
675 	void*			fCookie;
676 	off_t			fPos;
677 	generic_io_vec*	fVecs;
678 	uint32			fCount;
679 	uint32			fFlags;
680 	generic_size_t	fBytesRequested;
681 	status_t		fStatus;
682 	generic_size_t	fBytesTransferred;
683 };
684 
685 
686 class ReadPages : public PagesIOTraceEntry {
687 public:
688 	ReadPages(struct vnode* vnode, void* cookie, off_t pos,
689 		const generic_io_vec* vecs, uint32 count, uint32 flags,
690 		generic_size_t bytesRequested, status_t status,
691 		generic_size_t bytesTransferred)
692 		:
693 		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
694 			bytesRequested, status, bytesTransferred)
695 	{
696 		Initialized();
697 	}
698 
699 	virtual void AddDump(TraceOutput& out)
700 	{
701 		PagesIOTraceEntry::AddDump(out, "read");
702 	}
703 };
704 
705 
706 class WritePages : public PagesIOTraceEntry {
707 public:
708 	WritePages(struct vnode* vnode, void* cookie, off_t pos,
709 		const generic_io_vec* vecs, uint32 count, uint32 flags,
710 		generic_size_t bytesRequested, status_t status,
711 		generic_size_t bytesTransferred)
712 		:
713 		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
714 			bytesRequested, status, bytesTransferred)
715 	{
716 		Initialized();
717 	}
718 
719 	virtual void AddDump(TraceOutput& out)
720 	{
721 		PagesIOTraceEntry::AddDump(out, "write");
722 	}
723 };
724 
725 }	// namespace VFSPagesIOTracing
726 
727 #	define TPIO(x) new(std::nothrow) VFSPagesIOTracing::x;
728 #else
729 #	define TPIO(x) ;
730 #endif	// VFS_PAGES_IO_TRACING
731 
732 
733 /*! Finds the mounted device (the fs_mount structure) with the given ID.
734 	Note: you must hold the sMountMutex lock when you call this function.
735 */
736 static struct fs_mount*
737 find_mount(dev_t id)
738 {
739 	ASSERT_LOCKED_MUTEX(&sMountMutex);
740 
741 	return sMountsTable->Lookup(id);
742 }
743 
744 
745 static status_t
746 get_mount(dev_t id, struct fs_mount** _mount)
747 {
748 	struct fs_mount* mount;
749 
750 	ReadLocker nodeLocker(sVnodeLock);
751 	MutexLocker mountLocker(sMountMutex);
752 
753 	mount = find_mount(id);
754 	if (mount == NULL)
755 		return B_BAD_VALUE;
756 
757 	struct vnode* rootNode = mount->root_vnode;
758 	if (mount->unmounting || rootNode == NULL || rootNode->IsBusy()
759 		|| rootNode->ref_count == 0) {
760 		// might have been called during a mount/unmount operation
761 		return B_BUSY;
762 	}
763 
764 	inc_vnode_ref_count(rootNode);
765 	*_mount = mount;
766 	return B_OK;
767 }
768 
769 
770 static void
771 put_mount(struct fs_mount* mount)
772 {
773 	if (mount)
774 		put_vnode(mount->root_vnode);
775 }
776 
777 
778 /*!	Tries to open the specified file system module.
779 	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1".
780 	Returns a pointer to the file system module interface, or NULL if the
781 	module could not be opened.
782 */
783 static file_system_module_info*
784 get_file_system(const char* fsName)
785 {
786 	char name[B_FILE_NAME_LENGTH];
787 	if (strncmp(fsName, "file_systems/", strlen("file_systems/"))) {
788 		// construct module name if we didn't get one
789 		// (we currently support only one API)
790 		snprintf(name, sizeof(name), "file_systems/%s/v1", fsName);
791 		fsName = NULL;
792 	}
793 
794 	file_system_module_info* info;
795 	if (get_module(fsName ? fsName : name, (module_info**)&info) != B_OK)
796 		return NULL;
797 
798 	return info;
799 }
800 
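/* Example: get_file_system("bfs") constructs and loads the module name
   "file_systems/bfs/v1", while get_file_system("file_systems/bfs/v1")
   loads the given module name directly. */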
801 
802 /*!	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1"
803 	and returns a compatible fs_info.fsh_name name ("bfs" in both cases).
804 	The name is allocated for you, and you have to free() it when you're
805 	done with it.
806 	Returns NULL if the required memory is not available.
807 */
808 static char*
809 get_file_system_name(const char* fsName)
810 {
811 	const size_t length = strlen("file_systems/");
812 
813 	if (strncmp(fsName, "file_systems/", length)) {
814 		// the name already seems to be the module's file name
815 		return strdup(fsName);
816 	}
817 
818 	fsName += length;
819 	const char* end = strchr(fsName, '/');
820 	if (end == NULL) {
821 		// this doesn't seem to be a valid name, but well...
822 		return strdup(fsName);
823 	}
824 
825 	// cut off the trailing /v1
826 
827 	char* name = (char*)malloc(end + 1 - fsName);
828 	if (name == NULL)
829 		return NULL;
830 
831 	strlcpy(name, fsName, end + 1 - fsName);
832 	return name;
833 }
834 
835 
836 /*!	Accepts a list of file system names separated by colons, one for each
837 	layer, and returns the file system name for the specified layer.
838 	The name is allocated for you, and you have to free() it when you're
839 	done with it.
840 	Returns NULL if the required memory is not available or if there is no
841 	name for the specified layer.
842 */
843 static char*
844 get_file_system_name_for_layer(const char* fsNames, int32 layer)
845 {
846 	while (layer >= 0) {
847 		const char* end = strchr(fsNames, ':');
848 		if (end == NULL) {
849 			if (layer == 0)
850 				return strdup(fsNames);
851 			return NULL;
852 		}
853 
854 		if (layer == 0) {
855 			size_t length = end - fsNames + 1;
856 			char* result = (char*)malloc(length);
			if (result == NULL)
				return NULL;
857 			strlcpy(result, fsNames, length);
858 			return result;
859 		}
860 
861 		fsNames = end + 1;
862 		layer--;
863 	}
864 
865 	return NULL;
866 }
867 
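/* Example, using the hypothetical two-layer name "bfs:write_overlay":
   layer 0 yields "bfs", layer 1 yields "write_overlay", and any higher
   layer yields NULL. */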
868 
869 static void
870 add_vnode_to_mount_list(struct vnode* vnode, struct fs_mount* mount)
871 {
872 	MutexLocker _(mount->lock);
873 	mount->vnodes.Add(vnode);
874 }
875 
876 
877 static void
878 remove_vnode_from_mount_list(struct vnode* vnode, struct fs_mount* mount)
879 {
880 	MutexLocker _(mount->lock);
881 	mount->vnodes.Remove(vnode);
882 }
883 
884 
885 /*!	\brief Looks up a vnode by mount and node ID in the sVnodeTable.
886 
887 	The caller must hold the sVnodeLock (read lock at least).
888 
889 	\param mountID the mount ID.
890 	\param vnodeID the node ID.
891 
892 	\return The vnode structure, if it was found in the hash table, \c NULL
893 			otherwise.
894 */
895 static struct vnode*
896 lookup_vnode(dev_t mountID, ino_t vnodeID)
897 {
898 	struct vnode_hash_key key;
899 
900 	key.device = mountID;
901 	key.vnode = vnodeID;
902 
903 	return sVnodeTable->Lookup(key);
904 }
905 
906 
907 /*!	\brief Checks whether or not a busy vnode should be waited for (again).
908 
909 	This will also wait for BUSY_VNODE_DELAY before returning, if one should
910 	still wait for the vnode to become unbusy.
911 
912 	\return \c true if one should retry, \c false if not.
913 */
914 static bool
915 retry_busy_vnode(int32& tries, dev_t mountID, ino_t vnodeID)
916 {
917 	if (--tries < 0) {
918 		// vnode doesn't seem to become unbusy
919 		dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO
920 			" is not becoming unbusy!\n", mountID, vnodeID);
921 		return false;
922 	}
923 	snooze(BUSY_VNODE_DELAY);
924 	return true;
925 }
926 
927 
928 /*!	Creates a new vnode with the given mount and node ID.
929 	If the node already exists, it is returned instead and no new node is
930 	created. In either case -- but not if an error occurs -- the function write
931 	locks \c sVnodeLock and keeps it locked for the caller when returning. On
932 	error the lock is not held on return.
933 
934 	\param mountID The mount ID.
935 	\param vnodeID The vnode ID.
936 	\param _vnode Will be set to the new vnode on success.
937 	\param _nodeCreated Will be set to \c true when the returned vnode has
938 		been newly created, \c false when it already existed. Will not be
939 		changed on error.
940 	\return \c B_OK, when the vnode was successfully created and inserted or
941 		a node with the given ID was found, \c B_NO_MEMORY or
942 		\c B_ENTRY_NOT_FOUND on error.
943 */
944 static status_t
945 create_new_vnode_and_lock(dev_t mountID, ino_t vnodeID, struct vnode*& _vnode,
946 	bool& _nodeCreated)
947 {
948 	FUNCTION(("create_new_vnode_and_lock()\n"));
949 
950 	struct vnode* vnode = (struct vnode*)malloc(sizeof(struct vnode));
951 	if (vnode == NULL)
952 		return B_NO_MEMORY;
953 
954 	// initialize basic values
955 	memset(vnode, 0, sizeof(struct vnode));
956 	vnode->device = mountID;
957 	vnode->id = vnodeID;
958 	vnode->ref_count = 1;
959 	vnode->SetBusy(true);
960 
961 	// look up the node -- it might have been added by someone else in the
962 	// meantime
963 	rw_lock_write_lock(&sVnodeLock);
964 	struct vnode* existingVnode = lookup_vnode(mountID, vnodeID);
965 	if (existingVnode != NULL) {
966 		free(vnode);
967 		_vnode = existingVnode;
968 		_nodeCreated = false;
969 		return B_OK;
970 	}
971 
972 	// get the mount structure
973 	mutex_lock(&sMountMutex);
974 	vnode->mount = find_mount(mountID);
975 	if (!vnode->mount || vnode->mount->unmounting) {
976 		mutex_unlock(&sMountMutex);
977 		rw_lock_write_unlock(&sVnodeLock);
978 		free(vnode);
979 		return B_ENTRY_NOT_FOUND;
980 	}
981 
982 	// add the vnode to the mount's node list and the hash table
983 	sVnodeTable->Insert(vnode);
984 	add_vnode_to_mount_list(vnode, vnode->mount);
985 
986 	mutex_unlock(&sMountMutex);
987 
988 	_vnode = vnode;
989 	_nodeCreated = true;
990 
991 	// keep the vnode lock locked
992 	return B_OK;
993 }
994 
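/* Usage sketch (cf. get_vnode() below); mountID/vnodeID are hypothetical.
   On success the caller holds the write lock on sVnodeLock and must drop
   it itself:

	struct vnode* vnode;
	bool nodeCreated;
	if (create_new_vnode_and_lock(mountID, vnodeID, vnode,
			nodeCreated) == B_OK) {
		// if nodeCreated, the still busy vnode can now be initialized
		rw_lock_write_unlock(&sVnodeLock);
	}
*/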
995 
996 /*!	Frees the vnode and all resources it has acquired, and removes
997 	it from the vnode hash as well as from its mount structure.
998 	Will also make sure that any cache modifications are written back.
999 */
1000 static void
1001 free_vnode(struct vnode* vnode, bool reenter)
1002 {
1003 	ASSERT_PRINT(vnode->ref_count == 0 && vnode->IsBusy(), "vnode: %p\n",
1004 		vnode);
1005 	ASSERT_PRINT(vnode->advisory_locking == NULL, "vnode: %p\n", vnode);
1006 
1007 	// write back any changes in this vnode's cache -- but only
1008 	// if the vnode won't be deleted, in which case the changes
1009 	// will be discarded
1010 
1011 	if (!vnode->IsRemoved() && HAS_FS_CALL(vnode, fsync))
1012 		FS_CALL_NO_PARAMS(vnode, fsync);
1013 
1014 	// Note: If this vnode has a cache attached, there will still be two
1015 	// references to that cache at this point. The last one belongs to the vnode
1016 	// itself (cf. vfs_get_vnode_cache()) and one belongs to the node's file
1017 	// cache. Each but the last reference to a cache also includes a reference
1018 	// to the vnode. The file cache, however, released its reference (cf.
1019 	// file_cache_create()), so that this vnode's ref count has the chance to
1020 	// ever drop to 0. Deleting the file cache now will cause the next-to-last
1021 	// cache reference to be released, which will also release a (no longer
1022 	// existing) vnode reference. To avoid problems, we set the vnode's ref
1023 	// count, so that it will neither become negative nor 0.
1024 	vnode->ref_count = 2;
1025 
1026 	if (!vnode->IsUnpublished()) {
1027 		if (vnode->IsRemoved())
1028 			FS_CALL(vnode, remove_vnode, reenter);
1029 		else
1030 			FS_CALL(vnode, put_vnode, reenter);
1031 	}
1032 
1033 	// If the vnode has a VMCache attached, make sure that it won't try to get
1034 	// another reference via VMVnodeCache::AcquireUnreferencedStoreRef(). As
1035 	// long as the vnode is busy and in the hash, that won't happen, but as
1036 	// soon as we've removed it from the hash, it could reload the vnode -- with
1037 	// a new cache attached!
1038 	if (vnode->cache != NULL && vnode->cache->type == CACHE_TYPE_VNODE)
1039 		((VMVnodeCache*)vnode->cache)->VnodeDeleted();
1040 
1041 	// The file system has removed the resources of the vnode now, so we can
1042 	// make it available again (by removing the busy vnode from the hash).
1043 	rw_lock_write_lock(&sVnodeLock);
1044 	sVnodeTable->Remove(vnode);
1045 	rw_lock_write_unlock(&sVnodeLock);
1046 
1047 	// if we have a VMCache attached, remove it
1048 	if (vnode->cache)
1049 		vnode->cache->ReleaseRef();
1050 
1051 	vnode->cache = NULL;
1052 
1053 	remove_vnode_from_mount_list(vnode, vnode->mount);
1054 
1055 	free(vnode);
1056 }
1057 
1058 
1059 /*!	\brief Decrements the reference counter of the given vnode and deletes it,
1060 	if the counter dropped to 0.
1061 
1062 	The caller must, of course, own a reference to the vnode to call this
1063 	function.
1064 	The caller must not hold the sVnodeLock or the sMountMutex.
1065 
1066 	\param vnode the vnode.
1067 	\param alwaysFree don't move this vnode into the unused list, but really
1068 		   delete it if possible.
1069 	\param reenter \c true, if this function is called (indirectly) from within
1070 		   a file system. This will be passed to file system hooks only.
1071 	\return \c B_OK, if everything went fine, an error code otherwise.
1072 */
1073 static status_t
1074 dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree, bool reenter)
1075 {
1076 	ReadLocker locker(sVnodeLock);
1077 	AutoLocker<Vnode> nodeLocker(vnode);
1078 
1079 	int32 oldRefCount = atomic_add(&vnode->ref_count, -1);
1080 
1081 	ASSERT_PRINT(oldRefCount > 0, "vnode %p\n", vnode);
1082 
1083 	TRACE(("dec_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
1084 		vnode->ref_count));
1085 
1086 	if (oldRefCount != 1)
1087 		return B_OK;
1088 
1089 	if (vnode->IsBusy())
1090 		panic("dec_vnode_ref_count: called on busy vnode %p\n", vnode);
1091 
1092 	bool freeNode = false;
1093 	bool freeUnusedNodes = false;
1094 
1095 	// Just insert the vnode into an unused list if we don't need
1096 	// to delete it
1097 	if (vnode->IsRemoved() || alwaysFree) {
1098 		vnode_to_be_freed(vnode);
1099 		vnode->SetBusy(true);
1100 		freeNode = true;
1101 	} else
1102 		freeUnusedNodes = vnode_unused(vnode);
1103 
1104 	nodeLocker.Unlock();
1105 	locker.Unlock();
1106 
1107 	if (freeNode)
1108 		free_vnode(vnode, reenter);
1109 	else if (freeUnusedNodes)
1110 		free_unused_vnodes();
1111 
1112 	return B_OK;
1113 }
1114 
1115 
1116 /*!	\brief Increments the reference counter of the given vnode.
1117 
1118 	The caller must make sure that the node isn't deleted while this function
1119 	is called. This can be done either:
1120 	- by ensuring that a reference to the node exists and remains in existence,
1121 	  or
1122 	- by holding the vnode's lock (which also requires read locking sVnodeLock)
1123 	  or by holding sVnodeLock write locked.
1124 
1125 	In the second case the caller is responsible for dealing with the ref count
1126 	0 -> 1 transition. That is, 1. this function must not be invoked when the
1127 	node is busy in the first place, and 2. vnode_used() must be called for the
1128 	node.
1129 
1130 	\param vnode the vnode.
1131 */
1132 static void
1133 inc_vnode_ref_count(struct vnode* vnode)
1134 {
1135 	atomic_add(&vnode->ref_count, 1);
1136 	TRACE(("inc_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
1137 		vnode->ref_count));
1138 }
1139 
1140 
1141 static bool
1142 is_special_node_type(int type)
1143 {
1144 	// at the moment only FIFOs are supported
1145 	return S_ISFIFO(type);
1146 }
1147 
1148 
1149 static status_t
1150 create_special_sub_node(struct vnode* vnode, uint32 flags)
1151 {
1152 	if (S_ISFIFO(vnode->Type()))
1153 		return create_fifo_vnode(vnode->mount->volume, vnode);
1154 
1155 	return B_BAD_VALUE;
1156 }
1157 
1158 
1159 /*!	\brief Retrieves a vnode for a given mount ID, node ID pair.
1160 
1161 	If the node is not yet in memory, it will be loaded.
1162 
1163 	The caller must not hold the sVnodeLock or the sMountMutex.
1164 
1165 	\param mountID the mount ID.
1166 	\param vnodeID the node ID.
1167 	\param _vnode Pointer to a vnode* variable into which the pointer to the
1168 		   retrieved vnode structure shall be written.
1169 	\param canWait \c true, if the function may wait (for a bounded time)
		   for a busy vnode to become unbusy; otherwise it returns \c B_BUSY
		   right away.
1170 	\param reenter \c true, if this function is called (indirectly) from within
		   a file system.
1171 	\return \c B_OK, if everything went fine, an error code otherwise.
1172 */
1173 static status_t
1174 get_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode, bool canWait,
1175 	int reenter)
1176 {
1177 	FUNCTION(("get_vnode: mountid %" B_PRId32 " vnid 0x%" B_PRIx64 " %p\n",
1178 		mountID, vnodeID, _vnode));
1179 
1180 	rw_lock_read_lock(&sVnodeLock);
1181 
1182 	int32 tries = BUSY_VNODE_RETRIES;
1183 restart:
1184 	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
1185 	AutoLocker<Vnode> nodeLocker(vnode);
1186 
1187 	if (vnode && vnode->IsBusy()) {
1188 		nodeLocker.Unlock();
1189 		rw_lock_read_unlock(&sVnodeLock);
1190 		if (!canWait) {
1191 			dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO " is busy!\n",
1192 				mountID, vnodeID);
1193 			return B_BUSY;
1194 		}
1195 		if (!retry_busy_vnode(tries, mountID, vnodeID))
1196 			return B_BUSY;
1197 
1198 		rw_lock_read_lock(&sVnodeLock);
1199 		goto restart;
1200 	}
1201 
1202 	TRACE(("get_vnode: tried to lookup vnode, got %p\n", vnode));
1203 
1204 	status_t status;
1205 
1206 	if (vnode) {
1207 		if (vnode->ref_count == 0) {
1208 			// this vnode has been unused before
1209 			vnode_used(vnode);
1210 		}
1211 		inc_vnode_ref_count(vnode);
1212 
1213 		nodeLocker.Unlock();
1214 		rw_lock_read_unlock(&sVnodeLock);
1215 	} else {
1216 		// we need to create a new vnode and read it in
1217 		rw_lock_read_unlock(&sVnodeLock);
1218 			// unlock -- create_new_vnode_and_lock() write-locks on success
1219 		bool nodeCreated;
1220 		status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
1221 			nodeCreated);
1222 		if (status != B_OK)
1223 			return status;
1224 
1225 		if (!nodeCreated) {
1226 			rw_lock_read_lock(&sVnodeLock);
1227 			rw_lock_write_unlock(&sVnodeLock);
1228 			goto restart;
1229 		}
1230 
1231 		rw_lock_write_unlock(&sVnodeLock);
1232 
1233 		int type;
1234 		uint32 flags;
1235 		status = FS_MOUNT_CALL(vnode->mount, get_vnode, vnodeID, vnode, &type,
1236 			&flags, reenter);
1237 		if (status == B_OK && vnode->private_node == NULL)
1238 			status = B_BAD_VALUE;
1239 
1240 		bool gotNode = status == B_OK;
1241 		bool publishSpecialSubNode = false;
1242 		if (gotNode) {
1243 			vnode->SetType(type);
1244 			publishSpecialSubNode = is_special_node_type(type)
1245 				&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
1246 		}
1247 
1248 		if (gotNode && publishSpecialSubNode)
1249 			status = create_special_sub_node(vnode, flags);
1250 
1251 		if (status != B_OK) {
1252 			if (gotNode)
1253 				FS_CALL(vnode, put_vnode, reenter);
1254 
1255 			rw_lock_write_lock(&sVnodeLock);
1256 			sVnodeTable->Remove(vnode);
1257 			remove_vnode_from_mount_list(vnode, vnode->mount);
1258 			rw_lock_write_unlock(&sVnodeLock);
1259 
1260 			free(vnode);
1261 			return status;
1262 		}
1263 
1264 		rw_lock_read_lock(&sVnodeLock);
1265 		vnode->Lock();
1266 
1267 		vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
1268 		vnode->SetBusy(false);
1269 
1270 		vnode->Unlock();
1271 		rw_lock_read_unlock(&sVnodeLock);
1272 	}
1273 
1274 	TRACE(("get_vnode: returning %p\n", vnode));
1275 
1276 	*_vnode = vnode;
1277 	return B_OK;
1278 }
1279 
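/* The canonical pairing of the two functions above and below; mountID and
   vnodeID are hypothetical. Every successful get_vnode() must be balanced
   by a put_vnode():

	struct vnode* vnode;
	status_t status = get_vnode(mountID, vnodeID, &vnode, true, 0);
	if (status == B_OK) {
		// ... use vnode ...
		put_vnode(vnode);
	}
*/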
1280 
1281 /*!	\brief Decrements the reference counter of the given vnode and deletes it,
1282 	if the counter dropped to 0.
1283 
1284 	The caller must, of course, own a reference to the vnode to call this
1285 	function.
1286 	The caller must not hold the sVnodeLock or the sMountMutex.
1287 
1288 	\param vnode the vnode.
1289 */
1290 static inline void
1291 put_vnode(struct vnode* vnode)
1292 {
1293 	dec_vnode_ref_count(vnode, false, false);
1294 }
1295 
1296 
1297 static void
1298 free_unused_vnodes(int32 level)
1299 {
1300 	unused_vnodes_check_started();
1301 
1302 	if (level == B_NO_LOW_RESOURCE) {
1303 		unused_vnodes_check_done();
1304 		return;
1305 	}
1306 
1307 	flush_hot_vnodes();
1308 
1309 	// determine how many nodes to free
1310 	uint32 count = 1;
1311 	{
1312 		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
1313 
1314 		switch (level) {
1315 			case B_LOW_RESOURCE_NOTE:
1316 				count = sUnusedVnodes / 100;
1317 				break;
1318 			case B_LOW_RESOURCE_WARNING:
1319 				count = sUnusedVnodes / 10;
1320 				break;
1321 			case B_LOW_RESOURCE_CRITICAL:
1322 				count = sUnusedVnodes;
1323 				break;
1324 		}
1325 
1326 		if (count > sUnusedVnodes)
1327 			count = sUnusedVnodes;
1328 	}
1329 
1330 	// Write back the modified pages of some unused vnodes and free them.
1331 
1332 	for (uint32 i = 0; i < count; i++) {
1333 		ReadLocker vnodesReadLocker(sVnodeLock);
1334 
1335 		// get the first node
1336 		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
1337 		struct vnode* vnode = (struct vnode*)list_get_first_item(
1338 			&sUnusedVnodeList);
1339 		unusedVnodesLocker.Unlock();
1340 
1341 		if (vnode == NULL)
1342 			break;
1343 
1344 		// lock the node
1345 		AutoLocker<Vnode> nodeLocker(vnode);
1346 
1347 		// Check whether the node is still unused -- since we only append to the
1348 		// tail of the unused queue, the vnode should still be at its head.
1349 		// Alternatively we could check its ref count for 0 and its busy flag,
1350 		// but if the node is no longer at the head of the queue, it means it
1351 		// has been touched in the meantime, i.e. it is no longer the least
1352 	// recently used unused vnode and we'd rather not free it.
1353 		unusedVnodesLocker.Lock();
1354 		if (vnode != list_get_first_item(&sUnusedVnodeList))
1355 			continue;
1356 		unusedVnodesLocker.Unlock();
1357 
1358 		ASSERT(!vnode->IsBusy());
1359 
1360 		// grab a reference
1361 		inc_vnode_ref_count(vnode);
1362 		vnode_used(vnode);
1363 
1364 		// write back changes and free the node
1365 		nodeLocker.Unlock();
1366 		vnodesReadLocker.Unlock();
1367 
1368 		if (vnode->cache != NULL)
1369 			vnode->cache->WriteModified();
1370 
1371 		dec_vnode_ref_count(vnode, true, false);
1372 			// this should free the vnode when it's still unused
1373 	}
1374 
1375 	unused_vnodes_check_done();
1376 }
1377 
1378 
1379 /*!	Gets the vnode the given vnode is covering.
1380 
1381 	The caller must have \c sVnodeLock read-locked at least.
1382 
1383 	The function returns a reference to the retrieved vnode (if any), which
1384 	the caller is responsible for releasing.
1385 
1386 	\param vnode The vnode whose covered node shall be returned.
1387 	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
1388 		vnode.
1389 */
1390 static inline Vnode*
1391 get_covered_vnode_locked(Vnode* vnode)
1392 {
1393 	if (Vnode* coveredNode = vnode->covers) {
1394 		while (coveredNode->covers != NULL)
1395 			coveredNode = coveredNode->covers;
1396 
1397 		inc_vnode_ref_count(coveredNode);
1398 		return coveredNode;
1399 	}
1400 
1401 	return NULL;
1402 }
1403 
1404 
1405 /*!	Gets the vnode the given vnode is covering.
1406 
1407 	The caller must not hold \c sVnodeLock. Note that this implies a race
1408 	condition, since the situation can change at any time.
1409 
1410 	The function returns a reference to the retrieved vnode (if any), which
1411 	the caller is responsible for releasing.
1412 
1413 	\param vnode The vnode whose covered node shall be returned.
1414 	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
1415 		vnode.
1416 */
1417 static inline Vnode*
1418 get_covered_vnode(Vnode* vnode)
1419 {
1420 	if (!vnode->IsCovering())
1421 		return NULL;
1422 
1423 	ReadLocker vnodeReadLocker(sVnodeLock);
1424 	return get_covered_vnode_locked(vnode);
1425 }
1426 
1427 
1428 /*!	Gets the vnode the given vnode is covered by.
1429 
1430 	The caller must have \c sVnodeLock read-locked at least.
1431 
1432 	The function returns a reference to the retrieved vnode (if any), which
1433 	the caller is responsible for releasing.
1434 
1435 	\param vnode The vnode whose covering node shall be returned.
1436 	\return The covering vnode, or \c NULL if the given vnode isn't covered by
1437 		any vnode.
1438 */
1439 static Vnode*
1440 get_covering_vnode_locked(Vnode* vnode)
1441 {
1442 	if (Vnode* coveringNode = vnode->covered_by) {
1443 		while (coveringNode->covered_by != NULL)
1444 			coveringNode = coveringNode->covered_by;
1445 
1446 		inc_vnode_ref_count(coveringNode);
1447 		return coveringNode;
1448 	}
1449 
1450 	return NULL;
1451 }
1452 
1453 
1454 /*!	Gets the vnode the given vnode is covered by.
1455 
1456 	The caller must not hold \c sVnodeLock. Note that this implies a race
1457 	condition, since the situation can change at any time.
1458 
1459 	The function returns a reference to the retrieved vnode (if any), which
1460 	the caller is responsible for releasing.
1461 
1462 	\param vnode The vnode whose covering node shall be returned.
1463 	\return The covering vnode, or \c NULL if the given vnode isn't covered by
1464 		any vnode.
1465 */
1466 static inline Vnode*
1467 get_covering_vnode(Vnode* vnode)
1468 {
1469 	if (!vnode->IsCovered())
1470 		return NULL;
1471 
1472 	ReadLocker vnodeReadLocker(sVnodeLock);
1473 	return get_covering_vnode_locked(vnode);
1474 }
1475 
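/* Sketch: resolving a covered directory to the root of the file system
   mounted on it ("vnode" is a hypothetical directory vnode):

	if (Vnode* coveringNode = get_covering_vnode(vnode)) {
		// operate on the mounted file system's root instead of the
		// covered directory
		put_vnode(coveringNode);
	}
*/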
1476 
1477 static void
1478 free_unused_vnodes()
1479 {
1480 	free_unused_vnodes(
1481 		low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1482 			| B_KERNEL_RESOURCE_ADDRESS_SPACE));
1483 }
1484 
1485 
1486 static void
1487 vnode_low_resource_handler(void* /*data*/, uint32 resources, int32 level)
1488 {
1489 	TRACE(("vnode_low_resource_handler(level = %" B_PRId32 ")\n", level));
1490 
1491 	free_unused_vnodes(level);
1492 }
1493 
1494 
1495 static inline void
1496 put_advisory_locking(struct advisory_locking* locking)
1497 {
1498 	release_sem(locking->lock);
1499 }
1500 
1501 
1502 /*!	Returns the advisory_locking object of the \a vnode in case it
1503 	has one, and locks it.
1504 	You have to call put_advisory_locking() when you're done with
1505 	it.
1506 	Note: you must not hold the vnode's lock when calling
1507 	this function.
1508 */
1509 static struct advisory_locking*
1510 get_advisory_locking(struct vnode* vnode)
1511 {
1512 	rw_lock_read_lock(&sVnodeLock);
1513 	vnode->Lock();
1514 
1515 	struct advisory_locking* locking = vnode->advisory_locking;
1516 	sem_id lock = locking != NULL ? locking->lock : B_ERROR;
1517 
1518 	vnode->Unlock();
1519 	rw_lock_read_unlock(&sVnodeLock);
1520 
1521 	if (lock >= 0)
1522 		lock = acquire_sem(lock);
1523 	if (lock < 0) {
1524 		// This means the locking has been deleted in the meantime
1525 		// or had never existed in the first place - otherwise, we
1526 		// would get the lock at some point.
1527 		return NULL;
1528 	}
1529 
1530 	return locking;
1531 }
1532 
1533 
1534 /*!	Creates a locked advisory_locking object, and attaches it to the
1535 	given \a vnode.
1536 	Returns B_OK in case of success; if the vnode acquired such an
1537 	object from someone else in the meantime, you'll still get that
1538 	one locked then.
1539 */
1540 static status_t
1541 create_advisory_locking(struct vnode* vnode)
1542 {
1543 	if (vnode == NULL)
1544 		return B_FILE_ERROR;
1545 
1546 	ObjectDeleter<advisory_locking> lockingDeleter;
1547 	struct advisory_locking* locking = NULL;
1548 
1549 	while (get_advisory_locking(vnode) == NULL) {
1550 		// no locking object set on the vnode yet, create one
1551 		if (locking == NULL) {
1552 			locking = new(std::nothrow) advisory_locking;
1553 			if (locking == NULL)
1554 				return B_NO_MEMORY;
1555 			lockingDeleter.SetTo(locking);
1556 
1557 			locking->wait_sem = create_sem(0, "advisory lock");
1558 			if (locking->wait_sem < 0)
1559 				return locking->wait_sem;
1560 
1561 			locking->lock = create_sem(0, "advisory locking");
1562 			if (locking->lock < 0)
1563 				return locking->lock;
1564 		}
1565 
1566 		// set our newly created locking object
1567 		ReadLocker _(sVnodeLock);
1568 		AutoLocker<Vnode> nodeLocker(vnode);
1569 		if (vnode->advisory_locking == NULL) {
1570 			vnode->advisory_locking = locking;
1571 			lockingDeleter.Detach();
1572 			return B_OK;
1573 		}
1574 	}
1575 
1576 	// The vnode already had a locking object. That's just as well.
1577 
1578 	return B_OK;
1579 }
1580 
1581 
1582 /*! Returns \c true when either \a flock is \c NULL or \a flock intersects
1583 	with the advisory_lock \a lock.
1584 */
1585 static bool
1586 advisory_lock_intersects(struct advisory_lock* lock, struct flock* flock)
1587 {
1588 	if (flock == NULL)
1589 		return true;
1590 
1591 	return lock->start <= flock->l_start - 1 + flock->l_len
1592 		&& lock->end >= flock->l_start;
1593 }
1594 
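/* Worked example: a lock spanning bytes 100..199 intersects a flock with
   l_start = 150 and l_len = 100 (i.e. bytes 150..249), since
   100 <= 150 - 1 + 100 = 249 and 199 >= 150. A flock covering bytes
   200..249 would not intersect: 100 <= 249 holds, but 199 >= 200 fails. */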
1595 
1596 /*!	Tests whether acquiring a lock would block.
1597 */
1598 static status_t
1599 test_advisory_lock(struct vnode* vnode, struct flock* flock)
1600 {
1601 	flock->l_type = F_UNLCK;
1602 
1603 	struct advisory_locking* locking = get_advisory_locking(vnode);
1604 	if (locking == NULL)
1605 		return B_OK;
1606 
1607 	team_id team = team_get_current_team_id();
1608 
1609 	LockList::Iterator iterator = locking->locks.GetIterator();
1610 	while (iterator.HasNext()) {
1611 		struct advisory_lock* lock = iterator.Next();
1612 
1613 		if (lock->team != team && advisory_lock_intersects(lock, flock)) {
1614 			// locks do overlap
1615 			if (flock->l_type != F_RDLCK || !lock->shared) {
1616 				// collision
1617 				flock->l_type = lock->shared ? F_RDLCK : F_WRLCK;
1618 				flock->l_whence = SEEK_SET;
1619 				flock->l_start = lock->start;
1620 				flock->l_len = lock->end - lock->start + 1;
1621 				flock->l_pid = lock->team;
1622 				break;
1623 			}
1624 		}
1625 	}
1626 
1627 	put_advisory_locking(locking);
1628 	return B_OK;
1629 }
1630 
1631 
1632 /*!	Removes the specified lock, or all locks of the calling team
1633 	if \a flock is NULL.
1634 */
1635 static status_t
1636 release_advisory_lock(struct vnode* vnode, struct io_context* context,
1637 	struct file_descriptor* descriptor, struct flock* flock)
1638 {
1639 	FUNCTION(("release_advisory_lock(vnode = %p, flock = %p)\n", vnode, flock));
1640 
1641 	struct advisory_locking* locking = get_advisory_locking(vnode);
1642 	if (locking == NULL)
1643 		return B_OK;
1644 
1645 	// find matching lock entries
1646 
1647 	LockList::Iterator iterator = locking->locks.GetIterator();
1648 	while (iterator.HasNext()) {
1649 		struct advisory_lock* lock = iterator.Next();
1650 		bool removeLock = false;
1651 
1652 		if (descriptor != NULL && lock->bound_to == descriptor) {
1653 			// Remove flock() locks
1654 			removeLock = true;
1655 		} else if (lock->bound_to == context
1656 				&& advisory_lock_intersects(lock, flock)) {
1657 			// Remove POSIX locks
1658 			bool endsBeyond = false;
1659 			bool startsBefore = false;
1660 			if (flock != NULL) {
1661 				startsBefore = lock->start < flock->l_start;
1662 				endsBeyond = lock->end > flock->l_start - 1 + flock->l_len;
1663 			}
1664 
1665 			if (!startsBefore && !endsBeyond) {
1666 				// lock is completely contained in flock
1667 				removeLock = true;
1668 			} else if (startsBefore && !endsBeyond) {
1669 				// cut the end of the lock
1670 				lock->end = flock->l_start - 1;
1671 			} else if (!startsBefore && endsBeyond) {
1672 				// cut the start of the lock
1673 				lock->start = flock->l_start + flock->l_len;
1674 			} else {
1675 				// divide the lock into two locks
1676 				struct advisory_lock* secondLock = new(std::nothrow) advisory_lock;
1677 				if (secondLock == NULL) {
1678 					// TODO: we should probably revert the locks we already
1679 					// changed... (ie. allocate upfront)
1680 					put_advisory_locking(locking);
1681 					return B_NO_MEMORY;
1682 				}
1683 
1684 				secondLock->bound_to = context;
1685 				secondLock->team = lock->team;
1686 				secondLock->session = lock->session;
1687 				// values must already be normalized when getting here
1688 				secondLock->start = flock->l_start + flock->l_len;
1689 				secondLock->end = lock->end;
1690 					// copy the original end before truncating it below
1691 				lock->end = flock->l_start - 1;
1692 				secondLock->shared = lock->shared;
1693 
1694 				locking->locks.Add(secondLock);
1695 			}
1696 		}
1697 
1698 		if (removeLock) {
1699 			// this lock is no longer used
1700 			iterator.Remove();
1701 			free(lock);
1702 		}
1703 	}
1704 
1705 	bool removeLocking = locking->locks.IsEmpty();
1706 	release_sem_etc(locking->wait_sem, 1, B_RELEASE_ALL);
1707 
1708 	put_advisory_locking(locking);
1709 
1710 	if (removeLocking) {
1711 		// We can remove the whole advisory locking structure; it's no
1712 		// longer used
1713 		locking = get_advisory_locking(vnode);
1714 		if (locking != NULL) {
1715 			ReadLocker locker(sVnodeLock);
1716 			AutoLocker<Vnode> nodeLocker(vnode);
1717 
1718 			// the locking could have been changed in the mean time
1719 			if (locking->locks.IsEmpty()) {
1720 				vnode->advisory_locking = NULL;
1721 				nodeLocker.Unlock();
1722 				locker.Unlock();
1723 
1724 				// we've detached the locking from the vnode, so we can
1725 				// safely delete it
1726 				delete locking;
1727 			} else {
1728 				// the locking is in use again
1729 				nodeLocker.Unlock();
1730 				locker.Unlock();
1731 				release_sem_etc(locking->lock, 1, B_DO_NOT_RESCHEDULE);
1732 			}
1733 		}
1734 	}
1735 
1736 	return B_OK;
1737 }
1738 
1739 
1740 /*!	Acquires an advisory lock for the \a vnode. If \a wait is \c true, it
1741 	will wait for the lock to become available if there are any collisions
1742 	(if \a wait is \c false, it instead returns B_WOULD_BLOCK for flock()
	locks, or B_PERMISSION_DENIED for POSIX locks).
1743 
1744 	If \a descriptor is NULL, POSIX semantics are used for this lock. Otherwise,
1745 	BSD flock() semantics are used, that is, all children can unlock the file
1746 	in question (we even allow parents to remove the lock, though that
1747 	seems to be in line with what the BSDs are doing).
1748 */
1749 static status_t
1750 acquire_advisory_lock(struct vnode* vnode, io_context* context,
1751 	struct file_descriptor* descriptor, struct flock* flock, bool wait)
1752 {
1753 	FUNCTION(("acquire_advisory_lock(vnode = %p, flock = %p, wait = %s)\n",
1754 		vnode, flock, wait ? "yes" : "no"));
1757 
1758 	bool shared = flock->l_type == F_RDLCK;
1759 	void* boundTo = descriptor != NULL ? (void*)descriptor : (void*)context;
1760 	status_t status = B_OK;
1761 
1762 	// TODO: do deadlock detection!
1763 
1764 	struct advisory_locking* locking;
1765 
1766 	while (true) {
1767 		// if this vnode has an advisory_locking structure attached,
1768 		// lock that one and search for any colliding file lock
1769 		status = create_advisory_locking(vnode);
1770 		if (status != B_OK)
1771 			return status;
1772 
1773 		locking = vnode->advisory_locking;
1774 		team_id team = team_get_current_team_id();
1775 		sem_id waitForLock = -1;
1776 
1777 		// test for collisions
1778 		LockList::Iterator iterator = locking->locks.GetIterator();
1779 		while (iterator.HasNext()) {
1780 			struct advisory_lock* lock = iterator.Next();
1781 
1782 			// TODO: locks from the same team might be joinable!
1783 			if ((lock->team != team || lock->bound_to != boundTo)
1784 					&& advisory_lock_intersects(lock, flock)) {
1785 				// locks do overlap
1786 				if (!shared || !lock->shared) {
1787 					// we need to wait
1788 					waitForLock = locking->wait_sem;
1789 					break;
1790 				}
1791 			}
1792 		}
1793 
1794 		if (waitForLock < 0)
1795 			break;
1796 
1797 		// We need to wait. Do that or fail now, if we've been asked not to.
1798 
1799 		if (!wait) {
1800 			put_advisory_locking(locking);
1801 			return descriptor != NULL ? B_WOULD_BLOCK : B_PERMISSION_DENIED;
1802 		}
1803 
1804 		status = switch_sem_etc(locking->lock, waitForLock, 1,
1805 			B_CAN_INTERRUPT, 0);
1806 		if (status != B_OK && status != B_BAD_SEM_ID)
1807 			return status;
1808 
1809 		// We have been notified, but we need to re-lock the locking object. So
1810 		// go another round...
1811 	}
1812 
1813 	// install new lock
1814 
1815 	struct advisory_lock* lock = (struct advisory_lock*)malloc(
1816 		sizeof(struct advisory_lock));
1817 	if (lock == NULL) {
1818 		put_advisory_locking(locking);
1819 		return B_NO_MEMORY;
1820 	}
1821 
1822 	lock->bound_to = boundTo;
1823 	lock->team = team_get_current_team_id();
1824 	lock->session = thread_get_current_thread()->team->session_id;
1825 	// values must already be normalized when getting here
1826 	lock->start = flock->l_start;
1827 	lock->end = flock->l_start - 1 + flock->l_len;
1828 	lock->shared = shared;
1829 
1830 	locking->locks.Add(lock);
1831 	put_advisory_locking(locking);
1832 
1833 	return status;
1834 }
1835 
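/* Sketch of a non-blocking POSIX write lock over a whole file, roughly as
   a fcntl(F_SETLK) caller would set it up ("vnode" and "context" are
   hypothetical; the flock is assumed to have been normalized, see
   normalize_flock() below):

	struct flock flock = {};
	flock.l_type = F_WRLCK;
	flock.l_whence = SEEK_SET;
	flock.l_start = 0;
	flock.l_len = 0;	// 0 means "to end of file" (normalized to OFF_MAX)

	status_t status = acquire_advisory_lock(vnode, context, NULL, &flock,
		false);
		// descriptor == NULL selects POSIX semantics; with wait == false a
		// collision fails with B_PERMISSION_DENIED
*/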
1836 
1837 /*!	Normalizes the \a flock structure to make it easier to compare the
1838 	structure with others. The l_start and l_len fields are set to absolute
1839 	values according to the l_whence field.
1840 */
1841 static status_t
1842 normalize_flock(struct file_descriptor* descriptor, struct flock* flock)
1843 {
1844 	switch (flock->l_whence) {
1845 		case SEEK_SET:
1846 			break;
1847 		case SEEK_CUR:
1848 			flock->l_start += descriptor->pos;
1849 			break;
1850 		case SEEK_END:
1851 		{
1852 			struct vnode* vnode = descriptor->u.vnode;
1853 			struct stat stat;
1854 			status_t status;
1855 
1856 			if (!HAS_FS_CALL(vnode, read_stat))
1857 				return B_UNSUPPORTED;
1858 
1859 			status = FS_CALL(vnode, read_stat, &stat);
1860 			if (status != B_OK)
1861 				return status;
1862 
1863 			flock->l_start += stat.st_size;
1864 			break;
1865 		}
1866 		default:
1867 			return B_BAD_VALUE;
1868 	}
1869 
1870 	if (flock->l_start < 0)
1871 		flock->l_start = 0;
1872 	if (flock->l_len == 0)
1873 		flock->l_len = OFF_MAX;
1874 
1875 	// don't let the offset and length overflow
1876 	if (flock->l_start > 0 && OFF_MAX - flock->l_start < flock->l_len)
1877 		flock->l_len = OFF_MAX - flock->l_start;
1878 
1879 	if (flock->l_len < 0) {
1880 		// a negative length reverses the region
1881 		flock->l_start += flock->l_len;
1882 		flock->l_len = -flock->l_len;
1883 	}
1884 
1885 	return B_OK;
1886 }
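
/*	Worked example (hypothetical values): assuming a file size of 1000 bytes
	and descriptor->pos == 100, normalize_flock() transforms lock requests
	as follows:

		l_whence  l_start  l_len  ->  l_start    l_len
		SEEK_SET       50     10           50       10
		SEEK_CUR       50     10          150       10
		SEEK_END      -10     10          990       10
		SEEK_SET        0      0            0  OFF_MAX  (lock up to "EOF")
		SEEK_SET      100    -50           50       50  (negative length
		                                                 reverses the region)
*/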
1887 
1888 
1889 static void
1890 replace_vnode_if_disconnected(struct fs_mount* mount,
1891 	struct vnode* vnodeToDisconnect, struct vnode*& vnode,
1892 	struct vnode* fallBack, bool lockRootLock)
1893 {
1894 	struct vnode* givenVnode = vnode;
1895 	bool vnodeReplaced = false;
1896 
1897 	ReadLocker vnodeReadLocker(sVnodeLock);
1898 
1899 	if (lockRootLock)
1900 		mutex_lock(&sIOContextRootLock);
1901 
1902 	while (vnode != NULL && vnode->mount == mount
1903 		&& (vnodeToDisconnect == NULL || vnodeToDisconnect == vnode)) {
1904 		if (vnode->covers != NULL) {
1905 			// redirect the vnode to the covered vnode
1906 			vnode = vnode->covers;
1907 		} else
1908 			vnode = fallBack;
1909 
1910 		vnodeReplaced = true;
1911 	}
1912 
1913 	// If we've replaced the node, grab a reference for the new one.
1914 	if (vnodeReplaced && vnode != NULL)
1915 		inc_vnode_ref_count(vnode);
1916 
1917 	if (lockRootLock)
1918 		mutex_unlock(&sIOContextRootLock);
1919 
1920 	vnodeReadLocker.Unlock();
1921 
1922 	if (vnodeReplaced)
1923 		put_vnode(givenVnode);
1924 }
1925 
1926 
1927 /*!	Disconnects all file descriptors that are associated with the
1928 	\a vnodeToDisconnect, or if this is NULL, all vnodes of the specified
1929 	\a mount object.
1930 
1931 	Note that after you've called this function, there might still be
1932 	ongoing accesses - those already in progress won't be interrupted.
1933 	However, any subsequent access will fail.
1934 
1935 	This is not a cheap function and should be used with care and rarely.
1936 	TODO: there is currently no means to stop a blocking read/write!
1937 */
1938 static void
1939 disconnect_mount_or_vnode_fds(struct fs_mount* mount,
1940 	struct vnode* vnodeToDisconnect)
1941 {
1942 	// iterate over all teams and peek into their file descriptors
1943 	TeamListIterator teamIterator;
1944 	while (Team* team = teamIterator.Next()) {
1945 		BReference<Team> teamReference(team, true);
1946 		TeamLocker teamLocker(team);
1947 
1948 		// lock the I/O context
1949 		io_context* context = team->io_context;
1950 		if (context == NULL)
1951 			continue;
1952 		MutexLocker contextLocker(context->io_mutex);
1953 
1954 		teamLocker.Unlock();
1955 
1956 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->root,
1957 			sRoot, true);
1958 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->cwd,
1959 			sRoot, false);
1960 
1961 		for (uint32 i = 0; i < context->table_size; i++) {
1962 			struct file_descriptor* descriptor = context->fds[i];
1963 			if (descriptor == NULL || (descriptor->open_mode & O_DISCONNECTED) != 0)
1964 				continue;
1965 
1966 			inc_fd_ref_count(descriptor);
1967 
1968 			// if this descriptor points at this mount, we
1969 			// need to disconnect it to be able to unmount
1970 			struct vnode* vnode = fd_vnode(descriptor);
1971 			if (vnodeToDisconnect != NULL) {
1972 				if (vnode == vnodeToDisconnect)
1973 					disconnect_fd(descriptor);
1974 			} else if ((vnode != NULL && vnode->mount == mount)
1975 				|| (vnode == NULL && descriptor->u.mount == mount))
1976 				disconnect_fd(descriptor);
1977 
1978 			put_fd(descriptor);
1979 		}
1980 	}
1981 }
1982 
1983 
1984 /*!	\brief Gets the root node of the current IO context.
1985 	If \a kernel is \c true, the kernel IO context will be used.
1986 	The caller obtains a reference to the returned node.
1987 */
1988 struct vnode*
1989 get_root_vnode(bool kernel)
1990 {
1991 	if (!kernel) {
1992 		// Get the root of the current team's io context
1993 		struct io_context* context = get_current_io_context(kernel);
1994 
1995 		mutex_lock(&sIOContextRootLock);
1996 
1997 		struct vnode* root = context->root;
1998 		if (root != NULL)
1999 			inc_vnode_ref_count(root);
2000 
2001 		mutex_unlock(&sIOContextRootLock);
2002 
2003 		if (root != NULL)
2004 			return root;
2005 
2006 		// That should never happen.
2007 		dprintf("get_root_vnode(): IO context for team %" B_PRId32 " doesn't "
2008 			"have a root\n", team_get_current_team_id());
2009 	}
2010 
2011 	inc_vnode_ref_count(sRoot);
2012 	return sRoot;
2013 }
2014 
2015 
2016 /*!	\brief Gets the directory path and leaf name for a given path.
2017 
2018 	The supplied \a path is transformed to refer to the directory part of
2019 	the entry identified by the original path, and the leaf name of the
2020 	original entry is written into the buffer \a filename.
2021 	Neither the returned path nor the leaf name can be expected to be
2022 	canonical.
2023 
2024 	\param path The path to be analyzed. Must be able to store at least one
2025 		   additional character.
2026 	\param filename The buffer into which the leaf name will be written.
2027 		   Must be of size B_FILE_NAME_LENGTH at least.
2028 	\return \c B_OK, if everything went fine, \c B_NAME_TOO_LONG, if the leaf
2029 		   name is longer than \c B_FILE_NAME_LENGTH, or \c B_ENTRY_NOT_FOUND,
2030 		   if the given path name is empty.
2031 */
2032 static status_t
2033 get_dir_path_and_leaf(char* path, char* filename)
2034 {
2035 	if (*path == '\0')
2036 		return B_ENTRY_NOT_FOUND;
2037 
2038 	char* last = strrchr(path, '/');
2039 		// '/' is not allowed in file names!
2040 
2041 	FUNCTION(("get_dir_path_and_leaf(path = %s)\n", path));
2042 
2043 	if (last == NULL) {
2044 		// this path is single segment with no '/' in it
2045 		// ex. "foo"
2046 		if (strlcpy(filename, path, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2047 			return B_NAME_TOO_LONG;
2048 
2049 		strcpy(path, ".");
2050 	} else {
2051 		last++;
2052 		if (last[0] == '\0') {
2053 			// special case: the path ends in one or more '/' - remove them
2054 			while (*--last == '/' && last != path);
2055 			last[1] = '\0';
2056 
2057 			if (last == path && last[0] == '/') {
2058 				// This path points to the root of the file system
2059 				strcpy(filename, ".");
2060 				return B_OK;
2061 			}
2062 			for (; last != path && *(last - 1) != '/'; last--);
2063 				// rewind to the start of the leaf before the '/'
2064 		}
2065 
2066 		// normal leaf: replace the leaf portion of the path with a '.'
2067 		if (strlcpy(filename, last, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2068 			return B_NAME_TOO_LONG;
2069 
2070 		last[0] = '.';
2071 		last[1] = '\0';
2072 	}
2073 	return B_OK;
2074 }
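
/*	Illustrative transformations performed by get_dir_path_and_leaf()
	(hypothetical paths):

		input path  ->  path      filename
		"foo"           "."       "foo"
		"/a/b/c"        "/a/b/."  "c"
		"/a/b/"         "/a/."    "b"
		"/"             "/"       "."
*/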
2075 
2076 
2077 static status_t
2078 entry_ref_to_vnode(dev_t mountID, ino_t directoryID, const char* name,
2079 	bool traverse, bool kernel, struct vnode** _vnode)
2080 {
2081 	char clonedName[B_FILE_NAME_LENGTH + 1];
2082 	if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2083 		return B_NAME_TOO_LONG;
2084 
2085 	// get the directory vnode and let vnode_path_to_vnode() do the rest
2086 	struct vnode* directory;
2087 
2088 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
2089 	if (status < 0)
2090 		return status;
2091 
2092 	return vnode_path_to_vnode(directory, clonedName, traverse, 0, kernel,
2093 		_vnode, NULL);
2094 }
2095 
2096 
2097 /*!	Looks up the entry with name \a name in the directory represented by \a dir
2098 	and returns the respective vnode.
2099 	On success a reference to the vnode is acquired for the caller.
2100 */
2101 static status_t
2102 lookup_dir_entry(struct vnode* dir, const char* name, struct vnode** _vnode)
2103 {
2104 	ino_t id;
2105 	bool missing;
2106 
2107 	if (dir->mount->entry_cache.Lookup(dir->id, name, id, missing)) {
2108 		return missing ? B_ENTRY_NOT_FOUND
2109 			: get_vnode(dir->device, id, _vnode, true, false);
2110 	}
2111 
2112 	status_t status = FS_CALL(dir, lookup, name, &id);
2113 	if (status != B_OK)
2114 		return status;
2115 
2116 	// The lookup() hook calls get_vnode() or publish_vnode(), so we do already
2117 	// have a reference and just need to look the node up.
2118 	rw_lock_read_lock(&sVnodeLock);
2119 	*_vnode = lookup_vnode(dir->device, id);
2120 	rw_lock_read_unlock(&sVnodeLock);
2121 
2122 	if (*_vnode == NULL) {
2123 		panic("lookup_dir_entry(): could not lookup vnode (mountid 0x%" B_PRIx32
2124 			" vnid 0x%" B_PRIx64 ")\n", dir->device, id);
2125 		return B_ENTRY_NOT_FOUND;
2126 	}
2127 
2128 //	ktrace_printf("lookup_dir_entry(): dir: %p (%ld, %lld), name: \"%s\" -> "
2129 //		"%p (%ld, %lld)", dir, dir->mount->id, dir->id, name, *_vnode,
2130 //		(*_vnode)->mount->id, (*_vnode)->id);
2131 
2132 	return B_OK;
2133 }
2134 
2135 
2136 /*!	Returns the vnode for the relative path starting at the specified \a vnode.
2137 	\a path must not be NULL.
2138 	If it returns successfully, \a path contains the name of the last path
2139 	component. This function clobbers the buffer pointed to by \a path only
2140 	if it contains more than one component.
2141 	Note that this reduces the ref_count of the starting \a vnode, whether
2142 	it succeeds or not!
2143 */
2144 static status_t
2145 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2146 	int count, struct io_context* ioContext, struct vnode** _vnode,
2147 	ino_t* _parentID)
2148 {
2149 	status_t status = B_OK;
2150 	ino_t lastParentID = vnode->id;
2151 
2152 	FUNCTION(("vnode_path_to_vnode(vnode = %p, path = %s)\n", vnode, path));
2153 
2154 	if (path == NULL) {
2155 		put_vnode(vnode);
2156 		return B_BAD_VALUE;
2157 	}
2158 
2159 	if (*path == '\0') {
2160 		put_vnode(vnode);
2161 		return B_ENTRY_NOT_FOUND;
2162 	}
2163 
2164 	while (true) {
2165 		struct vnode* nextVnode;
2166 		char* nextPath;
2167 
2168 		TRACE(("vnode_path_to_vnode: top of loop. p = %p, p = '%s'\n", path,
2169 			path));
2170 
2171 		// done?
2172 		if (path[0] == '\0')
2173 			break;
2174 
2175 		// walk to find the next path component ("path" will point to a single
2176 		// path component), and filter out multiple slashes
2177 		for (nextPath = path + 1; *nextPath != '\0' && *nextPath != '/';
2178 				nextPath++);
2179 
2180 		if (*nextPath == '/') {
2181 			*nextPath = '\0';
2182 			do
2183 				nextPath++;
2184 			while (*nextPath == '/');
2185 		}
2186 
2187 		// If the '..' is at a covering vnode, move to the covered vnode,
2188 		// so we pass the '..' to the underlying file system.
2189 		// Also prevent escaping the root of the IO context.
2190 		if (strcmp("..", path) == 0) {
2191 			if (vnode == ioContext->root) {
2192 				// Attempted prison break! Keep it contained.
2193 				path = nextPath;
2194 				continue;
2195 			}
2196 
2197 			if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2198 				nextVnode = coveredVnode;
2199 				put_vnode(vnode);
2200 				vnode = nextVnode;
2201 			}
2202 		}
2203 
2204 		// check if vnode is really a directory
2205 		if (status == B_OK && !S_ISDIR(vnode->Type()))
2206 			status = B_NOT_A_DIRECTORY;
2207 
2208 		// Check if we have the right to search the current directory vnode.
2209 		// If a file system doesn't have the access() function, we assume that
2210 		// searching a directory is always allowed
2211 		if (status == B_OK && HAS_FS_CALL(vnode, access))
2212 			status = FS_CALL(vnode, access, X_OK);
2213 
2214 		// Tell the filesystem to get the vnode of this path component (if we
2215 		// got the permission from the call above)
2216 		if (status == B_OK)
2217 			status = lookup_dir_entry(vnode, path, &nextVnode);
2218 
2219 		if (status != B_OK) {
2220 			put_vnode(vnode);
2221 			return status;
2222 		}
2223 
2224 		// If the new node is a symbolic link, resolve it (if we've been told
2225 		// to do it)
2226 		if (S_ISLNK(nextVnode->Type())
2227 			&& (traverseLeafLink || nextPath[0] != '\0')) {
2228 			size_t bufferSize;
2229 			char* buffer;
2230 
2231 			TRACE(("traverse link\n"));
2232 
2233 			// it's not exactly nice style using goto in this way, but hey,
2234 			// it works :-/
2235 			if (count + 1 > B_MAX_SYMLINKS) {
2236 				status = B_LINK_LIMIT;
2237 				goto resolve_link_error;
2238 			}
2239 
2240 			bufferSize = B_PATH_NAME_LENGTH;
2241 			buffer = (char*)object_cache_alloc(sPathNameCache, 0);
2242 			if (buffer == NULL) {
2243 				status = B_NO_MEMORY;
2244 				goto resolve_link_error;
2245 			}
2246 
2247 			if (HAS_FS_CALL(nextVnode, read_symlink)) {
2248 				bufferSize--;
2249 				status = FS_CALL(nextVnode, read_symlink, buffer, &bufferSize);
2250 				// null-terminate
2251 				if (status >= 0)
2252 					buffer[bufferSize] = '\0';
2253 			} else
2254 				status = B_BAD_VALUE;
2255 
2256 			if (status != B_OK) {
2257 				free(buffer);
2258 
2259 		resolve_link_error:
2260 				put_vnode(vnode);
2261 				put_vnode(nextVnode);
2262 
2263 				return status;
2264 			}
2265 			put_vnode(nextVnode);
2266 
2267 			// Check if we start from the root directory or the current
2268 			// directory ("vnode" still points to that one).
2269 			// Cut off all leading slashes if it's the root directory
2270 			path = buffer;
2271 			bool absoluteSymlink = false;
2272 			if (path[0] == '/') {
2273 				// we don't need the old directory anymore
2274 				put_vnode(vnode);
2275 
2276 				while (*++path == '/')
2277 					;
2278 
2279 				mutex_lock(&sIOContextRootLock);
2280 				vnode = ioContext->root;
2281 				inc_vnode_ref_count(vnode);
2282 				mutex_unlock(&sIOContextRootLock);
2283 
2284 				absoluteSymlink = true;
2285 			}
2286 
2287 			inc_vnode_ref_count(vnode);
2288 				// balance the next recursion - it will decrement the
2289 				// ref_count of the vnode, whether it succeeds or not
2290 
2291 			if (absoluteSymlink && *path == '\0') {
2292 				// symlink was just "/"
2293 				nextVnode = vnode;
2294 			} else {
2295 				status = vnode_path_to_vnode(vnode, path, true, count + 1,
2296 					ioContext, &nextVnode, &lastParentID);
2297 			}
2298 
2299 			object_cache_free(sPathNameCache, buffer, 0);
2300 
2301 			if (status != B_OK) {
2302 				put_vnode(vnode);
2303 				return status;
2304 			}
2305 		} else
2306 			lastParentID = vnode->id;
2307 
2308 		// decrease the ref count on the old dir we just looked up into
2309 		put_vnode(vnode);
2310 
2311 		path = nextPath;
2312 		vnode = nextVnode;
2313 
2314 		// see if we hit a covered node
2315 		if (Vnode* coveringNode = get_covering_vnode(vnode)) {
2316 			put_vnode(vnode);
2317 			vnode = coveringNode;
2318 		}
2319 	}
2320 
2321 	*_vnode = vnode;
2322 	if (_parentID)
2323 		*_parentID = lastParentID;
2324 
2325 	return B_OK;
2326 }
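
/*	A self-contained sketch (not part of this file) of the component-splitting
	logic used in the loop above: each iteration isolates one path component
	in place and skips runs of slashes, so "a//b///c" yields "a", "b", "c".
*/
#if 0
#include <stdio.h>

static void
split_components(char* path)
{
	while (path[0] != '\0') {
		// find the end of the current component
		char* nextPath = path + 1;
		while (*nextPath != '\0' && *nextPath != '/')
			nextPath++;

		// terminate the component and swallow any following slashes
		if (*nextPath == '/') {
			*nextPath = '\0';
			do
				nextPath++;
			while (*nextPath == '/');
		}

		printf("component: %s\n", path);
		path = nextPath;
	}
}
#endif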
2327 
2328 
2329 static status_t
2330 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2331 	int count, bool kernel, struct vnode** _vnode, ino_t* _parentID)
2332 {
2333 	return vnode_path_to_vnode(vnode, path, traverseLeafLink, count,
2334 		get_current_io_context(kernel), _vnode, _parentID);
2335 }
2336 
2337 
2338 static status_t
2339 path_to_vnode(char* path, bool traverseLink, struct vnode** _vnode,
2340 	ino_t* _parentID, bool kernel)
2341 {
2342 	struct vnode* start = NULL;
2343 
2344 	FUNCTION(("path_to_vnode(path = \"%s\")\n", path));
2345 
2346 	if (!path)
2347 		return B_BAD_VALUE;
2348 
2349 	if (*path == '\0')
2350 		return B_ENTRY_NOT_FOUND;
2351 
2352 	// figure out if we need to start at root or at cwd
2353 	if (*path == '/') {
2354 		if (sRoot == NULL) {
2355 			// we're a bit early, aren't we?
2356 			return B_ERROR;
2357 		}
2358 
2359 		while (*++path == '/')
2360 			;
2361 		start = get_root_vnode(kernel);
2362 
2363 		if (*path == '\0') {
2364 			*_vnode = start;
2365 			return B_OK;
2366 		}
2367 
2368 	} else {
2369 		struct io_context* context = get_current_io_context(kernel);
2370 
2371 		mutex_lock(&context->io_mutex);
2372 		start = context->cwd;
2373 		if (start != NULL)
2374 			inc_vnode_ref_count(start);
2375 		mutex_unlock(&context->io_mutex);
2376 
2377 		if (start == NULL)
2378 			return B_ERROR;
2379 	}
2380 
2381 	return vnode_path_to_vnode(start, path, traverseLink, 0, kernel, _vnode,
2382 		_parentID);
2383 }
2384 
2385 
2386 /*!	Returns the vnode of the next-to-last segment of the path, and returns
2387 	the last portion in \a filename.
2388 	The path buffer must be able to store at least one additional character.
2389 */
2390 static status_t
2391 path_to_dir_vnode(char* path, struct vnode** _vnode, char* filename,
2392 	bool kernel)
2393 {
2394 	status_t status = get_dir_path_and_leaf(path, filename);
2395 	if (status != B_OK)
2396 		return status;
2397 
2398 	return path_to_vnode(path, true, _vnode, NULL, kernel);
2399 }
2400 
2401 
2402 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2403 		   to by a FD + path pair.
2404 
2405 	\a path must be given in either case. \a fd might be omitted, in which
2406 	case \a path is either an absolute path or one relative to the current
2407 	directory. If both are supplied and \a path is relative, it is reckoned off
2408 	of the directory referred to by \a fd. If \a path is absolute, \a fd is
2409 	ignored.
2410 
2411 	The caller has the responsibility to call put_vnode() on the returned
2412 	directory vnode.
2413 
2414 	\param fd The FD. May be < 0.
2415 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2416 	       is modified by this function. It must have at least room for a
2417 	       string one character longer than the path it contains.
2418 	\param _vnode A pointer to a variable the directory vnode shall be written
2419 		   into.
2420 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2421 		   the leaf name of the specified entry will be written.
2422 	\param kernel \c true, if invoked from inside the kernel, \c false if
2423 		   invoked from userland.
2424 	\return \c B_OK, if everything went fine, another error code otherwise.
2425 */
2426 static status_t
2427 fd_and_path_to_dir_vnode(int fd, char* path, struct vnode** _vnode,
2428 	char* filename, bool kernel)
2429 {
2430 	if (!path)
2431 		return B_BAD_VALUE;
2432 	if (*path == '\0')
2433 		return B_ENTRY_NOT_FOUND;
2434 	if (fd < 0)
2435 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2436 
2437 	status_t status = get_dir_path_and_leaf(path, filename);
2438 	if (status != B_OK)
2439 		return status;
2440 
2441 	return fd_and_path_to_vnode(fd, path, true, _vnode, NULL, kernel);
2442 }
2443 
2444 
2445 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2446 		   to by a vnode + path pair.
2447 
2448 	\a path must be given in either case. \a vnode might be omitted, in which
2449 	case \a path is either an absolute path or one relative to the current
2450 	directory. If both are supplied and \a path is relative, it is reckoned off
2451 	of the directory referred to by \a vnode. If \a path is absolute, \a vnode is
2452 	ignored.
2453 
2454 	The caller has the responsibility to call put_vnode() on the returned
2455 	directory vnode.
2456 
2457 	\param vnode The vnode. May be \c NULL.
2458 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2459 	       is modified by this function. It must have at least room for a
2460 	       string one character longer than the path it contains.
2461 	\param _vnode A pointer to a variable the directory vnode shall be written
2462 		   into.
2463 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2464 		   the leaf name of the specified entry will be written.
2465 	\param kernel \c true, if invoked from inside the kernel, \c false if
2466 		   invoked from userland.
2467 	\return \c B_OK, if everything went fine, another error code otherwise.
2468 */
2469 static status_t
2470 vnode_and_path_to_dir_vnode(struct vnode* vnode, char* path,
2471 	struct vnode** _vnode, char* filename, bool kernel)
2472 {
2473 	if (!path)
2474 		return B_BAD_VALUE;
2475 	if (*path == '\0')
2476 		return B_ENTRY_NOT_FOUND;
2477 	if (vnode == NULL || path[0] == '/')
2478 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2479 
2480 	status_t status = get_dir_path_and_leaf(path, filename);
2481 	if (status != B_OK)
2482 		return status;
2483 
2484 	inc_vnode_ref_count(vnode);
2485 		// vnode_path_to_vnode() always decrements the ref count
2486 
2487 	return vnode_path_to_vnode(vnode, path, true, 0, kernel, _vnode, NULL);
2488 }
2489 
2490 
2491 /*! Returns a vnode's name in the d_name field of a supplied dirent buffer.
2492 */
2493 static status_t
2494 get_vnode_name(struct vnode* vnode, struct vnode* parent, struct dirent* buffer,
2495 	size_t bufferSize, struct io_context* ioContext)
2496 {
2497 	if (bufferSize < sizeof(struct dirent))
2498 		return B_BAD_VALUE;
2499 
2500 	// See if the vnode is covering another vnode and move to the covered
2501 	// vnode so we get the underlying file system
2502 	VNodePutter vnodePutter;
2503 	if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2504 		vnode = coveredVnode;
2505 		vnodePutter.SetTo(vnode);
2506 	}
2507 
2508 	if (HAS_FS_CALL(vnode, get_vnode_name)) {
2509 		// The FS supports getting the name of a vnode.
2510 		if (FS_CALL(vnode, get_vnode_name, buffer->d_name,
2511 			(char*)buffer + bufferSize - buffer->d_name) == B_OK)
2512 			return B_OK;
2513 	}
2514 
2515 	// The FS doesn't support getting the name of a vnode. So we search the
2516 	// parent directory for the vnode, if the caller let us.
2517 
2518 	if (parent == NULL || !HAS_FS_CALL(parent, read_dir))
2519 		return B_UNSUPPORTED;
2520 
2521 	void* cookie;
2522 
2523 	status_t status = FS_CALL(parent, open_dir, &cookie);
2524 	if (status >= B_OK) {
2525 		while (true) {
2526 			uint32 num = 1;
2527 			// We use the FS hook directly instead of dir_read(), since we don't
2528 			// want the entries to be fixed up. We have already resolved vnode to
2529 			// the covered node.
2530 			status = FS_CALL(parent, read_dir, cookie, buffer, bufferSize,
2531 				&num);
2532 			if (status != B_OK)
2533 				break;
2534 			if (num == 0) {
2535 				status = B_ENTRY_NOT_FOUND;
2536 				break;
2537 			}
2538 
2539 			if (vnode->id == buffer->d_ino) {
2540 				// found correct entry!
2541 				break;
2542 			}
2543 		}
2544 
2545 		FS_CALL(parent, close_dir, cookie);
2546 		FS_CALL(parent, free_dir_cookie, cookie);
2547 	}
2548 	return status;
2549 }
2550 
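
/*	The fallback above is the kernel-side analogue of what a userland program
	would do with readdir() to name a file it only knows by inode. A
	hypothetical sketch (not part of this file):
*/
#if 0
#include <dirent.h>
#include <string.h>

static status_t
name_by_inode(const char* directoryPath, ino_t id, char* name, size_t nameSize)
{
	DIR* dir = opendir(directoryPath);
	if (dir == NULL)
		return B_ENTRY_NOT_FOUND;

	status_t status = B_ENTRY_NOT_FOUND;
	while (struct dirent* entry = readdir(dir)) {
		if (entry->d_ino == id) {
			// found the entry with the matching inode number
			strlcpy(name, entry->d_name, nameSize);
			status = B_OK;
			break;
		}
	}

	closedir(dir);
	return status;
}
#endif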
2551 
2552 static status_t
2553 get_vnode_name(struct vnode* vnode, struct vnode* parent, char* name,
2554 	size_t nameSize, bool kernel)
2555 {
2556 	char buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2557 	struct dirent* dirent = (struct dirent*)buffer;
2558 
2559 	status_t status = get_vnode_name(vnode, parent, dirent, sizeof(buffer),
2560 		get_current_io_context(kernel));
2561 	if (status != B_OK)
2562 		return status;
2563 
2564 	if (strlcpy(name, dirent->d_name, nameSize) >= nameSize)
2565 		return B_BUFFER_OVERFLOW;
2566 
2567 	return B_OK;
2568 }
2569 
2570 
2571 /*!	Gets the full path to a given directory vnode.
2572 	It uses the fs_get_vnode_name() call to get the name of a vnode; if a
2573 	file system doesn't support this call, it will fall back to iterating
2574 	through the parent directory to get the name of the child.
2575 
2576 	To protect against circular loops, it supports a maximum tree depth
2577 	of 256 levels.
2578 
2579 	Note that the path may not be correct the time this function returns!
2580 	Note that the path may no longer be correct by the time this function
2581 	returns! It doesn't use any locking to ensure the returned path is
2582 	correct, as paths aren't safe anyway: they can change at any time.
2583 	It might be a good idea, though, to check if the returned path exists
2584 	in the calling function (it's not done here because of efficiency)
2585 	in the calling function (it's not done here for efficiency reasons).
2586 static status_t
2587 dir_vnode_to_path(struct vnode* vnode, char* buffer, size_t bufferSize,
2588 	bool kernel)
2589 {
2590 	FUNCTION(("dir_vnode_to_path(%p, %p, %lu)\n", vnode, buffer, bufferSize));
2591 
2592 	if (vnode == NULL || buffer == NULL || bufferSize == 0)
2593 		return B_BAD_VALUE;
2594 
2595 	if (!S_ISDIR(vnode->Type()))
2596 		return B_NOT_A_DIRECTORY;
2597 
2598 	char* path = buffer;
2599 	int32 insert = bufferSize;
2600 	int32 maxLevel = 256;
2601 	int32 length;
2602 	status_t status = B_OK;
2603 	struct io_context* ioContext = get_current_io_context(kernel);
2604 
2605 	// we don't use get_vnode() here because this call is more
2606 	// efficient and does all we need from get_vnode()
2607 	inc_vnode_ref_count(vnode);
2608 
2609 	path[--insert] = '\0';
2610 		// the path is filled right to left
2611 
2612 	while (true) {
2613 		// If the node is the context's root, bail out. Otherwise resolve mount
2614 		// points.
2615 		if (vnode == ioContext->root)
2616 			break;
2617 
2618 		if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2619 			put_vnode(vnode);
2620 			vnode = coveredVnode;
2621 		}
2622 
2623 		// lookup the parent vnode
2624 		struct vnode* parentVnode;
2625 		status = lookup_dir_entry(vnode, "..", &parentVnode);
2626 		if (status != B_OK)
2627 			goto out;
2628 
2629 		if (parentVnode == vnode) {
2630 			// The caller apparently got their hands on a node outside of their
2631 			// context's root. Now we've hit the global root.
2632 			put_vnode(parentVnode);
2633 			break;
2634 		}
2635 
2636 		// get the node's name
2637 		char nameBuffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2638 			// also used for fs_read_dir()
2639 		char* name = &((struct dirent*)nameBuffer)->d_name[0];
2640 		status = get_vnode_name(vnode, parentVnode, (struct dirent*)nameBuffer,
2641 			sizeof(nameBuffer), ioContext);
2642 
2643 		// release the current vnode, we only need its parent from now on
2644 		put_vnode(vnode);
2645 		vnode = parentVnode;
2646 
2647 		if (status != B_OK)
2648 			goto out;
2649 
2650 		// TODO: add an explicit check for loops in about 10 levels to do
2651 		// real loop detection
2652 
2653 		// don't go deeper than 'maxLevel' to protect against circular loops
2654 		if (maxLevel-- < 0) {
2655 			status = B_LINK_LIMIT;
2656 			goto out;
2657 		}
2658 
2659 		// add the name in front of the current path
2660 		name[B_FILE_NAME_LENGTH - 1] = '\0';
2661 		length = strlen(name);
2662 		insert -= length;
2663 		if (insert <= 0) {
2664 			status = B_RESULT_NOT_REPRESENTABLE;
2665 			goto out;
2666 		}
2667 		memcpy(path + insert, name, length);
2668 		path[--insert] = '/';
2669 	}
2670 
2671 	// the root dir will result in an empty path: fix it
2672 	if (path[insert] == '\0')
2673 		path[--insert] = '/';
2674 
2675 	TRACE(("  path is: %s\n", path + insert));
2676 
2677 	// move the path to the start of the buffer
2678 	length = bufferSize - insert;
2679 	memmove(buffer, path + insert, length);
2680 
2681 out:
2682 	put_vnode(vnode);
2683 	return status;
2684 }
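
/*	Illustration of the right-to-left construction above: for a vnode at
	"/boot/home", the buffer first receives the terminating '\0', then each
	name with a preceding '/', and is finally moved to the front ('.' marks
	unused bytes):

		[..............\0] -> [........./home\0] -> [..../boot/home\0]
		-> "/boot/home"
*/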
2685 
2686 
2687 /*!	Checks the length of every path component, and adds a '.'
2688 	if the path ends in a slash.
2689 	The given path buffer must be able to store at least one
2690 	additional character.
2691 */
2692 static status_t
2693 check_path(char* to)
2694 {
2695 	int32 length = 0;
2696 
2697 	// check length of every path component
2698 
2699 	while (*to) {
2700 		char* begin;
2701 		if (*to == '/')
2702 			to++, length++;
2703 
2704 		begin = to;
2705 		while (*to != '/' && *to)
2706 			to++, length++;
2707 
2708 		if (to - begin > B_FILE_NAME_LENGTH)
2709 			return B_NAME_TOO_LONG;
2710 	}
2711 
2712 	if (length == 0)
2713 		return B_ENTRY_NOT_FOUND;
2714 
2715 	// complete path if there is a slash at the end
2716 
2717 	if (*(to - 1) == '/') {
2718 		if (length > B_PATH_NAME_LENGTH - 2)
2719 			return B_NAME_TOO_LONG;
2720 
2721 		to[0] = '.';
2722 		to[1] = '\0';
2723 	}
2724 
2725 	return B_OK;
2726 }
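
/*	Examples of check_path() behavior (hypothetical inputs): "a/b" passes
	unchanged; "a/b/" becomes "a/b/."; "" fails with B_ENTRY_NOT_FOUND; a
	single component longer than B_FILE_NAME_LENGTH bytes fails with
	B_NAME_TOO_LONG.
*/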
2727 
2728 
2729 static struct file_descriptor*
2730 get_fd_and_vnode(int fd, struct vnode** _vnode, bool kernel)
2731 {
2732 	struct file_descriptor* descriptor
2733 		= get_fd(get_current_io_context(kernel), fd);
2734 	if (descriptor == NULL)
2735 		return NULL;
2736 
2737 	struct vnode* vnode = fd_vnode(descriptor);
2738 	if (vnode == NULL) {
2739 		put_fd(descriptor);
2740 		return NULL;
2741 	}
2742 
2743 	// ToDo: when we can close a file descriptor at any point, investigate
2744 	//	if this is still valid to do (accessing the vnode without ref_count
2745 	//	or locking)
2746 	*_vnode = vnode;
2747 	return descriptor;
2748 }
2749 
2750 
2751 static struct vnode*
2752 get_vnode_from_fd(int fd, bool kernel)
2753 {
2754 	struct file_descriptor* descriptor;
2755 	struct vnode* vnode;
2756 
2757 	descriptor = get_fd(get_current_io_context(kernel), fd);
2758 	if (descriptor == NULL)
2759 		return NULL;
2760 
2761 	vnode = fd_vnode(descriptor);
2762 	if (vnode != NULL)
2763 		inc_vnode_ref_count(vnode);
2764 
2765 	put_fd(descriptor);
2766 	return vnode;
2767 }
2768 
2769 
2770 /*!	Gets the vnode from an FD + path combination. If \a fd is lower than zero,
2771 	only the path will be considered. In this case, the \a path must not be
2772 	NULL.
2773 	If \a fd is a valid file descriptor, \a path may be NULL for directories,
2774 	and should be NULL for files.
2775 */
2776 static status_t
2777 fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
2778 	struct vnode** _vnode, ino_t* _parentID, bool kernel)
2779 {
2780 	if (fd < 0 && !path)
2781 		return B_BAD_VALUE;
2782 
2783 	if (path != NULL && *path == '\0')
2784 		return B_ENTRY_NOT_FOUND;
2785 
2786 	if (fd < 0 || (path != NULL && path[0] == '/')) {
2787 		// no FD or absolute path
2788 		return path_to_vnode(path, traverseLeafLink, _vnode, _parentID, kernel);
2789 	}
2790 
2791 	// FD only, or FD + relative path
2792 	struct vnode* vnode = get_vnode_from_fd(fd, kernel);
2793 	if (vnode == NULL)
2794 		return B_FILE_ERROR;
2795 
2796 	if (path != NULL) {
2797 		return vnode_path_to_vnode(vnode, path, traverseLeafLink, 0, kernel,
2798 			_vnode, _parentID);
2799 	}
2800 
2801 	// there is no relative path to take into account
2802 
2803 	*_vnode = vnode;
2804 	if (_parentID)
2805 		*_parentID = -1;
2806 
2807 	return B_OK;
2808 }
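
/*	The FD + path convention above mirrors the POSIX *at() family of calls.
	A hypothetical userland analogue (not part of this file):
*/
#if 0
#include <fcntl.h>
#include <unistd.h>

static int
open_fd_and_path(int fd, const char* path)
{
	if (fd < 0 && path == NULL)
		return -1;	// B_BAD_VALUE in the kernel version

	if (fd < 0 || (path != NULL && path[0] == '/'))
		return open(path, O_RDONLY);	// no FD or absolute path

	if (path == NULL)
		return dup(fd);	// FD only, no relative path to take into account

	return openat(fd, path, O_RDONLY);	// FD + relative path
}
#endif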
2809 
2810 
2811 static int
2812 get_new_fd(int type, struct fs_mount* mount, struct vnode* vnode,
2813 	void* cookie, int openMode, bool kernel)
2814 {
2815 	struct file_descriptor* descriptor;
2816 	int fd;
2817 
2818 	// If the vnode is locked, we don't allow creating a new file/directory
2819 	// file_descriptor for it
2820 	if (vnode && vnode->mandatory_locked_by != NULL
2821 		&& (type == FDTYPE_FILE || type == FDTYPE_DIR))
2822 		return B_BUSY;
2823 
2824 	descriptor = alloc_fd();
2825 	if (!descriptor)
2826 		return B_NO_MEMORY;
2827 
2828 	if (vnode)
2829 		descriptor->u.vnode = vnode;
2830 	else
2831 		descriptor->u.mount = mount;
2832 	descriptor->cookie = cookie;
2833 
2834 	switch (type) {
2835 		// vnode types
2836 		case FDTYPE_FILE:
2837 			descriptor->ops = &sFileOps;
2838 			break;
2839 		case FDTYPE_DIR:
2840 			descriptor->ops = &sDirectoryOps;
2841 			break;
2842 		case FDTYPE_ATTR:
2843 			descriptor->ops = &sAttributeOps;
2844 			break;
2845 		case FDTYPE_ATTR_DIR:
2846 			descriptor->ops = &sAttributeDirectoryOps;
2847 			break;
2848 
2849 		// mount types
2850 		case FDTYPE_INDEX_DIR:
2851 			descriptor->ops = &sIndexDirectoryOps;
2852 			break;
2853 		case FDTYPE_QUERY:
2854 			descriptor->ops = &sQueryOps;
2855 			break;
2856 
2857 		default:
2858 			panic("get_new_fd() called with unknown type %d\n", type);
2859 			break;
2860 	}
2861 	descriptor->type = type;
2862 	descriptor->open_mode = openMode;
2863 
2864 	io_context* context = get_current_io_context(kernel);
2865 	fd = new_fd(context, descriptor);
2866 	if (fd < 0) {
2867 		descriptor->ops = NULL;
2868 		put_fd(descriptor);
2869 		return B_NO_MORE_FDS;
2870 	}
2871 
2872 	mutex_lock(&context->io_mutex);
2873 	fd_set_close_on_exec(context, fd, (openMode & O_CLOEXEC) != 0);
2874 	mutex_unlock(&context->io_mutex);
2875 
2876 	return fd;
2877 }
2878 
2879 
2880 /*!	In-place normalizes \a path. It's otherwise semantically equivalent to
2881 /*!	Normalizes \a path in place. It's otherwise semantically equivalent to
2882 	vfs_normalize_path(); see there for more documentation.
2883 static status_t
2884 normalize_path(char* path, size_t pathSize, bool traverseLink, bool kernel)
2885 {
2886 	VNodePutter dirPutter;
2887 	struct vnode* dir = NULL;
2888 	status_t error;
2889 
2890 	for (int i = 0; i < B_MAX_SYMLINKS; i++) {
2891 		// get dir vnode + leaf name
2892 		struct vnode* nextDir;
2893 		char leaf[B_FILE_NAME_LENGTH];
2894 		error = vnode_and_path_to_dir_vnode(dir, path, &nextDir, leaf, kernel);
2895 		if (error != B_OK)
2896 			return error;
2897 
2898 		dir = nextDir;
2899 		strcpy(path, leaf);
2900 		dirPutter.SetTo(dir);
2901 
2902 		// get file vnode, if we shall resolve links
2903 		bool fileExists = false;
2904 		struct vnode* fileVnode;
2905 		VNodePutter fileVnodePutter;
2906 		if (traverseLink) {
2907 			inc_vnode_ref_count(dir);
2908 			if (vnode_path_to_vnode(dir, path, false, 0, kernel, &fileVnode,
2909 					NULL) == B_OK) {
2910 				fileVnodePutter.SetTo(fileVnode);
2911 				fileExists = true;
2912 			}
2913 		}
2914 
2915 		if (!fileExists || !traverseLink || !S_ISLNK(fileVnode->Type())) {
2916 			// we're done -- construct the path
2917 			bool hasLeaf = true;
2918 			if (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0) {
2919 				// special cases "." and ".." -- get the dir, forget the leaf
2920 				inc_vnode_ref_count(dir);
2921 				error = vnode_path_to_vnode(dir, leaf, false, 0, kernel,
2922 					&nextDir, NULL);
2923 				if (error != B_OK)
2924 					return error;
2925 				dir = nextDir;
2926 				dirPutter.SetTo(dir);
2927 				hasLeaf = false;
2928 			}
2929 
2930 			// get the directory path
2931 			error = dir_vnode_to_path(dir, path, B_PATH_NAME_LENGTH, kernel);
2932 			if (error != B_OK)
2933 				return error;
2934 
2935 			// append the leaf name
2936 			if (hasLeaf) {
2937 				// insert a directory separator if this is not the file system
2938 				// root
2939 				if ((strcmp(path, "/") != 0
2940 					&& strlcat(path, "/", pathSize) >= pathSize)
2941 					|| strlcat(path, leaf, pathSize) >= pathSize) {
2942 					return B_NAME_TOO_LONG;
2943 				}
2944 			}
2945 
2946 			return B_OK;
2947 		}
2948 
2949 		// read link
2950 		if (HAS_FS_CALL(fileVnode, read_symlink)) {
2951 			size_t bufferSize = B_PATH_NAME_LENGTH - 1;
2952 			error = FS_CALL(fileVnode, read_symlink, path, &bufferSize);
2953 			if (error != B_OK)
2954 				return error;
2955 			path[bufferSize] = '\0';
2956 		} else
2957 			return B_BAD_VALUE;
2958 	}
2959 
2960 	return B_LINK_LIMIT;
2961 }
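
/*	Illustration (hypothetical layout): given a symlink /boot/home/desktop
	pointing to "Desktop", normalize_path() with traverseLink == true turns
	"/boot//home/./desktop" into "/boot/home/Desktop" - duplicate slashes
	and "." components are removed, and the leaf link is resolved.
*/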
2962 
2963 
2964 static status_t
2965 resolve_covered_parent(struct vnode* parent, dev_t* _device, ino_t* _node,
2966 	struct io_context* ioContext)
2967 {
2968 	// Make sure the IO context root is not bypassed.
2969 	if (parent == ioContext->root) {
2970 		*_device = parent->device;
2971 		*_node = parent->id;
2972 		return B_OK;
2973 	}
2974 
2975 	inc_vnode_ref_count(parent);
2976 		// vnode_path_to_vnode() puts the node
2977 
2978 	// ".." is guaranteed not to be clobbered by this call
2979 	struct vnode* vnode;
2980 	status_t status = vnode_path_to_vnode(parent, (char*)"..", false, 0,
2981 		ioContext, &vnode, NULL);
2982 	if (status == B_OK) {
2983 		*_device = vnode->device;
2984 		*_node = vnode->id;
2985 		put_vnode(vnode);
2986 	}
2987 
2988 	return status;
2989 }
2990 
2991 
2992 #ifdef ADD_DEBUGGER_COMMANDS
2993 
2994 
2995 static void
2996 _dump_advisory_locking(advisory_locking* locking)
2997 {
2998 	if (locking == NULL)
2999 		return;
3000 
3001 	kprintf("   lock:        %" B_PRId32, locking->lock);
3002 	kprintf("   wait_sem:    %" B_PRId32, locking->wait_sem);
3003 
3004 	int32 index = 0;
3005 	LockList::Iterator iterator = locking->locks.GetIterator();
3006 	while (iterator.HasNext()) {
3007 		struct advisory_lock* lock = iterator.Next();
3008 
3009 		kprintf("   [%2" B_PRId32 "] team:   %" B_PRId32 "\n", index++, lock->team);
3010 		kprintf("        start:  %" B_PRIdOFF "\n", lock->start);
3011 		kprintf("        end:    %" B_PRIdOFF "\n", lock->end);
3012 		kprintf("        shared? %s\n", lock->shared ? "yes" : "no");
3013 	}
3014 }
3015 
3016 
3017 static void
3018 _dump_mount(struct fs_mount* mount)
3019 {
3020 	kprintf("MOUNT: %p\n", mount);
3021 	kprintf(" id:            %" B_PRIdDEV "\n", mount->id);
3022 	kprintf(" device_name:   %s\n", mount->device_name);
3023 	kprintf(" root_vnode:    %p\n", mount->root_vnode);
3024 	kprintf(" covers:        %p\n", mount->root_vnode->covers);
3025 	kprintf(" partition:     %p\n", mount->partition);
3026 	kprintf(" lock:          %p\n", &mount->lock);
3027 	kprintf(" flags:        %s%s\n", mount->unmounting ? " unmounting" : "",
3028 		mount->owns_file_device ? " owns_file_device" : "");
3029 
3030 	fs_volume* volume = mount->volume;
3031 	while (volume != NULL) {
3032 		kprintf(" volume %p:\n", volume);
3033 		kprintf("  layer:            %" B_PRId32 "\n", volume->layer);
3034 		kprintf("  private_volume:   %p\n", volume->private_volume);
3035 		kprintf("  ops:              %p\n", volume->ops);
3036 		kprintf("  file_system:      %p\n", volume->file_system);
3037 		kprintf("  file_system_name: %s\n", volume->file_system_name);
3038 		volume = volume->super_volume;
3039 	}
3040 
3041 	set_debug_variable("_volume", (addr_t)mount->volume->private_volume);
3042 	set_debug_variable("_root", (addr_t)mount->root_vnode);
3043 	set_debug_variable("_covers", (addr_t)mount->root_vnode->covers);
3044 	set_debug_variable("_partition", (addr_t)mount->partition);
3045 }
3046 
3047 
3048 static bool
3049 debug_prepend_vnode_name_to_path(char* buffer, size_t& bufferSize,
3050 	const char* name)
3051 {
3052 	bool insertSlash = buffer[bufferSize] != '\0';
3053 	size_t nameLength = strlen(name);
3054 
3055 	if (bufferSize < nameLength + (insertSlash ? 1 : 0))
3056 		return false;
3057 
3058 	if (insertSlash)
3059 		buffer[--bufferSize] = '/';
3060 
3061 	bufferSize -= nameLength;
3062 	memcpy(buffer + bufferSize, name, nameLength);
3063 
3064 	return true;
3065 }
3066 
3067 
3068 static bool
3069 debug_prepend_vnode_id_to_path(char* buffer, size_t& bufferSize, dev_t devID,
3070 	ino_t nodeID)
3071 {
3072 	if (bufferSize == 0)
3073 		return false;
3074 
3075 	bool insertSlash = buffer[bufferSize] != '\0';
3076 	if (insertSlash)
3077 		buffer[--bufferSize] = '/';
3078 
3079 	size_t size = snprintf(buffer, bufferSize,
3080 		"<%" B_PRIdDEV ",%" B_PRIdINO ">", devID, nodeID);
3081 	if (size > bufferSize) {
3082 		if (insertSlash)
3083 			bufferSize++;
3084 		return false;
3085 	}
3086 
3087 	if (size < bufferSize)
3088 		memmove(buffer + bufferSize - size, buffer, size);
3089 
3090 	bufferSize -= size;
3091 	return true;
3092 }
3093 
3094 
3095 static char*
3096 debug_resolve_vnode_path(struct vnode* vnode, char* buffer, size_t bufferSize,
3097 	bool& _truncated)
3098 {
3099 	// null-terminate the path
3100 	buffer[--bufferSize] = '\0';
3101 
3102 	while (true) {
3103 		while (vnode->covers != NULL)
3104 			vnode = vnode->covers;
3105 
3106 		if (vnode == sRoot) {
3107 			_truncated = bufferSize == 0;
3108 			if (!_truncated)
3109 				buffer[--bufferSize] = '/';
3110 			return buffer + bufferSize;
3111 		}
3112 
3113 		// resolve the name
3114 		ino_t dirID;
3115 		const char* name = vnode->mount->entry_cache.DebugReverseLookup(
3116 			vnode->id, dirID);
3117 		if (name == NULL) {
3118 			// Failed to resolve the name -- prepend "<dev,node>/".
3119 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3120 				vnode->mount->id, vnode->id);
3121 			return buffer + bufferSize;
3122 		}
3123 
3124 		// prepend the name
3125 		if (!debug_prepend_vnode_name_to_path(buffer, bufferSize, name)) {
3126 			_truncated = true;
3127 			return buffer + bufferSize;
3128 		}
3129 
3130 		// resolve the directory node
3131 		struct vnode* nextVnode = lookup_vnode(vnode->mount->id, dirID);
3132 		if (nextVnode == NULL) {
3133 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3134 				vnode->mount->id, dirID);
3135 			return buffer + bufferSize;
3136 		}
3137 
3138 		vnode = nextVnode;
3139 	}
3140 }
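
/*	Illustration (hypothetical values): if the entry cache can name a vnode
	"home" but cannot resolve its parent directory, the path printed by the
	debugger comes out as "<2,1>/home" - components that cannot be named
	are replaced by "<dev,node>" placeholders.
*/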
3141 
3142 
3143 static void
3144 _dump_vnode(struct vnode* vnode, bool printPath)
3145 {
3146 	kprintf("VNODE: %p\n", vnode);
3147 	kprintf(" device:        %" B_PRIdDEV "\n", vnode->device);
3148 	kprintf(" id:            %" B_PRIdINO "\n", vnode->id);
3149 	kprintf(" ref_count:     %" B_PRId32 "\n", vnode->ref_count);
3150 	kprintf(" private_node:  %p\n", vnode->private_node);
3151 	kprintf(" mount:         %p\n", vnode->mount);
3152 	kprintf(" covered_by:    %p\n", vnode->covered_by);
3153 	kprintf(" covers:        %p\n", vnode->covers);
3154 	kprintf(" cache:         %p\n", vnode->cache);
3155 	kprintf(" type:          %#" B_PRIx32 "\n", vnode->Type());
3156 	kprintf(" flags:         %s%s%s\n", vnode->IsRemoved() ? "r" : "-",
3157 		vnode->IsBusy() ? "b" : "-", vnode->IsUnpublished() ? "u" : "-");
3158 	kprintf(" advisory_lock: %p\n", vnode->advisory_locking);
3159 
3160 	_dump_advisory_locking(vnode->advisory_locking);
3161 
3162 	if (printPath) {
3163 		void* buffer = debug_malloc(B_PATH_NAME_LENGTH);
3164 		if (buffer != NULL) {
3165 			bool truncated;
3166 			char* path = debug_resolve_vnode_path(vnode, (char*)buffer,
3167 				B_PATH_NAME_LENGTH, truncated);
3168 			if (path != NULL) {
3169 				kprintf(" path:          ");
3170 				if (truncated)
3171 					kputs("<truncated>/");
3172 				kputs(path);
3173 				kputs("\n");
3174 			} else
3175 				kprintf("Failed to resolve vnode path.\n");
3176 
3177 			debug_free(buffer);
3178 		} else
3179 			kprintf("Failed to allocate memory for constructing the path.\n");
3180 	}
3181 
3182 	set_debug_variable("_node", (addr_t)vnode->private_node);
3183 	set_debug_variable("_mount", (addr_t)vnode->mount);
3184 	set_debug_variable("_covered_by", (addr_t)vnode->covered_by);
3185 	set_debug_variable("_covers", (addr_t)vnode->covers);
3186 	set_debug_variable("_adv_lock", (addr_t)vnode->advisory_locking);
3187 }
3188 
3189 
3190 static int
3191 dump_mount(int argc, char** argv)
3192 {
3193 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3194 		kprintf("usage: %s [id|address]\n", argv[0]);
3195 		return 0;
3196 	}
3197 
3198 	ulong val = parse_expression(argv[1]);
3199 	uint32 id = val;
3200 
3201 	struct fs_mount* mount = sMountsTable->Lookup(id);
3202 	if (mount == NULL) {
3203 		if (IS_USER_ADDRESS(id)) {
3204 			kprintf("fs_mount not found\n");
3205 			return 0;
3206 		}
3207 		mount = (fs_mount*)val;
3208 	}
3209 
3210 	_dump_mount(mount);
3211 	return 0;
3212 }
3213 
3214 
3215 static int
3216 dump_mounts(int argc, char** argv)
3217 {
3218 	if (argc != 1) {
3219 		kprintf("usage: %s\n", argv[0]);
3220 		return 0;
3221 	}
3222 
3223 	kprintf("%-*s    id %-*s   %-*s   %-*s   fs_name\n",
3224 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "root",
3225 		B_PRINTF_POINTER_WIDTH, "covers", B_PRINTF_POINTER_WIDTH, "cookie");
3226 
3227 	struct fs_mount* mount;
3228 
3229 	MountTable::Iterator iterator(sMountsTable);
3230 	while (iterator.HasNext()) {
3231 		mount = iterator.Next();
3232 		kprintf("%p%4" B_PRIdDEV " %p %p %p %s\n", mount, mount->id, mount->root_vnode,
3233 			mount->root_vnode->covers, mount->volume->private_volume,
3234 			mount->volume->file_system_name);
3235 
3236 		fs_volume* volume = mount->volume;
3237 		while (volume->super_volume != NULL) {
3238 			volume = volume->super_volume;
3239 			kprintf("                                     %p %s\n",
3240 				volume->private_volume, volume->file_system_name);
3241 		}
3242 	}
3243 
3244 	return 0;
3245 }
3246 
3247 
3248 static int
3249 dump_vnode(int argc, char** argv)
3250 {
3251 	bool printPath = false;
3252 	int argi = 1;
3253 	if (argc >= 2 && strcmp(argv[argi], "-p") == 0) {
3254 		printPath = true;
3255 		argi++;
3256 	}
3257 
3258 	if (argi >= argc || argi + 2 < argc) {
3259 		print_debugger_command_usage(argv[0]);
3260 		return 0;
3261 	}
3262 
3263 	struct vnode* vnode = NULL;
3264 
3265 	if (argi + 1 == argc) {
3266 		vnode = (struct vnode*)parse_expression(argv[argi]);
3267 		if (IS_USER_ADDRESS(vnode)) {
3268 			kprintf("invalid vnode address\n");
3269 			return 0;
3270 		}
3271 		_dump_vnode(vnode, printPath);
3272 		return 0;
3273 	}
3274 
3275 	dev_t device = parse_expression(argv[argi]);
3276 	ino_t id = parse_expression(argv[argi + 1]);
3277 
3278 	VnodeTable::Iterator iterator(sVnodeTable);
3279 	while (iterator.HasNext()) {
3280 		vnode = iterator.Next();
3281 		if (vnode->id != id || vnode->device != device)
3282 			continue;
3283 
3284 		_dump_vnode(vnode, printPath);
3285 	}
3286 
3287 	return 0;
3288 }
3289 
3290 
3291 static int
3292 dump_vnodes(int argc, char** argv)
3293 {
3294 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3295 		kprintf("usage: %s [device]\n", argv[0]);
3296 		return 0;
3297 	}
3298 
3299 	// restrict dumped nodes to a certain device if requested
3300 	dev_t device = parse_expression(argv[1]);
3301 
3302 	struct vnode* vnode;
3303 
3304 	kprintf("%-*s   dev     inode  ref %-*s   %-*s   %-*s   flags\n",
3305 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache",
3306 		B_PRINTF_POINTER_WIDTH, "fs-node", B_PRINTF_POINTER_WIDTH, "locking");
3307 
3308 	VnodeTable::Iterator iterator(sVnodeTable);
3309 	while (iterator.HasNext()) {
3310 		vnode = iterator.Next();
3311 		if (vnode->device != device)
3312 			continue;
3313 
3314 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO "%5" B_PRId32 " %p %p %p %s%s%s\n",
3315 			vnode, vnode->device, vnode->id, vnode->ref_count, vnode->cache,
3316 			vnode->private_node, vnode->advisory_locking,
3317 			vnode->IsRemoved() ? "r" : "-", vnode->IsBusy() ? "b" : "-",
3318 			vnode->IsUnpublished() ? "u" : "-");
3319 	}
3320 
3321 	return 0;
3322 }
3323 
3324 
3325 static int
3326 dump_vnode_caches(int argc, char** argv)
3327 {
3328 	struct vnode* vnode;
3329 
3330 	if (argc > 2 || (argc == 2 && !strcmp(argv[1], "--help"))) {
3331 		kprintf("usage: %s [device]\n", argv[0]);
3332 		return 0;
3333 	}
3334 
3335 	// restrict dumped nodes to a certain device if requested
3336 	dev_t device = -1;
3337 	if (argc > 1)
3338 		device = parse_expression(argv[1]);
3339 
3340 	kprintf("%-*s   dev     inode %-*s       size   pages\n",
3341 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache");
3342 
3343 	VnodeTable::Iterator iterator(sVnodeTable);
3344 	while (iterator.HasNext()) {
3345 		vnode = iterator.Next();
3346 		if (vnode->cache == NULL)
3347 			continue;
3348 		if (device != -1 && vnode->device != device)
3349 			continue;
3350 
3351 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO " %p %8" B_PRIdOFF "%8" B_PRId32 "\n",
3352 			vnode, vnode->device, vnode->id, vnode->cache,
3353 			(vnode->cache->virtual_end + B_PAGE_SIZE - 1) / B_PAGE_SIZE,
3354 			vnode->cache->page_count);
3355 	}
3356 
3357 	return 0;
3358 }
3359 
3360 
3361 int
3362 dump_io_context(int argc, char** argv)
3363 {
3364 	if (argc > 2 || (argc == 2 && !strcmp(argv[1], "--help"))) {
3365 		kprintf("usage: %s [team-id|address]\n", argv[0]);
3366 		return 0;
3367 	}
3368 
3369 	struct io_context* context = NULL;
3370 
3371 	if (argc > 1) {
3372 		ulong num = parse_expression(argv[1]);
3373 		if (IS_KERNEL_ADDRESS(num))
3374 			context = (struct io_context*)num;
3375 		else {
3376 			Team* team = team_get_team_struct_locked(num);
3377 			if (team == NULL) {
3378 				kprintf("could not find team with ID %lu\n", num);
3379 				return 0;
3380 			}
3381 			context = (struct io_context*)team->io_context;
3382 		}
3383 	} else
3384 		context = get_current_io_context(true);
3385 
3386 	kprintf("I/O CONTEXT: %p\n", context);
3387 	kprintf(" root vnode:\t%p\n", context->root);
3388 	kprintf(" cwd vnode:\t%p\n", context->cwd);
3389 	kprintf(" used fds:\t%" B_PRIu32 "\n", context->num_used_fds);
3390 	kprintf(" max fds:\t%" B_PRIu32 "\n", context->table_size);
3391 
3392 	if (context->num_used_fds) {
3393 		kprintf("   no.  type    %*s  ref  open  mode         pos    %*s\n",
3394 			B_PRINTF_POINTER_WIDTH, "ops", B_PRINTF_POINTER_WIDTH, "cookie");
3395 	}
3396 
3397 	for (uint32 i = 0; i < context->table_size; i++) {
3398 		struct file_descriptor* fd = context->fds[i];
3399 		if (fd == NULL)
3400 			continue;
3401 
3402 		kprintf("  %3" B_PRIu32 ":  %4" B_PRId32 "  %p  %3" B_PRId32 "  %4"
3403 			B_PRIu32 "  %4" B_PRIx32 "  %10" B_PRIdOFF "  %p  %s %p\n", i,
3404 			fd->type, fd->ops, fd->ref_count, fd->open_count, fd->open_mode,
3405 			fd->pos, fd->cookie,
3406 			fd->type >= FDTYPE_INDEX && fd->type <= FDTYPE_QUERY
3407 				? "mount" : "vnode",
3408 			fd->u.vnode);
3409 	}
3410 
3411 	kprintf(" used monitors:\t%" B_PRIu32 "\n", context->num_monitors);
3412 	kprintf(" max monitors:\t%" B_PRIu32 "\n", context->max_monitors);
3413 
3414 	set_debug_variable("_cwd", (addr_t)context->cwd);
3415 
3416 	return 0;
3417 }
3418 
3419 
3420 int
3421 dump_vnode_usage(int argc, char** argv)
3422 {
3423 	if (argc != 1) {
3424 		kprintf("usage: %s\n", argv[0]);
3425 		return 0;
3426 	}
3427 
3428 	kprintf("Unused vnodes: %" B_PRIu32 " (max unused %" B_PRIu32 ")\n",
3429 		sUnusedVnodes, kMaxUnusedVnodes);
3430 
3431 	uint32 count = sVnodeTable->CountElements();
3432 
3433 	kprintf("%" B_PRIu32 " vnodes total (%" B_PRIu32 " in use).\n", count,
3434 		count - sUnusedVnodes);
3435 	return 0;
3436 }
3437 
3438 #endif	// ADD_DEBUGGER_COMMANDS
3439 
3440 
3441 /*!	Clears memory specified by an iovec array.
3442 */
3443 static void
3444 zero_iovecs(const iovec* vecs, size_t vecCount, size_t bytes)
3445 {
3446 	for (size_t i = 0; i < vecCount && bytes > 0; i++) {
3447 		size_t length = std::min(vecs[i].iov_len, bytes);
3448 		memset(vecs[i].iov_base, 0, length);
3449 		bytes -= length;
3450 	}
3451 }
3452 
3453 
3454 /*!	Does the dirty work of combining the file_io_vecs with the iovecs
3455 	and calls the file system hooks to read/write the request to disk.
3456 */
3457 static status_t
3458 common_file_io_vec_pages(struct vnode* vnode, void* cookie,
3459 	const file_io_vec* fileVecs, size_t fileVecCount, const iovec* vecs,
3460 	size_t vecCount, uint32* _vecIndex, size_t* _vecOffset, size_t* _numBytes,
3461 	bool doWrite)
3462 {
3463 	if (fileVecCount == 0) {
3464 		// There are no file vecs at this offset, so we're obviously trying
3465 		// to access the file outside of its bounds
3466 		return B_BAD_VALUE;
3467 	}
3468 
3469 	size_t numBytes = *_numBytes;
3470 	uint32 fileVecIndex;
3471 	size_t vecOffset = *_vecOffset;
3472 	uint32 vecIndex = *_vecIndex;
3473 	status_t status;
3474 	size_t size;
3475 
3476 	if (!doWrite && vecOffset == 0) {
3477 		// now directly read the data from the device
3478 		// the first file_io_vec can be read directly
3479 
3480 		if (fileVecs[0].length < (off_t)numBytes)
3481 			size = fileVecs[0].length;
3482 		else
3483 			size = numBytes;
3484 
3485 		if (fileVecs[0].offset >= 0) {
3486 			status = FS_CALL(vnode, read_pages, cookie, fileVecs[0].offset,
3487 				&vecs[vecIndex], vecCount - vecIndex, &size);
3488 		} else {
3489 			// sparse read
3490 			zero_iovecs(&vecs[vecIndex], vecCount - vecIndex, size);
3491 			status = B_OK;
3492 		}
3493 		if (status != B_OK)
3494 			return status;
3495 
3496 		// TODO: this is a work-around for buggy device drivers!
3497 		//	When our own drivers honour the length, we can:
3498 		//	a) also use this direct I/O for writes (otherwise, it would
3499 		//	   overwrite precious data)
3500 		//	b) panic if the term below is true (at least for writes)
3501 		if ((off_t)size > fileVecs[0].length) {
3502 			//dprintf("warning: device driver %p doesn't respect total length "
3503 			//	"in read_pages() call!\n", ref->device);
3504 			size = fileVecs[0].length;
3505 		}
3506 
3507 		ASSERT((off_t)size <= fileVecs[0].length);
3508 
3509 		// If the file portion was contiguous, we're already done now
3510 		if (size == numBytes)
3511 			return B_OK;
3512 
3513 		// if we reached the end of the file, we can return as well
3514 		if ((off_t)size != fileVecs[0].length) {
3515 			*_numBytes = size;
3516 			return B_OK;
3517 		}
3518 
3519 		fileVecIndex = 1;
3520 
3521 		// first, find out where we have to continue in our iovecs
3522 		for (; vecIndex < vecCount; vecIndex++) {
3523 			if (size < vecs[vecIndex].iov_len)
3524 				break;
3525 
3526 			size -= vecs[vecIndex].iov_len;
3527 		}
3528 
3529 		vecOffset = size;
3530 	} else {
3531 		fileVecIndex = 0;
3532 		size = 0;
3533 	}
3534 
3535 	// Too bad, let's process the rest of the file_io_vecs
3536 
3537 	size_t totalSize = size;
3538 	size_t bytesLeft = numBytes - size;
3539 
3540 	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
3541 		const file_io_vec &fileVec = fileVecs[fileVecIndex];
3542 		off_t fileOffset = fileVec.offset;
3543 		off_t fileLeft = min_c(fileVec.length, (off_t)bytesLeft);
3544 
3545 		TRACE(("FILE VEC [%" B_PRIu32 "] length %" B_PRIdOFF "\n", fileVecIndex,
3546 			fileLeft));
3547 
3548 		// process the complete fileVec
3549 		while (fileLeft > 0) {
3550 			iovec tempVecs[MAX_TEMP_IO_VECS];
3551 			uint32 tempCount = 0;
3552 
3553 			// size tracks how much of what is left of the current fileVec
3554 			// (fileLeft) has been assigned to tempVecs
3555 			size = 0;
3556 
3557 			// assign what is left of the current fileVec to the tempVecs
3558 			for (size = 0; (off_t)size < fileLeft && vecIndex < vecCount
3559 					&& tempCount < MAX_TEMP_IO_VECS;) {
3560 				// try to satisfy one iovec per iteration (or as much as
3561 				// possible)
3562 
3563 				// bytes left of the current iovec
3564 				size_t vecLeft = vecs[vecIndex].iov_len - vecOffset;
3565 				if (vecLeft == 0) {
3566 					vecOffset = 0;
3567 					vecIndex++;
3568 					continue;
3569 				}
3570 
3571 				TRACE(("fill vec %" B_PRIu32 ", offset = %lu, size = %lu\n",
3572 					vecIndex, vecOffset, size));
3573 
3574 				// actually available bytes
3575 				size_t tempVecSize = min_c(vecLeft, fileLeft - size);
3576 
3577 				tempVecs[tempCount].iov_base
3578 					= (void*)((addr_t)vecs[vecIndex].iov_base + vecOffset);
3579 				tempVecs[tempCount].iov_len = tempVecSize;
3580 				tempCount++;
3581 
3582 				size += tempVecSize;
3583 				vecOffset += tempVecSize;
3584 			}
3585 
3586 			size_t bytes = size;
3587 
3588 			if (fileOffset == -1) {
3589 				if (doWrite) {
3590 					panic("sparse write attempt: vnode %p", vnode);
3591 					status = B_IO_ERROR;
3592 				} else {
3593 					// sparse read
3594 					zero_iovecs(tempVecs, tempCount, bytes);
3595 					status = B_OK;
3596 				}
3597 			} else if (doWrite) {
3598 				status = FS_CALL(vnode, write_pages, cookie, fileOffset,
3599 					tempVecs, tempCount, &bytes);
3600 			} else {
3601 				status = FS_CALL(vnode, read_pages, cookie, fileOffset,
3602 					tempVecs, tempCount, &bytes);
3603 			}
3604 			if (status != B_OK)
3605 				return status;
3606 
3607 			totalSize += bytes;
3608 			bytesLeft -= size;
3609 			if (fileOffset >= 0)
3610 				fileOffset += size;
3611 			fileLeft -= size;
3612 			//dprintf("-> file left = %Lu\n", fileLeft);
3613 
3614 			if (size != bytes || vecIndex >= vecCount) {
3615 				// there are no more bytes or iovecs, let's bail out
3616 				*_numBytes = totalSize;
3617 				return B_OK;
3618 			}
3619 		}
3620 	}
3621 
3622 	*_vecIndex = vecIndex;
3623 	*_vecOffset = vecOffset;
3624 	*_numBytes = totalSize;
3625 	return B_OK;
3626 }
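
/*	Illustration of the pairing above (hypothetical numbers): a 500 byte
	read backed by two file_io_vecs of 300 and 200 bytes, delivered into
	two iovecs of 250 bytes each, results in these FS calls:

		read_pages(offset 1000, 300 bytes) -> iovec 0 (250) + iovec 1 (50)
		read_pages(offset 5000, 200 bytes) -> iovec 1 (remaining 200)

	A file_io_vec with offset -1 denotes a sparse region; it is zeroed via
	zero_iovecs() instead of calling into the file system.
*/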
3627 
3628 
3629 static bool
3630 is_user_in_group(gid_t gid)
3631 {
3632 	if (gid == getegid())
3633 		return true;
3634 
3635 	gid_t groups[NGROUPS_MAX];
3636 	int groupCount = getgroups(NGROUPS_MAX, groups);
3637 	for (int i = 0; i < groupCount; i++) {
3638 		if (gid == groups[i])
3639 			return true;
3640 	}
3641 
3642 	return false;
3643 }
3644 
3645 
3646 static status_t
3647 free_io_context(io_context* context)
3648 {
3649 	uint32 i;
3650 
3651 	TIOC(FreeIOContext(context));
3652 
3653 	if (context->root)
3654 		put_vnode(context->root);
3655 
3656 	if (context->cwd)
3657 		put_vnode(context->cwd);
3658 
3659 	mutex_lock(&context->io_mutex);
3660 
3661 	for (i = 0; i < context->table_size; i++) {
3662 		if (struct file_descriptor* descriptor = context->fds[i]) {
3663 			close_fd(context, descriptor);
3664 			put_fd(descriptor);
3665 		}
3666 	}
3667 
3668 	mutex_destroy(&context->io_mutex);
3669 
3670 	remove_node_monitors(context);
3671 	free(context->fds);
3672 	free(context);
3673 
3674 	return B_OK;
3675 }
3676 
3677 
3678 static status_t
3679 resize_monitor_table(struct io_context* context, const int newSize)
3680 {
3681 	status_t status = B_OK;
3682 
3683 	if (newSize <= 0 || newSize > MAX_NODE_MONITORS)
3684 		return B_BAD_VALUE;
3685 
3686 	mutex_lock(&context->io_mutex);
3687 
3688 	if ((size_t)newSize < context->num_monitors) {
3689 		status = B_BUSY;
3690 		goto out;
3691 	}
3692 	context->max_monitors = newSize;
3693 
3694 out:
3695 	mutex_unlock(&context->io_mutex);
3696 	return status;
3697 }
3698 
3699 
3700 //	#pragma mark - public API for file systems
3701 
3702 
3703 extern "C" status_t
3704 new_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3705 	fs_vnode_ops* ops)
3706 {
3707 	FUNCTION(("new_vnode(volume = %p (%" B_PRId32 "), vnodeID = %" B_PRId64
3708 		", node = %p)\n", volume, volume->id, vnodeID, privateNode));
3709 
3710 	if (privateNode == NULL)
3711 		return B_BAD_VALUE;
3712 
3713 	int32 tries = BUSY_VNODE_RETRIES;
3714 restart:
3715 	// create the node
3716 	bool nodeCreated;
3717 	struct vnode* vnode;
3718 	status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3719 		nodeCreated);
3720 	if (status != B_OK)
3721 		return status;
3722 
3723 	WriteLocker nodeLocker(sVnodeLock, true);
3724 		// create_new_vnode_and_lock() has locked for us
3725 
3726 	if (!nodeCreated && vnode->IsBusy()) {
3727 		nodeLocker.Unlock();
3728 		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3729 			return B_BUSY;
3730 		goto restart;
3731 	}
3732 
3733 	// file system integrity check:
3734 	// test if the vnode already exists and bail out if this is the case!
3735 	if (!nodeCreated) {
3736 		panic("vnode %" B_PRIdDEV ":%" B_PRIdINO " already exists (node = %p, "
3737 			"vnode->node = %p)!", volume->id, vnodeID, privateNode,
3738 			vnode->private_node);
3739 		return B_ERROR;
3740 	}
3741 
3742 	vnode->private_node = privateNode;
3743 	vnode->ops = ops;
3744 	vnode->SetUnpublished(true);
3745 
3746 	TRACE(("returns: %s\n", strerror(status)));
3747 
3748 	return status;
3749 }
3750 
3751 
3752 extern "C" status_t
3753 publish_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3754 	fs_vnode_ops* ops, int type, uint32 flags)
3755 {
3756 	FUNCTION(("publish_vnode()\n"));
3757 
3758 	int32 tries = BUSY_VNODE_RETRIES;
3759 restart:
3760 	WriteLocker locker(sVnodeLock);
3761 
3762 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3763 
3764 	bool nodeCreated = false;
3765 	if (vnode == NULL) {
3766 		if (privateNode == NULL)
3767 			return B_BAD_VALUE;
3768 
3769 		// create the node
3770 		locker.Unlock();
3771 			// create_new_vnode_and_lock() will re-lock for us on success
3772 		status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3773 			nodeCreated);
3774 		if (status != B_OK)
3775 			return status;
3776 
3777 		locker.SetTo(sVnodeLock, true);
3778 	}
3779 
3780 	if (nodeCreated) {
3781 		vnode->private_node = privateNode;
3782 		vnode->ops = ops;
3783 		vnode->SetUnpublished(true);
3784 	} else if (vnode->IsBusy() && vnode->IsUnpublished()
3785 		&& vnode->private_node == privateNode && vnode->ops == ops) {
3786 		// already known, but not published
3787 	} else if (vnode->IsBusy()) {
3788 		locker.Unlock();
3789 		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3790 			return B_BUSY;
3791 		goto restart;
3792 	} else
3793 		return B_BAD_VALUE;
3794 
3795 	bool publishSpecialSubNode = false;
3796 
3797 	vnode->SetType(type);
3798 	vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
3799 	publishSpecialSubNode = is_special_node_type(type)
3800 		&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
3801 
3802 	status_t status = B_OK;
3803 
3804 	// create sub vnodes, if necessary
3805 	if (volume->sub_volume != NULL || publishSpecialSubNode) {
3806 		locker.Unlock();
3807 
3808 		fs_volume* subVolume = volume;
3809 		if (volume->sub_volume != NULL) {
3810 			while (status == B_OK && subVolume->sub_volume != NULL) {
3811 				subVolume = subVolume->sub_volume;
3812 				status = subVolume->ops->create_sub_vnode(subVolume, vnodeID,
3813 					vnode);
3814 			}
3815 		}
3816 
3817 		if (status == B_OK && publishSpecialSubNode)
3818 			status = create_special_sub_node(vnode, flags);
3819 
3820 		if (status != B_OK) {
3821 			// error -- clean up the created sub vnodes
3822 			while (subVolume->super_volume != volume) {
3823 				subVolume = subVolume->super_volume;
3824 				subVolume->ops->delete_sub_vnode(subVolume, vnode);
3825 			}
3826 		}
3827 
3828 		if (status == B_OK) {
3829 			ReadLocker vnodesReadLocker(sVnodeLock);
3830 			AutoLocker<Vnode> nodeLocker(vnode);
3831 			vnode->SetBusy(false);
3832 			vnode->SetUnpublished(false);
3833 		} else {
3834 			locker.Lock();
3835 			sVnodeTable->Remove(vnode);
3836 			remove_vnode_from_mount_list(vnode, vnode->mount);
3837 			free(vnode);
3838 		}
3839 	} else {
3840 		// we still hold the write lock -- mark the node unbusy and published
3841 		vnode->SetBusy(false);
3842 		vnode->SetUnpublished(false);
3843 	}
3844 
3845 	TRACE(("returns: %s\n", strerror(status)));
3846 
3847 	return status;
3848 }
3849 
3850 
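/*	Usage sketch (hypothetical file system code, not part of this API): a node
	is typically announced to the VFS in two steps -- new_vnode() while it is
	still being set up (the node stays busy and unpublished), then
	publish_vnode() once it is fully initialized. `volume`, `inode`, and
	`gInodeOps` are placeholder names:

		status_t status = new_vnode(volume, inode->ID(), inode, &gInodeOps);
		if (status == B_OK) {
			// ... finish initializing the node ...
			status = publish_vnode(volume, inode->ID(), inode, &gInodeOps,
				inode->Mode() & S_IFMT, 0);
		}
*/

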
3851 extern "C" status_t
3852 get_vnode(fs_volume* volume, ino_t vnodeID, void** _privateNode)
3853 {
3854 	struct vnode* vnode;
3855 
3856 	if (volume == NULL)
3857 		return B_BAD_VALUE;
3858 
3859 	status_t status = get_vnode(volume->id, vnodeID, &vnode, true, true);
3860 	if (status != B_OK)
3861 		return status;
3862 
3863 	// If this is a layered FS, we need to get the node cookie for the requested
3864 	// layer.
3865 	if (HAS_FS_CALL(vnode, get_super_vnode)) {
3866 		fs_vnode resolvedNode;
3867 		status_t status = FS_CALL(vnode, get_super_vnode, volume,
3868 			&resolvedNode);
3869 		if (status != B_OK) {
3870 			panic("get_vnode(): Failed to get super node for vnode %p, "
3871 				"volume: %p", vnode, volume);
3872 			put_vnode(vnode);
3873 			return status;
3874 		}
3875 
3876 		if (_privateNode != NULL)
3877 			*_privateNode = resolvedNode.private_node;
3878 	} else if (_privateNode != NULL)
3879 		*_privateNode = vnode->private_node;
3880 
3881 	return B_OK;
3882 }
3883 
3884 
3885 extern "C" status_t
3886 acquire_vnode(fs_volume* volume, ino_t vnodeID)
3887 {
3888 	struct vnode* vnode;
3889 
3890 	rw_lock_read_lock(&sVnodeLock);
3891 	vnode = lookup_vnode(volume->id, vnodeID);
3892 	rw_lock_read_unlock(&sVnodeLock);
3893 
3894 	if (vnode == NULL)
3895 		return B_BAD_VALUE;
3896 
3897 	inc_vnode_ref_count(vnode);
3898 	return B_OK;
3899 }
3900 
3901 
3902 extern "C" status_t
3903 put_vnode(fs_volume* volume, ino_t vnodeID)
3904 {
3905 	struct vnode* vnode;
3906 
3907 	rw_lock_read_lock(&sVnodeLock);
3908 	vnode = lookup_vnode(volume->id, vnodeID);
3909 	rw_lock_read_unlock(&sVnodeLock);
3910 
3911 	if (vnode == NULL)
3912 		return B_BAD_VALUE;
3913 
3914 	dec_vnode_ref_count(vnode, false, true);
3915 	return B_OK;
3916 }
3917 
3918 
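/*	Usage sketch: a file system that needs one of its own nodes from within a
	hook balances get_vnode() against put_vnode(); `volume` and `dirID` are
	placeholders:

		void* privateNode;
		if (get_vnode(volume, dirID, &privateNode) == B_OK) {
			// ... use privateNode ...
			put_vnode(volume, dirID);
		}
*/

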
3919 extern "C" status_t
3920 remove_vnode(fs_volume* volume, ino_t vnodeID)
3921 {
3922 	ReadLocker locker(sVnodeLock);
3923 
3924 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3925 	if (vnode == NULL)
3926 		return B_ENTRY_NOT_FOUND;
3927 
3928 	if (vnode->covered_by != NULL || vnode->covers != NULL) {
3929 		// this vnode is in use
3930 		return B_BUSY;
3931 	}
3932 
3933 	vnode->Lock();
3934 
3935 	vnode->SetRemoved(true);
3936 	bool removeUnpublished = false;
3937 
3938 	if (vnode->IsUnpublished()) {
3939 		// prepare the vnode for deletion
3940 		removeUnpublished = true;
3941 		vnode->SetBusy(true);
3942 	}
3943 
3944 	vnode->Unlock();
3945 	locker.Unlock();
3946 
3947 	if (removeUnpublished) {
3948 		// If the vnode hasn't been published yet, we delete it here
3949 		atomic_add(&vnode->ref_count, -1);
3950 		free_vnode(vnode, true);
3951 	}
3952 
3953 	return B_OK;
3954 }
3955 
3956 
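/*	Usage sketch: an fs_unlink() implementation would typically call
	remove_vnode() once the last directory entry referring to the node is
	gone, so that the node gets deleted when its last reference is released.
	`volume` and `inode` are placeholders:

		if (inode->LinkCount() == 0)
			remove_vnode(volume, inode->ID());

	Should the deletion have to be rolled back (e.g. a failing transaction),
	unremove_vnode() below undoes the effect, as long as the node still
	exists.
*/

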
3957 extern "C" status_t
3958 unremove_vnode(fs_volume* volume, ino_t vnodeID)
3959 {
3960 	struct vnode* vnode;
3961 
3962 	rw_lock_read_lock(&sVnodeLock);
3963 
3964 	vnode = lookup_vnode(volume->id, vnodeID);
3965 	if (vnode) {
3966 		AutoLocker<Vnode> nodeLocker(vnode);
3967 		vnode->SetRemoved(false);
3968 	}
3969 
3970 	rw_lock_read_unlock(&sVnodeLock);
3971 	return B_OK;
3972 }
3973 
3974 
3975 extern "C" status_t
3976 get_vnode_removed(fs_volume* volume, ino_t vnodeID, bool* _removed)
3977 {
3978 	ReadLocker _(sVnodeLock);
3979 
3980 	if (struct vnode* vnode = lookup_vnode(volume->id, vnodeID)) {
3981 		if (_removed != NULL)
3982 			*_removed = vnode->IsRemoved();
3983 		return B_OK;
3984 	}
3985 
3986 	return B_BAD_VALUE;
3987 }
3988 
3989 
3990 extern "C" status_t
3991 mark_vnode_busy(fs_volume* volume, ino_t vnodeID, bool busy)
3992 {
3993 	ReadLocker locker(sVnodeLock);
3994 
3995 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3996 	if (vnode == NULL)
3997 		return B_ENTRY_NOT_FOUND;
3998 
3999 	// are we trying to mark an already busy node busy again?
4000 	if (busy && vnode->IsBusy())
4001 		return B_BUSY;
4002 
4003 	vnode->Lock();
4004 	vnode->SetBusy(busy);
4005 	vnode->Unlock();
4006 
4007 	return B_OK;
4008 }
4009 
4010 
4011 extern "C" status_t
4012 change_vnode_id(fs_volume* volume, ino_t vnodeID, ino_t newID)
4013 {
4014 	WriteLocker locker(sVnodeLock);
4015 
4016 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
4017 	if (vnode == NULL)
4018 		return B_ENTRY_NOT_FOUND;
4019 
4020 	sVnodeTable->Remove(vnode);
4021 	vnode->id = newID;
4022 	sVnodeTable->Insert(vnode);
4023 
4024 	if (vnode->cache != NULL && vnode->cache->type == CACHE_TYPE_VNODE)
4025 		((VMVnodeCache*)vnode->cache)->SetVnodeID(newID);
4026 
4027 	return B_OK;
4028 }
4029 
4030 
4031 extern "C" fs_volume*
4032 volume_for_vnode(fs_vnode* _vnode)
4033 {
4034 	if (_vnode == NULL)
4035 		return NULL;
4036 
4037 	struct vnode* vnode = static_cast<struct vnode*>(_vnode);
4038 	return vnode->mount->volume;
4039 }
4040 
4041 
4042 extern "C" status_t
4043 check_access_permissions(int accessMode, mode_t mode, gid_t nodeGroupID,
4044 	uid_t nodeUserID)
4045 {
4046 	// get node permissions
4047 	int userPermissions = (mode & S_IRWXU) >> 6;
4048 	int groupPermissions = (mode & S_IRWXG) >> 3;
4049 	int otherPermissions = mode & S_IRWXO;
4050 
4051 	// get the node permissions for this uid/gid
4052 	int permissions = 0;
4053 	uid_t uid = geteuid();
4054 
4055 	if (uid == 0) {
4056 		// user is root
4057 		// root has always read/write permission, but at least one of the
4058 		// X bits must be set for execute permission
4059 		permissions = userPermissions | groupPermissions | otherPermissions
4060 			| S_IROTH | S_IWOTH;
4061 		if (S_ISDIR(mode))
4062 			permissions |= S_IXOTH;
4063 	} else if (uid == nodeUserID) {
4064 		// user is node owner
4065 		permissions = userPermissions;
4066 	} else if (is_user_in_group(nodeGroupID)) {
4067 		// user is in owning group
4068 		permissions = groupPermissions;
4069 	} else {
4070 		// user is one of the others
4071 		permissions = otherPermissions;
4072 	}
4073 
4074 	return (accessMode & ~permissions) == 0 ? B_OK : B_PERMISSION_DENIED;
4075 }
4076 
4077 
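/*	Worked example for check_access_permissions() above: for a node with mode
	0644 whose owner and group don't match the caller (and the caller isn't
	root), only the "other" bits (r--) apply; `gid` and `uid` are
	placeholders:

		check_access_permissions(R_OK, 0644, gid, uid);
			// B_OK: R_OK & ~04 == 0
		check_access_permissions(W_OK, 0644, gid, uid);
			// B_PERMISSION_DENIED: W_OK & ~04 != 0
*/

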
4078 #if 0
4079 extern "C" status_t
4080 read_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4081 	size_t* _numBytes)
4082 {
4083 	struct file_descriptor* descriptor;
4084 	struct vnode* vnode;
4085 
4086 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4087 	if (descriptor == NULL)
4088 		return B_FILE_ERROR;
4089 
4090 	status_t status = vfs_read_pages(vnode, descriptor->cookie, pos, vecs,
4091 		count, 0, _numBytes);
4092 
4093 	put_fd(descriptor);
4094 	return status;
4095 }
4096 
4097 
4098 extern "C" status_t
4099 write_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4100 	size_t* _numBytes)
4101 {
4102 	struct file_descriptor* descriptor;
4103 	struct vnode* vnode;
4104 
4105 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4106 	if (descriptor == NULL)
4107 		return B_FILE_ERROR;
4108 
4109 	status_t status = vfs_write_pages(vnode, descriptor->cookie, pos, vecs,
4110 		count, 0, _numBytes);
4111 
4112 	put_fd(descriptor);
4113 	return status;
4114 }
4115 #endif
4116 
4117 
4118 extern "C" status_t
4119 read_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4120 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4121 	size_t* _bytes)
4122 {
4123 	struct file_descriptor* descriptor;
4124 	struct vnode* vnode;
4125 
4126 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4127 	if (descriptor == NULL)
4128 		return B_FILE_ERROR;
4129 
4130 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4131 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4132 		false);
4133 
4134 	put_fd(descriptor);
4135 	return status;
4136 }
4137 
4138 
4139 extern "C" status_t
4140 write_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4141 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4142 	size_t* _bytes)
4143 {
4144 	struct file_descriptor* descriptor;
4145 	struct vnode* vnode;
4146 
4147 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4148 	if (descriptor == NULL)
4149 		return B_FILE_ERROR;
4150 
4151 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4152 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4153 		true);
4154 
4155 	put_fd(descriptor);
4156 	return status;
4157 }
4158 
4159 
4160 extern "C" status_t
4161 entry_cache_add(dev_t mountID, ino_t dirID, const char* name, ino_t nodeID)
4162 {
4163 	// lookup mount -- the caller is required to make sure that the mount
4164 	// won't go away
4165 	MutexLocker locker(sMountMutex);
4166 	struct fs_mount* mount = find_mount(mountID);
4167 	if (mount == NULL)
4168 		return B_BAD_VALUE;
4169 	locker.Unlock();
4170 
4171 	return mount->entry_cache.Add(dirID, name, nodeID, false);
4172 }
4173 
4174 
4175 extern "C" status_t
4176 entry_cache_add_missing(dev_t mountID, ino_t dirID, const char* name)
4177 {
4178 	// lookup mount -- the caller is required to make sure that the mount
4179 	// won't go away
4180 	MutexLocker locker(sMountMutex);
4181 	struct fs_mount* mount = find_mount(mountID);
4182 	if (mount == NULL)
4183 		return B_BAD_VALUE;
4184 	locker.Unlock();
4185 
4186 	return mount->entry_cache.Add(dirID, name, -1, true);
4187 }
4188 
4189 
4190 extern "C" status_t
4191 entry_cache_remove(dev_t mountID, ino_t dirID, const char* name)
4192 {
4193 	// lookup mount -- the caller is required to make sure that the mount
4194 	// won't go away
4195 	MutexLocker locker(sMountMutex);
4196 	struct fs_mount* mount = find_mount(mountID);
4197 	if (mount == NULL)
4198 		return B_BAD_VALUE;
4199 	locker.Unlock();
4200 
4201 	return mount->entry_cache.Remove(dirID, name);
4202 }
4203 
4204 
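/*	Usage sketch: a file system can seed the entry cache from its lookup
	hook, so repeated lookups of the same name don't have to reach the FS;
	negative results can be cached as well. `volume`, `directoryID`, `name`,
	and `nodeID` are placeholders:

		if (status == B_OK)
			entry_cache_add(volume->id, directoryID, name, nodeID);
		else if (status == B_ENTRY_NOT_FOUND)
			entry_cache_add_missing(volume->id, directoryID, name);

	A later rename or unlink must invalidate the entry via
	entry_cache_remove().
*/

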
4205 //	#pragma mark - private VFS API
4206 //	Functions the VFS exports for other parts of the kernel
4207 
4208 
4209 /*! Acquires another reference to the vnode that has to be released
4210 	by calling vfs_put_vnode().
4211 */
4212 void
4213 vfs_acquire_vnode(struct vnode* vnode)
4214 {
4215 	inc_vnode_ref_count(vnode);
4216 }
4217 
4218 
4219 /*! This is currently called from file_cache_create() only.
4220 	It's probably a temporary solution as long as devfs requires that
4221 	fs_read_pages()/fs_write_pages() are called with the standard
4222 	open cookie and not with a device cookie.
4223 	If that's done differently, remove this call; it has no other
4224 	purpose.
4225 */
4226 extern "C" status_t
4227 vfs_get_cookie_from_fd(int fd, void** _cookie)
4228 {
4229 	struct file_descriptor* descriptor;
4230 
4231 	descriptor = get_fd(get_current_io_context(true), fd);
4232 	if (descriptor == NULL)
4233 		return B_FILE_ERROR;
4234 
4235 	*_cookie = descriptor->cookie;
4236 	return B_OK;
4237 }
4238 
4239 
4240 extern "C" status_t
4241 vfs_get_vnode_from_fd(int fd, bool kernel, struct vnode** vnode)
4242 {
4243 	*vnode = get_vnode_from_fd(fd, kernel);
4244 
4245 	if (*vnode == NULL)
4246 		return B_FILE_ERROR;
4247 
4248 	return B_NO_ERROR;
4249 }
4250 
4251 
4252 extern "C" status_t
4253 vfs_get_vnode_from_path(const char* path, bool kernel, struct vnode** _vnode)
4254 {
4255 	TRACE(("vfs_get_vnode_from_path: entry. path = '%s', kernel %d\n",
4256 		path, kernel));
4257 
4258 	KPath pathBuffer;
4259 	if (pathBuffer.InitCheck() != B_OK)
4260 		return B_NO_MEMORY;
4261 
4262 	char* buffer = pathBuffer.LockBuffer();
4263 	strlcpy(buffer, path, pathBuffer.BufferSize());
4264 
4265 	struct vnode* vnode;
4266 	status_t status = path_to_vnode(buffer, true, &vnode, NULL, kernel);
4267 	if (status != B_OK)
4268 		return status;
4269 
4270 	*_vnode = vnode;
4271 	return B_OK;
4272 }
4273 
4274 
4275 extern "C" status_t
4276 vfs_get_vnode(dev_t mountID, ino_t vnodeID, bool canWait, struct vnode** _vnode)
4277 {
4278 	struct vnode* vnode = NULL;
4279 
4280 	status_t status = get_vnode(mountID, vnodeID, &vnode, canWait, false);
4281 	if (status != B_OK)
4282 		return status;
4283 
4284 	*_vnode = vnode;
4285 	return B_OK;
4286 }
4287 
4288 
4289 extern "C" status_t
4290 vfs_entry_ref_to_vnode(dev_t mountID, ino_t directoryID,
4291 	const char* name, struct vnode** _vnode)
4292 {
4293 	return entry_ref_to_vnode(mountID, directoryID, name, false, true, _vnode);
4294 }
4295 
4296 
4297 extern "C" void
4298 vfs_vnode_to_node_ref(struct vnode* vnode, dev_t* _mountID, ino_t* _vnodeID)
4299 {
4300 	*_mountID = vnode->device;
4301 	*_vnodeID = vnode->id;
4302 }
4303 
4304 
4305 /*!
4306 	Helper function abstracting the process of "converting" a given
4307 	vnode-pointer to a fs_vnode-pointer.
4308 	Currently only used in bindfs.
4309 */
4310 extern "C" fs_vnode*
4311 vfs_fsnode_for_vnode(struct vnode* vnode)
4312 {
4313 	return vnode;
4314 }
4315 
4316 
4317 /*!
4318 	Calls fs_open() on the given vnode and returns a new
4319 	file descriptor for it
4320 */
4321 int
4322 vfs_open_vnode(struct vnode* vnode, int openMode, bool kernel)
4323 {
4324 	return open_vnode(vnode, openMode, kernel);
4325 }
4326 
4327 
4328 /*!	Looks up a vnode with the given mount and vnode ID.
4329 	Must only be used with "in-use" vnodes as it doesn't grab a reference
4330 	to the node.
4331 	It's currently only be used by file_cache_create().
4332 	It's currently only used by file_cache_create().
4333 extern "C" status_t
4334 vfs_lookup_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode)
4335 {
4336 	rw_lock_read_lock(&sVnodeLock);
4337 	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
4338 	rw_lock_read_unlock(&sVnodeLock);
4339 
4340 	if (vnode == NULL)
4341 		return B_ERROR;
4342 
4343 	*_vnode = vnode;
4344 	return B_OK;
4345 }
4346 
4347 
4348 extern "C" status_t
4349 vfs_get_fs_node_from_path(fs_volume* volume, const char* path,
4350 	bool traverseLeafLink, bool kernel, void** _node)
4351 {
4352 	TRACE(("vfs_get_fs_node_from_path(volume = %p, path = \"%s\", kernel %d)\n",
4353 		volume, path, kernel));
4354 
4355 	KPath pathBuffer;
4356 	if (pathBuffer.InitCheck() != B_OK)
4357 		return B_NO_MEMORY;
4358 
4359 	fs_mount* mount;
4360 	status_t status = get_mount(volume->id, &mount);
4361 	if (status != B_OK)
4362 		return status;
4363 
4364 	char* buffer = pathBuffer.LockBuffer();
4365 	strlcpy(buffer, path, pathBuffer.BufferSize());
4366 
4367 	struct vnode* vnode = mount->root_vnode;
4368 
4369 	if (buffer[0] == '/')
4370 		status = path_to_vnode(buffer, traverseLeafLink, &vnode, NULL, kernel);
4371 	else {
4372 		inc_vnode_ref_count(vnode);
4373 			// vnode_path_to_vnode() releases a reference to the starting vnode
4374 		status = vnode_path_to_vnode(vnode, buffer, traverseLeafLink, 0,
4375 			kernel, &vnode, NULL);
4376 	}
4377 
4378 	put_mount(mount);
4379 
4380 	if (status != B_OK)
4381 		return status;
4382 
4383 	if (vnode->device != volume->id) {
4384 		// wrong mount ID - must not gain access on foreign file system nodes
4385 		put_vnode(vnode);
4386 		return B_BAD_VALUE;
4387 	}
4388 
4389 	// Use get_vnode() to resolve the cookie for the right layer.
4390 	status = get_vnode(volume, vnode->id, _node);
4391 	put_vnode(vnode);
4392 
4393 	return status;
4394 }
4395 
4396 
4397 status_t
4398 vfs_read_stat(int fd, const char* path, bool traverseLeafLink,
4399 	struct stat* stat, bool kernel)
4400 {
4401 	status_t status;
4402 
4403 	if (path != NULL) {
4404 		// path given: get the stat of the node referred to by (fd, path)
4405 		KPath pathBuffer(path);
4406 		if (pathBuffer.InitCheck() != B_OK)
4407 			return B_NO_MEMORY;
4408 
4409 		status = common_path_read_stat(fd, pathBuffer.LockBuffer(),
4410 			traverseLeafLink, stat, kernel);
4411 	} else {
4412 		// no path given: get the FD and use the FD operation
4413 		struct file_descriptor* descriptor
4414 			= get_fd(get_current_io_context(kernel), fd);
4415 		if (descriptor == NULL)
4416 			return B_FILE_ERROR;
4417 
4418 		if (descriptor->ops->fd_read_stat)
4419 			status = descriptor->ops->fd_read_stat(descriptor, stat);
4420 		else
4421 			status = B_UNSUPPORTED;
4422 
4423 		put_fd(descriptor);
4424 	}
4425 
4426 	return status;
4427 }
4428 
4429 
4430 /*!	Finds the full path to the file that contains the module \a moduleName,
4431 	puts it into \a pathBuffer, and returns B_OK for success.
4432 	If \a pathBuffer was too small, it returns \c B_BUFFER_OVERFLOW, or
4433 	\c B_ENTRY_NOT_FOUND if no file could be found.
4434 	\a pathBuffer is clobbered in any case and must not be relied on if this
4435 	function returns unsuccessfully.
4436 	\a basePath and \a pathBuffer must not point to the same space.
4437 */
4438 status_t
4439 vfs_get_module_path(const char* basePath, const char* moduleName,
4440 	char* pathBuffer, size_t bufferSize)
4441 {
4442 	struct vnode* dir;
4443 	struct vnode* file;
4444 	status_t status;
4445 	size_t length;
4446 	char* path;
4447 
4448 	if (bufferSize == 0
4449 		|| strlcpy(pathBuffer, basePath, bufferSize) >= bufferSize)
4450 		return B_BUFFER_OVERFLOW;
4451 
4452 	status = path_to_vnode(pathBuffer, true, &dir, NULL, true);
4453 	if (status != B_OK)
4454 		return status;
4455 
4456 	// the path buffer had been clobbered by the above call
4457 	length = strlcpy(pathBuffer, basePath, bufferSize);
4458 	if (pathBuffer[length - 1] != '/')
4459 		pathBuffer[length++] = '/';
4460 
4461 	path = pathBuffer + length;
4462 	bufferSize -= length;
4463 
4464 	while (moduleName) {
4465 		char* nextPath = strchr(moduleName, '/');
4466 		if (nextPath == NULL)
4467 			length = strlen(moduleName);
4468 		else {
4469 			length = nextPath - moduleName;
4470 			nextPath++;
4471 		}
4472 
4473 		if (length + 1 >= bufferSize) {
4474 			status = B_BUFFER_OVERFLOW;
4475 			goto err;
4476 		}
4477 
4478 		memcpy(path, moduleName, length);
4479 		path[length] = '\0';
4480 		moduleName = nextPath;
4481 
4482 		status = vnode_path_to_vnode(dir, path, true, 0, true, &file, NULL);
4483 		if (status != B_OK) {
4484 			// vnode_path_to_vnode() has already released the reference to dir
4485 			return status;
4486 		}
4487 
4488 		if (S_ISDIR(file->Type())) {
4489 			// go to the next directory
4490 			path[length] = '/';
4491 			path[length + 1] = '\0';
4492 			path += length + 1;
4493 			bufferSize -= length + 1;
4494 
4495 			dir = file;
4496 		} else if (S_ISREG(file->Type())) {
4497 			// it's a file so it should be what we've searched for
4498 			put_vnode(file);
4499 
4500 			return B_OK;
4501 		} else {
4502 			TRACE(("vfs_get_module_path(): something is strange here: "
4503 				"0x%08" B_PRIx32 "...\n", file->Type()));
4504 			status = B_ERROR;
4505 			dir = file;
4506 			goto err;
4507 		}
4508 	}
4509 
4510 	// if we got here, the moduleName just pointed to a directory, not to
4511 	// a real module - what should we do in this case?
4512 	status = B_ENTRY_NOT_FOUND;
4513 
4514 err:
4515 	put_vnode(dir);
4516 	return status;
4517 }
4518 
4519 
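/*	Usage sketch for vfs_get_module_path() above (base path and module name
	are made up for illustration):

		char path[B_PATH_NAME_LENGTH];
		status_t status = vfs_get_module_path("/boot/system/add-ons/kernel",
			"busses/scsi/my_bus/v1", path, sizeof(path));

	On success `path` names the regular file that contains the module.
*/

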
4520 /*!	\brief Normalizes a given path.
4521 
4522 	The path must refer to an existing or non-existing entry in an existing
4523 	directory, that is chopping off the leaf component the remaining path must
4524 	refer to an existing directory.
4525 
4526 	The returned path will be canonical in that it will be absolute, will
4527 	not contain any "." or ".." components or duplicate occurrences of
4528 	'/'s, and none of the directory components will be symbolic links.
4529 
4530 	Any two paths referring to the same entry will result in the same
4531 	normalized path (well, that is pretty much the definition of `normalized',
4532 	isn't it :-).
4533 
4534 	\param path The path to be normalized.
4535 	\param buffer The buffer into which the normalized path will be written.
4536 		   May be the same one as \a path.
4537 	\param bufferSize The size of \a buffer.
4538 	\param traverseLink If \c true, the function also resolves leaf symlinks.
4539 	\param kernel \c true, if the IO context of the kernel shall be used,
4540 		   otherwise that of the team this thread belongs to. Only relevant
4541 		   if the path is relative (to get the CWD).
4542 	\return \c B_OK if everything went fine, another error code otherwise.
4543 */
4544 status_t
4545 vfs_normalize_path(const char* path, char* buffer, size_t bufferSize,
4546 	bool traverseLink, bool kernel)
4547 {
4548 	if (!path || !buffer || bufferSize < 1)
4549 		return B_BAD_VALUE;
4550 
4551 	if (path != buffer) {
4552 		if (strlcpy(buffer, path, bufferSize) >= bufferSize)
4553 			return B_BUFFER_OVERFLOW;
4554 	}
4555 
4556 	return normalize_path(buffer, bufferSize, traverseLink, kernel);
4557 }
4558 
4559 
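/*	Usage sketch for vfs_normalize_path() above: since \a buffer may be the
	same as \a path, normalizing in place works:

		char path[B_PATH_NAME_LENGTH] = "/boot/./system//lib/../bin";
		status_t status = vfs_normalize_path(path, path, sizeof(path), true,
			true);
			// on success, path now reads "/boot/system/bin" (assuming none
			// of the components is a symlink)
*/

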
4560 /*!	\brief Gets the parent of the passed in node.
4561 
4562 	Gets the parent of the passed in node, and correctly resolves covered
4563 	nodes.
4564 */
4565 extern "C" status_t
4566 vfs_resolve_parent(struct vnode* parent, dev_t* device, ino_t* node)
4567 {
4568 	return resolve_covered_parent(parent, device, node,
4569 		get_current_io_context(true));
4570 }
4571 
4572 
4573 /*!	\brief Creates a special node in the file system.
4574 
4575 	The caller gets a reference to the newly created node (which is passed
4576 	back through \a _createdVnode) and is responsible for releasing it.
4577 
4578 	\param path The path where to create the entry for the node. Can be \c NULL,
4579 		in which case the node is created without an entry in the root FS -- it
4580 		will automatically be deleted when the last reference has been released.
4581 	\param subVnode The definition of the subnode. Can be \c NULL, in which case
4582 		the target file system will just create the node with its standard
4583 		operations. Depending on the type of the node a subnode might be created
4584 		automatically, though.
4585 	\param mode The type and permissions for the node to be created.
4586 	\param flags Flags to be passed to the creating FS.
4587 	\param kernel \c true, if called in the kernel context (relevant only if
4588 		\a path is not \c NULL and not absolute).
4589 	\param _superVnode Pointer to a pre-allocated structure to be filled by the
4590 		file system creating the node, with the private data pointer and
4591 		operations for the super node. Can be \c NULL.
4592 	\param _createdVnode Pointer to pre-allocated storage where to store the
4593 		pointer to the newly created node.
4594 	\return \c B_OK, if everything went fine, another error code otherwise.
4595 */
4596 status_t
4597 vfs_create_special_node(const char* path, fs_vnode* subVnode, mode_t mode,
4598 	uint32 flags, bool kernel, fs_vnode* _superVnode,
4599 	struct vnode** _createdVnode)
4600 {
4601 	struct vnode* dirNode;
4602 	char _leaf[B_FILE_NAME_LENGTH];
4603 	char* leaf = NULL;
4604 
4605 	if (path) {
4606 		// We've got a path. Get the dir vnode and the leaf name.
4607 		KPath tmpPathBuffer;
4608 		if (tmpPathBuffer.InitCheck() != B_OK)
4609 			return B_NO_MEMORY;
4610 
4611 		char* tmpPath = tmpPathBuffer.LockBuffer();
4612 		if (strlcpy(tmpPath, path, B_PATH_NAME_LENGTH) >= B_PATH_NAME_LENGTH)
4613 			return B_NAME_TOO_LONG;
4614 
4615 		// get the dir vnode and the leaf name
4616 		leaf = _leaf;
4617 		status_t error = path_to_dir_vnode(tmpPath, &dirNode, leaf, kernel);
4618 		if (error != B_OK)
4619 			return error;
4620 	} else {
4621 		// No path. Create the node in the root FS.
4622 		dirNode = sRoot;
4623 		inc_vnode_ref_count(dirNode);
4624 	}
4625 
4626 	VNodePutter _(dirNode);
4627 
4628 	// check support for creating special nodes
4629 	if (!HAS_FS_CALL(dirNode, create_special_node))
4630 		return B_UNSUPPORTED;
4631 
4632 	// create the node
4633 	fs_vnode superVnode;
4634 	ino_t nodeID;
4635 	status_t status = FS_CALL(dirNode, create_special_node, leaf, subVnode,
4636 		mode, flags, _superVnode != NULL ? _superVnode : &superVnode, &nodeID);
4637 	if (status != B_OK)
4638 		return status;
4639 
4640 	// lookup the node
4641 	rw_lock_read_lock(&sVnodeLock);
4642 	*_createdVnode = lookup_vnode(dirNode->mount->id, nodeID);
4643 	rw_lock_read_unlock(&sVnodeLock);
4644 
4645 	if (*_createdVnode == NULL) {
4646 		panic("vfs_create_special_node(): lookup of node failed");
4647 		return B_ERROR;
4648 	}
4649 
4650 	return B_OK;
4651 }
4652 
4653 
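/*	Usage sketch for vfs_create_special_node() above: creating an anonymous
	FIFO, i.e. one without an entry in any file system (it goes away with
	the last reference):

		struct vnode* fifoVnode;
		status_t status = vfs_create_special_node(NULL, NULL, S_IFIFO | 0666,
			0, true, NULL, &fifoVnode);
		if (status == B_OK) {
			// ... use the node ...
			vfs_put_vnode(fifoVnode);
		}
*/

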
4654 extern "C" void
4655 vfs_put_vnode(struct vnode* vnode)
4656 {
4657 	put_vnode(vnode);
4658 }
4659 
4660 
4661 extern "C" status_t
4662 vfs_get_cwd(dev_t* _mountID, ino_t* _vnodeID)
4663 {
4664 	// Get current working directory from io context
4665 	struct io_context* context = get_current_io_context(false);
4666 	status_t status = B_OK;
4667 
4668 	mutex_lock(&context->io_mutex);
4669 
4670 	if (context->cwd != NULL) {
4671 		*_mountID = context->cwd->device;
4672 		*_vnodeID = context->cwd->id;
4673 	} else
4674 		status = B_ERROR;
4675 
4676 	mutex_unlock(&context->io_mutex);
4677 	return status;
4678 }
4679 
4680 
4681 status_t
4682 vfs_unmount(dev_t mountID, uint32 flags)
4683 {
4684 	return fs_unmount(NULL, mountID, flags, true);
4685 }
4686 
4687 
4688 extern "C" status_t
4689 vfs_disconnect_vnode(dev_t mountID, ino_t vnodeID)
4690 {
4691 	struct vnode* vnode;
4692 
4693 	status_t status = get_vnode(mountID, vnodeID, &vnode, true, true);
4694 	if (status != B_OK)
4695 		return status;
4696 
4697 	disconnect_mount_or_vnode_fds(vnode->mount, vnode);
4698 	put_vnode(vnode);
4699 	return B_OK;
4700 }
4701 
4702 
4703 extern "C" void
4704 vfs_free_unused_vnodes(int32 level)
4705 {
4706 	vnode_low_resource_handler(NULL,
4707 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
4708 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
4709 		level);
4710 }
4711 
4712 
4713 extern "C" bool
4714 vfs_can_page(struct vnode* vnode, void* cookie)
4715 {
4716 	FUNCTION(("vfs_canpage: vnode %p\n", vnode));
4717 
4718 	if (HAS_FS_CALL(vnode, can_page))
4719 		return FS_CALL(vnode, can_page, cookie);
4720 	return false;
4721 }
4722 
4723 
4724 extern "C" status_t
4725 vfs_read_pages(struct vnode* vnode, void* cookie, off_t pos,
4726 	const generic_io_vec* vecs, size_t count, uint32 flags,
4727 	generic_size_t* _numBytes)
4728 {
4729 	FUNCTION(("vfs_read_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4730 		vecs, pos));
4731 
4732 #if VFS_PAGES_IO_TRACING
4733 	generic_size_t bytesRequested = *_numBytes;
4734 #endif
4735 
4736 	IORequest request;
4737 	status_t status = request.Init(pos, vecs, count, *_numBytes, false, flags);
4738 	if (status == B_OK) {
4739 		status = vfs_vnode_io(vnode, cookie, &request);
4740 		if (status == B_OK)
4741 			status = request.Wait();
4742 		*_numBytes = request.TransferredBytes();
4743 	}
4744 
4745 	TPIO(ReadPages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4746 		status, *_numBytes));
4747 
4748 	return status;
4749 }
4750 
4751 
4752 extern "C" status_t
4753 vfs_write_pages(struct vnode* vnode, void* cookie, off_t pos,
4754 	const generic_io_vec* vecs, size_t count, uint32 flags,
4755 	generic_size_t* _numBytes)
4756 {
4757 	FUNCTION(("vfs_write_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4758 		vecs, pos));
4759 
4760 #if VFS_PAGES_IO_TRACING
4761 	generic_size_t bytesRequested = *_numBytes;
4762 #endif
4763 
4764 	IORequest request;
4765 	status_t status = request.Init(pos, vecs, count, *_numBytes, true, flags);
4766 	if (status == B_OK) {
4767 		status = vfs_vnode_io(vnode, cookie, &request);
4768 		if (status == B_OK)
4769 			status = request.Wait();
4770 		*_numBytes = request.TransferredBytes();
4771 	}
4772 
4773 	TPIO(WritePages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4774 		status, *_numBytes));
4775 
4776 	return status;
4777 }
4778 
4779 
4780 /*!	Gets the vnode's VMCache object. If it didn't have one, it will be
4781 	created if \a allocate is \c true.
4782 	In case it's successful, it will also grab a reference to the cache
4783 	it returns.
4784 */
4785 extern "C" status_t
4786 vfs_get_vnode_cache(struct vnode* vnode, VMCache** _cache, bool allocate)
4787 {
4788 	if (vnode->cache != NULL) {
4789 		vnode->cache->AcquireRef();
4790 		*_cache = vnode->cache;
4791 		return B_OK;
4792 	}
4793 
4794 	rw_lock_read_lock(&sVnodeLock);
4795 	vnode->Lock();
4796 
4797 	status_t status = B_OK;
4798 
4799 	// The cache could have been created in the meantime
4800 	if (vnode->cache == NULL) {
4801 		if (allocate) {
4802 			// TODO: actually the vnode needs to be busy already here, or
4803 			//	else this won't work...
4804 			bool wasBusy = vnode->IsBusy();
4805 			vnode->SetBusy(true);
4806 
4807 			vnode->Unlock();
4808 			rw_lock_read_unlock(&sVnodeLock);
4809 
4810 			status = vm_create_vnode_cache(vnode, &vnode->cache);
4811 
4812 			rw_lock_read_lock(&sVnodeLock);
4813 			vnode->Lock();
4814 			vnode->SetBusy(wasBusy);
4815 		} else
4816 			status = B_BAD_VALUE;
4817 	}
4818 
4819 	vnode->Unlock();
4820 	rw_lock_read_unlock(&sVnodeLock);
4821 
4822 	if (status == B_OK) {
4823 		vnode->cache->AcquireRef();
4824 		*_cache = vnode->cache;
4825 	}
4826 
4827 	return status;
4828 }
4829 
4830 
4831 /*!	Sets the vnode's VMCache object, for subsystems that want to manage
4832 	their own.
4833 	In case it's successful, it will also grab a reference to the given
4834 	cache.
4835 */
4836 extern "C" status_t
4837 vfs_set_vnode_cache(struct vnode* vnode, VMCache* _cache)
4838 {
4839 	rw_lock_read_lock(&sVnodeLock);
4840 	vnode->Lock();
4841 
4842 	status_t status = B_OK;
4843 	if (vnode->cache != NULL) {
4844 		status = B_NOT_ALLOWED;
4845 	} else {
4846 		vnode->cache = _cache;
4847 		_cache->AcquireRef();
4848 	}
4849 
4850 	vnode->Unlock();
4851 	rw_lock_read_unlock(&sVnodeLock);
4852 	return status;
4853 }
4854 
4855 
4856 status_t
4857 vfs_get_file_map(struct vnode* vnode, off_t offset, size_t size,
4858 	file_io_vec* vecs, size_t* _count)
4859 {
4860 	FUNCTION(("vfs_get_file_map: vnode %p, vecs %p, offset %" B_PRIdOFF
4861 		", size = %" B_PRIuSIZE "\n", vnode, vecs, offset, size));
4862 
4863 	return FS_CALL(vnode, get_file_map, offset, size, vecs, _count);
4864 }
4865 
4866 
4867 status_t
4868 vfs_stat_vnode(struct vnode* vnode, struct stat* stat)
4869 {
4870 	status_t status = FS_CALL(vnode, read_stat, stat);
4871 
4872 	// fill in the st_dev and st_ino fields
4873 	if (status == B_OK) {
4874 		stat->st_dev = vnode->device;
4875 		stat->st_ino = vnode->id;
4876 		// the rdev field must stay unset for non-special files
4877 		if (!S_ISBLK(stat->st_mode) && !S_ISCHR(stat->st_mode))
4878 			stat->st_rdev = -1;
4879 	}
4880 
4881 	return status;
4882 }
4883 
4884 
4885 status_t
4886 vfs_stat_node_ref(dev_t device, ino_t inode, struct stat* stat)
4887 {
4888 	struct vnode* vnode;
4889 	status_t status = get_vnode(device, inode, &vnode, true, false);
4890 	if (status != B_OK)
4891 		return status;
4892 
4893 	status = vfs_stat_vnode(vnode, stat);
4894 
4895 	put_vnode(vnode);
4896 	return status;
4897 }
4898 
4899 
4900 status_t
4901 vfs_get_vnode_name(struct vnode* vnode, char* name, size_t nameSize)
4902 {
4903 	return get_vnode_name(vnode, NULL, name, nameSize, true);
4904 }
4905 
4906 
4907 status_t
4908 vfs_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
4909 	bool kernel, char* path, size_t pathLength)
4910 {
4911 	struct vnode* vnode;
4912 	status_t status;
4913 
4914 	// filter invalid leaf names
4915 	if (leaf != NULL && (leaf[0] == '\0' || strchr(leaf, '/')))
4916 		return B_BAD_VALUE;
4917 
4918 	// get the vnode matching the dir's node_ref
4919 	if (leaf && (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0)) {
4920 		// special cases "." and "..": we can directly get the vnode of the
4921 		// referenced directory
4922 		status = entry_ref_to_vnode(device, inode, leaf, false, kernel, &vnode);
4923 		leaf = NULL;
4924 	} else
4925 		status = get_vnode(device, inode, &vnode, true, false);
4926 	if (status != B_OK)
4927 		return status;
4928 
4929 	// get the directory path
4930 	status = dir_vnode_to_path(vnode, path, pathLength, kernel);
4931 	put_vnode(vnode);
4932 		// we don't need the vnode anymore
4933 	if (status != B_OK)
4934 		return status;
4935 
4936 	// append the leaf name
4937 	if (leaf) {
4938 		// insert a directory separator if this is not the file system root
4939 		if ((strcmp(path, "/") && strlcat(path, "/", pathLength)
4940 				>= pathLength)
4941 			|| strlcat(path, leaf, pathLength) >= pathLength) {
4942 			return B_NAME_TOO_LONG;
4943 		}
4944 	}
4945 
4946 	return B_OK;
4947 }
4948 
4949 
4950 /*!	If the given descriptor locked its vnode, that lock will be released. */
4951 void
4952 vfs_unlock_vnode_if_locked(struct file_descriptor* descriptor)
4953 {
4954 	struct vnode* vnode = fd_vnode(descriptor);
4955 
4956 	if (vnode != NULL && vnode->mandatory_locked_by == descriptor)
4957 		vnode->mandatory_locked_by = NULL;
4958 }
4959 
4960 
4961 /*!	Releases any POSIX locks on the file descriptor. */
4962 status_t
4963 vfs_release_posix_lock(io_context* context, struct file_descriptor* descriptor)
4964 {
4965 	struct vnode* vnode = descriptor->u.vnode;
4966 	if (vnode == NULL)
4967 		return B_OK;
4968 
4969 	if (HAS_FS_CALL(vnode, release_lock))
4970 		return FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
4971 
4972 	return release_advisory_lock(vnode, context, NULL, NULL);
4973 }
4974 
4975 
4976 /*!	Closes all file descriptors of the specified I/O context that
4977 	have the O_CLOEXEC flag set.
4978 */
4979 void
4980 vfs_exec_io_context(io_context* context)
4981 {
4982 	uint32 i;
4983 
4984 	for (i = 0; i < context->table_size; i++) {
4985 		mutex_lock(&context->io_mutex);
4986 
4987 		struct file_descriptor* descriptor = context->fds[i];
4988 		bool remove = false;
4989 
4990 		if (descriptor != NULL && fd_close_on_exec(context, i)) {
4991 			context->fds[i] = NULL;
4992 			context->num_used_fds--;
4993 
4994 			remove = true;
4995 		}
4996 
4997 		mutex_unlock(&context->io_mutex);
4998 
4999 		if (remove) {
5000 			close_fd(context, descriptor);
5001 			put_fd(descriptor);
5002 		}
5003 	}
5004 }
5005 
5006 
5007 /*! Sets up a new io_context structure, and inherits the properties
5008 	of the parent io_context if it is given.
5009 */
5010 io_context*
5011 vfs_new_io_context(io_context* parentContext, bool purgeCloseOnExec)
5012 {
5013 	io_context* context = (io_context*)malloc(sizeof(io_context));
5014 	if (context == NULL)
5015 		return NULL;
5016 
5017 	TIOC(NewIOContext(context, parentContext));
5018 
5019 	memset(context, 0, sizeof(io_context));
5020 	context->ref_count = 1;
5021 
5022 	MutexLocker parentLocker;
5023 
5024 	size_t tableSize;
5025 	if (parentContext != NULL) {
5026 		parentLocker.SetTo(parentContext->io_mutex, false);
5027 		tableSize = parentContext->table_size;
5028 	} else
5029 		tableSize = DEFAULT_FD_TABLE_SIZE;
5030 
5031 	// allocate space for FDs and their close-on-exec flag
5032 	context->fds = (file_descriptor**)malloc(
5033 		sizeof(struct file_descriptor*) * tableSize
5034 		+ sizeof(struct select_sync*) * tableSize
5035 		+ (tableSize + 7) / 8);
5036 	if (context->fds == NULL) {
5037 		free(context);
5038 		return NULL;
5039 	}
5040 
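	// Layout of the single allocation, matching the pointer arithmetic
	// below: tableSize file_descriptor pointers, then tableSize select_info
	// pointers, then (tableSize + 7) / 8 bytes of close-on-exec bits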
5041 	context->select_infos = (select_info**)(context->fds + tableSize);
5042 	context->fds_close_on_exec = (uint8*)(context->select_infos + tableSize);
5043 
5044 	memset(context->fds, 0, sizeof(struct file_descriptor*) * tableSize
5045 		+ sizeof(struct select_sync*) * tableSize
5046 		+ (tableSize + 7) / 8);
5047 
5048 	mutex_init(&context->io_mutex, "I/O context");
5049 
5050 	// Copy all parent file descriptors
5051 
5052 	if (parentContext != NULL) {
5053 		size_t i;
5054 
5055 		mutex_lock(&sIOContextRootLock);
5056 		context->root = parentContext->root;
5057 		if (context->root)
5058 			inc_vnode_ref_count(context->root);
5059 		mutex_unlock(&sIOContextRootLock);
5060 
5061 		context->cwd = parentContext->cwd;
5062 		if (context->cwd)
5063 			inc_vnode_ref_count(context->cwd);
5064 
5065 		if (parentContext->inherit_fds) {
5066 			for (i = 0; i < tableSize; i++) {
5067 				struct file_descriptor* descriptor = parentContext->fds[i];
5068 
5069 				if (descriptor != NULL
5070 					&& (descriptor->open_mode & O_DISCONNECTED) == 0) {
5071 					bool closeOnExec = fd_close_on_exec(parentContext, i);
5072 					if (closeOnExec && purgeCloseOnExec)
5073 						continue;
5074 
5075 					TFD(InheritFD(context, i, descriptor, parentContext));
5076 
5077 					context->fds[i] = descriptor;
5078 					context->num_used_fds++;
5079 					atomic_add(&descriptor->ref_count, 1);
5080 					atomic_add(&descriptor->open_count, 1);
5081 
5082 					if (closeOnExec)
5083 						fd_set_close_on_exec(context, i, true);
5084 				}
5085 			}
5086 		}
5087 
5088 		parentLocker.Unlock();
5089 	} else {
5090 		context->root = sRoot;
5091 		context->cwd = sRoot;
5092 
5093 		if (context->root)
5094 			inc_vnode_ref_count(context->root);
5095 
5096 		if (context->cwd)
5097 			inc_vnode_ref_count(context->cwd);
5098 	}
5099 
5100 	context->table_size = tableSize;
5101 	context->inherit_fds = parentContext != NULL;
5102 
5103 	list_init(&context->node_monitors);
5104 	context->max_monitors = DEFAULT_NODE_MONITORS;
5105 
5106 	return context;
5107 }
5108 
5109 
5110 void
5111 vfs_get_io_context(io_context* context)
5112 {
5113 	atomic_add(&context->ref_count, 1);
5114 }
5115 
5116 
5117 void
5118 vfs_put_io_context(io_context* context)
5119 {
5120 	if (atomic_add(&context->ref_count, -1) == 1)
5121 		free_io_context(context);
5122 }
5123 
5124 
5125 status_t
5126 vfs_resize_fd_table(struct io_context* context, uint32 newSize)
5127 {
5128 	if (newSize == 0 || newSize > MAX_FD_TABLE_SIZE)
5129 		return B_BAD_VALUE;
5130 
5131 	TIOC(ResizeIOContext(context, newSize));
5132 
5133 	MutexLocker _(context->io_mutex);
5134 
5135 	uint32 oldSize = context->table_size;
5136 	int oldCloseOnExitBitmapSize = (oldSize + 7) / 8;
5137 	int newCloseOnExitBitmapSize = (newSize + 7) / 8;
5138 
5139 	// If the tables shrink, make sure none of the fds being dropped are in use.
5140 	if (newSize < oldSize) {
5141 		for (uint32 i = oldSize; i-- > newSize;) {
5142 			if (context->fds[i])
5143 				return B_BUSY;
5144 		}
5145 	}
5146 
5147 	// store pointers to the old tables
5148 	file_descriptor** oldFDs = context->fds;
5149 	select_info** oldSelectInfos = context->select_infos;
5150 	uint8* oldCloseOnExecTable = context->fds_close_on_exec;
5151 
5152 	// allocate new tables
5153 	file_descriptor** newFDs = (file_descriptor**)malloc(
5154 		sizeof(struct file_descriptor*) * newSize
5155 		+ sizeof(struct select_sync*) * newSize
5156 		+ newCloseOnExitBitmapSize);
5157 	if (newFDs == NULL)
5158 		return B_NO_MEMORY;
5159 
5160 	context->fds = newFDs;
5161 	context->select_infos = (select_info**)(context->fds + newSize);
5162 	context->fds_close_on_exec = (uint8*)(context->select_infos + newSize);
5163 	context->table_size = newSize;
5164 
5165 	// copy entries from old tables
5166 	uint32 toCopy = min_c(oldSize, newSize);
5167 
5168 	memcpy(context->fds, oldFDs, sizeof(void*) * toCopy);
5169 	memcpy(context->select_infos, oldSelectInfos, sizeof(void*) * toCopy);
5170 	memcpy(context->fds_close_on_exec, oldCloseOnExecTable,
5171 		min_c(oldCloseOnExitBitmapSize, newCloseOnExitBitmapSize));
5172 
5173 	// clear additional entries, if the tables grow
5174 	if (newSize > oldSize) {
5175 		memset(context->fds + oldSize, 0, sizeof(void*) * (newSize - oldSize));
5176 		memset(context->select_infos + oldSize, 0,
5177 			sizeof(void*) * (newSize - oldSize));
5178 		memset(context->fds_close_on_exec + oldCloseOnExitBitmapSize, 0,
5179 			newCloseOnExitBitmapSize - oldCloseOnExitBitmapSize);
5180 	}
5181 
5182 	free(oldFDs);
5183 
5184 	return B_OK;
5185 }
5186 
5187 
5188 /*!	\brief Resolves a vnode to the vnode it is covered by, if any.
5189 
5190 	Given an arbitrary vnode (identified by mount and node ID), the function
5191 	checks whether the vnode is covered by another vnode. If it is, the
5192 	function returns the mount and node ID of the covering vnode. Otherwise
5193 	it simply returns the supplied mount and node ID.
5194 
5195 	In case of error (e.g. the supplied node could not be found) the variables
5196 	for storing the resolved mount and node ID remain untouched and an error
5197 	code is returned.
5198 
5199 	\param mountID The mount ID of the vnode in question.
5200 	\param nodeID The node ID of the vnode in question.
5201 	\param resolvedMountID Pointer to storage for the resolved mount ID.
5202 	\param resolvedNodeID Pointer to storage for the resolved node ID.
5203 	\return
5204 	- \c B_OK, if everything went fine,
5205 	- another error code, if something went wrong.
5206 */
5207 status_t
5208 vfs_resolve_vnode_to_covering_vnode(dev_t mountID, ino_t nodeID,
5209 	dev_t* resolvedMountID, ino_t* resolvedNodeID)
5210 {
5211 	// get the node
5212 	struct vnode* node;
5213 	status_t error = get_vnode(mountID, nodeID, &node, true, false);
5214 	if (error != B_OK)
5215 		return error;
5216 
5217 	// resolve the node
5218 	if (Vnode* coveringNode = get_covering_vnode(node)) {
5219 		put_vnode(node);
5220 		node = coveringNode;
5221 	}
5222 
5223 	// set the return values
5224 	*resolvedMountID = node->device;
5225 	*resolvedNodeID = node->id;
5226 
5227 	put_vnode(node);
5228 
5229 	return B_OK;
5230 }
5231 
5232 
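/*	Usage sketch: translating a node ref that may denote a covered mount
	point directory to the node actually visible at that location:

		dev_t resolvedMountID;
		ino_t resolvedNodeID;
		status_t status = vfs_resolve_vnode_to_covering_vnode(mountID, nodeID,
			&resolvedMountID, &resolvedNodeID);
			// on success the resolved pair names the covering vnode (e.g.
			// the root of the volume mounted there), or the original one if
			// nothing covers it
*/

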
5233 status_t
5234 vfs_get_mount_point(dev_t mountID, dev_t* _mountPointMountID,
5235 	ino_t* _mountPointNodeID)
5236 {
5237 	ReadLocker nodeLocker(sVnodeLock);
5238 	MutexLocker mountLocker(sMountMutex);
5239 
5240 	struct fs_mount* mount = find_mount(mountID);
5241 	if (mount == NULL)
5242 		return B_BAD_VALUE;
5243 
5244 	Vnode* mountPoint = mount->covers_vnode;
5245 
5246 	*_mountPointMountID = mountPoint->device;
5247 	*_mountPointNodeID = mountPoint->id;
5248 
5249 	return B_OK;
5250 }
5251 
5252 
5253 status_t
5254 vfs_bind_mount_directory(dev_t mountID, ino_t nodeID, dev_t coveredMountID,
5255 	ino_t coveredNodeID)
5256 {
5257 	// get the vnodes
5258 	Vnode* vnode;
5259 	status_t error = get_vnode(mountID, nodeID, &vnode, true, false);
5260 	if (error != B_OK)
5261 		return B_BAD_VALUE;
5262 	VNodePutter vnodePutter(vnode);
5263 
5264 	Vnode* coveredVnode;
5265 	error = get_vnode(coveredMountID, coveredNodeID, &coveredVnode, true,
5266 		false);
5267 	if (error != B_OK)
5268 		return B_BAD_VALUE;
5269 	VNodePutter coveredVnodePutter(coveredVnode);
5270 
5271 	// establish the covered/covering links
5272 	WriteLocker locker(sVnodeLock);
5273 
5274 	if (vnode->covers != NULL || coveredVnode->covered_by != NULL
5275 		|| vnode->mount->unmounting || coveredVnode->mount->unmounting) {
5276 		return B_BUSY;
5277 	}
5278 
5279 	vnode->covers = coveredVnode;
5280 	vnode->SetCovering(true);
5281 
5282 	coveredVnode->covered_by = vnode;
5283 	coveredVnode->SetCovered(true);
5284 
5285 	// the vnodes now reference each other
5286 	inc_vnode_ref_count(vnode);
5287 	inc_vnode_ref_count(coveredVnode);
5288 
5289 	return B_OK;
5290 }
5291 
5292 
5293 int
5294 vfs_getrlimit(int resource, struct rlimit* rlp)
5295 {
5296 	if (!rlp)
5297 		return B_BAD_ADDRESS;
5298 
5299 	switch (resource) {
5300 		case RLIMIT_NOFILE:
5301 		{
5302 			struct io_context* context = get_current_io_context(false);
5303 			MutexLocker _(context->io_mutex);
5304 
5305 			rlp->rlim_cur = context->table_size;
5306 			rlp->rlim_max = MAX_FD_TABLE_SIZE;
5307 			return 0;
5308 		}
5309 
5310 		case RLIMIT_NOVMON:
5311 		{
5312 			struct io_context* context = get_current_io_context(false);
5313 			MutexLocker _(context->io_mutex);
5314 
5315 			rlp->rlim_cur = context->max_monitors;
5316 			rlp->rlim_max = MAX_NODE_MONITORS;
5317 			return 0;
5318 		}
5319 
5320 		default:
5321 			return B_BAD_VALUE;
5322 	}
5323 }
5324 
5325 
5326 int
5327 vfs_setrlimit(int resource, const struct rlimit* rlp)
5328 {
5329 	if (!rlp)
5330 		return B_BAD_ADDRESS;
5331 
5332 	switch (resource) {
5333 		case RLIMIT_NOFILE:
5334 			/* TODO: check getuid() */
5335 			if (rlp->rlim_max != RLIM_SAVED_MAX
5336 				&& rlp->rlim_max != MAX_FD_TABLE_SIZE)
5337 				return B_NOT_ALLOWED;
5338 
5339 			return vfs_resize_fd_table(get_current_io_context(false),
5340 				rlp->rlim_cur);
5341 
5342 		case RLIMIT_NOVMON:
5343 			/* TODO: check getuid() */
5344 			if (rlp->rlim_max != RLIM_SAVED_MAX
5345 				&& rlp->rlim_max != MAX_NODE_MONITORS)
5346 				return B_NOT_ALLOWED;
5347 
5348 			return resize_monitor_table(get_current_io_context(false),
5349 				rlp->rlim_cur);
5350 
5351 		default:
5352 			return B_BAD_VALUE;
5353 	}
5354 }
5355 
5356 
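/*	Usage sketch: these limits surface through the POSIX rlimit API; e.g.
	growing the FD table from userland (the value 1024 is just illustrative):

		struct rlimit rl;
		rl.rlim_cur = 1024;
		rl.rlim_max = RLIM_SAVED_MAX;
		setrlimit(RLIMIT_NOFILE, &rl);
			// reaches vfs_setrlimit(), which resizes the FD table via
			// vfs_resize_fd_table()
*/

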
5357 status_t
5358 vfs_init(kernel_args* args)
5359 {
5360 	vnode::StaticInit();
5361 
5362 	sVnodeTable = new(std::nothrow) VnodeTable();
5363 	if (sVnodeTable == NULL || sVnodeTable->Init(VNODE_HASH_TABLE_SIZE) != B_OK)
5364 		panic("vfs_init: error creating vnode hash table\n");
5365 
5366 	struct vnode dummy_vnode;
5367 	list_init_etc(&sUnusedVnodeList, offset_of_member(dummy_vnode, unused_link));
5368 
5369 	struct fs_mount dummyMount;
5370 	sMountsTable = new(std::nothrow) MountTable();
5371 	if (sMountsTable == NULL
5372 			|| sMountsTable->Init(MOUNTS_HASH_TABLE_SIZE) != B_OK)
5373 		panic("vfs_init: error creating mounts hash table\n");
5374 
5375 	sPathNameCache = create_object_cache("vfs path names",
5376 		B_PATH_NAME_LENGTH + 1, 8, NULL, NULL, NULL);
5377 	if (sPathNameCache == NULL)
5378 		panic("vfs_init: error creating path name object_cache\n");
5379 
5380 	sFileDescriptorCache = create_object_cache("vfs fds",
5381 		sizeof(file_descriptor), 8, NULL, NULL, NULL);
5382 	if (sFileDescriptorCache == NULL)
5383 		panic("vfs_init: error creating file descriptor object_cache\n");
5384 
5385 	node_monitor_init();
5386 
5387 	sRoot = NULL;
5388 
5389 	recursive_lock_init(&sMountOpLock, "vfs_mount_op_lock");
5390 
5391 	if (block_cache_init() != B_OK)
5392 		return B_ERROR;
5393 
5394 #ifdef ADD_DEBUGGER_COMMANDS
5395 	// add some debugger commands
5396 	add_debugger_command_etc("vnode", &dump_vnode,
5397 		"Print info about the specified vnode",
5398 		"[ \"-p\" ] ( <vnode> | <devID> <nodeID> )\n"
5399 		"Prints information about the vnode specified by address <vnode> or\n"
5400 		"<devID>, <vnodeID> pair. If \"-p\" is given, a path of the vnode is\n"
5401 		"constructed and printed. It might not be possible to construct a\n"
5402 		"complete path, though.\n",
5403 		0);
5404 	add_debugger_command("vnodes", &dump_vnodes,
5405 		"list all vnodes (from the specified device)");
5406 	add_debugger_command("vnode_caches", &dump_vnode_caches,
5407 		"list all vnode caches");
5408 	add_debugger_command("mount", &dump_mount,
5409 		"info about the specified fs_mount");
5410 	add_debugger_command("mounts", &dump_mounts, "list all fs_mounts");
5411 	add_debugger_command("io_context", &dump_io_context,
5412 		"info about the I/O context");
5413 	add_debugger_command("vnode_usage", &dump_vnode_usage,
5414 		"info about vnode usage");
5415 #endif
5416 
5417 	register_low_resource_handler(&vnode_low_resource_handler, NULL,
5418 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
5419 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
5420 		0);
5421 
5422 	fifo_init();
5423 	file_map_init();
5424 
5425 	return file_cache_init();
5426 }
5427 
5428 
5429 //	#pragma mark - fd_ops implementations
5430 
5431 
5432 /*!
5433 	Calls fs_open() on the given vnode and returns a new
5434 	file descriptor for it
5435 */
5436 static int
5437 open_vnode(struct vnode* vnode, int openMode, bool kernel)
5438 {
5439 	void* cookie;
5440 	status_t status = FS_CALL(vnode, open, openMode, &cookie);
5441 	if (status != B_OK)
5442 		return status;
5443 
5444 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5445 	if (fd < 0) {
5446 		FS_CALL(vnode, close, cookie);
5447 		FS_CALL(vnode, free_cookie, cookie);
5448 	}
5449 	return fd;
5450 }
5451 
5452 
5453 /*!
5454 	Creates the entry \a name in \a directory, or opens the existing node
5455 	(unless O_EXCL is given), and returns a new file descriptor for it
5456 */
5457 static int
5458 create_vnode(struct vnode* directory, const char* name, int openMode,
5459 	int perms, bool kernel)
5460 {
5461 	bool traverse = ((openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0);
5462 	status_t status = B_ERROR;
5463 	struct vnode* vnode;
5464 	void* cookie;
5465 	ino_t newID;
5466 
5467 	// This is somewhat tricky: If the entry already exists, the FS responsible
5468 	// for the directory might not necessarily also be the one responsible for
5469 	// the node the entry refers to (e.g. in case of mount points or FIFOs). So
5470 	// we can actually never call the create() hook without O_EXCL. Instead we
5471 	// try to look the entry up first. If it already exists, we just open the
5472 	// node (unless O_EXCL), otherwise we call create() with O_EXCL. This
5473 	// introduces a race condition, since someone else might have created the
5474 	// entry in the meantime. We hope the respective FS returns the correct
5475 	// error code, in which case we retry (up to 3 times).
5476 
5477 	for (int i = 0; i < 3 && status != B_OK; i++) {
5478 		// look the node up
5479 		status = lookup_dir_entry(directory, name, &vnode);
5480 		if (status == B_OK) {
5481 			VNodePutter putter(vnode);
5482 
5483 			if ((openMode & O_EXCL) != 0)
5484 				return B_FILE_EXISTS;
5485 
5486 			// If the node is a symlink, we have to follow it, unless
5487 			// O_NOTRAVERSE is set.
5488 			if (S_ISLNK(vnode->Type()) && traverse) {
5489 				putter.Put();
5490 				char clonedName[B_FILE_NAME_LENGTH + 1];
5491 				if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH)
5492 						>= B_FILE_NAME_LENGTH) {
5493 					return B_NAME_TOO_LONG;
5494 				}
5495 
5496 				inc_vnode_ref_count(directory);
5497 				status = vnode_path_to_vnode(directory, clonedName, true, 0,
5498 					kernel, &vnode, NULL);
5499 				if (status != B_OK)
5500 					return status;
5501 
5502 				putter.SetTo(vnode);
5503 			}
5504 
5505 			if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5506 				return B_LINK_LIMIT;
5507 
5508 			int fd = open_vnode(vnode, openMode & ~O_CREAT, kernel);
5509 			// on success keep the vnode reference for the FD
5510 			if (fd >= 0)
5511 				putter.Detach();
5512 
5513 			return fd;
5514 		}
5515 
5516 		// it doesn't exist yet -- try to create it
5517 
5518 		if (!HAS_FS_CALL(directory, create))
5519 			return B_READ_ONLY_DEVICE;
5520 
5521 		status = FS_CALL(directory, create, name, openMode | O_EXCL, perms,
5522 			&cookie, &newID);
5523 		if (status != B_OK
5524 			&& ((openMode & O_EXCL) != 0 || status != B_FILE_EXISTS)) {
5525 			return status;
5526 		}
5527 	}
5528 
5529 	if (status != B_OK)
5530 		return status;
5531 
5532 	// the node has been created successfully
5533 
5534 	rw_lock_read_lock(&sVnodeLock);
5535 	vnode = lookup_vnode(directory->device, newID);
5536 	rw_lock_read_unlock(&sVnodeLock);
5537 
5538 	if (vnode == NULL) {
5539 		panic("vfs: fs_create() returned success but there is no vnode, "
5540 			"mount ID %" B_PRIdDEV "!\n", directory->device);
5541 		return B_BAD_VALUE;
5542 	}
5543 
5544 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5545 	if (fd >= 0)
5546 		return fd;
5547 
5548 	status = fd;
5549 
5550 	// something went wrong, clean up
5551 
5552 	FS_CALL(vnode, close, cookie);
5553 	FS_CALL(vnode, free_cookie, cookie);
5554 	put_vnode(vnode);
5555 
5556 	FS_CALL(directory, unlink, name);
5557 
5558 	return status;
5559 }
5560 
5561 
5562 /*! Calls fs open_dir() on the given vnode and returns a new
5563 	file descriptor for it
5564 */
5565 static int
5566 open_dir_vnode(struct vnode* vnode, bool kernel)
5567 {
5568 	void* cookie;
5569 	status_t status = FS_CALL(vnode, open_dir, &cookie);
5570 	if (status != B_OK)
5571 		return status;
5572 
5573 	// directory is opened, create a fd
5574 	status = get_new_fd(FDTYPE_DIR, NULL, vnode, cookie, O_CLOEXEC, kernel);
5575 	if (status >= 0)
5576 		return status;
5577 
5578 	FS_CALL(vnode, close_dir, cookie);
5579 	FS_CALL(vnode, free_dir_cookie, cookie);
5580 
5581 	return status;
5582 }
5583 
5584 
5585 /*! Calls fs open_attr_dir() on the given vnode and returns a new
5586 	file descriptor for it.
5587 	Used by attr_dir_open(), and attr_dir_open_fd().
5588 */
5589 static int
5590 open_attr_dir_vnode(struct vnode* vnode, bool kernel)
5591 {
5592 	if (!HAS_FS_CALL(vnode, open_attr_dir))
5593 		return B_UNSUPPORTED;
5594 
5595 	void* cookie;
5596 	status_t status = FS_CALL(vnode, open_attr_dir, &cookie);
5597 	if (status != B_OK)
5598 		return status;
5599 
5600 	// directory is opened, create a fd
5601 	status = get_new_fd(FDTYPE_ATTR_DIR, NULL, vnode, cookie, O_CLOEXEC,
5602 		kernel);
5603 	if (status >= 0)
5604 		return status;
5605 
5606 	FS_CALL(vnode, close_attr_dir, cookie);
5607 	FS_CALL(vnode, free_attr_dir_cookie, cookie);
5608 
5609 	return status;
5610 }
5611 
5612 
5613 static int
5614 file_create_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5615 	int openMode, int perms, bool kernel)
5616 {
5617 	FUNCTION(("file_create_entry_ref: name = '%s', omode %x, perms %d, "
5618 		"kernel %d\n", name, openMode, perms, kernel));
5619 
5620 	// get directory to put the new file in
5621 	struct vnode* directory;
5622 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
5623 	if (status != B_OK)
5624 		return status;
5625 
5626 	status = create_vnode(directory, name, openMode, perms, kernel);
5627 	put_vnode(directory);
5628 
5629 	return status;
5630 }
5631 
5632 
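/*!	Creates and opens the file at the location given by \a fd + \a path.
	Returns a new file descriptor on success, or an error code.
*/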
5633 static int
5634 file_create(int fd, char* path, int openMode, int perms, bool kernel)
5635 {
5636 	FUNCTION(("file_create: path '%s', omode %x, perms %d, kernel %d\n", path,
5637 		openMode, perms, kernel));
5638 
5639 	// get directory to put the new file in
5640 	char name[B_FILE_NAME_LENGTH];
5641 	struct vnode* directory;
5642 	status_t status = fd_and_path_to_dir_vnode(fd, path, &directory, name,
5643 		kernel);
5644 	if (status < 0)
5645 		return status;
5646 
5647 	status = create_vnode(directory, name, openMode, perms, kernel);
5648 
5649 	put_vnode(directory);
5650 	return status;
5651 }
5652 
5653 
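/*!	Opens the file given by the entry_ref (\a mountID, \a directoryID,
	\a name). Symlinks are only traversed if neither O_NOTRAVERSE nor
	O_NOFOLLOW is given. Returns a new file descriptor, or an error code.
*/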
5654 static int
5655 file_open_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5656 	int openMode, bool kernel)
5657 {
5658 	if (name == NULL || *name == '\0')
5659 		return B_BAD_VALUE;
5660 
5661 	FUNCTION(("file_open_entry_ref(ref = (%" B_PRId32 ", %" B_PRId64 ", %s), "
5662 		"openMode = %d)\n", mountID, directoryID, name, openMode));
5663 
5664 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5665 
5666 	// get the vnode matching the entry_ref
5667 	struct vnode* vnode;
5668 	status_t status = entry_ref_to_vnode(mountID, directoryID, name, traverse,
5669 		kernel, &vnode);
5670 	if (status != B_OK)
5671 		return status;
5672 
5673 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5674 		put_vnode(vnode);
5675 		return B_LINK_LIMIT;
5676 	}
5677 
5678 	int newFD = open_vnode(vnode, openMode, kernel);
5679 	if (newFD >= 0) {
5680 		// The vnode reference has been transferred to the FD
5681 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache, mountID,
5682 			directoryID, vnode->id, name);
5683 	} else
5684 		put_vnode(vnode);
5685 
5686 	return newFD;
5687 }
5688 
5689 
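/*!	Opens the file at the location given by \a fd + \a path. Returns a
	new file descriptor, or an error code.
*/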
5690 static int
5691 file_open(int fd, char* path, int openMode, bool kernel)
5692 {
5693 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5694 
5695 	FUNCTION(("file_open: fd: %d, entry path = '%s', omode %d, kernel %d\n",
5696 		fd, path, openMode, kernel));
5697 
5698 	// get the vnode matching the vnode + path combination
5699 	struct vnode* vnode;
5700 	ino_t parentID;
5701 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode,
5702 		&parentID, kernel);
5703 	if (status != B_OK)
5704 		return status;
5705 
5706 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5707 		put_vnode(vnode);
5708 		return B_LINK_LIMIT;
5709 	}
5710 
5711 	// open the vnode
5712 	int newFD = open_vnode(vnode, openMode, kernel);
5713 	if (newFD >= 0) {
5714 		// The vnode reference has been transferred to the FD
5715 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache,
5716 			vnode->device, parentID, vnode->id, NULL);
5717 	} else
5718 		put_vnode(vnode);
5719 
5720 	return newFD;
5721 }
5722 
5723 
5724 static status_t
5725 file_close(struct file_descriptor* descriptor)
5726 {
5727 	struct vnode* vnode = descriptor->u.vnode;
5728 	status_t status = B_OK;
5729 
5730 	FUNCTION(("file_close(descriptor = %p)\n", descriptor));
5731 
5732 	cache_node_closed(vnode, FDTYPE_FILE, vnode->cache, vnode->device,
5733 		vnode->id);
5734 	if (HAS_FS_CALL(vnode, close)) {
5735 		status = FS_CALL(vnode, close, descriptor->cookie);
5736 	}
5737 
5738 	if (status == B_OK) {
5739 		// remove all outstanding locks for this team
5740 		if (HAS_FS_CALL(vnode, release_lock))
5741 			status = FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
5742 		else
5743 			status = release_advisory_lock(vnode, NULL, descriptor, NULL);
5744 	}
5745 	return status;
5746 }
5747 
5748 
5749 static void
5750 file_free_fd(struct file_descriptor* descriptor)
5751 {
5752 	struct vnode* vnode = descriptor->u.vnode;
5753 
5754 	if (vnode != NULL) {
5755 		FS_CALL(vnode, free_cookie, descriptor->cookie);
5756 		put_vnode(vnode);
5757 	}
5758 }
5759 
5760 
5761 static status_t
5762 file_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
5763 	size_t* length)
5764 {
5765 	struct vnode* vnode = descriptor->u.vnode;
5766 	FUNCTION(("file_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
5767 		pos, length, *length));
5768 
5769 	if (S_ISDIR(vnode->Type()))
5770 		return B_IS_A_DIRECTORY;
5771 
5772 	return FS_CALL(vnode, read, descriptor->cookie, pos, buffer, length);
5773 }
5774 
5775 
5776 static status_t
5777 file_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
5778 	size_t* length)
5779 {
5780 	struct vnode* vnode = descriptor->u.vnode;
5781 	FUNCTION(("file_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
5782 		length));
5783 
5784 	if (S_ISDIR(vnode->Type()))
5785 		return B_IS_A_DIRECTORY;
5786 	if (!HAS_FS_CALL(vnode, write))
5787 		return B_READ_ONLY_DEVICE;
5788 
5789 	return FS_CALL(vnode, write, descriptor->cookie, pos, buffer, length);
5790 }
5791 
5792 
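/*!	Computes the new position for \a descriptor from \a pos and
	\a seekType and stores it in the descriptor. FIFOs and sockets are
	not seekable; for devices whose stat() reports a zero size, SEEK_END
	falls back to the size derived from the device geometry.
*/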
5793 static off_t
5794 file_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
5795 {
5796 	struct vnode* vnode = descriptor->u.vnode;
5797 	off_t offset;
5798 	bool isDevice = false;
5799 
5800 	FUNCTION(("file_seek(pos = %" B_PRIdOFF ", seekType = %d)\n", pos,
5801 		seekType));
5802 
5803 	// some kinds of files are not seekable
5804 	switch (vnode->Type() & S_IFMT) {
5805 		case S_IFIFO:
5806 		case S_IFSOCK:
5807 			return ESPIPE;
5808 
5809 		// drivers publish block devices as character devices, so handle both
5810 		case S_IFBLK:
5811 		case S_IFCHR:
5812 			isDevice = true;
5813 			break;
5814 		// The Open Group Base Specs don't treat any file types besides
5815 		// pipes, FIFOs, and sockets specially, so we allow seeking all others.
5816 		case S_IFREG:
5817 		case S_IFDIR:
5818 		case S_IFLNK:
5819 			break;
5820 	}
5821 
5822 	switch (seekType) {
5823 		case SEEK_SET:
5824 			offset = 0;
5825 			break;
5826 		case SEEK_CUR:
5827 			offset = descriptor->pos;
5828 			break;
5829 		case SEEK_END:
5830 		{
5831 			// stat() the node
5832 			if (!HAS_FS_CALL(vnode, read_stat))
5833 				return B_UNSUPPORTED;
5834 
5835 			struct stat stat;
5836 			status_t status = FS_CALL(vnode, read_stat, &stat);
5837 			if (status != B_OK)
5838 				return status;
5839 
5840 			offset = stat.st_size;
5841 
5842 			if (offset == 0 && isDevice) {
5843 				// stat() on regular drivers doesn't report size
5844 				device_geometry geometry;
5845 
5846 				if (HAS_FS_CALL(vnode, ioctl)) {
5847 					status = FS_CALL(vnode, ioctl, descriptor->cookie,
5848 						B_GET_GEOMETRY, &geometry, sizeof(geometry));
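					// Derive the device size from its geometry: bytes per
					// sector times sectors per track, cylinders, and heads.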
5849 					if (status == B_OK)
5850 						offset = (off_t)geometry.bytes_per_sector
5851 							* geometry.sectors_per_track
5852 							* geometry.cylinder_count
5853 							* geometry.head_count;
5854 				}
5855 			}
5856 
5857 			break;
5858 		}
5859 		default:
5860 			return B_BAD_VALUE;
5861 	}
5862 
5863 	// assumes off_t is 64 bits wide
5864 	if (offset > 0 && LONGLONG_MAX - offset < pos)
5865 		return B_BUFFER_OVERFLOW;
5866 
5867 	pos += offset;
5868 	if (pos < 0)
5869 		return B_BAD_VALUE;
5870 
5871 	return descriptor->pos = pos;
5872 }
5873 
5874 
5875 static status_t
5876 file_select(struct file_descriptor* descriptor, uint8 event,
5877 	struct selectsync* sync)
5878 {
5879 	FUNCTION(("file_select(%p, %u, %p)\n", descriptor, event, sync));
5880 
5881 	struct vnode* vnode = descriptor->u.vnode;
5882 
5883 	// If the FS has no select() hook, notify select() now.
5884 	if (!HAS_FS_CALL(vnode, select)) {
5885 		if (!SELECT_TYPE_IS_OUTPUT_ONLY(event))
5886 			return notify_select_event(sync, event);
5887 		else
5888 			return B_OK;
5889 	}
5890 
5891 	return FS_CALL(vnode, select, descriptor->cookie, event, sync);
5892 }
5893 
5894 
5895 static status_t
5896 file_deselect(struct file_descriptor* descriptor, uint8 event,
5897 	struct selectsync* sync)
5898 {
5899 	struct vnode* vnode = descriptor->u.vnode;
5900 
5901 	if (!HAS_FS_CALL(vnode, deselect))
5902 		return B_OK;
5903 
5904 	return FS_CALL(vnode, deselect, descriptor->cookie, event, sync);
5905 }
5906 
5907 
5908 static status_t
5909 dir_create_entry_ref(dev_t mountID, ino_t parentID, const char* name, int perms,
5910 	bool kernel)
5911 {
5912 	struct vnode* vnode;
5913 	status_t status;
5914 
5915 	if (name == NULL || *name == '\0')
5916 		return B_BAD_VALUE;
5917 
5918 	FUNCTION(("dir_create_entry_ref(dev = %" B_PRId32 ", ino = %" B_PRId64 ", "
5919 		"name = '%s', perms = %d)\n", mountID, parentID, name, perms));
5920 
5921 	status = get_vnode(mountID, parentID, &vnode, true, false);
5922 	if (status != B_OK)
5923 		return status;
5924 
5925 	if (HAS_FS_CALL(vnode, create_dir))
5926 		status = FS_CALL(vnode, create_dir, name, perms);
5927 	else
5928 		status = B_READ_ONLY_DEVICE;
5929 
5930 	put_vnode(vnode);
5931 	return status;
5932 }
5933 
5934 
5935 static status_t
5936 dir_create(int fd, char* path, int perms, bool kernel)
5937 {
5938 	char filename[B_FILE_NAME_LENGTH];
5939 	struct vnode* vnode;
5940 	status_t status;
5941 
5942 	FUNCTION(("dir_create: path '%s', perms %d, kernel %d\n", path, perms,
5943 		kernel));
5944 
5945 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
5946 	if (status < 0)
5947 		return status;
5948 
5949 	if (HAS_FS_CALL(vnode, create_dir)) {
5950 		status = FS_CALL(vnode, create_dir, filename, perms);
5951 	} else
5952 		status = B_READ_ONLY_DEVICE;
5953 
5954 	put_vnode(vnode);
5955 	return status;
5956 }
5957 
5958 
5959 static int
5960 dir_open_entry_ref(dev_t mountID, ino_t parentID, const char* name, bool kernel)
5961 {
5962 	FUNCTION(("dir_open_entry_ref()\n"));
5963 
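	// A NULL name means the node_ref (mountID, parentID) itself specifies
	// the directory to open.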
5964 	if (name && name[0] == '\0')
5965 		return B_BAD_VALUE;
5966 
5967 	// get the vnode matching the entry_ref/node_ref
5968 	struct vnode* vnode;
5969 	status_t status;
5970 	if (name) {
5971 		status = entry_ref_to_vnode(mountID, parentID, name, true, kernel,
5972 			&vnode);
5973 	} else
5974 		status = get_vnode(mountID, parentID, &vnode, true, false);
5975 	if (status != B_OK)
5976 		return status;
5977 
5978 	int newFD = open_dir_vnode(vnode, kernel);
5979 	if (newFD >= 0) {
5980 		// The vnode reference has been transferred to the FD
5981 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, mountID, parentID,
5982 			vnode->id, name);
5983 	} else
5984 		put_vnode(vnode);
5985 
5986 	return newFD;
5987 }
5988 
5989 
5990 static int
5991 dir_open(int fd, char* path, bool kernel)
5992 {
5993 	FUNCTION(("dir_open: fd: %d, entry path = '%s', kernel %d\n", fd, path,
5994 		kernel));
5995 
5996 	// get the vnode matching the vnode + path combination
5997 	struct vnode* vnode = NULL;
5998 	ino_t parentID;
5999 	status_t status = fd_and_path_to_vnode(fd, path, true, &vnode, &parentID,
6000 		kernel);
6001 	if (status != B_OK)
6002 		return status;
6003 
6004 	// open the dir
6005 	int newFD = open_dir_vnode(vnode, kernel);
6006 	if (newFD >= 0) {
6007 		// The vnode reference has been transferred to the FD
6008 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
6009 			parentID, vnode->id, NULL);
6010 	} else
6011 		put_vnode(vnode);
6012 
6013 	return newFD;
6014 }
6015 
6016 
6017 static status_t
6018 dir_close(struct file_descriptor* descriptor)
6019 {
6020 	struct vnode* vnode = descriptor->u.vnode;
6021 
6022 	FUNCTION(("dir_close(descriptor = %p)\n", descriptor));
6023 
6024 	cache_node_closed(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
6025 		vnode->id);
6026 	if (HAS_FS_CALL(vnode, close_dir))
6027 		return FS_CALL(vnode, close_dir, descriptor->cookie);
6028 
6029 	return B_OK;
6030 }
6031 
6032 
6033 static void
6034 dir_free_fd(struct file_descriptor* descriptor)
6035 {
6036 	struct vnode* vnode = descriptor->u.vnode;
6037 
6038 	if (vnode != NULL) {
6039 		FS_CALL(vnode, free_dir_cookie, descriptor->cookie);
6040 		put_vnode(vnode);
6041 	}
6042 }
6043 
6044 
6045 static status_t
6046 dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6047 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6048 {
6049 	return dir_read(ioContext, descriptor->u.vnode, descriptor->cookie, buffer,
6050 		bufferSize, _count);
6051 }
6052 
6053 
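/*!	Post-processes a dirent as returned by the FS: fills in d_pdev/d_pino
	with the parent directory's IDs, resolves the ".." entry of a covering
	directory to the covered parent, and replaces d_dev/d_ino of entries
	whose vnodes are covered by a mount with the covering vnode's IDs.
*/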
6054 static status_t
6055 fix_dirent(struct vnode* parent, struct dirent* entry,
6056 	struct io_context* ioContext)
6057 {
6058 	// set d_pdev and d_pino
6059 	entry->d_pdev = parent->device;
6060 	entry->d_pino = parent->id;
6061 
6062 	// If this is the ".." entry and the directory is covering another vnode,
6063 	// we need to replace d_dev and d_ino with the actual values.
6064 	if (strcmp(entry->d_name, "..") == 0 && parent->IsCovering()) {
6065 		return resolve_covered_parent(parent, &entry->d_dev, &entry->d_ino,
6066 			ioContext);
6067 	}
6068 
6069 	// resolve covered vnodes
6070 	ReadLocker _(&sVnodeLock);
6071 
6072 	struct vnode* vnode = lookup_vnode(entry->d_dev, entry->d_ino);
6073 	if (vnode != NULL && vnode->covered_by != NULL) {
6074 		do {
6075 			vnode = vnode->covered_by;
6076 		} while (vnode->covered_by != NULL);
6077 
6078 		entry->d_dev = vnode->device;
6079 		entry->d_ino = vnode->id;
6080 	}
6081 
6082 	return B_OK;
6083 }
6084 
6085 
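/*!	Reads up to \a *_count dirents from the directory \a vnode into
	\a buffer and fixes each entry up via fix_dirent(). On return
	\a *_count contains the number of entries actually read.
*/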
6086 static status_t
6087 dir_read(struct io_context* ioContext, struct vnode* vnode, void* cookie,
6088 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6089 {
6090 	if (!HAS_FS_CALL(vnode, read_dir))
6091 		return B_UNSUPPORTED;
6092 
6093 	status_t error = FS_CALL(vnode, read_dir, cookie, buffer, bufferSize,
6094 		_count);
6095 	if (error != B_OK)
6096 		return error;
6097 
6098 	// we need to adjust the read dirents
6099 	uint32 count = *_count;
6100 	for (uint32 i = 0; i < count; i++) {
6101 		error = fix_dirent(vnode, buffer, ioContext);
6102 		if (error != B_OK)
6103 			return error;
6104 
6105 		buffer = (struct dirent*)((uint8*)buffer + buffer->d_reclen);
6106 	}
6107 
6108 	return error;
6109 }
6110 
6111 
6112 static status_t
6113 dir_rewind(struct file_descriptor* descriptor)
6114 {
6115 	struct vnode* vnode = descriptor->u.vnode;
6116 
6117 	if (HAS_FS_CALL(vnode, rewind_dir)) {
6118 		return FS_CALL(vnode, rewind_dir, descriptor->cookie);
6119 	}
6120 
6121 	return B_UNSUPPORTED;
6122 }
6123 
6124 
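/*!	Removes the directory at the location given by \a fd + \a path.
	Trailing slashes and "." components are stripped from the path first;
	removing "." or ".." is refused with B_NOT_ALLOWED.
*/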
6125 static status_t
6126 dir_remove(int fd, char* path, bool kernel)
6127 {
6128 	char name[B_FILE_NAME_LENGTH];
6129 	struct vnode* directory;
6130 	status_t status;
6131 
6132 	if (path != NULL) {
6133 		// we need to make sure our path name doesn't end in "/", ".",
6134 		// or ".."
6135 		char* lastSlash;
6136 		while ((lastSlash = strrchr(path, '/')) != NULL) {
6137 			char* leaf = lastSlash + 1;
6138 			if (!strcmp(leaf, ".."))
6139 				return B_NOT_ALLOWED;
6140 
6141 			// omit multiple slashes
6142 			while (lastSlash > path && lastSlash[-1] == '/')
6143 				lastSlash--;
6144 
6145 			if (leaf[0] != '\0' && strcmp(leaf, ".") != 0)
6146 				break;
6149 			// "name/" -> "name", or "name/." -> "name"
6150 			lastSlash[0] = '\0';
6151 		}
6152 
6153 		if (!strcmp(path, ".") || !strcmp(path, ".."))
6154 			return B_NOT_ALLOWED;
6155 	}
6156 
6157 	status = fd_and_path_to_dir_vnode(fd, path, &directory, name, kernel);
6158 	if (status != B_OK)
6159 		return status;
6160 
6161 	if (HAS_FS_CALL(directory, remove_dir))
6162 		status = FS_CALL(directory, remove_dir, name);
6163 	else
6164 		status = B_READ_ONLY_DEVICE;
6165 
6166 	put_vnode(directory);
6167 	return status;
6168 }
6169 
6170 
6171 static status_t
6172 common_ioctl(struct file_descriptor* descriptor, ulong op, void* buffer,
6173 	size_t length)
6174 {
6175 	struct vnode* vnode = descriptor->u.vnode;
6176 
6177 	if (HAS_FS_CALL(vnode, ioctl))
6178 		return FS_CALL(vnode, ioctl, descriptor->cookie, op, buffer, length);
6179 
6180 	return B_DEV_INVALID_IOCTL;
6181 }
6182 
6183 
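/*!	Implements fcntl() for both userland and kernel callers: FD flags
	(F_SETFD/F_GETFD), open mode (F_SETFL/F_GETFL), duplication (F_DUPFD,
	F_DUPFD_CLOEXEC), and advisory locking (F_GETLK, F_SETLK, F_SETLKW).
*/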
6184 static status_t
6185 common_fcntl(int fd, int op, size_t argument, bool kernel)
6186 {
6187 	struct flock flock;
6188 
6189 	FUNCTION(("common_fcntl(fd = %d, op = %d, argument = %lx, %s)\n",
6190 		fd, op, argument, kernel ? "kernel" : "user"));
6191 
6192 	struct io_context* context = get_current_io_context(kernel);
6193 
6194 	struct file_descriptor* descriptor = get_fd(context, fd);
6195 	if (descriptor == NULL)
6196 		return B_FILE_ERROR;
6197 
6198 	struct vnode* vnode = fd_vnode(descriptor);
6199 
6200 	status_t status = B_OK;
6201 
6202 	if (op == F_SETLK || op == F_SETLKW || op == F_GETLK) {
6203 		if (descriptor->type != FDTYPE_FILE)
6204 			status = B_BAD_VALUE;
6205 		else if (kernel)
6206 			memcpy(&flock, (struct flock*)argument, sizeof(struct flock));
6207 		else if (user_memcpy(&flock, (struct flock*)argument,
6208 				sizeof(struct flock)) != B_OK)
6209 			status = B_BAD_ADDRESS;
6210 		if (status != B_OK) {
6211 			put_fd(descriptor);
6212 			return status;
6213 		}
6214 	}
6215 
6216 	switch (op) {
6217 		case F_SETFD:
6218 		{
6219 			// Set file descriptor flags
6220 
6221 			// FD_CLOEXEC is the only flag available at this time
6222 			mutex_lock(&context->io_mutex);
6223 			fd_set_close_on_exec(context, fd, (argument & FD_CLOEXEC) != 0);
6224 			mutex_unlock(&context->io_mutex);
6225 
6226 			status = B_OK;
6227 			break;
6228 		}
6229 
6230 		case F_GETFD:
6231 		{
6232 			// Get file descriptor flags
6233 			mutex_lock(&context->io_mutex);
6234 			status = fd_close_on_exec(context, fd) ? FD_CLOEXEC : 0;
6235 			mutex_unlock(&context->io_mutex);
6236 			break;
6237 		}
6238 
6239 		case F_SETFL:
6240 			// Set file descriptor open mode
6241 
6242 			// we only accept changes to O_APPEND and O_NONBLOCK
6243 			argument &= O_APPEND | O_NONBLOCK;
6244 			if (descriptor->ops->fd_set_flags != NULL) {
6245 				status = descriptor->ops->fd_set_flags(descriptor, argument);
6246 			} else if (vnode != NULL && HAS_FS_CALL(vnode, set_flags)) {
6247 				status = FS_CALL(vnode, set_flags, descriptor->cookie,
6248 					(int)argument);
6249 			} else
6250 				status = B_UNSUPPORTED;
6251 
6252 			if (status == B_OK) {
6253 				// update this descriptor's open_mode field
6254 				descriptor->open_mode = (descriptor->open_mode
6255 					& ~(O_APPEND | O_NONBLOCK)) | argument;
6256 			}
6257 
6258 			break;
6259 
6260 		case F_GETFL:
6261 			// Get file descriptor open mode
6262 			status = descriptor->open_mode;
6263 			break;
6264 
6265 		case F_DUPFD:
6266 		case F_DUPFD_CLOEXEC:
6267 		{
6268 			status = new_fd_etc(context, descriptor, (int)argument);
6269 			if (status >= 0) {
6270 				mutex_lock(&context->io_mutex);
6271 				fd_set_close_on_exec(context, status, op == F_DUPFD_CLOEXEC);
6272 				mutex_unlock(&context->io_mutex);
6273 
6274 				atomic_add(&descriptor->ref_count, 1);
6275 			}
6276 			break;
6277 		}
6278 
6279 		case F_GETLK:
6280 			if (vnode != NULL) {
6281 				struct flock normalizedLock;
6282 
6283 				memcpy(&normalizedLock, &flock, sizeof(struct flock));
6284 				status = normalize_flock(descriptor, &normalizedLock);
6285 				if (status != B_OK)
6286 					break;
6287 
6288 				if (HAS_FS_CALL(vnode, test_lock)) {
6289 					status = FS_CALL(vnode, test_lock, descriptor->cookie,
6290 						&normalizedLock);
6291 				} else
6292 					status = test_advisory_lock(vnode, &normalizedLock);
6293 				if (status == B_OK) {
6294 					if (normalizedLock.l_type == F_UNLCK) {
6295 						// no conflicting lock found, copy back the same struct
6296 						// we were given except change type to F_UNLCK
6297 						flock.l_type = F_UNLCK;
6298 						if (kernel) {
6299 							memcpy((struct flock*)argument, &flock,
6300 								sizeof(struct flock));
6301 						} else {
6302 							status = user_memcpy((struct flock*)argument,
6303 								&flock, sizeof(struct flock));
6304 						}
6305 					} else {
6306 						// a conflicting lock was found, copy back its range and
6307 						// type
6308 						if (normalizedLock.l_len == OFF_MAX)
6309 							normalizedLock.l_len = 0;
6310 
6311 						if (kernel) {
6312 							memcpy((struct flock*)argument,
6313 								&normalizedLock, sizeof(struct flock));
6314 						} else {
6315 							status = user_memcpy((struct flock*)argument,
6316 								&normalizedLock, sizeof(struct flock));
6317 						}
6318 					}
6319 				}
6320 			} else
6321 				status = B_BAD_VALUE;
6322 			break;
6323 
6324 		case F_SETLK:
6325 		case F_SETLKW:
6326 			status = normalize_flock(descriptor, &flock);
6327 			if (status != B_OK)
6328 				break;
6329 
6330 			if (vnode == NULL) {
6331 				status = B_BAD_VALUE;
6332 			} else if (flock.l_type == F_UNLCK) {
6333 				if (HAS_FS_CALL(vnode, release_lock)) {
6334 					status = FS_CALL(vnode, release_lock, descriptor->cookie,
6335 						&flock);
6336 				} else {
6337 					status = release_advisory_lock(vnode, context, NULL,
6338 						&flock);
6339 				}
6340 			} else {
6341 				// the open mode must match the lock type
6342 				if (((descriptor->open_mode & O_RWMASK) == O_RDONLY
6343 						&& flock.l_type == F_WRLCK)
6344 					|| ((descriptor->open_mode & O_RWMASK) == O_WRONLY
6345 						&& flock.l_type == F_RDLCK))
6346 					status = B_FILE_ERROR;
6347 				else {
6348 					if (HAS_FS_CALL(vnode, acquire_lock)) {
6349 						status = FS_CALL(vnode, acquire_lock,
6350 							descriptor->cookie, &flock, op == F_SETLKW);
6351 					} else {
6352 						status = acquire_advisory_lock(vnode, context, NULL,
6353 							&flock, op == F_SETLKW);
6354 					}
6355 				}
6356 			}
6357 			break;
6358 
6359 		// TODO: add support for more ops?
6360 
6361 		default:
6362 			status = B_BAD_VALUE;
6363 	}
6364 
6365 	put_fd(descriptor);
6366 	return status;
6367 }
6368 
6369 
6370 static status_t
6371 common_sync(int fd, bool kernel)
6372 {
6373 	struct file_descriptor* descriptor;
6374 	struct vnode* vnode;
6375 	status_t status;
6376 
6377 	FUNCTION(("common_fsync: entry. fd %d kernel %d\n", fd, kernel));
6378 
6379 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6380 	if (descriptor == NULL)
6381 		return B_FILE_ERROR;
6382 
6383 	if (HAS_FS_CALL(vnode, fsync))
6384 		status = FS_CALL_NO_PARAMS(vnode, fsync);
6385 	else
6386 		status = B_UNSUPPORTED;
6387 
6388 	put_fd(descriptor);
6389 	return status;
6390 }
6391 
6392 
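/*!	Marks the vnode behind \a fd as mandatorily locked by this descriptor
	(vnode::mandatory_locked_by). Fails with B_BUSY if some descriptor
	already holds the lock.
*/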
6393 static status_t
6394 common_lock_node(int fd, bool kernel)
6395 {
6396 	struct file_descriptor* descriptor;
6397 	struct vnode* vnode;
6398 
6399 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6400 	if (descriptor == NULL)
6401 		return B_FILE_ERROR;
6402 
6403 	status_t status = B_OK;
6404 
6405 	// We need to set the locking atomically - someone
6406 	// else might set one at the same time
6407 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by, descriptor,
6408 			(file_descriptor*)NULL) != NULL)
6409 		status = B_BUSY;
6410 
6411 	put_fd(descriptor);
6412 	return status;
6413 }
6414 
6415 
6416 static status_t
6417 common_unlock_node(int fd, bool kernel)
6418 {
6419 	struct file_descriptor* descriptor;
6420 	struct vnode* vnode;
6421 
6422 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6423 	if (descriptor == NULL)
6424 		return B_FILE_ERROR;
6425 
6426 	status_t status = B_OK;
6427 
6428 	// We need to set the locking atomically - someone
6429 	// else might set one at the same time
6430 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
6431 			(file_descriptor*)NULL, descriptor) != descriptor)
6432 		status = B_BAD_VALUE;
6433 
6434 	put_fd(descriptor);
6435 	return status;
6436 }
6437 
6438 
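/*!	Reads the target of the symlink at \a fd + \a path into \a buffer.
	\a _bufferSize is an in/out parameter updated by the FS hook.
*/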
6439 static status_t
6440 common_read_link(int fd, char* path, char* buffer, size_t* _bufferSize,
6441 	bool kernel)
6442 {
6443 	struct vnode* vnode;
6444 	status_t status;
6445 
6446 	status = fd_and_path_to_vnode(fd, path, false, &vnode, NULL, kernel);
6447 	if (status != B_OK)
6448 		return status;
6449 
6450 	if (HAS_FS_CALL(vnode, read_symlink)) {
6451 		status = FS_CALL(vnode, read_symlink, buffer, _bufferSize);
6452 	} else
6453 		status = B_BAD_VALUE;
6454 
6455 	put_vnode(vnode);
6456 	return status;
6457 }
6458 
6459 
6460 static status_t
6461 common_create_symlink(int fd, char* path, const char* toPath, int mode,
6462 	bool kernel)
6463 {
6464 	// path validity checks have to be in the calling function!
6465 	char name[B_FILE_NAME_LENGTH];
6466 	struct vnode* vnode;
6467 	status_t status;
6468 
6469 	FUNCTION(("common_create_symlink(fd = %d, path = %s, toPath = %s, "
6470 		"mode = %d, kernel = %d)\n", fd, path, toPath, mode, kernel));
6471 
6472 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, name, kernel);
6473 	if (status != B_OK)
6474 		return status;
6475 
6476 	if (HAS_FS_CALL(vnode, create_symlink))
6477 		status = FS_CALL(vnode, create_symlink, name, toPath, mode);
6478 	else {
6479 		status = HAS_FS_CALL(vnode, write)
6480 			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6481 	}
6482 
6483 	put_vnode(vnode);
6484 
6485 	return status;
6486 }
6487 
6488 
6489 static status_t
6490 common_create_link(int pathFD, char* path, int toFD, char* toPath,
6491 	bool traverseLeafLink, bool kernel)
6492 {
6493 	// path validity checks have to be in the calling function!
6494 
6495 	FUNCTION(("common_create_link(path = %s, toPath = %s, kernel = %d)\n", path,
6496 		toPath, kernel));
6497 
6498 	char name[B_FILE_NAME_LENGTH];
6499 	struct vnode* directory;
6500 	status_t status = fd_and_path_to_dir_vnode(pathFD, path, &directory, name,
6501 		kernel);
6502 	if (status != B_OK)
6503 		return status;
6504 
6505 	struct vnode* vnode;
6506 	status = fd_and_path_to_vnode(toFD, toPath, traverseLeafLink, &vnode, NULL,
6507 		kernel);
6508 	if (status != B_OK)
6509 		goto err;
6510 
6511 	if (directory->mount != vnode->mount) {
6512 		status = B_CROSS_DEVICE_LINK;
6513 		goto err1;
6514 	}
6515 
6516 	if (HAS_FS_CALL(directory, link))
6517 		status = FS_CALL(directory, link, name, vnode);
6518 	else
6519 		status = B_READ_ONLY_DEVICE;
6520 
6521 err1:
6522 	put_vnode(vnode);
6523 err:
6524 	put_vnode(directory);
6525 
6526 	return status;
6527 }
6528 
6529 
6530 static status_t
6531 common_unlink(int fd, char* path, bool kernel)
6532 {
6533 	char filename[B_FILE_NAME_LENGTH];
6534 	struct vnode* vnode;
6535 	status_t status;
6536 
6537 	FUNCTION(("common_unlink: fd: %d, path '%s', kernel %d\n", fd, path,
6538 		kernel));
6539 
6540 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
6541 	if (status < 0)
6542 		return status;
6543 
6544 	if (HAS_FS_CALL(vnode, unlink))
6545 		status = FS_CALL(vnode, unlink, filename);
6546 	else
6547 		status = B_READ_ONLY_DEVICE;
6548 
6549 	put_vnode(vnode);
6550 
6551 	return status;
6552 }
6553 
6554 
6555 static status_t
6556 common_access(int fd, char* path, int mode, bool effectiveUserGroup, bool kernel)
6557 {
6558 	struct vnode* vnode;
6559 	status_t status;
6560 
6561 	// TODO: honor effectiveUserGroup argument
6562 
6563 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
6564 	if (status != B_OK)
6565 		return status;
6566 
6567 	if (HAS_FS_CALL(vnode, access))
6568 		status = FS_CALL(vnode, access, mode);
6569 	else
6570 		status = B_OK;
6571 
6572 	put_vnode(vnode);
6573 
6574 	return status;
6575 }
6576 
6577 
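/*!	Renames the entry at \a fd + \a path to \a newFD + \a newPath. Both
	locations must be on the same volume, and neither leaf name may be
	empty, ".", or "..".
*/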
6578 static status_t
6579 common_rename(int fd, char* path, int newFD, char* newPath, bool kernel)
6580 {
6581 	struct vnode* fromVnode;
6582 	struct vnode* toVnode;
6583 	char fromName[B_FILE_NAME_LENGTH];
6584 	char toName[B_FILE_NAME_LENGTH];
6585 	status_t status;
6586 
6587 	FUNCTION(("common_rename(fd = %d, path = %s, newFD = %d, newPath = %s, "
6588 		"kernel = %d)\n", fd, path, newFD, newPath, kernel));
6589 
6590 	status = fd_and_path_to_dir_vnode(fd, path, &fromVnode, fromName, kernel);
6591 	if (status != B_OK)
6592 		return status;
6593 
6594 	status = fd_and_path_to_dir_vnode(newFD, newPath, &toVnode, toName, kernel);
6595 	if (status != B_OK)
6596 		goto err1;
6597 
6598 	if (fromVnode->device != toVnode->device) {
6599 		status = B_CROSS_DEVICE_LINK;
6600 		goto err2;
6601 	}
6602 
6603 	if (fromName[0] == '\0' || toName[0] == '\0'
6604 		|| !strcmp(fromName, ".") || !strcmp(fromName, "..")
6605 		|| !strcmp(toName, ".") || !strcmp(toName, "..")
6606 		|| (fromVnode == toVnode && !strcmp(fromName, toName))) {
6607 		status = B_BAD_VALUE;
6608 		goto err2;
6609 	}
6610 
6611 	if (HAS_FS_CALL(fromVnode, rename))
6612 		status = FS_CALL(fromVnode, rename, fromName, toVnode, toName);
6613 	else
6614 		status = B_READ_ONLY_DEVICE;
6615 
6616 err2:
6617 	put_vnode(toVnode);
6618 err1:
6619 	put_vnode(fromVnode);
6620 
6621 	return status;
6622 }
6623 
6624 
6625 static status_t
6626 common_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6627 {
6628 	struct vnode* vnode = descriptor->u.vnode;
6629 
6630 	FUNCTION(("common_read_stat: stat %p\n", stat));
6631 
6632 	// TODO: remove this once all file systems properly set them!
6633 	stat->st_crtim.tv_nsec = 0;
6634 	stat->st_ctim.tv_nsec = 0;
6635 	stat->st_mtim.tv_nsec = 0;
6636 	stat->st_atim.tv_nsec = 0;
6637 
6638 	return vfs_stat_vnode(vnode, stat);
6639 }
6640 
6641 
6642 static status_t
6643 common_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6644 	int statMask)
6645 {
6646 	struct vnode* vnode = descriptor->u.vnode;
6647 
6648 	FUNCTION(("common_write_stat(vnode = %p, stat = %p, statMask = %d)\n",
6649 		vnode, stat, statMask));
6650 
6651 	if ((descriptor->open_mode & O_RWMASK) == O_RDONLY
6652 		&& (statMask & B_STAT_SIZE) != 0) {
6653 		return B_BAD_VALUE;
6654 	}
6655 
6656 	if (!HAS_FS_CALL(vnode, write_stat))
6657 		return B_READ_ONLY_DEVICE;
6658 
6659 	return FS_CALL(vnode, write_stat, stat, statMask);
6660 }
6661 
6662 
6663 static status_t
6664 common_path_read_stat(int fd, char* path, bool traverseLeafLink,
6665 	struct stat* stat, bool kernel)
6666 {
6667 	FUNCTION(("common_path_read_stat: fd: %d, path '%s', stat %p,\n", fd, path,
6668 		stat));
6669 
6670 	struct vnode* vnode;
6671 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6672 		NULL, kernel);
6673 	if (status != B_OK)
6674 		return status;
6675 
6676 	status = vfs_stat_vnode(vnode, stat);
6677 
6678 	put_vnode(vnode);
6679 	return status;
6680 }
6681 
6682 
6683 static status_t
6684 common_path_write_stat(int fd, char* path, bool traverseLeafLink,
6685 	const struct stat* stat, int statMask, bool kernel)
6686 {
6687 	FUNCTION(("common_write_stat: fd: %d, path '%s', stat %p, stat_mask %d, "
6688 		"kernel %d\n", fd, path, stat, statMask, kernel));
6689 
6690 	struct vnode* vnode;
6691 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6692 		NULL, kernel);
6693 	if (status != B_OK)
6694 		return status;
6695 
6696 	if (HAS_FS_CALL(vnode, write_stat))
6697 		status = FS_CALL(vnode, write_stat, stat, statMask);
6698 	else
6699 		status = B_READ_ONLY_DEVICE;
6700 
6701 	put_vnode(vnode);
6702 
6703 	return status;
6704 }
6705 
6706 
6707 static int
6708 attr_dir_open(int fd, char* path, bool traverseLeafLink, bool kernel)
6709 {
6710 	FUNCTION(("attr_dir_open(fd = %d, path = '%s', kernel = %d)\n", fd, path,
6711 		kernel));
6712 
6713 	struct vnode* vnode;
6714 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6715 		NULL, kernel);
6716 	if (status != B_OK)
6717 		return status;
6718 
6719 	status = open_attr_dir_vnode(vnode, kernel);
6720 	if (status < 0)
6721 		put_vnode(vnode);
6722 
6723 	return status;
6724 }
6725 
6726 
6727 static status_t
6728 attr_dir_close(struct file_descriptor* descriptor)
6729 {
6730 	struct vnode* vnode = descriptor->u.vnode;
6731 
6732 	FUNCTION(("attr_dir_close(descriptor = %p)\n", descriptor));
6733 
6734 	if (HAS_FS_CALL(vnode, close_attr_dir))
6735 		return FS_CALL(vnode, close_attr_dir, descriptor->cookie);
6736 
6737 	return B_OK;
6738 }
6739 
6740 
6741 static void
6742 attr_dir_free_fd(struct file_descriptor* descriptor)
6743 {
6744 	struct vnode* vnode = descriptor->u.vnode;
6745 
6746 	if (vnode != NULL) {
6747 		FS_CALL(vnode, free_attr_dir_cookie, descriptor->cookie);
6748 		put_vnode(vnode);
6749 	}
6750 }
6751 
6752 
6753 static status_t
6754 attr_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6755 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6756 {
6757 	struct vnode* vnode = descriptor->u.vnode;
6758 
6759 	FUNCTION(("attr_dir_read(descriptor = %p)\n", descriptor));
6760 
6761 	if (HAS_FS_CALL(vnode, read_attr_dir))
6762 		return FS_CALL(vnode, read_attr_dir, descriptor->cookie, buffer,
6763 			bufferSize, _count);
6764 
6765 	return B_UNSUPPORTED;
6766 }
6767 
6768 
6769 static status_t
6770 attr_dir_rewind(struct file_descriptor* descriptor)
6771 {
6772 	struct vnode* vnode = descriptor->u.vnode;
6773 
6774 	FUNCTION(("attr_dir_rewind(descriptor = %p)\n", descriptor));
6775 
6776 	if (HAS_FS_CALL(vnode, rewind_attr_dir))
6777 		return FS_CALL(vnode, rewind_attr_dir, descriptor->cookie);
6778 
6779 	return B_UNSUPPORTED;
6780 }
6781 
6782 
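/*!	Creates the attribute \a name on the node at \a fd + \a path and
	returns a new file descriptor for it. If no FD can be allocated, the
	freshly created attribute is removed again.
*/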
6783 static int
6784 attr_create(int fd, char* path, const char* name, uint32 type,
6785 	int openMode, bool kernel)
6786 {
6787 	if (name == NULL || *name == '\0')
6788 		return B_BAD_VALUE;
6789 
6790 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6791 	struct vnode* vnode;
6792 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6793 		kernel);
6794 	if (status != B_OK)
6795 		return status;
6796 
6797 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6798 		status = B_LINK_LIMIT;
6799 		goto err;
6800 	}
6801 
6802 	if (!HAS_FS_CALL(vnode, create_attr)) {
6803 		status = B_READ_ONLY_DEVICE;
6804 		goto err;
6805 	}
6806 
6807 	void* cookie;
6808 	status = FS_CALL(vnode, create_attr, name, type, openMode, &cookie);
6809 	if (status != B_OK)
6810 		goto err;
6811 
6812 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6813 	if (fd >= 0)
6814 		return fd;
6815 
6816 	status = fd;
6817 
6818 	FS_CALL(vnode, close_attr, cookie);
6819 	FS_CALL(vnode, free_attr_cookie, cookie);
6820 
6821 	FS_CALL(vnode, remove_attr, name);
6822 
6823 err:
6824 	put_vnode(vnode);
6825 
6826 	return status;
6827 }
6828 
6829 
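/*!	Opens the attribute \a name of the node at \a fd + \a path and
	returns a new file descriptor for it, or an error code.
*/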
6830 static int
6831 attr_open(int fd, char* path, const char* name, int openMode, bool kernel)
6832 {
6833 	if (name == NULL || *name == '\0')
6834 		return B_BAD_VALUE;
6835 
6836 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6837 	struct vnode* vnode;
6838 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6839 		kernel);
6840 	if (status != B_OK)
6841 		return status;
6842 
6843 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6844 		status = B_LINK_LIMIT;
6845 		goto err;
6846 	}
6847 
6848 	if (!HAS_FS_CALL(vnode, open_attr)) {
6849 		status = B_UNSUPPORTED;
6850 		goto err;
6851 	}
6852 
6853 	void* cookie;
6854 	status = FS_CALL(vnode, open_attr, name, openMode, &cookie);
6855 	if (status != B_OK)
6856 		goto err;
6857 
6858 	// now we only need a file descriptor for this attribute and we're done
6859 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6860 	if (fd >= 0)
6861 		return fd;
6862 
6863 	status = fd;
6864 
6865 	FS_CALL(vnode, close_attr, cookie);
6866 	FS_CALL(vnode, free_attr_cookie, cookie);
6867 
6868 err:
6869 	put_vnode(vnode);
6870 
6871 	return status;
6872 }
6873 
6874 
6875 static status_t
6876 attr_close(struct file_descriptor* descriptor)
6877 {
6878 	struct vnode* vnode = descriptor->u.vnode;
6879 
6880 	FUNCTION(("attr_close(descriptor = %p)\n", descriptor));
6881 
6882 	if (HAS_FS_CALL(vnode, close_attr))
6883 		return FS_CALL(vnode, close_attr, descriptor->cookie);
6884 
6885 	return B_OK;
6886 }
6887 
6888 
6889 static void
6890 attr_free_fd(struct file_descriptor* descriptor)
6891 {
6892 	struct vnode* vnode = descriptor->u.vnode;
6893 
6894 	if (vnode != NULL) {
6895 		FS_CALL(vnode, free_attr_cookie, descriptor->cookie);
6896 		put_vnode(vnode);
6897 	}
6898 }
6899 
6900 
6901 static status_t
6902 attr_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
6903 	size_t* length)
6904 {
6905 	struct vnode* vnode = descriptor->u.vnode;
6906 
6907 	FUNCTION(("attr_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
6908 		pos, length, *length));
6909 
6910 	if (!HAS_FS_CALL(vnode, read_attr))
6911 		return B_UNSUPPORTED;
6912 
6913 	return FS_CALL(vnode, read_attr, descriptor->cookie, pos, buffer, length);
6914 }
6915 
6916 
6917 static status_t
6918 attr_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
6919 	size_t* length)
6920 {
6921 	struct vnode* vnode = descriptor->u.vnode;
6922 
6923 	FUNCTION(("attr_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
6924 		length));
6925 
6926 	if (!HAS_FS_CALL(vnode, write_attr))
6927 		return B_UNSUPPORTED;
6928 
6929 	return FS_CALL(vnode, write_attr, descriptor->cookie, pos, buffer, length);
6930 }
6931 
6932 
6933 static off_t
6934 attr_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
6935 {
6936 	off_t offset;
6937 
6938 	switch (seekType) {
6939 		case SEEK_SET:
6940 			offset = 0;
6941 			break;
6942 		case SEEK_CUR:
6943 			offset = descriptor->pos;
6944 			break;
6945 		case SEEK_END:
6946 		{
6947 			struct vnode* vnode = descriptor->u.vnode;
6948 			if (!HAS_FS_CALL(vnode, read_attr_stat))
6949 				return B_UNSUPPORTED;
6950 
6951 			struct stat stat;
6952 			status_t status = FS_CALL(vnode, read_attr_stat, descriptor->cookie,
6953 				&stat);
6954 			if (status != B_OK)
6955 				return status;
6956 
6957 			offset = stat.st_size;
6958 			break;
6959 		}
6960 		default:
6961 			return B_BAD_VALUE;
6962 	}
6963 
6964 	// assumes off_t is 64 bits wide
6965 	if (offset > 0 && LONGLONG_MAX - offset < pos)
6966 		return B_BUFFER_OVERFLOW;
6967 
6968 	pos += offset;
6969 	if (pos < 0)
6970 		return B_BAD_VALUE;
6971 
6972 	return descriptor->pos = pos;
6973 }
6974 
6975 
6976 static status_t
6977 attr_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6978 {
6979 	struct vnode* vnode = descriptor->u.vnode;
6980 
6981 	FUNCTION(("attr_read_stat: stat 0x%p\n", stat));
6982 
6983 	if (!HAS_FS_CALL(vnode, read_attr_stat))
6984 		return B_UNSUPPORTED;
6985 
6986 	return FS_CALL(vnode, read_attr_stat, descriptor->cookie, stat);
6987 }
6988 
6989 
6990 static status_t
6991 attr_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6992 	int statMask)
6993 {
6994 	struct vnode* vnode = descriptor->u.vnode;
6995 
6996 	FUNCTION(("attr_write_stat: stat = %p, statMask %d\n", stat, statMask));
6997 
6998 	if (!HAS_FS_CALL(vnode, write_attr_stat))
6999 		return B_READ_ONLY_DEVICE;
7000 
7001 	return FS_CALL(vnode, write_attr_stat, descriptor->cookie, stat, statMask);
7002 }
7003 
7004 
7005 static status_t
7006 attr_remove(int fd, const char* name, bool kernel)
7007 {
7008 	struct file_descriptor* descriptor;
7009 	struct vnode* vnode;
7010 	status_t status;
7011 
7012 	if (name == NULL || *name == '\0')
7013 		return B_BAD_VALUE;
7014 
7015 	FUNCTION(("attr_remove: fd = %d, name = \"%s\", kernel %d\n", fd, name,
7016 		kernel));
7017 
7018 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
7019 	if (descriptor == NULL)
7020 		return B_FILE_ERROR;
7021 
7022 	if (HAS_FS_CALL(vnode, remove_attr))
7023 		status = FS_CALL(vnode, remove_attr, name);
7024 	else
7025 		status = B_READ_ONLY_DEVICE;
7026 
7027 	put_fd(descriptor);
7028 
7029 	return status;
7030 }
7031 
7032 
7033 static status_t
7034 attr_rename(int fromFD, const char* fromName, int toFD, const char* toName,
7035 	bool kernel)
7036 {
7037 	struct file_descriptor* fromDescriptor;
7038 	struct file_descriptor* toDescriptor;
7039 	struct vnode* fromVnode;
7040 	struct vnode* toVnode;
7041 	status_t status;
7042 
7043 	if (fromName == NULL || *fromName == '\0' || toName == NULL
7044 		|| *toName == '\0')
7045 		return B_BAD_VALUE;
7046 
7047 	FUNCTION(("attr_rename: from fd = %d, from name = \"%s\", to fd = %d, to "
7048 		"name = \"%s\", kernel %d\n", fromFD, fromName, toFD, toName, kernel));
7049 
7050 	fromDescriptor = get_fd_and_vnode(fromFD, &fromVnode, kernel);
7051 	if (fromDescriptor == NULL)
7052 		return B_FILE_ERROR;
7053 
7054 	toDescriptor = get_fd_and_vnode(toFD, &toVnode, kernel);
7055 	if (toDescriptor == NULL) {
7056 		status = B_FILE_ERROR;
7057 		goto err;
7058 	}
7059 
7060 	// are the files on the same volume?
7061 	if (fromVnode->device != toVnode->device) {
7062 		status = B_CROSS_DEVICE_LINK;
7063 		goto err1;
7064 	}
7065 
7066 	if (HAS_FS_CALL(fromVnode, rename_attr)) {
7067 		status = FS_CALL(fromVnode, rename_attr, fromName, toVnode, toName);
7068 	} else
7069 		status = B_READ_ONLY_DEVICE;
7070 
7071 err1:
7072 	put_fd(toDescriptor);
7073 err:
7074 	put_fd(fromDescriptor);
7075 
7076 	return status;
7077 }
7078 
7079 
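/*!	Opens the index directory of the volume given by \a mountID and
	returns a new file descriptor for it, or an error code.
*/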
7080 static int
7081 index_dir_open(dev_t mountID, bool kernel)
7082 {
7083 	struct fs_mount* mount;
7084 	void* cookie;
7085 
7086 	FUNCTION(("index_dir_open(mountID = %" B_PRId32 ", kernel = %d)\n", mountID,
7087 		kernel));
7088 
7089 	status_t status = get_mount(mountID, &mount);
7090 	if (status != B_OK)
7091 		return status;
7092 
7093 	if (!HAS_FS_MOUNT_CALL(mount, open_index_dir)) {
7094 		status = B_UNSUPPORTED;
7095 		goto error;
7096 	}
7097 
7098 	status = FS_MOUNT_CALL(mount, open_index_dir, &cookie);
7099 	if (status != B_OK)
7100 		goto error;
7101 
7102 	// get fd for the index directory
7103 	int fd;
7104 	fd = get_new_fd(FDTYPE_INDEX_DIR, mount, NULL, cookie, O_CLOEXEC, kernel);
7105 	if (fd >= 0)
7106 		return fd;
7107 
7108 	// something went wrong
7109 	FS_MOUNT_CALL(mount, close_index_dir, cookie);
7110 	FS_MOUNT_CALL(mount, free_index_dir_cookie, cookie);
7111 
7112 	status = fd;
7113 
7114 error:
7115 	put_mount(mount);
7116 	return status;
7117 }
7118 
7119 
7120 static status_t
7121 index_dir_close(struct file_descriptor* descriptor)
7122 {
7123 	struct fs_mount* mount = descriptor->u.mount;
7124 
7125 	FUNCTION(("index_dir_close(descriptor = %p)\n", descriptor));
7126 
7127 	if (HAS_FS_MOUNT_CALL(mount, close_index_dir))
7128 		return FS_MOUNT_CALL(mount, close_index_dir, descriptor->cookie);
7129 
7130 	return B_OK;
7131 }
7132 
7133 
7134 static void
7135 index_dir_free_fd(struct file_descriptor* descriptor)
7136 {
7137 	struct fs_mount* mount = descriptor->u.mount;
7138 
7139 	if (mount != NULL) {
7140 		FS_MOUNT_CALL(mount, free_index_dir_cookie, descriptor->cookie);
7141 		put_mount(mount);
7142 	}
7143 }
7144 
7145 
7146 static status_t
7147 index_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7148 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7149 {
7150 	struct fs_mount* mount = descriptor->u.mount;
7151 
7152 	if (HAS_FS_MOUNT_CALL(mount, read_index_dir)) {
7153 		return FS_MOUNT_CALL(mount, read_index_dir, descriptor->cookie, buffer,
7154 			bufferSize, _count);
7155 	}
7156 
7157 	return B_UNSUPPORTED;
7158 }
7159 
7160 
7161 static status_t
7162 index_dir_rewind(struct file_descriptor* descriptor)
7163 {
7164 	struct fs_mount* mount = descriptor->u.mount;
7165 
7166 	if (HAS_FS_MOUNT_CALL(mount, rewind_index_dir))
7167 		return FS_MOUNT_CALL(mount, rewind_index_dir, descriptor->cookie);
7168 
7169 	return B_UNSUPPORTED;
7170 }
7171 
7172 
7173 static status_t
7174 index_create(dev_t mountID, const char* name, uint32 type, uint32 flags,
7175 	bool kernel)
7176 {
7177 	FUNCTION(("index_create(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7178 		mountID, name, kernel));
7179 
7180 	struct fs_mount* mount;
7181 	status_t status = get_mount(mountID, &mount);
7182 	if (status != B_OK)
7183 		return status;
7184 
7185 	if (!HAS_FS_MOUNT_CALL(mount, create_index)) {
7186 		status = B_READ_ONLY_DEVICE;
7187 		goto out;
7188 	}
7189 
7190 	status = FS_MOUNT_CALL(mount, create_index, name, type, flags);
7191 
7192 out:
7193 	put_mount(mount);
7194 	return status;
7195 }
7196 
7197 
7198 #if 0
7199 static status_t
7200 index_read_stat(struct file_descriptor* descriptor, struct stat* stat)
7201 {
7202 	struct vnode* vnode = descriptor->u.vnode;
7203 
7204 	// ToDo: currently unused!
7205 	FUNCTION(("index_read_stat: stat 0x%p\n", stat));
7206 	if (!HAS_FS_CALL(vnode, read_index_stat))
7207 		return B_UNSUPPORTED;
7208 
7209 	return B_UNSUPPORTED;
7210 	//return FS_CALL(vnode, read_index_stat, descriptor->cookie, stat);
7211 }
7212 
7213 
7214 static void
7215 index_free_fd(struct file_descriptor* descriptor)
7216 {
7217 	struct vnode* vnode = descriptor->u.vnode;
7218 
7219 	if (vnode != NULL) {
7220 		FS_CALL(vnode, free_index_cookie, descriptor->cookie);
7221 		put_vnode(vnode);
7222 	}
7223 }
7224 #endif
7225 
7226 
7227 static status_t
7228 index_name_read_stat(dev_t mountID, const char* name, struct stat* stat,
7229 	bool kernel)
7230 {
7231 	FUNCTION(("index_remove(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7232 		mountID, name, kernel));
7233 
7234 	struct fs_mount* mount;
7235 	status_t status = get_mount(mountID, &mount);
7236 	if (status != B_OK)
7237 		return status;
7238 
7239 	if (!HAS_FS_MOUNT_CALL(mount, read_index_stat)) {
7240 		status = B_UNSUPPORTED;
7241 		goto out;
7242 	}
7243 
7244 	status = FS_MOUNT_CALL(mount, read_index_stat, name, stat);
7245 
7246 out:
7247 	put_mount(mount);
7248 	return status;
7249 }
7250 
7251 
7252 static status_t
7253 index_remove(dev_t mountID, const char* name, bool kernel)
7254 {
7255 	FUNCTION(("index_remove(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7256 		mountID, name, kernel));
7257 
7258 	struct fs_mount* mount;
7259 	status_t status = get_mount(mountID, &mount);
7260 	if (status != B_OK)
7261 		return status;
7262 
7263 	if (!HAS_FS_MOUNT_CALL(mount, remove_index)) {
7264 		status = B_READ_ONLY_DEVICE;
7265 		goto out;
7266 	}
7267 
7268 	status = FS_MOUNT_CALL(mount, remove_index, name);
7269 
7270 out:
7271 	put_mount(mount);
7272 	return status;
7273 }
7274 
7275 
7276 /*!	TODO: the query FS API is still pretty much the same as in R5.
7277 		It would be nice if queries got some more kernel support.
7279 		For example, query parsing should be moved into the kernel.
7280 */
7281 static int
7282 query_open(dev_t device, const char* query, uint32 flags, port_id port,
7283 	int32 token, bool kernel)
7284 {
7285 	struct fs_mount* mount;
7286 	void* cookie;
7287 
7288 	FUNCTION(("query_open(device = %" B_PRId32 ", query = \"%s\", kernel = %d)\n",
7289 		device, query, kernel));
7290 
7291 	status_t status = get_mount(device, &mount);
7292 	if (status != B_OK)
7293 		return status;
7294 
7295 	if (!HAS_FS_MOUNT_CALL(mount, open_query)) {
7296 		status = B_UNSUPPORTED;
7297 		goto error;
7298 	}
7299 
7300 	status = FS_MOUNT_CALL(mount, open_query, query, flags, port, token,
7301 		&cookie);
7302 	if (status != B_OK)
7303 		goto error;
7304 
7305 	// get fd for the query
7306 	int fd;
7307 	fd = get_new_fd(FDTYPE_QUERY, mount, NULL, cookie, O_CLOEXEC, kernel);
7308 	if (fd >= 0)
7309 		return fd;
7310 
7311 	status = fd;
7312 
7313 	// something went wrong
7314 	FS_MOUNT_CALL(mount, close_query, cookie);
7315 	FS_MOUNT_CALL(mount, free_query_cookie, cookie);
7316 
7317 error:
7318 	put_mount(mount);
7319 	return status;
7320 }
7321 
7322 
7323 static status_t
7324 query_close(struct file_descriptor* descriptor)
7325 {
7326 	struct fs_mount* mount = descriptor->u.mount;
7327 
7328 	FUNCTION(("query_close(descriptor = %p)\n", descriptor));
7329 
7330 	if (HAS_FS_MOUNT_CALL(mount, close_query))
7331 		return FS_MOUNT_CALL(mount, close_query, descriptor->cookie);
7332 
7333 	return B_OK;
7334 }
7335 
7336 
7337 static void
7338 query_free_fd(struct file_descriptor* descriptor)
7339 {
7340 	struct fs_mount* mount = descriptor->u.mount;
7341 
7342 	if (mount != NULL) {
7343 		FS_MOUNT_CALL(mount, free_query_cookie, descriptor->cookie);
7344 		put_mount(mount);
7345 	}
7346 }
7347 
7348 
7349 static status_t
7350 query_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7351 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7352 {
7353 	struct fs_mount* mount = descriptor->u.mount;
7354 
7355 	if (HAS_FS_MOUNT_CALL(mount, read_query)) {
7356 		return FS_MOUNT_CALL(mount, read_query, descriptor->cookie, buffer,
7357 			bufferSize, _count);
7358 	}
7359 
7360 	return B_UNSUPPORTED;
7361 }
7362 
7363 
7364 static status_t
7365 query_rewind(struct file_descriptor* descriptor)
7366 {
7367 	struct fs_mount* mount = descriptor->u.mount;
7368 
7369 	if (HAS_FS_MOUNT_CALL(mount, rewind_query))
7370 		return FS_MOUNT_CALL(mount, rewind_query, descriptor->cookie);
7371 
7372 	return B_UNSUPPORTED;
7373 }
7374 
7375 
7376 //	#pragma mark - General File System functions
7377 
7378 
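/*!	Mounts the file system \a fsName (or, if NULL, the one recognized by
	the disk device manager) from \a device at \a path. The very first
	mount must be the root ("/"). Returns the new mount's dev_t ID, or an
	error code.
*/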
7379 static dev_t
7380 fs_mount(char* path, const char* device, const char* fsName, uint32 flags,
7381 	const char* args, bool kernel)
7382 {
7383 	struct ::fs_mount* mount;
7384 	status_t status = B_OK;
7385 	fs_volume* volume = NULL;
7386 	int32 layer = 0;
7387 	Vnode* coveredNode = NULL;
7388 
7389 	FUNCTION(("fs_mount: path = '%s', device = '%s', fs_name = '%s', flags = %#"
7390 		B_PRIx32 ", args = '%s'\n", path, device, fsName, flags, args));
7391 
7392 	// The path is always safe; we just have to make sure that fsName is at
7393 	// least superficially valid - we can't make any assumptions about args.
7394 	// A NULL fsName is OK, if a device was given and the FS is not virtual.
7395 	// We'll get it from the DDM later.
7396 	if (fsName == NULL) {
7397 		if (device == NULL || (flags & B_MOUNT_VIRTUAL_DEVICE) != 0)
7398 			return B_BAD_VALUE;
7399 	} else if (fsName[0] == '\0')
7400 		return B_BAD_VALUE;
7401 
7402 	RecursiveLocker mountOpLocker(sMountOpLock);
7403 
7404 	// Helper to delete a newly created file device on failure.
7405 	// Not exactly beautiful, but helps to keep the code below cleaner.
7406 	struct FileDeviceDeleter {
7407 		FileDeviceDeleter() : id(-1) {}
7408 		~FileDeviceDeleter()
7409 		{
7410 			KDiskDeviceManager::Default()->DeleteFileDevice(id);
7411 		}
7412 
7413 		partition_id id;
7414 	} fileDeviceDeleter;
7415 
7416 	// If the file system is not a "virtual" one, the device argument should
7417 	// point to a real file/device (if given at all).
7418 	// get the partition
7419 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7420 	KPartition* partition = NULL;
7421 	KPath normalizedDevice;
7422 	bool newlyCreatedFileDevice = false;
7423 
7424 	if (!(flags & B_MOUNT_VIRTUAL_DEVICE) && device != NULL) {
7425 		// normalize the device path
7426 		status = normalizedDevice.SetTo(device, true);
7427 		if (status != B_OK)
7428 			return status;
7429 
7430 		// get a corresponding partition from the DDM
7431 		partition = ddm->RegisterPartition(normalizedDevice.Path());
7432 		if (partition == NULL) {
7433 			// Partition not found: This either means the user supplied
7434 			// an invalid path, or the path refers to an image file. We try
7435 			// to let the DDM create a file device for the path.
7436 			partition_id deviceID = ddm->CreateFileDevice(
7437 				normalizedDevice.Path(), &newlyCreatedFileDevice);
7438 			if (deviceID >= 0) {
7439 				partition = ddm->RegisterPartition(deviceID);
7440 				if (newlyCreatedFileDevice)
7441 					fileDeviceDeleter.id = deviceID;
7442 			}
7443 		}
7444 
7445 		if (!partition) {
7446 			TRACE(("fs_mount(): Partition `%s' not found.\n",
7447 				normalizedDevice.Path()));
7448 			return B_ENTRY_NOT_FOUND;
7449 		}
7450 
7451 		device = normalizedDevice.Path();
7452 			// correct path to file device
7453 	}
7454 	PartitionRegistrar partitionRegistrar(partition, true);
7455 
7456 	// Write lock the partition's device. For the time being, we keep the lock
7457 	// until we're done mounting -- not nice, but it ensures that no-one
7458 	// interferes.
7459 	// TODO: Just mark the partition busy while mounting!
7460 	KDiskDevice* diskDevice = NULL;
7461 	if (partition) {
7462 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7463 		if (!diskDevice) {
7464 			TRACE(("fs_mount(): Failed to lock disk device!\n"));
7465 			return B_ERROR;
7466 		}
7467 	}
7468 
7469 	DeviceWriteLocker writeLocker(diskDevice, true);
7470 		// this takes over the write lock acquired before
7471 
7472 	if (partition != NULL) {
7473 		// make sure that the partition is not busy
7474 		if (partition->IsBusy()) {
7475 			TRACE(("fs_mount(): Partition is busy.\n"));
7476 			return B_BUSY;
7477 		}
7478 
7479 		// if no FS name had been supplied, we get it from the partition
7480 		if (fsName == NULL) {
7481 			KDiskSystem* diskSystem = partition->DiskSystem();
7482 			if (!diskSystem) {
7483 				TRACE(("fs_mount(): No FS name was given, and the DDM didn't "
7484 					"recognize it.\n"));
7485 				return B_BAD_VALUE;
7486 			}
7487 
7488 			if (!diskSystem->IsFileSystem()) {
7489 				TRACE(("fs_mount(): No FS name was given, and the DDM found a "
7490 					"partitioning system.\n"));
7491 				return B_BAD_VALUE;
7492 			}
7493 
7494 			// The disk system name will not change, and the KDiskSystem
7495 			// object will not go away while the disk device is locked (and
7496 			// the partition has a reference to it), so this is safe.
7497 			fsName = diskSystem->Name();
7498 		}
7499 	}
7500 
7501 	mount = new(std::nothrow) (struct ::fs_mount);
7502 	if (mount == NULL)
7503 		return B_NO_MEMORY;
7504 
7505 	mount->device_name = strdup(device);
7506 		// "device" can be NULL
7507 
7508 	status = mount->entry_cache.Init();
7509 	if (status != B_OK)
7510 		goto err1;
7511 
7512 	// initialize structure
7513 	mount->id = sNextMountID++;
7514 	mount->partition = NULL;
7515 	mount->root_vnode = NULL;
7516 	mount->covers_vnode = NULL;
7517 	mount->unmounting = false;
7518 	mount->owns_file_device = false;
7519 	mount->volume = NULL;
7520 
7521 	// build up the volume(s)
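	// get_file_system_name_for_layer() yields one FS name per layer of a
	// layered file system specification; the resulting fs_volumes are
	// chained via super_volume/sub_volume, with the last layer created
	// ending up as mount->volume.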
7522 	while (true) {
7523 		char* layerFSName = get_file_system_name_for_layer(fsName, layer);
7524 		if (layerFSName == NULL) {
7525 			if (layer == 0) {
7526 				status = B_NO_MEMORY;
7527 				goto err1;
7528 			}
7529 
7530 			break;
7531 		}
7532 		MemoryDeleter layerFSNameDeleter(layerFSName);
7533 
7534 		volume = (fs_volume*)malloc(sizeof(fs_volume));
7535 		if (volume == NULL) {
7536 			status = B_NO_MEMORY;
7537 			goto err1;
7538 		}
7539 
7540 		volume->id = mount->id;
7541 		volume->partition = partition != NULL ? partition->ID() : -1;
7542 		volume->layer = layer++;
7543 		volume->private_volume = NULL;
7544 		volume->ops = NULL;
7545 		volume->sub_volume = NULL;
7546 		volume->super_volume = NULL;
7547 		volume->file_system = NULL;
7548 		volume->file_system_name = NULL;
7549 
7550 		volume->file_system_name = get_file_system_name(layerFSName);
7551 		if (volume->file_system_name == NULL) {
7552 			status = B_NO_MEMORY;
7553 			free(volume);
7554 			goto err1;
7555 		}
7556 
7557 		volume->file_system = get_file_system(layerFSName);
7558 		if (volume->file_system == NULL) {
7559 			status = B_DEVICE_NOT_FOUND;
7560 			free(volume->file_system_name);
7561 			free(volume);
7562 			goto err1;
7563 		}
7564 
7565 		if (mount->volume == NULL)
7566 			mount->volume = volume;
7567 		else {
7568 			volume->super_volume = mount->volume;
7569 			mount->volume->sub_volume = volume;
7570 			mount->volume = volume;
7571 		}
7572 	}
7573 
7574 	// insert mount struct into list before we call FS's mount() function
7575 	// so that vnodes can be created for this mount
7576 	mutex_lock(&sMountMutex);
7577 	sMountsTable->Insert(mount);
7578 	mutex_unlock(&sMountMutex);
7579 
7580 	ino_t rootID;
7581 
7582 	if (!sRoot) {
7583 		// we haven't mounted anything yet
7584 		if (strcmp(path, "/") != 0) {
7585 			status = B_ERROR;
7586 			goto err2;
7587 		}
7588 
7589 		status = mount->volume->file_system->mount(mount->volume, device, flags,
7590 			args, &rootID);
7591 		if (status != B_OK || mount->volume->ops == NULL)
7592 			goto err2;
7593 	} else {
7594 		status = path_to_vnode(path, true, &coveredNode, NULL, kernel);
7595 		if (status != B_OK)
7596 			goto err2;
7597 
7598 		mount->covers_vnode = coveredNode;
7599 
7600 		// make sure coveredNode is a directory
7601 		if (!S_ISDIR(coveredNode->Type())) {
7602 			status = B_NOT_A_DIRECTORY;
7603 			goto err3;
7604 		}
7605 
7606 		if (coveredNode->IsCovered()) {
7607 			// this is already a covered vnode
7608 			status = B_BUSY;
7609 			goto err3;
7610 		}
7611 
7612 		// mount it/them
7613 		fs_volume* volume = mount->volume;
7614 		while (volume) {
7615 			status = volume->file_system->mount(volume, device, flags, args,
7616 				&rootID);
7617 			if (status != B_OK || volume->ops == NULL) {
7618 				if (status == B_OK && volume->ops == NULL)
7619 					panic("fs_mount: mount() succeeded but ops is NULL!");
7620 				if (volume->sub_volume)
7621 					goto err4;
7622 				goto err3;
7623 			}
7624 
7625 			volume = volume->super_volume;
7626 		}
7627 
7628 		volume = mount->volume;
7629 		while (volume) {
7630 			if (volume->ops->all_layers_mounted != NULL)
7631 				volume->ops->all_layers_mounted(volume);
7632 			volume = volume->super_volume;
7633 		}
7634 	}
7635 
7636 	// the root node is supposed to be owned by the file system - it must
7637 	// exist at this point
7638 	mount->root_vnode = lookup_vnode(mount->id, rootID);
7639 	if (mount->root_vnode == NULL || mount->root_vnode->ref_count != 1) {
7640 		panic("fs_mount: file system does not own its root node!\n");
7641 		status = B_ERROR;
7642 		goto err4;
7643 	}
7644 
7645 	// set up the links between the root vnode and the vnode it covers
7646 	rw_lock_write_lock(&sVnodeLock);
7647 	if (coveredNode != NULL) {
7648 		if (coveredNode->IsCovered()) {
7649 			// the vnode is covered now
7650 			status = B_BUSY;
7651 			rw_lock_write_unlock(&sVnodeLock);
7652 			goto err4;
7653 		}
7654 
7655 		mount->root_vnode->covers = coveredNode;
7656 		mount->root_vnode->SetCovering(true);
7657 
7658 		coveredNode->covered_by = mount->root_vnode;
7659 		coveredNode->SetCovered(true);
7660 	}
7661 	rw_lock_write_unlock(&sVnodeLock);
7662 
7663 	if (!sRoot) {
7664 		sRoot = mount->root_vnode;
7665 		mutex_lock(&sIOContextRootLock);
7666 		get_current_io_context(true)->root = sRoot;
7667 		mutex_unlock(&sIOContextRootLock);
7668 		inc_vnode_ref_count(sRoot);
7669 	}
7670 
7671 	// supply the partition (if any) with the mount cookie and mark it mounted
7672 	if (partition) {
7673 		partition->SetMountCookie(mount->volume->private_volume);
7674 		partition->SetVolumeID(mount->id);
7675 
7676 		// keep a partition reference as long as the partition is mounted
7677 		partitionRegistrar.Detach();
7678 		mount->partition = partition;
7679 		mount->owns_file_device = newlyCreatedFileDevice;
7680 		fileDeviceDeleter.id = -1;
7681 	}
7682 
7683 	notify_mount(mount->id,
7684 		coveredNode != NULL ? coveredNode->device : -1,
7685 		coveredNode ? coveredNode->id : -1);
7686 
7687 	return mount->id;
7688 
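	// The error labels unwind in reverse order of construction: err4
	// unmounts the file system, err3 puts the covered vnode, err2 removes
	// the mount from the hash table, and err1 frees the mount structure.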
7689 err4:
7690 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7691 err3:
7692 	if (coveredNode != NULL)
7693 		put_vnode(coveredNode);
7694 err2:
7695 	mutex_lock(&sMountMutex);
7696 	sMountsTable->Remove(mount);
7697 	mutex_unlock(&sMountMutex);
7698 err1:
7699 	delete mount;
7700 
7701 	return status;
7702 }
7703 
7704 
7705 static status_t
7706 fs_unmount(char* path, dev_t mountID, uint32 flags, bool kernel)
7707 {
7708 	struct fs_mount* mount;
7709 	status_t err;
7710 
7711 	FUNCTION(("fs_unmount(path '%s', dev %" B_PRId32 ", kernel %d)\n", path,
7712 		mountID, kernel));
7713 
7714 	struct vnode* pathVnode = NULL;
7715 	if (path != NULL) {
7716 		err = path_to_vnode(path, true, &pathVnode, NULL, kernel);
7717 		if (err != B_OK)
7718 			return B_ENTRY_NOT_FOUND;
7719 	}
7720 
7721 	RecursiveLocker mountOpLocker(sMountOpLock);
7722 
7723 	// This lock is not strictly necessary, but it is held in the KDEBUG
7724 	// case to keep the ASSERT in find_mount() working.
7725 	KDEBUG_ONLY(mutex_lock(&sMountMutex));
7726 	mount = find_mount(path != NULL ? pathVnode->device : mountID);
7727 	KDEBUG_ONLY(mutex_unlock(&sMountMutex));
7728 	if (mount == NULL) {
7729 		panic("fs_unmount: find_mount() failed on root vnode @%p of mount\n",
7730 			pathVnode);
7731 	}
7732 
7733 	if (path != NULL) {
7734 		put_vnode(pathVnode);
7735 
7736 		if (mount->root_vnode != pathVnode) {
7737 			// not mountpoint
7738 			return B_BAD_VALUE;
7739 		}
7740 	}
7741 
7742 	// if the volume is associated with a partition, lock the device of the
7743 	// partition as long as we are unmounting
7744 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7745 	KPartition* partition = mount->partition;
7746 	KDiskDevice* diskDevice = NULL;
7747 	if (partition != NULL) {
7748 		if (partition->Device() == NULL) {
7749 			dprintf("fs_unmount(): There is no device!\n");
7750 			return B_ERROR;
7751 		}
7752 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7753 		if (!diskDevice) {
7754 			TRACE(("fs_unmount(): Failed to lock disk device!\n"));
7755 			return B_ERROR;
7756 		}
7757 	}
7758 	DeviceWriteLocker writeLocker(diskDevice, true);
7759 
7760 	// make sure that the partition is not busy
7761 	if (partition != NULL) {
7762 		if ((flags & B_UNMOUNT_BUSY_PARTITION) == 0 && partition->IsBusy()) {
7763 			TRACE(("fs_unmount(): Partition is busy.\n"));
7764 			return B_BUSY;
7765 		}
7766 	}
7767 
7768 	// grab the vnode lock for writing to keep someone from creating
7769 	// a vnode while we're figuring out if we can continue
7770 	WriteLocker vnodesWriteLocker(&sVnodeLock);
7771 
7772 	bool disconnectedDescriptors = false;
7773 
7774 	while (true) {
7775 		bool busy = false;
7776 
7777 		// cycle through the list of vnodes associated with this mount and
7778 		// make sure none of them is busy or still referenced
7779 		VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7780 		while (struct vnode* vnode = iterator.Next()) {
7781 			if (vnode->IsBusy()) {
7782 				busy = true;
7783 				break;
7784 			}
7785 
7786 			// check the vnode's ref count -- subtract additional references for
7787 			// covering
7788 			int32 refCount = vnode->ref_count;
7789 			if (vnode->covers != NULL)
7790 				refCount--;
7791 			if (vnode->covered_by != NULL)
7792 				refCount--;
7793 
7794 			if (refCount != 0) {
7795 				// there are still vnodes in use on this mount, so we cannot
7796 				// unmount yet
7797 				busy = true;
7798 				break;
7799 			}
7800 		}
7801 
7802 		if (!busy)
7803 			break;
7804 
7805 		if ((flags & B_FORCE_UNMOUNT) == 0)
7806 			return B_BUSY;
7807 
7808 		if (disconnectedDescriptors) {
7809 			// wait a bit until the last access is finished, and then try again
7810 			vnodesWriteLocker.Unlock();
7811 			snooze(100000);
7812 			// TODO: if there is some kind of bug that prevents the ref counts
7813 			// from getting back to zero, this will fall into an endless loop...
7814 			vnodesWriteLocker.Lock();
7815 			continue;
7816 		}
7817 
7818 		// the file system is still busy - but we're forced to unmount it,
7819 		// so let's disconnect all open file descriptors
7820 
7821 		mount->unmounting = true;
7822 			// prevent new vnodes from being created
7823 
7824 		vnodesWriteLocker.Unlock();
7825 
7826 		disconnect_mount_or_vnode_fds(mount, NULL);
7827 		disconnectedDescriptors = true;
7828 
7829 		vnodesWriteLocker.Lock();
7830 	}
7831 
7832 	// We can safely continue. Mark all of the vnodes busy and this mount
7833 	// structure in unmounting state. Also undo the vnode covers/covered_by
7834 	// links.
7835 	mount->unmounting = true;
7836 
7837 	VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7838 	while (struct vnode* vnode = iterator.Next()) {
7839 		// Remove all covers/covered_by links from other mounts' nodes to this
7840 		// vnode and adjust the node ref count accordingly. We will release the
7841 		// references to the external vnodes below.
7842 		if (Vnode* coveredNode = vnode->covers) {
7843 			if (Vnode* coveringNode = vnode->covered_by) {
7844 				// We have both covered and covering vnodes, so just remove us
7845 				// from the chain.
7846 				coveredNode->covered_by = coveringNode;
7847 				coveringNode->covers = coveredNode;
7848 				vnode->ref_count -= 2;
7849 
7850 				vnode->covered_by = NULL;
7851 				vnode->covers = NULL;
7852 				vnode->SetCovering(false);
7853 				vnode->SetCovered(false);
7854 			} else {
7855 				// We only have a covered vnode. Remove its link to us.
7856 				coveredNode->covered_by = NULL;
7857 				coveredNode->SetCovered(false);
7858 				vnode->ref_count--;
7859 
7860 				// If the other node is an external vnode, we keep the link
7861 				// around so we can put the reference later on. Otherwise we
7862 				// get rid of it right now.
7863 				if (coveredNode->mount == mount) {
7864 					vnode->covers = NULL;
7865 					coveredNode->ref_count--;
7866 				}
7867 			}
7868 		} else if (Vnode* coveringNode = vnode->covered_by) {
7869 			// We only have a covering vnode. Remove its link to us.
7870 			coveringNode->covers = NULL;
7871 			coveringNode->SetCovering(false);
7872 			vnode->ref_count--;
7873 
7874 			// If the other node is an external vnode, we keep the link
7875 			// around so we can put the reference later on. Otherwise we
7876 			// get rid of it right now.
7877 			if (coveringNode->mount == mount) {
7878 				vnode->covered_by = NULL;
7879 				coveringNode->ref_count--;
7880 			}
7881 		}
7882 
7883 		vnode->SetBusy(true);
7884 		vnode_to_be_freed(vnode);
7885 	}
7886 
7887 	vnodesWriteLocker.Unlock();
7888 
7889 	// Free all vnodes associated with this mount.
7890 	// They will be removed from the mount list by free_vnode(), so
7891 	// we don't have to do this.
7892 	while (struct vnode* vnode = mount->vnodes.Head()) {
7893 		// Put the references to external covered/covering vnodes we kept above.
7894 		if (Vnode* coveredNode = vnode->covers)
7895 			put_vnode(coveredNode);
7896 		if (Vnode* coveringNode = vnode->covered_by)
7897 			put_vnode(coveringNode);
7898 
7899 		free_vnode(vnode, false);
7900 	}
7901 
7902 	// remove the mount structure from the hash table
7903 	mutex_lock(&sMountMutex);
7904 	sMountsTable->Remove(mount);
7905 	mutex_unlock(&sMountMutex);
7906 
7907 	mountOpLocker.Unlock();
7908 
7909 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7910 	notify_unmount(mount->id);
7911 
7912 	// dereference the partition and mark it unmounted
7913 	if (partition) {
7914 		partition->SetVolumeID(-1);
7915 		partition->SetMountCookie(NULL);
7916 
7917 		if (mount->owns_file_device)
7918 			KDiskDeviceManager::Default()->DeleteFileDevice(partition->ID());
7919 		partition->Unregister();
7920 	}
7921 
7922 	delete mount;
7923 	return B_OK;
7924 }
7925 
7926 
7927 static status_t
7928 fs_sync(dev_t device)
7929 {
7930 	struct fs_mount* mount;
7931 	status_t status = get_mount(device, &mount);
7932 	if (status != B_OK)
7933 		return status;
7934 
7935 	struct vnode marker;
7936 	memset(&marker, 0, sizeof(marker));
7937 	marker.SetBusy(true);
7938 	marker.SetRemoved(true);
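	// The marker is a stack-allocated dummy vnode that is temporarily
	// inserted into the mount's vnode list so we can remember our
	// iteration position while sVnodeLock and fs_mount::lock are dropped
	// to write back a node's cache.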
7939 
7940 	// First, synchronize all file caches
7941 
7942 	while (true) {
7943 		WriteLocker locker(sVnodeLock);
7944 			// Note: That's the easy way, which is probably OK for sync(),
7945 			// since it's a relatively rare call and doesn't need to allow for
7946 			// much concurrency. Using a read lock would be possible, but also
7947 			// more involved, since we would have to lock the individual nodes
7948 			// and take care of the locking order, which we might not want to
7949 			// do while holding fs_mount::lock.
7950 
7951 		// synchronize access to vnode list
7952 		mutex_lock(&mount->lock);
7953 
7954 		struct vnode* vnode;
7955 		if (!marker.IsRemoved()) {
7956 			vnode = mount->vnodes.GetNext(&marker);
7957 			mount->vnodes.Remove(&marker);
7958 			marker.SetRemoved(true);
7959 		} else
7960 			vnode = mount->vnodes.First();
7961 
7962 		while (vnode != NULL && (vnode->cache == NULL
7963 			|| vnode->IsRemoved() || vnode->IsBusy())) {
7964 			// TODO: we could track writes (and writable mapped vnodes)
7965 			//	and have a simple flag that we could test for here
7966 			vnode = mount->vnodes.GetNext(vnode);
7967 		}
7968 
7969 		if (vnode != NULL) {
7970 			// insert marker vnode again
7971 			mount->vnodes.Insert(mount->vnodes.GetNext(vnode), &marker);
7972 			marker.SetRemoved(false);
7973 		}
7974 
7975 		mutex_unlock(&mount->lock);
7976 
7977 		if (vnode == NULL)
7978 			break;
7979 
7980 		vnode = lookup_vnode(mount->id, vnode->id);
7981 		if (vnode == NULL || vnode->IsBusy())
7982 			continue;
7983 
7984 		if (vnode->ref_count == 0) {
7985 			// this vnode has been unused before
7986 			vnode_used(vnode);
7987 		}
7988 		inc_vnode_ref_count(vnode);
7989 
7990 		locker.Unlock();
7991 
7992 		if (vnode->cache != NULL && !vnode->IsRemoved())
7993 			vnode->cache->WriteModified();
7994 
7995 		put_vnode(vnode);
7996 	}
7997 
7998 	// Let the file systems do their synchronizing work
7999 	if (HAS_FS_MOUNT_CALL(mount, sync))
8000 		status = FS_MOUNT_CALL_NO_PARAMS(mount, sync);
8001 
8002 	// Finally, flush the underlying device's write cache (if possible).
8003 	if (mount->partition != NULL && mount->partition->Device() != NULL)
8004 		ioctl(mount->partition->Device()->FD(), B_FLUSH_DRIVE_CACHE);
8005 
8006 	put_mount(mount);
8007 	return status;
8008 }
8009 
8010 
8011 static status_t
8012 fs_read_info(dev_t device, struct fs_info* info)
8013 {
8014 	struct fs_mount* mount;
8015 	status_t status = get_mount(device, &mount);
8016 	if (status != B_OK)
8017 		return status;
8018 
8019 	memset(info, 0, sizeof(struct fs_info));
8020 
8021 	if (HAS_FS_MOUNT_CALL(mount, read_fs_info))
8022 		status = FS_MOUNT_CALL(mount, read_fs_info, info);
8023 
8024 	// fill in info the file system doesn't (have to) know about
8025 	if (status == B_OK) {
8026 		info->dev = mount->id;
8027 		info->root = mount->root_vnode->id;
8028 
8029 		fs_volume* volume = mount->volume;
8030 		while (volume->super_volume != NULL)
8031 			volume = volume->super_volume;
8032 
8033 		strlcpy(info->fsh_name, volume->file_system_name,
8034 			sizeof(info->fsh_name));
8035 		if (mount->device_name != NULL) {
8036 			strlcpy(info->device_name, mount->device_name,
8037 				sizeof(info->device_name));
8038 		}
8039 	}
8040 
8041 	// if the call is not supported by the file system, there are still
8042 	// the parts that we filled out ourselves
8043 
8044 	put_mount(mount);
8045 	return status;
8046 }
8047 
8048 
8049 static status_t
8050 fs_write_info(dev_t device, const struct fs_info* info, int mask)
8051 {
8052 	struct fs_mount* mount;
8053 	status_t status = get_mount(device, &mount);
8054 	if (status != B_OK)
8055 		return status;
8056 
8057 	if (HAS_FS_MOUNT_CALL(mount, write_fs_info))
8058 		status = FS_MOUNT_CALL(mount, write_fs_info, info, mask);
8059 	else
8060 		status = B_READ_ONLY_DEVICE;
8061 
8062 	put_mount(mount);
8063 	return status;
8064 }
8065 
8066 
8067 static dev_t
8068 fs_next_device(int32* _cookie)
8069 {
8070 	struct fs_mount* mount = NULL;
8071 	dev_t device = *_cookie;
8072 
8073 	mutex_lock(&sMountMutex);
8074 
8075 	// Since device IDs are assigned sequentially, this algorithm
8076 	// works well enough. It makes sure that the device list
8077 	// returned is sorted, and that no device is skipped when an
8078 	// already visited device gets unmounted.
8079 
8080 	while (device < sNextMountID) {
8081 		mount = find_mount(device++);
8082 		if (mount != NULL && mount->volume->private_volume != NULL)
8083 			break;
8084 	}
8085 
8086 	*_cookie = device;
8087 
8088 	if (mount != NULL)
8089 		device = mount->id;
8090 	else
8091 		device = B_BAD_VALUE;
8092 
8093 	mutex_unlock(&sMountMutex);
8094 
8095 	return device;
8096 }
8097 
8098 
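/*!	\brief Reads an attribute of the node referred to by \a fd.

	Convenience wrapper: the attribute is opened read-only, up to
	\a readBytes bytes are read into \a buffer starting at \a pos, and the
	attribute FD is closed again.

	\return The number of bytes actually read or an error code.
*/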
8099 ssize_t
8100 fs_read_attr(int fd, const char *attribute, uint32 type, off_t pos,
8101 	void *buffer, size_t readBytes)
8102 {
8103 	int attrFD = attr_open(fd, NULL, attribute, O_RDONLY, true);
8104 	if (attrFD < 0)
8105 		return attrFD;
8106 
8107 	ssize_t bytesRead = _kern_read(attrFD, pos, buffer, readBytes);
8108 
8109 	_kern_close(attrFD);
8110 
8111 	return bytesRead;
8112 }
8113 
8114 
8115 static status_t
8116 get_cwd(char* buffer, size_t size, bool kernel)
8117 {
8118 	// Get current working directory from io context
8119 	struct io_context* context = get_current_io_context(kernel);
8120 	status_t status;
8121 
8122 	FUNCTION(("vfs_get_cwd: buf %p, size %ld\n", buffer, size));
8123 
8124 	mutex_lock(&context->io_mutex);
8125 
8126 	struct vnode* vnode = context->cwd;
8127 	if (vnode)
8128 		inc_vnode_ref_count(vnode);
8129 
8130 	mutex_unlock(&context->io_mutex);
8131 
8132 	if (vnode) {
8133 		status = dir_vnode_to_path(vnode, buffer, size, kernel);
8134 		put_vnode(vnode);
8135 	} else
8136 		status = B_ERROR;
8137 
8138 	return status;
8139 }
8140 
8141 
8142 static status_t
8143 set_cwd(int fd, char* path, bool kernel)
8144 {
8145 	struct io_context* context;
8146 	struct vnode* vnode = NULL;
8147 	struct vnode* oldDirectory;
8148 	status_t status;
8149 
8150 	FUNCTION(("set_cwd: path = \'%s\'\n", path));
8151 
8152 	// Get vnode for passed path, and bail if it failed
8153 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
8154 	if (status < 0)
8155 		return status;
8156 
8157 	if (!S_ISDIR(vnode->Type())) {
8158 		// nope, can't cwd to here
8159 		status = B_NOT_A_DIRECTORY;
8160 		goto err;
8161 	}
8162 
8163 	// We need to have the permission to enter the directory, too
8164 	if (HAS_FS_CALL(vnode, access)) {
8165 		status = FS_CALL(vnode, access, X_OK);
8166 		if (status != B_OK)
8167 			goto err;
8168 	}
8169 
8170 	// Get current io context and lock
8171 	context = get_current_io_context(kernel);
8172 	mutex_lock(&context->io_mutex);
8173 
8174 	// save the old current working directory first
8175 	oldDirectory = context->cwd;
8176 	context->cwd = vnode;
8177 
8178 	mutex_unlock(&context->io_mutex);
8179 
8180 	if (oldDirectory)
8181 		put_vnode(oldDirectory);
8182 
8183 	return B_NO_ERROR;
8184 
8185 err:
8186 	put_vnode(vnode);
8187 	return status;
8188 }
8189 
8190 
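/*!	\brief Copies a null-terminated name from userland.

	Unlike user_strlcpy(), which returns the would-be length of the
	source, this returns \c B_OK on success and \c B_NAME_TOO_LONG if the
	name (including its terminating null) does not fit into \a length
	bytes.
*/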
8191 static status_t
8192 user_copy_name(char* to, const char* from, size_t length)
8193 {
8194 	ssize_t len = user_strlcpy(to, from, length);
8195 	if (len < 0)
8196 		return len;
8197 	if (len >= (ssize_t)length)
8198 		return B_NAME_TOO_LONG;
8199 	return B_OK;
8200 }
8201 
8202 
8203 //	#pragma mark - kernel mirrored syscalls
8204 
8205 
8206 dev_t
8207 _kern_mount(const char* path, const char* device, const char* fsName,
8208 	uint32 flags, const char* args, size_t argsLength)
8209 {
8210 	KPath pathBuffer(path);
8211 	if (pathBuffer.InitCheck() != B_OK)
8212 		return B_NO_MEMORY;
8213 
8214 	return fs_mount(pathBuffer.LockBuffer(), device, fsName, flags, args, true);
8215 }
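

// Illustrative sketch only (not part of this file): how a kernel
// component might call _kern_mount(). The mount point, device path, and
// file system name are assumptions for the example.
#if 0
static status_t
mount_example_volume()
{
	dev_t device = _kern_mount("/ExampleVolume", "/dev/disk/virtual/0/raw",
		"bfs", 0, NULL, 0);
	if (device < 0)
		return (status_t)device;

	dprintf("mounted example volume as device %" B_PRIdDEV "\n", device);
	return B_OK;
}
#endif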
8216 
8217 
8218 status_t
8219 _kern_unmount(const char* path, uint32 flags)
8220 {
8221 	KPath pathBuffer(path);
8222 	if (pathBuffer.InitCheck() != B_OK)
8223 		return B_NO_MEMORY;
8224 
8225 	return fs_unmount(pathBuffer.LockBuffer(), -1, flags, true);
8226 }
8227 
8228 
8229 status_t
8230 _kern_read_fs_info(dev_t device, struct fs_info* info)
8231 {
8232 	if (info == NULL)
8233 		return B_BAD_VALUE;
8234 
8235 	return fs_read_info(device, info);
8236 }
8237 
8238 
8239 status_t
8240 _kern_write_fs_info(dev_t device, const struct fs_info* info, int mask)
8241 {
8242 	if (info == NULL)
8243 		return B_BAD_VALUE;
8244 
8245 	return fs_write_info(device, info, mask);
8246 }
8247 
8248 
8249 status_t
8250 _kern_sync(void)
8251 {
8252 	// Note: _kern_sync() is also called from _user_sync()
8253 	int32 cookie = 0;
8254 	dev_t device;
8255 	while ((device = next_dev(&cookie)) >= 0) {
8256 		status_t status = fs_sync(device);
8257 		if (status != B_OK && status != B_BAD_VALUE) {
8258 			dprintf("sync: device %" B_PRIdDEV " couldn't sync: %s\n", device,
8259 				strerror(status));
8260 		}
8261 	}
8262 
8263 	return B_OK;
8264 }
8265 
8266 
8267 dev_t
8268 _kern_next_device(int32* _cookie)
8269 {
8270 	return fs_next_device(_cookie);
8271 }
8272 
8273 
8274 status_t
8275 _kern_get_next_fd_info(team_id teamID, uint32* _cookie, fd_info* info,
8276 	size_t infoSize)
8277 {
8278 	if (infoSize != sizeof(fd_info))
8279 		return B_BAD_VALUE;
8280 
8281 	// get the team
8282 	Team* team = Team::Get(teamID);
8283 	if (team == NULL)
8284 		return B_BAD_TEAM_ID;
8285 	BReference<Team> teamReference(team, true);
8286 
8287 	// now that we have a team reference, its I/O context won't go away
8288 	io_context* context = team->io_context;
8289 	MutexLocker contextLocker(context->io_mutex);
8290 
8291 	uint32 slot = *_cookie;
8292 
8293 	struct file_descriptor* descriptor;
8294 	while (slot < context->table_size
8295 		&& (descriptor = context->fds[slot]) == NULL) {
8296 		slot++;
8297 	}
8298 
8299 	if (slot >= context->table_size)
8300 		return B_ENTRY_NOT_FOUND;
8301 
8302 	info->number = slot;
8303 	info->open_mode = descriptor->open_mode;
8304 
8305 	struct vnode* vnode = fd_vnode(descriptor);
8306 	if (vnode != NULL) {
8307 		info->device = vnode->device;
8308 		info->node = vnode->id;
8309 	} else if (descriptor->u.mount != NULL) {
8310 		info->device = descriptor->u.mount->id;
8311 		info->node = -1;
8312 	}
8313 
8314 	*_cookie = slot + 1;
8315 	return B_OK;
8316 }
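

// A minimal sketch (illustrative only) of how _kern_get_next_fd_info()
// can be used to iterate over all open FDs of a team; the caller and
// context are assumptions.
#if 0
static void
dump_team_fds(team_id team)
{
	uint32 cookie = 0;
	fd_info info;
	while (_kern_get_next_fd_info(team, &cookie, &info, sizeof(info))
			== B_OK) {
		dprintf("fd %" B_PRId32 ": device %" B_PRIdDEV ", node %" B_PRIdINO
			"\n", info.number, info.device, info.node);
	}
}
#endif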
8317 
8318 
8319 int
8320 _kern_open_entry_ref(dev_t device, ino_t inode, const char* name, int openMode,
8321 	int perms)
8322 {
8323 	if ((openMode & O_CREAT) != 0) {
8324 		return file_create_entry_ref(device, inode, name, openMode, perms,
8325 			true);
8326 	}
8327 
8328 	return file_open_entry_ref(device, inode, name, openMode, true);
8329 }
8330 
8331 
8332 /*!	\brief Opens a node specified by a FD + path pair.
8333 
8334 	At least one of \a fd and \a path must be specified.
8335 	If only \a fd is given, the function opens the node identified by this
8336 	FD. If only a path is given, this path is opened. If both are given and
8337 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8338 	of the directory (!) identified by \a fd.
8339 
8340 	\param fd The FD. May be < 0.
8341 	\param path The absolute or relative path. May be \c NULL.
8342 	\param openMode The open mode.
8343 	\return A FD referring to the newly opened node, or an error code,
8344 			if an error occurs.
8345 */
8346 int
8347 _kern_open(int fd, const char* path, int openMode, int perms)
8348 {
8349 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8350 	if (pathBuffer.InitCheck() != B_OK)
8351 		return B_NO_MEMORY;
8352 
8353 	if ((openMode & O_CREAT) != 0)
8354 		return file_create(fd, pathBuffer.LockBuffer(), openMode, perms, true);
8355 
8356 	return file_open(fd, pathBuffer.LockBuffer(), openMode, true);
8357 }
8358 
8359 
8360 /*!	\brief Opens a directory specified by entry_ref or node_ref.
8361 
8362 	The supplied name may be \c NULL, in which case directory identified
8363 	by \a device and \a inode will be opened. Otherwise \a device and
8364 	\a inode identify the parent directory of the directory to be opened
8365 	and \a name its entry name.
8366 
8367 	\param device If \a name is specified the ID of the device the parent
8368 		   directory of the directory to be opened resides on, otherwise
8369 		   the device of the directory itself.
8370 	\param inode If \a name is specified the node ID of the parent
8371 		   directory of the directory to be opened, otherwise node ID of the
8372 		   directory itself.
8373 	\param name The entry name of the directory to be opened. If \c NULL,
8374 		   the \a device + \a inode pair identify the node to be opened.
8375 	\return The FD of the newly opened directory or an error code, if
8376 			something went wrong.
8377 */
8378 int
8379 _kern_open_dir_entry_ref(dev_t device, ino_t inode, const char* name)
8380 {
8381 	return dir_open_entry_ref(device, inode, name, true);
8382 }
8383 
8384 
8385 /*!	\brief Opens a directory specified by a FD + path pair.
8386 
8387 	At least one of \a fd and \a path must be specified.
8388 	If only \a fd is given, the function opens the directory identified by this
8389 	FD. If only a path is given, this path is opened. If both are given and
8390 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8391 	of the directory (!) identified by \a fd.
8392 
8393 	\param fd The FD. May be < 0.
8394 	\param path The absolute or relative path. May be \c NULL.
8395 	\return A FD referring to the newly opened directory, or an error code,
8396 			if an error occurs.
8397 */
8398 int
8399 _kern_open_dir(int fd, const char* path)
8400 {
8401 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8402 	if (pathBuffer.InitCheck() != B_OK)
8403 		return B_NO_MEMORY;
8404 
8405 	return dir_open(fd, pathBuffer.LockBuffer(), true);
8406 }
8407 
8408 
8409 status_t
8410 _kern_fcntl(int fd, int op, size_t argument)
8411 {
8412 	return common_fcntl(fd, op, argument, true);
8413 }
8414 
8415 
8416 status_t
8417 _kern_fsync(int fd)
8418 {
8419 	return common_sync(fd, true);
8420 }
8421 
8422 
8423 status_t
8424 _kern_lock_node(int fd)
8425 {
8426 	return common_lock_node(fd, true);
8427 }
8428 
8429 
8430 status_t
8431 _kern_unlock_node(int fd)
8432 {
8433 	return common_unlock_node(fd, true);
8434 }
8435 
8436 
8437 status_t
8438 _kern_create_dir_entry_ref(dev_t device, ino_t inode, const char* name,
8439 	int perms)
8440 {
8441 	return dir_create_entry_ref(device, inode, name, perms, true);
8442 }
8443 
8444 
8445 /*!	\brief Creates a directory specified by a FD + path pair.
8446 
8447 	\a path must always be specified (it contains the name of the new directory
8448 	at least). If only a path is given, this path identifies the location at
8449 	which the directory shall be created. If both \a fd and \a path are given
8450 	and the path is absolute, \a fd is ignored; a relative path is reckoned off
8451 	of the directory (!) identified by \a fd.
8452 
8453 	\param fd The FD. May be < 0.
8454 	\param path The absolute or relative path. Must not be \c NULL.
8455 	\param perms The access permissions the new directory shall have.
8456 	\return \c B_OK, if the directory has been created successfully, another
8457 			error code otherwise.
8458 */
8459 status_t
8460 _kern_create_dir(int fd, const char* path, int perms)
8461 {
8462 	KPath pathBuffer(path, KPath::DEFAULT);
8463 	if (pathBuffer.InitCheck() != B_OK)
8464 		return B_NO_MEMORY;
8465 
8466 	return dir_create(fd, pathBuffer.LockBuffer(), perms, true);
8467 }
8468 
8469 
8470 status_t
8471 _kern_remove_dir(int fd, const char* path)
8472 {
8473 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8474 	if (pathBuffer.InitCheck() != B_OK)
8475 		return B_NO_MEMORY;
8476 
8477 	return dir_remove(fd, pathBuffer.LockBuffer(), true);
8478 }
8479 
8480 
8481 /*!	\brief Reads the contents of a symlink referred to by a FD + path pair.
8482 
8483 	At least one of \a fd and \a path must be specified.
8484 	If only \a fd is given, the symlink to be read is the node identified
8485 	by this FD. If only a path is given, this path identifies the
8486 	symlink to be read. If both are given and the path is absolute, \a fd is
8487 	ignored; a relative path is reckoned off of the directory (!) identified
8488 	by \a fd.
8489 	If this function fails with B_BUFFER_OVERFLOW, the \a _bufferSize pointer
8490 	will still be updated to reflect the required buffer size.
8491 
8492 	\param fd The FD. May be < 0.
8493 	\param path The absolute or relative path. May be \c NULL.
8494 	\param buffer The buffer into which the contents of the symlink shall be
8495 		   written.
8496 	\param _bufferSize A pointer to the size of the supplied buffer.
8497 	\return The length of the link on success or an appropriate error code.
8498 */
8499 status_t
8500 _kern_read_link(int fd, const char* path, char* buffer, size_t* _bufferSize)
8501 {
8502 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8503 	if (pathBuffer.InitCheck() != B_OK)
8504 		return B_NO_MEMORY;
8505 
8506 	return common_read_link(fd, pathBuffer.LockBuffer(),
8507 		buffer, _bufferSize, true);
8508 }
8509 
8510 
8511 /*!	\brief Creates a symlink specified by a FD + path pair.
8512 
8513 	\a path must always be specified (it contains the name of the new symlink
8514 	at least). If only a path is given, this path identifies the location at
8515 	which the symlink shall be created. If both \a fd and \a path are given and
8516 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8517 	of the directory (!) identified by \a fd.
8518 
8519 	\param fd The FD. May be < 0.
8520 	\param path The absolute or relative path. Must not be \c NULL.
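	\param toPath The path the symlink shall point to.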
8521 	\param mode The access permissions the new symlink shall have.
8522 	\return \c B_OK, if the symlink has been created successfully, another
8523 			error code otherwise.
8524 */
8525 status_t
8526 _kern_create_symlink(int fd, const char* path, const char* toPath, int mode)
8527 {
8528 	KPath pathBuffer(path);
8529 	if (pathBuffer.InitCheck() != B_OK)
8530 		return B_NO_MEMORY;
8531 
8532 	return common_create_symlink(fd, pathBuffer.LockBuffer(),
8533 		toPath, mode, true);
8534 }
8535 
8536 
8537 status_t
8538 _kern_create_link(int pathFD, const char* path, int toFD, const char* toPath,
8539 	bool traverseLeafLink)
8540 {
8541 	KPath pathBuffer(path);
8542 	KPath toPathBuffer(toPath);
8543 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
8544 		return B_NO_MEMORY;
8545 
8546 	return common_create_link(pathFD, pathBuffer.LockBuffer(), toFD,
8547 		toPathBuffer.LockBuffer(), traverseLeafLink, true);
8548 }
8549 
8550 
8551 /*!	\brief Removes an entry specified by a FD + path pair from its directory.
8552 
8553 	\a path must always be specified (it contains at least the name of the entry
8554 	to be deleted). If only a path is given, this path identifies the entry
8555 	directly. If both \a fd and \a path are given and the path is absolute,
8556 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8557 	identified by \a fd.
8558 
8559 	\param fd The FD. May be < 0.
8560 	\param path The absolute or relative path. Must not be \c NULL.
8561 	\return \c B_OK, if the entry has been removed successfully, another
8562 			error code otherwise.
8563 */
8564 status_t
8565 _kern_unlink(int fd, const char* path)
8566 {
8567 	KPath pathBuffer(path);
8568 	if (pathBuffer.InitCheck() != B_OK)
8569 		return B_NO_MEMORY;
8570 
8571 	return common_unlink(fd, pathBuffer.LockBuffer(), true);
8572 }
8573 
8574 
8575 /*!	\brief Moves an entry specified by a FD + path pair to an entry specified
8576 		   by another FD + path pair.
8577 
8578 	\a oldPath and \a newPath must always be specified (they contain at least
8579 	the name of the entry). If only a path is given, this path identifies the
8580 	entry directly. If both a FD and a path are given and the path is absolute,
8581 	the FD is ignored; a relative path is reckoned off of the directory (!)
8582 	identified by the respective FD.
8583 
8584 	\param oldFD The FD of the old location. May be < 0.
8585 	\param oldPath The absolute or relative path of the old location. Must not
8586 		   be \c NULL.
8587 	\param newFD The FD of the new location. May be < 0.
8588 	\param newPath The absolute or relative path of the new location. Must not
8589 		   be \c NULL.
8590 	\return \c B_OK, if the entry has been moved successfully, another
8591 			error code otherwise.
8592 */
8593 status_t
8594 _kern_rename(int oldFD, const char* oldPath, int newFD, const char* newPath)
8595 {
8596 	KPath oldPathBuffer(oldPath);
8597 	KPath newPathBuffer(newPath);
8598 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
8599 		return B_NO_MEMORY;
8600 
8601 	return common_rename(oldFD, oldPathBuffer.LockBuffer(),
8602 		newFD, newPathBuffer.LockBuffer(), true);
8603 }
8604 
8605 
8606 status_t
8607 _kern_access(int fd, const char* path, int mode, bool effectiveUserGroup)
8608 {
8609 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8610 	if (pathBuffer.InitCheck() != B_OK)
8611 		return B_NO_MEMORY;
8612 
8613 	return common_access(fd, pathBuffer.LockBuffer(), mode, effectiveUserGroup,
8614 		true);
8615 }
8616 
8617 
8618 /*!	\brief Reads stat data of an entity specified by a FD + path pair.
8619 
8620 	If only \a fd is given, the stat operation associated with the type
8621 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8622 	given, this path identifies the entry for whose node to retrieve the
8623 	stat data. If both \a fd and \a path are given and the path is absolute,
8624 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8625 	identified by \a fd and specifies the entry whose stat data shall be
8626 	retrieved.
8627 
8628 	\param fd The FD. May be < 0.
8629 	\param path The absolute or relative path. May be \c NULL.
8630 	\param traverseLeafLink If \a path is given, \c true specifies that the
8631 		   function shall not stick to symlinks, but traverse them.
8632 	\param stat The buffer the stat data shall be written into.
8633 	\param statSize The size of the supplied stat buffer.
8634 	\return \c B_OK, if the stat data have been read successfully, another
8635 			error code otherwise.
8636 */
8637 status_t
8638 _kern_read_stat(int fd, const char* path, bool traverseLeafLink,
8639 	struct stat* stat, size_t statSize)
8640 {
8641 	struct stat completeStat;
8642 	struct stat* originalStat = NULL;
8643 	status_t status;
8644 
8645 	if (statSize > sizeof(struct stat))
8646 		return B_BAD_VALUE;
8647 
8648 	// this supports different stat extensions
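	// (A caller compiled against an older, smaller struct stat passes a
	// smaller statSize; we then read into a complete structure and copy
	// only the first statSize bytes back below.)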
8649 	if (statSize < sizeof(struct stat)) {
8650 		originalStat = stat;
8651 		stat = &completeStat;
8652 	}
8653 
8654 	status = vfs_read_stat(fd, path, traverseLeafLink, stat, true);
8655 
8656 	if (status == B_OK && originalStat != NULL)
8657 		memcpy(originalStat, stat, statSize);
8658 
8659 	return status;
8660 }
8661 
8662 
8663 /*!	\brief Writes stat data of an entity specified by a FD + path pair.
8664 
8665 	If only \a fd is given, the stat operation associated with the type
8666 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8667 	given, this path identifies the entry for whose node to write the
8668 	stat data. If both \a fd and \a path are given and the path is absolute,
8669 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8670 	identified by \a fd and specifies the entry whose stat data shall be
8671 	written.
8672 
8673 	\param fd The FD. May be < 0.
8674 	\param path The absolute or relative path. May be \c NULL.
8675 	\param traverseLeafLink If \a path is given, \c true specifies that the
8676 		   function shall not stick to symlinks, but traverse them.
8677 	\param stat The buffer containing the stat data to be written.
8678 	\param statSize The size of the supplied stat buffer.
8679 	\param statMask A mask specifying which parts of the stat data shall be
8680 		   written.
8681 	\return \c B_OK, if the stat data have been written successfully,
8682 			another error code otherwise.
8683 */
8684 status_t
8685 _kern_write_stat(int fd, const char* path, bool traverseLeafLink,
8686 	const struct stat* stat, size_t statSize, int statMask)
8687 {
8688 	struct stat completeStat;
8689 
8690 	if (statSize > sizeof(struct stat))
8691 		return B_BAD_VALUE;
8692 
8693 	// this supports different stat extensions
8694 	if (statSize < sizeof(struct stat)) {
8695 		memset((uint8*)&completeStat + statSize, 0,
8696 			sizeof(struct stat) - statSize);
8697 		memcpy(&completeStat, stat, statSize);
8698 		stat = &completeStat;
8699 	}
8700 
8701 	status_t status;
8702 
8703 	if (path != NULL) {
8704 		// path given: write the stat of the node referred to by (fd, path)
8705 		KPath pathBuffer(path);
8706 		if (pathBuffer.InitCheck() != B_OK)
8707 			return B_NO_MEMORY;
8708 
8709 		status = common_path_write_stat(fd, pathBuffer.LockBuffer(),
8710 			traverseLeafLink, stat, statMask, true);
8711 	} else {
8712 		// no path given: get the FD and use the FD operation
8713 		struct file_descriptor* descriptor
8714 			= get_fd(get_current_io_context(true), fd);
8715 		if (descriptor == NULL)
8716 			return B_FILE_ERROR;
8717 
8718 		if (descriptor->ops->fd_write_stat)
8719 			status = descriptor->ops->fd_write_stat(descriptor, stat, statMask);
8720 		else
8721 			status = B_UNSUPPORTED;
8722 
8723 		put_fd(descriptor);
8724 	}
8725 
8726 	return status;
8727 }
8728 
8729 
8730 int
8731 _kern_open_attr_dir(int fd, const char* path, bool traverseLeafLink)
8732 {
8733 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8734 	if (pathBuffer.InitCheck() != B_OK)
8735 		return B_NO_MEMORY;
8736 
8737 	return attr_dir_open(fd, pathBuffer.LockBuffer(), traverseLeafLink, true);
8738 }
8739 
8740 
8741 int
8742 _kern_open_attr(int fd, const char* path, const char* name, uint32 type,
8743 	int openMode)
8744 {
8745 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8746 	if (pathBuffer.InitCheck() != B_OK)
8747 		return B_NO_MEMORY;
8748 
8749 	if ((openMode & O_CREAT) != 0) {
8750 		return attr_create(fd, pathBuffer.LockBuffer(), name, type, openMode,
8751 			true);
8752 	}
8753 
8754 	return attr_open(fd, pathBuffer.LockBuffer(), name, openMode, true);
8755 }
8756 
8757 
8758 status_t
8759 _kern_remove_attr(int fd, const char* name)
8760 {
8761 	return attr_remove(fd, name, true);
8762 }
8763 
8764 
8765 status_t
8766 _kern_rename_attr(int fromFile, const char* fromName, int toFile,
8767 	const char* toName)
8768 {
8769 	return attr_rename(fromFile, fromName, toFile, toName, true);
8770 }
8771 
8772 
8773 int
8774 _kern_open_index_dir(dev_t device)
8775 {
8776 	return index_dir_open(device, true);
8777 }
8778 
8779 
8780 status_t
8781 _kern_create_index(dev_t device, const char* name, uint32 type, uint32 flags)
8782 {
8783 	return index_create(device, name, type, flags, true);
8784 }
8785 
8786 
8787 status_t
8788 _kern_read_index_stat(dev_t device, const char* name, struct stat* stat)
8789 {
8790 	return index_name_read_stat(device, name, stat, true);
8791 }
8792 
8793 
8794 status_t
8795 _kern_remove_index(dev_t device, const char* name)
8796 {
8797 	return index_remove(device, name, true);
8798 }
8799 
8800 
8801 status_t
8802 _kern_getcwd(char* buffer, size_t size)
8803 {
8804 	TRACE(("_kern_getcwd: buf %p, %ld\n", buffer, size));
8805 
8806 	// Call vfs to get current working directory
8807 	return get_cwd(buffer, size, true);
8808 }
8809 
8810 
8811 status_t
8812 _kern_setcwd(int fd, const char* path)
8813 {
8814 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8815 	if (pathBuffer.InitCheck() != B_OK)
8816 		return B_NO_MEMORY;
8817 
8818 	return set_cwd(fd, pathBuffer.LockBuffer(), true);
8819 }
8820 
8821 
8822 //	#pragma mark - userland syscalls
8823 
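// Common pattern for the _user_*() syscalls below: verify userland
// pointers with IS_USER_ADDRESS(), copy names and structures into kernel
// buffers via user_copy_name()/user_memcpy(), and only then delegate to
// the shared implementation with kernel == false.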
8824 
8825 dev_t
8826 _user_mount(const char* userPath, const char* userDevice,
8827 	const char* userFileSystem, uint32 flags, const char* userArgs,
8828 	size_t argsLength)
8829 {
8830 	char fileSystem[B_FILE_NAME_LENGTH];
8831 	KPath path, device;
8832 	char* args = NULL;
8833 	status_t status;
8834 
8835 	if (!IS_USER_ADDRESS(userPath))
8836 		return B_BAD_ADDRESS;
8837 
8838 	if (path.InitCheck() != B_OK || device.InitCheck() != B_OK)
8839 		return B_NO_MEMORY;
8840 
8841 	status = user_copy_name(path.LockBuffer(), userPath,
8842 		B_PATH_NAME_LENGTH);
8843 	if (status != B_OK)
8844 		return status;
8845 	path.UnlockBuffer();
8846 
8847 	if (userFileSystem != NULL) {
8848 		if (!IS_USER_ADDRESS(userFileSystem))
8849 			return B_BAD_ADDRESS;
8850 
8851 		status = user_copy_name(fileSystem, userFileSystem, sizeof(fileSystem));
8852 		if (status != B_OK)
8853 			return status;
8854 	}
8855 
8856 	if (userDevice != NULL) {
8857 		if (!IS_USER_ADDRESS(userDevice))
8858 			return B_BAD_ADDRESS;
8859 
8860 		status = user_copy_name(device.LockBuffer(), userDevice,
8861 			B_PATH_NAME_LENGTH);
8862 		if (status != B_OK)
8863 			return status;
8864 		device.UnlockBuffer();
8865 	}
8866 
8867 	if (userArgs != NULL && argsLength > 0) {
8868 		if (!IS_USER_ADDRESS(userArgs))
8869 			return B_BAD_ADDRESS;
8870 
8871 		// this is a safety restriction
8872 		if (argsLength >= 65536)
8873 			return B_NAME_TOO_LONG;
8874 
8875 		args = (char*)malloc(argsLength + 1);
8876 		if (args == NULL)
8877 			return B_NO_MEMORY;
8878 
8879 		status = user_copy_name(args, userArgs, argsLength + 1);
8880 		if (status != B_OK) {
8881 			free(args);
8882 			return status;
8883 		}
8884 	}
8885 
8886 	status = fs_mount(path.LockBuffer(),
8887 		userDevice != NULL ? device.Path() : NULL,
8888 		userFileSystem ? fileSystem : NULL, flags, args, false);
8889 
8890 	free(args);
8891 	return status;
8892 }
8893 
8894 
8895 status_t
8896 _user_unmount(const char* userPath, uint32 flags)
8897 {
8898 	if (!IS_USER_ADDRESS(userPath))
8899 		return B_BAD_ADDRESS;
8900 
8901 	KPath pathBuffer;
8902 	if (pathBuffer.InitCheck() != B_OK)
8903 		return B_NO_MEMORY;
8904 
8905 	char* path = pathBuffer.LockBuffer();
8906 
8907 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
8908 	if (status != B_OK)
8909 		return status;
8910 
8911 	return fs_unmount(path, -1, flags & ~B_UNMOUNT_BUSY_PARTITION, false);
8912 }
8913 
8914 
8915 status_t
8916 _user_read_fs_info(dev_t device, struct fs_info* userInfo)
8917 {
8918 	struct fs_info info;
8919 	status_t status;
8920 
8921 	if (userInfo == NULL)
8922 		return B_BAD_VALUE;
8923 
8924 	if (!IS_USER_ADDRESS(userInfo))
8925 		return B_BAD_ADDRESS;
8926 
8927 	status = fs_read_info(device, &info);
8928 	if (status != B_OK)
8929 		return status;
8930 
8931 	if (user_memcpy(userInfo, &info, sizeof(struct fs_info)) != B_OK)
8932 		return B_BAD_ADDRESS;
8933 
8934 	return B_OK;
8935 }
8936 
8937 
8938 status_t
8939 _user_write_fs_info(dev_t device, const struct fs_info* userInfo, int mask)
8940 {
8941 	struct fs_info info;
8942 
8943 	if (userInfo == NULL)
8944 		return B_BAD_VALUE;
8945 
8946 	if (!IS_USER_ADDRESS(userInfo)
8947 		|| user_memcpy(&info, userInfo, sizeof(struct fs_info)) != B_OK)
8948 		return B_BAD_ADDRESS;
8949 
8950 	return fs_write_info(device, &info, mask);
8951 }
8952 
8953 
8954 dev_t
8955 _user_next_device(int32* _userCookie)
8956 {
8957 	int32 cookie;
8958 	dev_t device;
8959 
8960 	if (!IS_USER_ADDRESS(_userCookie)
8961 		|| user_memcpy(&cookie, _userCookie, sizeof(int32)) != B_OK)
8962 		return B_BAD_ADDRESS;
8963 
8964 	device = fs_next_device(&cookie);
8965 
8966 	if (device >= B_OK) {
8967 		// update user cookie
8968 		if (user_memcpy(_userCookie, &cookie, sizeof(int32)) != B_OK)
8969 			return B_BAD_ADDRESS;
8970 	}
8971 
8972 	return device;
8973 }
8974 
8975 
8976 status_t
8977 _user_sync(void)
8978 {
8979 	return _kern_sync();
8980 }
8981 
8982 
8983 status_t
8984 _user_get_next_fd_info(team_id team, uint32* userCookie, fd_info* userInfo,
8985 	size_t infoSize)
8986 {
8987 	struct fd_info info;
8988 	uint32 cookie;
8989 
8990 	// only root can do this (or should root's group be enough?)
8991 	if (geteuid() != 0)
8992 		return B_NOT_ALLOWED;
8993 
8994 	if (infoSize != sizeof(fd_info))
8995 		return B_BAD_VALUE;
8996 
8997 	if (!IS_USER_ADDRESS(userCookie) || !IS_USER_ADDRESS(userInfo)
8998 		|| user_memcpy(&cookie, userCookie, sizeof(uint32)) != B_OK)
8999 		return B_BAD_ADDRESS;
9000 
9001 	status_t status = _kern_get_next_fd_info(team, &cookie, &info, infoSize);
9002 	if (status != B_OK)
9003 		return status;
9004 
9005 	if (user_memcpy(userCookie, &cookie, sizeof(uint32)) != B_OK
9006 		|| user_memcpy(userInfo, &info, infoSize) != B_OK)
9007 		return B_BAD_ADDRESS;
9008 
9009 	return status;
9010 }
9011 
9012 
9013 status_t
9014 _user_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
9015 	char* userPath, size_t pathLength)
9016 {
9017 	if (!IS_USER_ADDRESS(userPath))
9018 		return B_BAD_ADDRESS;
9019 
9020 	KPath path;
9021 	if (path.InitCheck() != B_OK)
9022 		return B_NO_MEMORY;
9023 
9024 	// copy the leaf name onto the stack
9025 	char stackLeaf[B_FILE_NAME_LENGTH];
9026 	if (leaf != NULL) {
9027 		if (!IS_USER_ADDRESS(leaf))
9028 			return B_BAD_ADDRESS;
9029 
9030 		status_t status = user_copy_name(stackLeaf, leaf, B_FILE_NAME_LENGTH);
9031 		if (status != B_OK)
9032 			return status;
9033 
9034 		leaf = stackLeaf;
9035 	}
9036 
9037 	status_t status = vfs_entry_ref_to_path(device, inode, leaf,
9038 		false, path.LockBuffer(), path.BufferSize());
9039 	if (status != B_OK)
9040 		return status;
9041 
9042 	path.UnlockBuffer();
9043 
9044 	int length = user_strlcpy(userPath, path.Path(), pathLength);
9045 	if (length < 0)
9046 		return length;
9047 	if (length >= (int)pathLength)
9048 		return B_BUFFER_OVERFLOW;
9049 
9050 	return B_OK;
9051 }
9052 
9053 
9054 status_t
9055 _user_normalize_path(const char* userPath, bool traverseLink, char* buffer)
9056 {
9057 	if (userPath == NULL || buffer == NULL)
9058 		return B_BAD_VALUE;
9059 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(buffer))
9060 		return B_BAD_ADDRESS;
9061 
9062 	// copy path from userland
9063 	KPath pathBuffer;
9064 	if (pathBuffer.InitCheck() != B_OK)
9065 		return B_NO_MEMORY;
9066 	char* path = pathBuffer.LockBuffer();
9067 
9068 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9069 	if (status != B_OK)
9070 		return status;
9071 
9072 	status_t error = normalize_path(path, pathBuffer.BufferSize(), traverseLink,
9073 		false);
9074 	if (error != B_OK)
9075 		return error;
9076 
9077 	// copy back to userland
9078 	int len = user_strlcpy(buffer, path, B_PATH_NAME_LENGTH);
9079 	if (len < 0)
9080 		return len;
9081 	if (len >= B_PATH_NAME_LENGTH)
9082 		return B_BUFFER_OVERFLOW;
9083 
9084 	return B_OK;
9085 }
9086 
9087 
9088 int
9089 _user_open_entry_ref(dev_t device, ino_t inode, const char* userName,
9090 	int openMode, int perms)
9091 {
9092 	char name[B_FILE_NAME_LENGTH];
9093 
9094 	if (userName == NULL || device < 0 || inode < 0)
9095 		return B_BAD_VALUE;
9096 	if (!IS_USER_ADDRESS(userName))
9097 		return B_BAD_ADDRESS;
9098 	status_t status = user_copy_name(name, userName, sizeof(name));
9099 	if (status != B_OK)
9100 		return status;
9101 
9102 	if ((openMode & O_CREAT) != 0) {
9103 		return file_create_entry_ref(device, inode, name, openMode, perms,
9104 			false);
9105 	}
9106 
9107 	return file_open_entry_ref(device, inode, name, openMode, false);
9108 }
9109 
9110 
9111 int
9112 _user_open(int fd, const char* userPath, int openMode, int perms)
9113 {
9114 	KPath path;
9115 	if (path.InitCheck() != B_OK)
9116 		return B_NO_MEMORY;
9117 
9118 	char* buffer = path.LockBuffer();
9119 
9120 	if (!IS_USER_ADDRESS(userPath))
9121 		return B_BAD_ADDRESS;
9122 	status_t status = user_copy_name(buffer, userPath, B_PATH_NAME_LENGTH);
9123 	if (status != B_OK)
9124 		return status;
9125 
9126 	if ((openMode & O_CREAT) != 0)
9127 		return file_create(fd, buffer, openMode, perms, false);
9128 
9129 	return file_open(fd, buffer, openMode, false);
9130 }
9131 
9132 
9133 int
9134 _user_open_dir_entry_ref(dev_t device, ino_t inode, const char* userName)
9135 {
9136 	if (userName != NULL) {
9137 		char name[B_FILE_NAME_LENGTH];
9138 
9139 		if (!IS_USER_ADDRESS(userName))
9140 			return B_BAD_ADDRESS;
9141 		status_t status = user_copy_name(name, userName, sizeof(name));
9142 		if (status != B_OK)
9143 			return status;
9144 
9145 		return dir_open_entry_ref(device, inode, name, false);
9146 	}
9147 	return dir_open_entry_ref(device, inode, NULL, false);
9148 }
9149 
9150 
9151 int
9152 _user_open_dir(int fd, const char* userPath)
9153 {
9154 	if (userPath == NULL)
9155 		return dir_open(fd, NULL, false);
9156 
9157 	KPath path;
9158 	if (path.InitCheck() != B_OK)
9159 		return B_NO_MEMORY;
9160 
9161 	char* buffer = path.LockBuffer();
9162 
9163 	if (!IS_USER_ADDRESS(userPath))
9164 		return B_BAD_ADDRESS;
9165 	status_t status = user_copy_name(buffer, userPath, B_PATH_NAME_LENGTH);
9166 	if (status != B_OK)
9167 		return status;
9168 
9169 	return dir_open(fd, buffer, false);
9170 }
9171 
9172 
9173 /*!	\brief Opens a directory's parent directory and returns the entry name
9174 		   of the former.
9175 
9176 	Aside from that it returns the directory's entry name, this method is
9177 	equivalent to \code _user_open_dir(fd, "..") \endcode. It really is
9178 	equivalent if \a userName is \c NULL.
9179 
9180 	If a name buffer is supplied and the name does not fit the buffer, the
9181 	function fails. A buffer of size \c B_FILE_NAME_LENGTH should be safe.
9182 
9183 	\param fd A FD referring to a directory.
9184 	\param userName Buffer the directory's entry name shall be written into.
9185 		   May be \c NULL.
9186 	\param nameLength Size of the name buffer.
9187 	\return The file descriptor of the opened parent directory, if everything
9188 			went fine, an error code otherwise.
9189 */
9190 int
9191 _user_open_parent_dir(int fd, char* userName, size_t nameLength)
9192 {
9193 	bool kernel = false;
9194 
9195 	if (userName && !IS_USER_ADDRESS(userName))
9196 		return B_BAD_ADDRESS;
9197 
9198 	// open the parent dir
9199 	int parentFD = dir_open(fd, (char*)"..", kernel);
9200 	if (parentFD < 0)
9201 		return parentFD;
9202 	FDCloser fdCloser(parentFD, kernel);
9203 
9204 	if (userName) {
9205 		// get the vnodes
9206 		struct vnode* parentVNode = get_vnode_from_fd(parentFD, kernel);
9207 		struct vnode* dirVNode = get_vnode_from_fd(fd, kernel);
9208 		VNodePutter parentVNodePutter(parentVNode);
9209 		VNodePutter dirVNodePutter(dirVNode);
9210 		if (!parentVNode || !dirVNode)
9211 			return B_FILE_ERROR;
9212 
9213 		// get the vnode name
9214 		char _buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
9215 		struct dirent* buffer = (struct dirent*)_buffer;
9216 		status_t status = get_vnode_name(dirVNode, parentVNode, buffer,
9217 			sizeof(_buffer), get_current_io_context(false));
9218 		if (status != B_OK)
9219 			return status;
9220 
9221 		// copy the name to the userland buffer
9222 		int len = user_strlcpy(userName, buffer->d_name, nameLength);
9223 		if (len < 0)
9224 			return len;
9225 		if (len >= (int)nameLength)
9226 			return B_BUFFER_OVERFLOW;
9227 	}
9228 
9229 	return fdCloser.Detach();
9230 }
9231 
9232 
9233 status_t
9234 _user_fcntl(int fd, int op, size_t argument)
9235 {
9236 	status_t status = common_fcntl(fd, op, argument, false);
9237 	if (op == F_SETLKW)
9238 		syscall_restart_handle_post(status);
9239 
9240 	return status;
9241 }
9242 
9243 
9244 status_t
9245 _user_fsync(int fd)
9246 {
9247 	return common_sync(fd, false);
9248 }
9249 
9250 
9251 status_t
9252 _user_flock(int fd, int operation)
9253 {
9254 	FUNCTION(("_user_fcntl(fd = %d, op = %d)\n", fd, operation));
9255 
9256 	// Check if the operation is valid
9257 	switch (operation & ~LOCK_NB) {
9258 		case LOCK_UN:
9259 		case LOCK_SH:
9260 		case LOCK_EX:
9261 			break;
9262 
9263 		default:
9264 			return B_BAD_VALUE;
9265 	}
9266 
9267 	struct file_descriptor* descriptor;
9268 	struct vnode* vnode;
9269 	descriptor = get_fd_and_vnode(fd, &vnode, false);
9270 	if (descriptor == NULL)
9271 		return B_FILE_ERROR;
9272 
9273 	if (descriptor->type != FDTYPE_FILE) {
9274 		put_fd(descriptor);
9275 		return B_BAD_VALUE;
9276 	}
9277 
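	// flock() locks or unlocks the entire file, so emulate it with an
	// advisory lock spanning the whole range [0, OFF_MAX].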
9278 	struct flock flock;
9279 	flock.l_start = 0;
9280 	flock.l_len = OFF_MAX;
9281 	flock.l_whence = 0;
9282 	flock.l_type = (operation & LOCK_SH) != 0 ? F_RDLCK : F_WRLCK;
9283 
9284 	status_t status;
9285 	if ((operation & LOCK_UN) != 0) {
9286 		if (HAS_FS_CALL(vnode, release_lock))
9287 			status = FS_CALL(vnode, release_lock, descriptor->cookie, &flock);
9288 		else
9289 			status = release_advisory_lock(vnode, NULL, descriptor, &flock);
9290 	} else {
9291 		if (HAS_FS_CALL(vnode, acquire_lock)) {
9292 			status = FS_CALL(vnode, acquire_lock, descriptor->cookie, &flock,
9293 				(operation & LOCK_NB) == 0);
9294 		} else {
9295 			status = acquire_advisory_lock(vnode, NULL, descriptor, &flock,
9296 				(operation & LOCK_NB) == 0);
9297 		}
9298 	}
9299 
9300 	syscall_restart_handle_post(status);
9301 
9302 	put_fd(descriptor);
9303 	return status;
9304 }
9305 
9306 
9307 status_t
9308 _user_lock_node(int fd)
9309 {
9310 	return common_lock_node(fd, false);
9311 }
9312 
9313 
9314 status_t
9315 _user_unlock_node(int fd)
9316 {
9317 	return common_unlock_node(fd, false);
9318 }
9319 
9320 
9321 status_t
9322 _user_create_dir_entry_ref(dev_t device, ino_t inode, const char* userName,
9323 	int perms)
9324 {
9325 	char name[B_FILE_NAME_LENGTH];
9326 	status_t status;
9327 
9328 	if (!IS_USER_ADDRESS(userName))
9329 		return B_BAD_ADDRESS;
9330 
9331 	status = user_copy_name(name, userName, sizeof(name));
9332 	if (status != B_OK)
9333 		return status;
9334 
9335 	return dir_create_entry_ref(device, inode, name, perms, false);
9336 }
9337 
9338 
9339 status_t
9340 _user_create_dir(int fd, const char* userPath, int perms)
9341 {
9342 	KPath pathBuffer;
9343 	if (pathBuffer.InitCheck() != B_OK)
9344 		return B_NO_MEMORY;
9345 
9346 	char* path = pathBuffer.LockBuffer();
9347 
9348 	if (!IS_USER_ADDRESS(userPath))
9349 		return B_BAD_ADDRESS;
9350 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9351 	if (status != B_OK)
9352 		return status;
9353 
9354 	return dir_create(fd, path, perms, false);
9355 }
9356 
9357 
9358 status_t
9359 _user_remove_dir(int fd, const char* userPath)
9360 {
9361 	KPath pathBuffer;
9362 	if (pathBuffer.InitCheck() != B_OK)
9363 		return B_NO_MEMORY;
9364 
9365 	char* path = pathBuffer.LockBuffer();
9366 
9367 	if (userPath != NULL) {
9368 		if (!IS_USER_ADDRESS(userPath))
9369 			return B_BAD_ADDRESS;
9370 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9371 		if (status != B_OK)
9372 			return status;
9373 	}
9374 
9375 	return dir_remove(fd, userPath ? path : NULL, false);
9376 }
9377 
9378 
9379 status_t
9380 _user_read_link(int fd, const char* userPath, char* userBuffer,
9381 	size_t* userBufferSize)
9382 {
9383 	KPath pathBuffer, linkBuffer;
9384 	if (pathBuffer.InitCheck() != B_OK || linkBuffer.InitCheck() != B_OK)
9385 		return B_NO_MEMORY;
9386 
9387 	size_t bufferSize;
9388 
9389 	if (!IS_USER_ADDRESS(userBuffer) || !IS_USER_ADDRESS(userBufferSize)
9390 		|| user_memcpy(&bufferSize, userBufferSize, sizeof(size_t)) != B_OK)
9391 		return B_BAD_ADDRESS;
9392 
9393 	char* path = pathBuffer.LockBuffer();
9394 	char* buffer = linkBuffer.LockBuffer();
9395 
9396 	if (userPath) {
9397 		if (!IS_USER_ADDRESS(userPath))
9398 			return B_BAD_ADDRESS;
9399 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9400 		if (status != B_OK)
9401 			return status;
9402 
9403 		if (bufferSize > B_PATH_NAME_LENGTH)
9404 			bufferSize = B_PATH_NAME_LENGTH;
9405 	}
9406 
9407 	status_t status = common_read_link(fd, userPath ? path : NULL, buffer,
9408 		&bufferSize, false);
9409 
9410 	// we also update the bufferSize in case of errors
9411 	// (the real length will be returned in case of B_BUFFER_OVERFLOW)
9412 	if (user_memcpy(userBufferSize, &bufferSize, sizeof(size_t)) != B_OK)
9413 		return B_BAD_ADDRESS;
9414 
9415 	if (status != B_OK)
9416 		return status;
9417 
9418 	if (user_memcpy(userBuffer, buffer, bufferSize) != B_OK)
9419 		return B_BAD_ADDRESS;
9420 
9421 	return B_OK;
9422 }
9423 
9424 
9425 status_t
9426 _user_create_symlink(int fd, const char* userPath, const char* userToPath,
9427 	int mode)
9428 {
9429 	KPath pathBuffer;
9430 	KPath toPathBuffer;
9431 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9432 		return B_NO_MEMORY;
9433 
9434 	char* path = pathBuffer.LockBuffer();
9435 	char* toPath = toPathBuffer.LockBuffer();
9436 
9437 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(userToPath))
9438 		return B_BAD_ADDRESS;
9439 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9440 	if (status != B_OK)
9441 		return status;
9442 	status = user_copy_name(toPath, userToPath, B_PATH_NAME_LENGTH);
9443 	if (status != B_OK)
9444 		return status;
9445 
9446 	return common_create_symlink(fd, path, toPath, mode, false);
9447 }
9448 
9449 
9450 status_t
9451 _user_create_link(int pathFD, const char* userPath, int toFD,
9452 	const char* userToPath, bool traverseLeafLink)
9453 {
9454 	KPath pathBuffer;
9455 	KPath toPathBuffer;
9456 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9457 		return B_NO_MEMORY;
9458 
9459 	char* path = pathBuffer.LockBuffer();
9460 	char* toPath = toPathBuffer.LockBuffer();
9461 
9462 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(userToPath))
9463 		return B_BAD_ADDRESS;
9464 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9465 	if (status != B_OK)
9466 		return status;
9467 	status = user_copy_name(toPath, userToPath, B_PATH_NAME_LENGTH);
9468 	if (status != B_OK)
9469 		return status;
9470 
9471 	status = check_path(toPath);
9472 	if (status != B_OK)
9473 		return status;
9474 
9475 	return common_create_link(pathFD, path, toFD, toPath, traverseLeafLink,
9476 		false);
9477 }
9478 
9479 
9480 status_t
9481 _user_unlink(int fd, const char* userPath)
9482 {
9483 	KPath pathBuffer;
9484 	if (pathBuffer.InitCheck() != B_OK)
9485 		return B_NO_MEMORY;
9486 
9487 	char* path = pathBuffer.LockBuffer();
9488 
9489 	if (!IS_USER_ADDRESS(userPath))
9490 		return B_BAD_ADDRESS;
9491 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9492 	if (status != B_OK)
9493 		return status;
9494 
9495 	return common_unlink(fd, path, false);
9496 }
9497 
9498 
9499 status_t
9500 _user_rename(int oldFD, const char* userOldPath, int newFD,
9501 	const char* userNewPath)
9502 {
9503 	KPath oldPathBuffer;
9504 	KPath newPathBuffer;
9505 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
9506 		return B_NO_MEMORY;
9507 
9508 	char* oldPath = oldPathBuffer.LockBuffer();
9509 	char* newPath = newPathBuffer.LockBuffer();
9510 
9511 	if (!IS_USER_ADDRESS(userOldPath) || !IS_USER_ADDRESS(userNewPath))
9512 		return B_BAD_ADDRESS;
9513 	status_t status = user_copy_name(oldPath, userOldPath, B_PATH_NAME_LENGTH);
9514 	if (status != B_OK)
9515 		return status;
9516 	status = user_copy_name(newPath, userNewPath, B_PATH_NAME_LENGTH);
9517 	if (status != B_OK)
9518 		return status;
9519 
9520 	return common_rename(oldFD, oldPath, newFD, newPath, false);
9521 }
9522 
9523 
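/*!	Creates a FIFO (named pipe) at (\a fd, \a userPath) with permissions
	\a perms. The path is split into a directory vnode and the filename, and
	the directory's file system must implement the create_special_node()
	hook; otherwise B_UNSUPPORTED is returned. The node reference the hook
	returns is put right away -- only the directory entry is of interest
	here.
*/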
9524 status_t
9525 _user_create_fifo(int fd, const char* userPath, mode_t perms)
9526 {
9527 	KPath pathBuffer;
9528 	if (pathBuffer.InitCheck() != B_OK)
9529 		return B_NO_MEMORY;
9530 
9531 	char* path = pathBuffer.LockBuffer();
9532 
9533 	if (!IS_USER_ADDRESS(userPath))
9534 		return B_BAD_ADDRESS;
9535 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9536 	if (status != B_OK)
9537 		return status;
9538 
9539 	// split the path into a directory vnode and the filename
9540 	char filename[B_FILE_NAME_LENGTH];
9541 	struct vnode* dir;
9542 	status = fd_and_path_to_dir_vnode(fd, path, &dir, filename, false);
9543 	if (status != B_OK)
9544 		return status;
9545 
9546 	VNodePutter _(dir);
9547 
9548 	// the underlying FS needs to support creating FIFOs
9549 	if (!HAS_FS_CALL(dir, create_special_node))
9550 		return B_UNSUPPORTED;
9551 
9552 	// create the entry -- the FIFO sub node is set up automatically
9553 	fs_vnode superVnode;
9554 	ino_t nodeID;
9555 	status = FS_CALL(dir, create_special_node, filename, NULL,
9556 		S_IFIFO | (perms & S_IUMSK), 0, &superVnode, &nodeID);
9557 
9558 	// create_special_node() acquired a reference for us that we don't need.
9559 	if (status == B_OK)
9560 		put_vnode(dir->mount->volume, nodeID);
9561 
9562 	return status;
9563 }
9564 
9565 
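/*!	Creates an anonymous pipe: a FIFO node on the root file system that is
	never entered into any directory (the name passed to
	create_special_node() is NULL), opened once for reading and once for
	writing. On success the two FDs are copied to \a userFDs, read end
	first. A minimal sketch of the expected userland side -- assuming the
	_kern_create_pipe() wrapper conventionally generated for a
	_user_create_pipe() kernel function:

		int fds[2];
		status_t error = _kern_create_pipe(fds);
		if (error == B_OK) {
			// fds[0] is open O_RDONLY, fds[1] O_WRONLY
		}
*/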
9566 status_t
9567 _user_create_pipe(int* userFDs)
9568 {
9569 	// rootfs should support creating FIFOs, but let's be sure
9570 	if (!HAS_FS_CALL(sRoot, create_special_node))
9571 		return B_UNSUPPORTED;
9572 
9573 	// create the node -- the FIFO sub node is set up automatically
9574 	fs_vnode superVnode;
9575 	ino_t nodeID;
9576 	status_t status = FS_CALL(sRoot, create_special_node, NULL, NULL,
9577 		S_IFIFO | S_IRUSR | S_IWUSR, 0, &superVnode, &nodeID);
9578 	if (status != B_OK)
9579 		return status;
9580 
9581 	// We've got one reference to the node and need another one.
9582 	struct vnode* vnode;
9583 	status = get_vnode(sRoot->mount->id, nodeID, &vnode, true, false);
9584 	if (status != B_OK) {
9585 		// this should not happen
9586 		dprintf("_user_create_pipe(): Failed to lookup vnode (%" B_PRIdDEV ", "
9587 			"%" B_PRIdINO ")\n", sRoot->mount->id, nodeID);
9588 		return status;
9589 	}
9590 
9591 	// Everything looks good so far. Open two FDs, one for reading and one
9592 	// for writing.
9593 	int fds[2];
9594 	fds[0] = open_vnode(vnode, O_RDONLY, false);
9595 	fds[1] = open_vnode(vnode, O_WRONLY, false);
9596 
9597 	FDCloser closer0(fds[0], false);
9598 	FDCloser closer1(fds[1], false);
9599 
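	// If either open failed, its (negative) return value becomes our status.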
9600 	status = (fds[0] >= 0 ? (fds[1] >= 0 ? B_OK : fds[1]) : fds[0]);
9601 
9602 	// copy FDs to userland
9603 	if (status == B_OK) {
9604 		if (!IS_USER_ADDRESS(userFDs)
9605 			|| user_memcpy(userFDs, fds, sizeof(fds)) != B_OK) {
9606 			status = B_BAD_ADDRESS;
9607 		}
9608 	}
9609 
9610 	// keep the FDs if everything went fine
9611 	if (status == B_OK) {
9612 		closer0.Detach();
9613 		closer1.Detach();
9614 	}
9615 
9616 	return status;
9617 }
9618 
9619 
9620 status_t
9621 _user_access(int fd, const char* userPath, int mode, bool effectiveUserGroup)
9622 {
9623 	KPath pathBuffer;
9624 	if (pathBuffer.InitCheck() != B_OK)
9625 		return B_NO_MEMORY;
9626 
9627 	char* path = pathBuffer.LockBuffer();
9628 
9629 	if (!IS_USER_ADDRESS(userPath))
9630 		return B_BAD_ADDRESS;
9631 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9632 	if (status != B_OK)
9633 		return status;
9634 
9635 	return common_access(fd, path, mode, effectiveUserGroup, false);
9636 }
9637 
9638 
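/*!	Retrieves stat information either for the node at (\a fd, \a userPath)
	or, if \a userPath is NULL, for \a fd itself via the descriptor's
	fd_read_stat() hook. \a statSize bounds how many bytes are copied back
	to \a userStat -- presumably so that callers built against a smaller
	struct stat still work -- and must not exceed the kernel's
	sizeof(struct stat).
*/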
9639 status_t
9640 _user_read_stat(int fd, const char* userPath, bool traverseLink,
9641 	struct stat* userStat, size_t statSize)
9642 {
9643 	struct stat stat;
9644 	status_t status;
9645 
9646 	if (statSize > sizeof(struct stat))
9647 		return B_BAD_VALUE;
9648 
9649 	if (!IS_USER_ADDRESS(userStat))
9650 		return B_BAD_ADDRESS;
9651 
9652 	if (userPath != NULL) {
9653 		// path given: get the stat of the node referred to by (fd, path)
9654 		if (!IS_USER_ADDRESS(userPath))
9655 			return B_BAD_ADDRESS;
9656 
9657 		KPath pathBuffer;
9658 		if (pathBuffer.InitCheck() != B_OK)
9659 			return B_NO_MEMORY;
9660 
9661 		char* path = pathBuffer.LockBuffer();
9662 
9663 		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9664 		if (status != B_OK)
9665 			return status;
9666 
9667 		status = common_path_read_stat(fd, path, traverseLink, &stat, false);
9668 	} else {
9669 		// no path given: get the FD and use the FD operation
9670 		struct file_descriptor* descriptor
9671 			= get_fd(get_current_io_context(false), fd);
9672 		if (descriptor == NULL)
9673 			return B_FILE_ERROR;
9674 
9675 		if (descriptor->ops->fd_read_stat)
9676 			status = descriptor->ops->fd_read_stat(descriptor, &stat);
9677 		else
9678 			status = B_UNSUPPORTED;
9679 
9680 		put_fd(descriptor);
9681 	}
9682 
9683 	if (status != B_OK)
9684 		return status;
9685 
9686 	return user_memcpy(userStat, &stat, statSize);
9687 }
9688 
9689 
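/*!	Counterpart to _user_read_stat(): writes the fields selected by
	\a statMask from \a userStat to the node at (\a fd, \a userPath), or to
	\a fd itself if \a userPath is NULL. If the caller supplies fewer than
	sizeof(struct stat) bytes, the remaining fields are zeroed before the
	stat is passed on.
*/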
9690 status_t
9691 _user_write_stat(int fd, const char* userPath, bool traverseLeafLink,
9692 	const struct stat* userStat, size_t statSize, int statMask)
9693 {
9694 	if (statSize > sizeof(struct stat))
9695 		return B_BAD_VALUE;
9696 
9697 	struct stat stat;
9698 
9699 	if (!IS_USER_ADDRESS(userStat)
9700 		|| user_memcpy(&stat, userStat, statSize) < B_OK)
9701 		return B_BAD_ADDRESS;
9702 
9703 	// clear additional stat fields
9704 	if (statSize < sizeof(struct stat))
9705 		memset((uint8*)&stat + statSize, 0, sizeof(struct stat) - statSize);
9706 
9707 	status_t status;
9708 
9709 	if (userPath != NULL) {
9710 		// path given: write the stat of the node referred to by (fd, path)
9711 		if (!IS_USER_ADDRESS(userPath))
9712 			return B_BAD_ADDRESS;
9713 
9714 		KPath pathBuffer;
9715 		if (pathBuffer.InitCheck() != B_OK)
9716 			return B_NO_MEMORY;
9717 
9718 		char* path = pathBuffer.LockBuffer();
9719 
9720 		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9721 		if (status != B_OK)
9722 			return status;
9723 
9724 		status = common_path_write_stat(fd, path, traverseLeafLink, &stat,
9725 			statMask, false);
9726 	} else {
9727 		// no path given: get the FD and use the FD operation
9728 		struct file_descriptor* descriptor
9729 			= get_fd(get_current_io_context(false), fd);
9730 		if (descriptor == NULL)
9731 			return B_FILE_ERROR;
9732 
9733 		if (descriptor->ops->fd_write_stat) {
9734 			status = descriptor->ops->fd_write_stat(descriptor, &stat,
9735 				statMask);
9736 		} else
9737 			status = B_UNSUPPORTED;
9738 
9739 		put_fd(descriptor);
9740 	}
9741 
9742 	return status;
9743 }
9744 
9745 
9746 int
9747 _user_open_attr_dir(int fd, const char* userPath, bool traverseLeafLink)
9748 {
9749 	KPath pathBuffer;
9750 	if (pathBuffer.InitCheck() != B_OK)
9751 		return B_NO_MEMORY;
9752 
9753 	char* path = pathBuffer.LockBuffer();
9754 
9755 	if (userPath != NULL) {
9756 		if (!IS_USER_ADDRESS(userPath))
9757 			return B_BAD_ADDRESS;
9758 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9759 		if (status != B_OK)
9760 			return status;
9761 	}
9762 
9763 	return attr_dir_open(fd, userPath ? path : NULL, traverseLeafLink, false);
9764 }
9765 
9766 
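/*!	Reads up to \a readBytes bytes at offset \a pos from the attribute
	\a userAttribute of the node given by \a fd. Implemented by opening the
	attribute read-only, delegating to _user_read(), and closing it again.
*/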
9767 ssize_t
9768 _user_read_attr(int fd, const char* userAttribute, off_t pos, void* userBuffer,
9769 	size_t readBytes)
9770 {
9771 	char attribute[B_FILE_NAME_LENGTH];
9772 
9773 	if (userAttribute == NULL)
9774 		return B_BAD_VALUE;
9775 	if (!IS_USER_ADDRESS(userAttribute))
9776 		return B_BAD_ADDRESS;
9777 	status_t status = user_copy_name(attribute, userAttribute, sizeof(attribute));
9778 	if (status != B_OK)
9779 		return status;
9780 
9781 	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9782 	if (attr < 0)
9783 		return attr;
9784 
9785 	ssize_t bytes = _user_read(attr, pos, userBuffer, readBytes);
9786 	_user_close(attr);
9787 
9788 	return bytes;
9789 }
9790 
9791 
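/*!	Writes \a writeBytes bytes at offset \a pos to the attribute
	\a userAttribute of the node given by \a fd, creating the attribute with
	type \a type if it doesn't exist yet. To mimic the BeOS behavior, the
	attribute is truncated on open unless a non-zero \a pos is given.
*/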
9792 ssize_t
9793 _user_write_attr(int fd, const char* userAttribute, uint32 type, off_t pos,
9794 	const void* buffer, size_t writeBytes)
9795 {
9796 	char attribute[B_FILE_NAME_LENGTH];
9797 
9798 	if (userAttribute == NULL)
9799 		return B_BAD_VALUE;
9800 	if (!IS_USER_ADDRESS(userAttribute))
9801 		return B_BAD_ADDRESS;
9802 	status_t status = user_copy_name(attribute, userAttribute, sizeof(attribute));
9803 	if (status != B_OK)
9804 		return status;
9805 
9806 	// Try to support the BeOS-typical truncation as well as the position
9807 	// argument
9808 	int attr = attr_create(fd, NULL, attribute, type,
9809 		O_CREAT | O_WRONLY | (pos != 0 ? 0 : O_TRUNC), false);
9810 	if (attr < 0)
9811 		return attr;
9812 
9813 	ssize_t bytes = _user_write(attr, pos, buffer, writeBytes);
9814 	_user_close(attr);
9815 
9816 	return bytes;
9817 }
9818 
9819 
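/*!	Returns type and size of the attribute \a userAttribute of the node
	given by \a fd in \a userAttrInfo. The attribute is opened read-only,
	stat()ed via the descriptor's fd_read_stat() hook, and closed again.
*/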
9820 status_t
9821 _user_stat_attr(int fd, const char* userAttribute,
9822 	struct attr_info* userAttrInfo)
9823 {
9824 	char attribute[B_FILE_NAME_LENGTH];
9825 
9826 	if (userAttribute == NULL || userAttrInfo == NULL)
9827 		return B_BAD_VALUE;
9828 	if (!IS_USER_ADDRESS(userAttribute) || !IS_USER_ADDRESS(userAttrInfo))
9829 		return B_BAD_ADDRESS;
9830 	status_t status = user_copy_name(attribute, userAttribute,
9831 		sizeof(attribute));
9832 	if (status != B_OK)
9833 		return status;
9834 
9835 	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9836 	if (attr < 0)
9837 		return attr;
9838 
9839 	struct file_descriptor* descriptor
9840 		= get_fd(get_current_io_context(false), attr);
9841 	if (descriptor == NULL) {
9842 		_user_close(attr);
9843 		return B_FILE_ERROR;
9844 	}
9845 
9846 	struct stat stat;
9847 	if (descriptor->ops->fd_read_stat)
9848 		status = descriptor->ops->fd_read_stat(descriptor, &stat);
9849 	else
9850 		status = B_UNSUPPORTED;
9851 
9852 	put_fd(descriptor);
9853 	_user_close(attr);
9854 
9855 	if (status == B_OK) {
9856 		attr_info info;
9857 		info.type = stat.st_type;
9858 		info.size = stat.st_size;
9859 
9860 		if (user_memcpy(userAttrInfo, &info, sizeof(struct attr_info)) != B_OK)
9861 			return B_BAD_ADDRESS;
9862 	}
9863 
9864 	return status;
9865 }
9866 
9867 
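/*!	Opens -- or, if O_CREAT is set in \a openMode, creates -- the attribute
	\a userName of the node at (\a fd, \a userPath). A NULL \a userPath
	makes \a fd itself identify the node.
*/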
9868 int
9869 _user_open_attr(int fd, const char* userPath, const char* userName,
9870 	uint32 type, int openMode)
9871 {
9872 	char name[B_FILE_NAME_LENGTH];
9873 
9874 	if (!IS_USER_ADDRESS(userName))
9875 		return B_BAD_ADDRESS;
9876 	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9877 	if (status != B_OK)
9878 		return status;
9879 
9880 	KPath pathBuffer;
9881 	if (pathBuffer.InitCheck() != B_OK)
9882 		return B_NO_MEMORY;
9883 
9884 	char* path = pathBuffer.LockBuffer();
9885 
9886 	if (userPath != NULL) {
9887 		if (!IS_USER_ADDRESS(userPath))
9888 			return B_BAD_ADDRESS;
9889 		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9890 		if (status != B_OK)
9891 			return status;
9892 	}
9893 
9894 	if ((openMode & O_CREAT) != 0) {
9895 		return attr_create(fd, userPath ? path : NULL, name, type, openMode,
9896 			false);
9897 	}
9898 
9899 	return attr_open(fd, userPath ? path : NULL, name, openMode, false);
9900 }
9901 
9902 
9903 status_t
9904 _user_remove_attr(int fd, const char* userName)
9905 {
9906 	char name[B_FILE_NAME_LENGTH];
9907 
9908 	if (!IS_USER_ADDRESS(userName))
9909 		return B_BAD_ADDRESS;
9910 	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9911 	if (status != B_OK)
9912 		return status;
9913 
9914 	return attr_remove(fd, name, false);
9915 }
9916 
9917 
9918 status_t
9919 _user_rename_attr(int fromFile, const char* userFromName, int toFile,
9920 	const char* userToName)
9921 {
9922 	if (!IS_USER_ADDRESS(userFromName)
9923 		|| !IS_USER_ADDRESS(userToName))
9924 		return B_BAD_ADDRESS;
9925 
9926 	KPath fromNameBuffer(B_FILE_NAME_LENGTH);
9927 	KPath toNameBuffer(B_FILE_NAME_LENGTH);
9928 	if (fromNameBuffer.InitCheck() != B_OK || toNameBuffer.InitCheck() != B_OK)
9929 		return B_NO_MEMORY;
9930 
9931 	char* fromName = fromNameBuffer.LockBuffer();
9932 	char* toName = toNameBuffer.LockBuffer();
9933 
9934 	status_t status = user_copy_name(fromName, userFromName, B_FILE_NAME_LENGTH);
9935 	if (status != B_OK)
9936 		return status;
9937 	status = user_copy_name(toName, userToName, B_FILE_NAME_LENGTH);
9938 	if (status != B_OK)
9939 		return status;
9940 
9941 	return attr_rename(fromFile, fromName, toFile, toName, false);
9942 }
9943 
9944 
9945 int
9946 _user_open_index_dir(dev_t device)
9947 {
9948 	return index_dir_open(device, false);
9949 }
9950 
9951 
9952 status_t
9953 _user_create_index(dev_t device, const char* userName, uint32 type,
9954 	uint32 flags)
9955 {
9956 	char name[B_FILE_NAME_LENGTH];
9957 
9958 	if (!IS_USER_ADDRESS(userName))
9959 		return B_BAD_ADDRESS;
9960 	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9961 	if (status != B_OK)
9962 		return status;
9963 
9964 	return index_create(device, name, type, flags, false);
9965 }
9966 
9967 
9968 status_t
9969 _user_read_index_stat(dev_t device, const char* userName, struct stat* userStat)
9970 {
9971 	char name[B_FILE_NAME_LENGTH];
9972 	struct stat stat;
9973 	status_t status;
9974 
9975 	if (!IS_USER_ADDRESS(userName) || !IS_USER_ADDRESS(userStat))
9976 		return B_BAD_ADDRESS;
9977 	status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9978 	if (status != B_OK)
9979 		return status;
9980 
9981 	status = index_name_read_stat(device, name, &stat, false);
9982 	if (status == B_OK) {
9983 		if (user_memcpy(userStat, &stat, sizeof(stat)) != B_OK)
9984 			return B_BAD_ADDRESS;
9985 	}
9986 
9987 	return status;
9988 }
9989 
9990 
9991 status_t
9992 _user_remove_index(dev_t device, const char* userName)
9993 {
9994 	char name[B_FILE_NAME_LENGTH];
9995 
9996 	if (!IS_USER_ADDRESS(userName))
9997 		return B_BAD_ADDRESS;
9998 	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9999 	if (status != B_OK)
10000 		return status;
10001 
10002 	return index_remove(device, name, false);
10003 }
10004 
10005 
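/*!	Copies the current working directory of the calling team into
	\a userBuffer, clamping \a size to kMaxPathLength first.
*/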
10006 status_t
10007 _user_getcwd(char* userBuffer, size_t size)
10008 {
10009 	if (size == 0)
10010 		return B_BAD_VALUE;
10011 	if (!IS_USER_ADDRESS(userBuffer))
10012 		return B_BAD_ADDRESS;
10013 
10014 	if (size > kMaxPathLength)
10015 		size = kMaxPathLength;
10016 
10017 	KPath pathBuffer(size);
10018 	if (pathBuffer.InitCheck() != B_OK)
10019 		return B_NO_MEMORY;
10020 
10021 	TRACE(("user_getcwd: buf %p, %" B_PRIuSIZE "\n", userBuffer, size));
10022 
10023 	char* path = pathBuffer.LockBuffer();
10024 
10025 	status_t status = get_cwd(path, size, false);
10026 	if (status != B_OK)
10027 		return status;
10028 
10029 	// Copy back the result
10030 	if (user_strlcpy(userBuffer, path, size) < B_OK)
10031 		return B_BAD_ADDRESS;
10032 
10033 	return B_OK;
10034 }
10035 
10036 
10037 status_t
10038 _user_setcwd(int fd, const char* userPath)
10039 {
10040 	TRACE(("user_setcwd: path = %p\n", userPath));
10041 
10042 	KPath pathBuffer;
10043 	if (pathBuffer.InitCheck() != B_OK)
10044 		return B_NO_MEMORY;
10045 
10046 	char* path = pathBuffer.LockBuffer();
10047 
10048 	if (userPath != NULL) {
10049 		if (!IS_USER_ADDRESS(userPath))
10050 			return B_BAD_ADDRESS;
10051 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
10052 		if (status != B_OK)
10053 			return status;
10054 	}
10055 
10056 	return set_cwd(fd, userPath != NULL ? path : NULL, false);
10057 }
10058 
10059 
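/*!	chroot() backend: resolves \a userPath to a vnode and makes it the root
	of the calling team's I/O context. Only callers with an effective user
	ID of 0 are allowed to do this. The swap happens under
	sIOContextRootLock, and the reference to the previous root vnode is
	released afterwards. (A NULL \a userPath leaves the kernel buffer empty,
	which path resolution will presumably reject.)
*/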
10060 status_t
10061 _user_change_root(const char* userPath)
10062 {
10063 	// only root is allowed to chroot()
10064 	if (geteuid() != 0)
10065 		return B_NOT_ALLOWED;
10066 
10067 	// alloc path buffer
10068 	KPath pathBuffer;
10069 	if (pathBuffer.InitCheck() != B_OK)
10070 		return B_NO_MEMORY;
10071 
10072 	// copy userland path to kernel
10073 	char* path = pathBuffer.LockBuffer();
10074 	if (userPath != NULL) {
10075 		if (!IS_USER_ADDRESS(userPath))
10076 			return B_BAD_ADDRESS;
10077 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
10078 		if (status != B_OK)
10079 			return status;
10080 	}
10081 
10082 	// get the vnode
10083 	struct vnode* vnode;
10084 	status_t status = path_to_vnode(path, true, &vnode, NULL, false);
10085 	if (status != B_OK)
10086 		return status;
10087 
10088 	// set the new root
10089 	struct io_context* context = get_current_io_context(false);
10090 	mutex_lock(&sIOContextRootLock);
10091 	struct vnode* oldRoot = context->root;
10092 	context->root = vnode;
10093 	mutex_unlock(&sIOContextRootLock);
10094 
10095 	put_vnode(oldRoot);
10096 
10097 	return B_OK;
10098 }
10099 
10100 
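/*!	Opens a query on the volume given by \a device, e.g. with a
	(hypothetical) query string like "name==\"*.cpp\"". The query is copied
	to a kernel heap buffer, with its length capped at 64 KiB as a safety
	measure; \a flags, \a port, and \a token are passed through to
	query_open(), where -- for live queries -- port and token presumably
	identify the target for update notifications.
*/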
10101 int
10102 _user_open_query(dev_t device, const char* userQuery, size_t queryLength,
10103 	uint32 flags, port_id port, int32 token)
10104 {
10105 	char* query;
10106 
10107 	if (device < 0 || userQuery == NULL || queryLength == 0)
10108 		return B_BAD_VALUE;
10109 
10110 	if (!IS_USER_ADDRESS(userQuery))
10111 		return B_BAD_ADDRESS;
10112 
10113 	// this is a safety restriction
10114 	if (queryLength >= 65536)
10115 		return B_NAME_TOO_LONG;
10116 
10117 	query = (char*)malloc(queryLength + 1);
10118 	if (query == NULL)
10119 		return B_NO_MEMORY;
10120 	if (user_strlcpy(query, userQuery, queryLength + 1) < B_OK) {
10121 		free(query);
10122 		return B_BAD_ADDRESS;
10123 	}
10124 
10125 	int fd = query_open(device, query, flags, port, token, false);
10126 
10127 	free(query);
10128 	return fd;
10129 }
10130 
10131 
10132 #include "vfs_request_io.cpp"
10133