xref: /haiku/src/system/kernel/fs/vfs.cpp (revision 2897df967633aab846ff4917b53e2af7d1e54eeb)
/*
 * Copyright 2005-2013, Ingo Weinhold, ingo_weinhold@gmx.de.
 * Copyright 2002-2018, Axel Dörfler, axeld@pinc-software.de.
 * Distributed under the terms of the MIT License.
 *
 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
 * Distributed under the terms of the NewOS License.
 */


/*! Virtual File System and File System Interface Layer */


#include <ctype.h>
#include <fcntl.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/file.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <unistd.h>

#include <fs_attr.h>
#include <fs_info.h>
#include <fs_interface.h>
#include <fs_volume.h>
#include <OS.h>
#include <StorageDefs.h>

#include <AutoDeleter.h>
#include <block_cache.h>
#include <boot/kernel_args.h>
#include <debug_heap.h>
#include <disk_device_manager/KDiskDevice.h>
#include <disk_device_manager/KDiskDeviceManager.h>
#include <disk_device_manager/KDiskDeviceUtils.h>
#include <disk_device_manager/KDiskSystem.h>
#include <fd.h>
#include <file_cache.h>
#include <fs/node_monitor.h>
#include <KPath.h>
#include <lock.h>
#include <low_resource_manager.h>
#include <syscalls.h>
#include <syscall_restart.h>
#include <tracing.h>
#include <util/atomic.h>
#include <util/AutoLock.h>
#include <util/DoublyLinkedList.h>
#include <vfs.h>
#include <vm/vm.h>
#include <vm/VMCache.h>
#include <wait_for_objects.h>

#include "EntryCache.h"
#include "fifo.h"
#include "IORequest.h"
#include "unused_vnodes.h"
#include "vfs_tracing.h"
#include "Vnode.h"
#include "../cache/vnode_store.h"


//#define TRACE_VFS
#ifdef TRACE_VFS
#	define TRACE(x) dprintf x
#	define FUNCTION(x) dprintf x
#else
#	define TRACE(x) ;
#	define FUNCTION(x) ;
#endif

#define ADD_DEBUGGER_COMMANDS


#define HAS_FS_CALL(vnode, op)			(vnode->ops->op != NULL)
#define HAS_FS_MOUNT_CALL(mount, op)	(mount->volume->ops->op != NULL)

#if KDEBUG
#	define FS_CALL(vnode, op, params...) \
		( HAS_FS_CALL(vnode, op) ? \
			vnode->ops->op(vnode->mount->volume, vnode, params) \
			: (panic("FS_CALL op " #op " is NULL"), 0))
#	define FS_CALL_NO_PARAMS(vnode, op) \
		( HAS_FS_CALL(vnode, op) ? \
			vnode->ops->op(vnode->mount->volume, vnode) \
			: (panic("FS_CALL_NO_PARAMS op " #op " is NULL"), 0))
#	define FS_MOUNT_CALL(mount, op, params...) \
		( HAS_FS_MOUNT_CALL(mount, op) ? \
			mount->volume->ops->op(mount->volume, params) \
			: (panic("FS_MOUNT_CALL op " #op " is NULL"), 0))
#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
		( HAS_FS_MOUNT_CALL(mount, op) ? \
			mount->volume->ops->op(mount->volume) \
			: (panic("FS_MOUNT_CALL_NO_PARAMS op " #op " is NULL"), 0))
#else
#	define FS_CALL(vnode, op, params...) \
			vnode->ops->op(vnode->mount->volume, vnode, params)
#	define FS_CALL_NO_PARAMS(vnode, op) \
			vnode->ops->op(vnode->mount->volume, vnode)
#	define FS_MOUNT_CALL(mount, op, params...) \
			mount->volume->ops->op(mount->volume, params)
#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
			mount->volume->ops->op(mount->volume)
#endif
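
// For illustration: under KDEBUG a dispatch such as
//
//     status_t status = FS_CALL(vnode, read_stat, &stat);
//
// expands to vnode->ops->read_stat(vnode->mount->volume, vnode, &stat),
// guarded by a panic() when the file system leaves the hook NULL. Code that
// wants to degrade gracefully instead checks HAS_FS_CALL(vnode, read_stat)
// first, as e.g. normalize_flock() below does.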


const static size_t kMaxPathLength = 65536;
	// The absolute maximum path length (for getcwd()); this does not
	// depend on PATH_MAX.


typedef DoublyLinkedList<vnode> VnodeList;

/*!	\brief Structure to manage a mounted file system

	Note: The root_vnode and root_vnode->covers fields (what others?) are
	initialized in fs_mount() and not changed afterwards. That is, as soon
	as the mount is mounted and it is made sure it won't be unmounted
	(e.g. by holding a reference to a vnode of that mount), (read) access
	to those fields is always safe, even without additional locking. Moreover,
	while mounted the mount holds a reference to the root_vnode->covers vnode,
	thus making the access path vnode->mount->root_vnode->covers->mount->...
	safe if a reference to vnode is held (note that for the root mount
	root_vnode->covers is NULL, though).
*/
struct fs_mount {
	fs_mount()
		:
		volume(NULL),
		device_name(NULL)
	{
		recursive_lock_init(&rlock, "mount rlock");
	}

	~fs_mount()
	{
		recursive_lock_destroy(&rlock);
		free(device_name);

		while (volume) {
			fs_volume* superVolume = volume->super_volume;

			if (volume->file_system != NULL)
				put_module(volume->file_system->info.name);

			free(volume->file_system_name);
			free(volume);
			volume = superVolume;
		}
	}

	struct fs_mount* next;
	dev_t			id;
	fs_volume*		volume;
	char*			device_name;
	recursive_lock	rlock;	// guards the vnodes list
		// TODO: Make this a mutex! It is never used recursively.
	struct vnode*	root_vnode;
	struct vnode*	covers_vnode;	// immutable
	KPartition*		partition;
	VnodeList		vnodes;
	EntryCache		entry_cache;
	bool			unmounting;
	bool			owns_file_device;
};


namespace {

struct advisory_lock : public DoublyLinkedListLinkImpl<advisory_lock> {
	list_link		link;
	void*			bound_to;
	team_id			team;
	pid_t			session;
	off_t			start;
	off_t			end;
	bool			shared;
};

typedef DoublyLinkedList<advisory_lock> LockList;

} // namespace


struct advisory_locking {
	sem_id			lock;
	sem_id			wait_sem;
	LockList		locks;

	advisory_locking()
		:
		lock(-1),
		wait_sem(-1)
	{
	}

	~advisory_locking()
	{
		if (lock >= 0)
			delete_sem(lock);
		if (wait_sem >= 0)
			delete_sem(wait_sem);
	}
};

/*!	\brief Guards sMountsTable.

	The holder is allowed read/write access to sMountsTable.
	Manipulation of the fs_mount structures themselves
	(and their destruction) requires different locks though.
*/
static mutex sMountMutex = MUTEX_INITIALIZER("vfs_mount_lock");

/*!	\brief Guards mount/unmount operations.

	fs_mount() and fs_unmount() hold the lock during their whole operation.
	That is, holding the lock ensures that no FS is mounted/unmounted. In
	particular this means that
	- sMountsTable will not be modified,
	- the fields of the fs_mount structures in sMountsTable that are
	  immutable after initialization will not be modified.

	The thread trying to lock the lock must not hold sVnodeLock or
	sMountMutex.
*/
static recursive_lock sMountOpLock;

/*!	\brief Guards sVnodeTable.

	The holder is allowed read/write access to sVnodeTable and to
	any unbusy vnode in that table, save for the immutable fields (device, id,
	private_node, mount) to which only read-only access is allowed.
	The mutable fields advisory_locking, mandatory_locked_by, and ref_count, as
	well as the busy, removed, unused flags, and the vnode's type may also be
	written while holding a read lock to sVnodeLock *and* having the vnode
	locked. Write access to covered_by and covers requires write locking
	sVnodeLock.

	The thread trying to acquire the lock must not hold sMountMutex.
	You must not hold this lock when calling create_sem(), as this might call
	vfs_free_unused_vnodes() and thus cause a deadlock.
*/
static rw_lock sVnodeLock = RW_LOCK_INITIALIZER("vfs_vnode_lock");
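
// A sketch of the resulting locking pattern, as used throughout this file
// for the mutable vnode flags:
//
//     ReadLocker locker(sVnodeLock);
//     AutoLocker<Vnode> nodeLocker(vnode);
//     vnode->SetBusy(true);	// read lock + vnode lock suffice here
//
// Re-linking covered_by/covers instead requires
// rw_lock_write_lock(&sVnodeLock).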

/*!	\brief Guards io_context::root.

	Must be held when setting or getting the io_context::root field.
	The only operation allowed while holding this lock besides getting or
	setting the field is inc_vnode_ref_count() on io_context::root.
*/
static mutex sIOContextRootLock = MUTEX_INITIALIZER("io_context::root lock");


namespace {

struct vnode_hash_key {
	dev_t	device;
	ino_t	vnode;
};

struct VnodeHash {
	typedef vnode_hash_key	KeyType;
	typedef	struct vnode	ValueType;

#define VHASH(mountid, vnodeid) \
	(((uint32)((vnodeid) >> 32) + (uint32)(vnodeid)) ^ (uint32)(mountid))

	size_t HashKey(KeyType key) const
	{
		return VHASH(key.device, key.vnode);
	}

	size_t Hash(ValueType* vnode) const
	{
		return VHASH(vnode->device, vnode->id);
	}

#undef VHASH

	bool Compare(KeyType key, ValueType* vnode) const
	{
		return vnode->device == key.device && vnode->id == key.vnode;
	}

	ValueType*& GetLink(ValueType* value) const
	{
		return value->next;
	}
};

typedef BOpenHashTable<VnodeHash> VnodeTable;


struct MountHash {
	typedef dev_t			KeyType;
	typedef	struct fs_mount	ValueType;

	size_t HashKey(KeyType key) const
	{
		return key;
	}

	size_t Hash(ValueType* mount) const
	{
		return mount->id;
	}

	bool Compare(KeyType key, ValueType* mount) const
	{
		return mount->id == key;
	}

	ValueType*& GetLink(ValueType* value) const
	{
		return value->next;
	}
};

typedef BOpenHashTable<MountHash> MountTable;

} // namespace


#define VNODE_HASH_TABLE_SIZE 1024
static VnodeTable* sVnodeTable;
static struct vnode* sRoot;

#define MOUNTS_HASH_TABLE_SIZE 16
static MountTable* sMountsTable;
static dev_t sNextMountID = 1;

#define MAX_TEMP_IO_VECS 8

// How long to wait for busy vnodes (10s)
#define BUSY_VNODE_RETRIES 2000
#define BUSY_VNODE_DELAY 5000

mode_t __gUmask = 022;

/* function declarations */

static void free_unused_vnodes();

// file descriptor operation prototypes
static status_t file_read(struct file_descriptor* descriptor, off_t pos,
	void* buffer, size_t* _bytes);
static status_t file_write(struct file_descriptor* descriptor, off_t pos,
	const void* buffer, size_t* _bytes);
static off_t file_seek(struct file_descriptor* descriptor, off_t pos,
	int seekType);
static void file_free_fd(struct file_descriptor* descriptor);
static status_t file_close(struct file_descriptor* descriptor);
static status_t file_select(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync);
static status_t file_deselect(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync);
static status_t dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t dir_read(struct io_context* ioContext, struct vnode* vnode,
	void* cookie, struct dirent* buffer, size_t bufferSize, uint32* _count);
static status_t dir_rewind(struct file_descriptor* descriptor);
static void dir_free_fd(struct file_descriptor* descriptor);
static status_t dir_close(struct file_descriptor* descriptor);
static status_t attr_dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t attr_dir_rewind(struct file_descriptor* descriptor);
static void attr_dir_free_fd(struct file_descriptor* descriptor);
static status_t attr_dir_close(struct file_descriptor* descriptor);
static status_t attr_read(struct file_descriptor* descriptor, off_t pos,
	void* buffer, size_t* _bytes);
static status_t attr_write(struct file_descriptor* descriptor, off_t pos,
	const void* buffer, size_t* _bytes);
static off_t attr_seek(struct file_descriptor* descriptor, off_t pos,
	int seekType);
static void attr_free_fd(struct file_descriptor* descriptor);
static status_t attr_close(struct file_descriptor* descriptor);
static status_t attr_read_stat(struct file_descriptor* descriptor,
	struct stat* statData);
static status_t attr_write_stat(struct file_descriptor* descriptor,
	const struct stat* stat, int statMask);
static status_t index_dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t index_dir_rewind(struct file_descriptor* descriptor);
static void index_dir_free_fd(struct file_descriptor* descriptor);
static status_t index_dir_close(struct file_descriptor* descriptor);
static status_t query_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t query_rewind(struct file_descriptor* descriptor);
static void query_free_fd(struct file_descriptor* descriptor);
static status_t query_close(struct file_descriptor* descriptor);

static status_t common_ioctl(struct file_descriptor* descriptor, ulong op,
	void* buffer, size_t length);
static status_t common_read_stat(struct file_descriptor* descriptor,
	struct stat* statData);
static status_t common_write_stat(struct file_descriptor* descriptor,
	const struct stat* statData, int statMask);
static status_t common_path_read_stat(int fd, char* path, bool traverseLeafLink,
	struct stat* stat, bool kernel);

static status_t vnode_path_to_vnode(struct vnode* vnode, char* path,
	bool traverseLeafLink, int count, bool kernel,
	struct vnode** _vnode, ino_t* _parentID);
static status_t dir_vnode_to_path(struct vnode* vnode, char* buffer,
	size_t bufferSize, bool kernel);
static status_t fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
	struct vnode** _vnode, ino_t* _parentID, bool kernel);
static void inc_vnode_ref_count(struct vnode* vnode);
static status_t dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree,
	bool reenter);
static inline void put_vnode(struct vnode* vnode);
static status_t fs_unmount(char* path, dev_t mountID, uint32 flags,
	bool kernel);
static int open_vnode(struct vnode* vnode, int openMode, bool kernel);


static struct fd_ops sFileOps = {
	file_read,
	file_write,
	file_seek,
	common_ioctl,
	NULL,		// set_flags
	file_select,
	file_deselect,
	NULL,		// read_dir()
	NULL,		// rewind_dir()
	common_read_stat,
	common_write_stat,
	file_close,
	file_free_fd
};

static struct fd_ops sDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	dir_read,
	dir_rewind,
	common_read_stat,
	common_write_stat,
	dir_close,
	dir_free_fd
};

static struct fd_ops sAttributeDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	attr_dir_read,
	attr_dir_rewind,
	common_read_stat,
	common_write_stat,
	attr_dir_close,
	attr_dir_free_fd
};

static struct fd_ops sAttributeOps = {
	attr_read,
	attr_write,
	attr_seek,
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	NULL,		// read_dir()
	NULL,		// rewind_dir()
	attr_read_stat,
	attr_write_stat,
	attr_close,
	attr_free_fd
};

static struct fd_ops sIndexDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	index_dir_read,
	index_dir_rewind,
	NULL,		// read_stat()
	NULL,		// write_stat()
	index_dir_close,
	index_dir_free_fd
};

#if 0
static struct fd_ops sIndexOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	NULL,		// dir_read()
	NULL,		// dir_rewind()
	index_read_stat,	// read_stat()
	NULL,		// write_stat()
	NULL,		// dir_close()
	NULL		// free_fd()
};
#endif

static struct fd_ops sQueryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	query_read,
	query_rewind,
	NULL,		// read_stat()
	NULL,		// write_stat()
	query_close,
	query_free_fd
};


namespace {

class VNodePutter {
public:
	VNodePutter(struct vnode* vnode = NULL) : fVNode(vnode) {}

	~VNodePutter()
	{
		Put();
	}

	void SetTo(struct vnode* vnode)
	{
		Put();
		fVNode = vnode;
	}

	void Put()
	{
		if (fVNode) {
			put_vnode(fVNode);
			fVNode = NULL;
		}
	}

	struct vnode* Detach()
	{
		struct vnode* vnode = fVNode;
		fVNode = NULL;
		return vnode;
	}

private:
	struct vnode* fVNode;
};
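
// Typical usage (a sketch; get_vnode() is defined later in this file):
//
//     struct vnode* vnode;
//     status_t status = get_vnode(mountID, vnodeID, &vnode, true, false);
//     if (status != B_OK)
//         return status;
//     VNodePutter vnodePutter(vnode);
//     // ... use vnode; put_vnode() runs on scope exit, unless Detach()
//     // transfers the reference to the caller.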


class FDCloser {
public:
	FDCloser() : fFD(-1), fKernel(true) {}

	FDCloser(int fd, bool kernel) : fFD(fd), fKernel(kernel) {}

	~FDCloser()
	{
		Close();
	}

	void SetTo(int fd, bool kernel)
	{
		Close();
		fFD = fd;
		fKernel = kernel;
	}

	void Close()
	{
		if (fFD >= 0) {
			if (fKernel)
				_kern_close(fFD);
			else
				_user_close(fFD);
			fFD = -1;
		}
	}

	int Detach()
	{
		int fd = fFD;
		fFD = -1;
		return fd;
	}

private:
	int		fFD;
	bool	fKernel;
};
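
// The analogous pattern for file descriptors (sketch):
//
//     int fd = open_vnode(vnode, openMode, kernel);
//     if (fd < 0)
//         return fd;
//     FDCloser fdCloser(fd, kernel);
//     // ... on failure the FD is closed via _kern_close()/_user_close();
//     // on success fdCloser.Detach() hands it to the caller.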

} // namespace


#if VFS_PAGES_IO_TRACING

namespace VFSPagesIOTracing {

class PagesIOTraceEntry : public AbstractTraceEntry {
protected:
	PagesIOTraceEntry(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		fVnode(vnode),
		fMountID(vnode->mount->id),
		fNodeID(vnode->id),
		fCookie(cookie),
		fPos(pos),
		fCount(count),
		fFlags(flags),
		fBytesRequested(bytesRequested),
		fStatus(status),
		fBytesTransferred(bytesTransferred)
	{
		fVecs = (generic_io_vec*)alloc_tracing_buffer_memcpy(vecs,
			sizeof(generic_io_vec) * count, false);
	}

	void AddDump(TraceOutput& out, const char* mode)
	{
		out.Print("vfs pages io %5s: vnode: %p (%" B_PRId32 ", %" B_PRId64 "), "
			"cookie: %p, pos: %" B_PRIdOFF ", size: %" B_PRIu64 ", vecs: {",
			mode, fVnode, fMountID, fNodeID, fCookie, fPos,
			(uint64)fBytesRequested);

		if (fVecs != NULL) {
			for (uint32 i = 0; i < fCount; i++) {
				if (i > 0)
					out.Print(", ");
				out.Print("(%" B_PRIx64 ", %" B_PRIu64 ")", (uint64)fVecs[i].base,
					(uint64)fVecs[i].length);
			}
		}

		out.Print("}, flags: %#" B_PRIx32 " -> status: %#" B_PRIx32 ", "
			"transferred: %" B_PRIu64, fFlags, fStatus,
			(uint64)fBytesTransferred);
	}

protected:
	struct vnode*	fVnode;
	dev_t			fMountID;
	ino_t			fNodeID;
	void*			fCookie;
	off_t			fPos;
	generic_io_vec*	fVecs;
	uint32			fCount;
	uint32			fFlags;
	generic_size_t	fBytesRequested;
	status_t		fStatus;
	generic_size_t	fBytesTransferred;
};


class ReadPages : public PagesIOTraceEntry {
public:
	ReadPages(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
			bytesRequested, status, bytesTransferred)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		PagesIOTraceEntry::AddDump(out, "read");
	}
};


class WritePages : public PagesIOTraceEntry {
public:
	WritePages(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
			bytesRequested, status, bytesTransferred)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		PagesIOTraceEntry::AddDump(out, "write");
	}
};

}	// namespace VFSPagesIOTracing

#	define TPIO(x) new(std::nothrow) VFSPagesIOTracing::x;
#else
#	define TPIO(x) ;
#endif	// VFS_PAGES_IO_TRACING


/*! Finds the mounted device (the fs_mount structure) with the given ID.
	Note, you must hold the sMountMutex lock when you call this function.
*/
static struct fs_mount*
find_mount(dev_t id)
{
	ASSERT_LOCKED_MUTEX(&sMountMutex);

	return sMountsTable->Lookup(id);
}


static status_t
get_mount(dev_t id, struct fs_mount** _mount)
{
	struct fs_mount* mount;

	ReadLocker nodeLocker(sVnodeLock);
	MutexLocker mountLocker(sMountMutex);

	mount = find_mount(id);
	if (mount == NULL)
		return B_BAD_VALUE;

	struct vnode* rootNode = mount->root_vnode;
	if (mount->unmounting || rootNode == NULL || rootNode->IsBusy()
		|| rootNode->ref_count == 0) {
		// might have been called during a mount/unmount operation
		return B_BUSY;
	}

	inc_vnode_ref_count(rootNode);
	*_mount = mount;
	return B_OK;
}


static void
put_mount(struct fs_mount* mount)
{
	if (mount)
		put_vnode(mount->root_vnode);
}


/*!	Tries to open the specified file system module.
	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1".
	Returns a pointer to the file system module interface, or NULL if it
	could not open the module.
*/
static file_system_module_info*
get_file_system(const char* fsName)
{
	char name[B_FILE_NAME_LENGTH];
	if (strncmp(fsName, "file_systems/", strlen("file_systems/"))) {
		// construct the module name if we didn't get one
		// (we currently support only one API version)
		snprintf(name, sizeof(name), "file_systems/%s/v1", fsName);
		fsName = NULL;
	}

	file_system_module_info* info;
	if (get_module(fsName ? fsName : name, (module_info**)&info) != B_OK)
		return NULL;

	return info;
}


/*!	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1"
	and returns a compatible fs_info.fsh_name name ("bfs" in both cases).
	The name is allocated for you, and you have to free() it when you're
	done with it.
	Returns NULL if the required memory is not available.
*/
static char*
get_file_system_name(const char* fsName)
{
	const size_t length = strlen("file_systems/");

	if (strncmp(fsName, "file_systems/", length)) {
		// the name already seems to be the module's file name
		return strdup(fsName);
	}

	fsName += length;
	const char* end = strchr(fsName, '/');
	if (end == NULL) {
		// this doesn't seem to be a valid name, but well...
		return strdup(fsName);
	}

	// cut off the trailing /v1

	char* name = (char*)malloc(end + 1 - fsName);
	if (name == NULL)
		return NULL;

	strlcpy(name, fsName, end + 1 - fsName);
	return name;
}
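
// For example, both get_file_system_name("file_systems/bfs/v1") and
// get_file_system_name("bfs") yield a newly allocated "bfs".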


/*!	Accepts a list of file system names separated by a colon, one for each
	layer, and returns the file system name for the specified layer.
	The name is allocated for you, and you have to free() it when you're
	done with it.
	Returns NULL if the required memory is not available or if there is no
	name for the specified layer.
*/
static char*
get_file_system_name_for_layer(const char* fsNames, int32 layer)
{
	while (layer >= 0) {
		const char* end = strchr(fsNames, ':');
		if (end == NULL) {
			if (layer == 0)
				return strdup(fsNames);
			return NULL;
		}

		if (layer == 0) {
			size_t length = end - fsNames + 1;
			char* result = (char*)malloc(length);
			if (result != NULL)
				strlcpy(result, fsNames, length);
			return result;
		}

		fsNames = end + 1;
		layer--;
	}

	return NULL;
}
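
// E.g. given the (hypothetical) list "bfs:overlay", layer 0 yields "bfs",
// layer 1 yields "overlay", and layer 2 yields NULL.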


static void
add_vnode_to_mount_list(struct vnode* vnode, struct fs_mount* mount)
{
	RecursiveLocker _(mount->rlock);
	mount->vnodes.Add(vnode);
}


static void
remove_vnode_from_mount_list(struct vnode* vnode, struct fs_mount* mount)
{
	RecursiveLocker _(mount->rlock);
	mount->vnodes.Remove(vnode);
}


/*!	\brief Looks up a vnode by mount and node ID in the sVnodeTable.

	The caller must hold the sVnodeLock (read lock at least).

	\param mountID the mount ID.
	\param vnodeID the node ID.

	\return The vnode structure, if it was found in the hash table, \c NULL
			otherwise.
*/
static struct vnode*
lookup_vnode(dev_t mountID, ino_t vnodeID)
{
	struct vnode_hash_key key;

	key.device = mountID;
	key.vnode = vnodeID;

	return sVnodeTable->Lookup(key);
}


/*!	\brief Checks whether or not a busy vnode should be waited for (again).

	This will also wait for BUSY_VNODE_DELAY before returning if one should
	still wait for the vnode to become unbusy.

	\return \c true if one should retry, \c false if not.
*/
static bool
retry_busy_vnode(int32& tries, dev_t mountID, ino_t vnodeID)
{
	if (--tries < 0) {
		// vnode doesn't seem to become unbusy
		dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO
			" is not becoming unbusy!\n", mountID, vnodeID);
		return false;
	}
	snooze(BUSY_VNODE_DELAY);
	return true;
}
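
// Callers use it as a bounded wait loop (sketch, mirroring get_vnode()
// below); the retry budget amounts to BUSY_VNODE_RETRIES * BUSY_VNODE_DELAY
// = 10 seconds:
//
//     int32 tries = BUSY_VNODE_RETRIES;
//     while (vnode != NULL && vnode->IsBusy()) {
//         if (!retry_busy_vnode(tries, mountID, vnodeID))
//             return B_BUSY;
//         // ... redo the lookup ...
//     }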


/*!	Creates a new vnode with the given mount and node ID.
	If the node already exists, it is returned instead and no new node is
	created. In either case -- but not if an error occurs -- the function write
	locks \c sVnodeLock and keeps it locked for the caller when returning. On
	error the lock is not held on return.

	\param mountID The mount ID.
	\param vnodeID The vnode ID.
	\param _vnode Will be set to the new vnode on success.
	\param _nodeCreated Will be set to \c true when the returned vnode has
		been newly created, \c false when it already existed. Will not be
		changed on error.
	\return \c B_OK, when the vnode was successfully created and inserted or
		a node with the given ID was found, \c B_NO_MEMORY or
		\c B_ENTRY_NOT_FOUND on error.
*/
static status_t
create_new_vnode_and_lock(dev_t mountID, ino_t vnodeID, struct vnode*& _vnode,
	bool& _nodeCreated)
{
	FUNCTION(("create_new_vnode_and_lock()\n"));

	struct vnode* vnode = (struct vnode*)malloc(sizeof(struct vnode));
	if (vnode == NULL)
		return B_NO_MEMORY;

	// initialize basic values
	memset(vnode, 0, sizeof(struct vnode));
	vnode->device = mountID;
	vnode->id = vnodeID;
	vnode->ref_count = 1;
	vnode->SetBusy(true);

	// look up the node -- it might have been added by someone else in the
	// meantime
	rw_lock_write_lock(&sVnodeLock);
	struct vnode* existingVnode = lookup_vnode(mountID, vnodeID);
	if (existingVnode != NULL) {
		free(vnode);
		_vnode = existingVnode;
		_nodeCreated = false;
		return B_OK;
	}

	// get the mount structure
	mutex_lock(&sMountMutex);
	vnode->mount = find_mount(mountID);
	if (!vnode->mount || vnode->mount->unmounting) {
		mutex_unlock(&sMountMutex);
		rw_lock_write_unlock(&sVnodeLock);
		free(vnode);
		return B_ENTRY_NOT_FOUND;
	}

	// add the vnode to the mount's node list and the hash table
	sVnodeTable->Insert(vnode);
	add_vnode_to_mount_list(vnode, vnode->mount);

	mutex_unlock(&sMountMutex);

	_vnode = vnode;
	_nodeCreated = true;

	// keep the vnode lock locked
	return B_OK;
}


/*!	Frees the vnode and all resources it has acquired, and removes
	it from the vnode hash as well as from its mount structure.
	Will also make sure that any cache modifications are written back.
*/
static void
free_vnode(struct vnode* vnode, bool reenter)
{
	ASSERT_PRINT(vnode->ref_count == 0 && vnode->IsBusy(), "vnode: %p\n",
		vnode);
	ASSERT_PRINT(vnode->advisory_locking == NULL, "vnode: %p\n", vnode);

	// write back any changes in this vnode's cache -- but only
	// if the vnode won't be deleted, in which case the changes
	// will be discarded

	if (!vnode->IsRemoved() && HAS_FS_CALL(vnode, fsync))
		FS_CALL_NO_PARAMS(vnode, fsync);

	// Note: If this vnode has a cache attached, there will still be two
	// references to that cache at this point. The last one belongs to the vnode
	// itself (cf. vfs_get_vnode_cache()) and one belongs to the node's file
	// cache. Each but the last reference to a cache also includes a reference
	// to the vnode. The file cache, however, released its reference (cf.
	// file_cache_create()), so that this vnode's ref count could drop to 0 at
	// all. Deleting the file cache now will cause the next-to-last cache
	// reference to be released, which will also release a (no longer
	// existing) vnode reference. To avoid problems, we set the vnode's ref
	// count, so that it will neither become negative nor 0.
	vnode->ref_count = 2;

	if (!vnode->IsUnpublished()) {
		if (vnode->IsRemoved())
			FS_CALL(vnode, remove_vnode, reenter);
		else
			FS_CALL(vnode, put_vnode, reenter);
	}

	// If the vnode has a VMCache attached, make sure that it won't try to get
	// another reference via VMVnodeCache::AcquireUnreferencedStoreRef(). As
	// long as the vnode is busy and in the hash, that won't happen, but as
	// soon as we've removed it from the hash, it could reload the vnode -- with
	// a new cache attached!
	if (vnode->cache != NULL)
		((VMVnodeCache*)vnode->cache)->VnodeDeleted();

	// The file system has removed the resources of the vnode now, so we can
	// make it available again (by removing the busy vnode from the hash).
	rw_lock_write_lock(&sVnodeLock);
	sVnodeTable->Remove(vnode);
	rw_lock_write_unlock(&sVnodeLock);

	// if we have a VMCache attached, remove it
	if (vnode->cache)
		vnode->cache->ReleaseRef();

	vnode->cache = NULL;

	remove_vnode_from_mount_list(vnode, vnode->mount);

	free(vnode);
}


/*!	\brief Decrements the reference counter of the given vnode and deletes it,
	if the counter dropped to 0.

	The caller must, of course, own a reference to the vnode to call this
	function.
	The caller must not hold the sVnodeLock or the sMountMutex.

	\param vnode the vnode.
	\param alwaysFree don't move this vnode into the unused list, but really
		   delete it if possible.
	\param reenter \c true, if this function is called (indirectly) from within
		   a file system. This will be passed to file system hooks only.
	\return \c B_OK, if everything went fine, an error code otherwise.
*/
static status_t
dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree, bool reenter)
{
	ReadLocker locker(sVnodeLock);
	AutoLocker<Vnode> nodeLocker(vnode);

	int32 oldRefCount = atomic_add(&vnode->ref_count, -1);

	ASSERT_PRINT(oldRefCount > 0, "vnode %p\n", vnode);

	TRACE(("dec_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
		vnode->ref_count));

	if (oldRefCount != 1)
		return B_OK;

	if (vnode->IsBusy())
		panic("dec_vnode_ref_count: called on busy vnode %p\n", vnode);

	bool freeNode = false;
	bool freeUnusedNodes = false;

	// Just insert the vnode into an unused list if we don't need
	// to delete it
	if (vnode->IsRemoved() || alwaysFree) {
		vnode_to_be_freed(vnode);
		vnode->SetBusy(true);
		freeNode = true;
	} else
		freeUnusedNodes = vnode_unused(vnode);

	nodeLocker.Unlock();
	locker.Unlock();

	if (freeNode)
		free_vnode(vnode, reenter);
	else if (freeUnusedNodes)
		free_unused_vnodes();

	return B_OK;
}


/*!	\brief Increments the reference counter of the given vnode.

	The caller must make sure that the node isn't deleted while this function
	is called. This can be done either:
	- by ensuring that a reference to the node exists and remains in existence,
	  or
	- by holding the vnode's lock (which also requires read locking sVnodeLock)
	  or by holding sVnodeLock write locked.

	In the second case the caller is responsible for dealing with the ref count
	0 -> 1 transition. That is, 1. this function must not be invoked when the
	node is busy in the first place, and 2. vnode_used() must be called for the
	node.

	\param vnode the vnode.
*/
static void
inc_vnode_ref_count(struct vnode* vnode)
{
	atomic_add(&vnode->ref_count, 1);
	TRACE(("inc_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
		vnode->ref_count));
}


static bool
is_special_node_type(int type)
{
	// at the moment only FIFOs are supported
	return S_ISFIFO(type);
}


static status_t
create_special_sub_node(struct vnode* vnode, uint32 flags)
{
	if (S_ISFIFO(vnode->Type()))
		return create_fifo_vnode(vnode->mount->volume, vnode);

	return B_BAD_VALUE;
}


/*!	\brief Retrieves a vnode for a given mount ID, node ID pair.

	If the node is not yet in memory, it will be loaded.

	The caller must not hold the sVnodeLock or the sMountMutex.

	\param mountID the mount ID.
	\param vnodeID the node ID.
	\param _vnode Pointer to a vnode* variable into which the pointer to the
		   retrieved vnode structure shall be written.
	\param canWait \c true, if busy vnodes may be waited for (and the lookup
		   retried), \c false to fail with \c B_BUSY right away.
	\param reenter \c true, if this function is called (indirectly) from within
		   a file system.
	\return \c B_OK, if everything went fine, an error code otherwise.
*/
static status_t
get_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode, bool canWait,
	int reenter)
{
	FUNCTION(("get_vnode: mountid %" B_PRId32 " vnid 0x%" B_PRIx64 " %p\n",
		mountID, vnodeID, _vnode));

	rw_lock_read_lock(&sVnodeLock);

	int32 tries = BUSY_VNODE_RETRIES;
restart:
	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
	AutoLocker<Vnode> nodeLocker(vnode);

	if (vnode && vnode->IsBusy()) {
		nodeLocker.Unlock();
		rw_lock_read_unlock(&sVnodeLock);
		if (!canWait) {
			dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO " is busy!\n",
				mountID, vnodeID);
			return B_BUSY;
		}
		if (!retry_busy_vnode(tries, mountID, vnodeID))
			return B_BUSY;

		rw_lock_read_lock(&sVnodeLock);
		goto restart;
	}

	TRACE(("get_vnode: tried to lookup vnode, got %p\n", vnode));

	status_t status;

	if (vnode) {
		if (vnode->ref_count == 0) {
			// this vnode has been unused before
			vnode_used(vnode);
		}
		inc_vnode_ref_count(vnode);

		nodeLocker.Unlock();
		rw_lock_read_unlock(&sVnodeLock);
	} else {
		// we need to create a new vnode and read it in
		rw_lock_read_unlock(&sVnodeLock);
			// unlock -- create_new_vnode_and_lock() write-locks on success
		bool nodeCreated;
		status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
			nodeCreated);
		if (status != B_OK)
			return status;

		if (!nodeCreated) {
			rw_lock_read_lock(&sVnodeLock);
			rw_lock_write_unlock(&sVnodeLock);
			goto restart;
		}

		rw_lock_write_unlock(&sVnodeLock);

		int type;
		uint32 flags;
		status = FS_MOUNT_CALL(vnode->mount, get_vnode, vnodeID, vnode, &type,
			&flags, reenter);
		if (status == B_OK && vnode->private_node == NULL)
			status = B_BAD_VALUE;

		bool gotNode = status == B_OK;
		bool publishSpecialSubNode = false;
		if (gotNode) {
			vnode->SetType(type);
			publishSpecialSubNode = is_special_node_type(type)
				&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
		}

		if (gotNode && publishSpecialSubNode)
			status = create_special_sub_node(vnode, flags);

		if (status != B_OK) {
			if (gotNode)
				FS_CALL(vnode, put_vnode, reenter);

			rw_lock_write_lock(&sVnodeLock);
			sVnodeTable->Remove(vnode);
			remove_vnode_from_mount_list(vnode, vnode->mount);
			rw_lock_write_unlock(&sVnodeLock);

			free(vnode);
			return status;
		}

		rw_lock_read_lock(&sVnodeLock);
		vnode->Lock();

		vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
		vnode->SetBusy(false);

		vnode->Unlock();
		rw_lock_read_unlock(&sVnodeLock);
	}

	TRACE(("get_vnode: returning %p\n", vnode));

	*_vnode = vnode;
	return B_OK;
}


/*!	\brief Decrements the reference counter of the given vnode and deletes it,
	if the counter dropped to 0.

	The caller must, of course, own a reference to the vnode to call this
	function.
	The caller must not hold the sVnodeLock or the sMountMutex.

	\param vnode the vnode.
*/
static inline void
put_vnode(struct vnode* vnode)
{
	dec_vnode_ref_count(vnode, false, false);
}


static void
free_unused_vnodes(int32 level)
{
	unused_vnodes_check_started();

	if (level == B_NO_LOW_RESOURCE) {
		unused_vnodes_check_done();
		return;
	}

	flush_hot_vnodes();

	// determine how many nodes to free
	uint32 count = 1;
	{
		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);

		switch (level) {
			case B_LOW_RESOURCE_NOTE:
				count = sUnusedVnodes / 100;
				break;
			case B_LOW_RESOURCE_WARNING:
				count = sUnusedVnodes / 10;
				break;
			case B_LOW_RESOURCE_CRITICAL:
				count = sUnusedVnodes;
				break;
		}

		if (count > sUnusedVnodes)
			count = sUnusedVnodes;
	}

	// Write back the modified pages of some unused vnodes and free them.

	for (uint32 i = 0; i < count; i++) {
		ReadLocker vnodesReadLocker(sVnodeLock);

		// get the first node
		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
		struct vnode* vnode = (struct vnode*)list_get_first_item(
			&sUnusedVnodeList);
		unusedVnodesLocker.Unlock();

		if (vnode == NULL)
			break;

		// lock the node
		AutoLocker<Vnode> nodeLocker(vnode);

		// Check whether the node is still unused -- since we only append to the
		// tail of the unused queue, the vnode should still be at its head.
		// Alternatively we could check its ref count for 0 and its busy flag,
		// but if the node is no longer at the head of the queue, it means it
		// has been touched in the meantime, i.e. it is no longer the least
		// recently used unused vnode and we'd rather not free it.
		unusedVnodesLocker.Lock();
		if (vnode != list_get_first_item(&sUnusedVnodeList))
			continue;
		unusedVnodesLocker.Unlock();

		ASSERT(!vnode->IsBusy());

		// grab a reference
		inc_vnode_ref_count(vnode);
		vnode_used(vnode);

		// write back changes and free the node
		nodeLocker.Unlock();
		vnodesReadLocker.Unlock();

		if (vnode->cache != NULL)
			vnode->cache->WriteModified();

		dec_vnode_ref_count(vnode, true, false);
			// this should free the vnode when it's still unused
	}

	unused_vnodes_check_done();
}


/*!	Gets the vnode the given vnode is covering.

	The caller must have \c sVnodeLock read-locked at least.

	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it.

	\param vnode The vnode whose covered node shall be returned.
	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
		vnode.
*/
static inline Vnode*
get_covered_vnode_locked(Vnode* vnode)
{
	if (Vnode* coveredNode = vnode->covers) {
		while (coveredNode->covers != NULL)
			coveredNode = coveredNode->covers;

		inc_vnode_ref_count(coveredNode);
		return coveredNode;
	}

	return NULL;
}


/*!	Gets the vnode the given vnode is covering.

	The caller must not hold \c sVnodeLock. Note that this implies a race
	condition, since the situation can change at any time.

	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it.

	\param vnode The vnode whose covered node shall be returned.
	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
		vnode.
*/
static inline Vnode*
get_covered_vnode(Vnode* vnode)
{
	if (!vnode->IsCovering())
		return NULL;

	ReadLocker vnodeReadLocker(sVnodeLock);
	return get_covered_vnode_locked(vnode);
}


/*!	Gets the vnode the given vnode is covered by.

	The caller must have \c sVnodeLock read-locked at least.

	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it.

	\param vnode The vnode whose covering node shall be returned.
	\return The covering vnode, or \c NULL if the given vnode isn't covered by
		any vnode.
*/
static Vnode*
get_covering_vnode_locked(Vnode* vnode)
{
	if (Vnode* coveringNode = vnode->covered_by) {
		while (coveringNode->covered_by != NULL)
			coveringNode = coveringNode->covered_by;

		inc_vnode_ref_count(coveringNode);
		return coveringNode;
	}

	return NULL;
}


/*!	Gets the vnode the given vnode is covered by.

	The caller must not hold \c sVnodeLock. Note that this implies a race
	condition, since the situation can change at any time.

	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it.

	\param vnode The vnode whose covering node shall be returned.
	\return The covering vnode, or \c NULL if the given vnode isn't covered by
		any vnode.
*/
static inline Vnode*
get_covering_vnode(Vnode* vnode)
{
	if (!vnode->IsCovered())
		return NULL;

	ReadLocker vnodeReadLocker(sVnodeLock);
	return get_covering_vnode_locked(vnode);
}


static void
free_unused_vnodes()
{
	free_unused_vnodes(
		low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
			| B_KERNEL_RESOURCE_ADDRESS_SPACE));
}


static void
vnode_low_resource_handler(void* /*data*/, uint32 resources, int32 level)
{
	TRACE(("vnode_low_resource_handler(level = %" B_PRId32 ")\n", level));

	free_unused_vnodes(level);
}


static inline void
put_advisory_locking(struct advisory_locking* locking)
{
	release_sem(locking->lock);
}


/*!	Returns the advisory_locking object of the \a vnode in case it
	has one, and locks it.
	You have to call put_advisory_locking() when you're done with
	it.
	Note, you must not have the vnode mutex locked when calling
	this function.
*/
static struct advisory_locking*
get_advisory_locking(struct vnode* vnode)
{
	rw_lock_read_lock(&sVnodeLock);
	vnode->Lock();

	struct advisory_locking* locking = vnode->advisory_locking;
	sem_id lock = locking != NULL ? locking->lock : B_ERROR;

	vnode->Unlock();
	rw_lock_read_unlock(&sVnodeLock);

	if (lock >= 0)
		lock = acquire_sem(lock);
	if (lock < 0) {
		// This means the locking has been deleted in the meantime
		// or had never existed in the first place - otherwise, we
		// would get the lock at some point.
		return NULL;
	}

	return locking;
}


/*!	Creates a locked advisory_locking object, and attaches it to the
	given \a vnode.
	Returns B_OK in case of success -- even if the vnode got such an
	object from someone else in the meantime, you'll still get that
	one locked then.
*/
static status_t
create_advisory_locking(struct vnode* vnode)
{
	if (vnode == NULL)
		return B_FILE_ERROR;

	ObjectDeleter<advisory_locking> lockingDeleter;
	struct advisory_locking* locking = NULL;

	while (get_advisory_locking(vnode) == NULL) {
		// no locking object set on the vnode yet, create one
		if (locking == NULL) {
			locking = new(std::nothrow) advisory_locking;
			if (locking == NULL)
				return B_NO_MEMORY;
			lockingDeleter.SetTo(locking);

			locking->wait_sem = create_sem(0, "advisory lock");
			if (locking->wait_sem < 0)
				return locking->wait_sem;

			locking->lock = create_sem(0, "advisory locking");
			if (locking->lock < 0)
				return locking->lock;
		}

		// set our newly created locking object
		ReadLocker _(sVnodeLock);
		AutoLocker<Vnode> nodeLocker(vnode);
		if (vnode->advisory_locking == NULL) {
			vnode->advisory_locking = locking;
			lockingDeleter.Detach();
			return B_OK;
		}
	}

	// The vnode already had a locking object. That's just as well.

	return B_OK;
}


/*! Returns \c true when either \a flock is \c NULL or \a flock intersects
	with the advisory_lock \a lock.
*/
static bool
advisory_lock_intersects(struct advisory_lock* lock, struct flock* flock)
{
	if (flock == NULL)
		return true;

	return lock->start <= flock->l_start - 1 + flock->l_len
		&& lock->end >= flock->l_start;
}
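
// Worked example: an flock with l_start 15 and l_len 10 covers bytes
// [15, 24]. A lock spanning [10, 19] intersects it (10 <= 24 and 19 >= 15),
// while a lock spanning [25, 30] does not (25 > 24).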


/*!	Tests whether acquiring a lock would block.
*/
static status_t
test_advisory_lock(struct vnode* vnode, struct flock* flock)
{
	flock->l_type = F_UNLCK;

	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking == NULL)
		return B_OK;

	team_id team = team_get_current_team_id();

	LockList::Iterator iterator = locking->locks.GetIterator();
	while (iterator.HasNext()) {
		struct advisory_lock* lock = iterator.Next();

		if (lock->team != team && advisory_lock_intersects(lock, flock)) {
			// locks do overlap
			if (flock->l_type != F_RDLCK || !lock->shared) {
				// collision
				flock->l_type = lock->shared ? F_RDLCK : F_WRLCK;
				flock->l_whence = SEEK_SET;
				flock->l_start = lock->start;
				flock->l_len = lock->end - lock->start + 1;
				flock->l_pid = lock->team;
				break;
			}
		}
	}

	put_advisory_locking(locking);
	return B_OK;
}


/*!	Removes the specified lock, or all locks of the calling team
	if \a flock is NULL.
*/
static status_t
release_advisory_lock(struct vnode* vnode, struct io_context* context,
	struct file_descriptor* descriptor, struct flock* flock)
{
	FUNCTION(("release_advisory_lock(vnode = %p, flock = %p)\n", vnode, flock));

	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking == NULL)
		return B_OK;

	// find matching lock entries

	LockList::Iterator iterator = locking->locks.GetIterator();
	while (iterator.HasNext()) {
		struct advisory_lock* lock = iterator.Next();
		bool removeLock = false;

		if (descriptor != NULL && lock->bound_to == descriptor) {
			// Remove flock() locks
			removeLock = true;
		} else if (lock->bound_to == context
				&& advisory_lock_intersects(lock, flock)) {
			// Remove POSIX locks
			bool endsBeyond = false;
			bool startsBefore = false;
			if (flock != NULL) {
				startsBefore = lock->start < flock->l_start;
				endsBeyond = lock->end > flock->l_start - 1 + flock->l_len;
			}

			if (!startsBefore && !endsBeyond) {
				// lock is completely contained in flock
				removeLock = true;
			} else if (startsBefore && !endsBeyond) {
				// cut the end of the lock
				lock->end = flock->l_start - 1;
			} else if (!startsBefore && endsBeyond) {
				// cut the start of the lock
				lock->start = flock->l_start + flock->l_len;
			} else {
				// divide the lock into two locks
				struct advisory_lock* secondLock = (struct advisory_lock*)
					malloc(sizeof(struct advisory_lock));
					// allocated with malloc() to match the free() used when
					// removing locks
				if (secondLock == NULL) {
					// TODO: we should probably revert the locks we already
					// changed... (ie. allocate upfront)
					put_advisory_locking(locking);
					return B_NO_MEMORY;
				}

				secondLock->bound_to = context;
				secondLock->team = lock->team;
				secondLock->session = lock->session;
				// values must already be normalized when getting here
				secondLock->start = flock->l_start + flock->l_len;
				secondLock->end = lock->end;
					// inherit the original end before the first lock is
					// truncated below
				secondLock->shared = lock->shared;

				lock->end = flock->l_start - 1;

				locking->locks.Add(secondLock);
			}
		}

		if (removeLock) {
			// this lock is no longer used
			iterator.Remove();
			free(lock);
		}
	}

	bool removeLocking = locking->locks.IsEmpty();
	release_sem_etc(locking->wait_sem, 1, B_RELEASE_ALL);

	put_advisory_locking(locking);

	if (removeLocking) {
		// We can remove the whole advisory locking structure; it's no
		// longer used
		locking = get_advisory_locking(vnode);
		if (locking != NULL) {
			ReadLocker locker(sVnodeLock);
			AutoLocker<Vnode> nodeLocker(vnode);

			// the locking could have been changed in the meantime
			if (locking->locks.IsEmpty()) {
				vnode->advisory_locking = NULL;
				nodeLocker.Unlock();
				locker.Unlock();

				// we've detached the locking from the vnode, so we can
				// safely delete it
				delete locking;
			} else {
				// the locking is in use again
				nodeLocker.Unlock();
				locker.Unlock();
				release_sem_etc(locking->lock, 1, B_DO_NOT_RESCHEDULE);
			}
		}
	}

	return B_OK;
}
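
// Worked example of the splitting case: releasing bytes [40, 59] from a
// POSIX lock covering [0, 99] leaves two locks, [0, 39] and [60, 99]. The
// second lock therefore has to inherit the original end offset before the
// first lock's end is cut down to l_start - 1.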


/*!	Acquires an advisory lock for the \a vnode. If \a wait is \c true, it
	will wait for the lock to become available, if there are any collisions
	(it will return B_PERMISSION_DENIED in this case if \a wait is \c false).

	If \a descriptor is NULL, POSIX semantics are used for this lock. Otherwise,
	BSD flock() semantics are used, that is, all children can unlock the file
	in question (we even allow parents to remove the lock, though, but that
	seems to be in line with what the BSDs are doing).
*/
static status_t
acquire_advisory_lock(struct vnode* vnode, io_context* context,
	struct file_descriptor* descriptor, struct flock* flock, bool wait)
{
	FUNCTION(("acquire_advisory_lock(vnode = %p, flock = %p, wait = %s)\n",
		vnode, flock, wait ? "yes" : "no"));

	bool shared = flock->l_type == F_RDLCK;
	void* boundTo = descriptor != NULL ? (void*)descriptor : (void*)context;
	status_t status = B_OK;

	// TODO: do deadlock detection!

	struct advisory_locking* locking;

	while (true) {
		// if this vnode has an advisory_locking structure attached,
		// lock that one and search for any colliding file lock
		status = create_advisory_locking(vnode);
		if (status != B_OK)
			return status;

		locking = vnode->advisory_locking;
		team_id team = team_get_current_team_id();
		sem_id waitForLock = -1;

		// test for collisions
		LockList::Iterator iterator = locking->locks.GetIterator();
		while (iterator.HasNext()) {
			struct advisory_lock* lock = iterator.Next();

			// TODO: locks from the same team might be joinable!
			if ((lock->team != team || lock->bound_to != boundTo)
					&& advisory_lock_intersects(lock, flock)) {
				// locks do overlap
				if (!shared || !lock->shared) {
					// we need to wait
					waitForLock = locking->wait_sem;
					break;
				}
			}
		}

		if (waitForLock < 0)
			break;

		// We need to wait. Do that or fail now, if we've been asked not to.

		if (!wait) {
			put_advisory_locking(locking);
			return descriptor != NULL ? B_WOULD_BLOCK : B_PERMISSION_DENIED;
		}

		status = switch_sem_etc(locking->lock, waitForLock, 1,
			B_CAN_INTERRUPT, 0);
		if (status != B_OK && status != B_BAD_SEM_ID)
			return status;

		// We have been notified, but we need to re-lock the locking object. So
		// go another round...
	}

	// install new lock

	struct advisory_lock* lock = (struct advisory_lock*)malloc(
		sizeof(struct advisory_lock));
	if (lock == NULL) {
		put_advisory_locking(locking);
		return B_NO_MEMORY;
	}

	lock->bound_to = boundTo;
	lock->team = team_get_current_team_id();
	lock->session = thread_get_current_thread()->team->session_id;
	// values must already be normalized when getting here
	lock->start = flock->l_start;
	lock->end = flock->l_start - 1 + flock->l_len;
	lock->shared = shared;

	locking->locks.Add(lock);
	put_advisory_locking(locking);

	return status;
}
1831 
1832 
1833 /*!	Normalizes the \a flock structure to make it easier to compare the
1834 	structure with others. The l_start and l_len fields are set to absolute
1835 	values according to the l_whence field.
1836 */
1837 static status_t
1838 normalize_flock(struct file_descriptor* descriptor, struct flock* flock)
1839 {
1840 	switch (flock->l_whence) {
1841 		case SEEK_SET:
1842 			break;
1843 		case SEEK_CUR:
1844 			flock->l_start += descriptor->pos;
1845 			break;
1846 		case SEEK_END:
1847 		{
1848 			struct vnode* vnode = descriptor->u.vnode;
1849 			struct stat stat;
1850 			status_t status;
1851 
1852 			if (!HAS_FS_CALL(vnode, read_stat))
1853 				return B_UNSUPPORTED;
1854 
1855 			status = FS_CALL(vnode, read_stat, &stat);
1856 			if (status != B_OK)
1857 				return status;
1858 
1859 			flock->l_start += stat.st_size;
1860 			break;
1861 		}
1862 		default:
1863 			return B_BAD_VALUE;
1864 	}
1865 
1866 	if (flock->l_start < 0)
1867 		flock->l_start = 0;
1868 	if (flock->l_len == 0)
1869 		flock->l_len = OFF_MAX;
1870 
1871 	// don't let the offset and length overflow
1872 	if (flock->l_start > 0 && OFF_MAX - flock->l_start < flock->l_len)
1873 		flock->l_len = OFF_MAX - flock->l_start;
1874 
1875 	if (flock->l_len < 0) {
1876 		// a negative length reverses the region
1877 		flock->l_start += flock->l_len;
1878 		flock->l_len = -flock->l_len;
1879 	}
1880 
1881 	return B_OK;
1882 }
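
/*	Worked example (comment only): for a descriptor positioned at offset
	200, a request with

		l_whence = SEEK_CUR, l_start = -50, l_len = -100

	is normalized to

		l_start = -50 + 200 = 150	// SEEK_CUR adds the descriptor position
		l_start = 150 - 100 = 50, l_len = 100
			// a negative length reverses the region

	i.e. the lock covers the range [50, 149]. An l_len of 0 would instead
	be replaced by OFF_MAX, meaning "from l_start to the end of the file".
*/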
1883 
1884 
1885 static void
1886 replace_vnode_if_disconnected(struct fs_mount* mount,
1887 	struct vnode* vnodeToDisconnect, struct vnode*& vnode,
1888 	struct vnode* fallBack, bool lockRootLock)
1889 {
1890 	struct vnode* givenVnode = vnode;
1891 	bool vnodeReplaced = false;
1892 
1893 	ReadLocker vnodeReadLocker(sVnodeLock);
1894 
1895 	if (lockRootLock)
1896 		mutex_lock(&sIOContextRootLock);
1897 
1898 	while (vnode != NULL && vnode->mount == mount
1899 		&& (vnodeToDisconnect == NULL || vnodeToDisconnect == vnode)) {
1900 		if (vnode->covers != NULL) {
1901 			// redirect the vnode to the covered vnode
1902 			vnode = vnode->covers;
1903 		} else
1904 			vnode = fallBack;
1905 
1906 		vnodeReplaced = true;
1907 	}
1908 
1909 	// If we've replaced the node, grab a reference for the new one.
1910 	if (vnodeReplaced && vnode != NULL)
1911 		inc_vnode_ref_count(vnode);
1912 
1913 	if (lockRootLock)
1914 		mutex_unlock(&sIOContextRootLock);
1915 
1916 	vnodeReadLocker.Unlock();
1917 
1918 	if (vnodeReplaced)
1919 		put_vnode(givenVnode);
1920 }
1921 
1922 
1923 /*!	Disconnects all file descriptors that are associated with the
1924 	\a vnodeToDisconnect, or if this is NULL, all vnodes of the specified
1925 	\a mount object.
1926 
1927 	Note, after you've called this function, there might still be ongoing
1928 	accesses - they won't be interrupted if they were already in progress.
1929 	However, any subsequent access will fail.
1930 
1931 	This is not a cheap function and should be used with care and rarely.
1932 	TODO: there is currently no means to stop a blocking read/write!
1933 */
1934 static void
1935 disconnect_mount_or_vnode_fds(struct fs_mount* mount,
1936 	struct vnode* vnodeToDisconnect)
1937 {
1938 	// iterate over all teams and peek into their file descriptors
1939 	TeamListIterator teamIterator;
1940 	while (Team* team = teamIterator.Next()) {
1941 		BReference<Team> teamReference(team, true);
1942 		TeamLocker teamLocker(team);
1943 
1944 		// lock the I/O context
1945 		io_context* context = team->io_context;
1946 		if (context == NULL)
1947 			continue;
1948 		MutexLocker contextLocker(context->io_mutex);
1949 
1950 		teamLocker.Unlock();
1951 
1952 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->root,
1953 			sRoot, true);
1954 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->cwd,
1955 			sRoot, false);
1956 
1957 		for (uint32 i = 0; i < context->table_size; i++) {
1958 			if (struct file_descriptor* descriptor = context->fds[i]) {
1959 				inc_fd_ref_count(descriptor);
1960 
1961 				// if this descriptor points at this mount, we
1962 				// need to disconnect it to be able to unmount
1963 				struct vnode* vnode = fd_vnode(descriptor);
1964 				if (vnodeToDisconnect != NULL) {
1965 					if (vnode == vnodeToDisconnect)
1966 						disconnect_fd(descriptor);
1967 				} else if ((vnode != NULL && vnode->mount == mount)
1968 					|| (vnode == NULL && descriptor->u.mount == mount))
1969 					disconnect_fd(descriptor);
1970 
1971 				put_fd(descriptor);
1972 			}
1973 		}
1974 	}
1975 }
1976 
1977 
1978 /*!	\brief Gets the root node of the current IO context.
1979 	If \a kernel is \c true, the kernel IO context will be used.
1980 	The caller obtains a reference to the returned node.
1981 */
1982 struct vnode*
1983 get_root_vnode(bool kernel)
1984 {
1985 	if (!kernel) {
1986 		// Get current working directory from io context
1987 		struct io_context* context = get_current_io_context(kernel);
1988 
1989 		mutex_lock(&sIOContextRootLock);
1990 
1991 		struct vnode* root = context->root;
1992 		if (root != NULL)
1993 			inc_vnode_ref_count(root);
1994 
1995 		mutex_unlock(&sIOContextRootLock);
1996 
1997 		if (root != NULL)
1998 			return root;
1999 
2000 		// That should never happen.
2001 		dprintf("get_root_vnode(): IO context for team %" B_PRId32 " doesn't "
2002 			"have a root\n", team_get_current_team_id());
2003 	}
2004 
2005 	inc_vnode_ref_count(sRoot);
2006 	return sRoot;
2007 }
2008 
2009 
2010 /*!	\brief Gets the directory path and leaf name for a given path.
2011 
2012 	The supplied \a path is transformed to refer to the directory part of
2013 	the entry identified by the original path, and into the buffer \a filename
2014 	the leaf name of the original entry is written.
2015 	Neither the returned path nor the leaf name can be expected to be
2016 	canonical.
2017 
2018 	\param path The path to be analyzed. Must be able to store at least one
2019 		   additional character.
2020 	\param filename The buffer into which the leaf name will be written.
2021 		   Must be of size B_FILE_NAME_LENGTH at least.
2022 	\return \c B_OK, if everything went fine, \c B_NAME_TOO_LONG, if the leaf
2023 		   name is longer than \c B_FILE_NAME_LENGTH, or \c B_ENTRY_NOT_FOUND,
2024 		   if the given path name is empty.
2025 */
2026 static status_t
2027 get_dir_path_and_leaf(char* path, char* filename)
2028 {
2029 	if (*path == '\0')
2030 		return B_ENTRY_NOT_FOUND;
2031 
2032 	char* last = strrchr(path, '/');
2033 		// '/' is not allowed in file names!
2034 
2035 	FUNCTION(("get_dir_path_and_leaf(path = %s)\n", path));
2036 
2037 	if (last == NULL) {
2038 		// this path is single segment with no '/' in it
2039 		// ex. "foo"
2040 		if (strlcpy(filename, path, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2041 			return B_NAME_TOO_LONG;
2042 
2043 		strcpy(path, ".");
2044 	} else {
2045 		last++;
2046 		if (last[0] == '\0') {
2047 			// special case: the path ends in one or more '/' - remove them
2048 			while (*--last == '/' && last != path);
2049 			last[1] = '\0';
2050 
2051 			if (last == path && last[0] == '/') {
2052 				// This path points to the root of the file system
2053 				strcpy(filename, ".");
2054 				return B_OK;
2055 			}
2056 			for (; last != path && *(last - 1) != '/'; last--);
2057 				// rewind to the start of the leaf before the '/'
2058 		}
2059 
2060 		// normal leaf: replace the leaf portion of the path with a '.'
2061 		if (strlcpy(filename, last, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2062 			return B_NAME_TOO_LONG;
2063 
2064 		last[0] = '.';
2065 		last[1] = '\0';
2066 	}
2067 	return B_OK;
2068 }
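
/*	Examples (comment only) of how the buffer is rewritten in place:

		path "a/b/c"  ->  path "a/b/.", filename "c"
		path "a/b/"   ->  path "a/.",   filename "b"
		path "foo"    ->  path ".",     filename "foo"
		path "/"      ->  path "/",     filename "."
*/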
2069 
2070 
2071 static status_t
2072 entry_ref_to_vnode(dev_t mountID, ino_t directoryID, const char* name,
2073 	bool traverse, bool kernel, struct vnode** _vnode)
2074 {
2075 	char clonedName[B_FILE_NAME_LENGTH + 1];
2076 	if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2077 		return B_NAME_TOO_LONG;
2078 
2079 	// get the directory vnode and let vnode_path_to_vnode() do the rest
2080 	struct vnode* directory;
2081 
2082 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
2083 	if (status < 0)
2084 		return status;
2085 
2086 	return vnode_path_to_vnode(directory, clonedName, traverse, 0, kernel,
2087 		_vnode, NULL);
2088 }
2089 
2090 
2091 /*!	Looks up the entry with name \a name in the directory represented by \a dir
2092 	and returns the respective vnode.
2093 	On success a reference to the vnode is acquired for the caller.
2094 */
2095 static status_t
2096 lookup_dir_entry(struct vnode* dir, const char* name, struct vnode** _vnode)
2097 {
2098 	ino_t id;
2099 	bool missing;
2100 
2101 	if (dir->mount->entry_cache.Lookup(dir->id, name, id, missing)) {
2102 		return missing ? B_ENTRY_NOT_FOUND
2103 			: get_vnode(dir->device, id, _vnode, true, false);
2104 	}
2105 
2106 	status_t status = FS_CALL(dir, lookup, name, &id);
2107 	if (status != B_OK)
2108 		return status;
2109 
2110 	// The lookup() hook calls get_vnode() or publish_vnode(), so we already
2111 	// have a reference and just need to look the node up.
2112 	rw_lock_read_lock(&sVnodeLock);
2113 	*_vnode = lookup_vnode(dir->device, id);
2114 	rw_lock_read_unlock(&sVnodeLock);
2115 
2116 	if (*_vnode == NULL) {
2117 		panic("lookup_dir_entry(): could not lookup vnode (mountid 0x%" B_PRIx32
2118 			" vnid 0x%" B_PRIx64 ")\n", dir->device, id);
2119 		return B_ENTRY_NOT_FOUND;
2120 	}
2121 
2122 //	ktrace_printf("lookup_dir_entry(): dir: %p (%ld, %lld), name: \"%s\" -> "
2123 //		"%p (%ld, %lld)", dir, dir->mount->id, dir->id, name, *_vnode,
2124 //		(*_vnode)->mount->id, (*_vnode)->id);
2125 
2126 	return B_OK;
2127 }
2128 
2129 
2130 /*!	Returns the vnode for the relative path starting at the specified \a vnode.
2131 	\a path must not be NULL.
2132 	If it returns successfully, \a path contains the name of the last path
2133 	component. This function clobbers the buffer pointed to by \a path only
2134 	if it contains more than one component.
2135 	Note, this reduces the ref_count of the starting \a vnode, whether it
2136 	is successful or not!
2137 */
2138 static status_t
2139 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2140 	int count, struct io_context* ioContext, struct vnode** _vnode,
2141 	ino_t* _parentID)
2142 {
2143 	status_t status = B_OK;
2144 	ino_t lastParentID = vnode->id;
2145 
2146 	FUNCTION(("vnode_path_to_vnode(vnode = %p, path = %s)\n", vnode, path));
2147 
2148 	if (path == NULL) {
2149 		put_vnode(vnode);
2150 		return B_BAD_VALUE;
2151 	}
2152 
2153 	if (*path == '\0') {
2154 		put_vnode(vnode);
2155 		return B_ENTRY_NOT_FOUND;
2156 	}
2157 
2158 	while (true) {
2159 		struct vnode* nextVnode;
2160 		char* nextPath;
2161 
2162 		TRACE(("vnode_path_to_vnode: top of loop. p = %p, p = '%s'\n", path,
2163 			path));
2164 
2165 		// done?
2166 		if (path[0] == '\0')
2167 			break;
2168 
2169 		// walk to find the next path component ("path" will point to a single
2170 		// path component), and filter out multiple slashes
2171 		for (nextPath = path + 1; *nextPath != '\0' && *nextPath != '/';
2172 				nextPath++);
2173 
2174 		if (*nextPath == '/') {
2175 			*nextPath = '\0';
2176 			do
2177 				nextPath++;
2178 			while (*nextPath == '/');
2179 		}
2180 
2181 		// If the '..' is at a covering vnode, move to the covered
2182 		// vnode so we pass the '..' path to the underlying file system.
2183 		// Also prevent breaking out of the root of the IO context.
2184 		if (strcmp("..", path) == 0) {
2185 			if (vnode == ioContext->root) {
2186 				// Attempted prison break! Keep it contained.
2187 				path = nextPath;
2188 				continue;
2189 			}
2190 
2191 			if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2192 				nextVnode = coveredVnode;
2193 				put_vnode(vnode);
2194 				vnode = nextVnode;
2195 			}
2196 		}
2197 
2198 		// check if vnode is really a directory
2199 		if (status == B_OK && !S_ISDIR(vnode->Type()))
2200 			status = B_NOT_A_DIRECTORY;
2201 
2202 		// Check if we have the right to search the current directory vnode.
2203 		// If a file system doesn't have the access() function, we assume that
2204 		// searching a directory is always allowed
2205 		if (status == B_OK && HAS_FS_CALL(vnode, access))
2206 			status = FS_CALL(vnode, access, X_OK);
2207 
2208 		// Tell the filesystem to get the vnode of this path component (if we
2209 		// got the permission from the call above)
2210 		if (status == B_OK)
2211 			status = lookup_dir_entry(vnode, path, &nextVnode);
2212 
2213 		if (status != B_OK) {
2214 			put_vnode(vnode);
2215 			return status;
2216 		}
2217 
2218 		// If the new node is a symbolic link, resolve it (if we've been told
2219 		// to do it)
2220 		if (S_ISLNK(nextVnode->Type())
2221 			&& (traverseLeafLink || nextPath[0] != '\0')) {
2222 			size_t bufferSize;
2223 			char* buffer;
2224 
2225 			TRACE(("traverse link\n"));
2226 
2227 			// it's not exactly nice style using goto in this way, but hey,
2228 			// it works :-/
2229 			if (count + 1 > B_MAX_SYMLINKS) {
2230 				status = B_LINK_LIMIT;
2231 				goto resolve_link_error;
2232 			}
2233 
2234 			buffer = (char*)malloc(bufferSize = B_PATH_NAME_LENGTH);
2235 			if (buffer == NULL) {
2236 				status = B_NO_MEMORY;
2237 				goto resolve_link_error;
2238 			}
2239 
2240 			if (HAS_FS_CALL(nextVnode, read_symlink)) {
2241 				bufferSize--;
2242 				status = FS_CALL(nextVnode, read_symlink, buffer, &bufferSize);
2243 				// null-terminate
2244 				if (status >= 0)
2245 					buffer[bufferSize] = '\0';
2246 			} else
2247 				status = B_BAD_VALUE;
2248 
2249 			if (status != B_OK) {
2250 				free(buffer);
2251 
2252 		resolve_link_error:
2253 				put_vnode(vnode);
2254 				put_vnode(nextVnode);
2255 
2256 				return status;
2257 			}
2258 			put_vnode(nextVnode);
2259 
2260 			// Check if we start from the root directory or the current
2261 			// directory ("vnode" still points to that one).
2262 			// Cut off all leading slashes if it's the root directory
2263 			path = buffer;
2264 			bool absoluteSymlink = false;
2265 			if (path[0] == '/') {
2266 				// we don't need the old directory anymore
2267 				put_vnode(vnode);
2268 
2269 				while (*++path == '/')
2270 					;
2271 
2272 				mutex_lock(&sIOContextRootLock);
2273 				vnode = ioContext->root;
2274 				inc_vnode_ref_count(vnode);
2275 				mutex_unlock(&sIOContextRootLock);
2276 
2277 				absoluteSymlink = true;
2278 			}
2279 
2280 			inc_vnode_ref_count(vnode);
2281 				// balance the next recursion - we will decrement the
2282 				// ref_count of the vnode, whether we succeed or not
2283 
2284 			if (absoluteSymlink && *path == '\0') {
2285 				// symlink was just "/"
2286 				nextVnode = vnode;
2287 			} else {
2288 				status = vnode_path_to_vnode(vnode, path, true, count + 1,
2289 					ioContext, &nextVnode, &lastParentID);
2290 			}
2291 
2292 			free(buffer);
2293 
2294 			if (status != B_OK) {
2295 				put_vnode(vnode);
2296 				return status;
2297 			}
2298 		} else
2299 			lastParentID = vnode->id;
2300 
2301 		// decrease the ref count on the old dir we just looked up into
2302 		put_vnode(vnode);
2303 
2304 		path = nextPath;
2305 		vnode = nextVnode;
2306 
2307 		// see if we hit a covered node
2308 		if (Vnode* coveringNode = get_covering_vnode(vnode)) {
2309 			put_vnode(vnode);
2310 			vnode = coveringNode;
2311 		}
2312 	}
2313 
2314 	*_vnode = vnode;
2315 	if (_parentID)
2316 		*_parentID = lastParentID;
2317 
2318 	return B_OK;
2319 }
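
/*	Example trace (comment only) of the ".." containment above, for a team
	whose IO context root was set to a (hypothetical) directory /var:

		vnode_path_to_vnode(root, "../../log/x", ...)
			".."  -> vnode == ioContext->root, component swallowed
			".."  -> vnode == ioContext->root, component swallowed
			"log" -> lookup_dir_entry(root, "log", ...)
			"x"   -> lookup_dir_entry(log, "x", ...)

	resolves to /var/log/x; the context root cannot be escaped. Symlinks
	are resolved by recursing into this same function, which is why chains
	longer than B_MAX_SYMLINKS fail with B_LINK_LIMIT.
*/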
2320 
2321 
2322 static status_t
2323 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2324 	int count, bool kernel, struct vnode** _vnode, ino_t* _parentID)
2325 {
2326 	return vnode_path_to_vnode(vnode, path, traverseLeafLink, count,
2327 		get_current_io_context(kernel), _vnode, _parentID);
2328 }
2329 
2330 
2331 static status_t
2332 path_to_vnode(char* path, bool traverseLink, struct vnode** _vnode,
2333 	ino_t* _parentID, bool kernel)
2334 {
2335 	struct vnode* start = NULL;
2336 
2337 	FUNCTION(("path_to_vnode(path = \"%s\")\n", path));
2338 
2339 	if (!path)
2340 		return B_BAD_VALUE;
2341 
2342 	if (*path == '\0')
2343 		return B_ENTRY_NOT_FOUND;
2344 
2345 	// figure out if we need to start at root or at cwd
2346 	if (*path == '/') {
2347 		if (sRoot == NULL) {
2348 			// we're a bit early, aren't we?
2349 			return B_ERROR;
2350 		}
2351 
2352 		while (*++path == '/')
2353 			;
2354 		start = get_root_vnode(kernel);
2355 
2356 		if (*path == '\0') {
2357 			*_vnode = start;
2358 			return B_OK;
2359 		}
2360 
2361 	} else {
2362 		struct io_context* context = get_current_io_context(kernel);
2363 
2364 		mutex_lock(&context->io_mutex);
2365 		start = context->cwd;
2366 		if (start != NULL)
2367 			inc_vnode_ref_count(start);
2368 		mutex_unlock(&context->io_mutex);
2369 
2370 		if (start == NULL)
2371 			return B_ERROR;
2372 	}
2373 
2374 	return vnode_path_to_vnode(start, path, traverseLink, 0, kernel, _vnode,
2375 		_parentID);
2376 }
2377 
2378 
2379 /*! Returns the vnode of the next to last segment of the path, and returns
2380 	the last path component in \a filename.
2381 	The path buffer must be able to store at least one additional character.
2382 */
2383 static status_t
2384 path_to_dir_vnode(char* path, struct vnode** _vnode, char* filename,
2385 	bool kernel)
2386 {
2387 	status_t status = get_dir_path_and_leaf(path, filename);
2388 	if (status != B_OK)
2389 		return status;
2390 
2391 	return path_to_vnode(path, true, _vnode, NULL, kernel);
2392 }
2393 
2394 
2395 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2396 		   to by a FD + path pair.
2397 
2398 	\a path must be given in either case. \a fd might be omitted, in which
2399 	case \a path is either an absolute path or one relative to the current
2400 	directory. If both are supplied and \a path is relative, it is reckoned off
2401 	of the directory referred to by \a fd. If \a path is absolute \a fd is
2402 	ignored.
2403 
2404 	The caller has the responsibility to call put_vnode() on the returned
2405 	directory vnode.
2406 
2407 	\param fd The FD. May be < 0.
2408 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2409 	       is modified by this function. It must have at least room for a
2410 	       string one character longer than the path it contains.
2411 	\param _vnode A pointer to a variable the directory vnode shall be written
2412 		   into.
2413 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2414 		   the leaf name of the specified entry will be written.
2415 	\param kernel \c true, if invoked from inside the kernel, \c false if
2416 		   invoked from userland.
2417 	\return \c B_OK, if everything went fine, another error code otherwise.
2418 */
2419 static status_t
2420 fd_and_path_to_dir_vnode(int fd, char* path, struct vnode** _vnode,
2421 	char* filename, bool kernel)
2422 {
2423 	if (!path)
2424 		return B_BAD_VALUE;
2425 	if (*path == '\0')
2426 		return B_ENTRY_NOT_FOUND;
2427 	if (fd < 0)
2428 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2429 
2430 	status_t status = get_dir_path_and_leaf(path, filename);
2431 	if (status != B_OK)
2432 		return status;
2433 
2434 	return fd_and_path_to_vnode(fd, path, true, _vnode, NULL, kernel);
2435 }
2436 
2437 
2438 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2439 		   to by a vnode + path pair.
2440 
2441 	\a path must be given in either case. \a vnode might be omitted, in which
2442 	case \a path is either an absolute path or one relative to the current
2443 	directory. If both are supplied and \a path is relative, it is reckoned off
2444 	of the directory referred to by \a vnode. If \a path is absolute \a vnode is
2445 	ignored.
2446 
2447 	The caller has the responsibility to call put_vnode() on the returned
2448 	directory vnode.
2449 
2450 	\param vnode The vnode. May be \c NULL.
2451 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2452 	       is modified by this function. It must have at least room for a
2453 	       string one character longer than the path it contains.
2454 	\param _vnode A pointer to a variable the directory vnode shall be written
2455 		   into.
2456 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2457 		   the leaf name of the specified entry will be written.
2458 	\param kernel \c true, if invoked from inside the kernel, \c false if
2459 		   invoked from userland.
2460 	\return \c B_OK, if everything went fine, another error code otherwise.
2461 */
2462 static status_t
2463 vnode_and_path_to_dir_vnode(struct vnode* vnode, char* path,
2464 	struct vnode** _vnode, char* filename, bool kernel)
2465 {
2466 	if (!path)
2467 		return B_BAD_VALUE;
2468 	if (*path == '\0')
2469 		return B_ENTRY_NOT_FOUND;
2470 	if (vnode == NULL || path[0] == '/')
2471 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2472 
2473 	status_t status = get_dir_path_and_leaf(path, filename);
2474 	if (status != B_OK)
2475 		return status;
2476 
2477 	inc_vnode_ref_count(vnode);
2478 		// vnode_path_to_vnode() always decrements the ref count
2479 
2480 	return vnode_path_to_vnode(vnode, path, true, 0, kernel, _vnode, NULL);
2481 }
2482 
2483 
2484 /*! Returns a vnode's name in the d_name field of a supplied dirent buffer.
2485 */
2486 static status_t
2487 get_vnode_name(struct vnode* vnode, struct vnode* parent, struct dirent* buffer,
2488 	size_t bufferSize, struct io_context* ioContext)
2489 {
2490 	if (bufferSize < sizeof(struct dirent))
2491 		return B_BAD_VALUE;
2492 
2493 	// See if the vnode is covering another vnode and move to the covered
2494 	// vnode so we get the underlying file system
2495 	VNodePutter vnodePutter;
2496 	if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2497 		vnode = coveredVnode;
2498 		vnodePutter.SetTo(vnode);
2499 	}
2500 
2501 	if (HAS_FS_CALL(vnode, get_vnode_name)) {
2502 		// The FS supports getting the name of a vnode.
2503 		if (FS_CALL(vnode, get_vnode_name, buffer->d_name,
2504 			(char*)buffer + bufferSize - buffer->d_name) == B_OK)
2505 			return B_OK;
2506 	}
2507 
2508 	// The FS doesn't support getting the name of a vnode. So we search the
2509 	// parent directory for the vnode, if the caller let us.
2510 
2511 	if (parent == NULL || !HAS_FS_CALL(parent, read_dir))
2512 		return B_UNSUPPORTED;
2513 
2514 	void* cookie;
2515 
2516 	status_t status = FS_CALL(parent, open_dir, &cookie);
2517 	if (status >= B_OK) {
2518 		while (true) {
2519 			uint32 num = 1;
2520 			// We use the FS hook directly instead of dir_read(), since we don't
2521 			// want the entries to be fixed up. We have already resolved vnode to
2522 			// the covered node.
2523 			status = FS_CALL(parent, read_dir, cookie, buffer, bufferSize,
2524 				&num);
2525 			if (status != B_OK)
2526 				break;
2527 			if (num == 0) {
2528 				status = B_ENTRY_NOT_FOUND;
2529 				break;
2530 			}
2531 
2532 			if (vnode->id == buffer->d_ino) {
2533 				// found correct entry!
2534 				break;
2535 			}
2536 		}
2537 
2538 		FS_CALL(parent, close_dir, cookie);
2539 		FS_CALL(parent, free_dir_cookie, cookie);
2540 	}
2541 	return status;
2542 }
2543 
2544 
2545 static status_t
2546 get_vnode_name(struct vnode* vnode, struct vnode* parent, char* name,
2547 	size_t nameSize, bool kernel)
2548 {
2549 	char buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2550 	struct dirent* dirent = (struct dirent*)buffer;
2551 
2552 	status_t status = get_vnode_name(vnode, parent, dirent, sizeof(buffer),
2553 		get_current_io_context(kernel));
2554 	if (status != B_OK)
2555 		return status;
2556 
2557 	if (strlcpy(name, dirent->d_name, nameSize) >= nameSize)
2558 		return B_BUFFER_OVERFLOW;
2559 
2560 	return B_OK;
2561 }
2562 
2563 
2564 /*!	Gets the full path to a given directory vnode.
2565 	It uses the fs_get_vnode_name() call to get the name of a vnode; if a
2566 	file system doesn't support this call, it will fall back to iterating
2567 	through the parent directory to get the name of the child.
2568 
2569 	To protect against circular loops, it supports a maximum tree depth
2570 	of 256 levels.
2571 
2572 	Note that the path may no longer be correct by the time this function
2573 	returns! It doesn't use locking to ensure that the returned path is
2574 	correct, as paths aren't stable anyway: a file's path can change at any time.
2575 
2576 	It might be a good idea, though, for the calling function to check that
2577 	the returned path still exists (it's not done here for efficiency)
2578 */
2579 static status_t
2580 dir_vnode_to_path(struct vnode* vnode, char* buffer, size_t bufferSize,
2581 	bool kernel)
2582 {
2583 	FUNCTION(("dir_vnode_to_path(%p, %p, %lu)\n", vnode, buffer, bufferSize));
2584 
2585 	if (vnode == NULL || buffer == NULL || bufferSize == 0)
2586 		return B_BAD_VALUE;
2587 
2588 	if (!S_ISDIR(vnode->Type()))
2589 		return B_NOT_A_DIRECTORY;
2590 
2591 	char* path = buffer;
2592 	int32 insert = bufferSize;
2593 	int32 maxLevel = 256;
2594 	int32 length;
2595 	status_t status = B_OK;
2596 	struct io_context* ioContext = get_current_io_context(kernel);
2597 
2598 	// we don't use get_vnode() here because this call is more
2599 	// efficient and does all we need from get_vnode()
2600 	inc_vnode_ref_count(vnode);
2601 
2602 	path[--insert] = '\0';
2603 		// the path is filled right to left
2604 
2605 	while (true) {
2606 		// If the node is the context's root, bail out. Otherwise resolve mount
2607 		// points.
2608 		if (vnode == ioContext->root)
2609 			break;
2610 
2611 		if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2612 			put_vnode(vnode);
2613 			vnode = coveredVnode;
2614 		}
2615 
2616 		// lookup the parent vnode
2617 		struct vnode* parentVnode;
2618 		status = lookup_dir_entry(vnode, "..", &parentVnode);
2619 		if (status != B_OK)
2620 			goto out;
2621 
2622 		if (parentVnode == vnode) {
2623 			// The caller apparently got their hands on a node outside of their
2624 			// context's root. Now we've hit the global root.
2625 			put_vnode(parentVnode);
2626 			break;
2627 		}
2628 
2629 		// get the node's name
2630 		char nameBuffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2631 			// also used for fs_read_dir()
2632 		char* name = &((struct dirent*)nameBuffer)->d_name[0];
2633 		status = get_vnode_name(vnode, parentVnode, (struct dirent*)nameBuffer,
2634 			sizeof(nameBuffer), ioContext);
2635 
2636 		// release the current vnode, we only need its parent from now on
2637 		put_vnode(vnode);
2638 		vnode = parentVnode;
2639 
2640 		if (status != B_OK)
2641 			goto out;
2642 
2643 		// TODO: add an explicit check for loops in about 10 levels to do
2644 		// real loop detection
2645 
2646 		// don't go deeper than 'maxLevel' to prevent circular loops
2647 		if (maxLevel-- < 0) {
2648 			status = B_LINK_LIMIT;
2649 			goto out;
2650 		}
2651 
2652 		// add the name in front of the current path
2653 		name[B_FILE_NAME_LENGTH - 1] = '\0';
2654 		length = strlen(name);
2655 		insert -= length;
2656 		if (insert <= 0) {
2657 			status = B_RESULT_NOT_REPRESENTABLE;
2658 			goto out;
2659 		}
2660 		memcpy(path + insert, name, length);
2661 		path[--insert] = '/';
2662 	}
2663 
2664 	// the root dir will result in an empty path: fix it
2665 	if (path[insert] == '\0')
2666 		path[--insert] = '/';
2667 
2668 	TRACE(("  path is: %s\n", path + insert));
2669 
2670 	// move the path to the start of the buffer
2671 	length = bufferSize - insert;
2672 	memmove(buffer, path + insert, length);
2673 
2674 out:
2675 	put_vnode(vnode);
2676 	return status;
2677 }
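
/*	Illustration (comment only) of the right-to-left construction above,
	for a vnode at /boot/home and a 16 byte buffer (indices 0-15):

		start:                insert = 15, buffer[15] = '\0'
		prepend leaf "home":  insert = 10, buffer + 10 == "/home"
		prepend leaf "boot":  insert = 5,  buffer + 5  == "/boot/home"

	after which the finished path is memmove()d to the buffer start.
*/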
2678 
2679 
2680 /*!	Checks the length of every path component, and adds a '.'
2681 	if the path ends in a slash.
2682 	The given path buffer must be able to store at least one
2683 	additional character.
2684 */
2685 static status_t
2686 check_path(char* to)
2687 {
2688 	int32 length = 0;
2689 
2690 	// check length of every path component
2691 
2692 	while (*to) {
2693 		char* begin;
2694 		if (*to == '/')
2695 			to++, length++;
2696 
2697 		begin = to;
2698 		while (*to != '/' && *to)
2699 			to++, length++;
2700 
2701 		if (to - begin > B_FILE_NAME_LENGTH)
2702 			return B_NAME_TOO_LONG;
2703 	}
2704 
2705 	if (length == 0)
2706 		return B_ENTRY_NOT_FOUND;
2707 
2708 	// complete path if there is a slash at the end
2709 
2710 	if (*(to - 1) == '/') {
2711 		if (length > B_PATH_NAME_LENGTH - 2)
2712 			return B_NAME_TOO_LONG;
2713 
2714 		to[0] = '.';
2715 		to[1] = '\0';
2716 	}
2717 
2718 	return B_OK;
2719 }
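
/*	Examples (comment only): "foo/bar/" is completed in place to
	"foo/bar/.", so the leaf refers to the directory itself, while any
	single component longer than B_FILE_NAME_LENGTH makes the whole path
	fail with B_NAME_TOO_LONG.
*/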
2720 
2721 
2722 static struct file_descriptor*
2723 get_fd_and_vnode(int fd, struct vnode** _vnode, bool kernel)
2724 {
2725 	struct file_descriptor* descriptor
2726 		= get_fd(get_current_io_context(kernel), fd);
2727 	if (descriptor == NULL)
2728 		return NULL;
2729 
2730 	struct vnode* vnode = fd_vnode(descriptor);
2731 	if (vnode == NULL) {
2732 		put_fd(descriptor);
2733 		return NULL;
2734 	}
2735 
2736 	// TODO: when we can close a file descriptor at any point, investigate
2737 	//	if this is still valid to do (accessing the vnode without ref_count
2738 	//	or locking)
2739 	*_vnode = vnode;
2740 	return descriptor;
2741 }
2742 
2743 
2744 static struct vnode*
2745 get_vnode_from_fd(int fd, bool kernel)
2746 {
2747 	struct file_descriptor* descriptor;
2748 	struct vnode* vnode;
2749 
2750 	descriptor = get_fd(get_current_io_context(kernel), fd);
2751 	if (descriptor == NULL)
2752 		return NULL;
2753 
2754 	vnode = fd_vnode(descriptor);
2755 	if (vnode != NULL)
2756 		inc_vnode_ref_count(vnode);
2757 
2758 	put_fd(descriptor);
2759 	return vnode;
2760 }
2761 
2762 
2763 /*!	Gets the vnode from an FD + path combination. If \a fd is lower than zero,
2764 	only the path will be considered. In this case, the \a path must not be
2765 	NULL.
2766 	If \a fd is a valid file descriptor, \a path may be NULL for directories,
2767 	and should be NULL for files.
2768 */
2769 static status_t
2770 fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
2771 	struct vnode** _vnode, ino_t* _parentID, bool kernel)
2772 {
2773 	if (fd < 0 && !path)
2774 		return B_BAD_VALUE;
2775 
2776 	if (path != NULL && *path == '\0')
2777 		return B_ENTRY_NOT_FOUND;
2778 
2779 	if (fd < 0 || (path != NULL && path[0] == '/')) {
2780 		// no FD or absolute path
2781 		return path_to_vnode(path, traverseLeafLink, _vnode, _parentID, kernel);
2782 	}
2783 
2784 	// FD only, or FD + relative path
2785 	struct vnode* vnode = get_vnode_from_fd(fd, kernel);
2786 	if (vnode == NULL)
2787 		return B_FILE_ERROR;
2788 
2789 	if (path != NULL) {
2790 		return vnode_path_to_vnode(vnode, path, traverseLeafLink, 0, kernel,
2791 			_vnode, _parentID);
2792 	}
2793 
2794 	// there is no relative path to take into account
2795 
2796 	*_vnode = vnode;
2797 	if (_parentID)
2798 		*_parentID = -1;
2799 
2800 	return B_OK;
2801 }
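
/*	Sketch (comment only) of the FD + path combinations accepted above,
	mirroring the POSIX *at() style of addressing (dirFD is hypothetical):

		fd_and_path_to_vnode(-1,    "/tmp/x", ...)	// absolute path only
		fd_and_path_to_vnode(dirFD, "x",      ...)	// relative to dirFD
		fd_and_path_to_vnode(dirFD, NULL,     ...)	// dirFD's own vnode
*/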
2802 
2803 
2804 static int
2805 get_new_fd(int type, struct fs_mount* mount, struct vnode* vnode,
2806 	void* cookie, int openMode, bool kernel)
2807 {
2808 	struct file_descriptor* descriptor;
2809 	int fd;
2810 
2811 	// If the vnode is locked, we don't allow creating a new file/directory
2812 	// file_descriptor for it
2813 	if (vnode && vnode->mandatory_locked_by != NULL
2814 		&& (type == FDTYPE_FILE || type == FDTYPE_DIR))
2815 		return B_BUSY;
2816 
2817 	descriptor = alloc_fd();
2818 	if (!descriptor)
2819 		return B_NO_MEMORY;
2820 
2821 	if (vnode)
2822 		descriptor->u.vnode = vnode;
2823 	else
2824 		descriptor->u.mount = mount;
2825 	descriptor->cookie = cookie;
2826 
2827 	switch (type) {
2828 		// vnode types
2829 		case FDTYPE_FILE:
2830 			descriptor->ops = &sFileOps;
2831 			break;
2832 		case FDTYPE_DIR:
2833 			descriptor->ops = &sDirectoryOps;
2834 			break;
2835 		case FDTYPE_ATTR:
2836 			descriptor->ops = &sAttributeOps;
2837 			break;
2838 		case FDTYPE_ATTR_DIR:
2839 			descriptor->ops = &sAttributeDirectoryOps;
2840 			break;
2841 
2842 		// mount types
2843 		case FDTYPE_INDEX_DIR:
2844 			descriptor->ops = &sIndexDirectoryOps;
2845 			break;
2846 		case FDTYPE_QUERY:
2847 			descriptor->ops = &sQueryOps;
2848 			break;
2849 
2850 		default:
2851 			panic("get_new_fd() called with unknown type %d\n", type);
2852 			break;
2853 	}
2854 	descriptor->type = type;
2855 	descriptor->open_mode = openMode;
2856 
2857 	io_context* context = get_current_io_context(kernel);
2858 	fd = new_fd(context, descriptor);
2859 	if (fd < 0) {
2860 		free(descriptor);
2861 		return B_NO_MORE_FDS;
2862 	}
2863 
2864 	mutex_lock(&context->io_mutex);
2865 	fd_set_close_on_exec(context, fd, (openMode & O_CLOEXEC) != 0);
2866 	mutex_unlock(&context->io_mutex);
2867 
2868 	return fd;
2869 }
2870 
2871 
2872 /*!	Normalizes \a path in place. It's otherwise semantically equivalent to
2873 	vfs_normalize_path(). See there for more documentation.
2874 */
2875 static status_t
2876 normalize_path(char* path, size_t pathSize, bool traverseLink, bool kernel)
2877 {
2878 	VNodePutter dirPutter;
2879 	struct vnode* dir = NULL;
2880 	status_t error;
2881 
2882 	for (int i = 0; i < B_MAX_SYMLINKS; i++) {
2883 		// get dir vnode + leaf name
2884 		struct vnode* nextDir;
2885 		char leaf[B_FILE_NAME_LENGTH];
2886 		error = vnode_and_path_to_dir_vnode(dir, path, &nextDir, leaf, kernel);
2887 		if (error != B_OK)
2888 			return error;
2889 
2890 		dir = nextDir;
2891 		strcpy(path, leaf);
2892 		dirPutter.SetTo(dir);
2893 
2894 		// get file vnode, if we shall resolve links
2895 		bool fileExists = false;
2896 		struct vnode* fileVnode;
2897 		VNodePutter fileVnodePutter;
2898 		if (traverseLink) {
2899 			inc_vnode_ref_count(dir);
2900 			if (vnode_path_to_vnode(dir, path, false, 0, kernel, &fileVnode,
2901 					NULL) == B_OK) {
2902 				fileVnodePutter.SetTo(fileVnode);
2903 				fileExists = true;
2904 			}
2905 		}
2906 
2907 		if (!fileExists || !traverseLink || !S_ISLNK(fileVnode->Type())) {
2908 			// we're done -- construct the path
2909 			bool hasLeaf = true;
2910 			if (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0) {
2911 				// special cases "." and ".." -- get the dir, forget the leaf
2912 				inc_vnode_ref_count(dir);
2913 				error = vnode_path_to_vnode(dir, leaf, false, 0, kernel,
2914 					&nextDir, NULL);
2915 				if (error != B_OK)
2916 					return error;
2917 				dir = nextDir;
2918 				dirPutter.SetTo(dir);
2919 				hasLeaf = false;
2920 			}
2921 
2922 			// get the directory path
2923 			error = dir_vnode_to_path(dir, path, B_PATH_NAME_LENGTH, kernel);
2924 			if (error != B_OK)
2925 				return error;
2926 
2927 			// append the leaf name
2928 			if (hasLeaf) {
2929 				// insert a directory separator if this is not the file system
2930 				// root
2931 				if ((strcmp(path, "/") != 0
2932 					&& strlcat(path, "/", pathSize) >= pathSize)
2933 					|| strlcat(path, leaf, pathSize) >= pathSize) {
2934 					return B_NAME_TOO_LONG;
2935 				}
2936 			}
2937 
2938 			return B_OK;
2939 		}
2940 
2941 		// read link
2942 		if (HAS_FS_CALL(fileVnode, read_symlink)) {
2943 			size_t bufferSize = B_PATH_NAME_LENGTH - 1;
2944 			error = FS_CALL(fileVnode, read_symlink, path, &bufferSize);
2945 			if (error != B_OK)
2946 				return error;
2947 			path[bufferSize] = '\0';
2948 		} else
2949 			return B_BAD_VALUE;
2950 	}
2951 
2952 	return B_LINK_LIMIT;
2953 }
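
/*	Example (comment only), assuming /boot/home exists and contains no
	symlinks: the loop above turns

		"/boot/./home//Desktop/.."  into  "/boot/home"

	resolving ".", "..", and duplicate slashes, and (with traverseLink set)
	following symlinks for at most B_MAX_SYMLINKS rounds before giving up
	with B_LINK_LIMIT.
*/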
2954 
2955 
2956 static status_t
2957 resolve_covered_parent(struct vnode* parent, dev_t* _device, ino_t* _node,
2958 	struct io_context* ioContext)
2959 {
2960 	// Make sure the IO context root is not bypassed.
2961 	if (parent == ioContext->root) {
2962 		*_device = parent->device;
2963 		*_node = parent->id;
2964 		return B_OK;
2965 	}
2966 
2967 	inc_vnode_ref_count(parent);
2968 		// vnode_path_to_vnode() puts the node
2969 
2970 	// ".." is guaranteed not to be clobbered by this call
2971 	struct vnode* vnode;
2972 	status_t status = vnode_path_to_vnode(parent, (char*)"..", false, 0,
2973 		ioContext, &vnode, NULL);
2974 	if (status == B_OK) {
2975 		*_device = vnode->device;
2976 		*_node = vnode->id;
2977 		put_vnode(vnode);
2978 	}
2979 
2980 	return status;
2981 }
2982 
2983 
2984 #ifdef ADD_DEBUGGER_COMMANDS
2985 
2986 
2987 static void
2988 _dump_advisory_locking(advisory_locking* locking)
2989 {
2990 	if (locking == NULL)
2991 		return;
2992 
2993 	kprintf("   lock:        %" B_PRId32 "\n", locking->lock);
2994 	kprintf("   wait_sem:    %" B_PRId32 "\n", locking->wait_sem);
2995 
2996 	int32 index = 0;
2997 	LockList::Iterator iterator = locking->locks.GetIterator();
2998 	while (iterator.HasNext()) {
2999 		struct advisory_lock* lock = iterator.Next();
3000 
3001 		kprintf("   [%2" B_PRId32 "] team:   %" B_PRId32 "\n", index++, lock->team);
3002 		kprintf("        start:  %" B_PRIdOFF "\n", lock->start);
3003 		kprintf("        end:    %" B_PRIdOFF "\n", lock->end);
3004 		kprintf("        shared? %s\n", lock->shared ? "yes" : "no");
3005 	}
3006 }
3007 
3008 
3009 static void
3010 _dump_mount(struct fs_mount* mount)
3011 {
3012 	kprintf("MOUNT: %p\n", mount);
3013 	kprintf(" id:            %" B_PRIdDEV "\n", mount->id);
3014 	kprintf(" device_name:   %s\n", mount->device_name);
3015 	kprintf(" root_vnode:    %p\n", mount->root_vnode);
3016 	kprintf(" covers:        %p\n", mount->root_vnode->covers);
3017 	kprintf(" partition:     %p\n", mount->partition);
3018 	kprintf(" lock:          %p\n", &mount->rlock);
3019 	kprintf(" flags:        %s%s\n", mount->unmounting ? " unmounting" : "",
3020 		mount->owns_file_device ? " owns_file_device" : "");
3021 
3022 	fs_volume* volume = mount->volume;
3023 	while (volume != NULL) {
3024 		kprintf(" volume %p:\n", volume);
3025 		kprintf("  layer:            %" B_PRId32 "\n", volume->layer);
3026 		kprintf("  private_volume:   %p\n", volume->private_volume);
3027 		kprintf("  ops:              %p\n", volume->ops);
3028 		kprintf("  file_system:      %p\n", volume->file_system);
3029 		kprintf("  file_system_name: %s\n", volume->file_system_name);
3030 		volume = volume->super_volume;
3031 	}
3032 
3033 	set_debug_variable("_volume", (addr_t)mount->volume->private_volume);
3034 	set_debug_variable("_root", (addr_t)mount->root_vnode);
3035 	set_debug_variable("_covers", (addr_t)mount->root_vnode->covers);
3036 	set_debug_variable("_partition", (addr_t)mount->partition);
3037 }
3038 
3039 
3040 static bool
3041 debug_prepend_vnode_name_to_path(char* buffer, size_t& bufferSize,
3042 	const char* name)
3043 {
3044 	bool insertSlash = buffer[bufferSize] != '\0';
3045 	size_t nameLength = strlen(name);
3046 
3047 	if (bufferSize < nameLength + (insertSlash ? 1 : 0))
3048 		return false;
3049 
3050 	if (insertSlash)
3051 		buffer[--bufferSize] = '/';
3052 
3053 	bufferSize -= nameLength;
3054 	memcpy(buffer + bufferSize, name, nameLength);
3055 
3056 	return true;
3057 }
3058 
3059 
3060 static bool
3061 debug_prepend_vnode_id_to_path(char* buffer, size_t& bufferSize, dev_t devID,
3062 	ino_t nodeID)
3063 {
3064 	if (bufferSize == 0)
3065 		return false;
3066 
3067 	bool insertSlash = buffer[bufferSize] != '\0';
3068 	if (insertSlash)
3069 		buffer[--bufferSize] = '/';
3070 
3071 	size_t size = snprintf(buffer, bufferSize,
3072 		"<%" B_PRIdDEV ",%" B_PRIdINO ">", devID, nodeID);
3073 	if (size > bufferSize) {
3074 		if (insertSlash)
3075 			bufferSize++;
3076 		return false;
3077 	}
3078 
3079 	if (size < bufferSize)
3080 		memmove(buffer + bufferSize - size, buffer, size);
3081 
3082 	bufferSize -= size;
3083 	return true;
3084 }
3085 
3086 
3087 static char*
3088 debug_resolve_vnode_path(struct vnode* vnode, char* buffer, size_t bufferSize,
3089 	bool& _truncated)
3090 {
3091 	// null-terminate the path
3092 	buffer[--bufferSize] = '\0';
3093 
3094 	while (true) {
3095 		while (vnode->covers != NULL)
3096 			vnode = vnode->covers;
3097 
3098 		if (vnode == sRoot) {
3099 			_truncated = bufferSize == 0;
3100 			if (!_truncated)
3101 				buffer[--bufferSize] = '/';
3102 			return buffer + bufferSize;
3103 		}
3104 
3105 		// resolve the name
3106 		ino_t dirID;
3107 		const char* name = vnode->mount->entry_cache.DebugReverseLookup(
3108 			vnode->id, dirID);
3109 		if (name == NULL) {
3110 			// Failed to resolve the name -- prepend "<dev,node>/".
3111 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3112 				vnode->mount->id, vnode->id);
3113 			return buffer + bufferSize;
3114 		}
3115 
3116 		// prepend the name
3117 		if (!debug_prepend_vnode_name_to_path(buffer, bufferSize, name)) {
3118 			_truncated = true;
3119 			return buffer + bufferSize;
3120 		}
3121 
3122 		// resolve the directory node
3123 		struct vnode* nextVnode = lookup_vnode(vnode->mount->id, dirID);
3124 		if (nextVnode == NULL) {
3125 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3126 				vnode->mount->id, dirID);
3127 			return buffer + bufferSize;
3128 		}
3129 
3130 		vnode = nextVnode;
3131 	}
3132 }
3133 
3134 
3135 static void
3136 _dump_vnode(struct vnode* vnode, bool printPath)
3137 {
3138 	kprintf("VNODE: %p\n", vnode);
3139 	kprintf(" device:        %" B_PRIdDEV "\n", vnode->device);
3140 	kprintf(" id:            %" B_PRIdINO "\n", vnode->id);
3141 	kprintf(" ref_count:     %" B_PRId32 "\n", vnode->ref_count);
3142 	kprintf(" private_node:  %p\n", vnode->private_node);
3143 	kprintf(" mount:         %p\n", vnode->mount);
3144 	kprintf(" covered_by:    %p\n", vnode->covered_by);
3145 	kprintf(" covers:        %p\n", vnode->covers);
3146 	kprintf(" cache:         %p\n", vnode->cache);
3147 	kprintf(" type:          %#" B_PRIx32 "\n", vnode->Type());
3148 	kprintf(" flags:         %s%s%s\n", vnode->IsRemoved() ? "r" : "-",
3149 		vnode->IsBusy() ? "b" : "-", vnode->IsUnpublished() ? "u" : "-");
3150 	kprintf(" advisory_lock: %p\n", vnode->advisory_locking);
3151 
3152 	_dump_advisory_locking(vnode->advisory_locking);
3153 
3154 	if (printPath) {
3155 		void* buffer = debug_malloc(B_PATH_NAME_LENGTH);
3156 		if (buffer != NULL) {
3157 			bool truncated;
3158 			char* path = debug_resolve_vnode_path(vnode, (char*)buffer,
3159 				B_PATH_NAME_LENGTH, truncated);
3160 			if (path != NULL) {
3161 				kprintf(" path:          ");
3162 				if (truncated)
3163 					kputs("<truncated>/");
3164 				kputs(path);
3165 				kputs("\n");
3166 			} else
3167 				kprintf("Failed to resolve vnode path.\n");
3168 
3169 			debug_free(buffer);
3170 		} else
3171 			kprintf("Failed to allocate memory for constructing the path.\n");
3172 	}
3173 
3174 	set_debug_variable("_node", (addr_t)vnode->private_node);
3175 	set_debug_variable("_mount", (addr_t)vnode->mount);
3176 	set_debug_variable("_covered_by", (addr_t)vnode->covered_by);
3177 	set_debug_variable("_covers", (addr_t)vnode->covers);
3178 	set_debug_variable("_adv_lock", (addr_t)vnode->advisory_locking);
3179 }
3180 
3181 
3182 static int
3183 dump_mount(int argc, char** argv)
3184 {
3185 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3186 		kprintf("usage: %s [id|address]\n", argv[0]);
3187 		return 0;
3188 	}
3189 
3190 	ulong val = parse_expression(argv[1]);
3191 	uint32 id = val;
3192 
3193 	struct fs_mount* mount = sMountsTable->Lookup(id);
3194 	if (mount == NULL) {
3195 		if (IS_USER_ADDRESS(id)) {
3196 			kprintf("fs_mount not found\n");
3197 			return 0;
3198 		}
3199 		mount = (fs_mount*)val;
3200 	}
3201 
3202 	_dump_mount(mount);
3203 	return 0;
3204 }
3205 
3206 
3207 static int
3208 dump_mounts(int argc, char** argv)
3209 {
3210 	if (argc != 1) {
3211 		kprintf("usage: %s\n", argv[0]);
3212 		return 0;
3213 	}
3214 
3215 	kprintf("%-*s    id %-*s   %-*s   %-*s   fs_name\n",
3216 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "root",
3217 		B_PRINTF_POINTER_WIDTH, "covers", B_PRINTF_POINTER_WIDTH, "cookie");
3218 
3219 	struct fs_mount* mount;
3220 
3221 	MountTable::Iterator iterator(sMountsTable);
3222 	while (iterator.HasNext()) {
3223 		mount = iterator.Next();
3224 		kprintf("%p%4" B_PRIdDEV " %p %p %p %s\n", mount, mount->id, mount->root_vnode,
3225 			mount->root_vnode->covers, mount->volume->private_volume,
3226 			mount->volume->file_system_name);
3227 
3228 		fs_volume* volume = mount->volume;
3229 		while (volume->super_volume != NULL) {
3230 			volume = volume->super_volume;
3231 			kprintf("                                     %p %s\n",
3232 				volume->private_volume, volume->file_system_name);
3233 		}
3234 	}
3235 
3236 	return 0;
3237 }
3238 
3239 
3240 static int
3241 dump_vnode(int argc, char** argv)
3242 {
3243 	bool printPath = false;
3244 	int argi = 1;
3245 	if (argc >= 2 && strcmp(argv[argi], "-p") == 0) {
3246 		printPath = true;
3247 		argi++;
3248 	}
3249 
3250 	if (argi >= argc || argi + 2 < argc) {
3251 		print_debugger_command_usage(argv[0]);
3252 		return 0;
3253 	}
3254 
3255 	struct vnode* vnode = NULL;
3256 
3257 	if (argi + 1 == argc) {
3258 		vnode = (struct vnode*)parse_expression(argv[argi]);
3259 		if (IS_USER_ADDRESS(vnode)) {
3260 			kprintf("invalid vnode address\n");
3261 			return 0;
3262 		}
3263 		_dump_vnode(vnode, printPath);
3264 		return 0;
3265 	}
3266 
3267 	dev_t device = parse_expression(argv[argi]);
3268 	ino_t id = parse_expression(argv[argi + 1]);
3269 
3270 	VnodeTable::Iterator iterator(sVnodeTable);
3271 	while (iterator.HasNext()) {
3272 		vnode = iterator.Next();
3273 		if (vnode->id != id || vnode->device != device)
3274 			continue;
3275 
3276 		_dump_vnode(vnode, printPath);
3277 	}
3278 
3279 	return 0;
3280 }
3281 
3282 
3283 static int
3284 dump_vnodes(int argc, char** argv)
3285 {
3286 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3287 		kprintf("usage: %s [device]\n", argv[0]);
3288 		return 0;
3289 	}
3290 
3291 	// restrict dumped nodes to a certain device if requested
3292 	dev_t device = parse_expression(argv[1]);
3293 
3294 	struct vnode* vnode;
3295 
3296 	kprintf("%-*s   dev     inode  ref %-*s   %-*s   %-*s   flags\n",
3297 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache",
3298 		B_PRINTF_POINTER_WIDTH, "fs-node", B_PRINTF_POINTER_WIDTH, "locking");
3299 
3300 	VnodeTable::Iterator iterator(sVnodeTable);
3301 	while (iterator.HasNext()) {
3302 		vnode = iterator.Next();
3303 		if (vnode->device != device)
3304 			continue;
3305 
3306 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO "%5" B_PRId32 " %p %p %p %s%s%s\n",
3307 			vnode, vnode->device, vnode->id, vnode->ref_count, vnode->cache,
3308 			vnode->private_node, vnode->advisory_locking,
3309 			vnode->IsRemoved() ? "r" : "-", vnode->IsBusy() ? "b" : "-",
3310 			vnode->IsUnpublished() ? "u" : "-");
3311 	}
3312 
3313 	return 0;
3314 }
3315 
3316 
3317 static int
3318 dump_vnode_caches(int argc, char** argv)
3319 {
3320 	struct vnode* vnode;
3321 
3322 	if (argc > 2 || !strcmp(argv[1], "--help")) {
3323 		kprintf("usage: %s [device]\n", argv[0]);
3324 		return 0;
3325 	}
3326 
3327 	// restrict dumped nodes to a certain device if requested
3328 	dev_t device = -1;
3329 	if (argc > 1)
3330 		device = parse_expression(argv[1]);
3331 
3332 	kprintf("%-*s   dev     inode %-*s       size   pages\n",
3333 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache");
3334 
3335 	VnodeTable::Iterator iterator(sVnodeTable);
3336 	while (iterator.HasNext()) {
3337 		vnode = iterator.Next();
3338 		if (vnode->cache == NULL)
3339 			continue;
3340 		if (device != -1 && vnode->device != device)
3341 			continue;
3342 
3343 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO " %p %8" B_PRIdOFF "%8" B_PRId32 "\n",
3344 			vnode, vnode->device, vnode->id, vnode->cache,
3345 			(vnode->cache->virtual_end + B_PAGE_SIZE - 1) / B_PAGE_SIZE,
3346 			vnode->cache->page_count);
3347 	}
3348 
3349 	return 0;
3350 }
3351 
3352 
3353 int
3354 dump_io_context(int argc, char** argv)
3355 {
3356 	if (argc > 2 || !strcmp(argv[1], "--help")) {
3357 		kprintf("usage: %s [team-id|address]\n", argv[0]);
3358 		return 0;
3359 	}
3360 
3361 	struct io_context* context = NULL;
3362 
3363 	if (argc > 1) {
3364 		ulong num = parse_expression(argv[1]);
3365 		if (IS_KERNEL_ADDRESS(num))
3366 			context = (struct io_context*)num;
3367 		else {
3368 			Team* team = team_get_team_struct_locked(num);
3369 			if (team == NULL) {
3370 				kprintf("could not find team with ID %lu\n", num);
3371 				return 0;
3372 			}
3373 			context = (struct io_context*)team->io_context;
3374 		}
3375 	} else
3376 		context = get_current_io_context(true);
3377 
3378 	kprintf("I/O CONTEXT: %p\n", context);
3379 	kprintf(" root vnode:\t%p\n", context->root);
3380 	kprintf(" cwd vnode:\t%p\n", context->cwd);
3381 	kprintf(" used fds:\t%" B_PRIu32 "\n", context->num_used_fds);
3382 	kprintf(" max fds:\t%" B_PRIu32 "\n", context->table_size);
3383 
3384 	if (context->num_used_fds) {
3385 		kprintf("   no.  type    %*s  ref  open  mode         pos    %*s\n",
3386 			B_PRINTF_POINTER_WIDTH, "ops", B_PRINTF_POINTER_WIDTH, "cookie");
3387 	}
3388 
3389 	for (uint32 i = 0; i < context->table_size; i++) {
3390 		struct file_descriptor* fd = context->fds[i];
3391 		if (fd == NULL)
3392 			continue;
3393 
3394 		kprintf("  %3" B_PRIu32 ":  %4" B_PRId32 "  %p  %3" B_PRId32 "  %4"
3395 			B_PRIu32 "  %4" B_PRIx32 "  %10" B_PRIdOFF "  %p  %s %p\n", i,
3396 			fd->type, fd->ops, fd->ref_count, fd->open_count, fd->open_mode,
3397 			fd->pos, fd->cookie,
3398 			fd->type >= FDTYPE_INDEX && fd->type <= FDTYPE_QUERY
3399 				? "mount" : "vnode",
3400 			fd->u.vnode);
3401 	}
3402 
3403 	kprintf(" used monitors:\t%" B_PRIu32 "\n", context->num_monitors);
3404 	kprintf(" max monitors:\t%" B_PRIu32 "\n", context->max_monitors);
3405 
3406 	set_debug_variable("_cwd", (addr_t)context->cwd);
3407 
3408 	return 0;
3409 }
3410 
3411 
3412 int
3413 dump_vnode_usage(int argc, char** argv)
3414 {
3415 	if (argc != 1) {
3416 		kprintf("usage: %s\n", argv[0]);
3417 		return 0;
3418 	}
3419 
3420 	kprintf("Unused vnodes: %" B_PRIu32 " (max unused %" B_PRIu32 ")\n",
3421 		sUnusedVnodes, kMaxUnusedVnodes);
3422 
3423 	uint32 count = sVnodeTable->CountElements();
3424 
3425 	kprintf("%" B_PRIu32 " vnodes total (%" B_PRIu32 " in use).\n", count,
3426 		count - sUnusedVnodes);
3427 	return 0;
3428 }
3429 
3430 #endif	// ADD_DEBUGGER_COMMANDS
3431 
3432 
3433 /*!	Clears memory specified by an iovec array.
3434 */
3435 static void
3436 zero_iovecs(const iovec* vecs, size_t vecCount, size_t bytes)
3437 {
3438 	for (size_t i = 0; i < vecCount && bytes > 0; i++) {
3439 		size_t length = std::min(vecs[i].iov_len, bytes);
3440 		memset(vecs[i].iov_base, 0, length);
3441 		bytes -= length;
3442 	}
3443 }
3444 
3445 
3446 /*!	Does the dirty work of combining the file_io_vecs with the iovecs
3447 	and calls the file system hooks to read/write the request to disk.
3448 */
3449 static status_t
3450 common_file_io_vec_pages(struct vnode* vnode, void* cookie,
3451 	const file_io_vec* fileVecs, size_t fileVecCount, const iovec* vecs,
3452 	size_t vecCount, uint32* _vecIndex, size_t* _vecOffset, size_t* _numBytes,
3453 	bool doWrite)
3454 {
3455 	if (fileVecCount == 0) {
3456 		// There are no file vecs at this offset, so we're obviously trying
3457 		// to access the file outside of its bounds
3458 		return B_BAD_VALUE;
3459 	}
3460 
3461 	size_t numBytes = *_numBytes;
3462 	uint32 fileVecIndex;
3463 	size_t vecOffset = *_vecOffset;
3464 	uint32 vecIndex = *_vecIndex;
3465 	status_t status;
3466 	size_t size;
3467 
3468 	if (!doWrite && vecOffset == 0) {
3469 		// now directly read the data from the device
3470 		// the first file_io_vec can be read directly
3471 
3472 		if (fileVecs[0].length < (off_t)numBytes)
3473 			size = fileVecs[0].length;
3474 		else
3475 			size = numBytes;
3476 
3477 		if (fileVecs[0].offset >= 0) {
3478 			status = FS_CALL(vnode, read_pages, cookie, fileVecs[0].offset,
3479 				&vecs[vecIndex], vecCount - vecIndex, &size);
3480 		} else {
3481 			// sparse read
3482 			zero_iovecs(&vecs[vecIndex], vecCount - vecIndex, size);
3483 			status = B_OK;
3484 		}
3485 		if (status != B_OK)
3486 			return status;
3487 
3488 		// TODO: this is a work-around for buggy device drivers!
3489 		//	When our own drivers honour the length, we can:
3490 		//	a) also use this direct I/O for writes (otherwise, it would
3491 		//	   overwrite precious data)
3492 		//	b) panic if the term below is true (at least for writes)
3493 		if ((off_t)size > fileVecs[0].length) {
3494 			//dprintf("warning: device driver %p doesn't respect total length "
3495 			//	"in read_pages() call!\n", ref->device);
3496 			size = fileVecs[0].length;
3497 		}
3498 
3499 		ASSERT((off_t)size <= fileVecs[0].length);
3500 
3501 		// If the file portion was contiguous, we're already done now
3502 		if (size == numBytes)
3503 			return B_OK;
3504 
3505 		// if we reached the end of the file, we can return as well
3506 		if ((off_t)size != fileVecs[0].length) {
3507 			*_numBytes = size;
3508 			return B_OK;
3509 		}
3510 
3511 		fileVecIndex = 1;
3512 
3513 		// first, find out where we have to continue in our iovecs
3514 		for (; vecIndex < vecCount; vecIndex++) {
3515 			if (size < vecs[vecIndex].iov_len)
3516 				break;
3517 
3518 			size -= vecs[vecIndex].iov_len;
3519 		}
3520 
3521 		vecOffset = size;
3522 	} else {
3523 		fileVecIndex = 0;
3524 		size = 0;
3525 	}
3526 
3527 	// Too bad, let's process the rest of the file_io_vecs
3528 
3529 	size_t totalSize = size;
3530 	size_t bytesLeft = numBytes - size;
3531 
3532 	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
3533 		const file_io_vec &fileVec = fileVecs[fileVecIndex];
3534 		off_t fileOffset = fileVec.offset;
3535 		off_t fileLeft = min_c(fileVec.length, (off_t)bytesLeft);
3536 
3537 		TRACE(("FILE VEC [%" B_PRIu32 "] length %" B_PRIdOFF "\n", fileVecIndex,
3538 			fileLeft));
3539 
3540 		// process the complete fileVec
3541 		while (fileLeft > 0) {
3542 			iovec tempVecs[MAX_TEMP_IO_VECS];
3543 			uint32 tempCount = 0;
3544 
3545 			// size tracks how much of what is left of the current fileVec
3546 			// (fileLeft) has been assigned to tempVecs
3547 			size = 0;
3548 
3549 			// assign what is left of the current fileVec to the tempVecs
3550 			for (size = 0; (off_t)size < fileLeft && vecIndex < vecCount
3551 					&& tempCount < MAX_TEMP_IO_VECS;) {
3552 				// try to satisfy one iovec per iteration (or as much as
3553 				// possible)
3554 
3555 				// bytes left of the current iovec
3556 				size_t vecLeft = vecs[vecIndex].iov_len - vecOffset;
3557 				if (vecLeft == 0) {
3558 					vecOffset = 0;
3559 					vecIndex++;
3560 					continue;
3561 				}
3562 
3563 				TRACE(("fill vec %" B_PRIu32 ", offset = %lu, size = %lu\n",
3564 					vecIndex, vecOffset, size));
3565 
3566 				// actually available bytes
3567 				size_t tempVecSize = min_c(vecLeft, fileLeft - size);
3568 
3569 				tempVecs[tempCount].iov_base
3570 					= (void*)((addr_t)vecs[vecIndex].iov_base + vecOffset);
3571 				tempVecs[tempCount].iov_len = tempVecSize;
3572 				tempCount++;
3573 
3574 				size += tempVecSize;
3575 				vecOffset += tempVecSize;
3576 			}
3577 
3578 			size_t bytes = size;
3579 
3580 			if (fileOffset == -1) {
3581 				if (doWrite) {
3582 					panic("sparse write attempt: vnode %p", vnode);
3583 					status = B_IO_ERROR;
3584 				} else {
3585 					// sparse read
3586 					zero_iovecs(tempVecs, tempCount, bytes);
3587 					status = B_OK;
3588 				}
3589 			} else if (doWrite) {
3590 				status = FS_CALL(vnode, write_pages, cookie, fileOffset,
3591 					tempVecs, tempCount, &bytes);
3592 			} else {
3593 				status = FS_CALL(vnode, read_pages, cookie, fileOffset,
3594 					tempVecs, tempCount, &bytes);
3595 			}
3596 			if (status != B_OK)
3597 				return status;
3598 
3599 			totalSize += bytes;
3600 			bytesLeft -= size;
3601 			if (fileOffset >= 0)
3602 				fileOffset += size;
3603 			fileLeft -= size;
3604 			//dprintf("-> file left = %Lu\n", fileLeft);
3605 
3606 			if (size != bytes || vecIndex >= vecCount) {
3607 				// there are no more bytes or iovecs, let's bail out
3608 				*_numBytes = totalSize;
3609 				return B_OK;
3610 			}
3611 		}
3612 	}
3613 
3614 	*_vecIndex = vecIndex;
3615 	*_vecOffset = vecOffset;
3616 	*_numBytes = totalSize;
3617 	return B_OK;
3618 }
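
/*	Worked example (comment only): a 12288 byte read backed by two extents

		fileVecs = { { offset = 4096,  length = 8192 },
		             { offset = 65536, length = 4096 } }

	and destined for three 4096 byte iovecs is carried out as one
	read_pages() call for the first 8192 bytes (filling vecs[0] and
	vecs[1]) and another one for the remaining 4096 bytes (filling
	vecs[2]). A file vec with offset == -1 denotes a sparse region; it is
	zeroed via zero_iovecs() instead of being read from disk.
*/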
3619 
3620 
3621 static bool
3622 is_user_in_group(gid_t gid)
3623 {
3624 	if (gid == getegid())
3625 		return true;
3626 
3627 	gid_t groups[NGROUPS_MAX];
3628 	int groupCount = getgroups(NGROUPS_MAX, groups);
3629 	for (int i = 0; i < groupCount; i++) {
3630 		if (gid == groups[i])
3631 			return true;
3632 	}
3633 
3634 	return false;
3635 }
3636 
3637 
3638 static status_t
3639 free_io_context(io_context* context)
3640 {
3641 	uint32 i;
3642 
3643 	TIOC(FreeIOContext(context));
3644 
3645 	if (context->root)
3646 		put_vnode(context->root);
3647 
3648 	if (context->cwd)
3649 		put_vnode(context->cwd);
3650 
3651 	mutex_lock(&context->io_mutex);
3652 
3653 	for (i = 0; i < context->table_size; i++) {
3654 		if (struct file_descriptor* descriptor = context->fds[i]) {
3655 			close_fd(context, descriptor);
3656 			put_fd(descriptor);
3657 		}
3658 	}
3659 
3660 	mutex_destroy(&context->io_mutex);
3661 
3662 	remove_node_monitors(context);
3663 	free(context->fds);
3664 	free(context);
3665 
3666 	return B_OK;
3667 }
3668 
3669 
3670 static status_t
3671 resize_monitor_table(struct io_context* context, const int newSize)
3672 {
3673 	int	status = B_OK;
3674 
3675 	if (newSize <= 0 || newSize > MAX_NODE_MONITORS)
3676 		return B_BAD_VALUE;
3677 
3678 	mutex_lock(&context->io_mutex);
3679 
3680 	if ((size_t)newSize < context->num_monitors) {
3681 		status = B_BUSY;
3682 		goto out;
3683 	}
3684 	context->max_monitors = newSize;
3685 
3686 out:
3687 	mutex_unlock(&context->io_mutex);
3688 	return status;
3689 }
3690 
3691 
3692 //	#pragma mark - public API for file systems
3693 
3694 
3695 extern "C" status_t
3696 new_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3697 	fs_vnode_ops* ops)
3698 {
3699 	FUNCTION(("new_vnode(volume = %p (%" B_PRId32 "), vnodeID = %" B_PRId64
3700 		", node = %p)\n", volume, volume->id, vnodeID, privateNode));
3701 
3702 	if (privateNode == NULL)
3703 		return B_BAD_VALUE;
3704 
3705 	int32 tries = BUSY_VNODE_RETRIES;
3706 restart:
3707 	// create the node
3708 	bool nodeCreated;
3709 	struct vnode* vnode;
3710 	status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3711 		nodeCreated);
3712 	if (status != B_OK)
3713 		return status;
3714 
3715 	WriteLocker nodeLocker(sVnodeLock, true);
3716 		// create_new_vnode_and_lock() has locked for us
3717 
3718 	if (!nodeCreated && vnode->IsBusy()) {
3719 		nodeLocker.Unlock();
3720 		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3721 			return B_BUSY;
3722 		goto restart;
3723 	}
3724 
3725 	// file system integrity check:
3726 	// test if the vnode already exists and bail out if this is the case!
3727 	if (!nodeCreated) {
3728 		panic("vnode %" B_PRIdDEV ":%" B_PRIdINO " already exists (node = %p, "
3729 			"vnode->node = %p)!", volume->id, vnodeID, privateNode,
3730 			vnode->private_node);
3731 		return B_ERROR;
3732 	}
3733 
3734 	vnode->private_node = privateNode;
3735 	vnode->ops = ops;
3736 	vnode->SetUnpublished(true);
3737 
3738 	TRACE(("returns: %s\n", strerror(status)));
3739 
3740 	return status;
3741 }
3742 
3743 
3744 extern "C" status_t
3745 publish_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3746 	fs_vnode_ops* ops, int type, uint32 flags)
3747 {
3748 	FUNCTION(("publish_vnode()\n"));
3749 
3750 	int32 tries = BUSY_VNODE_RETRIES;
3751 restart:
3752 	WriteLocker locker(sVnodeLock);
3753 
3754 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3755 
3756 	bool nodeCreated = false;
3757 	if (vnode == NULL) {
3758 		if (privateNode == NULL)
3759 			return B_BAD_VALUE;
3760 
3761 		// create the node
3762 		locker.Unlock();
3763 			// create_new_vnode_and_lock() will re-lock for us on success
3764 		status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3765 			nodeCreated);
3766 		if (status != B_OK)
3767 			return status;
3768 
3769 		locker.SetTo(sVnodeLock, true);
3770 	}
3771 
3772 	if (nodeCreated) {
3773 		vnode->private_node = privateNode;
3774 		vnode->ops = ops;
3775 		vnode->SetUnpublished(true);
3776 	} else if (vnode->IsBusy() && vnode->IsUnpublished()
3777 		&& vnode->private_node == privateNode && vnode->ops == ops) {
3778 		// already known, but not published
3779 	} else if (vnode->IsBusy()) {
3780 		locker.Unlock();
3781 		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3782 			return B_BUSY;
3783 		goto restart;
3784 	} else
3785 		return B_BAD_VALUE;
3786 
3787 	bool publishSpecialSubNode = false;
3788 
3789 	vnode->SetType(type);
3790 	vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
3791 	publishSpecialSubNode = is_special_node_type(type)
3792 		&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
3793 
3794 	status_t status = B_OK;
3795 
3796 	// create sub vnodes, if necessary
3797 	if (volume->sub_volume != NULL || publishSpecialSubNode) {
3798 		locker.Unlock();
3799 
3800 		fs_volume* subVolume = volume;
3801 		if (volume->sub_volume != NULL) {
3802 			while (status == B_OK && subVolume->sub_volume != NULL) {
3803 				subVolume = subVolume->sub_volume;
3804 				status = subVolume->ops->create_sub_vnode(subVolume, vnodeID,
3805 					vnode);
3806 			}
3807 		}
3808 
3809 		if (status == B_OK && publishSpecialSubNode)
3810 			status = create_special_sub_node(vnode, flags);
3811 
3812 		if (status != B_OK) {
3813 			// error -- clean up the created sub vnodes
3814 			while (subVolume->super_volume != volume) {
3815 				subVolume = subVolume->super_volume;
3816 				subVolume->ops->delete_sub_vnode(subVolume, vnode);
3817 			}
3818 		}
3819 
3820 		if (status == B_OK) {
3821 			ReadLocker vnodesReadLocker(sVnodeLock);
3822 			AutoLocker<Vnode> nodeLocker(vnode);
3823 			vnode->SetBusy(false);
3824 			vnode->SetUnpublished(false);
3825 		} else {
3826 			locker.Lock();
3827 			sVnodeTable->Remove(vnode);
3828 			remove_vnode_from_mount_list(vnode, vnode->mount);
3829 			free(vnode);
3830 		}
3831 	} else {
3832 		// we still hold the write lock -- mark the node unbusy and published
3833 		vnode->SetBusy(false);
3834 		vnode->SetUnpublished(false);
3835 	}
3836 
3837 	TRACE(("returns: %s\n", strerror(status)));
3838 
3839 	return status;
3840 }
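

/*	Illustrative sketch (editor addition, not part of the VFS): a file system
	would typically pair new_vnode() with publish_vnode(), registering the
	node unpublished first and making it visible once it is fully set up.
	The names myfs_publish_inode and gMyFSVnodeOps below are hypothetical.
*/
#if 0
static status_t
myfs_publish_inode(fs_volume* volume, ino_t id, void* inode, int type)
{
	// register the node with the VFS; it stays busy and unpublished for now
	status_t status = new_vnode(volume, id, inode, &gMyFSVnodeOps);
	if (status != B_OK)
		return status;

	// ... finish initializing the private node here ...

	// make the node visible to the VFS
	return publish_vnode(volume, id, inode, &gMyFSVnodeOps, type, 0);
}
#endif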
3841 
3842 
3843 extern "C" status_t
3844 get_vnode(fs_volume* volume, ino_t vnodeID, void** _privateNode)
3845 {
3846 	struct vnode* vnode;
3847 
3848 	if (volume == NULL)
3849 		return B_BAD_VALUE;
3850 
3851 	status_t status = get_vnode(volume->id, vnodeID, &vnode, true, true);
3852 	if (status != B_OK)
3853 		return status;
3854 
3855 	// If this is a layered FS, we need to get the node cookie for the requested
3856 	// layer.
3857 	if (HAS_FS_CALL(vnode, get_super_vnode)) {
3858 		fs_vnode resolvedNode;
3859 		status_t status = FS_CALL(vnode, get_super_vnode, volume,
3860 			&resolvedNode);
3861 		if (status != B_OK) {
3862 			panic("get_vnode(): Failed to get super node for vnode %p, "
3863 				"volume: %p", vnode, volume);
3864 			put_vnode(vnode);
3865 			return status;
3866 		}
3867 
3868 		if (_privateNode != NULL)
3869 			*_privateNode = resolvedNode.private_node;
3870 	} else if (_privateNode != NULL)
3871 		*_privateNode = vnode->private_node;
3872 
3873 	return B_OK;
3874 }
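

/*	Illustrative sketch (editor addition): a file system's lookup hook
	resolving a directory entry via get_vnode(), which also returns the node
	cookie of the correct layer. myfs_resolve_entry is a hypothetical helper.
*/
#if 0
static status_t
myfs_lookup(fs_volume* volume, fs_vnode* dir, const char* name, ino_t* _id)
{
	ino_t id;
	if (myfs_resolve_entry(dir->private_node, name, &id) != B_OK)
		return B_ENTRY_NOT_FOUND;

	// gets a reference that the VFS later releases via put_vnode()
	void* privateNode;
	status_t status = get_vnode(volume, id, &privateNode);
	if (status != B_OK)
		return status;

	*_id = id;
	return B_OK;
}
#endif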
3875 
3876 
3877 extern "C" status_t
3878 acquire_vnode(fs_volume* volume, ino_t vnodeID)
3879 {
3880 	struct vnode* vnode;
3881 
3882 	rw_lock_read_lock(&sVnodeLock);
3883 	vnode = lookup_vnode(volume->id, vnodeID);
3884 	rw_lock_read_unlock(&sVnodeLock);
3885 
3886 	if (vnode == NULL)
3887 		return B_BAD_VALUE;
3888 
3889 	inc_vnode_ref_count(vnode);
3890 	return B_OK;
3891 }
3892 
3893 
3894 extern "C" status_t
3895 put_vnode(fs_volume* volume, ino_t vnodeID)
3896 {
3897 	struct vnode* vnode;
3898 
3899 	rw_lock_read_lock(&sVnodeLock);
3900 	vnode = lookup_vnode(volume->id, vnodeID);
3901 	rw_lock_read_unlock(&sVnodeLock);
3902 
3903 	if (vnode == NULL)
3904 		return B_BAD_VALUE;
3905 
3906 	dec_vnode_ref_count(vnode, false, true);
3907 	return B_OK;
3908 }
3909 
3910 
3911 extern "C" status_t
3912 remove_vnode(fs_volume* volume, ino_t vnodeID)
3913 {
3914 	ReadLocker locker(sVnodeLock);
3915 
3916 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3917 	if (vnode == NULL)
3918 		return B_ENTRY_NOT_FOUND;
3919 
3920 	if (vnode->covered_by != NULL || vnode->covers != NULL) {
3921 		// this vnode is in use
3922 		return B_BUSY;
3923 	}
3924 
3925 	vnode->Lock();
3926 
3927 	vnode->SetRemoved(true);
3928 	bool removeUnpublished = false;
3929 
3930 	if (vnode->IsUnpublished()) {
3931 		// prepare the vnode for deletion
3932 		removeUnpublished = true;
3933 		vnode->SetBusy(true);
3934 	}
3935 
3936 	vnode->Unlock();
3937 	locker.Unlock();
3938 
3939 	if (removeUnpublished) {
3940 		// If the vnode hasn't been published yet, we delete it here
3941 		atomic_add(&vnode->ref_count, -1);
3942 		free_vnode(vnode, true);
3943 	}
3944 
3945 	return B_OK;
3946 }
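

/*	Illustrative sketch (editor addition): an unlink hook would call
	remove_vnode() once the link count drops to zero; the VFS then deletes
	the node when its last reference is put. The myfs_* helpers are
	hypothetical.
*/
#if 0
static status_t
myfs_unlink(fs_volume* volume, fs_vnode* dir, const char* name)
{
	ino_t id;
	status_t status = myfs_remove_entry(dir->private_node, name, &id);
	if (status != B_OK)
		return status;

	if (myfs_link_count(volume, id) == 0)
		status = remove_vnode(volume, id);

	return status;
}
#endif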
3947 
3948 
3949 extern "C" status_t
3950 unremove_vnode(fs_volume* volume, ino_t vnodeID)
3951 {
3952 	struct vnode* vnode;
3953 
3954 	rw_lock_read_lock(&sVnodeLock);
3955 
3956 	vnode = lookup_vnode(volume->id, vnodeID);
3957 	if (vnode) {
3958 		AutoLocker<Vnode> nodeLocker(vnode);
3959 		vnode->SetRemoved(false);
3960 	}
3961 
3962 	rw_lock_read_unlock(&sVnodeLock);
3963 	return B_OK;
3964 }
3965 
3966 
3967 extern "C" status_t
3968 get_vnode_removed(fs_volume* volume, ino_t vnodeID, bool* _removed)
3969 {
3970 	ReadLocker _(sVnodeLock);
3971 
3972 	if (struct vnode* vnode = lookup_vnode(volume->id, vnodeID)) {
3973 		if (_removed != NULL)
3974 			*_removed = vnode->IsRemoved();
3975 		return B_OK;
3976 	}
3977 
3978 	return B_BAD_VALUE;
3979 }
3980 
3981 
3982 extern "C" fs_volume*
3983 volume_for_vnode(fs_vnode* _vnode)
3984 {
3985 	if (_vnode == NULL)
3986 		return NULL;
3987 
3988 	struct vnode* vnode = static_cast<struct vnode*>(_vnode);
3989 	return vnode->mount->volume;
3990 }
3991 
3992 
3993 extern "C" status_t
3994 check_access_permissions(int accessMode, mode_t mode, gid_t nodeGroupID,
3995 	uid_t nodeUserID)
3996 {
3997 	// get node permissions
3998 	int userPermissions = (mode & S_IRWXU) >> 6;
3999 	int groupPermissions = (mode & S_IRWXG) >> 3;
4000 	int otherPermissions = mode & S_IRWXO;
4001 
4002 	// get the node permissions for this uid/gid
4003 	int permissions = 0;
4004 	uid_t uid = geteuid();
4005 
4006 	if (uid == 0) {
4007 		// user is root
4008 		// root always has read/write permission, but at least one of the
4009 		// X bits must be set for execute permission
4010 		permissions = userPermissions | groupPermissions | otherPermissions
4011 			| S_IROTH | S_IWOTH;
4012 		if (S_ISDIR(mode))
4013 			permissions |= S_IXOTH;
4014 	} else if (uid == nodeUserID) {
4015 		// user is node owner
4016 		permissions = userPermissions;
4017 	} else if (is_user_in_group(nodeGroupID)) {
4018 		// user is in owning group
4019 		permissions = groupPermissions;
4020 	} else {
4021 		// user is one of the others
4022 		permissions = otherPermissions;
4023 	}
4024 
4025 	return (accessMode & ~permissions) == 0 ? B_OK : B_PERMISSION_DENIED;
4026 }
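

/*	Illustrative sketch (editor addition): a file system's access() hook can
	simply delegate to check_access_permissions() after gathering the node's
	stat data. myfs_read_stat is a hypothetical helper.
*/
#if 0
static status_t
myfs_access(fs_volume* volume, fs_vnode* vnode, int accessMode)
{
	struct stat st;
	status_t status = myfs_read_stat(vnode->private_node, &st);
	if (status != B_OK)
		return status;

	return check_access_permissions(accessMode, st.st_mode, st.st_gid,
		st.st_uid);
}
#endif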
4027 
4028 
4029 #if 0
4030 extern "C" status_t
4031 read_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4032 	size_t* _numBytes)
4033 {
4034 	struct file_descriptor* descriptor;
4035 	struct vnode* vnode;
4036 
4037 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4038 	if (descriptor == NULL)
4039 		return B_FILE_ERROR;
4040 
4041 	status_t status = vfs_read_pages(vnode, descriptor->cookie, pos, vecs,
4042 		count, 0, _numBytes);
4043 
4044 	put_fd(descriptor);
4045 	return status;
4046 }
4047 
4048 
4049 extern "C" status_t
4050 write_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4051 	size_t* _numBytes)
4052 {
4053 	struct file_descriptor* descriptor;
4054 	struct vnode* vnode;
4055 
4056 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4057 	if (descriptor == NULL)
4058 		return B_FILE_ERROR;
4059 
4060 	status_t status = vfs_write_pages(vnode, descriptor->cookie, pos, vecs,
4061 		count, 0, _numBytes);
4062 
4063 	put_fd(descriptor);
4064 	return status;
4065 }
4066 #endif
4067 
4068 
4069 extern "C" status_t
4070 read_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4071 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4072 	size_t* _bytes)
4073 {
4074 	struct file_descriptor* descriptor;
4075 	struct vnode* vnode;
4076 
4077 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4078 	if (descriptor == NULL)
4079 		return B_FILE_ERROR;
4080 
4081 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4082 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4083 		false);
4084 
4085 	put_fd(descriptor);
4086 	return status;
4087 }
4088 
4089 
4090 extern "C" status_t
4091 write_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4092 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4093 	size_t* _bytes)
4094 {
4095 	struct file_descriptor* descriptor;
4096 	struct vnode* vnode;
4097 
4098 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4099 	if (descriptor == NULL)
4100 		return B_FILE_ERROR;
4101 
4102 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4103 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4104 		true);
4105 
4106 	put_fd(descriptor);
4107 	return status;
4108 }
4109 
4110 
4111 extern "C" status_t
4112 entry_cache_add(dev_t mountID, ino_t dirID, const char* name, ino_t nodeID)
4113 {
4114 	// lookup mount -- the caller is required to make sure that the mount
4115 	// won't go away
4116 	MutexLocker locker(sMountMutex);
4117 	struct fs_mount* mount = find_mount(mountID);
4118 	if (mount == NULL)
4119 		return B_BAD_VALUE;
4120 	locker.Unlock();
4121 
4122 	return mount->entry_cache.Add(dirID, name, nodeID, false);
4123 }
4124 
4125 
4126 extern "C" status_t
4127 entry_cache_add_missing(dev_t mountID, ino_t dirID, const char* name)
4128 {
4129 	// lookup mount -- the caller is required to make sure that the mount
4130 	// won't go away
4131 	MutexLocker locker(sMountMutex);
4132 	struct fs_mount* mount = find_mount(mountID);
4133 	if (mount == NULL)
4134 		return B_BAD_VALUE;
4135 	locker.Unlock();
4136 
4137 	return mount->entry_cache.Add(dirID, name, -1, true);
4138 }
4139 
4140 
4141 extern "C" status_t
4142 entry_cache_remove(dev_t mountID, ino_t dirID, const char* name)
4143 {
4144 	// lookup mount -- the caller is required to make sure that the mount
4145 	// won't go away
4146 	MutexLocker locker(sMountMutex);
4147 	struct fs_mount* mount = find_mount(mountID);
4148 	if (mount == NULL)
4149 		return B_BAD_VALUE;
4150 	locker.Unlock();
4151 
4152 	return mount->entry_cache.Remove(dirID, name);
4153 }
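

/*	Illustrative sketch (editor addition): how a file system might keep the
	entry cache coherent from its directory-modifying hooks. The surrounding
	context (volume, dirID, name, newID) is assumed.
*/
#if 0
// after successfully creating entry "name" with inode newID:
entry_cache_add(volume->id, dirID, name, newID);

// after a failed lookup, to short-circuit repeated misses:
entry_cache_add_missing(volume->id, dirID, name);

// after the entry has been removed again:
entry_cache_remove(volume->id, dirID, name);
#endif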
4154 
4155 
4156 //	#pragma mark - private VFS API
4157 //	Functions the VFS exports for other parts of the kernel
4158 
4159 
4160 /*! Acquires another reference to the vnode that has to be released
4161 	by calling vfs_put_vnode().
4162 */
4163 void
4164 vfs_acquire_vnode(struct vnode* vnode)
4165 {
4166 	inc_vnode_ref_count(vnode);
4167 }
4168 
4169 
4170 /*! This is currently called from file_cache_create() only.
4171 	It's probably a temporary solution as long as devfs requires that
4172 	fs_read_pages()/fs_write_pages() are called with the standard
4173 	open cookie and not with a device cookie.
4174 	If that's done differently, remove this call; it has no other
4175 	purpose.
4176 */
4177 extern "C" status_t
4178 vfs_get_cookie_from_fd(int fd, void** _cookie)
4179 {
4180 	struct file_descriptor* descriptor;
4181 
4182 	descriptor = get_fd(get_current_io_context(true), fd);
4183 	if (descriptor == NULL)
4184 		return B_FILE_ERROR;
4185 
4186 	*_cookie = descriptor->cookie;
4187 	return B_OK;
4188 }
4189 
4190 
4191 extern "C" status_t
4192 vfs_get_vnode_from_fd(int fd, bool kernel, struct vnode** vnode)
4193 {
4194 	*vnode = get_vnode_from_fd(fd, kernel);
4195 
4196 	if (*vnode == NULL)
4197 		return B_FILE_ERROR;
4198 
4199 	return B_NO_ERROR;
4200 }
4201 
4202 
4203 extern "C" status_t
4204 vfs_get_vnode_from_path(const char* path, bool kernel, struct vnode** _vnode)
4205 {
4206 	TRACE(("vfs_get_vnode_from_path: entry. path = '%s', kernel %d\n",
4207 		path, kernel));
4208 
4209 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
4210 	if (pathBuffer.InitCheck() != B_OK)
4211 		return B_NO_MEMORY;
4212 
4213 	char* buffer = pathBuffer.LockBuffer();
4214 	strlcpy(buffer, path, pathBuffer.BufferSize());
4215 
4216 	struct vnode* vnode;
4217 	status_t status = path_to_vnode(buffer, true, &vnode, NULL, kernel);
4218 	if (status != B_OK)
4219 		return status;
4220 
4221 	*_vnode = vnode;
4222 	return B_OK;
4223 }
4224 
4225 
4226 extern "C" status_t
4227 vfs_get_vnode(dev_t mountID, ino_t vnodeID, bool canWait, struct vnode** _vnode)
4228 {
4229 	struct vnode* vnode = NULL;
4230 
4231 	status_t status = get_vnode(mountID, vnodeID, &vnode, canWait, false);
4232 	if (status != B_OK)
4233 		return status;
4234 
4235 	*_vnode = vnode;
4236 	return B_OK;
4237 }
4238 
4239 
4240 extern "C" status_t
4241 vfs_entry_ref_to_vnode(dev_t mountID, ino_t directoryID,
4242 	const char* name, struct vnode** _vnode)
4243 {
4244 	return entry_ref_to_vnode(mountID, directoryID, name, false, true, _vnode);
4245 }
4246 
4247 
4248 extern "C" void
4249 vfs_vnode_to_node_ref(struct vnode* vnode, dev_t* _mountID, ino_t* _vnodeID)
4250 {
4251 	*_mountID = vnode->device;
4252 	*_vnodeID = vnode->id;
4253 }
4254 
4255 
4256 /*!
4257 	Helper function abstracting the process of "converting" a given
4258 	vnode-pointer to a fs_vnode-pointer.
4259 	Currently only used in bindfs.
4260 */
4261 extern "C" fs_vnode*
4262 vfs_fsnode_for_vnode(struct vnode* vnode)
4263 {
4264 	return vnode;
4265 }
4266 
4267 
4268 /*!
4269 	Calls fs_open() on the given vnode and returns a new
4270 	file descriptor for it
4271 */
4272 int
4273 vfs_open_vnode(struct vnode* vnode, int openMode, bool kernel)
4274 {
4275 	return open_vnode(vnode, openMode, kernel);
4276 }
4277 
4278 
4279 /*!	Looks up a vnode with the given mount and vnode ID.
4280 	Must only be used with "in-use" vnodes as it doesn't grab a reference
4281 	to the node.
4282 	It's currently only be used by file_cache_create().
4283 	It's currently only used by file_cache_create().
4284 extern "C" status_t
4285 vfs_lookup_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode)
4286 {
4287 	rw_lock_read_lock(&sVnodeLock);
4288 	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
4289 	rw_lock_read_unlock(&sVnodeLock);
4290 
4291 	if (vnode == NULL)
4292 		return B_ERROR;
4293 
4294 	*_vnode = vnode;
4295 	return B_OK;
4296 }
4297 
4298 
4299 extern "C" status_t
4300 vfs_get_fs_node_from_path(fs_volume* volume, const char* path,
4301 	bool traverseLeafLink, bool kernel, void** _node)
4302 {
4303 	TRACE(("vfs_get_fs_node_from_path(volume = %p, path = \"%s\", kernel %d)\n",
4304 		volume, path, kernel));
4305 
4306 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
4307 	if (pathBuffer.InitCheck() != B_OK)
4308 		return B_NO_MEMORY;
4309 
4310 	fs_mount* mount;
4311 	status_t status = get_mount(volume->id, &mount);
4312 	if (status != B_OK)
4313 		return status;
4314 
4315 	char* buffer = pathBuffer.LockBuffer();
4316 	strlcpy(buffer, path, pathBuffer.BufferSize());
4317 
4318 	struct vnode* vnode = mount->root_vnode;
4319 
4320 	if (buffer[0] == '/')
4321 		status = path_to_vnode(buffer, traverseLeafLink, &vnode, NULL, kernel);
4322 	else {
4323 		inc_vnode_ref_count(vnode);
4324 			// vnode_path_to_vnode() releases a reference to the starting vnode
4325 		status = vnode_path_to_vnode(vnode, buffer, traverseLeafLink, 0,
4326 			kernel, &vnode, NULL);
4327 	}
4328 
4329 	put_mount(mount);
4330 
4331 	if (status != B_OK)
4332 		return status;
4333 
4334 	if (vnode->device != volume->id) {
4335 		// wrong mount ID -- must not gain access to foreign file system nodes
4336 		put_vnode(vnode);
4337 		return B_BAD_VALUE;
4338 	}
4339 
4340 	// Use get_vnode() to resolve the cookie for the right layer.
4341 	status = get_vnode(volume, vnode->id, _node);
4342 	put_vnode(vnode);
4343 
4344 	return status;
4345 }
4346 
4347 
4348 status_t
4349 vfs_read_stat(int fd, const char* path, bool traverseLeafLink,
4350 	struct stat* stat, bool kernel)
4351 {
4352 	status_t status;
4353 
4354 	if (path != NULL) {
4355 		// path given: get the stat of the node referred to by (fd, path)
4356 		KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
4357 		if (pathBuffer.InitCheck() != B_OK)
4358 			return B_NO_MEMORY;
4359 
4360 		status = common_path_read_stat(fd, pathBuffer.LockBuffer(),
4361 			traverseLeafLink, stat, kernel);
4362 	} else {
4363 		// no path given: get the FD and use the FD operation
4364 		struct file_descriptor* descriptor
4365 			= get_fd(get_current_io_context(kernel), fd);
4366 		if (descriptor == NULL)
4367 			return B_FILE_ERROR;
4368 
4369 		if (descriptor->ops->fd_read_stat)
4370 			status = descriptor->ops->fd_read_stat(descriptor, stat);
4371 		else
4372 			status = B_UNSUPPORTED;
4373 
4374 		put_fd(descriptor);
4375 	}
4376 
4377 	return status;
4378 }
4379 
4380 
4381 /*!	Finds the full path to the file that contains the module \a moduleName,
4382 	puts it into \a pathBuffer, and returns B_OK for success.
4383 	If \a pathBuffer was too small, it returns \c B_BUFFER_OVERFLOW, or
4384 	\c B_ENTRY_NOT_FOUND if no file could be found.
4385 	\a pathBuffer is clobbered in any case and must not be relied on if this
4386 	function returns unsuccessfully.
4387 	\a basePath and \a pathBuffer must not point to the same space.
4388 */
4389 status_t
4390 vfs_get_module_path(const char* basePath, const char* moduleName,
4391 	char* pathBuffer, size_t bufferSize)
4392 {
4393 	struct vnode* dir;
4394 	struct vnode* file;
4395 	status_t status;
4396 	size_t length;
4397 	char* path;
4398 
4399 	if (bufferSize == 0
4400 		|| strlcpy(pathBuffer, basePath, bufferSize) >= bufferSize)
4401 		return B_BUFFER_OVERFLOW;
4402 
4403 	status = path_to_vnode(pathBuffer, true, &dir, NULL, true);
4404 	if (status != B_OK)
4405 		return status;
4406 
4407 	// the path buffer has been clobbered by the above call
4408 	length = strlcpy(pathBuffer, basePath, bufferSize);
4409 	if (pathBuffer[length - 1] != '/')
4410 		pathBuffer[length++] = '/';
4411 
4412 	path = pathBuffer + length;
4413 	bufferSize -= length;
4414 
4415 	while (moduleName) {
4416 		char* nextPath = strchr(moduleName, '/');
4417 		if (nextPath == NULL)
4418 			length = strlen(moduleName);
4419 		else {
4420 			length = nextPath - moduleName;
4421 			nextPath++;
4422 		}
4423 
4424 		if (length + 1 >= bufferSize) {
4425 			status = B_BUFFER_OVERFLOW;
4426 			goto err;
4427 		}
4428 
4429 		memcpy(path, moduleName, length);
4430 		path[length] = '\0';
4431 		moduleName = nextPath;
4432 
4433 		status = vnode_path_to_vnode(dir, path, true, 0, true, &file, NULL);
4434 		if (status != B_OK) {
4435 			// vnode_path_to_vnode() has already released the reference to dir
4436 			return status;
4437 		}
4438 
4439 		if (S_ISDIR(file->Type())) {
4440 			// go to the next directory
4441 			path[length] = '/';
4442 			path[length + 1] = '\0';
4443 			path += length + 1;
4444 			bufferSize -= length + 1;
4445 
4446 			dir = file;
4447 		} else if (S_ISREG(file->Type())) {
4448 			// it's a file so it should be what we've searched for
4449 			put_vnode(file);
4450 
4451 			return B_OK;
4452 		} else {
4453 			TRACE(("vfs_get_module_path(): something is strange here: "
4454 				"0x%08" B_PRIx32 "...\n", file->Type()));
4455 			status = B_ERROR;
4456 			dir = file;
4457 			goto err;
4458 		}
4459 	}
4460 
4461 	// if we got here, the moduleName just pointed to a directory, not to
4462 	// a real module - what should we do in this case?
4463 	status = B_ENTRY_NOT_FOUND;
4464 
4465 err:
4466 	put_vnode(dir);
4467 	return status;
4468 }
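

/*	Illustrative sketch (editor addition): resolving the file that contains a
	module. The base path and module name below are examples only.
*/
#if 0
char modulePath[B_PATH_NAME_LENGTH];
status_t status = vfs_get_module_path("/boot/system/add-ons/kernel",
	"bus_managers/usb/v3", modulePath, sizeof(modulePath));
if (status == B_OK)
	dprintf("module file: %s\n", modulePath);
#endif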
4469 
4470 
4471 /*!	\brief Normalizes a given path.
4472 
4473 	The path must refer to an existing or non-existing entry in an existing
4474 	directory, that is chopping off the leaf component the remaining path must
4475 	refer to an existing directory.
4476 
4477 	The returned path will be canonical in that it will be absolute, will not
4478 	contain any "." or ".." components or duplicate occurrences of '/'s,
4479 	and none of the directory components will be symbolic links.
4480 
4481 	Any two paths referring to the same entry will result in the same
4482 	normalized path (well, that is pretty much the definition of `normalized',
4483 	isn't it :-).
4484 
4485 	\param path The path to be normalized.
4486 	\param buffer The buffer into which the normalized path will be written.
4487 		   May be the same one as \a path.
4488 	\param bufferSize The size of \a buffer.
4489 	\param traverseLink If \c true, the function also resolves leaf symlinks.
4490 	\param kernel \c true, if the IO context of the kernel shall be used,
4491 		   otherwise that of the team this thread belongs to. Only relevant,
4492 		   if the path is relative (to get the CWD).
4493 	\return \c B_OK if everything went fine, another error code otherwise.
4494 */
4495 status_t
4496 vfs_normalize_path(const char* path, char* buffer, size_t bufferSize,
4497 	bool traverseLink, bool kernel)
4498 {
4499 	if (!path || !buffer || bufferSize < 1)
4500 		return B_BAD_VALUE;
4501 
4502 	if (path != buffer) {
4503 		if (strlcpy(buffer, path, bufferSize) >= bufferSize)
4504 			return B_BUFFER_OVERFLOW;
4505 	}
4506 
4507 	return normalize_path(buffer, bufferSize, traverseLink, kernel);
4508 }
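

/*	Illustrative sketch (editor addition): normalization may happen in place,
	since \a buffer is allowed to be the same as \a path.
*/
#if 0
char path[B_PATH_NAME_LENGTH] = "/boot/./system//lib/../bin";
if (vfs_normalize_path(path, path, sizeof(path), true, true) == B_OK)
	dprintf("normalized: %s\n", path);
		// "/boot/system/bin", assuming no symlinks along the way
#endif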
4509 
4510 
4511 /*!	\brief Gets the parent of the passed in node.
4512 
4513 	Gets the parent of the passed in node, and correctly resolves covered
4514 	nodes.
4515 */
4516 extern "C" status_t
4517 vfs_resolve_parent(struct vnode* parent, dev_t* device, ino_t* node)
4518 {
4519 	return resolve_covered_parent(parent, device, node,
4520 		get_current_io_context(true));
4521 }
4522 
4523 
4524 /*!	\brief Creates a special node in the file system.
4525 
4526 	The caller gets a reference to the newly created node (which is passed
4527 	back through \a _createdVnode) and is responsible for releasing it.
4528 
4529 	\param path The path where to create the entry for the node. Can be \c NULL,
4530 		in which case the node is created without an entry in the root FS -- it
4531 		will automatically be deleted when the last reference has been released.
4532 	\param subVnode The definition of the subnode. Can be \c NULL, in which case
4533 		the target file system will just create the node with its standard
4534 		operations. Depending on the type of the node a subnode might be created
4535 		automatically, though.
4536 	\param mode The type and permissions for the node to be created.
4537 	\param flags Flags to be passed to the creating FS.
4538 	\param kernel \c true, if called in the kernel context (relevant only if
4539 		\a path is not \c NULL and not absolute).
4540 	\param _superVnode Pointer to a pre-allocated structure to be filled by the
4541 		file system creating the node, with the private data pointer and
4542 		operations for the super node. Can be \c NULL.
4543 	\param _createdVnode Pointer to pre-allocated storage where to store the
4544 		pointer to the newly created node.
4545 	\return \c B_OK, if everything went fine, another error code otherwise.
4546 */
4547 status_t
4548 vfs_create_special_node(const char* path, fs_vnode* subVnode, mode_t mode,
4549 	uint32 flags, bool kernel, fs_vnode* _superVnode,
4550 	struct vnode** _createdVnode)
4551 {
4552 	struct vnode* dirNode;
4553 	char _leaf[B_FILE_NAME_LENGTH];
4554 	char* leaf = NULL;
4555 
4556 	if (path) {
4557 		// We've got a path. Get the dir vnode and the leaf name.
4558 		KPath tmpPathBuffer(B_PATH_NAME_LENGTH + 1);
4559 		if (tmpPathBuffer.InitCheck() != B_OK)
4560 			return B_NO_MEMORY;
4561 
4562 		char* tmpPath = tmpPathBuffer.LockBuffer();
4563 		if (strlcpy(tmpPath, path, B_PATH_NAME_LENGTH) >= B_PATH_NAME_LENGTH)
4564 			return B_NAME_TOO_LONG;
4565 
4566 		// get the dir vnode and the leaf name
4567 		leaf = _leaf;
4568 		status_t error = path_to_dir_vnode(tmpPath, &dirNode, leaf, kernel);
4569 		if (error != B_OK)
4570 			return error;
4571 	} else {
4572 		// No path. Create the node in the root FS.
4573 		dirNode = sRoot;
4574 		inc_vnode_ref_count(dirNode);
4575 	}
4576 
4577 	VNodePutter _(dirNode);
4578 
4579 	// check support for creating special nodes
4580 	if (!HAS_FS_CALL(dirNode, create_special_node))
4581 		return B_UNSUPPORTED;
4582 
4583 	// create the node
4584 	fs_vnode superVnode;
4585 	ino_t nodeID;
4586 	status_t status = FS_CALL(dirNode, create_special_node, leaf, subVnode,
4587 		mode, flags, _superVnode != NULL ? _superVnode : &superVnode, &nodeID);
4588 	if (status != B_OK)
4589 		return status;
4590 
4591 	// lookup the node
4592 	rw_lock_read_lock(&sVnodeLock);
4593 	*_createdVnode = lookup_vnode(dirNode->mount->id, nodeID);
4594 	rw_lock_read_unlock(&sVnodeLock);
4595 
4596 	if (*_createdVnode == NULL) {
4597 		panic("vfs_create_special_node(): lookup of node failed");
4598 		return B_ERROR;
4599 	}
4600 
4601 	return B_OK;
4602 }
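

/*	Illustrative sketch (editor addition): creating an unnamed FIFO-like node
	in the root FS. With path == NULL the node is deleted automatically with
	its last reference (cf. the FIFO implementation).
*/
#if 0
struct vnode* createdVnode;
status_t status = vfs_create_special_node(NULL, NULL, S_IFIFO | 0666, 0,
	true, NULL, &createdVnode);
if (status == B_OK)
	vfs_put_vnode(createdVnode);
		// release the reference once the node is no longer needed
#endif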
4603 
4604 
4605 extern "C" void
4606 vfs_put_vnode(struct vnode* vnode)
4607 {
4608 	put_vnode(vnode);
4609 }
4610 
4611 
4612 extern "C" status_t
4613 vfs_get_cwd(dev_t* _mountID, ino_t* _vnodeID)
4614 {
4615 	// Get current working directory from io context
4616 	struct io_context* context = get_current_io_context(false);
4617 	status_t status = B_OK;
4618 
4619 	mutex_lock(&context->io_mutex);
4620 
4621 	if (context->cwd != NULL) {
4622 		*_mountID = context->cwd->device;
4623 		*_vnodeID = context->cwd->id;
4624 	} else
4625 		status = B_ERROR;
4626 
4627 	mutex_unlock(&context->io_mutex);
4628 	return status;
4629 }
4630 
4631 
4632 status_t
4633 vfs_unmount(dev_t mountID, uint32 flags)
4634 {
4635 	return fs_unmount(NULL, mountID, flags, true);
4636 }
4637 
4638 
4639 extern "C" status_t
4640 vfs_disconnect_vnode(dev_t mountID, ino_t vnodeID)
4641 {
4642 	struct vnode* vnode;
4643 
4644 	status_t status = get_vnode(mountID, vnodeID, &vnode, true, true);
4645 	if (status != B_OK)
4646 		return status;
4647 
4648 	disconnect_mount_or_vnode_fds(vnode->mount, vnode);
4649 	put_vnode(vnode);
4650 	return B_OK;
4651 }
4652 
4653 
4654 extern "C" void
4655 vfs_free_unused_vnodes(int32 level)
4656 {
4657 	vnode_low_resource_handler(NULL,
4658 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
4659 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
4660 		level);
4661 }
4662 
4663 
4664 extern "C" bool
4665 vfs_can_page(struct vnode* vnode, void* cookie)
4666 {
4667 	FUNCTION(("vfs_canpage: vnode %p\n", vnode));
4668 
4669 	if (HAS_FS_CALL(vnode, can_page))
4670 		return FS_CALL(vnode, can_page, cookie);
4671 	return false;
4672 }
4673 
4674 
4675 extern "C" status_t
4676 vfs_read_pages(struct vnode* vnode, void* cookie, off_t pos,
4677 	const generic_io_vec* vecs, size_t count, uint32 flags,
4678 	generic_size_t* _numBytes)
4679 {
4680 	FUNCTION(("vfs_read_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4681 		vecs, pos));
4682 
4683 #if VFS_PAGES_IO_TRACING
4684 	generic_size_t bytesRequested = *_numBytes;
4685 #endif
4686 
4687 	IORequest request;
4688 	status_t status = request.Init(pos, vecs, count, *_numBytes, false, flags);
4689 	if (status == B_OK) {
4690 		status = vfs_vnode_io(vnode, cookie, &request);
4691 		if (status == B_OK)
4692 			status = request.Wait();
4693 		*_numBytes = request.TransferredBytes();
4694 	}
4695 
4696 	TPIO(ReadPages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4697 		status, *_numBytes));
4698 
4699 	return status;
4700 }
4701 
4702 
4703 extern "C" status_t
4704 vfs_write_pages(struct vnode* vnode, void* cookie, off_t pos,
4705 	const generic_io_vec* vecs, size_t count, uint32 flags,
4706 	generic_size_t* _numBytes)
4707 {
4708 	FUNCTION(("vfs_write_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4709 		vecs, pos));
4710 
4711 #if VFS_PAGES_IO_TRACING
4712 	generic_size_t bytesRequested = *_numBytes;
4713 #endif
4714 
4715 	IORequest request;
4716 	status_t status = request.Init(pos, vecs, count, *_numBytes, true, flags);
4717 	if (status == B_OK) {
4718 		status = vfs_vnode_io(vnode, cookie, &request);
4719 		if (status == B_OK)
4720 			status = request.Wait();
4721 		*_numBytes = request.TransferredBytes();
4722 	}
4723 
4724 	TPIO(WritePages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4725 		status, *_numBytes));
4726 
4727 	return status;
4728 }
4729 
4730 
4731 /*!	Gets the vnode's VMCache object. If it didn't have one, it will be
4732 	created if \a allocate is \c true.
4733 	In case it's successful, it will also grab a reference to the cache
4734 	it returns.
4735 */
4736 extern "C" status_t
4737 vfs_get_vnode_cache(struct vnode* vnode, VMCache** _cache, bool allocate)
4738 {
4739 	if (vnode->cache != NULL) {
4740 		vnode->cache->AcquireRef();
4741 		*_cache = vnode->cache;
4742 		return B_OK;
4743 	}
4744 
4745 	rw_lock_read_lock(&sVnodeLock);
4746 	vnode->Lock();
4747 
4748 	status_t status = B_OK;
4749 
4750 	// The cache could have been created in the meantime
4751 	if (vnode->cache == NULL) {
4752 		if (allocate) {
4753 			// TODO: actually the vnode needs to be busy already here, or
4754 			//	else this won't work...
4755 			bool wasBusy = vnode->IsBusy();
4756 			vnode->SetBusy(true);
4757 
4758 			vnode->Unlock();
4759 			rw_lock_read_unlock(&sVnodeLock);
4760 
4761 			status = vm_create_vnode_cache(vnode, &vnode->cache);
4762 
4763 			rw_lock_read_lock(&sVnodeLock);
4764 			vnode->Lock();
4765 			vnode->SetBusy(wasBusy);
4766 		} else
4767 			status = B_BAD_VALUE;
4768 	}
4769 
4770 	vnode->Unlock();
4771 	rw_lock_read_unlock(&sVnodeLock);
4772 
4773 	if (status == B_OK) {
4774 		vnode->cache->AcquireRef();
4775 		*_cache = vnode->cache;
4776 	}
4777 
4778 	return status;
4779 }
4780 
4781 
4782 status_t
4783 vfs_get_file_map(struct vnode* vnode, off_t offset, size_t size,
4784 	file_io_vec* vecs, size_t* _count)
4785 {
4786 	FUNCTION(("vfs_get_file_map: vnode %p, vecs %p, offset %" B_PRIdOFF
4787 		", size = %" B_PRIuSIZE "\n", vnode, vecs, offset, size));
4788 
4789 	return FS_CALL(vnode, get_file_map, offset, size, vecs, _count);
4790 }
4791 
4792 
4793 status_t
4794 vfs_stat_vnode(struct vnode* vnode, struct stat* stat)
4795 {
4796 	status_t status = FS_CALL(vnode, read_stat, stat);
4797 
4798 	// fill in the st_dev and st_ino fields
4799 	if (status == B_OK) {
4800 		stat->st_dev = vnode->device;
4801 		stat->st_ino = vnode->id;
4802 		// the rdev field must stay unset for non-special files
4803 		if (!S_ISBLK(stat->st_mode) && !S_ISCHR(stat->st_mode))
4804 			stat->st_rdev = -1;
4805 	}
4806 
4807 	return status;
4808 }
4809 
4810 
4811 status_t
4812 vfs_stat_node_ref(dev_t device, ino_t inode, struct stat* stat)
4813 {
4814 	struct vnode* vnode;
4815 	status_t status = get_vnode(device, inode, &vnode, true, false);
4816 	if (status != B_OK)
4817 		return status;
4818 
4819 	status = vfs_stat_vnode(vnode, stat);
4820 
4821 	put_vnode(vnode);
4822 	return status;
4823 }
4824 
4825 
4826 status_t
4827 vfs_get_vnode_name(struct vnode* vnode, char* name, size_t nameSize)
4828 {
4829 	return get_vnode_name(vnode, NULL, name, nameSize, true);
4830 }
4831 
4832 
4833 status_t
4834 vfs_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
4835 	bool kernel, char* path, size_t pathLength)
4836 {
4837 	struct vnode* vnode;
4838 	status_t status;
4839 
4840 	// filter invalid leaf names
4841 	if (leaf != NULL && (leaf[0] == '\0' || strchr(leaf, '/')))
4842 		return B_BAD_VALUE;
4843 
4844 	// get the vnode matching the dir's node_ref
4845 	if (leaf && (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0)) {
4846 		// special cases "." and "..": we can directly get the vnode of the
4847 		// referenced directory
4848 		status = entry_ref_to_vnode(device, inode, leaf, false, kernel, &vnode);
4849 		leaf = NULL;
4850 	} else
4851 		status = get_vnode(device, inode, &vnode, true, false);
4852 	if (status != B_OK)
4853 		return status;
4854 
4855 	// get the directory path
4856 	status = dir_vnode_to_path(vnode, path, pathLength, kernel);
4857 	put_vnode(vnode);
4858 		// we don't need the vnode anymore
4859 	if (status != B_OK)
4860 		return status;
4861 
4862 	// append the leaf name
4863 	if (leaf) {
4864 		// insert a directory separator if this is not the file system root
4865 		if ((strcmp(path, "/") && strlcat(path, "/", pathLength)
4866 				>= pathLength)
4867 			|| strlcat(path, leaf, pathLength) >= pathLength) {
4868 			return B_NAME_TOO_LONG;
4869 		}
4870 	}
4871 
4872 	return B_OK;
4873 }
4874 
4875 
4876 /*!	If the given descriptor locked its vnode, that lock will be released. */
4877 void
4878 vfs_unlock_vnode_if_locked(struct file_descriptor* descriptor)
4879 {
4880 	struct vnode* vnode = fd_vnode(descriptor);
4881 
4882 	if (vnode != NULL && vnode->mandatory_locked_by == descriptor)
4883 		vnode->mandatory_locked_by = NULL;
4884 }
4885 
4886 
4887 /*!	Releases any POSIX locks on the file descriptor. */
4888 status_t
4889 vfs_release_posix_lock(io_context* context, struct file_descriptor* descriptor)
4890 {
4891 	struct vnode* vnode = descriptor->u.vnode;
4892 
4893 	if (HAS_FS_CALL(vnode, release_lock))
4894 		return FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
4895 
4896 	return release_advisory_lock(vnode, context, NULL, NULL);
4897 }
4898 
4899 
4900 /*!	Closes all file descriptors of the specified I/O context that
4901 	have the O_CLOEXEC flag set.
4902 */
4903 void
4904 vfs_exec_io_context(io_context* context)
4905 {
4906 	uint32 i;
4907 
4908 	for (i = 0; i < context->table_size; i++) {
4909 		mutex_lock(&context->io_mutex);
4910 
4911 		struct file_descriptor* descriptor = context->fds[i];
4912 		bool remove = false;
4913 
4914 		if (descriptor != NULL && fd_close_on_exec(context, i)) {
4915 			context->fds[i] = NULL;
4916 			context->num_used_fds--;
4917 
4918 			remove = true;
4919 		}
4920 
4921 		mutex_unlock(&context->io_mutex);
4922 
4923 		if (remove) {
4924 			close_fd(context, descriptor);
4925 			put_fd(descriptor);
4926 		}
4927 	}
4928 }
4929 
4930 
4931 /*! Sets up a new io_context structure, and inherits the properties
4932 	of the parent io_context if it is given.
4933 */
4934 io_context*
4935 vfs_new_io_context(io_context* parentContext, bool purgeCloseOnExec)
4936 {
4937 	io_context* context = (io_context*)malloc(sizeof(io_context));
4938 	if (context == NULL)
4939 		return NULL;
4940 
4941 	TIOC(NewIOContext(context, parentContext));
4942 
4943 	memset(context, 0, sizeof(io_context));
4944 	context->ref_count = 1;
4945 
4946 	MutexLocker parentLocker;
4947 
4948 	size_t tableSize;
4949 	if (parentContext != NULL) {
4950 		parentLocker.SetTo(parentContext->io_mutex, false);
4951 		tableSize = parentContext->table_size;
4952 	} else
4953 		tableSize = DEFAULT_FD_TABLE_SIZE;
4954 
4955 	// allocate space for FDs, their select infos and close-on-exec flags
4956 	context->fds = (file_descriptor**)malloc(
4957 		sizeof(struct file_descriptor*) * tableSize
4958 		+ sizeof(struct select_sync*) * tableSize
4959 		+ (tableSize + 7) / 8);
4960 	if (context->fds == NULL) {
4961 		free(context);
4962 		return NULL;
4963 	}
4964 
4965 	context->select_infos = (select_info**)(context->fds + tableSize);
4966 	context->fds_close_on_exec = (uint8*)(context->select_infos + tableSize);
4967 
4968 	memset(context->fds, 0, sizeof(struct file_descriptor*) * tableSize
4969 		+ sizeof(struct select_sync*) * tableSize
4970 		+ (tableSize + 7) / 8);
4971 
4972 	mutex_init(&context->io_mutex, "I/O context");
4973 
4974 	// Copy all parent file descriptors
4975 
4976 	if (parentContext != NULL) {
4977 		size_t i;
4978 
4979 		mutex_lock(&sIOContextRootLock);
4980 		context->root = parentContext->root;
4981 		if (context->root)
4982 			inc_vnode_ref_count(context->root);
4983 		mutex_unlock(&sIOContextRootLock);
4984 
4985 		context->cwd = parentContext->cwd;
4986 		if (context->cwd)
4987 			inc_vnode_ref_count(context->cwd);
4988 
4989 		if (parentContext->inherit_fds) {
4990 			for (i = 0; i < tableSize; i++) {
4991 				struct file_descriptor* descriptor = parentContext->fds[i];
4992 
4993 				if (descriptor != NULL
4994 					&& (descriptor->open_mode & O_DISCONNECTED) == 0) {
4995 					bool closeOnExec = fd_close_on_exec(parentContext, i);
4996 					if (closeOnExec && purgeCloseOnExec)
4997 						continue;
4998 
4999 					TFD(InheritFD(context, i, descriptor, parentContext));
5000 
5001 					context->fds[i] = descriptor;
5002 					context->num_used_fds++;
5003 					atomic_add(&descriptor->ref_count, 1);
5004 					atomic_add(&descriptor->open_count, 1);
5005 
5006 					if (closeOnExec)
5007 						fd_set_close_on_exec(context, i, true);
5008 				}
5009 			}
5010 		}
5011 
5012 		parentLocker.Unlock();
5013 	} else {
5014 		context->root = sRoot;
5015 		context->cwd = sRoot;
5016 
5017 		if (context->root)
5018 			inc_vnode_ref_count(context->root);
5019 
5020 		if (context->cwd)
5021 			inc_vnode_ref_count(context->cwd);
5022 	}
5023 
5024 	context->table_size = tableSize;
5025 	context->inherit_fds = parentContext != NULL;
5026 
5027 	list_init(&context->node_monitors);
5028 	context->max_monitors = DEFAULT_NODE_MONITORS;
5029 
5030 	return context;
5031 }
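

/*	For reference (editor addition), the layout of the single allocation
	backing an io_context's tables (with tableSize == n), as set up above:

		fds:                n * sizeof(file_descriptor*)
		select_infos:       n * sizeof(select_info*)
		fds_close_on_exec:  (n + 7) / 8 bytes -- one bit per FD
*/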
5032 
5033 
5034 void
5035 vfs_get_io_context(io_context* context)
5036 {
5037 	atomic_add(&context->ref_count, 1);
5038 }
5039 
5040 
5041 void
5042 vfs_put_io_context(io_context* context)
5043 {
5044 	if (atomic_add(&context->ref_count, -1) == 1)
5045 		free_io_context(context);
5046 }
5047 
5048 
5049 status_t
5050 vfs_resize_fd_table(struct io_context* context, uint32 newSize)
5051 {
5052 	if (newSize == 0 || newSize > MAX_FD_TABLE_SIZE)
5053 		return B_BAD_VALUE;
5054 
5055 	TIOC(ResizeIOContext(context, newSize));
5056 
5057 	MutexLocker _(context->io_mutex);
5058 
5059 	uint32 oldSize = context->table_size;
5060 	int oldCloseOnExecBitmapSize = (oldSize + 7) / 8;
5061 	int newCloseOnExecBitmapSize = (newSize + 7) / 8;
5062 
5063 	// If the tables shrink, make sure none of the fds being dropped are in use.
5064 	if (newSize < oldSize) {
5065 		for (uint32 i = oldSize; i-- > newSize;) {
5066 			if (context->fds[i])
5067 				return B_BUSY;
5068 		}
5069 	}
5070 
5071 	// store pointers to the old tables
5072 	file_descriptor** oldFDs = context->fds;
5073 	select_info** oldSelectInfos = context->select_infos;
5074 	uint8* oldCloseOnExecTable = context->fds_close_on_exec;
5075 
5076 	// allocate new tables
5077 	file_descriptor** newFDs = (file_descriptor**)malloc(
5078 		sizeof(struct file_descriptor*) * newSize
5079 		+ sizeof(struct select_sync*) * newSize
5080 		+ newCloseOnExecBitmapSize);
5081 	if (newFDs == NULL)
5082 		return B_NO_MEMORY;
5083 
5084 	context->fds = newFDs;
5085 	context->select_infos = (select_info**)(context->fds + newSize);
5086 	context->fds_close_on_exec = (uint8*)(context->select_infos + newSize);
5087 	context->table_size = newSize;
5088 
5089 	// copy entries from old tables
5090 	uint32 toCopy = min_c(oldSize, newSize);
5091 
5092 	memcpy(context->fds, oldFDs, sizeof(void*) * toCopy);
5093 	memcpy(context->select_infos, oldSelectInfos, sizeof(void*) * toCopy);
5094 	memcpy(context->fds_close_on_exec, oldCloseOnExecTable,
5095 		min_c(oldCloseOnExecBitmapSize, newCloseOnExecBitmapSize));
5096 
5097 	// clear additional entries, if the tables grow
5098 	if (newSize > oldSize) {
5099 		memset(context->fds + oldSize, 0, sizeof(void*) * (newSize - oldSize));
5100 		memset(context->select_infos + oldSize, 0,
5101 			sizeof(void*) * (newSize - oldSize));
5102 		memset(context->fds_close_on_exec + oldCloseOnExecBitmapSize, 0,
5103 			newCloseOnExecBitmapSize - oldCloseOnExecBitmapSize);
5104 	}
5105 
5106 	free(oldFDs);
5107 
5108 	return B_OK;
5109 }
5110 
5111 
5112 /*!	\brief Resolves a vnode to the vnode it is covered by, if any.
5113 
5114 	Given an arbitrary vnode (identified by mount and node ID), the function
5115 	checks, whether the vnode is covered by another vnode. If it is, the
5116 	function returns the mount and node ID of the covering vnode. Otherwise
5117 	it simply returns the supplied mount and node ID.
5118 
5119 	In case of error (e.g. the supplied node could not be found) the variables
5120 	for storing the resolved mount and node ID remain untouched and an error
5121 	code is returned.
5122 
5123 	\param mountID The mount ID of the vnode in question.
5124 	\param nodeID The node ID of the vnode in question.
5125 	\param resolvedMountID Pointer to storage for the resolved mount ID.
5126 	\param resolvedNodeID Pointer to storage for the resolved node ID.
5127 	\return
5128 	- \c B_OK, if everything went fine,
5129 	- another error code, if something went wrong.
5130 */
5131 status_t
5132 vfs_resolve_vnode_to_covering_vnode(dev_t mountID, ino_t nodeID,
5133 	dev_t* resolvedMountID, ino_t* resolvedNodeID)
5134 {
5135 	// get the node
5136 	struct vnode* node;
5137 	status_t error = get_vnode(mountID, nodeID, &node, true, false);
5138 	if (error != B_OK)
5139 		return error;
5140 
5141 	// resolve the node
5142 	if (Vnode* coveringNode = get_covering_vnode(node)) {
5143 		put_vnode(node);
5144 		node = coveringNode;
5145 	}
5146 
5147 	// set the return values
5148 	*resolvedMountID = node->device;
5149 	*resolvedNodeID = node->id;
5150 
5151 	put_vnode(node);
5152 
5153 	return B_OK;
5154 }
5155 
5156 
5157 status_t
5158 vfs_get_mount_point(dev_t mountID, dev_t* _mountPointMountID,
5159 	ino_t* _mountPointNodeID)
5160 {
5161 	ReadLocker nodeLocker(sVnodeLock);
5162 	MutexLocker mountLocker(sMountMutex);
5163 
5164 	struct fs_mount* mount = find_mount(mountID);
5165 	if (mount == NULL)
5166 		return B_BAD_VALUE;
5167 
5168 	Vnode* mountPoint = mount->covers_vnode;
5169 
5170 	*_mountPointMountID = mountPoint->device;
5171 	*_mountPointNodeID = mountPoint->id;
5172 
5173 	return B_OK;
5174 }
5175 
5176 
5177 status_t
5178 vfs_bind_mount_directory(dev_t mountID, ino_t nodeID, dev_t coveredMountID,
5179 	ino_t coveredNodeID)
5180 {
5181 	// get the vnodes
5182 	Vnode* vnode;
5183 	status_t error = get_vnode(mountID, nodeID, &vnode, true, false);
5184 	if (error != B_OK)
5185 		return B_BAD_VALUE;
5186 	VNodePutter vnodePutter(vnode);
5187 
5188 	Vnode* coveredVnode;
5189 	error = get_vnode(coveredMountID, coveredNodeID, &coveredVnode, true,
5190 		false);
5191 	if (error != B_OK)
5192 		return B_BAD_VALUE;
5193 	VNodePutter coveredVnodePutter(coveredVnode);
5194 
5195 	// establish the covered/covering links
5196 	WriteLocker locker(sVnodeLock);
5197 
5198 	if (vnode->covers != NULL || coveredVnode->covered_by != NULL
5199 		|| vnode->mount->unmounting || coveredVnode->mount->unmounting) {
5200 		return B_BUSY;
5201 	}
5202 
5203 	vnode->covers = coveredVnode;
5204 	vnode->SetCovering(true);
5205 
5206 	coveredVnode->covered_by = vnode;
5207 	coveredVnode->SetCovered(true);
5208 
5209 	// the vnodes now reference each other
5210 	inc_vnode_ref_count(vnode);
5211 	inc_vnode_ref_count(coveredVnode);
5212 
5213 	return B_OK;
5214 }
5215 
5216 
5217 int
5218 vfs_getrlimit(int resource, struct rlimit* rlp)
5219 {
5220 	if (!rlp)
5221 		return B_BAD_ADDRESS;
5222 
5223 	switch (resource) {
5224 		case RLIMIT_NOFILE:
5225 		{
5226 			struct io_context* context = get_current_io_context(false);
5227 			MutexLocker _(context->io_mutex);
5228 
5229 			rlp->rlim_cur = context->table_size;
5230 			rlp->rlim_max = MAX_FD_TABLE_SIZE;
5231 			return 0;
5232 		}
5233 
5234 		case RLIMIT_NOVMON:
5235 		{
5236 			struct io_context* context = get_current_io_context(false);
5237 			MutexLocker _(context->io_mutex);
5238 
5239 			rlp->rlim_cur = context->max_monitors;
5240 			rlp->rlim_max = MAX_NODE_MONITORS;
5241 			return 0;
5242 		}
5243 
5244 		default:
5245 			return B_BAD_VALUE;
5246 	}
5247 }
5248 
5249 
5250 int
5251 vfs_setrlimit(int resource, const struct rlimit* rlp)
5252 {
5253 	if (!rlp)
5254 		return B_BAD_ADDRESS;
5255 
5256 	switch (resource) {
5257 		case RLIMIT_NOFILE:
5258 			/* TODO: check getuid() */
5259 			if (rlp->rlim_max != RLIM_SAVED_MAX
5260 				&& rlp->rlim_max != MAX_FD_TABLE_SIZE)
5261 				return B_NOT_ALLOWED;
5262 
5263 			return vfs_resize_fd_table(get_current_io_context(false),
5264 				rlp->rlim_cur);
5265 
5266 		case RLIMIT_NOVMON:
5267 			/* TODO: check getuid() */
5268 			if (rlp->rlim_max != RLIM_SAVED_MAX
5269 				&& rlp->rlim_max != MAX_NODE_MONITORS)
5270 				return B_NOT_ALLOWED;
5271 
5272 			return resize_monitor_table(get_current_io_context(false),
5273 				rlp->rlim_cur);
5274 
5275 		default:
5276 			return B_BAD_VALUE;
5277 	}
5278 }
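

/*	Illustrative sketch (editor addition): raising the FD limit of the
	current team to the allowed maximum. vfs_getrlimit() reports
	MAX_FD_TABLE_SIZE as rlim_max, which vfs_setrlimit() accepts, so the
	round trip below is valid.
*/
#if 0
struct rlimit rl;
if (vfs_getrlimit(RLIMIT_NOFILE, &rl) == 0) {
	rl.rlim_cur = rl.rlim_max;
	vfs_setrlimit(RLIMIT_NOFILE, &rl);
		// grows the FD table via vfs_resize_fd_table()
}
#endif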
5279 
5280 
5281 status_t
5282 vfs_init(kernel_args* args)
5283 {
5284 	vnode::StaticInit();
5285 
5286 	sVnodeTable = new(std::nothrow) VnodeTable();
5287 	if (sVnodeTable == NULL || sVnodeTable->Init(VNODE_HASH_TABLE_SIZE) != B_OK)
5288 		panic("vfs_init: error creating vnode hash table\n");
5289 
5290 	struct vnode dummy_vnode;
5291 	list_init_etc(&sUnusedVnodeList, offset_of_member(dummy_vnode, unused_link));
5292 
5293 	struct fs_mount dummyMount;
5294 	sMountsTable = new(std::nothrow) MountTable();
5295 	if (sMountsTable == NULL
5296 			|| sMountsTable->Init(MOUNTS_HASH_TABLE_SIZE) != B_OK)
5297 		panic("vfs_init: error creating mounts hash table\n");
5298 
5299 	node_monitor_init();
5300 
5301 	sRoot = NULL;
5302 
5303 	recursive_lock_init(&sMountOpLock, "vfs_mount_op_lock");
5304 
5305 	if (block_cache_init() != B_OK)
5306 		return B_ERROR;
5307 
5308 #ifdef ADD_DEBUGGER_COMMANDS
5309 	// add some debugger commands
5310 	add_debugger_command_etc("vnode", &dump_vnode,
5311 		"Print info about the specified vnode",
5312 		"[ \"-p\" ] ( <vnode> | <devID> <nodeID> )\n"
5313 		"Prints information about the vnode specified by address <vnode> or\n"
5314 		"<devID>, <vnodeID> pair. If \"-p\" is given, a path of the vnode is\n"
5315 		"constructed and printed. It might not be possible to construct a\n"
5316 		"complete path, though.\n",
5317 		0);
5318 	add_debugger_command("vnodes", &dump_vnodes,
5319 		"list all vnodes (from the specified device)");
5320 	add_debugger_command("vnode_caches", &dump_vnode_caches,
5321 		"list all vnode caches");
5322 	add_debugger_command("mount", &dump_mount,
5323 		"info about the specified fs_mount");
5324 	add_debugger_command("mounts", &dump_mounts, "list all fs_mounts");
5325 	add_debugger_command("io_context", &dump_io_context,
5326 		"info about the I/O context");
5327 	add_debugger_command("vnode_usage", &dump_vnode_usage,
5328 		"info about vnode usage");
5329 #endif
5330 
5331 	register_low_resource_handler(&vnode_low_resource_handler, NULL,
5332 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
5333 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
5334 		0);
5335 
5336 	fifo_init();
5337 	file_map_init();
5338 
5339 	return file_cache_init();
5340 }
5341 
5342 
5343 //	#pragma mark - fd_ops implementations
5344 
5345 
5346 /*!
5347 	Calls fs_open() on the given vnode and returns a new
5348 	file descriptor for it
5349 */
5350 static int
5351 open_vnode(struct vnode* vnode, int openMode, bool kernel)
5352 {
5353 	void* cookie;
5354 	status_t status = FS_CALL(vnode, open, openMode, &cookie);
5355 	if (status != B_OK)
5356 		return status;
5357 
5358 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5359 	if (fd < 0) {
5360 		FS_CALL(vnode, close, cookie);
5361 		FS_CALL(vnode, free_cookie, cookie);
5362 	}
5363 	return fd;
5364 }
5365 
5366 
5367 /*!
5368 	Creates the entry in the given directory -- or, unless O_EXCL is given,
5369 	opens the existing one -- and returns a new file descriptor for it
5370 */
5371 static int
5372 create_vnode(struct vnode* directory, const char* name, int openMode,
5373 	int perms, bool kernel)
5374 {
5375 	bool traverse = ((openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0);
5376 	status_t status = B_ERROR;
5377 	struct vnode* vnode;
5378 	void* cookie;
5379 	ino_t newID;
5380 
5381 	// This is somewhat tricky: If the entry already exists, the FS responsible
5382 	// for the directory might not necessarily also be the one responsible for
5383 	// the node the entry refers to (e.g. in case of mount points or FIFOs). So
5384 	// we can actually never call the create() hook without O_EXCL. Instead we
5385 	// try to look the entry up first. If it already exists, we just open the
5386 	// node (unless O_EXCL), otherwise we call create() with O_EXCL. This
5387 	// introduces a race condition, since someone else might have created the
5388 	// entry in the meantime. We hope the respective FS returns the correct
5389 	// error code, and we retry (up to 3 times) in that case.
5390 
5391 	for (int i = 0; i < 3 && status != B_OK; i++) {
5392 		// look the node up
5393 		status = lookup_dir_entry(directory, name, &vnode);
5394 		if (status == B_OK) {
5395 			VNodePutter putter(vnode);
5396 
5397 			if ((openMode & O_EXCL) != 0)
5398 				return B_FILE_EXISTS;
5399 
5400 			// If the node is a symlink, we have to follow it, unless
5401 			// O_NOTRAVERSE is set.
5402 			if (S_ISLNK(vnode->Type()) && traverse) {
5403 				putter.Put();
5404 				char clonedName[B_FILE_NAME_LENGTH + 1];
5405 				if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH)
5406 						>= B_FILE_NAME_LENGTH) {
5407 					return B_NAME_TOO_LONG;
5408 				}
5409 
5410 				inc_vnode_ref_count(directory);
5411 				status = vnode_path_to_vnode(directory, clonedName, true, 0,
5412 					kernel, &vnode, NULL);
5413 				if (status != B_OK)
5414 					return status;
5415 
5416 				putter.SetTo(vnode);
5417 			}
5418 
5419 			if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5420 				return B_LINK_LIMIT;
5421 
5422 			int fd = open_vnode(vnode, openMode & ~O_CREAT, kernel);
5423 			// on success keep the vnode reference for the FD
5424 			if (fd >= 0)
5425 				putter.Detach();
5426 
5427 			return fd;
5428 		}
5429 
5430 		// it doesn't exist yet -- try to create it
5431 
5432 		if (!HAS_FS_CALL(directory, create))
5433 			return B_READ_ONLY_DEVICE;
5434 
5435 		status = FS_CALL(directory, create, name, openMode | O_EXCL, perms,
5436 			&cookie, &newID);
5437 		if (status != B_OK
5438 			&& ((openMode & O_EXCL) != 0 || status != B_FILE_EXISTS)) {
5439 			return status;
5440 		}
5441 	}
5442 
5443 	if (status != B_OK)
5444 		return status;
5445 
5446 	// the node has been created successfully
5447 
5448 	rw_lock_read_lock(&sVnodeLock);
5449 	vnode = lookup_vnode(directory->device, newID);
5450 	rw_lock_read_unlock(&sVnodeLock);
5451 
5452 	if (vnode == NULL) {
5453 		panic("vfs: fs_create() returned success but there is no vnode, "
5454 			"mount ID %" B_PRIdDEV "!\n", directory->device);
5455 		return B_BAD_VALUE;
5456 	}
5457 
5458 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5459 	if (fd >= 0)
5460 		return fd;
5461 
5462 	status = fd;
5463 
5464 	// something went wrong, clean up
5465 
5466 	FS_CALL(vnode, close, cookie);
5467 	FS_CALL(vnode, free_cookie, cookie);
5468 	put_vnode(vnode);
5469 
5470 	FS_CALL(directory, unlink, name);
5471 
5472 	return status;
5473 }
5474 
5475 
5476 /*! Calls fs open_dir() on the given vnode and returns a new
5477 	file descriptor for it
5478 */
5479 static int
5480 open_dir_vnode(struct vnode* vnode, bool kernel)
5481 {
5482 	void* cookie;
5483 	status_t status = FS_CALL(vnode, open_dir, &cookie);
5484 	if (status != B_OK)
5485 		return status;
5486 
5487 	// directory is opened, create a fd
5488 	status = get_new_fd(FDTYPE_DIR, NULL, vnode, cookie, O_CLOEXEC, kernel);
5489 	if (status >= 0)
5490 		return status;
5491 
5492 	FS_CALL(vnode, close_dir, cookie);
5493 	FS_CALL(vnode, free_dir_cookie, cookie);
5494 
5495 	return status;
5496 }
5497 
5498 
5499 /*! Calls fs open_attr_dir() on the given vnode and returns a new
5500 	file descriptor for it.
5501 	Used by attr_dir_open(), and attr_dir_open_fd().
5502 */
5503 static int
5504 open_attr_dir_vnode(struct vnode* vnode, bool kernel)
5505 {
5506 	if (!HAS_FS_CALL(vnode, open_attr_dir))
5507 		return B_UNSUPPORTED;
5508 
5509 	void* cookie;
5510 	status_t status = FS_CALL(vnode, open_attr_dir, &cookie);
5511 	if (status != B_OK)
5512 		return status;
5513 
5514 	// directory is opened, create a fd
5515 	status = get_new_fd(FDTYPE_ATTR_DIR, NULL, vnode, cookie, O_CLOEXEC,
5516 		kernel);
5517 	if (status >= 0)
5518 		return status;
5519 
5520 	FS_CALL(vnode, close_attr_dir, cookie);
5521 	FS_CALL(vnode, free_attr_dir_cookie, cookie);
5522 
5523 	return status;
5524 }
5525 
5526 
5527 static int
5528 file_create_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5529 	int openMode, int perms, bool kernel)
5530 {
5531 	FUNCTION(("file_create_entry_ref: name = '%s', omode %x, perms %d, "
5532 		"kernel %d\n", name, openMode, perms, kernel));
5533 
5534 	// get directory to put the new file in
5535 	struct vnode* directory;
5536 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
5537 	if (status != B_OK)
5538 		return status;
5539 
5540 	status = create_vnode(directory, name, openMode, perms, kernel);
5541 	put_vnode(directory);
5542 
5543 	return status;
5544 }
5545 
5546 
5547 static int
5548 file_create(int fd, char* path, int openMode, int perms, bool kernel)
5549 {
5550 	FUNCTION(("file_create: path '%s', omode %x, perms %d, kernel %d\n", path,
5551 		openMode, perms, kernel));
5552 
5553 	// get directory to put the new file in
5554 	char name[B_FILE_NAME_LENGTH];
5555 	struct vnode* directory;
5556 	status_t status = fd_and_path_to_dir_vnode(fd, path, &directory, name,
5557 		kernel);
5558 	if (status < 0)
5559 		return status;
5560 
5561 	status = create_vnode(directory, name, openMode, perms, kernel);
5562 
5563 	put_vnode(directory);
5564 	return status;
5565 }
5566 
5567 
5568 static int
5569 file_open_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5570 	int openMode, bool kernel)
5571 {
5572 	if (name == NULL || *name == '\0')
5573 		return B_BAD_VALUE;
5574 
5575 	FUNCTION(("file_open_entry_ref(ref = (%" B_PRId32 ", %" B_PRId64 ", %s), "
5576 		"openMode = %d)\n", mountID, directoryID, name, openMode));
5577 
5578 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5579 
5580 	// get the vnode matching the entry_ref
5581 	struct vnode* vnode;
5582 	status_t status = entry_ref_to_vnode(mountID, directoryID, name, traverse,
5583 		kernel, &vnode);
5584 	if (status != B_OK)
5585 		return status;
5586 
5587 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5588 		put_vnode(vnode);
5589 		return B_LINK_LIMIT;
5590 	}
5591 
5592 	int newFD = open_vnode(vnode, openMode, kernel);
5593 	if (newFD >= 0) {
5594 		// The vnode reference has been transferred to the FD
5595 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache, mountID,
5596 			directoryID, vnode->id, name);
5597 	} else
5598 		put_vnode(vnode);
5599 
5600 	return newFD;
5601 }
5602 
5603 
5604 static int
5605 file_open(int fd, char* path, int openMode, bool kernel)
5606 {
5607 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5608 
5609 	FUNCTION(("file_open: fd: %d, entry path = '%s', omode %d, kernel %d\n",
5610 		fd, path, openMode, kernel));
5611 
5612 	// get the vnode matching the vnode + path combination
5613 	struct vnode* vnode;
5614 	ino_t parentID;
5615 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode,
5616 		&parentID, kernel);
5617 	if (status != B_OK)
5618 		return status;
5619 
5620 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5621 		put_vnode(vnode);
5622 		return B_LINK_LIMIT;
5623 	}
5624 
5625 	// open the vnode
5626 	int newFD = open_vnode(vnode, openMode, kernel);
5627 	if (newFD >= 0) {
5628 		// The vnode reference has been transferred to the FD
5629 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache,
5630 			vnode->device, parentID, vnode->id, NULL);
5631 	} else
5632 		put_vnode(vnode);
5633 
5634 	return newFD;
5635 }
5636 
5637 
5638 static status_t
5639 file_close(struct file_descriptor* descriptor)
5640 {
5641 	struct vnode* vnode = descriptor->u.vnode;
5642 	status_t status = B_OK;
5643 
5644 	FUNCTION(("file_close(descriptor = %p)\n", descriptor));
5645 
5646 	cache_node_closed(vnode, FDTYPE_FILE, vnode->cache, vnode->device,
5647 		vnode->id);
5648 	if (HAS_FS_CALL(vnode, close)) {
5649 		status = FS_CALL(vnode, close, descriptor->cookie);
5650 	}
5651 
5652 	if (status == B_OK) {
5653 		// remove all outstanding locks for this team
5654 		if (HAS_FS_CALL(vnode, release_lock))
5655 			status = FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
5656 		else
5657 			status = release_advisory_lock(vnode, NULL, descriptor, NULL);
5658 	}
5659 	return status;
5660 }
5661 
5662 
5663 static void
5664 file_free_fd(struct file_descriptor* descriptor)
5665 {
5666 	struct vnode* vnode = descriptor->u.vnode;
5667 
5668 	if (vnode != NULL) {
5669 		FS_CALL(vnode, free_cookie, descriptor->cookie);
5670 		put_vnode(vnode);
5671 	}
5672 }
5673 
5674 
5675 static status_t
5676 file_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
5677 	size_t* length)
5678 {
5679 	struct vnode* vnode = descriptor->u.vnode;
5680 	FUNCTION(("file_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
5681 		pos, length, *length));
5682 
5683 	if (S_ISDIR(vnode->Type()))
5684 		return B_IS_A_DIRECTORY;
5685 
5686 	return FS_CALL(vnode, read, descriptor->cookie, pos, buffer, length);
5687 }
5688 
5689 
5690 static status_t
5691 file_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
5692 	size_t* length)
5693 {
5694 	struct vnode* vnode = descriptor->u.vnode;
5695 	FUNCTION(("file_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
5696 		length));
5697 
5698 	if (S_ISDIR(vnode->Type()))
5699 		return B_IS_A_DIRECTORY;
5700 	if (!HAS_FS_CALL(vnode, write))
5701 		return B_READ_ONLY_DEVICE;
5702 
5703 	return FS_CALL(vnode, write, descriptor->cookie, pos, buffer, length);
5704 }
5705 
5706 
5707 static off_t
5708 file_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
5709 {
5710 	struct vnode* vnode = descriptor->u.vnode;
5711 	off_t offset;
5712 	bool isDevice = false;
5713 
5714 	FUNCTION(("file_seek(pos = %" B_PRIdOFF ", seekType = %d)\n", pos,
5715 		seekType));
5716 
5717 	// some kinds of files are not seekable
5718 	switch (vnode->Type() & S_IFMT) {
5719 		case S_IFIFO:
5720 		case S_IFSOCK:
5721 			return ESPIPE;
5722 
5723 		// drivers publish block devices as character devices, so handle both
5724 		case S_IFBLK:
5725 		case S_IFCHR:
5726 			isDevice = true;
5727 			break;
5728 		// The Open Group Base Specs don't single out any file types besides
5729 		// pipes, FIFOs, and sockets, so we allow seeking all other types.
5730 		case S_IFREG:
5731 		case S_IFDIR:
5732 		case S_IFLNK:
5733 			break;
5734 	}
5735 
5736 	switch (seekType) {
5737 		case SEEK_SET:
5738 			offset = 0;
5739 			break;
5740 		case SEEK_CUR:
5741 			offset = descriptor->pos;
5742 			break;
5743 		case SEEK_END:
5744 		{
5745 			// stat() the node
5746 			if (!HAS_FS_CALL(vnode, read_stat))
5747 				return B_UNSUPPORTED;
5748 
5749 			struct stat stat;
5750 			status_t status = FS_CALL(vnode, read_stat, &stat);
5751 			if (status != B_OK)
5752 				return status;
5753 
5754 			offset = stat.st_size;
5755 
5756 			if (offset == 0 && isDevice) {
5757 				// stat() on regular drivers doesn't report size
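				// Derive it from the device geometry instead; e.g. 512-byte
				// sectors, 63 sectors per track, 1024 cylinders, and 16 heads
				// give 512 * 63 * 1024 * 16 = 528482304 bytes.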
5758 				device_geometry geometry;
5759 
5760 				if (HAS_FS_CALL(vnode, ioctl)) {
5761 					status = FS_CALL(vnode, ioctl, descriptor->cookie,
5762 						B_GET_GEOMETRY, &geometry, sizeof(geometry));
5763 					if (status == B_OK)
5764 						offset = (off_t)geometry.bytes_per_sector
5765 							* geometry.sectors_per_track
5766 							* geometry.cylinder_count
5767 							* geometry.head_count;
5768 				}
5769 			}
5770 
5771 			break;
5772 		}
5773 		default:
5774 			return B_BAD_VALUE;
5775 	}
5776 
5777 	// assumes off_t is 64 bits wide
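	// e.g. for pos == 1 and offset == LONGLONG_MAX the sum would wrap to a
	// negative value, so reject such positions up front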
5778 	if (offset > 0 && LONGLONG_MAX - offset < pos)
5779 		return B_BUFFER_OVERFLOW;
5780 
5781 	pos += offset;
5782 	if (pos < 0)
5783 		return B_BAD_VALUE;
5784 
5785 	return descriptor->pos = pos;
5786 }
5787 
5788 
5789 static status_t
5790 file_select(struct file_descriptor* descriptor, uint8 event,
5791 	struct selectsync* sync)
5792 {
5793 	FUNCTION(("file_select(%p, %u, %p)\n", descriptor, event, sync));
5794 
5795 	struct vnode* vnode = descriptor->u.vnode;
5796 
5797 	// If the FS has no select() hook, notify select() now.
5798 	if (!HAS_FS_CALL(vnode, select)) {
5799 		if (!SELECT_TYPE_IS_OUTPUT_ONLY(event))
5800 			return notify_select_event(sync, event);
5801 		else
5802 			return B_OK;
5803 	}
5804 
5805 	return FS_CALL(vnode, select, descriptor->cookie, event, sync);
5806 }
5807 
5808 
5809 static status_t
5810 file_deselect(struct file_descriptor* descriptor, uint8 event,
5811 	struct selectsync* sync)
5812 {
5813 	struct vnode* vnode = descriptor->u.vnode;
5814 
5815 	if (!HAS_FS_CALL(vnode, deselect))
5816 		return B_OK;
5817 
5818 	return FS_CALL(vnode, deselect, descriptor->cookie, event, sync);
5819 }
5820 
5821 
5822 static status_t
5823 dir_create_entry_ref(dev_t mountID, ino_t parentID, const char* name, int perms,
5824 	bool kernel)
5825 {
5826 	struct vnode* vnode;
5827 	status_t status;
5828 
5829 	if (name == NULL || *name == '\0')
5830 		return B_BAD_VALUE;
5831 
5832 	FUNCTION(("dir_create_entry_ref(dev = %" B_PRId32 ", ino = %" B_PRId64 ", "
5833 		"name = '%s', perms = %d)\n", mountID, parentID, name, perms));
5834 
5835 	status = get_vnode(mountID, parentID, &vnode, true, false);
5836 	if (status != B_OK)
5837 		return status;
5838 
5839 	if (HAS_FS_CALL(vnode, create_dir))
5840 		status = FS_CALL(vnode, create_dir, name, perms);
5841 	else
5842 		status = B_READ_ONLY_DEVICE;
5843 
5844 	put_vnode(vnode);
5845 	return status;
5846 }
5847 
5848 
5849 static status_t
5850 dir_create(int fd, char* path, int perms, bool kernel)
5851 {
5852 	char filename[B_FILE_NAME_LENGTH];
5853 	struct vnode* vnode;
5854 	status_t status;
5855 
5856 	FUNCTION(("dir_create: path '%s', perms %d, kernel %d\n", path, perms,
5857 		kernel));
5858 
5859 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
5860 	if (status < 0)
5861 		return status;
5862 
5863 	if (HAS_FS_CALL(vnode, create_dir)) {
5864 		status = FS_CALL(vnode, create_dir, filename, perms);
5865 	} else
5866 		status = B_READ_ONLY_DEVICE;
5867 
5868 	put_vnode(vnode);
5869 	return status;
5870 }
5871 
5872 
5873 static int
5874 dir_open_entry_ref(dev_t mountID, ino_t parentID, const char* name, bool kernel)
5875 {
5876 	FUNCTION(("dir_open_entry_ref()\n"));
5877 
5878 	if (name && name[0] == '\0')
5879 		return B_BAD_VALUE;
5880 
5881 	// get the vnode matching the entry_ref/node_ref
5882 	struct vnode* vnode;
5883 	status_t status;
5884 	if (name) {
5885 		status = entry_ref_to_vnode(mountID, parentID, name, true, kernel,
5886 			&vnode);
5887 	} else
5888 		status = get_vnode(mountID, parentID, &vnode, true, false);
5889 	if (status != B_OK)
5890 		return status;
5891 
5892 	int newFD = open_dir_vnode(vnode, kernel);
5893 	if (newFD >= 0) {
5894 		// The vnode reference has been transferred to the FD
5895 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, mountID, parentID,
5896 			vnode->id, name);
5897 	} else
5898 		put_vnode(vnode);
5899 
5900 	return newFD;
5901 }
5902 
5903 
5904 static int
5905 dir_open(int fd, char* path, bool kernel)
5906 {
5907 	FUNCTION(("dir_open: fd: %d, entry path = '%s', kernel %d\n", fd, path,
5908 		kernel));
5909 
5910 	// get the vnode matching the vnode + path combination
5911 	struct vnode* vnode = NULL;
5912 	ino_t parentID;
5913 	status_t status = fd_and_path_to_vnode(fd, path, true, &vnode, &parentID,
5914 		kernel);
5915 	if (status != B_OK)
5916 		return status;
5917 
5918 	// open the dir
5919 	int newFD = open_dir_vnode(vnode, kernel);
5920 	if (newFD >= 0) {
5921 		// The vnode reference has been transferred to the FD
5922 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
5923 			parentID, vnode->id, NULL);
5924 	} else
5925 		put_vnode(vnode);
5926 
5927 	return newFD;
5928 }
5929 
5930 
5931 static status_t
5932 dir_close(struct file_descriptor* descriptor)
5933 {
5934 	struct vnode* vnode = descriptor->u.vnode;
5935 
5936 	FUNCTION(("dir_close(descriptor = %p)\n", descriptor));
5937 
5938 	cache_node_closed(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
5939 		vnode->id);
5940 	if (HAS_FS_CALL(vnode, close_dir))
5941 		return FS_CALL(vnode, close_dir, descriptor->cookie);
5942 
5943 	return B_OK;
5944 }
5945 
5946 
5947 static void
5948 dir_free_fd(struct file_descriptor* descriptor)
5949 {
5950 	struct vnode* vnode = descriptor->u.vnode;
5951 
5952 	if (vnode != NULL) {
5953 		FS_CALL(vnode, free_dir_cookie, descriptor->cookie);
5954 		put_vnode(vnode);
5955 	}
5956 }
5957 
5958 
5959 static status_t
5960 dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
5961 	struct dirent* buffer, size_t bufferSize, uint32* _count)
5962 {
5963 	return dir_read(ioContext, descriptor->u.vnode, descriptor->cookie, buffer,
5964 		bufferSize, _count);
5965 }
5966 
5967 
5968 static status_t
5969 fix_dirent(struct vnode* parent, struct dirent* entry,
5970 	struct io_context* ioContext)
5971 {
5972 	// set d_pdev and d_pino
5973 	entry->d_pdev = parent->device;
5974 	entry->d_pino = parent->id;
5975 
5976 	// If this is the ".." entry and the directory is covering another vnode,
5977 	// we need to replace d_dev and d_ino with the actual values.
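	// (E.g. ".." read from the root directory of a mounted volume should
	// refer to the parent of the directory the volume is mounted on, not to
	// the volume's own root.)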
5978 	if (strcmp(entry->d_name, "..") == 0 && parent->IsCovering()) {
5979 		return resolve_covered_parent(parent, &entry->d_dev, &entry->d_ino,
5980 			ioContext);
5981 	}
5982 
5983 	// resolve covered vnodes
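	// If an entry refers to a vnode that is covered by a mount, report the
	// device/inode of the topmost covering vnode (the mounted volume's
	// root), so the entry matches what a path lookup would return.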
5984 	ReadLocker _(&sVnodeLock);
5985 
5986 	struct vnode* vnode = lookup_vnode(entry->d_dev, entry->d_ino);
5987 	if (vnode != NULL && vnode->covered_by != NULL) {
5988 		do {
5989 			vnode = vnode->covered_by;
5990 		} while (vnode->covered_by != NULL);
5991 
5992 		entry->d_dev = vnode->device;
5993 		entry->d_ino = vnode->id;
5994 	}
5995 
5996 	return B_OK;
5997 }
5998 
5999 
6000 static status_t
6001 dir_read(struct io_context* ioContext, struct vnode* vnode, void* cookie,
6002 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6003 {
6004 	if (!HAS_FS_CALL(vnode, read_dir))
6005 		return B_UNSUPPORTED;
6006 
6007 	status_t error = FS_CALL(vnode, read_dir, cookie, buffer, bufferSize,
6008 		_count);
6009 	if (error != B_OK)
6010 		return error;
6011 
6012 	// we need to adjust the read dirents
6013 	uint32 count = *_count;
6014 	for (uint32 i = 0; i < count; i++) {
6015 		error = fix_dirent(vnode, buffer, ioContext);
6016 		if (error != B_OK)
6017 			return error;
6018 
6019 		buffer = (struct dirent*)((uint8*)buffer + buffer->d_reclen);
6020 	}
6021 
6022 	return error;
6023 }
6024 
6025 
6026 static status_t
6027 dir_rewind(struct file_descriptor* descriptor)
6028 {
6029 	struct vnode* vnode = descriptor->u.vnode;
6030 
6031 	if (HAS_FS_CALL(vnode, rewind_dir)) {
6032 		return FS_CALL(vnode, rewind_dir, descriptor->cookie);
6033 	}
6034 
6035 	return B_UNSUPPORTED;
6036 }
6037 
6038 
6039 static status_t
6040 dir_remove(int fd, char* path, bool kernel)
6041 {
6042 	char name[B_FILE_NAME_LENGTH];
6043 	struct vnode* directory;
6044 	status_t status;
6045 
6046 	if (path != NULL) {
6047 		// we need to make sure our path name doesn't end with "/", ".",
6048 		// or ".."
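		// (e.g. "a/b/" and "a/b/." both reduce to "a/b", while removing
		// "a/b/.." is refused with B_NOT_ALLOWED)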
6049 		char* lastSlash;
6050 		while ((lastSlash = strrchr(path, '/')) != NULL) {
6051 			char* leaf = lastSlash + 1;
6052 			if (!strcmp(leaf, ".."))
6053 				return B_NOT_ALLOWED;
6054 
6055 			// skip over consecutive slashes
6056 			while (lastSlash > path && lastSlash[-1] == '/')
6057 				lastSlash--;
6058 
6059 			if (leaf[0] != '\0'
6060 				&& strcmp(leaf, ".") != 0) {
6061 				break;
6062 			}
6063 			// "name/" -> "name", or "name/." -> "name"
6064 			lastSlash[0] = '\0';
6065 		}
6066 
6067 		if (!strcmp(path, ".") || !strcmp(path, ".."))
6068 			return B_NOT_ALLOWED;
6069 	}
6070 
6071 	status = fd_and_path_to_dir_vnode(fd, path, &directory, name, kernel);
6072 	if (status != B_OK)
6073 		return status;
6074 
6075 	if (HAS_FS_CALL(directory, remove_dir))
6076 		status = FS_CALL(directory, remove_dir, name);
6077 	else
6078 		status = B_READ_ONLY_DEVICE;
6079 
6080 	put_vnode(directory);
6081 	return status;
6082 }
6083 
6084 
6085 static status_t
6086 common_ioctl(struct file_descriptor* descriptor, ulong op, void* buffer,
6087 	size_t length)
6088 {
6089 	struct vnode* vnode = descriptor->u.vnode;
6090 
6091 	if (HAS_FS_CALL(vnode, ioctl))
6092 		return FS_CALL(vnode, ioctl, descriptor->cookie, op, buffer, length);
6093 
6094 	return B_DEV_INVALID_IOCTL;
6095 }
6096 
6097 
6098 static status_t
6099 common_fcntl(int fd, int op, size_t argument, bool kernel)
6100 {
6101 	struct flock flock;
6102 
6103 	FUNCTION(("common_fcntl(fd = %d, op = %d, argument = %lx, %s)\n",
6104 		fd, op, argument, kernel ? "kernel" : "user"));
6105 
6106 	struct io_context* context = get_current_io_context(kernel);
6107 
6108 	struct file_descriptor* descriptor = get_fd(context, fd);
6109 	if (descriptor == NULL)
6110 		return B_FILE_ERROR;
6111 
6112 	struct vnode* vnode = fd_vnode(descriptor);
6113 
6114 	status_t status = B_OK;
6115 
6116 	if (op == F_SETLK || op == F_SETLKW || op == F_GETLK) {
6117 		if (descriptor->type != FDTYPE_FILE)
6118 			status = B_BAD_VALUE;
6119 		else if (kernel)
6120 			memcpy(&flock, (struct flock*)argument, sizeof(struct flock));
6121 		else if (user_memcpy(&flock, (struct flock*)argument,
6122 				sizeof(struct flock)) != B_OK)
6123 			status = B_BAD_ADDRESS;
6124 		if (status != B_OK) {
6125 			put_fd(descriptor);
6126 			return status;
6127 		}
6128 	}
6129 
6130 	switch (op) {
6131 		case F_SETFD:
6132 		{
6133 			// Set file descriptor flags
6134 
6135 			// O_CLOEXEC is the only flag available at this time
6136 			mutex_lock(&context->io_mutex);
6137 			fd_set_close_on_exec(context, fd, (argument & FD_CLOEXEC) != 0);
6138 			mutex_unlock(&context->io_mutex);
6139 
6140 			status = B_OK;
6141 			break;
6142 		}
6143 
6144 		case F_GETFD:
6145 		{
6146 			// Get file descriptor flags
6147 			mutex_lock(&context->io_mutex);
6148 			status = fd_close_on_exec(context, fd) ? FD_CLOEXEC : 0;
6149 			mutex_unlock(&context->io_mutex);
6150 			break;
6151 		}
6152 
6153 		case F_SETFL:
6154 			// Set file descriptor open mode
6155 
6156 			// we only accept changes to O_APPEND and O_NONBLOCK
6157 			argument &= O_APPEND | O_NONBLOCK;
6158 			if (descriptor->ops->fd_set_flags != NULL) {
6159 				status = descriptor->ops->fd_set_flags(descriptor, argument);
6160 			} else if (vnode != NULL && HAS_FS_CALL(vnode, set_flags)) {
6161 				status = FS_CALL(vnode, set_flags, descriptor->cookie,
6162 					(int)argument);
6163 			} else
6164 				status = B_UNSUPPORTED;
6165 
6166 			if (status == B_OK) {
6167 				// update this descriptor's open_mode field
6168 				descriptor->open_mode = (descriptor->open_mode
6169 					& ~(O_APPEND | O_NONBLOCK)) | argument;
6170 			}
6171 
6172 			break;
6173 
6174 		case F_GETFL:
6175 			// Get file descriptor open mode
6176 			status = descriptor->open_mode;
6177 			break;
6178 
6179 		case F_DUPFD:
6180 		case F_DUPFD_CLOEXEC:
6181 		{
6182 			status = new_fd_etc(context, descriptor, (int)argument);
6183 			if (status >= 0) {
6184 				mutex_lock(&context->io_mutex);
6185 				fd_set_close_on_exec(context, fd, op == F_DUPFD_CLOEXEC);
6186 				mutex_unlock(&context->io_mutex);
6187 
6188 				atomic_add(&descriptor->ref_count, 1);
6189 			}
6190 			break;
6191 		}
6192 
6193 		case F_GETLK:
6194 			if (vnode != NULL) {
6195 				struct flock normalizedLock;
6196 
6197 				memcpy(&normalizedLock, &flock, sizeof(struct flock));
6198 				status = normalize_flock(descriptor, &normalizedLock);
6199 				if (status != B_OK)
6200 					break;
6201 
6202 				if (HAS_FS_CALL(vnode, test_lock)) {
6203 					status = FS_CALL(vnode, test_lock, descriptor->cookie,
6204 						&normalizedLock);
6205 				} else
6206 					status = test_advisory_lock(vnode, &normalizedLock);
6207 				if (status == B_OK) {
6208 					if (normalizedLock.l_type == F_UNLCK) {
6209 						// no conflicting lock found, copy back the same struct
6210 						// we were given except change type to F_UNLCK
6211 						flock.l_type = F_UNLCK;
6212 						if (kernel) {
6213 							memcpy((struct flock*)argument, &flock,
6214 								sizeof(struct flock));
6215 						} else {
6216 							status = user_memcpy((struct flock*)argument,
6217 								&flock, sizeof(struct flock));
6218 						}
6219 					} else {
6220 						// a conflicting lock was found, copy back its range and
6221 						// type
6222 						if (normalizedLock.l_len == OFF_MAX)
6223 							normalizedLock.l_len = 0;
6224 
6225 						if (kernel) {
6226 							memcpy((struct flock*)argument,
6227 								&normalizedLock, sizeof(struct flock));
6228 						} else {
6229 							status = user_memcpy((struct flock*)argument,
6230 								&normalizedLock, sizeof(struct flock));
6231 						}
6232 					}
6233 				}
6234 			} else
6235 				status = B_BAD_VALUE;
6236 			break;
6237 
6238 		case F_SETLK:
6239 		case F_SETLKW:
6240 			status = normalize_flock(descriptor, &flock);
6241 			if (status != B_OK)
6242 				break;
6243 
6244 			if (vnode == NULL) {
6245 				status = B_BAD_VALUE;
6246 			} else if (flock.l_type == F_UNLCK) {
6247 				if (HAS_FS_CALL(vnode, release_lock)) {
6248 					status = FS_CALL(vnode, release_lock, descriptor->cookie,
6249 						&flock);
6250 				} else {
6251 					status = release_advisory_lock(vnode, context, NULL,
6252 						&flock);
6253 				}
6254 			} else {
6255 				// the open mode must match the lock type
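				// (a shared/read lock requires a readable descriptor, an
				// exclusive/write lock a writable one)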
6256 				if (((descriptor->open_mode & O_RWMASK) == O_RDONLY
6257 						&& flock.l_type == F_WRLCK)
6258 					|| ((descriptor->open_mode & O_RWMASK) == O_WRONLY
6259 						&& flock.l_type == F_RDLCK))
6260 					status = B_FILE_ERROR;
6261 				else {
6262 					if (HAS_FS_CALL(vnode, acquire_lock)) {
6263 						status = FS_CALL(vnode, acquire_lock,
6264 							descriptor->cookie, &flock, op == F_SETLKW);
6265 					} else {
6266 						status = acquire_advisory_lock(vnode, context, NULL,
6267 							&flock, op == F_SETLKW);
6268 					}
6269 				}
6270 			}
6271 			break;
6272 
6273 		// ToDo: add support for more ops?
6274 
6275 		default:
6276 			status = B_BAD_VALUE;
6277 	}
6278 
6279 	put_fd(descriptor);
6280 	return status;
6281 }
6282 
6283 
6284 static status_t
6285 common_sync(int fd, bool kernel)
6286 {
6287 	struct file_descriptor* descriptor;
6288 	struct vnode* vnode;
6289 	status_t status;
6290 
6291 	FUNCTION(("common_fsync: entry. fd %d kernel %d\n", fd, kernel));
6292 
6293 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6294 	if (descriptor == NULL)
6295 		return B_FILE_ERROR;
6296 
6297 	if (HAS_FS_CALL(vnode, fsync))
6298 		status = FS_CALL_NO_PARAMS(vnode, fsync);
6299 	else
6300 		status = B_UNSUPPORTED;
6301 
6302 	put_fd(descriptor);
6303 	return status;
6304 }
6305 
6306 
6307 static status_t
6308 common_lock_node(int fd, bool kernel)
6309 {
6310 	struct file_descriptor* descriptor;
6311 	struct vnode* vnode;
6312 
6313 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6314 	if (descriptor == NULL)
6315 		return B_FILE_ERROR;
6316 
6317 	status_t status = B_OK;
6318 
6319 	// We need to set the lock holder atomically - someone else might set
6320 	// one at the same time
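	// (atomic_pointer_test_and_set() stores the new value only if the
	// current one still equals the expected value - NULL here - and returns
	// the previous value, so a non-NULL result means another descriptor
	// already holds the lock)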
6321 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by, descriptor,
6322 			(file_descriptor*)NULL) != NULL)
6323 		status = B_BUSY;
6324 
6325 	put_fd(descriptor);
6326 	return status;
6327 }
6328 
6329 
6330 static status_t
6331 common_unlock_node(int fd, bool kernel)
6332 {
6333 	struct file_descriptor* descriptor;
6334 	struct vnode* vnode;
6335 
6336 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6337 	if (descriptor == NULL)
6338 		return B_FILE_ERROR;
6339 
6340 	status_t status = B_OK;
6341 
6342 	// We need to clear the lock holder atomically - someone else might
6343 	// set or clear it at the same time
6344 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
6345 			(file_descriptor*)NULL, descriptor) != descriptor)
6346 		status = B_BAD_VALUE;
6347 
6348 	put_fd(descriptor);
6349 	return status;
6350 }
6351 
6352 
6353 static status_t
6354 common_read_link(int fd, char* path, char* buffer, size_t* _bufferSize,
6355 	bool kernel)
6356 {
6357 	struct vnode* vnode;
6358 	status_t status;
6359 
6360 	status = fd_and_path_to_vnode(fd, path, false, &vnode, NULL, kernel);
6361 	if (status != B_OK)
6362 		return status;
6363 
6364 	if (HAS_FS_CALL(vnode, read_symlink)) {
6365 		status = FS_CALL(vnode, read_symlink, buffer, _bufferSize);
6366 	} else
6367 		status = B_BAD_VALUE;
6368 
6369 	put_vnode(vnode);
6370 	return status;
6371 }
6372 
6373 
6374 static status_t
6375 common_create_symlink(int fd, char* path, const char* toPath, int mode,
6376 	bool kernel)
6377 {
6378 	// path validity checks have to be in the calling function!
6379 	char name[B_FILE_NAME_LENGTH];
6380 	struct vnode* vnode;
6381 	status_t status;
6382 
6383 	FUNCTION(("common_create_symlink(fd = %d, path = %s, toPath = %s, "
6384 		"mode = %d, kernel = %d)\n", fd, path, toPath, mode, kernel));
6385 
6386 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, name, kernel);
6387 	if (status != B_OK)
6388 		return status;
6389 
6390 	if (HAS_FS_CALL(vnode, create_symlink))
6391 		status = FS_CALL(vnode, create_symlink, name, toPath, mode);
6392 	else {
6393 		status = HAS_FS_CALL(vnode, write)
6394 			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6395 	}
6396 
6397 	put_vnode(vnode);
6398 
6399 	return status;
6400 }
6401 
6402 
6403 static status_t
6404 common_create_link(int pathFD, char* path, int toFD, char* toPath,
6405 	bool traverseLeafLink, bool kernel)
6406 {
6407 	// path validity checks have to be in the calling function!
6408 
6409 	FUNCTION(("common_create_link(path = %s, toPath = %s, kernel = %d)\n", path,
6410 		toPath, kernel));
6411 
6412 	char name[B_FILE_NAME_LENGTH];
6413 	struct vnode* directory;
6414 	status_t status = fd_and_path_to_dir_vnode(pathFD, path, &directory, name,
6415 		kernel);
6416 	if (status != B_OK)
6417 		return status;
6418 
6419 	struct vnode* vnode;
6420 	status = fd_and_path_to_vnode(toFD, toPath, traverseLeafLink, &vnode, NULL,
6421 		kernel);
6422 	if (status != B_OK)
6423 		goto err;
6424 
6425 	if (directory->mount != vnode->mount) {
6426 		status = B_CROSS_DEVICE_LINK;
6427 		goto err1;
6428 	}
6429 
6430 	if (HAS_FS_CALL(directory, link))
6431 		status = FS_CALL(directory, link, name, vnode);
6432 	else
6433 		status = B_READ_ONLY_DEVICE;
6434 
6435 err1:
6436 	put_vnode(vnode);
6437 err:
6438 	put_vnode(directory);
6439 
6440 	return status;
6441 }
6442 
6443 
6444 static status_t
6445 common_unlink(int fd, char* path, bool kernel)
6446 {
6447 	char filename[B_FILE_NAME_LENGTH];
6448 	struct vnode* vnode;
6449 	status_t status;
6450 
6451 	FUNCTION(("common_unlink: fd: %d, path '%s', kernel %d\n", fd, path,
6452 		kernel));
6453 
6454 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
6455 	if (status < 0)
6456 		return status;
6457 
6458 	if (HAS_FS_CALL(vnode, unlink))
6459 		status = FS_CALL(vnode, unlink, filename);
6460 	else
6461 		status = B_READ_ONLY_DEVICE;
6462 
6463 	put_vnode(vnode);
6464 
6465 	return status;
6466 }
6467 
6468 
6469 static status_t
6470 common_access(int fd, char* path, int mode, bool effectiveUserGroup, bool kernel)
6471 {
6472 	struct vnode* vnode;
6473 	status_t status;
6474 
6475 	// TODO: honor effectiveUserGroup argument
6476 
6477 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
6478 	if (status != B_OK)
6479 		return status;
6480 
6481 	if (HAS_FS_CALL(vnode, access))
6482 		status = FS_CALL(vnode, access, mode);
6483 	else
6484 		status = B_OK;
6485 
6486 	put_vnode(vnode);
6487 
6488 	return status;
6489 }
6490 
6491 
6492 static status_t
6493 common_rename(int fd, char* path, int newFD, char* newPath, bool kernel)
6494 {
6495 	struct vnode* fromVnode;
6496 	struct vnode* toVnode;
6497 	char fromName[B_FILE_NAME_LENGTH];
6498 	char toName[B_FILE_NAME_LENGTH];
6499 	status_t status;
6500 
6501 	FUNCTION(("common_rename(fd = %d, path = %s, newFD = %d, newPath = %s, "
6502 		"kernel = %d)\n", fd, path, newFD, newPath, kernel));
6503 
6504 	status = fd_and_path_to_dir_vnode(fd, path, &fromVnode, fromName, kernel);
6505 	if (status != B_OK)
6506 		return status;
6507 
6508 	status = fd_and_path_to_dir_vnode(newFD, newPath, &toVnode, toName, kernel);
6509 	if (status != B_OK)
6510 		goto err1;
6511 
6512 	if (fromVnode->device != toVnode->device) {
6513 		status = B_CROSS_DEVICE_LINK;
6514 		goto err2;
6515 	}
6516 
6517 	if (fromName[0] == '\0' || toName[0] == '\0'
6518 		|| !strcmp(fromName, ".") || !strcmp(fromName, "..")
6519 		|| !strcmp(toName, ".") || !strcmp(toName, "..")
6520 		|| (fromVnode == toVnode && !strcmp(fromName, toName))) {
6521 		status = B_BAD_VALUE;
6522 		goto err2;
6523 	}
6524 
6525 	if (HAS_FS_CALL(fromVnode, rename))
6526 		status = FS_CALL(fromVnode, rename, fromName, toVnode, toName);
6527 	else
6528 		status = B_READ_ONLY_DEVICE;
6529 
6530 err2:
6531 	put_vnode(toVnode);
6532 err1:
6533 	put_vnode(fromVnode);
6534 
6535 	return status;
6536 }
6537 
6538 
6539 static status_t
6540 common_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6541 {
6542 	struct vnode* vnode = descriptor->u.vnode;
6543 
6544 	FUNCTION(("common_read_stat: stat %p\n", stat));
6545 
6546 	// TODO: remove this once all file systems properly set them!
6547 	stat->st_crtim.tv_nsec = 0;
6548 	stat->st_ctim.tv_nsec = 0;
6549 	stat->st_mtim.tv_nsec = 0;
6550 	stat->st_atim.tv_nsec = 0;
6551 
6552 	return vfs_stat_vnode(vnode, stat);
6553 }
6554 
6555 
6556 static status_t
6557 common_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6558 	int statMask)
6559 {
6560 	struct vnode* vnode = descriptor->u.vnode;
6561 
6562 	FUNCTION(("common_write_stat(vnode = %p, stat = %p, statMask = %d)\n",
6563 		vnode, stat, statMask));
6564 
6565 	if ((descriptor->open_mode & O_RWMASK) == O_RDONLY)
6566 		return B_BAD_VALUE;
6567 
6568 	if (!HAS_FS_CALL(vnode, write_stat))
6569 		return B_READ_ONLY_DEVICE;
6570 
6571 	return FS_CALL(vnode, write_stat, stat, statMask);
6572 }
6573 
6574 
6575 static status_t
6576 common_path_read_stat(int fd, char* path, bool traverseLeafLink,
6577 	struct stat* stat, bool kernel)
6578 {
6579 	FUNCTION(("common_path_read_stat: fd: %d, path '%s', stat %p,\n", fd, path,
6580 		stat));
6581 
6582 	struct vnode* vnode;
6583 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6584 		NULL, kernel);
6585 	if (status != B_OK)
6586 		return status;
6587 
6588 	status = vfs_stat_vnode(vnode, stat);
6589 
6590 	put_vnode(vnode);
6591 	return status;
6592 }
6593 
6594 
6595 static status_t
6596 common_path_write_stat(int fd, char* path, bool traverseLeafLink,
6597 	const struct stat* stat, int statMask, bool kernel)
6598 {
6599 	FUNCTION(("common_write_stat: fd: %d, path '%s', stat %p, stat_mask %d, "
6600 		"kernel %d\n", fd, path, stat, statMask, kernel));
6601 
6602 	struct vnode* vnode;
6603 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6604 		NULL, kernel);
6605 	if (status != B_OK)
6606 		return status;
6607 
6608 	if (HAS_FS_CALL(vnode, write_stat))
6609 		status = FS_CALL(vnode, write_stat, stat, statMask);
6610 	else
6611 		status = B_READ_ONLY_DEVICE;
6612 
6613 	put_vnode(vnode);
6614 
6615 	return status;
6616 }
6617 
6618 
6619 static int
6620 attr_dir_open(int fd, char* path, bool traverseLeafLink, bool kernel)
6621 {
6622 	FUNCTION(("attr_dir_open(fd = %d, path = '%s', kernel = %d)\n", fd, path,
6623 		kernel));
6624 
6625 	struct vnode* vnode;
6626 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6627 		NULL, kernel);
6628 	if (status != B_OK)
6629 		return status;
6630 
6631 	status = open_attr_dir_vnode(vnode, kernel);
6632 	if (status < 0)
6633 		put_vnode(vnode);
6634 
6635 	return status;
6636 }
6637 
6638 
6639 static status_t
6640 attr_dir_close(struct file_descriptor* descriptor)
6641 {
6642 	struct vnode* vnode = descriptor->u.vnode;
6643 
6644 	FUNCTION(("attr_dir_close(descriptor = %p)\n", descriptor));
6645 
6646 	if (HAS_FS_CALL(vnode, close_attr_dir))
6647 		return FS_CALL(vnode, close_attr_dir, descriptor->cookie);
6648 
6649 	return B_OK;
6650 }
6651 
6652 
6653 static void
6654 attr_dir_free_fd(struct file_descriptor* descriptor)
6655 {
6656 	struct vnode* vnode = descriptor->u.vnode;
6657 
6658 	if (vnode != NULL) {
6659 		FS_CALL(vnode, free_attr_dir_cookie, descriptor->cookie);
6660 		put_vnode(vnode);
6661 	}
6662 }
6663 
6664 
6665 static status_t
6666 attr_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6667 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6668 {
6669 	struct vnode* vnode = descriptor->u.vnode;
6670 
6671 	FUNCTION(("attr_dir_read(descriptor = %p)\n", descriptor));
6672 
6673 	if (HAS_FS_CALL(vnode, read_attr_dir))
6674 		return FS_CALL(vnode, read_attr_dir, descriptor->cookie, buffer,
6675 			bufferSize, _count);
6676 
6677 	return B_UNSUPPORTED;
6678 }
6679 
6680 
6681 static status_t
6682 attr_dir_rewind(struct file_descriptor* descriptor)
6683 {
6684 	struct vnode* vnode = descriptor->u.vnode;
6685 
6686 	FUNCTION(("attr_dir_rewind(descriptor = %p)\n", descriptor));
6687 
6688 	if (HAS_FS_CALL(vnode, rewind_attr_dir))
6689 		return FS_CALL(vnode, rewind_attr_dir, descriptor->cookie);
6690 
6691 	return B_UNSUPPORTED;
6692 }
6693 
6694 
6695 static int
6696 attr_create(int fd, char* path, const char* name, uint32 type,
6697 	int openMode, bool kernel)
6698 {
6699 	if (name == NULL || *name == '\0')
6700 		return B_BAD_VALUE;
6701 
6702 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6703 	struct vnode* vnode;
6704 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6705 		kernel);
6706 	if (status != B_OK)
6707 		return status;
6708 
6709 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6710 		status = B_LINK_LIMIT;
6711 		goto err;
6712 	}
6713 
6714 	if (!HAS_FS_CALL(vnode, create_attr)) {
6715 		status = B_READ_ONLY_DEVICE;
6716 		goto err;
6717 	}
6718 
6719 	void* cookie;
6720 	status = FS_CALL(vnode, create_attr, name, type, openMode, &cookie);
6721 	if (status != B_OK)
6722 		goto err;
6723 
6724 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6725 	if (fd >= 0)
6726 		return fd;
6727 
6728 	status = fd;
6729 
6730 	FS_CALL(vnode, close_attr, cookie);
6731 	FS_CALL(vnode, free_attr_cookie, cookie);
6732 
6733 	FS_CALL(vnode, remove_attr, name);
6734 
6735 err:
6736 	put_vnode(vnode);
6737 
6738 	return status;
6739 }
6740 
6741 
6742 static int
6743 attr_open(int fd, char* path, const char* name, int openMode, bool kernel)
6744 {
6745 	if (name == NULL || *name == '\0')
6746 		return B_BAD_VALUE;
6747 
6748 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6749 	struct vnode* vnode;
6750 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6751 		kernel);
6752 	if (status != B_OK)
6753 		return status;
6754 
6755 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6756 		status = B_LINK_LIMIT;
6757 		goto err;
6758 	}
6759 
6760 	if (!HAS_FS_CALL(vnode, open_attr)) {
6761 		status = B_UNSUPPORTED;
6762 		goto err;
6763 	}
6764 
6765 	void* cookie;
6766 	status = FS_CALL(vnode, open_attr, name, openMode, &cookie);
6767 	if (status != B_OK)
6768 		goto err;
6769 
6770 	// now we only need a file descriptor for this attribute and we're done
6771 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6772 	if (fd >= 0)
6773 		return fd;
6774 
6775 	status = fd;
6776 
6777 	FS_CALL(vnode, close_attr, cookie);
6778 	FS_CALL(vnode, free_attr_cookie, cookie);
6779 
6780 err:
6781 	put_vnode(vnode);
6782 
6783 	return status;
6784 }
6785 
6786 
6787 static status_t
6788 attr_close(struct file_descriptor* descriptor)
6789 {
6790 	struct vnode* vnode = descriptor->u.vnode;
6791 
6792 	FUNCTION(("attr_close(descriptor = %p)\n", descriptor));
6793 
6794 	if (HAS_FS_CALL(vnode, close_attr))
6795 		return FS_CALL(vnode, close_attr, descriptor->cookie);
6796 
6797 	return B_OK;
6798 }
6799 
6800 
6801 static void
6802 attr_free_fd(struct file_descriptor* descriptor)
6803 {
6804 	struct vnode* vnode = descriptor->u.vnode;
6805 
6806 	if (vnode != NULL) {
6807 		FS_CALL(vnode, free_attr_cookie, descriptor->cookie);
6808 		put_vnode(vnode);
6809 	}
6810 }
6811 
6812 
6813 static status_t
6814 attr_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
6815 	size_t* length)
6816 {
6817 	struct vnode* vnode = descriptor->u.vnode;
6818 
6819 	FUNCTION(("attr_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
6820 		pos, length, *length));
6821 
6822 	if (!HAS_FS_CALL(vnode, read_attr))
6823 		return B_UNSUPPORTED;
6824 
6825 	return FS_CALL(vnode, read_attr, descriptor->cookie, pos, buffer, length);
6826 }
6827 
6828 
6829 static status_t
6830 attr_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
6831 	size_t* length)
6832 {
6833 	struct vnode* vnode = descriptor->u.vnode;
6834 
6835 	FUNCTION(("attr_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
6836 		length));
6837 
6838 	if (!HAS_FS_CALL(vnode, write_attr))
6839 		return B_UNSUPPORTED;
6840 
6841 	return FS_CALL(vnode, write_attr, descriptor->cookie, pos, buffer, length);
6842 }
6843 
6844 
6845 static off_t
6846 attr_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
6847 {
6848 	off_t offset;
6849 
6850 	switch (seekType) {
6851 		case SEEK_SET:
6852 			offset = 0;
6853 			break;
6854 		case SEEK_CUR:
6855 			offset = descriptor->pos;
6856 			break;
6857 		case SEEK_END:
6858 		{
6859 			struct vnode* vnode = descriptor->u.vnode;
6860 			if (!HAS_FS_CALL(vnode, read_attr_stat))
6861 				return B_UNSUPPORTED;
6862 
6863 			struct stat stat;
6864 			status_t status = FS_CALL(vnode, read_attr_stat, descriptor->cookie,
6865 				&stat);
6866 			if (status != B_OK)
6867 				return status;
6868 
6869 			offset = stat.st_size;
6870 			break;
6871 		}
6872 		default:
6873 			return B_BAD_VALUE;
6874 	}
6875 
6876 	// assumes off_t is 64 bits wide
6877 	if (offset > 0 && LONGLONG_MAX - offset < pos)
6878 		return B_BUFFER_OVERFLOW;
6879 
6880 	pos += offset;
6881 	if (pos < 0)
6882 		return B_BAD_VALUE;
6883 
6884 	return descriptor->pos = pos;
6885 }
6886 
6887 
6888 static status_t
6889 attr_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6890 {
6891 	struct vnode* vnode = descriptor->u.vnode;
6892 
6893 	FUNCTION(("attr_read_stat: stat 0x%p\n", stat));
6894 
6895 	if (!HAS_FS_CALL(vnode, read_attr_stat))
6896 		return B_UNSUPPORTED;
6897 
6898 	return FS_CALL(vnode, read_attr_stat, descriptor->cookie, stat);
6899 }
6900 
6901 
6902 static status_t
6903 attr_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6904 	int statMask)
6905 {
6906 	struct vnode* vnode = descriptor->u.vnode;
6907 
6908 	FUNCTION(("attr_write_stat: stat = %p, statMask %d\n", stat, statMask));
6909 
6910 	if (!HAS_FS_CALL(vnode, write_attr_stat))
6911 		return B_READ_ONLY_DEVICE;
6912 
6913 	return FS_CALL(vnode, write_attr_stat, descriptor->cookie, stat, statMask);
6914 }
6915 
6916 
6917 static status_t
6918 attr_remove(int fd, const char* name, bool kernel)
6919 {
6920 	struct file_descriptor* descriptor;
6921 	struct vnode* vnode;
6922 	status_t status;
6923 
6924 	if (name == NULL || *name == '\0')
6925 		return B_BAD_VALUE;
6926 
6927 	FUNCTION(("attr_remove: fd = %d, name = \"%s\", kernel %d\n", fd, name,
6928 		kernel));
6929 
6930 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6931 	if (descriptor == NULL)
6932 		return B_FILE_ERROR;
6933 
6934 	if (HAS_FS_CALL(vnode, remove_attr))
6935 		status = FS_CALL(vnode, remove_attr, name);
6936 	else
6937 		status = B_READ_ONLY_DEVICE;
6938 
6939 	put_fd(descriptor);
6940 
6941 	return status;
6942 }
6943 
6944 
6945 static status_t
6946 attr_rename(int fromFD, const char* fromName, int toFD, const char* toName,
6947 	bool kernel)
6948 {
6949 	struct file_descriptor* fromDescriptor;
6950 	struct file_descriptor* toDescriptor;
6951 	struct vnode* fromVnode;
6952 	struct vnode* toVnode;
6953 	status_t status;
6954 
6955 	if (fromName == NULL || *fromName == '\0' || toName == NULL
6956 		|| *toName == '\0')
6957 		return B_BAD_VALUE;
6958 
6959 	FUNCTION(("attr_rename: from fd = %d, from name = \"%s\", to fd = %d, to "
6960 		"name = \"%s\", kernel %d\n", fromFD, fromName, toFD, toName, kernel));
6961 
6962 	fromDescriptor = get_fd_and_vnode(fromFD, &fromVnode, kernel);
6963 	if (fromDescriptor == NULL)
6964 		return B_FILE_ERROR;
6965 
6966 	toDescriptor = get_fd_and_vnode(toFD, &toVnode, kernel);
6967 	if (toDescriptor == NULL) {
6968 		status = B_FILE_ERROR;
6969 		goto err;
6970 	}
6971 
6972 	// are the files on the same volume?
6973 	if (fromVnode->device != toVnode->device) {
6974 		status = B_CROSS_DEVICE_LINK;
6975 		goto err1;
6976 	}
6977 
6978 	if (HAS_FS_CALL(fromVnode, rename_attr)) {
6979 		status = FS_CALL(fromVnode, rename_attr, fromName, toVnode, toName);
6980 	} else
6981 		status = B_READ_ONLY_DEVICE;
6982 
6983 err1:
6984 	put_fd(toDescriptor);
6985 err:
6986 	put_fd(fromDescriptor);
6987 
6988 	return status;
6989 }
6990 
6991 
6992 static int
6993 index_dir_open(dev_t mountID, bool kernel)
6994 {
6995 	struct fs_mount* mount;
6996 	void* cookie;
6997 
6998 	FUNCTION(("index_dir_open(mountID = %" B_PRId32 ", kernel = %d)\n", mountID,
6999 		kernel));
7000 
7001 	status_t status = get_mount(mountID, &mount);
7002 	if (status != B_OK)
7003 		return status;
7004 
7005 	if (!HAS_FS_MOUNT_CALL(mount, open_index_dir)) {
7006 		status = B_UNSUPPORTED;
7007 		goto error;
7008 	}
7009 
7010 	status = FS_MOUNT_CALL(mount, open_index_dir, &cookie);
7011 	if (status != B_OK)
7012 		goto error;
7013 
7014 	// get fd for the index directory
7015 	int fd;
7016 	fd = get_new_fd(FDTYPE_INDEX_DIR, mount, NULL, cookie, O_CLOEXEC, kernel);
7017 	if (fd >= 0)
7018 		return fd;
7019 
7020 	// something went wrong
7021 	FS_MOUNT_CALL(mount, close_index_dir, cookie);
7022 	FS_MOUNT_CALL(mount, free_index_dir_cookie, cookie);
7023 
7024 	status = fd;
7025 
7026 error:
7027 	put_mount(mount);
7028 	return status;
7029 }
7030 
7031 
7032 static status_t
7033 index_dir_close(struct file_descriptor* descriptor)
7034 {
7035 	struct fs_mount* mount = descriptor->u.mount;
7036 
7037 	FUNCTION(("index_dir_close(descriptor = %p)\n", descriptor));
7038 
7039 	if (HAS_FS_MOUNT_CALL(mount, close_index_dir))
7040 		return FS_MOUNT_CALL(mount, close_index_dir, descriptor->cookie);
7041 
7042 	return B_OK;
7043 }
7044 
7045 
7046 static void
7047 index_dir_free_fd(struct file_descriptor* descriptor)
7048 {
7049 	struct fs_mount* mount = descriptor->u.mount;
7050 
7051 	if (mount != NULL) {
7052 		FS_MOUNT_CALL(mount, free_index_dir_cookie, descriptor->cookie);
7053 		put_mount(mount);
7054 	}
7055 }
7056 
7057 
7058 static status_t
7059 index_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7060 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7061 {
7062 	struct fs_mount* mount = descriptor->u.mount;
7063 
7064 	if (HAS_FS_MOUNT_CALL(mount, read_index_dir)) {
7065 		return FS_MOUNT_CALL(mount, read_index_dir, descriptor->cookie, buffer,
7066 			bufferSize, _count);
7067 	}
7068 
7069 	return B_UNSUPPORTED;
7070 }
7071 
7072 
7073 static status_t
7074 index_dir_rewind(struct file_descriptor* descriptor)
7075 {
7076 	struct fs_mount* mount = descriptor->u.mount;
7077 
7078 	if (HAS_FS_MOUNT_CALL(mount, rewind_index_dir))
7079 		return FS_MOUNT_CALL(mount, rewind_index_dir, descriptor->cookie);
7080 
7081 	return B_UNSUPPORTED;
7082 }
7083 
7084 
7085 static status_t
7086 index_create(dev_t mountID, const char* name, uint32 type, uint32 flags,
7087 	bool kernel)
7088 {
7089 	FUNCTION(("index_create(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7090 		mountID, name, kernel));
7091 
7092 	struct fs_mount* mount;
7093 	status_t status = get_mount(mountID, &mount);
7094 	if (status != B_OK)
7095 		return status;
7096 
7097 	if (!HAS_FS_MOUNT_CALL(mount, create_index)) {
7098 		status = B_READ_ONLY_DEVICE;
7099 		goto out;
7100 	}
7101 
7102 	status = FS_MOUNT_CALL(mount, create_index, name, type, flags);
7103 
7104 out:
7105 	put_mount(mount);
7106 	return status;
7107 }
7108 
7109 
7110 #if 0
7111 static status_t
7112 index_read_stat(struct file_descriptor* descriptor, struct stat* stat)
7113 {
7114 	struct vnode* vnode = descriptor->u.vnode;
7115 
7116 	// ToDo: currently unused!
7117 	FUNCTION(("index_read_stat: stat 0x%p\n", stat));
7118 	if (!HAS_FS_CALL(vnode, read_index_stat))
7119 		return B_UNSUPPORTED;
7120 
7121 	return B_UNSUPPORTED;
7122 	//return FS_CALL(vnode, read_index_stat, descriptor->cookie, stat);
7123 }
7124 
7125 
7126 static void
7127 index_free_fd(struct file_descriptor* descriptor)
7128 {
7129 	struct vnode* vnode = descriptor->u.vnode;
7130 
7131 	if (vnode != NULL) {
7132 		FS_CALL(vnode, free_index_cookie, descriptor->cookie);
7133 		put_vnode(vnode);
7134 	}
7135 }
7136 #endif
7137 
7138 
7139 static status_t
7140 index_name_read_stat(dev_t mountID, const char* name, struct stat* stat,
7141 	bool kernel)
7142 {
7143 	FUNCTION(("index_remove(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7144 		mountID, name, kernel));
7145 
7146 	struct fs_mount* mount;
7147 	status_t status = get_mount(mountID, &mount);
7148 	if (status != B_OK)
7149 		return status;
7150 
7151 	if (!HAS_FS_MOUNT_CALL(mount, read_index_stat)) {
7152 		status = B_UNSUPPORTED;
7153 		goto out;
7154 	}
7155 
7156 	status = FS_MOUNT_CALL(mount, read_index_stat, name, stat);
7157 
7158 out:
7159 	put_mount(mount);
7160 	return status;
7161 }
7162 
7163 
7164 static status_t
7165 index_remove(dev_t mountID, const char* name, bool kernel)
7166 {
7167 	FUNCTION(("index_remove(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7168 		mountID, name, kernel));
7169 
7170 	struct fs_mount* mount;
7171 	status_t status = get_mount(mountID, &mount);
7172 	if (status != B_OK)
7173 		return status;
7174 
7175 	if (!HAS_FS_MOUNT_CALL(mount, remove_index)) {
7176 		status = B_READ_ONLY_DEVICE;
7177 		goto out;
7178 	}
7179 
7180 	status = FS_MOUNT_CALL(mount, remove_index, name);
7181 
7182 out:
7183 	put_mount(mount);
7184 	return status;
7185 }
7186 
7187 
7188 /*!	TODO: the query FS API is still pretty much the same as in R5.
7189 		It would be nice if queries got some more kernel support;
7190 		for example, query parsing should be moved into the kernel.
7191 */
7193 static int
7194 query_open(dev_t device, const char* query, uint32 flags, port_id port,
7195 	int32 token, bool kernel)
7196 {
7197 	struct fs_mount* mount;
7198 	void* cookie;
7199 
7200 	FUNCTION(("query_open(device = %" B_PRId32 ", query = \"%s\", kernel = %d)\n",
7201 		device, query, kernel));
7202 
7203 	status_t status = get_mount(device, &mount);
7204 	if (status != B_OK)
7205 		return status;
7206 
7207 	if (!HAS_FS_MOUNT_CALL(mount, open_query)) {
7208 		status = B_UNSUPPORTED;
7209 		goto error;
7210 	}
7211 
7212 	status = FS_MOUNT_CALL(mount, open_query, query, flags, port, token,
7213 		&cookie);
7214 	if (status != B_OK)
7215 		goto error;
7216 
7217 	// get fd for the query
7218 	int fd;
7219 	fd = get_new_fd(FDTYPE_QUERY, mount, NULL, cookie, O_CLOEXEC, kernel);
7220 	if (fd >= 0)
7221 		return fd;
7222 
7223 	status = fd;
7224 
7225 	// something went wrong
7226 	FS_MOUNT_CALL(mount, close_query, cookie);
7227 	FS_MOUNT_CALL(mount, free_query_cookie, cookie);
7228 
7229 error:
7230 	put_mount(mount);
7231 	return status;
7232 }
7233 
7234 
7235 static status_t
7236 query_close(struct file_descriptor* descriptor)
7237 {
7238 	struct fs_mount* mount = descriptor->u.mount;
7239 
7240 	FUNCTION(("query_close(descriptor = %p)\n", descriptor));
7241 
7242 	if (HAS_FS_MOUNT_CALL(mount, close_query))
7243 		return FS_MOUNT_CALL(mount, close_query, descriptor->cookie);
7244 
7245 	return B_OK;
7246 }
7247 
7248 
7249 static void
7250 query_free_fd(struct file_descriptor* descriptor)
7251 {
7252 	struct fs_mount* mount = descriptor->u.mount;
7253 
7254 	if (mount != NULL) {
7255 		FS_MOUNT_CALL(mount, free_query_cookie, descriptor->cookie);
7256 		put_mount(mount);
7257 	}
7258 }
7259 
7260 
7261 static status_t
7262 query_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7263 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7264 {
7265 	struct fs_mount* mount = descriptor->u.mount;
7266 
7267 	if (HAS_FS_MOUNT_CALL(mount, read_query)) {
7268 		return FS_MOUNT_CALL(mount, read_query, descriptor->cookie, buffer,
7269 			bufferSize, _count);
7270 	}
7271 
7272 	return B_UNSUPPORTED;
7273 }
7274 
7275 
7276 static status_t
7277 query_rewind(struct file_descriptor* descriptor)
7278 {
7279 	struct fs_mount* mount = descriptor->u.mount;
7280 
7281 	if (HAS_FS_MOUNT_CALL(mount, rewind_query))
7282 		return FS_MOUNT_CALL(mount, rewind_query, descriptor->cookie);
7283 
7284 	return B_UNSUPPORTED;
7285 }
7286 
7287 
7288 //	#pragma mark - General File System functions
7289 
7290 
7291 static dev_t
7292 fs_mount(char* path, const char* device, const char* fsName, uint32 flags,
7293 	const char* args, bool kernel)
7294 {
7295 	struct ::fs_mount* mount;
7296 	status_t status = B_OK;
7297 	fs_volume* volume = NULL;
7298 	int32 layer = 0;
7299 	Vnode* coveredNode = NULL;
7300 
7301 	FUNCTION(("fs_mount: path = '%s', device = '%s', fs_name = '%s', flags = %#"
7302 		B_PRIx32 ", args = '%s'\n", path, device, fsName, flags, args));
7303 
7304 	// The path is always safe; we just have to make sure that fsName is
7305 	// reasonably valid - we can't make any assumptions about args, though.
7306 	// A NULL fsName is OK if a device was given and the FS is not virtual;
7307 	// we'll get the name from the DDM later.
7308 	if (fsName == NULL) {
7309 		if (!device || flags & B_MOUNT_VIRTUAL_DEVICE)
7310 			return B_BAD_VALUE;
7311 	} else if (fsName[0] == '\0')
7312 		return B_BAD_VALUE;
7313 
7314 	RecursiveLocker mountOpLocker(sMountOpLock);
7315 
7316 	// Helper to delete a newly created file device on failure.
7317 	// Not exactly beautiful, but helps to keep the code below cleaner.
7318 	struct FileDeviceDeleter {
7319 		FileDeviceDeleter() : id(-1) {}
7320 		~FileDeviceDeleter()
7321 		{
7322 			KDiskDeviceManager::Default()->DeleteFileDevice(id);
7323 		}
7324 
7325 		partition_id id;
7326 	} fileDeviceDeleter;
7327 
7328 	// If the file system is not a "virtual" one, the device argument should
7329 	// point to a real file/device (if given at all).
7330 	// get the partition
7331 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7332 	KPartition* partition = NULL;
7333 	KPath normalizedDevice;
7334 	bool newlyCreatedFileDevice = false;
7335 
7336 	if (!(flags & B_MOUNT_VIRTUAL_DEVICE) && device != NULL) {
7337 		// normalize the device path
7338 		status = normalizedDevice.SetTo(device, true);
7339 		if (status != B_OK)
7340 			return status;
7341 
7342 		// get a corresponding partition from the DDM
7343 		partition = ddm->RegisterPartition(normalizedDevice.Path());
7344 		if (partition == NULL) {
7345 			// Partition not found: This either means the user supplied
7346 			// an invalid path, or that the path refers to an image file. We
7347 			// try to let the DDM create a file device for the path.
7348 			partition_id deviceID = ddm->CreateFileDevice(
7349 				normalizedDevice.Path(), &newlyCreatedFileDevice);
7350 			if (deviceID >= 0) {
7351 				partition = ddm->RegisterPartition(deviceID);
7352 				if (newlyCreatedFileDevice)
7353 					fileDeviceDeleter.id = deviceID;
7354 			}
7355 		}
7356 
7357 		if (!partition) {
7358 			TRACE(("fs_mount(): Partition `%s' not found.\n",
7359 				normalizedDevice.Path()));
7360 			return B_ENTRY_NOT_FOUND;
7361 		}
7362 
7363 		device = normalizedDevice.Path();
7364 			// correct path to file device
7365 	}
7366 	PartitionRegistrar partitionRegistrar(partition, true);
7367 
7368 	// Write lock the partition's device. For the time being, we keep the lock
7369 	// until we're done mounting -- not nice, but it ensures that no one
7370 	// interferes.
7371 	// TODO: Just mark the partition busy while mounting!
7372 	KDiskDevice* diskDevice = NULL;
7373 	if (partition) {
7374 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7375 		if (!diskDevice) {
7376 			TRACE(("fs_mount(): Failed to lock disk device!\n"));
7377 			return B_ERROR;
7378 		}
7379 	}
7380 
7381 	DeviceWriteLocker writeLocker(diskDevice, true);
7382 		// this takes over the write lock acquired before
7383 
7384 	if (partition != NULL) {
7385 		// make sure that the partition is not busy
7386 		if (partition->IsBusy()) {
7387 			TRACE(("fs_mount(): Partition is busy.\n"));
7388 			return B_BUSY;
7389 		}
7390 
7391 		// if no FS name was supplied, we get it from the partition
7392 		if (fsName == NULL) {
7393 			KDiskSystem* diskSystem = partition->DiskSystem();
7394 			if (!diskSystem) {
7395 				TRACE(("fs_mount(): No FS name was given, and the DDM didn't "
7396 					"recognize it.\n"));
7397 				return B_BAD_VALUE;
7398 			}
7399 
7400 			if (!diskSystem->IsFileSystem()) {
7401 				TRACE(("fs_mount(): No FS name was given, and the DDM found a "
7402 					"partitioning system.\n"));
7403 				return B_BAD_VALUE;
7404 			}
7405 
7406 			// The disk system name will not change, and the KDiskSystem
7407 			// object will not go away while the disk device is locked (and
7408 			// the partition has a reference to it), so this is safe.
7409 			fsName = diskSystem->Name();
7410 		}
7411 	}
7412 
7413 	mount = new(std::nothrow) (struct ::fs_mount);
7414 	if (mount == NULL)
7415 		return B_NO_MEMORY;
7416 
7417 	mount->device_name = strdup(device);
7418 		// "device" can be NULL
7419 
7420 	status = mount->entry_cache.Init();
7421 	if (status != B_OK)
7422 		goto err1;
7423 
7424 	// initialize structure
7425 	mount->id = sNextMountID++;
7426 	mount->partition = NULL;
7427 	mount->root_vnode = NULL;
7428 	mount->covers_vnode = NULL;
7429 	mount->unmounting = false;
7430 	mount->owns_file_device = false;
7431 	mount->volume = NULL;
7432 
7433 	// build up the volume(s)
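	// Each layer of a layered file system gets its own fs_volume; the
	// volumes are chained via super_volume/sub_volume, and mount->volume
	// ends up pointing at the topmost layer.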
7434 	while (true) {
7435 		char* layerFSName = get_file_system_name_for_layer(fsName, layer);
7436 		if (layerFSName == NULL) {
7437 			if (layer == 0) {
7438 				status = B_NO_MEMORY;
7439 				goto err1;
7440 			}
7441 
7442 			break;
7443 		}
7444 		MemoryDeleter layerFSNameDeleter(layerFSName);
7445 
7446 		volume = (fs_volume*)malloc(sizeof(fs_volume));
7447 		if (volume == NULL) {
7448 			status = B_NO_MEMORY;
7449 			goto err1;
7450 		}
7451 
7452 		volume->id = mount->id;
7453 		volume->partition = partition != NULL ? partition->ID() : -1;
7454 		volume->layer = layer++;
7455 		volume->private_volume = NULL;
7456 		volume->ops = NULL;
7457 		volume->sub_volume = NULL;
7458 		volume->super_volume = NULL;
7459 		volume->file_system = NULL;
7460 		volume->file_system_name = NULL;
7461 
7462 		volume->file_system_name = get_file_system_name(layerFSName);
7463 		if (volume->file_system_name == NULL) {
7464 			status = B_NO_MEMORY;
7465 			free(volume);
7466 			goto err1;
7467 		}
7468 
7469 		volume->file_system = get_file_system(layerFSName);
7470 		if (volume->file_system == NULL) {
7471 			status = B_DEVICE_NOT_FOUND;
7472 			free(volume->file_system_name);
7473 			free(volume);
7474 			goto err1;
7475 		}
7476 
7477 		if (mount->volume == NULL)
7478 			mount->volume = volume;
7479 		else {
7480 			volume->super_volume = mount->volume;
7481 			mount->volume->sub_volume = volume;
7482 			mount->volume = volume;
7483 		}
7484 	}
7485 
7486 	// insert mount struct into list before we call FS's mount() function
7487 	// so that vnodes can be created for this mount
7488 	mutex_lock(&sMountMutex);
7489 	sMountsTable->Insert(mount);
7490 	mutex_unlock(&sMountMutex);
7491 
7492 	ino_t rootID;
7493 
7494 	if (!sRoot) {
7495 		// we haven't mounted anything yet
7496 		if (strcmp(path, "/") != 0) {
7497 			status = B_ERROR;
7498 			goto err2;
7499 		}
7500 
7501 		status = mount->volume->file_system->mount(mount->volume, device, flags,
7502 			args, &rootID);
7503 		if (status != B_OK || mount->volume->ops == NULL)
7504 			goto err2;
7505 	} else {
7506 		status = path_to_vnode(path, true, &coveredNode, NULL, kernel);
7507 		if (status != B_OK)
7508 			goto err2;
7509 
7510 		mount->covers_vnode = coveredNode;
7511 
7512 		// make sure coveredNode is a directory
7513 		if (!S_ISDIR(coveredNode->Type())) {
7514 			status = B_NOT_A_DIRECTORY;
7515 			goto err3;
7516 		}
7517 
7518 		if (coveredNode->IsCovered()) {
7519 			// this is already a covered vnode
7520 			status = B_BUSY;
7521 			goto err3;
7522 		}
7523 
7524 		// mount it/them
7525 		fs_volume* volume = mount->volume;
7526 		while (volume) {
7527 			status = volume->file_system->mount(volume, device, flags, args,
7528 				&rootID);
7529 			if (status != B_OK || volume->ops == NULL) {
7530 				if (status == B_OK && volume->ops == NULL)
7531 					panic("fs_mount: mount() succeeded but ops is NULL!");
7532 				if (volume->sub_volume)
7533 					goto err4;
7534 				goto err3;
7535 			}
7536 
7537 			volume = volume->super_volume;
7538 		}
7539 
7540 		volume = mount->volume;
7541 		while (volume) {
7542 			if (volume->ops->all_layers_mounted != NULL)
7543 				volume->ops->all_layers_mounted(volume);
7544 			volume = volume->super_volume;
7545 		}
7546 	}
7547 
7548 	// the root node is supposed to be owned by the file system - it must
7549 	// exist at this point
7550 	mount->root_vnode = lookup_vnode(mount->id, rootID);
7551 	if (mount->root_vnode == NULL || mount->root_vnode->ref_count != 1) {
7552 		panic("fs_mount: file system does not own its root node!\n");
7553 		status = B_ERROR;
7554 		goto err4;
7555 	}
7556 
7557 	// set up the links between the root vnode and the vnode it covers
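	// (e.g. after mounting a volume at /mnt, the volume root's "covers"
	// field points at the /mnt directory vnode, whose "covered_by" field
	// points back at the volume root)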
7558 	rw_lock_write_lock(&sVnodeLock);
7559 	if (coveredNode != NULL) {
7560 		if (coveredNode->IsCovered()) {
7561 			// the vnode got covered in the meantime
7562 			status = B_BUSY;
7563 			rw_lock_write_unlock(&sVnodeLock);
7564 			goto err4;
7565 		}
7566 
7567 		mount->root_vnode->covers = coveredNode;
7568 		mount->root_vnode->SetCovering(true);
7569 
7570 		coveredNode->covered_by = mount->root_vnode;
7571 		coveredNode->SetCovered(true);
7572 	}
7573 	rw_lock_write_unlock(&sVnodeLock);
7574 
7575 	if (!sRoot) {
7576 		sRoot = mount->root_vnode;
7577 		mutex_lock(&sIOContextRootLock);
7578 		get_current_io_context(true)->root = sRoot;
7579 		mutex_unlock(&sIOContextRootLock);
7580 		inc_vnode_ref_count(sRoot);
7581 	}
7582 
7583 	// supply the partition (if any) with the mount cookie and mark it mounted
7584 	if (partition) {
7585 		partition->SetMountCookie(mount->volume->private_volume);
7586 		partition->SetVolumeID(mount->id);
7587 
7588 		// keep a partition reference as long as the partition is mounted
7589 		partitionRegistrar.Detach();
7590 		mount->partition = partition;
7591 		mount->owns_file_device = newlyCreatedFileDevice;
7592 		fileDeviceDeleter.id = -1;
7593 	}
7594 
7595 	notify_mount(mount->id,
7596 		coveredNode != NULL ? coveredNode->device : -1,
7597 		coveredNode ? coveredNode->id : -1);
7598 
7599 	return mount->id;
7600 
7601 err4:
7602 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7603 err3:
7604 	if (coveredNode != NULL)
7605 		put_vnode(coveredNode);
7606 err2:
7607 	mutex_lock(&sMountMutex);
7608 	sMountsTable->Remove(mount);
7609 	mutex_unlock(&sMountMutex);
7610 err1:
7611 	delete mount;
7612 
7613 	return status;
7614 }
7615 
7616 
7617 static status_t
7618 fs_unmount(char* path, dev_t mountID, uint32 flags, bool kernel)
7619 {
7620 	struct fs_mount* mount;
7621 	status_t err;
7622 
7623 	FUNCTION(("fs_unmount(path '%s', dev %" B_PRId32 ", kernel %d)\n", path,
7624 		mountID, kernel));
7625 
7626 	struct vnode* pathVnode = NULL;
7627 	if (path != NULL) {
7628 		err = path_to_vnode(path, true, &pathVnode, NULL, kernel);
7629 		if (err != B_OK)
7630 			return B_ENTRY_NOT_FOUND;
7631 	}
7632 
7633 	RecursiveLocker mountOpLocker(sMountOpLock);
7634 
7635 	// This lock is not strictly necessary, but is taken in the KDEBUG case
7636 	// to keep the ASSERT in find_mount() working.
7637 	KDEBUG_ONLY(mutex_lock(&sMountMutex));
7638 	mount = find_mount(path != NULL ? pathVnode->device : mountID);
7639 	KDEBUG_ONLY(mutex_unlock(&sMountMutex));
7640 	if (mount == NULL) {
7641 		panic("fs_unmount: find_mount() failed on root vnode @%p of mount\n",
7642 			pathVnode);
7643 	}
7644 
7645 	if (path != NULL) {
7646 		put_vnode(pathVnode);
7647 
7648 		if (mount->root_vnode != pathVnode) {
7649 			// not mountpoint
7650 			return B_BAD_VALUE;
7651 		}
7652 	}
7653 
7654 	// if the volume is associated with a partition, lock the device of the
7655 	// partition as long as we are unmounting
7656 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7657 	KPartition* partition = mount->partition;
7658 	KDiskDevice* diskDevice = NULL;
7659 	if (partition != NULL) {
7660 		if (partition->Device() == NULL) {
7661 			dprintf("fs_unmount(): There is no device!\n");
7662 			return B_ERROR;
7663 		}
7664 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7665 		if (!diskDevice) {
7666 			TRACE(("fs_unmount(): Failed to lock disk device!\n"));
7667 			return B_ERROR;
7668 		}
7669 	}
7670 	DeviceWriteLocker writeLocker(diskDevice, true);
7671 
7672 	// make sure that the partition is not busy
7673 	if (partition != NULL) {
7674 		if ((flags & B_UNMOUNT_BUSY_PARTITION) == 0 && partition->IsBusy()) {
7675 			TRACE(("fs_unmount(): Partition is busy.\n"));
7676 			return B_BUSY;
7677 		}
7678 	}
7679 
7680 	// grab the vnode master mutex to keep someone from creating
7681 	// a vnode while we're figuring out if we can continue
7682 	WriteLocker vnodesWriteLocker(&sVnodeLock);
7683 
7684 	bool disconnectedDescriptors = false;
7685 
7686 	while (true) {
7687 		bool busy = false;
7688 
7689 		// cycle through the list of vnodes associated with this mount and
7690 		// make sure none of them is busy or still referenced
7691 		VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7692 		while (struct vnode* vnode = iterator.Next()) {
7693 			if (vnode->IsBusy()) {
7694 				busy = true;
7695 				break;
7696 			}
7697 
7698 			// check the vnode's ref count -- subtract additional references for
7699 			// covering
7700 			int32 refCount = vnode->ref_count;
7701 			if (vnode->covers != NULL)
7702 				refCount--;
7703 			if (vnode->covered_by != NULL)
7704 				refCount--;
7705 
7706 			if (refCount != 0) {
7707 				// there are still vnodes in use on this mount, so we cannot
7708 				// unmount yet
7709 				busy = true;
7710 				break;
7711 			}
7712 		}
7713 
7714 		if (!busy)
7715 			break;
7716 
7717 		if ((flags & B_FORCE_UNMOUNT) == 0)
7718 			return B_BUSY;
7719 
7720 		if (disconnectedDescriptors) {
7721 			// wait a bit until the last access is finished, and then try again
7722 			vnodesWriteLocker.Unlock();
7723 			snooze(100000);
7724 			// TODO: if there is some kind of bug that prevents the ref counts
7725 			// from getting back to zero, this will fall into an endless loop...
7726 			vnodesWriteLocker.Lock();
7727 			continue;
7728 		}
7729 
7730 		// the file system is still busy - but we're forced to unmount it,
7731 		// so let's disconnect all open file descriptors
7732 
7733 		mount->unmounting = true;
7734 			// prevent new vnodes from being created
7735 
7736 		vnodesWriteLocker.Unlock();
7737 
7738 		disconnect_mount_or_vnode_fds(mount, NULL);
7739 		disconnectedDescriptors = true;
7740 
7741 		vnodesWriteLocker.Lock();
7742 	}
7743 
7744 	// We can safely continue. Mark all of the vnodes busy and put this
7745 	// mount structure into unmounting state. Also undo the vnode
7746 	// covers/covered_by links.
7747 	mount->unmounting = true;
7748 
7749 	VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7750 	while (struct vnode* vnode = iterator.Next()) {
7751 		// Remove all covers/covered_by links from other mounts' nodes to this
7752 		// vnode and adjust the node ref count accordingly. We will release the
7753 		// references to the external vnodes below.
7754 		if (Vnode* coveredNode = vnode->covers) {
7755 			if (Vnode* coveringNode = vnode->covered_by) {
7756 				// We have both covered and covering vnodes, so just remove us
7757 				// from the chain.
7758 				coveredNode->covered_by = coveringNode;
7759 				coveringNode->covers = coveredNode;
7760 				vnode->ref_count -= 2;
7761 
7762 				vnode->covered_by = NULL;
7763 				vnode->covers = NULL;
7764 				vnode->SetCovering(false);
7765 				vnode->SetCovered(false);
7766 			} else {
7767 				// We only have a covered vnode. Remove its link to us.
7768 				coveredNode->covered_by = NULL;
7769 				coveredNode->SetCovered(false);
7770 				vnode->ref_count--;
7771 
7772 				// If the other node is an external vnode, we keep its link
7773 				// around so we can put the reference later on. Otherwise
7774 				// we get rid of it right now.
7775 				if (coveredNode->mount == mount) {
7776 					vnode->covers = NULL;
7777 					coveredNode->ref_count--;
7778 				}
7779 			}
7780 		} else if (Vnode* coveringNode = vnode->covered_by) {
7781 			// We only have a covering vnode. Remove its link to us.
7782 			coveringNode->covers = NULL;
7783 			coveringNode->SetCovering(false);
7784 			vnode->ref_count--;
7785 
7786 			// If the other node is an external vnode, we keep its link
7787 			// around so we can put the reference later on. Otherwise
7788 			// we get rid of it right now.
7789 			if (coveringNode->mount == mount) {
7790 				vnode->covered_by = NULL;
7791 				coveringNode->ref_count--;
7792 			}
7793 		}
7794 
7795 		vnode->SetBusy(true);
7796 		vnode_to_be_freed(vnode);
7797 	}
7798 
7799 	vnodesWriteLocker.Unlock();
7800 
7801 	// Free all vnodes associated with this mount.
7802 	// They will be removed from the mount list by free_vnode(), so
7803 	// we don't have to do this.
7804 	while (struct vnode* vnode = mount->vnodes.Head()) {
7805 		// Put the references to external covered/covering vnodes we kept above.
7806 		if (Vnode* coveredNode = vnode->covers)
7807 			put_vnode(coveredNode);
7808 		if (Vnode* coveringNode = vnode->covered_by)
7809 			put_vnode(coveringNode);
7810 
7811 		free_vnode(vnode, false);
7812 	}
7813 
7814 	// remove the mount structure from the hash table
7815 	mutex_lock(&sMountMutex);
7816 	sMountsTable->Remove(mount);
7817 	mutex_unlock(&sMountMutex);
7818 
7819 	mountOpLocker.Unlock();
7820 
7821 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7822 	notify_unmount(mount->id);
7823 
7824 	// dereference the partition and mark it unmounted
7825 	if (partition) {
7826 		partition->SetVolumeID(-1);
7827 		partition->SetMountCookie(NULL);
7828 
7829 		if (mount->owns_file_device)
7830 			KDiskDeviceManager::Default()->DeleteFileDevice(partition->ID());
7831 		partition->Unregister();
7832 	}
7833 
7834 	delete mount;
7835 	return B_OK;
7836 }
7837 
7838 
7839 static status_t
7840 fs_sync(dev_t device)
7841 {
7842 	struct fs_mount* mount;
7843 	status_t status = get_mount(device, &mount);
7844 	if (status != B_OK)
7845 		return status;
7846 
7847 	struct vnode marker;
7848 	memset(&marker, 0, sizeof(marker));
7849 	marker.SetBusy(true);
7850 	marker.SetRemoved(true);
7851 
7852 	// First, synchronize all file caches
7853 
7854 	while (true) {
7855 		WriteLocker locker(sVnodeLock);
7856 			// Note: That's the easy way, which is probably OK for sync(),
7857 			// since it's a relatively rare call and doesn't need to allow for
7858 			// a lot of concurrency. Using a read lock would be possible, but
7859 			// also more involved, since we'd have to lock the individual nodes
7860 			// and take care of the locking order, which we might not want to
7861 			// do while holding fs_mount::rlock.
7862 
7863 		// synchronize access to vnode list
7864 		recursive_lock_lock(&mount->rlock);
7865 
7866 		struct vnode* vnode;
7867 		if (!marker.IsRemoved()) {
7868 			vnode = mount->vnodes.GetNext(&marker);
7869 			mount->vnodes.Remove(&marker);
7870 			marker.SetRemoved(true);
7871 		} else
7872 			vnode = mount->vnodes.First();
7873 
7874 		while (vnode != NULL && (vnode->cache == NULL
7875 			|| vnode->IsRemoved() || vnode->IsBusy())) {
7876 			// TODO: we could track writes (and writable mapped vnodes)
7877 			//	and have a simple flag that we could test for here
7878 			vnode = mount->vnodes.GetNext(vnode);
7879 		}
7880 
7881 		if (vnode != NULL) {
7882 			// insert marker vnode again
7883 			mount->vnodes.Insert(mount->vnodes.GetNext(vnode), &marker);
7884 			marker.SetRemoved(false);
7885 		}
7886 
7887 		recursive_lock_unlock(&mount->rlock);
7888 
7889 		if (vnode == NULL)
7890 			break;
7891 
7892 		vnode = lookup_vnode(mount->id, vnode->id);
7893 		if (vnode == NULL || vnode->IsBusy())
7894 			continue;
7895 
7896 		if (vnode->ref_count == 0) {
7897 			// this vnode has been unused before
7898 			vnode_used(vnode);
7899 		}
7900 		inc_vnode_ref_count(vnode);
7901 
7902 		locker.Unlock();
7903 
7904 		if (vnode->cache != NULL && !vnode->IsRemoved())
7905 			vnode->cache->WriteModified();
7906 
7907 		put_vnode(vnode);
7908 	}
7909 
7910 	// Let the file systems do their synchronizing work
7911 	if (HAS_FS_MOUNT_CALL(mount, sync))
7912 		status = FS_MOUNT_CALL_NO_PARAMS(mount, sync);
7913 
7914 	// Finally, flush the underlying device's write cache (if possible).
7915 	if (mount->partition != NULL && mount->partition->Device() != NULL)
7916 		ioctl(mount->partition->Device()->FD(), B_FLUSH_DRIVE_CACHE);
7917 
7918 	put_mount(mount);
7919 	return status;
7920 }
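
/*	The sync loop above iterates mount->vnodes using a marker node, so the
	locks can be dropped while each file cache is being flushed. A condensed
	sketch of the pattern, assuming a doubly linked list guarded by a single
	lock (names generic, not a drop-in implementation):

		struct vnode marker;
		marker.SetBusy(true);
			// busy nodes are skipped by concurrent traversals

		lock();
		list.Insert(list.GetNext(current), &marker);
			// remember the position after the current node
		unlock();

		flush(current);	// work without holding the lock

		lock();
		struct vnode* next = list.GetNext(&marker);
		list.Remove(&marker);
		unlock();

	Since the marker stays linked while the lock is dropped, concurrent
	insertions and removals cannot invalidate the iteration position.
*/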
7921 
7922 
7923 static status_t
7924 fs_read_info(dev_t device, struct fs_info* info)
7925 {
7926 	struct fs_mount* mount;
7927 	status_t status = get_mount(device, &mount);
7928 	if (status != B_OK)
7929 		return status;
7930 
7931 	memset(info, 0, sizeof(struct fs_info));
7932 
7933 	if (HAS_FS_MOUNT_CALL(mount, read_fs_info))
7934 		status = FS_MOUNT_CALL(mount, read_fs_info, info);
7935 
7936 	// fill in info the file system doesn't (have to) know about
7937 	if (status == B_OK) {
7938 		info->dev = mount->id;
7939 		info->root = mount->root_vnode->id;
7940 
7941 		fs_volume* volume = mount->volume;
7942 		while (volume->super_volume != NULL)
7943 			volume = volume->super_volume;
7944 
7945 		strlcpy(info->fsh_name, volume->file_system_name,
7946 			sizeof(info->fsh_name));
7947 		if (mount->device_name != NULL) {
7948 			strlcpy(info->device_name, mount->device_name,
7949 				sizeof(info->device_name));
7950 		}
7951 	}
7952 
7953 	// if the call is not supported by the file system, there are still
7954 	// the parts that we filled out ourselves
7955 
7956 	put_mount(mount);
7957 	return status;
7958 }
7959 
7960 
7961 static status_t
7962 fs_write_info(dev_t device, const struct fs_info* info, int mask)
7963 {
7964 	struct fs_mount* mount;
7965 	status_t status = get_mount(device, &mount);
7966 	if (status != B_OK)
7967 		return status;
7968 
7969 	if (HAS_FS_MOUNT_CALL(mount, write_fs_info))
7970 		status = FS_MOUNT_CALL(mount, write_fs_info, info, mask);
7971 	else
7972 		status = B_READ_ONLY_DEVICE;
7973 
7974 	put_mount(mount);
7975 	return status;
7976 }
7977 
7978 
7979 static dev_t
7980 fs_next_device(int32* _cookie)
7981 {
7982 	struct fs_mount* mount = NULL;
7983 	dev_t device = *_cookie;
7984 
7985 	mutex_lock(&sMountMutex);
7986 
7987 	// Since device IDs are assigned sequentially, this algorithm
7988 	// works well enough. It makes sure that the device list
7989 	// returned is sorted, and that no device is skipped when an
7990 	// already visited device has been unmounted.
7991 
7992 	while (device < sNextMountID) {
7993 		mount = find_mount(device++);
7994 		if (mount != NULL && mount->volume->private_volume != NULL)
7995 			break;
7996 	}
7997 
7998 	*_cookie = device;
7999 
8000 	if (mount != NULL)
8001 		device = mount->id;
8002 	else
8003 		device = B_BAD_VALUE;
8004 
8005 	mutex_unlock(&sMountMutex);
8006 
8007 	return device;
8008 }
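
/*	For illustration, how a caller can enumerate all mounted volumes with the
	cookie protocol implemented above (a minimal sketch, error handling
	elided):

		int32 cookie = 0;
		dev_t device;
		while ((device = fs_next_device(&cookie)) >= 0) {
			// device is the ID of the next mounted volume,
			// in ascending order
		}

	The cookie is always advanced past the last device examined, so a volume
	that is unmounted mid-iteration is skipped rather than terminating the
	loop; _kern_sync() below relies on the same protocol via next_dev().
*/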
8009 
8010 
8011 ssize_t
8012 fs_read_attr(int fd, const char *attribute, uint32 type, off_t pos,
8013 	void *buffer, size_t readBytes)
8014 {
8015 	int attrFD = attr_open(fd, NULL, attribute, O_RDONLY, true);
8016 	if (attrFD < 0)
8017 		return attrFD;
8018 
8019 	ssize_t bytesRead = _kern_read(attrFD, pos, buffer, readBytes);
8020 
8021 	_kern_close(attrFD);
8022 
8023 	return bytesRead;
8024 }
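
/*	A minimal sketch of reading a small attribute through the helper above
	(the attribute name merely serves as an example):

		char buffer[256];
		ssize_t bytesRead = fs_read_attr(fd, "BEOS:TYPE",
			B_MIME_STRING_TYPE, 0, buffer, sizeof(buffer));
		if (bytesRead < 0)
			;	// attribute missing or unreadable

	Note that the type argument is currently not validated against the
	attribute's actual type; the helper merely opens the attribute read-only,
	reads from the given offset, and closes it again.
*/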
8025 
8026 
8027 static status_t
8028 get_cwd(char* buffer, size_t size, bool kernel)
8029 {
8030 	// Get current working directory from io context
8031 	struct io_context* context = get_current_io_context(kernel);
8032 	status_t status;
8033 
8034 	FUNCTION(("get_cwd: buf %p, size %ld\n", buffer, size));
8035 
8036 	mutex_lock(&context->io_mutex);
8037 
8038 	struct vnode* vnode = context->cwd;
8039 	if (vnode)
8040 		inc_vnode_ref_count(vnode);
8041 
8042 	mutex_unlock(&context->io_mutex);
8043 
8044 	if (vnode) {
8045 		status = dir_vnode_to_path(vnode, buffer, size, kernel);
8046 		put_vnode(vnode);
8047 	} else
8048 		status = B_ERROR;
8049 
8050 	return status;
8051 }
8052 
8053 
8054 static status_t
8055 set_cwd(int fd, char* path, bool kernel)
8056 {
8057 	struct io_context* context;
8058 	struct vnode* vnode = NULL;
8059 	struct vnode* oldDirectory;
8060 	status_t status;
8061 
8062 	FUNCTION(("set_cwd: path = \'%s\'\n", path));
8063 
8064 	// Get vnode for passed path, and bail if it failed
8065 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
8066 	if (status < 0)
8067 		return status;
8068 
8069 	if (!S_ISDIR(vnode->Type())) {
8070 		// nope, can't cwd to here
8071 		status = B_NOT_A_DIRECTORY;
8072 		goto err;
8073 	}
8074 
8075 	// We need to have the permission to enter the directory, too
8076 	if (HAS_FS_CALL(vnode, access)) {
8077 		status = FS_CALL(vnode, access, X_OK);
8078 		if (status != B_OK)
8079 			goto err;
8080 	}
8081 
8082 	// Get current io context and lock
8083 	context = get_current_io_context(kernel);
8084 	mutex_lock(&context->io_mutex);
8085 
8086 	// save the old current working directory first
8087 	oldDirectory = context->cwd;
8088 	context->cwd = vnode;
8089 
8090 	mutex_unlock(&context->io_mutex);
8091 
8092 	if (oldDirectory)
8093 		put_vnode(oldDirectory);
8094 
8095 	return B_OK;
8096 
8097 err:
8098 	put_vnode(vnode);
8099 	return status;
8100 }
8101 
8102 
8103 //	#pragma mark - kernel mirrored syscalls
8104 
8105 
8106 dev_t
8107 _kern_mount(const char* path, const char* device, const char* fsName,
8108 	uint32 flags, const char* args, size_t argsLength)
8109 {
8110 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8111 	if (pathBuffer.InitCheck() != B_OK)
8112 		return B_NO_MEMORY;
8113 
8114 	return fs_mount(pathBuffer.LockBuffer(), device, fsName, flags, args, true);
8115 }
8116 
8117 
8118 status_t
8119 _kern_unmount(const char* path, uint32 flags)
8120 {
8121 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8122 	if (pathBuffer.InitCheck() != B_OK)
8123 		return B_NO_MEMORY;
8124 
8125 	return fs_unmount(pathBuffer.LockBuffer(), -1, flags, true);
8126 }
8127 
8128 
8129 status_t
8130 _kern_read_fs_info(dev_t device, struct fs_info* info)
8131 {
8132 	if (info == NULL)
8133 		return B_BAD_VALUE;
8134 
8135 	return fs_read_info(device, info);
8136 }
8137 
8138 
8139 status_t
8140 _kern_write_fs_info(dev_t device, const struct fs_info* info, int mask)
8141 {
8142 	if (info == NULL)
8143 		return B_BAD_VALUE;
8144 
8145 	return fs_write_info(device, info, mask);
8146 }
8147 
8148 
8149 status_t
8150 _kern_sync(void)
8151 {
8152 	// Note: _kern_sync() is also called from _user_sync()
8153 	int32 cookie = 0;
8154 	dev_t device;
8155 	while ((device = next_dev(&cookie)) >= 0) {
8156 		status_t status = fs_sync(device);
8157 		if (status != B_OK && status != B_BAD_VALUE) {
8158 			dprintf("sync: device %" B_PRIdDEV " couldn't sync: %s\n", device,
8159 				strerror(status));
8160 		}
8161 	}
8162 
8163 	return B_OK;
8164 }
8165 
8166 
8167 dev_t
8168 _kern_next_device(int32* _cookie)
8169 {
8170 	return fs_next_device(_cookie);
8171 }
8172 
8173 
8174 status_t
8175 _kern_get_next_fd_info(team_id teamID, uint32* _cookie, fd_info* info,
8176 	size_t infoSize)
8177 {
8178 	if (infoSize != sizeof(fd_info))
8179 		return B_BAD_VALUE;
8180 
8181 	// get the team
8182 	Team* team = Team::Get(teamID);
8183 	if (team == NULL)
8184 		return B_BAD_TEAM_ID;
8185 	BReference<Team> teamReference(team, true);
8186 
8187 	// now that we have a team reference, its I/O context won't go away
8188 	io_context* context = team->io_context;
8189 	MutexLocker contextLocker(context->io_mutex);
8190 
8191 	uint32 slot = *_cookie;
8192 
8193 	struct file_descriptor* descriptor;
8194 	while (slot < context->table_size
8195 		&& (descriptor = context->fds[slot]) == NULL) {
8196 		slot++;
8197 	}
8198 
8199 	if (slot >= context->table_size)
8200 		return B_ENTRY_NOT_FOUND;
8201 
8202 	info->number = slot;
8203 	info->open_mode = descriptor->open_mode;
8204 
8205 	struct vnode* vnode = fd_vnode(descriptor);
8206 	if (vnode != NULL) {
8207 		info->device = vnode->device;
8208 		info->node = vnode->id;
8209 	} else if (descriptor->u.mount != NULL) {
8210 		info->device = descriptor->u.mount->id;
8211 		info->node = -1;
8212 	}
8213 
8214 	*_cookie = slot + 1;
8215 	return B_OK;
8216 }
8217 
8218 
8219 int
8220 _kern_open_entry_ref(dev_t device, ino_t inode, const char* name, int openMode,
8221 	int perms)
8222 {
8223 	if ((openMode & O_CREAT) != 0) {
8224 		return file_create_entry_ref(device, inode, name, openMode, perms,
8225 			true);
8226 	}
8227 
8228 	return file_open_entry_ref(device, inode, name, openMode, true);
8229 }
8230 
8231 
8232 /*!	\brief Opens a node specified by a FD + path pair.
8233 
8234 	At least one of \a fd and \a path must be specified.
8235 	If only \a fd is given, the function opens the node identified by this
8236 	FD. If only a path is given, this path is opened. If both are given and
8237 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8238 	of the directory (!) identified by \a fd.
8239 
8240 	\param fd The FD. May be < 0.
8241 	\param path The absolute or relative path. May be \c NULL.
8242 	\param openMode The open mode.
8243 	\return A FD referring to the newly opened node, or an error code,
8244 			if an error occurs.
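
	For illustration, a minimal sketch of kernel-side usage; the paths are
	made up for the example:
	\code
	int dirFD = _kern_open_dir(-1, "/boot/home");
	if (dirFD >= 0) {
		// "settings/app" is resolved relative to dirFD
		int fd = _kern_open(dirFD, "settings/app", O_RDONLY, 0);
		if (fd >= 0)
			_kern_close(fd);
		_kern_close(dirFD);
	}
	\endcode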
8245 */
8246 int
8247 _kern_open(int fd, const char* path, int openMode, int perms)
8248 {
8249 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8250 	if (pathBuffer.InitCheck() != B_OK)
8251 		return B_NO_MEMORY;
8252 
8253 	if ((openMode & O_CREAT) != 0)
8254 		return file_create(fd, pathBuffer.LockBuffer(), openMode, perms, true);
8255 
8256 	return file_open(fd, pathBuffer.LockBuffer(), openMode, true);
8257 }
8258 
8259 
8260 /*!	\brief Opens a directory specified by entry_ref or node_ref.
8261 
8262 	The supplied name may be \c NULL, in which case directory identified
8263 	by \a device and \a inode will be opened. Otherwise \a device and
8264 	\a inode identify the parent directory of the directory to be opened
8265 	and \a name its entry name.
8266 
8267 	\param device If \a name is specified the ID of the device the parent
8268 		   directory of the directory to be opened resides on, otherwise
8269 		   the device of the directory itself.
8270 	\param inode If \a name is specified the node ID of the parent
8271 		   directory of the directory to be opened, otherwise node ID of the
8272 		   directory itself.
8273 	\param name The entry name of the directory to be opened. If \c NULL,
8274 		   the \a device + \a inode pair identify the node to be opened.
8275 	\return The FD of the newly opened directory or an error code, if
8276 			something went wrong.
8277 */
8278 int
8279 _kern_open_dir_entry_ref(dev_t device, ino_t inode, const char* name)
8280 {
8281 	return dir_open_entry_ref(device, inode, name, true);
8282 }
8283 
8284 
8285 /*!	\brief Opens a directory specified by a FD + path pair.
8286 
8287 	At least one of \a fd and \a path must be specified.
8288 	If only \a fd is given, the function opens the directory identified by this
8289 	FD. If only a path is given, this path is opened. If both are given and
8290 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8291 	of the directory (!) identified by \a fd.
8292 
8293 	\param fd The FD. May be < 0.
8294 	\param path The absolute or relative path. May be \c NULL.
8295 	\return A FD referring to the newly opened directory, or an error code,
8296 			if an error occurs.
8297 */
8298 int
8299 _kern_open_dir(int fd, const char* path)
8300 {
8301 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8302 	if (pathBuffer.InitCheck() != B_OK)
8303 		return B_NO_MEMORY;
8304 
8305 	return dir_open(fd, pathBuffer.LockBuffer(), true);
8306 }
8307 
8308 
8309 status_t
8310 _kern_fcntl(int fd, int op, size_t argument)
8311 {
8312 	return common_fcntl(fd, op, argument, true);
8313 }
8314 
8315 
8316 status_t
8317 _kern_fsync(int fd)
8318 {
8319 	return common_sync(fd, true);
8320 }
8321 
8322 
8323 status_t
8324 _kern_lock_node(int fd)
8325 {
8326 	return common_lock_node(fd, true);
8327 }
8328 
8329 
8330 status_t
8331 _kern_unlock_node(int fd)
8332 {
8333 	return common_unlock_node(fd, true);
8334 }
8335 
8336 
8337 status_t
8338 _kern_create_dir_entry_ref(dev_t device, ino_t inode, const char* name,
8339 	int perms)
8340 {
8341 	return dir_create_entry_ref(device, inode, name, perms, true);
8342 }
8343 
8344 
8345 /*!	\brief Creates a directory specified by a FD + path pair.
8346 
8347 	\a path must always be specified (it contains the name of the new directory
8348 	at least). If only a path is given, this path identifies the location at
8349 	which the directory shall be created. If both \a fd and \a path are given
8350 	and the path is absolute, \a fd is ignored; a relative path is reckoned off
8351 	of the directory (!) identified by \a fd.
8352 
8353 	\param fd The FD. May be < 0.
8354 	\param path The absolute or relative path. Must not be \c NULL.
8355 	\param perms The access permissions the new directory shall have.
8356 	\return \c B_OK, if the directory has been created successfully, another
8357 			error code otherwise.
8358 */
8359 status_t
8360 _kern_create_dir(int fd, const char* path, int perms)
8361 {
8362 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8363 	if (pathBuffer.InitCheck() != B_OK)
8364 		return B_NO_MEMORY;
8365 
8366 	return dir_create(fd, pathBuffer.LockBuffer(), perms, true);
8367 }
8368 
8369 
8370 status_t
8371 _kern_remove_dir(int fd, const char* path)
8372 {
8373 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8374 	if (pathBuffer.InitCheck() != B_OK)
8375 		return B_NO_MEMORY;
8376 
8377 	return dir_remove(fd, pathBuffer.LockBuffer(), true);
8378 }
8379 
8380 
8381 /*!	\brief Reads the contents of a symlink referred to by a FD + path pair.
8382 
8383 	At least one of \a fd and \a path must be specified.
8384 	If only \a fd is given, the symlink to be read is the node
8385 	identified by this FD. If only a path is given, this path identifies the
8386 	symlink to be read. If both are given and the path is absolute, \a fd is
8387 	ignored; a relative path is reckoned off of the directory (!) identified
8388 	by \a fd.
8389 	If this function fails with B_BUFFER_OVERFLOW, the \a _bufferSize pointer
8390 	will still be updated to reflect the required buffer size.
8391 
8392 	\param fd The FD. May be < 0.
8393 	\param path The absolute or relative path. May be \c NULL.
8394 	\param buffer The buffer into which the contents of the symlink shall be
8395 		   written.
8396 	\param _bufferSize A pointer to the size of the supplied buffer.
8397 	\return The length of the link on success or an appropriate error code
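
	A minimal sketch of the retry protocol on \c B_BUFFER_OVERFLOW (path and
	buffer size are made up for the example):
	\code
	char buffer[64];
	size_t size = sizeof(buffer);
	status_t error = _kern_read_link(-1, "/boot/home/link", buffer, &size);
	if (error == B_BUFFER_OVERFLOW) {
		// size now holds the required buffer size; allocate at least
		// that much and call again
	}
	\endcode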
8398 */
8399 status_t
8400 _kern_read_link(int fd, const char* path, char* buffer, size_t* _bufferSize)
8401 {
8402 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8403 	if (pathBuffer.InitCheck() != B_OK)
8404 		return B_NO_MEMORY;
8405 
8406 	return common_read_link(fd, pathBuffer.LockBuffer(),
8407 		buffer, _bufferSize, true);
8408 }
8409 
8410 
8411 /*!	\brief Creates a symlink specified by a FD + path pair.
8412 
8413 	\a path must always be specified (it contains the name of the new symlink
8414 	at least). If only a path is given, this path identifies the location at
8415 	which the symlink shall be created. If both \a fd and \a path are given and
8416 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8417 	of the directory (!) identified by \a fd.
8418 
8419 	\param fd The FD. May be < 0.
8420 	\param path The path of the symlink to be created. Must not be \c NULL.
	\param toPath The path the new symlink shall point to.
8421 	\param mode The access permissions the new symlink shall have.
8422 	\return \c B_OK, if the symlink has been created successfully, another
8423 			error code otherwise.
8424 */
8425 status_t
8426 _kern_create_symlink(int fd, const char* path, const char* toPath, int mode)
8427 {
8428 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8429 	if (pathBuffer.InitCheck() != B_OK)
8430 		return B_NO_MEMORY;
8431 
8432 	return common_create_symlink(fd, pathBuffer.LockBuffer(),
8433 		toPath, mode, true);
8434 }
8435 
8436 
8437 status_t
8438 _kern_create_link(int pathFD, const char* path, int toFD, const char* toPath,
8439 	bool traverseLeafLink)
8440 {
8441 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8442 	KPath toPathBuffer(toPath, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8443 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
8444 		return B_NO_MEMORY;
8445 
8446 	return common_create_link(pathFD, pathBuffer.LockBuffer(), toFD,
8447 		toPathBuffer.LockBuffer(), traverseLeafLink, true);
8448 }
8449 
8450 
8451 /*!	\brief Removes an entry specified by a FD + path pair from its directory.
8452 
8453 	\a path must always be specified (it contains at least the name of the entry
8454 	to be deleted). If only a path is given, this path identifies the entry
8455 	directly. If both \a fd and \a path are given and the path is absolute,
8456 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8457 	identified by \a fd.
8458 
8459 	\param fd The FD. May be < 0.
8460 	\param path The absolute or relative path. Must not be \c NULL.
8461 	\return \c B_OK, if the entry has been removed successfully, another
8462 			error code otherwise.
8463 */
8464 status_t
8465 _kern_unlink(int fd, const char* path)
8466 {
8467 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8468 	if (pathBuffer.InitCheck() != B_OK)
8469 		return B_NO_MEMORY;
8470 
8471 	return common_unlink(fd, pathBuffer.LockBuffer(), true);
8472 }
8473 
8474 
8475 /*!	\brief Moves an entry specified by a FD + path pair to an entry specified
8476 		   by another FD + path pair.
8477 
8478 	\a oldPath and \a newPath must always be specified (they contain at least
8479 	the name of the entry). If only a path is given, this path identifies the
8480 	entry directly. If both a FD and a path are given and the path is absolute,
8481 	the FD is ignored; a relative path is reckoned off of the directory (!)
8482 	identified by the respective FD.
8483 
8484 	\param oldFD The FD of the old location. May be < 0.
8485 	\param oldPath The absolute or relative path of the old location. Must not
8486 		   be \c NULL.
8487 	\param newFD The FD of the new location. May be < 0.
8488 	\param newPath The absolute or relative path of the new location. Must not
8489 		   be \c NULL.
8490 	\return \c B_OK, if the entry has been moved successfully, another
8491 			error code otherwise.
8492 */
8493 status_t
8494 _kern_rename(int oldFD, const char* oldPath, int newFD, const char* newPath)
8495 {
8496 	KPath oldPathBuffer(oldPath, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8497 	KPath newPathBuffer(newPath, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8498 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
8499 		return B_NO_MEMORY;
8500 
8501 	return common_rename(oldFD, oldPathBuffer.LockBuffer(),
8502 		newFD, newPathBuffer.LockBuffer(), true);
8503 }
8504 
8505 
8506 status_t
8507 _kern_access(int fd, const char* path, int mode, bool effectiveUserGroup)
8508 {
8509 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8510 	if (pathBuffer.InitCheck() != B_OK)
8511 		return B_NO_MEMORY;
8512 
8513 	return common_access(fd, pathBuffer.LockBuffer(), mode, effectiveUserGroup,
8514 		true);
8515 }
8516 
8517 
8518 /*!	\brief Reads stat data of an entity specified by a FD + path pair.
8519 
8520 	If only \a fd is given, the stat operation associated with the type
8521 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8522 	given, this path identifies the entry for whose node to retrieve the
8523 	stat data. If both \a fd and \a path are given and the path is absolute,
8524 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8525 	identified by \a fd and specifies the entry whose stat data shall be
8526 	retrieved.
8527 
8528 	\param fd The FD. May be < 0.
8529 	\param path The absolute or relative path. Must not be \c NULL.
8530 	\param traverseLeafLink If \a path is given, \c true specifies that the
8531 		   function shall not stick to symlinks, but traverse them.
8532 	\param stat The buffer the stat data shall be written into.
8533 	\param statSize The size of the supplied stat buffer.
8534 	\return \c B_OK, if the stat data have been read successfully, another
8535 			error code otherwise.
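
	For illustration, a sketch of the extension mechanism described above: a
	caller compiled against an older, smaller stat layout passes that size,
	and only \a statSize bytes are copied back (\c old_stat is hypothetical):
	\code
	struct old_stat oldStat;
	status_t error = _kern_read_stat(fd, NULL, false,
		(struct stat*)&oldStat, sizeof(oldStat));
	\endcode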
8536 */
8537 status_t
8538 _kern_read_stat(int fd, const char* path, bool traverseLeafLink,
8539 	struct stat* stat, size_t statSize)
8540 {
8541 	struct stat completeStat;
8542 	struct stat* originalStat = NULL;
8543 	status_t status;
8544 
8545 	if (statSize > sizeof(struct stat))
8546 		return B_BAD_VALUE;
8547 
8548 	// this supports different stat extensions
8549 	if (statSize < sizeof(struct stat)) {
8550 		originalStat = stat;
8551 		stat = &completeStat;
8552 	}
8553 
8554 	status = vfs_read_stat(fd, path, traverseLeafLink, stat, true);
8555 
8556 	if (status == B_OK && originalStat != NULL)
8557 		memcpy(originalStat, stat, statSize);
8558 
8559 	return status;
8560 }
8561 
8562 
8563 /*!	\brief Writes stat data of an entity specified by a FD + path pair.
8564 
8565 	If only \a fd is given, the stat operation associated with the type
8566 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8567 	given, this path identifies the entry for whose node to write the
8568 	stat data. If both \a fd and \a path are given and the path is absolute,
8569 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8570 	identified by \a fd and specifies the entry whose stat data shall be
8571 	written.
8572 
8573 	\param fd The FD. May be < 0.
8574 	\param path The absolute or relative path. May be \c NULL.
8575 	\param traverseLeafLink If \a path is given, \c true specifies that the
8576 		   function shall not stick to symlinks, but traverse them.
8577 	\param stat The buffer containing the stat data to be written.
8578 	\param statSize The size of the supplied stat buffer.
8579 	\param statMask A mask specifying which parts of the stat data shall be
8580 		   written.
8581 	\return \c B_OK, if the stat data have been written successfully,
8582 			another error code otherwise.
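
	For illustration, a sketch that updates only the file mode and leaves all
	other stat fields alone, using the \c B_STAT_MODE mask bit:
	\code
	struct stat stat;
	stat.st_mode = 0644;
	status_t error = _kern_write_stat(fd, NULL, false, &stat, sizeof(stat),
		B_STAT_MODE);
	\endcode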
8583 */
8584 status_t
8585 _kern_write_stat(int fd, const char* path, bool traverseLeafLink,
8586 	const struct stat* stat, size_t statSize, int statMask)
8587 {
8588 	struct stat completeStat;
8589 
8590 	if (statSize > sizeof(struct stat))
8591 		return B_BAD_VALUE;
8592 
8593 	// this supports different stat extensions
8594 	if (statSize < sizeof(struct stat)) {
8595 		memset((uint8*)&completeStat + statSize, 0,
8596 			sizeof(struct stat) - statSize);
8597 		memcpy(&completeStat, stat, statSize);
8598 		stat = &completeStat;
8599 	}
8600 
8601 	status_t status;
8602 
8603 	if (path != NULL) {
8604 		// path given: write the stat of the node referred to by (fd, path)
8605 		KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8606 		if (pathBuffer.InitCheck() != B_OK)
8607 			return B_NO_MEMORY;
8608 
8609 		status = common_path_write_stat(fd, pathBuffer.LockBuffer(),
8610 			traverseLeafLink, stat, statMask, true);
8611 	} else {
8612 		// no path given: get the FD and use the FD operation
8613 		struct file_descriptor* descriptor
8614 			= get_fd(get_current_io_context(true), fd);
8615 		if (descriptor == NULL)
8616 			return B_FILE_ERROR;
8617 
8618 		if (descriptor->ops->fd_write_stat)
8619 			status = descriptor->ops->fd_write_stat(descriptor, stat, statMask);
8620 		else
8621 			status = B_UNSUPPORTED;
8622 
8623 		put_fd(descriptor);
8624 	}
8625 
8626 	return status;
8627 }
8628 
8629 
8630 int
8631 _kern_open_attr_dir(int fd, const char* path, bool traverseLeafLink)
8632 {
8633 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8634 	if (pathBuffer.InitCheck() != B_OK)
8635 		return B_NO_MEMORY;
8636 
8637 	return attr_dir_open(fd, pathBuffer.LockBuffer(), traverseLeafLink, true);
8638 }
8639 
8640 
8641 int
8642 _kern_open_attr(int fd, const char* path, const char* name, uint32 type,
8643 	int openMode)
8644 {
8645 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8646 	if (pathBuffer.InitCheck() != B_OK)
8647 		return B_NO_MEMORY;
8648 
8649 	if ((openMode & O_CREAT) != 0) {
8650 		return attr_create(fd, pathBuffer.LockBuffer(), name, type, openMode,
8651 			true);
8652 	}
8653 
8654 	return attr_open(fd, pathBuffer.LockBuffer(), name, openMode, true);
8655 }
8656 
8657 
8658 status_t
8659 _kern_remove_attr(int fd, const char* name)
8660 {
8661 	return attr_remove(fd, name, true);
8662 }
8663 
8664 
8665 status_t
8666 _kern_rename_attr(int fromFile, const char* fromName, int toFile,
8667 	const char* toName)
8668 {
8669 	return attr_rename(fromFile, fromName, toFile, toName, true);
8670 }
8671 
8672 
8673 int
8674 _kern_open_index_dir(dev_t device)
8675 {
8676 	return index_dir_open(device, true);
8677 }
8678 
8679 
8680 status_t
8681 _kern_create_index(dev_t device, const char* name, uint32 type, uint32 flags)
8682 {
8683 	return index_create(device, name, type, flags, true);
8684 }
8685 
8686 
8687 status_t
8688 _kern_read_index_stat(dev_t device, const char* name, struct stat* stat)
8689 {
8690 	return index_name_read_stat(device, name, stat, true);
8691 }
8692 
8693 
8694 status_t
8695 _kern_remove_index(dev_t device, const char* name)
8696 {
8697 	return index_remove(device, name, true);
8698 }
8699 
8700 
8701 status_t
8702 _kern_getcwd(char* buffer, size_t size)
8703 {
8704 	TRACE(("_kern_getcwd: buf %p, %ld\n", buffer, size));
8705 
8706 	// Call vfs to get current working directory
8707 	return get_cwd(buffer, size, true);
8708 }
8709 
8710 
8711 status_t
8712 _kern_setcwd(int fd, const char* path)
8713 {
8714 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8715 	if (pathBuffer.InitCheck() != B_OK)
8716 		return B_NO_MEMORY;
8717 
8718 	return set_cwd(fd, pathBuffer.LockBuffer(), true);
8719 }
8720 
8721 
8722 //	#pragma mark - userland syscalls
8723 
8724 
8725 dev_t
8726 _user_mount(const char* userPath, const char* userDevice,
8727 	const char* userFileSystem, uint32 flags, const char* userArgs,
8728 	size_t argsLength)
8729 {
8730 	char fileSystem[B_FILE_NAME_LENGTH];
8731 	KPath path, device;
8732 	char* args = NULL;
8733 	status_t status;
8734 
8735 	if (!IS_USER_ADDRESS(userPath)
8736 		|| !IS_USER_ADDRESS(userFileSystem)
8737 		|| !IS_USER_ADDRESS(userDevice))
8738 		return B_BAD_ADDRESS;
8739 
8740 	if (path.InitCheck() != B_OK || device.InitCheck() != B_OK)
8741 		return B_NO_MEMORY;
8742 
8743 	if (user_strlcpy(path.LockBuffer(), userPath, B_PATH_NAME_LENGTH) < B_OK)
8744 		return B_BAD_ADDRESS;
8745 
8746 	if (userFileSystem != NULL
8747 		&& user_strlcpy(fileSystem, userFileSystem, sizeof(fileSystem)) < B_OK)
8748 		return B_BAD_ADDRESS;
8749 
8750 	if (userDevice != NULL
8751 		&& user_strlcpy(device.LockBuffer(), userDevice, B_PATH_NAME_LENGTH)
8752 			< B_OK)
8753 		return B_BAD_ADDRESS;
8754 
8755 	if (userArgs != NULL && argsLength > 0) {
8756 		if (!IS_USER_ADDRESS(userArgs))
8757 			return B_BAD_ADDRESS;
8758 
8759 		// this is a safety restriction
8760 		if (argsLength >= 65536)
8761 			return B_NAME_TOO_LONG;
8762 
8763 		args = (char*)malloc(argsLength + 1);
8764 		if (args == NULL)
8765 			return B_NO_MEMORY;
8766 
8767 		if (user_strlcpy(args, userArgs, argsLength + 1) < B_OK) {
8768 			free(args);
8769 			return B_BAD_ADDRESS;
8770 		}
8771 	}
8772 	path.UnlockBuffer();
8773 	device.UnlockBuffer();
8774 
8775 	status = fs_mount(path.LockBuffer(),
8776 		userDevice != NULL ? device.Path() : NULL,
8777 		userFileSystem ? fileSystem : NULL, flags, args, false);
8778 
8779 	free(args);
8780 	return status;
8781 }
8782 
8783 
8784 status_t
8785 _user_unmount(const char* userPath, uint32 flags)
8786 {
8787 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
8788 
8789 	if (!IS_USER_ADDRESS(userPath))
8790 		return B_BAD_ADDRESS;
8791 
8792 	if (pathBuffer.InitCheck() != B_OK)
8793 		return B_NO_MEMORY;
8794 
8795 	char* path = pathBuffer.LockBuffer();
8796 
8797 	if (user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
8798 		return B_BAD_ADDRESS;
8799 
8800 	return fs_unmount(path, -1, flags & ~B_UNMOUNT_BUSY_PARTITION, false);
8801 }
8802 
8803 
8804 status_t
8805 _user_read_fs_info(dev_t device, struct fs_info* userInfo)
8806 {
8807 	struct fs_info info;
8808 	status_t status;
8809 
8810 	if (userInfo == NULL)
8811 		return B_BAD_VALUE;
8812 
8813 	if (!IS_USER_ADDRESS(userInfo))
8814 		return B_BAD_ADDRESS;
8815 
8816 	status = fs_read_info(device, &info);
8817 	if (status != B_OK)
8818 		return status;
8819 
8820 	if (user_memcpy(userInfo, &info, sizeof(struct fs_info)) != B_OK)
8821 		return B_BAD_ADDRESS;
8822 
8823 	return B_OK;
8824 }
8825 
8826 
8827 status_t
8828 _user_write_fs_info(dev_t device, const struct fs_info* userInfo, int mask)
8829 {
8830 	struct fs_info info;
8831 
8832 	if (userInfo == NULL)
8833 		return B_BAD_VALUE;
8834 
8835 	if (!IS_USER_ADDRESS(userInfo)
8836 		|| user_memcpy(&info, userInfo, sizeof(struct fs_info)) != B_OK)
8837 		return B_BAD_ADDRESS;
8838 
8839 	return fs_write_info(device, &info, mask);
8840 }
8841 
8842 
8843 dev_t
8844 _user_next_device(int32* _userCookie)
8845 {
8846 	int32 cookie;
8847 	dev_t device;
8848 
8849 	if (!IS_USER_ADDRESS(_userCookie)
8850 		|| user_memcpy(&cookie, _userCookie, sizeof(int32)) != B_OK)
8851 		return B_BAD_ADDRESS;
8852 
8853 	device = fs_next_device(&cookie);
8854 
8855 	if (device >= B_OK) {
8856 		// update user cookie
8857 		if (user_memcpy(_userCookie, &cookie, sizeof(int32)) != B_OK)
8858 			return B_BAD_ADDRESS;
8859 	}
8860 
8861 	return device;
8862 }
8863 
8864 
8865 status_t
8866 _user_sync(void)
8867 {
8868 	return _kern_sync();
8869 }
8870 
8871 
8872 status_t
8873 _user_get_next_fd_info(team_id team, uint32* userCookie, fd_info* userInfo,
8874 	size_t infoSize)
8875 {
8876 	struct fd_info info;
8877 	uint32 cookie;
8878 
8879 	// only root can do this (or should root's group be enough?)
8880 	if (geteuid() != 0)
8881 		return B_NOT_ALLOWED;
8882 
8883 	if (infoSize != sizeof(fd_info))
8884 		return B_BAD_VALUE;
8885 
8886 	if (!IS_USER_ADDRESS(userCookie) || !IS_USER_ADDRESS(userInfo)
8887 		|| user_memcpy(&cookie, userCookie, sizeof(uint32)) != B_OK)
8888 		return B_BAD_ADDRESS;
8889 
8890 	status_t status = _kern_get_next_fd_info(team, &cookie, &info, infoSize);
8891 	if (status != B_OK)
8892 		return status;
8893 
8894 	if (user_memcpy(userCookie, &cookie, sizeof(uint32)) != B_OK
8895 		|| user_memcpy(userInfo, &info, infoSize) != B_OK)
8896 		return B_BAD_ADDRESS;
8897 
8898 	return status;
8899 }
8900 
8901 
8902 status_t
8903 _user_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
8904 	char* userPath, size_t pathLength)
8905 {
8906 	if (!IS_USER_ADDRESS(userPath))
8907 		return B_BAD_ADDRESS;
8908 
8909 	KPath path(B_PATH_NAME_LENGTH + 1);
8910 	if (path.InitCheck() != B_OK)
8911 		return B_NO_MEMORY;
8912 
8913 	// copy the leaf name onto the stack
8914 	char stackLeaf[B_FILE_NAME_LENGTH];
8915 	if (leaf != NULL) {
8916 		if (!IS_USER_ADDRESS(leaf))
8917 			return B_BAD_ADDRESS;
8918 
8919 		int length = user_strlcpy(stackLeaf, leaf, B_FILE_NAME_LENGTH);
8920 		if (length < 0)
8921 			return length;
8922 		if (length >= B_FILE_NAME_LENGTH)
8923 			return B_NAME_TOO_LONG;
8924 
8925 		leaf = stackLeaf;
8926 	}
8927 
8928 	status_t status = vfs_entry_ref_to_path(device, inode, leaf,
8929 		false, path.LockBuffer(), path.BufferSize());
8930 	if (status != B_OK)
8931 		return status;
8932 
8933 	path.UnlockBuffer();
8934 
8935 	int length = user_strlcpy(userPath, path.Path(), pathLength);
8936 	if (length < 0)
8937 		return length;
8938 	if (length >= (int)pathLength)
8939 		return B_BUFFER_OVERFLOW;
8940 
8941 	return B_OK;
8942 }
8943 
8944 
8945 status_t
8946 _user_normalize_path(const char* userPath, bool traverseLink, char* buffer)
8947 {
8948 	if (userPath == NULL || buffer == NULL)
8949 		return B_BAD_VALUE;
8950 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(buffer))
8951 		return B_BAD_ADDRESS;
8952 
8953 	// copy path from userland
8954 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
8955 	if (pathBuffer.InitCheck() != B_OK)
8956 		return B_NO_MEMORY;
8957 	char* path = pathBuffer.LockBuffer();
8958 
8959 	if (user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
8960 		return B_BAD_ADDRESS;
8961 
8962 	status_t error = normalize_path(path, pathBuffer.BufferSize(), traverseLink,
8963 		false);
8964 	if (error != B_OK)
8965 		return error;
8966 
8967 	// copy back to userland
8968 	int len = user_strlcpy(buffer, path, B_PATH_NAME_LENGTH);
8969 	if (len < 0)
8970 		return len;
8971 	if (len >= B_PATH_NAME_LENGTH)
8972 		return B_BUFFER_OVERFLOW;
8973 
8974 	return B_OK;
8975 }
8976 
8977 
8978 int
8979 _user_open_entry_ref(dev_t device, ino_t inode, const char* userName,
8980 	int openMode, int perms)
8981 {
8982 	char name[B_FILE_NAME_LENGTH];
8983 
8984 	if (userName == NULL || device < 0 || inode < 0)
8985 		return B_BAD_VALUE;
8986 	if (!IS_USER_ADDRESS(userName)
8987 		|| user_strlcpy(name, userName, sizeof(name)) < B_OK)
8988 		return B_BAD_ADDRESS;
8989 
8990 	if ((openMode & O_CREAT) != 0) {
8991 		return file_create_entry_ref(device, inode, name, openMode, perms,
8992 			false);
8993 	}
8994 
8995 	return file_open_entry_ref(device, inode, name, openMode, false);
8996 }
8997 
8998 
8999 int
9000 _user_open(int fd, const char* userPath, int openMode, int perms)
9001 {
9002 	KPath path(B_PATH_NAME_LENGTH + 1);
9003 	if (path.InitCheck() != B_OK)
9004 		return B_NO_MEMORY;
9005 
9006 	char* buffer = path.LockBuffer();
9007 
9008 	if (!IS_USER_ADDRESS(userPath)
9009 		|| user_strlcpy(buffer, userPath, B_PATH_NAME_LENGTH) < B_OK)
9010 		return B_BAD_ADDRESS;
9011 
9012 	if ((openMode & O_CREAT) != 0)
9013 		return file_create(fd, buffer, openMode, perms, false);
9014 
9015 	return file_open(fd, buffer, openMode, false);
9016 }
9017 
9018 
9019 int
9020 _user_open_dir_entry_ref(dev_t device, ino_t inode, const char* userName)
9021 {
9022 	if (userName != NULL) {
9023 		char name[B_FILE_NAME_LENGTH];
9024 
9025 		if (!IS_USER_ADDRESS(userName)
9026 			|| user_strlcpy(name, userName, sizeof(name)) < B_OK)
9027 			return B_BAD_ADDRESS;
9028 
9029 		return dir_open_entry_ref(device, inode, name, false);
9030 	}
9031 	return dir_open_entry_ref(device, inode, NULL, false);
9032 }
9033 
9034 
9035 int
9036 _user_open_dir(int fd, const char* userPath)
9037 {
9038 	if (userPath == NULL)
9039 		return dir_open(fd, NULL, false);
9040 
9041 	KPath path(B_PATH_NAME_LENGTH + 1);
9042 	if (path.InitCheck() != B_OK)
9043 		return B_NO_MEMORY;
9044 
9045 	char* buffer = path.LockBuffer();
9046 
9047 	if (!IS_USER_ADDRESS(userPath)
9048 		|| user_strlcpy(buffer, userPath, B_PATH_NAME_LENGTH) < B_OK)
9049 		return B_BAD_ADDRESS;
9050 
9051 	return dir_open(fd, buffer, false);
9052 }
9053 
9054 
9055 /*!	\brief Opens a directory's parent directory and returns the entry name
9056 		   of the former.
9057 
9058 	Aside from also returning the directory's entry name, this method is
9059 	equivalent to \code _user_open_dir(fd, "..") \endcode. It really is
9060 	equivalent if \a userName is \c NULL.
9061 
9062 	If a name buffer is supplied and the name does not fit the buffer, the
9063 	function fails. A buffer of size \c B_FILE_NAME_LENGTH should be safe.
9064 
9065 	\param fd A FD referring to a directory.
9066 	\param userName Buffer the directory's entry name shall be written into.
9067 		   May be \c NULL.
9068 	\param nameLength Size of the name buffer.
9069 	\return The file descriptor of the opened parent directory, if everything
9070 			went fine, an error code otherwise.
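
	A sketch of typical use, walking one level up while retrieving the entry
	name (error handling elided; real userland code goes through the libroot
	wrapper rather than calling the syscall directly):
	\code
	char name[B_FILE_NAME_LENGTH];
	int parentFD = _user_open_parent_dir(dirFD, name, sizeof(name));
	if (parentFD >= 0) {
		// name now holds dirFD's entry name within parentFD
	}
	\endcode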
9071 */
9072 int
9073 _user_open_parent_dir(int fd, char* userName, size_t nameLength)
9074 {
9075 	bool kernel = false;
9076 
9077 	if (userName && !IS_USER_ADDRESS(userName))
9078 		return B_BAD_ADDRESS;
9079 
9080 	// open the parent dir
9081 	int parentFD = dir_open(fd, (char*)"..", kernel);
9082 	if (parentFD < 0)
9083 		return parentFD;
9084 	FDCloser fdCloser(parentFD, kernel);
9085 
9086 	if (userName) {
9087 		// get the vnodes
9088 		struct vnode* parentVNode = get_vnode_from_fd(parentFD, kernel);
9089 		struct vnode* dirVNode = get_vnode_from_fd(fd, kernel);
9090 		VNodePutter parentVNodePutter(parentVNode);
9091 		VNodePutter dirVNodePutter(dirVNode);
9092 		if (!parentVNode || !dirVNode)
9093 			return B_FILE_ERROR;
9094 
9095 		// get the vnode name
9096 		char _buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
9097 		struct dirent* buffer = (struct dirent*)_buffer;
9098 		status_t status = get_vnode_name(dirVNode, parentVNode, buffer,
9099 			sizeof(_buffer), get_current_io_context(false));
9100 		if (status != B_OK)
9101 			return status;
9102 
9103 		// copy the name to the userland buffer
9104 		int len = user_strlcpy(userName, buffer->d_name, nameLength);
9105 		if (len < 0)
9106 			return len;
9107 		if (len >= (int)nameLength)
9108 			return B_BUFFER_OVERFLOW;
9109 	}
9110 
9111 	return fdCloser.Detach();
9112 }
9113 
9114 
9115 status_t
9116 _user_fcntl(int fd, int op, size_t argument)
9117 {
9118 	status_t status = common_fcntl(fd, op, argument, false);
9119 	if (op == F_SETLKW)
9120 		syscall_restart_handle_post(status);
9121 
9122 	return status;
9123 }
9124 
9125 
9126 status_t
9127 _user_fsync(int fd)
9128 {
9129 	return common_sync(fd, false);
9130 }
9131 
9132 
9133 status_t
9134 _user_flock(int fd, int operation)
9135 {
9136 	FUNCTION(("_user_flock(fd = %d, op = %d)\n", fd, operation));
9137 
9138 	// Check if the operation is valid
9139 	switch (operation & ~LOCK_NB) {
9140 		case LOCK_UN:
9141 		case LOCK_SH:
9142 		case LOCK_EX:
9143 			break;
9144 
9145 		default:
9146 			return B_BAD_VALUE;
9147 	}
9148 
9149 	struct file_descriptor* descriptor;
9150 	struct vnode* vnode;
9151 	descriptor = get_fd_and_vnode(fd, &vnode, false);
9152 	if (descriptor == NULL)
9153 		return B_FILE_ERROR;
9154 
9155 	if (descriptor->type != FDTYPE_FILE) {
9156 		put_fd(descriptor);
9157 		return B_BAD_VALUE;
9158 	}
9159 
9160 	struct flock flock;
9161 	flock.l_start = 0;
9162 	flock.l_len = OFF_MAX;
9163 	flock.l_whence = 0;
9164 	flock.l_type = (operation & LOCK_SH) != 0 ? F_RDLCK : F_WRLCK;
9165 
9166 	status_t status;
9167 	if ((operation & LOCK_UN) != 0) {
9168 		if (HAS_FS_CALL(vnode, release_lock))
9169 			status = FS_CALL(vnode, release_lock, descriptor->cookie, &flock);
9170 		else
9171 			status = release_advisory_lock(vnode, NULL, descriptor, &flock);
9172 	} else {
9173 		if (HAS_FS_CALL(vnode, acquire_lock)) {
9174 			status = FS_CALL(vnode, acquire_lock, descriptor->cookie, &flock,
9175 				(operation & LOCK_NB) == 0);
9176 		} else {
9177 			status = acquire_advisory_lock(vnode, NULL, descriptor, &flock,
9178 				(operation & LOCK_NB) == 0);
9179 		}
9180 	}
9181 
9182 	syscall_restart_handle_post(status);
9183 
9184 	put_fd(descriptor);
9185 	return status;
9186 }
9187 
9188 
9189 status_t
9190 _user_lock_node(int fd)
9191 {
9192 	return common_lock_node(fd, false);
9193 }
9194 
9195 
9196 status_t
9197 _user_unlock_node(int fd)
9198 {
9199 	return common_unlock_node(fd, false);
9200 }
9201 
9202 
9203 status_t
9204 _user_create_dir_entry_ref(dev_t device, ino_t inode, const char* userName,
9205 	int perms)
9206 {
9207 	char name[B_FILE_NAME_LENGTH];
9208 	status_t status;
9209 
9210 	if (!IS_USER_ADDRESS(userName))
9211 		return B_BAD_ADDRESS;
9212 
9213 	status = user_strlcpy(name, userName, sizeof(name));
9214 	if (status < 0)
9215 		return status;
9216 
9217 	return dir_create_entry_ref(device, inode, name, perms, false);
9218 }
9219 
9220 
9221 status_t
9222 _user_create_dir(int fd, const char* userPath, int perms)
9223 {
9224 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9225 	if (pathBuffer.InitCheck() != B_OK)
9226 		return B_NO_MEMORY;
9227 
9228 	char* path = pathBuffer.LockBuffer();
9229 
9230 	if (!IS_USER_ADDRESS(userPath)
9231 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9232 		return B_BAD_ADDRESS;
9233 
9234 	return dir_create(fd, path, perms, false);
9235 }
9236 
9237 
9238 status_t
9239 _user_remove_dir(int fd, const char* userPath)
9240 {
9241 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9242 	if (pathBuffer.InitCheck() != B_OK)
9243 		return B_NO_MEMORY;
9244 
9245 	char* path = pathBuffer.LockBuffer();
9246 
9247 	if (userPath != NULL) {
9248 		if (!IS_USER_ADDRESS(userPath)
9249 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9250 			return B_BAD_ADDRESS;
9251 	}
9252 
9253 	return dir_remove(fd, userPath ? path : NULL, false);
9254 }
9255 
9256 
9257 status_t
9258 _user_read_link(int fd, const char* userPath, char* userBuffer,
9259 	size_t* userBufferSize)
9260 {
9261 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1), linkBuffer;
9262 	if (pathBuffer.InitCheck() != B_OK || linkBuffer.InitCheck() != B_OK)
9263 		return B_NO_MEMORY;
9264 
9265 	size_t bufferSize;
9266 
9267 	if (!IS_USER_ADDRESS(userBuffer) || !IS_USER_ADDRESS(userBufferSize)
9268 		|| user_memcpy(&bufferSize, userBufferSize, sizeof(size_t)) != B_OK)
9269 		return B_BAD_ADDRESS;
9270 
9271 	char* path = pathBuffer.LockBuffer();
9272 	char* buffer = linkBuffer.LockBuffer();
9273 
9274 	if (userPath) {
9275 		if (!IS_USER_ADDRESS(userPath)
9276 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9277 			return B_BAD_ADDRESS;
9278 
9279 		if (bufferSize > B_PATH_NAME_LENGTH)
9280 			bufferSize = B_PATH_NAME_LENGTH;
9281 	}
9282 
9283 	status_t status = common_read_link(fd, userPath ? path : NULL, buffer,
9284 		&bufferSize, false);
9285 
9286 	// we also update the bufferSize in case of errors
9287 	// (the real length will be returned in case of B_BUFFER_OVERFLOW)
9288 	if (user_memcpy(userBufferSize, &bufferSize, sizeof(size_t)) != B_OK)
9289 		return B_BAD_ADDRESS;
9290 
9291 	if (status != B_OK)
9292 		return status;
9293 
9294 	if (user_memcpy(userBuffer, buffer, bufferSize) != B_OK)
9295 		return B_BAD_ADDRESS;
9296 
9297 	return B_OK;
9298 }
9299 
9300 
9301 status_t
9302 _user_create_symlink(int fd, const char* userPath, const char* userToPath,
9303 	int mode)
9304 {
9305 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9306 	KPath toPathBuffer(B_PATH_NAME_LENGTH + 1);
9307 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9308 		return B_NO_MEMORY;
9309 
9310 	char* path = pathBuffer.LockBuffer();
9311 	char* toPath = toPathBuffer.LockBuffer();
9312 
9313 	if (!IS_USER_ADDRESS(userPath)
9314 		|| !IS_USER_ADDRESS(userToPath)
9315 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK
9316 		|| user_strlcpy(toPath, userToPath, B_PATH_NAME_LENGTH) < B_OK)
9317 		return B_BAD_ADDRESS;
9318 
9319 	return common_create_symlink(fd, path, toPath, mode, false);
9320 }
9321 
9322 
9323 status_t
9324 _user_create_link(int pathFD, const char* userPath, int toFD,
9325 	const char* userToPath, bool traverseLeafLink)
9326 {
9327 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9328 	KPath toPathBuffer(B_PATH_NAME_LENGTH + 1);
9329 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9330 		return B_NO_MEMORY;
9331 
9332 	char* path = pathBuffer.LockBuffer();
9333 	char* toPath = toPathBuffer.LockBuffer();
9334 
9335 	if (!IS_USER_ADDRESS(userPath)
9336 		|| !IS_USER_ADDRESS(userToPath)
9337 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK
9338 		|| user_strlcpy(toPath, userToPath, B_PATH_NAME_LENGTH) < B_OK)
9339 		return B_BAD_ADDRESS;
9340 
9341 	status_t status = check_path(toPath);
9342 	if (status != B_OK)
9343 		return status;
9344 
9345 	return common_create_link(pathFD, path, toFD, toPath, traverseLeafLink,
9346 		false);
9347 }
9348 
9349 
9350 status_t
9351 _user_unlink(int fd, const char* userPath)
9352 {
9353 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9354 	if (pathBuffer.InitCheck() != B_OK)
9355 		return B_NO_MEMORY;
9356 
9357 	char* path = pathBuffer.LockBuffer();
9358 
9359 	if (!IS_USER_ADDRESS(userPath)
9360 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9361 		return B_BAD_ADDRESS;
9362 
9363 	return common_unlink(fd, path, false);
9364 }
9365 
9366 
9367 status_t
9368 _user_rename(int oldFD, const char* userOldPath, int newFD,
9369 	const char* userNewPath)
9370 {
9371 	KPath oldPathBuffer(B_PATH_NAME_LENGTH + 1);
9372 	KPath newPathBuffer(B_PATH_NAME_LENGTH + 1);
9373 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
9374 		return B_NO_MEMORY;
9375 
9376 	char* oldPath = oldPathBuffer.LockBuffer();
9377 	char* newPath = newPathBuffer.LockBuffer();
9378 
9379 	if (!IS_USER_ADDRESS(userOldPath) || !IS_USER_ADDRESS(userNewPath)
9380 		|| user_strlcpy(oldPath, userOldPath, B_PATH_NAME_LENGTH) < B_OK
9381 		|| user_strlcpy(newPath, userNewPath, B_PATH_NAME_LENGTH) < B_OK)
9382 		return B_BAD_ADDRESS;
9383 
9384 	return common_rename(oldFD, oldPath, newFD, newPath, false);
9385 }
9386 
9387 
9388 status_t
9389 _user_create_fifo(int fd, const char* userPath, mode_t perms)
9390 {
9391 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9392 	if (pathBuffer.InitCheck() != B_OK)
9393 		return B_NO_MEMORY;
9394 
9395 	char* path = pathBuffer.LockBuffer();
9396 
9397 	if (!IS_USER_ADDRESS(userPath)
9398 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK) {
9399 		return B_BAD_ADDRESS;
9400 	}
9401 
9402 	// split into directory vnode and filename path
9403 	char filename[B_FILE_NAME_LENGTH];
9404 	struct vnode* dir;
9405 	status_t status = fd_and_path_to_dir_vnode(fd, path, &dir, filename, false);
9406 	if (status != B_OK)
9407 		return status;
9408 
9409 	VNodePutter _(dir);
9410 
9411 	// the underlying FS needs to support creating FIFOs
9412 	if (!HAS_FS_CALL(dir, create_special_node))
9413 		return B_UNSUPPORTED;
9414 
9415 	// create the entry	-- the FIFO sub node is set up automatically
9416 	fs_vnode superVnode;
9417 	ino_t nodeID;
9418 	status = FS_CALL(dir, create_special_node, filename, NULL,
9419 		S_IFIFO | (perms & S_IUMSK), 0, &superVnode, &nodeID);
9420 
9421 	// create_special_node() acquired a reference for us that we don't need.
9422 	if (status == B_OK)
9423 		put_vnode(dir->mount->volume, nodeID);
9424 
9425 	return status;
9426 }
9427 
9428 
9429 status_t
9430 _user_create_pipe(int* userFDs)
9431 {
9432 	// rootfs should support creating FIFOs, but let's be sure
9433 	if (!HAS_FS_CALL(sRoot, create_special_node))
9434 		return B_UNSUPPORTED;
9435 
9436 	// create the node	-- the FIFO sub node is set up automatically
9437 	fs_vnode superVnode;
9438 	ino_t nodeID;
9439 	status_t status = FS_CALL(sRoot, create_special_node, NULL, NULL,
9440 		S_IFIFO | S_IRUSR | S_IWUSR, 0, &superVnode, &nodeID);
9441 	if (status != B_OK)
9442 		return status;
9443 
9444 	// We've got one reference to the node and need another one.
9445 	struct vnode* vnode;
9446 	status = get_vnode(sRoot->mount->id, nodeID, &vnode, true, false);
9447 	if (status != B_OK) {
9448 		// this should not happen
9449 		dprintf("_user_create_pipe(): Failed to lookup vnode (%" B_PRIdDEV ", "
9450 			"%" B_PRIdINO ")\n", sRoot->mount->id, nodeID);
9451 		return status;
9452 	}
9453 
9454 	// Everything looks good so far. Open two FDs, one for reading and one
9455 	// for writing.
9456 	int fds[2];
9457 	fds[0] = open_vnode(vnode, O_RDONLY, false);
9458 	fds[1] = open_vnode(vnode, O_WRONLY, false);
9459 
9460 	FDCloser closer0(fds[0], false);
9461 	FDCloser closer1(fds[1], false);
9462 
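	// If either open failed, propagate the error code of the first failing
	// open_vnode(); otherwise all went well.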
9463 	status = (fds[0] >= 0 ? (fds[1] >= 0 ? B_OK : fds[1]) : fds[0]);
9464 
9465 	// copy FDs to userland
9466 	if (status == B_OK) {
9467 		if (!IS_USER_ADDRESS(userFDs)
9468 			|| user_memcpy(userFDs, fds, sizeof(fds)) != B_OK) {
9469 			status = B_BAD_ADDRESS;
9470 		}
9471 	}
9472 
9473 	// keep the FDs if everything went fine
9474 	if (status == B_OK) {
9475 		closer0.Detach();
9476 		closer1.Detach();
9477 	}
9478 
9479 	return status;
9480 }
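

/* Illustrative sketch, not part of this file: the POSIX pipe() wrapper is
 * expected to reach this syscall, receiving both FDs in a single copy-out:
 *
 *	#include <unistd.h>
 *
 *	int fds[2];
 *	if (pipe(fds) == 0) {
 *		write(fds[1], "x", 1);	// fds[1] was opened O_WRONLY
 *		char c;
 *		read(fds[0], &c, 1);	// fds[0] was opened O_RDONLY
 *	}
 */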
9481 
9482 
9483 status_t
9484 _user_access(int fd, const char* userPath, int mode, bool effectiveUserGroup)
9485 {
9486 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9487 	if (pathBuffer.InitCheck() != B_OK)
9488 		return B_NO_MEMORY;
9489 
9490 	char* path = pathBuffer.LockBuffer();
9491 
9492 	if (!IS_USER_ADDRESS(userPath)
9493 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9494 		return B_BAD_ADDRESS;
9495 
9496 	return common_access(fd, path, mode, effectiveUserGroup, false);
9497 }
9498 
9499 
9500 status_t
9501 _user_read_stat(int fd, const char* userPath, bool traverseLink,
9502 	struct stat* userStat, size_t statSize)
9503 {
9504 	struct stat stat;
9505 	status_t status;
9506 
9507 	if (statSize > sizeof(struct stat))
9508 		return B_BAD_VALUE;
9509 
9510 	if (!IS_USER_ADDRESS(userStat))
9511 		return B_BAD_ADDRESS;
9512 
9513 	if (userPath != NULL) {
9514 		// path given: get the stat of the node referred to by (fd, path)
9515 		if (!IS_USER_ADDRESS(userPath))
9516 			return B_BAD_ADDRESS;
9517 
9518 		KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9519 		if (pathBuffer.InitCheck() != B_OK)
9520 			return B_NO_MEMORY;
9521 
9522 		char* path = pathBuffer.LockBuffer();
9523 
9524 		ssize_t length = user_strlcpy(path, userPath, B_PATH_NAME_LENGTH);
9525 		if (length < B_OK)
9526 			return length;
9527 		if (length >= B_PATH_NAME_LENGTH)
9528 			return B_NAME_TOO_LONG;
9529 
9530 		status = common_path_read_stat(fd, path, traverseLink, &stat, false);
9531 	} else {
9532 		// no path given: get the FD and use the FD operation
9533 		struct file_descriptor* descriptor
9534 			= get_fd(get_current_io_context(false), fd);
9535 		if (descriptor == NULL)
9536 			return B_FILE_ERROR;
9537 
9538 		if (descriptor->ops->fd_read_stat)
9539 			status = descriptor->ops->fd_read_stat(descriptor, &stat);
9540 		else
9541 			status = B_UNSUPPORTED;
9542 
9543 		put_fd(descriptor);
9544 	}
9545 
9546 	if (status != B_OK)
9547 		return status;
9548 
9549 	return user_memcpy(userStat, &stat, statSize);
9550 }
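

/* Illustrative sketch, not part of this file: callers pass the size of their
 * struct stat, so binaries built against an older, smaller struct still get
 * a valid (partial) copy. Assuming the matching _kern_read_stat() stub:
 *
 *	struct stat st;
 *	status_t error = _kern_read_stat(fd, NULL, false, &st, sizeof(st));
 */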
9551 
9552 
9553 status_t
9554 _user_write_stat(int fd, const char* userPath, bool traverseLeafLink,
9555 	const struct stat* userStat, size_t statSize, int statMask)
9556 {
9557 	if (statSize > sizeof(struct stat))
9558 		return B_BAD_VALUE;
9559 
9560 	struct stat stat;
9561 
9562 	if (!IS_USER_ADDRESS(userStat)
9563 		|| user_memcpy(&stat, userStat, statSize) < B_OK)
9564 		return B_BAD_ADDRESS;
9565 
9566 	// clear additional stat fields
9567 	if (statSize < sizeof(struct stat))
9568 		memset((uint8*)&stat + statSize, 0, sizeof(struct stat) - statSize);
9569 
9570 	status_t status;
9571 
9572 	if (userPath != NULL) {
9573 		// path given: write the stat of the node referred to by (fd, path)
9574 		if (!IS_USER_ADDRESS(userPath))
9575 			return B_BAD_ADDRESS;
9576 
9577 		KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9578 		if (pathBuffer.InitCheck() != B_OK)
9579 			return B_NO_MEMORY;
9580 
9581 		char* path = pathBuffer.LockBuffer();
9582 
9583 		ssize_t length = user_strlcpy(path, userPath, B_PATH_NAME_LENGTH);
9584 		if (length < B_OK)
9585 			return length;
9586 		if (length >= B_PATH_NAME_LENGTH)
9587 			return B_NAME_TOO_LONG;
9588 
9589 		status = common_path_write_stat(fd, path, traverseLeafLink, &stat,
9590 			statMask, false);
9591 	} else {
9592 		// no path given: get the FD and use the FD operation
9593 		struct file_descriptor* descriptor
9594 			= get_fd(get_current_io_context(false), fd);
9595 		if (descriptor == NULL)
9596 			return B_FILE_ERROR;
9597 
9598 		if (descriptor->ops->fd_write_stat) {
9599 			status = descriptor->ops->fd_write_stat(descriptor, &stat,
9600 				statMask);
9601 		} else
9602 			status = B_UNSUPPORTED;
9603 
9604 		put_fd(descriptor);
9605 	}
9606 
9607 	return status;
9608 }
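

/* Illustrative sketch, not part of this file: only the fields selected by
 * statMask are applied, so e.g. chmod() can be built on top of this call.
 * The _kern_write_stat() stub and the B_STAT_MODE mask (from NodeMonitor.h)
 * are assumed here:
 *
 *	struct stat st;
 *	st.st_mode = 0644;
 *	status_t error = _kern_write_stat(-1, "/tmp/file", true, &st, sizeof(st),
 *		B_STAT_MODE);
 */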
9609 
9610 
9611 int
9612 _user_open_attr_dir(int fd, const char* userPath, bool traverseLeafLink)
9613 {
9614 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9615 	if (pathBuffer.InitCheck() != B_OK)
9616 		return B_NO_MEMORY;
9617 
9618 	char* path = pathBuffer.LockBuffer();
9619 
9620 	if (userPath != NULL) {
9621 		if (!IS_USER_ADDRESS(userPath)
9622 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9623 			return B_BAD_ADDRESS;
9624 	}
9625 
9626 	return attr_dir_open(fd, userPath ? path : NULL, traverseLeafLink, false);
9627 }
9628 
9629 
9630 ssize_t
9631 _user_read_attr(int fd, const char* userAttribute, off_t pos, void* userBuffer,
9632 	size_t readBytes)
9633 {
9634 	char attribute[B_FILE_NAME_LENGTH];
9635 
9636 	if (userAttribute == NULL)
9637 		return B_BAD_VALUE;
9638 	if (!IS_USER_ADDRESS(userAttribute)
9639 		|| user_strlcpy(attribute, userAttribute, sizeof(attribute)) < B_OK) {
9640 		return B_BAD_ADDRESS;
9641 	}
9642 
9643 	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9644 	if (attr < 0)
9645 		return attr;
9646 
9647 	ssize_t bytes = _user_read(attr, pos, userBuffer, readBytes);
9648 	_user_close(attr);
9649 
9650 	return bytes;
9651 }
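

/* Illustrative sketch, not part of this file: the public fs_read_attr() API
 * maps onto this syscall; the attribute is opened, read, and closed per call:
 *
 *	#include <fs_attr.h>
 *
 *	char buffer[256];
 *	ssize_t bytesRead = fs_read_attr(fd, "BEOS:TYPE", B_MIME_STRING_TYPE, 0,
 *		buffer, sizeof(buffer));
 */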
9652 
9653 
9654 ssize_t
9655 _user_write_attr(int fd, const char* userAttribute, uint32 type, off_t pos,
9656 	const void* buffer, size_t writeBytes)
9657 {
9658 	char attribute[B_FILE_NAME_LENGTH];
9659 
9660 	if (userAttribute == NULL)
9661 		return B_BAD_VALUE;
9662 	if (!IS_USER_ADDRESS(userAttribute)
9663 		|| user_strlcpy(attribute, userAttribute, sizeof(attribute)) < B_OK) {
9664 		return B_BAD_ADDRESS;
9665 	}
9666 
9667 	// Try to support the BeOS-typical truncation as well as the position
9668 	// argument: only truncate the attribute when writing from offset 0.
9669 	int attr = attr_create(fd, NULL, attribute, type,
9670 		O_CREAT | O_WRONLY | (pos != 0 ? 0 : O_TRUNC), false);
9671 	if (attr < 0)
9672 		return attr;
9673 
9674 	ssize_t bytes = _user_write(attr, pos, buffer, writeBytes);
9675 	_user_close(attr);
9676 
9677 	return bytes;
9678 }
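

/* Illustrative sketch, not part of this file: via the public fs_write_attr()
 * API, a write at position 0 replaces the attribute (BeOS semantics), while
 * a nonzero position writes in place without truncating:
 *
 *	#include <fs_attr.h>
 *	#include <string.h>
 *
 *	const char* mime = "text/plain";
 *	fs_write_attr(fd, "BEOS:TYPE", B_MIME_STRING_TYPE, 0, mime,
 *		strlen(mime) + 1);
 */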
9679 
9680 
9681 status_t
9682 _user_stat_attr(int fd, const char* userAttribute,
9683 	struct attr_info* userAttrInfo)
9684 {
9685 	char attribute[B_FILE_NAME_LENGTH];
9686 
9687 	if (userAttribute == NULL || userAttrInfo == NULL)
9688 		return B_BAD_VALUE;
9689 	if (!IS_USER_ADDRESS(userAttribute) || !IS_USER_ADDRESS(userAttrInfo)
9690 		|| user_strlcpy(attribute, userAttribute, sizeof(attribute)) < B_OK) {
9691 		return B_BAD_ADDRESS;
9692 	}
9693 
9694 	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9695 	if (attr < 0)
9696 		return attr;
9697 
9698 	struct file_descriptor* descriptor
9699 		= get_fd(get_current_io_context(false), attr);
9700 	if (descriptor == NULL) {
9701 		_user_close(attr);
9702 		return B_FILE_ERROR;
9703 	}
9704 
9705 	struct stat stat;
9706 	status_t status;
9707 	if (descriptor->ops->fd_read_stat)
9708 		status = descriptor->ops->fd_read_stat(descriptor, &stat);
9709 	else
9710 		status = B_UNSUPPORTED;
9711 
9712 	put_fd(descriptor);
9713 	_user_close(attr);
9714 
9715 	if (status == B_OK) {
9716 		attr_info info;
9717 		info.type = stat.st_type;
9718 		info.size = stat.st_size;
9719 
9720 		if (user_memcpy(userAttrInfo, &info, sizeof(struct attr_info)) != B_OK)
9721 			return B_BAD_ADDRESS;
9722 	}
9723 
9724 	return status;
9725 }
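

/* Illustrative sketch, not part of this file: fs_stat_attr() yields an
 * attribute's type and size without reading its data:
 *
 *	#include <fs_attr.h>
 *	#include <stdio.h>
 *
 *	attr_info info;
 *	if (fs_stat_attr(fd, "BEOS:TYPE", &info) == 0)
 *		printf("type %" B_PRIu32 ", size %" B_PRIdOFF "\n", info.type,
 *			info.size);
 */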
9726 
9727 
9728 int
9729 _user_open_attr(int fd, const char* userPath, const char* userName,
9730 	uint32 type, int openMode)
9731 {
9732 	char name[B_FILE_NAME_LENGTH];
9733 
9734 	if (!IS_USER_ADDRESS(userName)
9735 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9736 		return B_BAD_ADDRESS;
9737 
9738 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9739 	if (pathBuffer.InitCheck() != B_OK)
9740 		return B_NO_MEMORY;
9741 
9742 	char* path = pathBuffer.LockBuffer();
9743 
9744 	if (userPath != NULL) {
9745 		if (!IS_USER_ADDRESS(userPath)
9746 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9747 			return B_BAD_ADDRESS;
9748 	}
9749 
9750 	if ((openMode & O_CREAT) != 0) {
9751 		return attr_create(fd, userPath ? path : NULL, name, type, openMode,
9752 			false);
9753 	}
9754 
9755 	return attr_open(fd, userPath ? path : NULL, name, openMode, false);
9756 }
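

/* Illustrative sketch, not part of this file: with O_CREAT the attribute is
 * created, otherwise an existing one is opened. The fs_fopen_attr() wrapper
 * (signature assumed from fs_attr.h) exposes this to applications:
 *
 *	#include <fs_attr.h>
 *	#include <fcntl.h>
 *
 *	int attrFD = fs_fopen_attr(fd, "MyApp:note", B_STRING_TYPE,
 *		O_CREAT | O_WRONLY);
 *	if (attrFD >= 0)
 *		fs_close_attr(attrFD);
 */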
9757 
9758 
9759 status_t
9760 _user_remove_attr(int fd, const char* userName)
9761 {
9762 	char name[B_FILE_NAME_LENGTH];
9763 
9764 	if (!IS_USER_ADDRESS(userName)
9765 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9766 		return B_BAD_ADDRESS;
9767 
9768 	return attr_remove(fd, name, false);
9769 }
9770 
9771 
9772 status_t
9773 _user_rename_attr(int fromFile, const char* userFromName, int toFile,
9774 	const char* userToName)
9775 {
9776 	if (!IS_USER_ADDRESS(userFromName)
9777 		|| !IS_USER_ADDRESS(userToName))
9778 		return B_BAD_ADDRESS;
9779 
9780 	KPath fromNameBuffer(B_FILE_NAME_LENGTH);
9781 	KPath toNameBuffer(B_FILE_NAME_LENGTH);
9782 	if (fromNameBuffer.InitCheck() != B_OK || toNameBuffer.InitCheck() != B_OK)
9783 		return B_NO_MEMORY;
9784 
9785 	char* fromName = fromNameBuffer.LockBuffer();
9786 	char* toName = toNameBuffer.LockBuffer();
9787 
9788 	if (user_strlcpy(fromName, userFromName, B_FILE_NAME_LENGTH) < B_OK
9789 		|| user_strlcpy(toName, userToName, B_FILE_NAME_LENGTH) < B_OK)
9790 		return B_BAD_ADDRESS;
9791 
9792 	return attr_rename(fromFile, fromName, toFile, toName, false);
9793 }
9794 
9795 
9796 int
9797 _user_open_index_dir(dev_t device)
9798 {
9799 	return index_dir_open(device, false);
9800 }
9801 
9802 
9803 status_t
9804 _user_create_index(dev_t device, const char* userName, uint32 type,
9805 	uint32 flags)
9806 {
9807 	char name[B_FILE_NAME_LENGTH];
9808 
9809 	if (!IS_USER_ADDRESS(userName)
9810 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9811 		return B_BAD_ADDRESS;
9812 
9813 	return index_create(device, name, type, flags, false);
9814 }
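

/* Illustrative sketch, not part of this file: fs_create_index() (from
 * fs_index.h) drives this syscall, e.g. to index a custom attribute so that
 * queries on it become fast:
 *
 *	#include <fs_index.h>
 *	#include <fs_info.h>
 *
 *	dev_t device = dev_for_path("/boot/home");
 *	fs_create_index(device, "MyApp:keyword", B_STRING_TYPE, 0);
 */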
9815 
9816 
9817 status_t
9818 _user_read_index_stat(dev_t device, const char* userName, struct stat* userStat)
9819 {
9820 	char name[B_FILE_NAME_LENGTH];
9821 	struct stat stat;
9822 	status_t status;
9823 
9824 	if (!IS_USER_ADDRESS(userName)
9825 		|| !IS_USER_ADDRESS(userStat)
9826 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9827 		return B_BAD_ADDRESS;
9828 
9829 	status = index_name_read_stat(device, name, &stat, false);
9830 	if (status == B_OK) {
9831 		if (user_memcpy(userStat, &stat, sizeof(stat)) != B_OK)
9832 			return B_BAD_ADDRESS;
9833 	}
9834 
9835 	return status;
9836 }
9837 
9838 
9839 status_t
9840 _user_remove_index(dev_t device, const char* userName)
9841 {
9842 	char name[B_FILE_NAME_LENGTH];
9843 
9844 	if (!IS_USER_ADDRESS(userName)
9845 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9846 		return B_BAD_ADDRESS;
9847 
9848 	return index_remove(device, name, false);
9849 }
9850 
9851 
9852 status_t
9853 _user_getcwd(char* userBuffer, size_t size)
9854 {
9855 	if (size == 0)
9856 		return B_BAD_VALUE;
9857 	if (!IS_USER_ADDRESS(userBuffer))
9858 		return B_BAD_ADDRESS;
9859 
9860 	if (size > kMaxPathLength)
9861 		size = kMaxPathLength;
9862 
9863 	KPath pathBuffer(size);
9864 	if (pathBuffer.InitCheck() != B_OK)
9865 		return B_NO_MEMORY;
9866 
9867 	TRACE(("user_getcwd: buf %p, %" B_PRIuSIZE "\n", userBuffer, size));
9868 
9869 	char* path = pathBuffer.LockBuffer();
9870 
9871 	status_t status = get_cwd(path, size, false);
9872 	if (status != B_OK)
9873 		return status;
9874 
9875 	// Copy back the result
9876 	if (user_strlcpy(userBuffer, path, size) < B_OK)
9877 		return B_BAD_ADDRESS;
9878 
9879 	return status;
9880 }
9881 
9882 
9883 status_t
9884 _user_setcwd(int fd, const char* userPath)
9885 {
9886 	TRACE(("user_setcwd: path = %p\n", userPath));
9887 
9888 	KPath pathBuffer(B_PATH_NAME_LENGTH);
9889 	if (pathBuffer.InitCheck() != B_OK)
9890 		return B_NO_MEMORY;
9891 
9892 	char* path = pathBuffer.LockBuffer();
9893 
9894 	if (userPath != NULL) {
9895 		if (!IS_USER_ADDRESS(userPath)
9896 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9897 			return B_BAD_ADDRESS;
9898 	}
9899 
9900 	return set_cwd(fd, userPath != NULL ? path : NULL, false);
9901 }
9902 
9903 
9904 status_t
9905 _user_change_root(const char* userPath)
9906 {
9907 	// only root is allowed to chroot()
9908 	if (geteuid() != 0)
9909 		return B_NOT_ALLOWED;
9910 
9911 	// alloc path buffer
9912 	KPath pathBuffer(B_PATH_NAME_LENGTH);
9913 	if (pathBuffer.InitCheck() != B_OK)
9914 		return B_NO_MEMORY;
9915 
9916 	// copy userland path to kernel
9917 	char* path = pathBuffer.LockBuffer();
9918 	if (userPath != NULL) {
9919 		if (!IS_USER_ADDRESS(userPath)
9920 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9921 			return B_BAD_ADDRESS;
9922 	}
9923 
9924 	// get the vnode
9925 	struct vnode* vnode;
9926 	status_t status = path_to_vnode(path, true, &vnode, NULL, false);
9927 	if (status != B_OK)
9928 		return status;
9929 
9930 	// set the new root
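	// sIOContextRootLock guards io_context::root, so concurrent readers
	// always observe either the old or the new root vnode.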
9931 	struct io_context* context = get_current_io_context(false);
9932 	mutex_lock(&sIOContextRootLock);
9933 	struct vnode* oldRoot = context->root;
9934 	context->root = vnode;
9935 	mutex_unlock(&sIOContextRootLock);
9936 
9937 	put_vnode(oldRoot);
9938 
9939 	return B_OK;
9940 }
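

/* Illustrative sketch, not part of this file: the chroot() wrapper ends up
 * here; only the calling team's I/O context is affected:
 *
 *	#include <unistd.h>
 *
 *	if (chroot("/boot/home/jail") == 0)
 *		chdir("/");
 */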
9941 
9942 
9943 int
9944 _user_open_query(dev_t device, const char* userQuery, size_t queryLength,
9945 	uint32 flags, port_id port, int32 token)
9946 {
9947 	char* query;
9948 
9949 	if (device < 0 || userQuery == NULL || queryLength == 0)
9950 		return B_BAD_VALUE;
9951 
9952 	if (!IS_USER_ADDRESS(userQuery))
9953 		return B_BAD_ADDRESS;
9954 
9955 	// this is a safety restriction: cap query strings below 64 KiB
9956 	if (queryLength >= 65536)
9957 		return B_NAME_TOO_LONG;
9958 
9959 	query = (char*)malloc(queryLength + 1);
9960 	if (query == NULL)
9961 		return B_NO_MEMORY;
9962 	if (user_strlcpy(query, userQuery, queryLength + 1) < B_OK) {
9963 		free(query);
9964 		return B_BAD_ADDRESS;
9965 	}
9966 
9967 	int fd = query_open(device, query, flags, port, token, false);
9968 
9969 	free(query);
9970 	return fd;
9971 }
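

/* Illustrative sketch, not part of this file: fs_open_query() (fs_query.h)
 * feeds the query string to this syscall; with B_LIVE_QUERY, updates are
 * delivered to the given port/token pair:
 *
 *	#include <fs_query.h>
 *	#include <stdio.h>
 *
 *	DIR* query = fs_open_query(device, "MyApp:keyword == \"foo\"", 0);
 *	if (query != NULL) {
 *		while (struct dirent* entry = fs_read_query(query))
 *			puts(entry->d_name);
 *		fs_close_query(query);
 *	}
 */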
9972 
9973 
9974 #include "vfs_request_io.cpp"
9975