xref: /haiku/src/system/kernel/fs/vfs.cpp (revision fce4895d1884da5ae6fb299d23c735c598e690b1)
/*
 * Copyright 2005-2013, Ingo Weinhold, ingo_weinhold@gmx.de.
 * Copyright 2002-2016, Axel Dörfler, axeld@pinc-software.de.
 * Distributed under the terms of the MIT License.
 *
 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
 * Distributed under the terms of the NewOS License.
 */


/*! Virtual File System and File System Interface Layer */


#include <ctype.h>
#include <fcntl.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/file.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <unistd.h>

#include <fs_attr.h>
#include <fs_info.h>
#include <fs_interface.h>
#include <fs_volume.h>
#include <OS.h>
#include <StorageDefs.h>

#include <AutoDeleter.h>
#include <block_cache.h>
#include <boot/kernel_args.h>
#include <debug_heap.h>
#include <disk_device_manager/KDiskDevice.h>
#include <disk_device_manager/KDiskDeviceManager.h>
#include <disk_device_manager/KDiskDeviceUtils.h>
#include <disk_device_manager/KDiskSystem.h>
#include <fd.h>
#include <file_cache.h>
#include <fs/node_monitor.h>
#include <KPath.h>
#include <lock.h>
#include <low_resource_manager.h>
#include <syscalls.h>
#include <syscall_restart.h>
#include <tracing.h>
#include <util/atomic.h>
#include <util/AutoLock.h>
#include <util/DoublyLinkedList.h>
#include <vfs.h>
#include <vm/vm.h>
#include <vm/VMCache.h>

#include "EntryCache.h"
#include "fifo.h"
#include "IORequest.h"
#include "unused_vnodes.h"
#include "vfs_tracing.h"
#include "Vnode.h"
#include "../cache/vnode_store.h"


//#define TRACE_VFS
#ifdef TRACE_VFS
#	define TRACE(x) dprintf x
#	define FUNCTION(x) dprintf x
#else
#	define TRACE(x) ;
#	define FUNCTION(x) ;
#endif

#define ADD_DEBUGGER_COMMANDS


#define HAS_FS_CALL(vnode, op)			(vnode->ops->op != NULL)
#define HAS_FS_MOUNT_CALL(mount, op)	(mount->volume->ops->op != NULL)

#if KDEBUG
#	define FS_CALL(vnode, op, params...) \
		( HAS_FS_CALL(vnode, op) ? \
			vnode->ops->op(vnode->mount->volume, vnode, params) \
			: (panic("FS_CALL op " #op " is NULL"), 0))
#	define FS_CALL_NO_PARAMS(vnode, op) \
		( HAS_FS_CALL(vnode, op) ? \
			vnode->ops->op(vnode->mount->volume, vnode) \
			: (panic("FS_CALL_NO_PARAMS op " #op " is NULL"), 0))
#	define FS_MOUNT_CALL(mount, op, params...) \
		( HAS_FS_MOUNT_CALL(mount, op) ? \
			mount->volume->ops->op(mount->volume, params) \
			: (panic("FS_MOUNT_CALL op " #op " is NULL"), 0))
#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
		( HAS_FS_MOUNT_CALL(mount, op) ? \
			mount->volume->ops->op(mount->volume) \
			: (panic("FS_MOUNT_CALL_NO_PARAMS op " #op " is NULL"), 0))
#else
#	define FS_CALL(vnode, op, params...) \
			vnode->ops->op(vnode->mount->volume, vnode, params)
#	define FS_CALL_NO_PARAMS(vnode, op) \
			vnode->ops->op(vnode->mount->volume, vnode)
#	define FS_MOUNT_CALL(mount, op, params...) \
			mount->volume->ops->op(mount->volume, params)
#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
			mount->volume->ops->op(mount->volume)
#endif
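

// Example (sketch, not compiled): how the FS_CALL() macros dispatch into a
// file system. For a vnode whose fs_vnode_ops provide read_stat, the call
// below expands (in the non-KDEBUG case) to
// vnode->ops->read_stat(vnode->mount->volume, vnode, stat); with KDEBUG a
// missing hook panics instead of jumping through a NULL pointer.
#if 0
static status_t
example_read_stat(struct vnode* vnode, struct stat* stat)
{
	if (!HAS_FS_CALL(vnode, read_stat))
		return B_UNSUPPORTED;

	return FS_CALL(vnode, read_stat, stat);
}
#endif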


const static size_t kMaxPathLength = 65536;
	// The absolute maximum path length (used by getcwd()); this does not
	// depend on PATH_MAX.


typedef DoublyLinkedList<vnode> VnodeList;

/*!	\brief Structure to manage a mounted file system

	Note: The root_vnode and root_vnode->covers fields (what others?) are
	initialized in fs_mount() and not changed afterwards. That is, as soon
	as the mount is mounted and it is ensured that it won't be unmounted
	(e.g. by holding a reference to a vnode of that mount), (read) access
	to those fields is always safe, even without additional locking. Moreover,
	while mounted the mount holds a reference to the root_vnode->covers vnode,
	thus making the access path vnode->mount->root_vnode->covers->mount->...
	safe if a reference to vnode is held (note that for the root mount
	root_vnode->covers is NULL, though).
*/
struct fs_mount {
	fs_mount()
		:
		volume(NULL),
		device_name(NULL)
	{
		recursive_lock_init(&rlock, "mount rlock");
	}

	~fs_mount()
	{
		recursive_lock_destroy(&rlock);
		free(device_name);

		while (volume) {
			fs_volume* superVolume = volume->super_volume;

			if (volume->file_system != NULL)
				put_module(volume->file_system->info.name);

			free(volume->file_system_name);
			free(volume);
			volume = superVolume;
		}
	}

	struct fs_mount* next;
	dev_t			id;
	fs_volume*		volume;
	char*			device_name;
	recursive_lock	rlock;	// guards the vnodes list
		// TODO: Make this a mutex! It is never used recursively.
	struct vnode*	root_vnode;
	struct vnode*	covers_vnode;	// immutable
	KPartition*		partition;
	VnodeList		vnodes;
	EntryCache		entry_cache;
	bool			unmounting;
	bool			owns_file_device;
};


namespace {

struct advisory_lock : public DoublyLinkedListLinkImpl<advisory_lock> {
	list_link		link;
	team_id			team;
	pid_t			session;
	off_t			start;
	off_t			end;
	bool			shared;
};

typedef DoublyLinkedList<advisory_lock> LockList;

} // namespace


struct advisory_locking {
	sem_id			lock;
	sem_id			wait_sem;
	LockList		locks;

	advisory_locking()
		:
		lock(-1),
		wait_sem(-1)
	{
	}

	~advisory_locking()
	{
		if (lock >= 0)
			delete_sem(lock);
		if (wait_sem >= 0)
			delete_sem(wait_sem);
	}
};

/*!	\brief Guards sMountsTable.

	The holder is allowed read/write access to sMountsTable.
	Manipulation of the fs_mount structures themselves
	(and their destruction) requires different locks though.
*/
static mutex sMountMutex = MUTEX_INITIALIZER("vfs_mount_lock");

/*!	\brief Guards mount/unmount operations.

	fs_mount() and fs_unmount() hold the lock during their whole operation.
	That is, locking the lock ensures that no FS is mounted/unmounted. In
	particular this means that
	- sMountsTable will not be modified,
	- the fields of the fs_mount structures in sMountsTable that are immutable
	  after initialization will not be modified.

	The thread trying to lock the lock must not hold sVnodeLock or
	sMountMutex.
*/
static recursive_lock sMountOpLock;

/*!	\brief Guards sVnodeTable.

	The holder is allowed read/write access to sVnodeTable and to
	any unbusy vnode in that table, save for the immutable fields (device, id,
	private_node, mount), to which only read-only access is allowed.
	The mutable fields advisory_locking, mandatory_locked_by, and ref_count, as
	well as the busy, removed, unused flags, and the vnode's type can also be
	written when holding a read lock to sVnodeLock *and* having the vnode
	locked. Write access to covered_by and covers requires write-locking
	sVnodeLock.

	The thread trying to acquire the lock must not hold sMountMutex.
	You must not hold this lock when calling create_sem(), as this might call
	vfs_free_unused_vnodes() and thus cause a deadlock.
*/
static rw_lock sVnodeLock = RW_LOCK_INITIALIZER("vfs_vnode_lock");

/*!	\brief Guards io_context::root.

	Must be held when setting or getting the io_context::root field.
	The only operation allowed while holding this lock besides getting or
	setting the field is inc_vnode_ref_count() on io_context::root.
*/
static mutex sIOContextRootLock = MUTEX_INITIALIZER("io_context::root lock");


namespace {

struct vnode_hash_key {
	dev_t	device;
	ino_t	vnode;
};

struct VnodeHash {
	typedef vnode_hash_key	KeyType;
	typedef	struct vnode	ValueType;

#define VHASH(mountid, vnodeid) \
	(((uint32)((vnodeid) >> 32) + (uint32)(vnodeid)) ^ (uint32)(mountid))

	size_t HashKey(KeyType key) const
	{
		return VHASH(key.device, key.vnode);
	}

	size_t Hash(ValueType* vnode) const
	{
		return VHASH(vnode->device, vnode->id);
	}

#undef VHASH

	bool Compare(KeyType key, ValueType* vnode) const
	{
		return vnode->device == key.device && vnode->id == key.vnode;
	}

	ValueType*& GetLink(ValueType* value) const
	{
		return value->next;
	}
};

typedef BOpenHashTable<VnodeHash> VnodeTable;


struct MountHash {
	typedef dev_t			KeyType;
	typedef	struct fs_mount	ValueType;

	size_t HashKey(KeyType key) const
	{
		return key;
	}

	size_t Hash(ValueType* mount) const
	{
		return mount->id;
	}

	bool Compare(KeyType key, ValueType* mount) const
	{
		return mount->id == key;
	}

	ValueType*& GetLink(ValueType* value) const
	{
		return value->next;
	}
};

typedef BOpenHashTable<MountHash> MountTable;

} // namespace
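

// Example (sketch, not compiled): BOpenHashTable uses the policy structs above
// by calling HashKey()/Hash() to pick a bucket and Compare() to find a match,
// chaining colliding entries through the pointer returned by GetLink(). A
// lookup in the vnode table (defined below) thus boils down to:
#if 0
	struct vnode_hash_key key;
	key.device = device;
	key.vnode = id;

	struct vnode* vnode = sVnodeTable->Lookup(key);
		// the caller must hold sVnodeLock; see lookup_vnode() below
#endif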


#define VNODE_HASH_TABLE_SIZE 1024
static VnodeTable* sVnodeTable;
static struct vnode* sRoot;

#define MOUNTS_HASH_TABLE_SIZE 16
static MountTable* sMountsTable;
static dev_t sNextMountID = 1;

#define MAX_TEMP_IO_VECS 8

mode_t __gUmask = 022;

/* function declarations */

static void free_unused_vnodes();

// file descriptor operation prototypes
static status_t file_read(struct file_descriptor* descriptor, off_t pos,
	void* buffer, size_t* _bytes);
static status_t file_write(struct file_descriptor* descriptor, off_t pos,
	const void* buffer, size_t* _bytes);
static off_t file_seek(struct file_descriptor* descriptor, off_t pos,
	int seekType);
static void file_free_fd(struct file_descriptor* descriptor);
static status_t file_close(struct file_descriptor* descriptor);
static status_t file_select(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync);
static status_t file_deselect(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync);
static status_t dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t dir_read(struct io_context* ioContext, struct vnode* vnode,
	void* cookie, struct dirent* buffer, size_t bufferSize, uint32* _count);
static status_t dir_rewind(struct file_descriptor* descriptor);
static void dir_free_fd(struct file_descriptor* descriptor);
static status_t dir_close(struct file_descriptor* descriptor);
static status_t attr_dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t attr_dir_rewind(struct file_descriptor* descriptor);
static void attr_dir_free_fd(struct file_descriptor* descriptor);
static status_t attr_dir_close(struct file_descriptor* descriptor);
static status_t attr_read(struct file_descriptor* descriptor, off_t pos,
	void* buffer, size_t* _bytes);
static status_t attr_write(struct file_descriptor* descriptor, off_t pos,
	const void* buffer, size_t* _bytes);
static off_t attr_seek(struct file_descriptor* descriptor, off_t pos,
	int seekType);
static void attr_free_fd(struct file_descriptor* descriptor);
static status_t attr_close(struct file_descriptor* descriptor);
static status_t attr_read_stat(struct file_descriptor* descriptor,
	struct stat* statData);
static status_t attr_write_stat(struct file_descriptor* descriptor,
	const struct stat* stat, int statMask);
static status_t index_dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t index_dir_rewind(struct file_descriptor* descriptor);
static void index_dir_free_fd(struct file_descriptor* descriptor);
static status_t index_dir_close(struct file_descriptor* descriptor);
static status_t query_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t query_rewind(struct file_descriptor* descriptor);
static void query_free_fd(struct file_descriptor* descriptor);
static status_t query_close(struct file_descriptor* descriptor);

static status_t common_ioctl(struct file_descriptor* descriptor, ulong op,
	void* buffer, size_t length);
static status_t common_read_stat(struct file_descriptor* descriptor,
	struct stat* statData);
static status_t common_write_stat(struct file_descriptor* descriptor,
	const struct stat* statData, int statMask);
static status_t common_path_read_stat(int fd, char* path, bool traverseLeafLink,
	struct stat* stat, bool kernel);

static status_t vnode_path_to_vnode(struct vnode* vnode, char* path,
	bool traverseLeafLink, int count, bool kernel,
	struct vnode** _vnode, ino_t* _parentID);
static status_t dir_vnode_to_path(struct vnode* vnode, char* buffer,
	size_t bufferSize, bool kernel);
static status_t fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
	struct vnode** _vnode, ino_t* _parentID, bool kernel);
static void inc_vnode_ref_count(struct vnode* vnode);
static status_t dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree,
	bool reenter);
static inline void put_vnode(struct vnode* vnode);
static status_t fs_unmount(char* path, dev_t mountID, uint32 flags,
	bool kernel);
static int open_vnode(struct vnode* vnode, int openMode, bool kernel);


static struct fd_ops sFileOps = {
	file_read,
	file_write,
	file_seek,
	common_ioctl,
	NULL,		// set_flags
	file_select,
	file_deselect,
	NULL,		// read_dir()
	NULL,		// rewind_dir()
	common_read_stat,
	common_write_stat,
	file_close,
	file_free_fd
};

static struct fd_ops sDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	dir_read,
	dir_rewind,
	common_read_stat,
	common_write_stat,
	dir_close,
	dir_free_fd
};

static struct fd_ops sAttributeDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	attr_dir_read,
	attr_dir_rewind,
	common_read_stat,
	common_write_stat,
	attr_dir_close,
	attr_dir_free_fd
};

static struct fd_ops sAttributeOps = {
	attr_read,
	attr_write,
	attr_seek,
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	NULL,		// read_dir()
	NULL,		// rewind_dir()
	attr_read_stat,
	attr_write_stat,
	attr_close,
	attr_free_fd
};

static struct fd_ops sIndexDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	index_dir_read,
	index_dir_rewind,
	NULL,		// read_stat()
	NULL,		// write_stat()
	index_dir_close,
	index_dir_free_fd
};

#if 0
static struct fd_ops sIndexOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	NULL,		// dir_read()
	NULL,		// dir_rewind()
	index_read_stat,	// read_stat()
	NULL,		// write_stat()
	NULL,		// dir_close()
	NULL		// free_fd()
};
#endif

static struct fd_ops sQueryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	query_read,
	query_rewind,
	NULL,		// read_stat()
	NULL,		// write_stat()
	query_close,
	query_free_fd
};


namespace {

class VNodePutter {
public:
	VNodePutter(struct vnode* vnode = NULL) : fVNode(vnode) {}

	~VNodePutter()
	{
		Put();
	}

	void SetTo(struct vnode* vnode)
	{
		Put();
		fVNode = vnode;
	}

	void Put()
	{
		if (fVNode) {
			put_vnode(fVNode);
			fVNode = NULL;
		}
	}

	struct vnode* Detach()
	{
		struct vnode* vnode = fVNode;
		fVNode = NULL;
		return vnode;
	}

private:
	struct vnode* fVNode;
};


class FDCloser {
public:
	FDCloser() : fFD(-1), fKernel(true) {}

	FDCloser(int fd, bool kernel) : fFD(fd), fKernel(kernel) {}

	~FDCloser()
	{
		Close();
	}

	void SetTo(int fd, bool kernel)
	{
		Close();
		fFD = fd;
		fKernel = kernel;
	}

	void Close()
	{
		if (fFD >= 0) {
			if (fKernel)
				_kern_close(fFD);
			else
				_user_close(fFD);
			fFD = -1;
		}
	}

	int Detach()
	{
		int fd = fFD;
		fFD = -1;
		return fd;
	}

private:
	int		fFD;
	bool	fKernel;
};

} // namespace
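

// Example (sketch, not compiled): the RAII helpers above remove the need for
// explicit cleanup on every error path; FDCloser does for file descriptors
// (via _kern_close()/_user_close()) what VNodePutter does for vnode
// references. Detach() transfers ownership back to the caller.
#if 0
static status_t
example_fsync_node(dev_t device, ino_t id)
{
	struct vnode* vnode;
	status_t status = get_vnode(device, id, &vnode, true, false);
	if (status != B_OK)
		return status;

	VNodePutter putter(vnode);
		// from here on every return path put_vnode()s automatically

	if (!HAS_FS_CALL(vnode, fsync))
		return B_UNSUPPORTED;

	return FS_CALL_NO_PARAMS(vnode, fsync);
}
#endif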


#if VFS_PAGES_IO_TRACING

namespace VFSPagesIOTracing {

class PagesIOTraceEntry : public AbstractTraceEntry {
protected:
	PagesIOTraceEntry(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		fVnode(vnode),
		fMountID(vnode->mount->id),
		fNodeID(vnode->id),
		fCookie(cookie),
		fPos(pos),
		fCount(count),
		fFlags(flags),
		fBytesRequested(bytesRequested),
		fStatus(status),
		fBytesTransferred(bytesTransferred)
	{
		fVecs = (generic_io_vec*)alloc_tracing_buffer_memcpy(vecs,
			sizeof(generic_io_vec) * count, false);
	}

	void AddDump(TraceOutput& out, const char* mode)
	{
		out.Print("vfs pages io %5s: vnode: %p (%" B_PRId32 ", %" B_PRId64 "), "
			"cookie: %p, pos: %" B_PRIdOFF ", size: %" B_PRIu64 ", vecs: {",
			mode, fVnode, fMountID, fNodeID, fCookie, fPos,
			(uint64)fBytesRequested);

		if (fVecs != NULL) {
			for (uint32 i = 0; i < fCount; i++) {
				if (i > 0)
					out.Print(", ");
				out.Print("(%" B_PRIx64 ", %" B_PRIu64 ")", (uint64)fVecs[i].base,
					(uint64)fVecs[i].length);
			}
		}

		out.Print("}, flags: %#" B_PRIx32 " -> status: %#" B_PRIx32 ", "
			"transferred: %" B_PRIu64, fFlags, fStatus,
			(uint64)fBytesTransferred);
	}

protected:
	struct vnode*	fVnode;
	dev_t			fMountID;
	ino_t			fNodeID;
	void*			fCookie;
	off_t			fPos;
	generic_io_vec*	fVecs;
	uint32			fCount;
	uint32			fFlags;
	generic_size_t	fBytesRequested;
	status_t		fStatus;
	generic_size_t	fBytesTransferred;
};


class ReadPages : public PagesIOTraceEntry {
public:
	ReadPages(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
			bytesRequested, status, bytesTransferred)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		PagesIOTraceEntry::AddDump(out, "read");
	}
};


class WritePages : public PagesIOTraceEntry {
public:
	WritePages(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
			bytesRequested, status, bytesTransferred)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		PagesIOTraceEntry::AddDump(out, "write");
	}
};

}	// namespace VFSPagesIOTracing

#	define TPIO(x) new(std::nothrow) VFSPagesIOTracing::x;
#else
#	define TPIO(x) ;
#endif	// VFS_PAGES_IO_TRACING


/*! Finds the mounted device (the fs_mount structure) with the given ID.
	Note, you must hold the sMountMutex lock when you call this function.
*/
static struct fs_mount*
find_mount(dev_t id)
{
	ASSERT_LOCKED_MUTEX(&sMountMutex);

	return sMountsTable->Lookup(id);
}


static status_t
get_mount(dev_t id, struct fs_mount** _mount)
{
	struct fs_mount* mount;

	ReadLocker nodeLocker(sVnodeLock);
	MutexLocker mountLocker(sMountMutex);

	mount = find_mount(id);
	if (mount == NULL)
		return B_BAD_VALUE;

	struct vnode* rootNode = mount->root_vnode;
	if (mount->unmounting || rootNode == NULL || rootNode->IsBusy()
		|| rootNode->ref_count == 0) {
		// might have been called during a mount/unmount operation
		return B_BUSY;
	}

	inc_vnode_ref_count(rootNode);
	*_mount = mount;
	return B_OK;
}


static void
put_mount(struct fs_mount* mount)
{
	if (mount)
		put_vnode(mount->root_vnode);
}


/*!	Tries to open the specified file system module.
	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1".
	Returns a pointer to the file system module interface, or NULL if it
	could not open the module.
*/
static file_system_module_info*
get_file_system(const char* fsName)
{
	char name[B_FILE_NAME_LENGTH];
	if (strncmp(fsName, "file_systems/", strlen("file_systems/"))) {
		// construct the module name if we didn't get one
		// (we currently support only one API)
		snprintf(name, sizeof(name), "file_systems/%s/v1", fsName);
		fsName = NULL;
	}

	file_system_module_info* info;
	if (get_module(fsName ? fsName : name, (module_info**)&info) != B_OK)
		return NULL;

	return info;
}
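

// Example (sketch, not compiled): both name forms accepted by
// get_file_system() end up loading the same module, since the short form is
// expanded to the only supported API version:
#if 0
	file_system_module_info* info = get_file_system("bfs");
		// loads the module "file_systems/bfs/v1"
	file_system_module_info* same = get_file_system("file_systems/bfs/v1");
		// loads the very same module
#endif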


/*!	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1"
	and returns a compatible fs_info.fsh_name name ("bfs" in both cases).
	The name is allocated for you, and you have to free() it when you're
	done with it.
	Returns NULL if the required memory is not available.
*/
static char*
get_file_system_name(const char* fsName)
{
	const size_t length = strlen("file_systems/");

	if (strncmp(fsName, "file_systems/", length)) {
		// the name already seems to be the module's file name
		return strdup(fsName);
	}

	fsName += length;
	const char* end = strchr(fsName, '/');
	if (end == NULL) {
		// this doesn't seem to be a valid name, but well...
		return strdup(fsName);
	}

	// cut off the trailing /v1

	char* name = (char*)malloc(end + 1 - fsName);
	if (name == NULL)
		return NULL;

	strlcpy(name, fsName, end + 1 - fsName);
	return name;
}


/*!	Accepts a list of file system names separated by a colon, one for each
	layer, and returns the file system name for the specified layer.
	The name is allocated for you, and you have to free() it when you're
	done with it.
	Returns NULL if the required memory is not available or if there is no
	name for the specified layer.
*/
static char*
get_file_system_name_for_layer(const char* fsNames, int32 layer)
{
	while (layer >= 0) {
		const char* end = strchr(fsNames, ':');
		if (end == NULL) {
			if (layer == 0)
				return strdup(fsNames);
			return NULL;
		}

		if (layer == 0) {
			size_t length = end - fsNames + 1;
			char* result = (char*)malloc(length);
			if (result == NULL)
				return NULL;
			strlcpy(result, fsNames, length);
			return result;
		}

		fsNames = end + 1;
		layer--;
	}

	return NULL;
}
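

// Example (sketch, not compiled; the layered specification is made up for
// illustration): how the two name helpers split an fs_name specification.
#if 0
	char* name = get_file_system_name("file_systems/bfs/v1");
		// -> strdup()ed "bfs"; free() when done
	char* layer0 = get_file_system_name_for_layer("write_overlay:bfs", 0);
		// -> "write_overlay"
	char* layer1 = get_file_system_name_for_layer("write_overlay:bfs", 1);
		// -> "bfs"
	char* layer2 = get_file_system_name_for_layer("write_overlay:bfs", 2);
		// -> NULL, there is no third layer
#endif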


static void
add_vnode_to_mount_list(struct vnode* vnode, struct fs_mount* mount)
{
	RecursiveLocker _(mount->rlock);
	mount->vnodes.Add(vnode);
}


static void
remove_vnode_from_mount_list(struct vnode* vnode, struct fs_mount* mount)
{
	RecursiveLocker _(mount->rlock);
	mount->vnodes.Remove(vnode);
}


/*!	\brief Looks up a vnode by mount and node ID in the sVnodeTable.

	The caller must hold the sVnodeLock (read lock at least).

	\param mountID the mount ID.
	\param vnodeID the node ID.

	\return The vnode structure, if it was found in the hash table, \c NULL
			otherwise.
*/
static struct vnode*
lookup_vnode(dev_t mountID, ino_t vnodeID)
{
	struct vnode_hash_key key;

	key.device = mountID;
	key.vnode = vnodeID;

	return sVnodeTable->Lookup(key);
}


/*!	Creates a new vnode with the given mount and node ID.
	If the node already exists, it is returned instead and no new node is
	created. In either case -- but not if an error occurs -- the function
	write-locks \c sVnodeLock and keeps it locked for the caller when
	returning. On error the lock is not held on return.

	\param mountID The mount ID.
	\param vnodeID The vnode ID.
	\param _vnode Will be set to the new vnode on success.
	\param _nodeCreated Will be set to \c true when the returned vnode has
		been newly created, \c false when it already existed. Will not be
		changed on error.
	\return \c B_OK, when the vnode was successfully created and inserted or
		a node with the given ID was found, \c B_NO_MEMORY or
		\c B_ENTRY_NOT_FOUND on error.
*/
static status_t
create_new_vnode_and_lock(dev_t mountID, ino_t vnodeID, struct vnode*& _vnode,
	bool& _nodeCreated)
{
	FUNCTION(("create_new_vnode_and_lock()\n"));

	struct vnode* vnode = (struct vnode*)malloc(sizeof(struct vnode));
	if (vnode == NULL)
		return B_NO_MEMORY;

	// initialize basic values
	memset(vnode, 0, sizeof(struct vnode));
	vnode->device = mountID;
	vnode->id = vnodeID;
	vnode->ref_count = 1;
	vnode->SetBusy(true);

	// look up the node -- it might have been added by someone else in the
	// meantime
	rw_lock_write_lock(&sVnodeLock);
	struct vnode* existingVnode = lookup_vnode(mountID, vnodeID);
	if (existingVnode != NULL) {
		free(vnode);
		_vnode = existingVnode;
		_nodeCreated = false;
		return B_OK;
	}

	// get the mount structure
	mutex_lock(&sMountMutex);
	vnode->mount = find_mount(mountID);
	if (!vnode->mount || vnode->mount->unmounting) {
		mutex_unlock(&sMountMutex);
		rw_lock_write_unlock(&sVnodeLock);
		free(vnode);
		return B_ENTRY_NOT_FOUND;
	}

	// add the vnode to the mount's node list and the hash table
	sVnodeTable->Insert(vnode);
	add_vnode_to_mount_list(vnode, vnode->mount);

	mutex_unlock(&sMountMutex);

	_vnode = vnode;
	_nodeCreated = true;

	// keep the vnode lock locked
	return B_OK;
}


/*!	Frees the vnode and all resources it has acquired, and removes
	it from the vnode hash as well as from its mount structure.
	Will also make sure that any cache modifications are written back.
*/
static void
free_vnode(struct vnode* vnode, bool reenter)
{
	ASSERT_PRINT(vnode->ref_count == 0 && vnode->IsBusy(), "vnode: %p\n",
		vnode);

	// write back any changes in this vnode's cache -- but only
	// if the vnode won't be deleted, in which case the changes
	// will be discarded

	if (!vnode->IsRemoved() && HAS_FS_CALL(vnode, fsync))
		FS_CALL_NO_PARAMS(vnode, fsync);

	// Note: If this vnode has a cache attached, there will still be two
	// references to that cache at this point. The last one belongs to the vnode
	// itself (cf. vfs_get_vnode_cache()) and one belongs to the node's file
	// cache. Each reference to a cache but the last also includes a reference
	// to the vnode. The file cache, however, released its reference (cf.
	// file_cache_create()), so that this vnode's ref count had a chance to
	// drop to 0 at all. Deleting the file cache now will cause the next-to-last
	// cache reference to be released, which will also release a (no longer
	// existing) vnode reference. To avoid problems, we set the vnode's ref
	// count, so that it will neither become negative nor 0.
	vnode->ref_count = 2;

	if (!vnode->IsUnpublished()) {
		if (vnode->IsRemoved())
			FS_CALL(vnode, remove_vnode, reenter);
		else
			FS_CALL(vnode, put_vnode, reenter);
	}

	// If the vnode has a VMCache attached, make sure that it won't try to get
	// another reference via VMVnodeCache::AcquireUnreferencedStoreRef(). As
	// long as the vnode is busy and in the hash, that won't happen, but as
	// soon as we've removed it from the hash, it could reload the vnode -- with
	// a new cache attached!
	if (vnode->cache != NULL)
		((VMVnodeCache*)vnode->cache)->VnodeDeleted();

	// The file system has removed the resources of the vnode now, so we can
	// make it available again (by removing the busy vnode from the hash).
	rw_lock_write_lock(&sVnodeLock);
	sVnodeTable->Remove(vnode);
	rw_lock_write_unlock(&sVnodeLock);

	// if we have a VMCache attached, remove it
	if (vnode->cache)
		vnode->cache->ReleaseRef();

	vnode->cache = NULL;

	remove_vnode_from_mount_list(vnode, vnode->mount);

	free(vnode);
}


/*!	\brief Decrements the reference counter of the given vnode and deletes it,
	if the counter dropped to 0.

	The caller must, of course, own a reference to the vnode to call this
	function.
	The caller must not hold the sVnodeLock or the sMountMutex.

	\param vnode the vnode.
	\param alwaysFree don't move this vnode into the unused list, but really
		   delete it if possible.
	\param reenter \c true, if this function is called (indirectly) from within
		   a file system. This will be passed to file system hooks only.
	\return \c B_OK, if everything went fine, an error code otherwise.
*/
static status_t
dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree, bool reenter)
{
	ReadLocker locker(sVnodeLock);
	AutoLocker<Vnode> nodeLocker(vnode);

	int32 oldRefCount = atomic_add(&vnode->ref_count, -1);

	ASSERT_PRINT(oldRefCount > 0, "vnode %p\n", vnode);

	TRACE(("dec_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
		vnode->ref_count));

	if (oldRefCount != 1)
		return B_OK;

	if (vnode->IsBusy())
		panic("dec_vnode_ref_count: called on busy vnode %p\n", vnode);

	bool freeNode = false;
	bool freeUnusedNodes = false;

	// Just insert the vnode into an unused list if we don't need
	// to delete it
	if (vnode->IsRemoved() || alwaysFree) {
		vnode_to_be_freed(vnode);
		vnode->SetBusy(true);
		freeNode = true;
	} else
		freeUnusedNodes = vnode_unused(vnode);

	nodeLocker.Unlock();
	locker.Unlock();

	if (freeNode)
		free_vnode(vnode, reenter);
	else if (freeUnusedNodes)
		free_unused_vnodes();

	return B_OK;
}


/*!	\brief Increments the reference counter of the given vnode.

	The caller must make sure that the node isn't deleted while this function
	is called. This can be done either:
	- by ensuring that a reference to the node exists and remains in existence,
	  or
	- by holding the vnode's lock (which also requires read locking sVnodeLock)
	  or by holding sVnodeLock write locked.

	In the second case the caller is responsible for dealing with the ref count
	0 -> 1 transition. That is, 1. this function must not be invoked when the
	node is busy in the first place, and 2. vnode_used() must be called for the
	node.

	\param vnode the vnode.
*/
static void
inc_vnode_ref_count(struct vnode* vnode)
{
	atomic_add(&vnode->ref_count, 1);
	TRACE(("inc_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
		vnode->ref_count));
}


static bool
is_special_node_type(int type)
{
	// at the moment only FIFOs are supported
	return S_ISFIFO(type);
}


static status_t
create_special_sub_node(struct vnode* vnode, uint32 flags)
{
	if (S_ISFIFO(vnode->Type()))
		return create_fifo_vnode(vnode->mount->volume, vnode);

	return B_BAD_VALUE;
}


/*!	\brief Retrieves a vnode for a given mount ID, node ID pair.

	If the node is not yet in memory, it will be loaded.

	The caller must not hold the sVnodeLock or the sMountMutex.

	\param mountID the mount ID.
	\param vnodeID the node ID.
	\param _vnode Pointer to a vnode* variable into which the pointer to the
		   retrieved vnode structure shall be written.
	\param canWait \c true, if the function is allowed to wait (for a limited
		   time) for a busy node, \c false to fail immediately in that case.
	\param reenter \c true, if this function is called (indirectly) from within
		   a file system.
	\return \c B_OK, if everything went fine, an error code otherwise.
*/
static status_t
get_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode, bool canWait,
	int reenter)
{
	FUNCTION(("get_vnode: mountid %" B_PRId32 " vnid 0x%" B_PRIx64 " %p\n",
		mountID, vnodeID, _vnode));

	rw_lock_read_lock(&sVnodeLock);

	int32 tries = 2000;
		// try for 10 secs
restart:
	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
	AutoLocker<Vnode> nodeLocker(vnode);

	if (vnode && vnode->IsBusy()) {
		nodeLocker.Unlock();
		rw_lock_read_unlock(&sVnodeLock);
		if (!canWait || --tries < 0) {
			// vnode doesn't seem to become unbusy
			dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO
				" is not becoming unbusy!\n", mountID, vnodeID);
			return B_BUSY;
		}
		snooze(5000); // 5 ms
		rw_lock_read_lock(&sVnodeLock);
		goto restart;
	}

	TRACE(("get_vnode: tried to lookup vnode, got %p\n", vnode));

	status_t status;

	if (vnode) {
		if (vnode->ref_count == 0) {
			// this vnode has been unused before
			vnode_used(vnode);
		}
		inc_vnode_ref_count(vnode);

		nodeLocker.Unlock();
		rw_lock_read_unlock(&sVnodeLock);
	} else {
		// we need to create a new vnode and read it in
		rw_lock_read_unlock(&sVnodeLock);
			// unlock -- create_new_vnode_and_lock() write-locks on success
		bool nodeCreated;
		status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
			nodeCreated);
		if (status != B_OK)
			return status;

		if (!nodeCreated) {
			rw_lock_read_lock(&sVnodeLock);
			rw_lock_write_unlock(&sVnodeLock);
			goto restart;
		}

		rw_lock_write_unlock(&sVnodeLock);

		int type;
		uint32 flags;
		status = FS_MOUNT_CALL(vnode->mount, get_vnode, vnodeID, vnode, &type,
			&flags, reenter);
		if (status == B_OK && vnode->private_node == NULL)
			status = B_BAD_VALUE;

		bool gotNode = status == B_OK;
		bool publishSpecialSubNode = false;
		if (gotNode) {
			vnode->SetType(type);
			publishSpecialSubNode = is_special_node_type(type)
				&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
		}

		if (gotNode && publishSpecialSubNode)
			status = create_special_sub_node(vnode, flags);

		if (status != B_OK) {
			if (gotNode)
				FS_CALL(vnode, put_vnode, reenter);

			rw_lock_write_lock(&sVnodeLock);
			sVnodeTable->Remove(vnode);
			remove_vnode_from_mount_list(vnode, vnode->mount);
			rw_lock_write_unlock(&sVnodeLock);

			free(vnode);
			return status;
		}

		rw_lock_read_lock(&sVnodeLock);
		vnode->Lock();

		vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
		vnode->SetBusy(false);

		vnode->Unlock();
		rw_lock_read_unlock(&sVnodeLock);
	}

	TRACE(("get_vnode: returning %p\n", vnode));

	*_vnode = vnode;
	return B_OK;
}


/*!	\brief Decrements the reference counter of the given vnode and deletes it,
	if the counter dropped to 0.

	The caller must, of course, own a reference to the vnode to call this
	function.
	The caller must not hold the sVnodeLock or the sMountMutex.

	\param vnode the vnode.
*/
static inline void
put_vnode(struct vnode* vnode)
{
	dec_vnode_ref_count(vnode, false, false);
}

static void
free_unused_vnodes(int32 level)
{
	unused_vnodes_check_started();

	if (level == B_NO_LOW_RESOURCE) {
		unused_vnodes_check_done();
		return;
	}

	flush_hot_vnodes();

	// determine how many nodes to free
	uint32 count = 1;
	{
		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);

		switch (level) {
			case B_LOW_RESOURCE_NOTE:
				count = sUnusedVnodes / 100;
				break;
			case B_LOW_RESOURCE_WARNING:
				count = sUnusedVnodes / 10;
				break;
			case B_LOW_RESOURCE_CRITICAL:
				count = sUnusedVnodes;
				break;
		}

		if (count > sUnusedVnodes)
			count = sUnusedVnodes;
	}

	// Write back the modified pages of some unused vnodes and free them.

	for (uint32 i = 0; i < count; i++) {
		ReadLocker vnodesReadLocker(sVnodeLock);

		// get the first node
		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
		struct vnode* vnode = (struct vnode*)list_get_first_item(
			&sUnusedVnodeList);
		unusedVnodesLocker.Unlock();

		if (vnode == NULL)
			break;

		// lock the node
		AutoLocker<Vnode> nodeLocker(vnode);

		// Check whether the node is still unused -- since we only append to the
		// tail of the unused queue, the vnode should still be at its head.
		// Alternatively we could check its ref count for 0 and its busy flag,
		// but if the node is no longer at the head of the queue, it means it
		// has been touched in the meantime, i.e. it is no longer the least
		// recently used unused vnode and we'd rather not free it.
		unusedVnodesLocker.Lock();
		if (vnode != list_get_first_item(&sUnusedVnodeList))
			continue;
		unusedVnodesLocker.Unlock();

		ASSERT(!vnode->IsBusy());

		// grab a reference
		inc_vnode_ref_count(vnode);
		vnode_used(vnode);

		// write back changes and free the node
		nodeLocker.Unlock();
		vnodesReadLocker.Unlock();

		if (vnode->cache != NULL)
			vnode->cache->WriteModified();

		dec_vnode_ref_count(vnode, true, false);
			// this should free the vnode when it's still unused
	}

	unused_vnodes_check_done();
}


/*!	Gets the vnode the given vnode is covering.

	The caller must have \c sVnodeLock read-locked at least.

	The function returns a reference to the retrieved vnode (if any), which the
	caller is responsible for releasing.

	\param vnode The vnode whose covered node shall be returned.
	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
		vnode.
*/
static inline Vnode*
get_covered_vnode_locked(Vnode* vnode)
{
	if (Vnode* coveredNode = vnode->covers) {
		while (coveredNode->covers != NULL)
			coveredNode = coveredNode->covers;

		inc_vnode_ref_count(coveredNode);
		return coveredNode;
	}

	return NULL;
}


/*!	Gets the vnode the given vnode is covering.

	The caller must not hold \c sVnodeLock. Note that this implies a race
	condition, since the situation can change at any time.

	The function returns a reference to the retrieved vnode (if any), which the
	caller is responsible for releasing.

	\param vnode The vnode whose covered node shall be returned.
	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
		vnode.
*/
static inline Vnode*
get_covered_vnode(Vnode* vnode)
{
	if (!vnode->IsCovering())
		return NULL;

	ReadLocker vnodeReadLocker(sVnodeLock);
	return get_covered_vnode_locked(vnode);
}


/*!	Gets the vnode the given vnode is covered by.

	The caller must have \c sVnodeLock read-locked at least.

	The function returns a reference to the retrieved vnode (if any), which the
	caller is responsible for releasing.

	\param vnode The vnode whose covering node shall be returned.
	\return The covering vnode, or \c NULL if the given vnode isn't covered by
		any vnode.
*/
static Vnode*
get_covering_vnode_locked(Vnode* vnode)
{
	if (Vnode* coveringNode = vnode->covered_by) {
		while (coveringNode->covered_by != NULL)
			coveringNode = coveringNode->covered_by;

		inc_vnode_ref_count(coveringNode);
		return coveringNode;
	}

	return NULL;
}


/*!	Gets the vnode the given vnode is covered by.

	The caller must not hold \c sVnodeLock. Note that this implies a race
	condition, since the situation can change at any time.

	The function returns a reference to the retrieved vnode (if any), which the
	caller is responsible for releasing.

	\param vnode The vnode whose covering node shall be returned.
	\return The covering vnode, or \c NULL if the given vnode isn't covered by
		any vnode.
*/
static inline Vnode*
get_covering_vnode(Vnode* vnode)
{
	if (!vnode->IsCovered())
		return NULL;

	ReadLocker vnodeReadLocker(sVnodeLock);
	return get_covering_vnode_locked(vnode);
}


static void
free_unused_vnodes()
{
	free_unused_vnodes(
		low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
			| B_KERNEL_RESOURCE_ADDRESS_SPACE));
}


static void
vnode_low_resource_handler(void* /*data*/, uint32 resources, int32 level)
{
	TRACE(("vnode_low_resource_handler(level = %" B_PRId32 ")\n", level));

	free_unused_vnodes(level);
}


static inline void
put_advisory_locking(struct advisory_locking* locking)
{
	release_sem(locking->lock);
}


/*!	Returns the advisory_locking object of the \a vnode in case it
	has one, and locks it.
	You have to call put_advisory_locking() when you're done with
	it.
	Note, you must not have the vnode mutex locked when calling
	this function.
*/
static struct advisory_locking*
get_advisory_locking(struct vnode* vnode)
{
	rw_lock_read_lock(&sVnodeLock);
	vnode->Lock();

	struct advisory_locking* locking = vnode->advisory_locking;
	sem_id lock = locking != NULL ? locking->lock : B_ERROR;

	vnode->Unlock();
	rw_lock_read_unlock(&sVnodeLock);

	if (lock >= 0)
		lock = acquire_sem(lock);
	if (lock < 0) {
		// This means the locking has been deleted in the meantime
		// or never existed in the first place - otherwise, we would
		// have gotten the lock at some point.
		return NULL;
	}

	return locking;
}


/*!	Creates a locked advisory_locking object, and attaches it to the
	given \a vnode.
	Returns B_OK in case of success - also if the vnode got such an
	object from someone else in the meantime, you'll still get this
	one locked then.
*/
static status_t
create_advisory_locking(struct vnode* vnode)
{
	if (vnode == NULL)
		return B_FILE_ERROR;

	ObjectDeleter<advisory_locking> lockingDeleter;
	struct advisory_locking* locking = NULL;

	while (get_advisory_locking(vnode) == NULL) {
		// no locking object set on the vnode yet, create one
		if (locking == NULL) {
			locking = new(std::nothrow) advisory_locking;
			if (locking == NULL)
				return B_NO_MEMORY;
			lockingDeleter.SetTo(locking);

			locking->wait_sem = create_sem(0, "advisory lock");
			if (locking->wait_sem < 0)
				return locking->wait_sem;

			locking->lock = create_sem(0, "advisory locking");
			if (locking->lock < 0)
				return locking->lock;
		}

		// set our newly created locking object
		ReadLocker _(sVnodeLock);
		AutoLocker<Vnode> nodeLocker(vnode);
		if (vnode->advisory_locking == NULL) {
			vnode->advisory_locking = locking;
			lockingDeleter.Detach();
			return B_OK;
		}
	}

	// The vnode already had a locking object. That's just as well.

	return B_OK;
}


/*! Returns \c true when either \a flock is \c NULL or the \a flock intersects
	with the advisory_lock \a lock.
*/
static bool
advisory_lock_intersects(struct advisory_lock* lock, struct flock* flock)
{
	if (flock == NULL)
		return true;

	return lock->start <= flock->l_start - 1 + flock->l_len
		&& lock->end >= flock->l_start;
}
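

// Example (sketch): the test above treats locks as closed intervals. With
// flock->l_start == 100 and flock->l_len == 50 the requested region covers
// the offsets [100, 149]; an advisory_lock spanning [140, 200] intersects it
// (140 <= 149 && 200 >= 100), while one spanning [0, 99] does not.
#if 0
	struct flock flock;
	flock.l_start = 100;
	flock.l_len = 50;

	struct advisory_lock lock;
	lock.start = 140;
	lock.end = 200;

	bool intersects = advisory_lock_intersects(&lock, &flock);
		// true -- [140, 200] overlaps [100, 149]
#endif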


/*!	Tests whether acquiring a lock would block.
*/
static status_t
test_advisory_lock(struct vnode* vnode, struct flock* flock)
{
	// Remember the requested lock type -- flock->l_type is overwritten with
	// F_UNLCK, which is the answer in case no collision is found.
	const short requestedType = flock->l_type;
	flock->l_type = F_UNLCK;

	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking == NULL)
		return B_OK;

	team_id team = team_get_current_team_id();

	LockList::Iterator iterator = locking->locks.GetIterator();
	while (iterator.HasNext()) {
		struct advisory_lock* lock = iterator.Next();

		if (lock->team != team && advisory_lock_intersects(lock, flock)) {
			// locks do overlap
			if (requestedType != F_RDLCK || !lock->shared) {
				// collision
				flock->l_type = lock->shared ? F_RDLCK : F_WRLCK;
				flock->l_whence = SEEK_SET;
				flock->l_start = lock->start;
				flock->l_len = lock->end - lock->start + 1;
				flock->l_pid = lock->team;
				break;
			}
		}
	}

	put_advisory_locking(locking);
	return B_OK;
}


/*!	Removes the specified lock, or all locks of the calling team
	if \a flock is NULL.
*/
static status_t
release_advisory_lock(struct vnode* vnode, struct flock* flock)
{
	FUNCTION(("release_advisory_lock(vnode = %p, flock = %p)\n", vnode, flock));

	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking == NULL)
		return B_OK;

	// TODO: use the thread ID instead??
	team_id team = team_get_current_team_id();
	pid_t session = thread_get_current_thread()->team->session_id;

	// find matching lock entries

	LockList::Iterator iterator = locking->locks.GetIterator();
	while (iterator.HasNext()) {
		struct advisory_lock* lock = iterator.Next();
		bool removeLock = false;

		if (lock->session == session)
			removeLock = true;
		else if (lock->team == team && advisory_lock_intersects(lock, flock)) {
			bool endsBeyond = false;
			bool startsBefore = false;
			if (flock != NULL) {
				startsBefore = lock->start < flock->l_start;
				endsBeyond = lock->end > flock->l_start - 1 + flock->l_len;
			}

			if (!startsBefore && !endsBeyond) {
				// lock is completely contained in flock
				removeLock = true;
			} else if (startsBefore && !endsBeyond) {
				// cut the end of the lock
				lock->end = flock->l_start - 1;
			} else if (!startsBefore && endsBeyond) {
				// cut the start of the lock
				lock->start = flock->l_start + flock->l_len;
			} else {
				// divide the lock into two locks; allocate with malloc() to
				// match the free() below
				struct advisory_lock* secondLock = (struct advisory_lock*)
					malloc(sizeof(struct advisory_lock));
				if (secondLock == NULL) {
					// TODO: we should probably revert the locks we already
					// changed... (ie. allocate upfront)
					put_advisory_locking(locking);
					return B_NO_MEMORY;
				}

				secondLock->team = lock->team;
				secondLock->session = lock->session;
				// values must already be normalized when getting here
				secondLock->start = flock->l_start + flock->l_len;
				secondLock->end = lock->end;
				secondLock->shared = lock->shared;

				// only now cut the end of the first part, so that the second
				// part keeps the original end offset
				lock->end = flock->l_start - 1;

				locking->locks.Add(secondLock);
			}
		}

		if (removeLock) {
			// this lock is no longer used
			iterator.Remove();
			free(lock);
		}
	}

	bool removeLocking = locking->locks.IsEmpty();
	release_sem_etc(locking->wait_sem, 1, B_RELEASE_ALL);

	put_advisory_locking(locking);

	if (removeLocking) {
		// We can remove the whole advisory locking structure; it's no
		// longer used
		locking = get_advisory_locking(vnode);
		if (locking != NULL) {
			ReadLocker locker(sVnodeLock);
			AutoLocker<Vnode> nodeLocker(vnode);

			// the locking could have been changed in the meantime
			if (locking->locks.IsEmpty()) {
				vnode->advisory_locking = NULL;
				nodeLocker.Unlock();
				locker.Unlock();

				// we've detached the locking from the vnode, so we can
				// safely delete it
				delete locking;
			} else {
				// the locking is in use again
				nodeLocker.Unlock();
				locker.Unlock();
				release_sem_etc(locking->lock, 1, B_DO_NOT_RESCHEDULE);
			}
		}
	}

	return B_OK;
}


/*!	Acquires an advisory lock for the \a vnode. If \a wait is \c true, it
	will wait for the lock to become available, if there are any collisions
	(it will return B_PERMISSION_DENIED in this case if \a wait is \c false).

	If \a session is -1, POSIX semantics are used for this lock. Otherwise,
	BSD flock() semantics are used, that is, all children can unlock the file
	in question (we even allow parents to remove the lock, though, but that
	seems to be in line with what the BSDs are doing).
*/
static status_t
acquire_advisory_lock(struct vnode* vnode, pid_t session, struct flock* flock,
	bool wait)
{
	FUNCTION(("acquire_advisory_lock(vnode = %p, flock = %p, wait = %s)\n",
		vnode, flock, wait ? "yes" : "no"));

	bool shared = flock->l_type == F_RDLCK;
	status_t status = B_OK;

	// TODO: do deadlock detection!

	struct advisory_locking* locking;

	while (true) {
		// if this vnode has an advisory_locking structure attached,
		// lock that one and search for any colliding file lock
		status = create_advisory_locking(vnode);
		if (status != B_OK)
			return status;

		locking = vnode->advisory_locking;
		team_id team = team_get_current_team_id();
		sem_id waitForLock = -1;

		// test for collisions
		LockList::Iterator iterator = locking->locks.GetIterator();
		while (iterator.HasNext()) {
			struct advisory_lock* lock = iterator.Next();

			// TODO: locks from the same team might be joinable!
			if (lock->team != team && advisory_lock_intersects(lock, flock)) {
				// locks do overlap
				if (!shared || !lock->shared) {
					// we need to wait
					waitForLock = locking->wait_sem;
					break;
				}
			}
		}

		if (waitForLock < 0)
			break;

		// We need to wait. Do that or fail now, if we've been asked not to.

		if (!wait) {
			put_advisory_locking(locking);
			return session != -1 ? B_WOULD_BLOCK : B_PERMISSION_DENIED;
		}

		status = switch_sem_etc(locking->lock, waitForLock, 1,
			B_CAN_INTERRUPT, 0);
		if (status != B_OK && status != B_BAD_SEM_ID)
			return status;

		// We have been notified, but we need to re-lock the locking object. So
		// go another round...
	}

	// install new lock

	struct advisory_lock* lock = (struct advisory_lock*)malloc(
		sizeof(struct advisory_lock));
	if (lock == NULL) {
		put_advisory_locking(locking);
		return B_NO_MEMORY;
	}

	lock->team = team_get_current_team_id();
	lock->session = session;
	// values must already be normalized when getting here
	lock->start = flock->l_start;
	lock->end = flock->l_start - 1 + flock->l_len;
	lock->shared = shared;

	locking->locks.Add(lock);
	put_advisory_locking(locking);

	return status;
}
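

// Example (sketch, not compiled): how the two locking flavors map onto
// acquire_advisory_lock(). POSIX fcntl(F_SETLK/F_SETLKW) style locks pass -1
// as session; BSD flock() style locks pass the caller's session ID, which
// release_advisory_lock() later matches against. The flock structure must
// already be normalized (see normalize_flock() below), which turns
// l_len == 0 into "up to the end of the file".
#if 0
static status_t
example_lock_range_posix(struct vnode* vnode, off_t start, off_t length,
	bool wait)
{
	struct flock flock;
	flock.l_type = F_WRLCK;
	flock.l_whence = SEEK_SET;
	flock.l_start = start;
	flock.l_len = length;

	// session == -1 selects POSIX semantics
	return acquire_advisory_lock(vnode, -1, &flock, wait);
}
#endif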


/*!	Normalizes the \a flock structure to make it easier to compare the
	structure with others. The l_start and l_len fields are set to absolute
	values according to the l_whence field.
*/
static status_t
normalize_flock(struct file_descriptor* descriptor, struct flock* flock)
{
	switch (flock->l_whence) {
		case SEEK_SET:
			break;
		case SEEK_CUR:
			flock->l_start += descriptor->pos;
			break;
		case SEEK_END:
		{
			struct vnode* vnode = descriptor->u.vnode;
			struct stat stat;
			status_t status;

			if (!HAS_FS_CALL(vnode, read_stat))
				return B_UNSUPPORTED;

			status = FS_CALL(vnode, read_stat, &stat);
			if (status != B_OK)
				return status;

			flock->l_start += stat.st_size;
			break;
		}
		default:
			return B_BAD_VALUE;
	}

	if (flock->l_start < 0)
		flock->l_start = 0;
	if (flock->l_len == 0)
		flock->l_len = OFF_MAX;

	// don't let the offset and length overflow
	if (flock->l_start > 0 && OFF_MAX - flock->l_start < flock->l_len)
		flock->l_len = OFF_MAX - flock->l_start;

	if (flock->l_len < 0) {
		// a negative length reverses the region
		flock->l_start += flock->l_len;
		flock->l_len = -flock->l_len;
	}

	return B_OK;
}
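

// Example (sketch, assuming a descriptor positioned at pos == 1000):
// normalize_flock() at work.
#if 0
	struct flock flock;
	flock.l_whence = SEEK_CUR;
	flock.l_start = 200;
	flock.l_len = -100;

	normalize_flock(descriptor, &flock);
		// 1. SEEK_CUR adds descriptor->pos:   l_start == 1200
		// 2. the negative length reverses the
		//    region:                          l_start == 1100, l_len == 100
		// i.e. the lock covers the byte range [1100, 1199], now expressed in
		// SEEK_SET terms.
#endif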


static void
replace_vnode_if_disconnected(struct fs_mount* mount,
	struct vnode* vnodeToDisconnect, struct vnode*& vnode,
	struct vnode* fallBack, bool lockRootLock)
{
	struct vnode* givenVnode = vnode;
	bool vnodeReplaced = false;

	ReadLocker vnodeReadLocker(sVnodeLock);

	if (lockRootLock)
		mutex_lock(&sIOContextRootLock);

	while (vnode != NULL && vnode->mount == mount
		&& (vnodeToDisconnect == NULL || vnodeToDisconnect == vnode)) {
		if (vnode->covers != NULL) {
			// redirect the vnode to the covered vnode
			vnode = vnode->covers;
		} else
			vnode = fallBack;

		vnodeReplaced = true;
	}

	// If we've replaced the node, grab a reference for the new one.
	if (vnodeReplaced && vnode != NULL)
		inc_vnode_ref_count(vnode);

	if (lockRootLock)
		mutex_unlock(&sIOContextRootLock);

	vnodeReadLocker.Unlock();

	if (vnodeReplaced)
		put_vnode(givenVnode);
}
1887 
1888 
1889 /*!	Disconnects all file descriptors that are associated with the
1890 	\a vnodeToDisconnect, or if this is NULL, all vnodes of the specified
1891 	\a mount object.
1892 
1893 	Note that there may still be ongoing accesses after this function has
1894 	been called: they won't be interrupted if they were already in
1895 	progress, but any subsequent access will fail.
1896 
1897 	This is not a cheap function and should be used with care and rarely.
1898 	TODO: there is currently no means to stop a blocking read/write!
1899 */
1900 static void
1901 disconnect_mount_or_vnode_fds(struct fs_mount* mount,
1902 	struct vnode* vnodeToDisconnect)
1903 {
1904 	// iterate over all teams and peek into their file descriptors
1905 	TeamListIterator teamIterator;
1906 	while (Team* team = teamIterator.Next()) {
1907 		BReference<Team> teamReference(team, true);
1908 
1909 		// lock the I/O context
1910 		io_context* context = team->io_context;
1911 		MutexLocker contextLocker(context->io_mutex);
1912 
1913 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->root,
1914 			sRoot, true);
1915 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->cwd,
1916 			sRoot, false);
1917 
1918 		for (uint32 i = 0; i < context->table_size; i++) {
1919 			if (struct file_descriptor* descriptor = context->fds[i]) {
1920 				inc_fd_ref_count(descriptor);
1921 
1922 				// if this descriptor points at this mount, we
1923 				// need to disconnect it to be able to unmount
1924 				struct vnode* vnode = fd_vnode(descriptor);
1925 				if (vnodeToDisconnect != NULL) {
1926 					if (vnode == vnodeToDisconnect)
1927 						disconnect_fd(descriptor);
1928 				} else if ((vnode != NULL && vnode->mount == mount)
1929 					|| (vnode == NULL && descriptor->u.mount == mount))
1930 					disconnect_fd(descriptor);
1931 
1932 				put_fd(descriptor);
1933 			}
1934 		}
1935 	}
1936 }
1937 
1938 
1939 /*!	\brief Gets the root node of the current IO context.
1940 	If \a kernel is \c true, the kernel IO context will be used.
1941 	The caller obtains a reference to the returned node.
1942 */
1943 struct vnode*
1944 get_root_vnode(bool kernel)
1945 {
1946 	if (!kernel) {
1947 		// Get the root vnode from the IO context
1948 		struct io_context* context = get_current_io_context(kernel);
1949 
1950 		mutex_lock(&sIOContextRootLock);
1951 
1952 		struct vnode* root = context->root;
1953 		if (root != NULL)
1954 			inc_vnode_ref_count(root);
1955 
1956 		mutex_unlock(&sIOContextRootLock);
1957 
1958 		if (root != NULL)
1959 			return root;
1960 
1961 		// That should never happen.
1962 		dprintf("get_root_vnode(): IO context for team %" B_PRId32 " doesn't "
1963 			"have a root\n", team_get_current_team_id());
1964 	}
1965 
1966 	inc_vnode_ref_count(sRoot);
1967 	return sRoot;
1968 }
1969 
1970 
1971 /*!	\brief Gets the directory path and leaf name for a given path.
1972 
1973 	The supplied \a path is transformed to refer to the directory part of
1974 	the entry identified by the original path, and into the buffer \a filename
1975 	the leaf name of the original entry is written.
1976 	Neither the returned path nor the leaf name can be expected to be
1977 	canonical.
1978 
1979 	\param path The path to be analyzed. Must be able to store at least one
1980 		   additional character.
1981 	\param filename The buffer into which the leaf name will be written.
1982 		   Must be of size B_FILE_NAME_LENGTH at least.
1983 	\return \c B_OK, if everything went fine, \c B_NAME_TOO_LONG, if the leaf
1984 		   name is longer than \c B_FILE_NAME_LENGTH, or \c B_ENTRY_NOT_FOUND,
1985 		   if the given path name is empty.
1986 */
1987 static status_t
1988 get_dir_path_and_leaf(char* path, char* filename)
1989 {
1990 	if (*path == '\0')
1991 		return B_ENTRY_NOT_FOUND;
1992 
1993 	char* last = strrchr(path, '/');
1994 		// '/' are not allowed in file names!
1995 
1996 	FUNCTION(("get_dir_path_and_leaf(path = %s)\n", path));
1997 
1998 	if (last == NULL) {
1999 		// this path is a single segment with no '/' in it
2000 		// e.g. "foo"
2001 		if (strlcpy(filename, path, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2002 			return B_NAME_TOO_LONG;
2003 
2004 		strcpy(path, ".");
2005 	} else {
2006 		last++;
2007 		if (last[0] == '\0') {
2008 			// special case: the path ends in one or more '/' - remove them
2009 			while (*--last == '/' && last != path);
2010 			last[1] = '\0';
2011 
2012 			if (last == path && last[0] == '/') {
2013 				// This path points to the root of the file system
2014 				strcpy(filename, ".");
2015 				return B_OK;
2016 			}
2017 			for (; last != path && *(last - 1) != '/'; last--);
2018 				// rewind to the start of the leaf before the '/'
2019 		}
2020 
2021 		// normal leaf: replace the leaf portion of the path with a '.'
2022 		if (strlcpy(filename, last, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2023 			return B_NAME_TOO_LONG;
2024 
2025 		last[0] = '.';
2026 		last[1] = '\0';
2027 	}
2028 	return B_OK;
2029 }
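
/* Illustrative inputs and results for get_dir_path_and_leaf() (the paths
   are hypothetical):
	"/a/b/c"  ->  path "/a/b/.", filename "c"
	"/a/b/"   ->  path "/a/.",   filename "b"
	"foo"     ->  path ".",      filename "foo"
	"/"       ->  path "/",      filename "."
*/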
2030 
2031 
2032 static status_t
2033 entry_ref_to_vnode(dev_t mountID, ino_t directoryID, const char* name,
2034 	bool traverse, bool kernel, struct vnode** _vnode)
2035 {
2036 	char clonedName[B_FILE_NAME_LENGTH + 1];
2037 	if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2038 		return B_NAME_TOO_LONG;
2039 
2040 	// get the directory vnode and let vnode_path_to_vnode() do the rest
2041 	struct vnode* directory;
2042 
2043 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
2044 	if (status < 0)
2045 		return status;
2046 
2047 	return vnode_path_to_vnode(directory, clonedName, traverse, 0, kernel,
2048 		_vnode, NULL);
2049 }
2050 
2051 
2052 /*!	Looks up the entry with name \a name in the directory represented by \a dir
2053 	and returns the respective vnode.
2054 	On success a reference to the vnode is acquired for the caller.
2055 */
2056 static status_t
2057 lookup_dir_entry(struct vnode* dir, const char* name, struct vnode** _vnode)
2058 {
2059 	ino_t id;
2060 	bool missing;
2061 
2062 	if (dir->mount->entry_cache.Lookup(dir->id, name, id, missing)) {
2063 		return missing ? B_ENTRY_NOT_FOUND
2064 			: get_vnode(dir->device, id, _vnode, true, false);
2065 	}
2066 
2067 	status_t status = FS_CALL(dir, lookup, name, &id);
2068 	if (status != B_OK)
2069 		return status;
2070 
2071 	// The lookup() hook calls get_vnode() or publish_vnode(), so we do already
2072 	// have a reference and just need to look the node up.
2073 	rw_lock_read_lock(&sVnodeLock);
2074 	*_vnode = lookup_vnode(dir->device, id);
2075 	rw_lock_read_unlock(&sVnodeLock);
2076 
2077 	if (*_vnode == NULL) {
2078 		panic("lookup_dir_entry(): could not lookup vnode (mountid 0x%" B_PRIx32
2079 			" vnid 0x%" B_PRIx64 ")\n", dir->device, id);
2080 		return B_ENTRY_NOT_FOUND;
2081 	}
2082 
2083 //	ktrace_printf("lookup_dir_entry(): dir: %p (%ld, %lld), name: \"%s\" -> "
2084 //		"%p (%ld, %lld)", dir, dir->mount->id, dir->id, name, *_vnode,
2085 //		(*_vnode)->mount->id, (*_vnode)->id);
2086 
2087 	return B_OK;
2088 }
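
/* A sketch of the entry cache fast path above (hypothetical values): once
   "foo" in directory 17 has been looked up, Lookup(17, "foo", id, missing)
   returns the cached inode ID without calling into the file system; a
   cached "missing" entry short-circuits to B_ENTRY_NOT_FOUND the same
   way. */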
2089 
2090 
2091 /*!	Returns the vnode for the relative path starting at the specified \a vnode.
2092 	\a path must not be NULL.
2093 	If it returns successfully, \a path contains the name of the last path
2094 	component. This function clobbers the buffer pointed to by \a path only
2095 	if it does contain more than one component.
2096 	Note that this function decrements the ref_count of the starting
2097 	\a vnode, whether it succeeds or not!
2098 */
2099 static status_t
2100 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2101 	int count, struct io_context* ioContext, struct vnode** _vnode,
2102 	ino_t* _parentID)
2103 {
2104 	status_t status = B_OK;
2105 	ino_t lastParentID = vnode->id;
2106 
2107 	FUNCTION(("vnode_path_to_vnode(vnode = %p, path = %s)\n", vnode, path));
2108 
2109 	if (path == NULL) {
2110 		put_vnode(vnode);
2111 		return B_BAD_VALUE;
2112 	}
2113 
2114 	if (*path == '\0') {
2115 		put_vnode(vnode);
2116 		return B_ENTRY_NOT_FOUND;
2117 	}
2118 
2119 	while (true) {
2120 		struct vnode* nextVnode;
2121 		char* nextPath;
2122 
2123 		TRACE(("vnode_path_to_vnode: top of loop. p = %p, p = '%s'\n", path,
2124 			path));
2125 
2126 		// done?
2127 		if (path[0] == '\0')
2128 			break;
2129 
2130 		// walk to find the next path component ("path" will point to a single
2131 		// path component), and filter out multiple slashes
2132 		for (nextPath = path + 1; *nextPath != '\0' && *nextPath != '/';
2133 				nextPath++);
2134 
2135 		if (*nextPath == '/') {
2136 			*nextPath = '\0';
2137 			do
2138 				nextPath++;
2139 			while (*nextPath == '/');
2140 		}
2141 
2142 		// If the '..' is at a covering vnode, move to the covered
2143 		// vnode so we pass the '..' path to the underlying filesystem.
2144 		// Also prevent breaking the root of the IO context.
2145 		if (strcmp("..", path) == 0) {
2146 			if (vnode == ioContext->root) {
2147 				// Attempted prison break! Keep it contained.
2148 				path = nextPath;
2149 				continue;
2150 			}
2151 
2152 			if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2153 				nextVnode = coveredVnode;
2154 				put_vnode(vnode);
2155 				vnode = nextVnode;
2156 			}
2157 		}
2158 
2159 		// check if vnode is really a directory
2160 		if (status == B_OK && !S_ISDIR(vnode->Type()))
2161 			status = B_NOT_A_DIRECTORY;
2162 
2163 		// Check if we have the right to search the current directory vnode.
2164 		// If a file system doesn't have the access() function, we assume that
2165 		// searching a directory is always allowed
2166 		if (status == B_OK && HAS_FS_CALL(vnode, access))
2167 			status = FS_CALL(vnode, access, X_OK);
2168 
2169 		// Tell the filesystem to get the vnode of this path component (if we
2170 		// got the permission from the call above)
2171 		if (status == B_OK)
2172 			status = lookup_dir_entry(vnode, path, &nextVnode);
2173 
2174 		if (status != B_OK) {
2175 			put_vnode(vnode);
2176 			return status;
2177 		}
2178 
2179 		// If the new node is a symbolic link, resolve it (if we've been told
2180 		// to do it)
2181 		if (S_ISLNK(nextVnode->Type())
2182 			&& (traverseLeafLink || nextPath[0] != '\0')) {
2183 			size_t bufferSize;
2184 			char* buffer;
2185 
2186 			TRACE(("traverse link\n"));
2187 
2188 			// it's not exactly nice style using goto in this way, but hey,
2189 			// it works :-/
2190 			if (count + 1 > B_MAX_SYMLINKS) {
2191 				status = B_LINK_LIMIT;
2192 				goto resolve_link_error;
2193 			}
2194 
2195 			buffer = (char*)malloc(bufferSize = B_PATH_NAME_LENGTH);
2196 			if (buffer == NULL) {
2197 				status = B_NO_MEMORY;
2198 				goto resolve_link_error;
2199 			}
2200 
2201 			if (HAS_FS_CALL(nextVnode, read_symlink)) {
2202 				bufferSize--;
2203 				status = FS_CALL(nextVnode, read_symlink, buffer, &bufferSize);
2204 				// null-terminate
2205 				if (status >= 0)
2206 					buffer[bufferSize] = '\0';
2207 			} else
2208 				status = B_BAD_VALUE;
2209 
2210 			if (status != B_OK) {
2211 				free(buffer);
2212 
2213 		resolve_link_error:
2214 				put_vnode(vnode);
2215 				put_vnode(nextVnode);
2216 
2217 				return status;
2218 			}
2219 			put_vnode(nextVnode);
2220 
2221 			// Check if we start from the root directory or the current
2222 			// directory ("vnode" still points to that one).
2223 			// Cut off all leading slashes if it's the root directory
2224 			path = buffer;
2225 			bool absoluteSymlink = false;
2226 			if (path[0] == '/') {
2227 				// we don't need the old directory anymore
2228 				put_vnode(vnode);
2229 
2230 				while (*++path == '/')
2231 					;
2232 
2233 				mutex_lock(&sIOContextRootLock);
2234 				vnode = ioContext->root;
2235 				inc_vnode_ref_count(vnode);
2236 				mutex_unlock(&sIOContextRootLock);
2237 
2238 				absoluteSymlink = true;
2239 			}
2240 
2241 			inc_vnode_ref_count(vnode);
2242 			// balance the next recursion - we will decrement the
2243 			// ref_count of the vnode, whether we succeed or not
2244 
2245 			if (absoluteSymlink && *path == '\0') {
2246 				// symlink was just "/"
2247 				nextVnode = vnode;
2248 			} else {
2249 				status = vnode_path_to_vnode(vnode, path, true, count + 1,
2250 					ioContext, &nextVnode, &lastParentID);
2251 			}
2252 
2253 			free(buffer);
2254 
2255 			if (status != B_OK) {
2256 				put_vnode(vnode);
2257 				return status;
2258 			}
2259 		} else
2260 			lastParentID = vnode->id;
2261 
2262 		// decrease the ref count on the old dir we just looked up into
2263 		put_vnode(vnode);
2264 
2265 		path = nextPath;
2266 		vnode = nextVnode;
2267 
2268 		// see if we hit a covered node
2269 		if (Vnode* coveringNode = get_covering_vnode(vnode)) {
2270 			put_vnode(vnode);
2271 			vnode = coveringNode;
2272 		}
2273 	}
2274 
2275 	*_vnode = vnode;
2276 	if (_parentID)
2277 		*_parentID = lastParentID;
2278 
2279 	return B_OK;
2280 }
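
/* A minimal usage sketch for vnode_path_to_vnode() (hypothetical
   variables, error handling abbreviated):

	char path[] = "sub/dir/file";
	struct vnode* found;
	ino_t parentID;
	inc_vnode_ref_count(startDir);
		// the call always consumes one reference to the starting vnode
	status_t status = vnode_path_to_vnode(startDir, path, true, 0, kernel,
		&found, &parentID);
	if (status == B_OK)
		put_vnode(found);

   Note that the path buffer must be writable; it is clobbered while the
   components are walked. */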
2281 
2282 
2283 static status_t
2284 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2285 	int count, bool kernel, struct vnode** _vnode, ino_t* _parentID)
2286 {
2287 	return vnode_path_to_vnode(vnode, path, traverseLeafLink, count,
2288 		get_current_io_context(kernel), _vnode, _parentID);
2289 }
2290 
2291 
2292 static status_t
2293 path_to_vnode(char* path, bool traverseLink, struct vnode** _vnode,
2294 	ino_t* _parentID, bool kernel)
2295 {
2296 	struct vnode* start = NULL;
2297 
2298 	FUNCTION(("path_to_vnode(path = \"%s\")\n", path));
2299 
2300 	if (!path)
2301 		return B_BAD_VALUE;
2302 
2303 	if (*path == '\0')
2304 		return B_ENTRY_NOT_FOUND;
2305 
2306 	// figure out if we need to start at root or at cwd
2307 	if (*path == '/') {
2308 		if (sRoot == NULL) {
2309 			// we're a bit early, aren't we?
2310 			return B_ERROR;
2311 		}
2312 
2313 		while (*++path == '/')
2314 			;
2315 		start = get_root_vnode(kernel);
2316 
2317 		if (*path == '\0') {
2318 			*_vnode = start;
2319 			return B_OK;
2320 		}
2321 
2322 	} else {
2323 		struct io_context* context = get_current_io_context(kernel);
2324 
2325 		mutex_lock(&context->io_mutex);
2326 		start = context->cwd;
2327 		if (start != NULL)
2328 			inc_vnode_ref_count(start);
2329 		mutex_unlock(&context->io_mutex);
2330 
2331 		if (start == NULL)
2332 			return B_ERROR;
2333 	}
2334 
2335 	return vnode_path_to_vnode(start, path, traverseLink, 0, kernel, _vnode,
2336 		_parentID);
2337 }
2338 
2339 
2340 /*! Returns the vnode of the next-to-last segment of the path, and returns
2341 	the last path component in \a filename.
2342 	The path buffer must be able to store at least one additional character.
2343 */
2344 static status_t
2345 path_to_dir_vnode(char* path, struct vnode** _vnode, char* filename,
2346 	bool kernel)
2347 {
2348 	status_t status = get_dir_path_and_leaf(path, filename);
2349 	if (status != B_OK)
2350 		return status;
2351 
2352 	return path_to_vnode(path, true, _vnode, NULL, kernel);
2353 }
2354 
2355 
2356 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2357 		   to by a FD + path pair.
2358 
2359 	\a path must be given in either case. \a fd might be omitted, in which
2360 	case \a path is either an absolute path or one relative to the current
2361 	directory. If both are supplied and \a path is relative, it is reckoned off
2362 	of the directory referred to by \a fd. If \a path is absolute \a fd is
2363 	ignored.
2364 
2365 	The caller has the responsibility to call put_vnode() on the returned
2366 	directory vnode.
2367 
2368 	\param fd The FD. May be < 0.
2369 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2370 	       is modified by this function. It must have at least room for a
2371 	       string one character longer than the path it contains.
2372 	\param _vnode A pointer to a variable the directory vnode shall be written
2373 		   into.
2374 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2375 		   the leaf name of the specified entry will be written.
2376 	\param kernel \c true, if invoked from inside the kernel, \c false if
2377 		   invoked from userland.
2378 	\return \c B_OK, if everything went fine, another error code otherwise.
2379 */
2380 static status_t
2381 fd_and_path_to_dir_vnode(int fd, char* path, struct vnode** _vnode,
2382 	char* filename, bool kernel)
2383 {
2384 	if (!path)
2385 		return B_BAD_VALUE;
2386 	if (*path == '\0')
2387 		return B_ENTRY_NOT_FOUND;
2388 	if (fd < 0)
2389 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2390 
2391 	status_t status = get_dir_path_and_leaf(path, filename);
2392 	if (status != B_OK)
2393 		return status;
2394 
2395 	return fd_and_path_to_vnode(fd, path, true, _vnode, NULL, kernel);
2396 }
2397 
2398 
2399 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2400 		   to by a vnode + path pair.
2401 
2402 	\a path must be given in either case. \a vnode might be omitted, in which
2403 	case \a path is either an absolute path or one relative to the current
2404 	directory. If both are supplied and \a path is relative, it is reckoned off
2405 	of the directory referred to by \a vnode. If \a path is absolute \a vnode is
2406 	ignored.
2407 
2408 	The caller has the responsibility to call put_vnode() on the returned
2409 	directory vnode.
2410 
2411 	\param vnode The vnode. May be \c NULL.
2412 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2413 	       is modified by this function. It must have at least room for a
2414 	       string one character longer than the path it contains.
2415 	\param _vnode A pointer to a variable the directory vnode shall be written
2416 		   into.
2417 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2418 		   the leaf name of the specified entry will be written.
2419 	\param kernel \c true, if invoked from inside the kernel, \c false if
2420 		   invoked from userland.
2421 	\return \c B_OK, if everything went fine, another error code otherwise.
2422 */
2423 static status_t
2424 vnode_and_path_to_dir_vnode(struct vnode* vnode, char* path,
2425 	struct vnode** _vnode, char* filename, bool kernel)
2426 {
2427 	if (!path)
2428 		return B_BAD_VALUE;
2429 	if (*path == '\0')
2430 		return B_ENTRY_NOT_FOUND;
2431 	if (vnode == NULL || path[0] == '/')
2432 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2433 
2434 	status_t status = get_dir_path_and_leaf(path, filename);
2435 	if (status != B_OK)
2436 		return status;
2437 
2438 	inc_vnode_ref_count(vnode);
2439 		// vnode_path_to_vnode() always decrements the ref count
2440 
2441 	return vnode_path_to_vnode(vnode, path, true, 0, kernel, _vnode, NULL);
2442 }
2443 
2444 
2445 /*! Returns a vnode's name in the d_name field of a supplied dirent buffer.
2446 */
2447 static status_t
2448 get_vnode_name(struct vnode* vnode, struct vnode* parent, struct dirent* buffer,
2449 	size_t bufferSize, struct io_context* ioContext)
2450 {
2451 	if (bufferSize < sizeof(struct dirent))
2452 		return B_BAD_VALUE;
2453 
2454 	// See if the vnode is covering another vnode and move to the covered
2455 	// vnode so we get the underlying file system
2456 	VNodePutter vnodePutter;
2457 	if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2458 		vnode = coveredVnode;
2459 		vnodePutter.SetTo(vnode);
2460 	}
2461 
2462 	if (HAS_FS_CALL(vnode, get_vnode_name)) {
2463 		// The FS supports getting the name of a vnode.
2464 		if (FS_CALL(vnode, get_vnode_name, buffer->d_name,
2465 			(char*)buffer + bufferSize - buffer->d_name) == B_OK)
2466 			return B_OK;
2467 	}
2468 
2469 	// The FS doesn't support getting the name of a vnode. So we search the
2470 	// parent directory for the vnode, if the caller let us.
2471 
2472 	if (parent == NULL || !HAS_FS_CALL(parent, read_dir))
2473 		return B_UNSUPPORTED;
2474 
2475 	void* cookie;
2476 
2477 	status_t status = FS_CALL(parent, open_dir, &cookie);
2478 	if (status >= B_OK) {
2479 		while (true) {
2480 			uint32 num = 1;
2481 			// We use the FS hook directly instead of dir_read(), since we don't
2482 			// want the entries to be fixed. We have already resolved vnode to
2483 			// the covered node.
2484 			status = FS_CALL(parent, read_dir, cookie, buffer, bufferSize,
2485 				&num);
2486 			if (status != B_OK)
2487 				break;
2488 			if (num == 0) {
2489 				status = B_ENTRY_NOT_FOUND;
2490 				break;
2491 			}
2492 
2493 			if (vnode->id == buffer->d_ino) {
2494 				// found correct entry!
2495 				break;
2496 			}
2497 		}
2498 
2499 		FS_CALL(parent, close_dir, cookie);
2500 		FS_CALL(parent, free_dir_cookie, cookie);
2501 	}
2502 	return status;
2503 }
2504 
2505 
2506 static status_t
2507 get_vnode_name(struct vnode* vnode, struct vnode* parent, char* name,
2508 	size_t nameSize, bool kernel)
2509 {
2510 	char buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2511 	struct dirent* dirent = (struct dirent*)buffer;
2512 
2513 	status_t status = get_vnode_name(vnode, parent, dirent, sizeof(buffer),
2514 		get_current_io_context(kernel));
2515 	if (status != B_OK)
2516 		return status;
2517 
2518 	if (strlcpy(name, dirent->d_name, nameSize) >= nameSize)
2519 		return B_BUFFER_OVERFLOW;
2520 
2521 	return B_OK;
2522 }
2523 
2524 
2525 /*!	Gets the full path to a given directory vnode.
2526 	It uses the fs_get_vnode_name() call to get the name of a vnode; if a
2527 	file system doesn't support this call, it will fall back to iterating
2528 	through the parent directory to get the name of the child.
2529 
2530 	To protect against circular loops, it supports a maximum tree depth
2531 	of 256 levels.
2532 
2533 	Note that the path may not be correct by the time this function
2534 	returns! It uses no locking to keep the returned path valid, as paths
2535 	aren't safe anyway: the path to a file can change at any time.
2536 
2537 	It might be a good idea, though, to check if the returned path exists
2538 	in the calling function (it's not done here for efficiency reasons).
2539 */
2540 static status_t
2541 dir_vnode_to_path(struct vnode* vnode, char* buffer, size_t bufferSize,
2542 	bool kernel)
2543 {
2544 	FUNCTION(("dir_vnode_to_path(%p, %p, %lu)\n", vnode, buffer, bufferSize));
2545 
2546 	if (vnode == NULL || buffer == NULL || bufferSize == 0)
2547 		return B_BAD_VALUE;
2548 
2549 	if (!S_ISDIR(vnode->Type()))
2550 		return B_NOT_A_DIRECTORY;
2551 
2552 	char* path = buffer;
2553 	int32 insert = bufferSize;
2554 	int32 maxLevel = 256;
2555 	int32 length;
2556 	status_t status = B_OK;
2557 	struct io_context* ioContext = get_current_io_context(kernel);
2558 
2559 	// we don't use get_vnode() here because this call is more
2560 	// efficient and does all we need from get_vnode()
2561 	inc_vnode_ref_count(vnode);
2562 
2563 	path[--insert] = '\0';
2564 		// the path is filled right to left
2565 
2566 	while (true) {
2567 		// If the node is the context's root, bail out. Otherwise resolve mount
2568 		// points.
2569 		if (vnode == ioContext->root)
2570 			break;
2571 
2572 		if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2573 			put_vnode(vnode);
2574 			vnode = coveredVnode;
2575 		}
2576 
2577 		// lookup the parent vnode
2578 		struct vnode* parentVnode;
2579 		status = lookup_dir_entry(vnode, "..", &parentVnode);
2580 		if (status != B_OK)
2581 			goto out;
2582 
2583 		if (parentVnode == vnode) {
2584 			// The caller apparently got their hands on a node outside of their
2585 			// context's root. Now we've hit the global root.
2586 			put_vnode(parentVnode);
2587 			break;
2588 		}
2589 
2590 		// get the node's name
2591 		char nameBuffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2592 			// also used for fs_read_dir()
2593 		char* name = &((struct dirent*)nameBuffer)->d_name[0];
2594 		status = get_vnode_name(vnode, parentVnode, (struct dirent*)nameBuffer,
2595 			sizeof(nameBuffer), ioContext);
2596 
2597 		// release the current vnode, we only need its parent from now on
2598 		put_vnode(vnode);
2599 		vnode = parentVnode;
2600 
2601 		if (status != B_OK)
2602 			goto out;
2603 
2604 		// TODO: add an explicit check for loops in about 10 levels to do
2605 		// real loop detection
2606 
2607 		// don't go deeper than 'maxLevel' to prevent circular loops
2608 		if (maxLevel-- < 0) {
2609 			status = B_LINK_LIMIT;
2610 			goto out;
2611 		}
2612 
2613 		// add the name in front of the current path
2614 		name[B_FILE_NAME_LENGTH - 1] = '\0';
2615 		length = strlen(name);
2616 		insert -= length;
2617 		if (insert <= 0) {
2618 			status = B_RESULT_NOT_REPRESENTABLE;
2619 			goto out;
2620 		}
2621 		memcpy(path + insert, name, length);
2622 		path[--insert] = '/';
2623 	}
2624 
2625 	// the root dir will result in an empty path: fix it
2626 	if (path[insert] == '\0')
2627 		path[--insert] = '/';
2628 
2629 	TRACE(("  path is: %s\n", path + insert));
2630 
2631 	// move the path to the start of the buffer
2632 	length = bufferSize - insert;
2633 	memmove(buffer, path + insert, length);
2634 
2635 out:
2636 	put_vnode(vnode);
2637 	return status;
2638 }
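
/* A sketch of how dir_vnode_to_path() builds its result (the path
   "/boot/home" is hypothetical): the buffer is filled right to left, one
   parent lookup at a time,

	"..............\0"  ->  "........./home\0"  ->  "....../boot/home\0"

   prepending a '/' before each component, and the final string is
   memmove()d to the start of the buffer. */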
2639 
2640 
2641 /*!	Checks the length of every path component, and adds a '.'
2642 	if the path ends in a slash.
2643 	The given path buffer must be able to store at least one
2644 	additional character.
2645 */
2646 static status_t
2647 check_path(char* to)
2648 {
2649 	int32 length = 0;
2650 
2651 	// check length of every path component
2652 
2653 	while (*to) {
2654 		char* begin;
2655 		if (*to == '/')
2656 			to++, length++;
2657 
2658 		begin = to;
2659 		while (*to != '/' && *to)
2660 			to++, length++;
2661 
2662 		if (to - begin > B_FILE_NAME_LENGTH)
2663 			return B_NAME_TOO_LONG;
2664 	}
2665 
2666 	if (length == 0)
2667 		return B_ENTRY_NOT_FOUND;
2668 
2669 	// complete path if there is a slash at the end
2670 
2671 	if (*(to - 1) == '/') {
2672 		if (length > B_PATH_NAME_LENGTH - 2)
2673 			return B_NAME_TOO_LONG;
2674 
2675 		to[0] = '.';
2676 		to[1] = '\0';
2677 	}
2678 
2679 	return B_OK;
2680 }
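
/* Illustrative inputs and results for check_path() (hypothetical):
	"/foo/bar"   ->  unchanged, B_OK
	"/foo/bar/"  ->  "/foo/bar/.", B_OK
	""           ->  B_ENTRY_NOT_FOUND
   A single component longer than B_FILE_NAME_LENGTH yields
   B_NAME_TOO_LONG. */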
2681 
2682 
2683 static struct file_descriptor*
2684 get_fd_and_vnode(int fd, struct vnode** _vnode, bool kernel)
2685 {
2686 	struct file_descriptor* descriptor
2687 		= get_fd(get_current_io_context(kernel), fd);
2688 	if (descriptor == NULL)
2689 		return NULL;
2690 
2691 	struct vnode* vnode = fd_vnode(descriptor);
2692 	if (vnode == NULL) {
2693 		put_fd(descriptor);
2694 		return NULL;
2695 	}
2696 
2697 	// ToDo: when we can close a file descriptor at any point, investigate
2698 	//	if this is still valid to do (accessing the vnode without ref_count
2699 	//	or locking)
2700 	*_vnode = vnode;
2701 	return descriptor;
2702 }
2703 
2704 
2705 static struct vnode*
2706 get_vnode_from_fd(int fd, bool kernel)
2707 {
2708 	struct file_descriptor* descriptor;
2709 	struct vnode* vnode;
2710 
2711 	descriptor = get_fd(get_current_io_context(kernel), fd);
2712 	if (descriptor == NULL)
2713 		return NULL;
2714 
2715 	vnode = fd_vnode(descriptor);
2716 	if (vnode != NULL)
2717 		inc_vnode_ref_count(vnode);
2718 
2719 	put_fd(descriptor);
2720 	return vnode;
2721 }
2722 
2723 
2724 /*!	Gets the vnode from an FD + path combination. If \a fd is lower than zero,
2725 	only the path will be considered. In this case, the \a path must not be
2726 	NULL.
2727 	If \a fd is a valid file descriptor, \a path may be NULL for directories,
2728 	and should be NULL for files.
2729 */
2730 static status_t
2731 fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
2732 	struct vnode** _vnode, ino_t* _parentID, bool kernel)
2733 {
2734 	if (fd < 0 && !path)
2735 		return B_BAD_VALUE;
2736 
2737 	if (path != NULL && *path == '\0')
2738 		return B_ENTRY_NOT_FOUND;
2739 
2740 	if (fd < 0 || (path != NULL && path[0] == '/')) {
2741 		// no FD or absolute path
2742 		return path_to_vnode(path, traverseLeafLink, _vnode, _parentID, kernel);
2743 	}
2744 
2745 	// FD only, or FD + relative path
2746 	struct vnode* vnode = get_vnode_from_fd(fd, kernel);
2747 	if (!vnode)
2748 		return B_FILE_ERROR;
2749 
2750 	if (path != NULL) {
2751 		return vnode_path_to_vnode(vnode, path, traverseLeafLink, 0, kernel,
2752 			_vnode, _parentID);
2753 	}
2754 
2755 	// there is no relative path to take into account
2756 
2757 	*_vnode = vnode;
2758 	if (_parentID)
2759 		*_parentID = -1;
2760 
2761 	return B_OK;
2762 }
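
/* The dispatch above gives openat()-like semantics; illustrative
   combinations (hypothetical):
	fd < 0,  path "/a/b"  ->  resolved from the root resp. cwd via
		path_to_vnode()
	fd >= 0, path "a/b"   ->  resolved relative to the directory behind fd
	fd >= 0, path NULL    ->  the vnode behind fd itself (no parent known,
		*_parentID is set to -1)
*/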
2763 
2764 
2765 static int
2766 get_new_fd(int type, struct fs_mount* mount, struct vnode* vnode,
2767 	void* cookie, int openMode, bool kernel)
2768 {
2769 	struct file_descriptor* descriptor;
2770 	int fd;
2771 
2772 	// If the vnode is mandatory-locked, we don't allow creating a new
2773 	// file or directory descriptor for it
2774 	if (vnode && vnode->mandatory_locked_by != NULL
2775 		&& (type == FDTYPE_FILE || type == FDTYPE_DIR))
2776 		return B_BUSY;
2777 
2778 	descriptor = alloc_fd();
2779 	if (!descriptor)
2780 		return B_NO_MEMORY;
2781 
2782 	if (vnode)
2783 		descriptor->u.vnode = vnode;
2784 	else
2785 		descriptor->u.mount = mount;
2786 	descriptor->cookie = cookie;
2787 
2788 	switch (type) {
2789 		// vnode types
2790 		case FDTYPE_FILE:
2791 			descriptor->ops = &sFileOps;
2792 			break;
2793 		case FDTYPE_DIR:
2794 			descriptor->ops = &sDirectoryOps;
2795 			break;
2796 		case FDTYPE_ATTR:
2797 			descriptor->ops = &sAttributeOps;
2798 			break;
2799 		case FDTYPE_ATTR_DIR:
2800 			descriptor->ops = &sAttributeDirectoryOps;
2801 			break;
2802 
2803 		// mount types
2804 		case FDTYPE_INDEX_DIR:
2805 			descriptor->ops = &sIndexDirectoryOps;
2806 			break;
2807 		case FDTYPE_QUERY:
2808 			descriptor->ops = &sQueryOps;
2809 			break;
2810 
2811 		default:
2812 			panic("get_new_fd() called with unknown type %d\n", type);
2813 			break;
2814 	}
2815 	descriptor->type = type;
2816 	descriptor->open_mode = openMode;
2817 
2818 	io_context* context = get_current_io_context(kernel);
2819 	fd = new_fd(context, descriptor);
2820 	if (fd < 0) {
2821 		free(descriptor);
2822 		return B_NO_MORE_FDS;
2823 	}
2824 
2825 	mutex_lock(&context->io_mutex);
2826 	fd_set_close_on_exec(context, fd, (openMode & O_CLOEXEC) != 0);
2827 	mutex_unlock(&context->io_mutex);
2828 
2829 	return fd;
2830 }
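
/* A minimal sketch of how get_new_fd() is typically combined with the FS
   open hook (hypothetical variables, error handling abbreviated):

	void* cookie;
	status_t status = FS_CALL(vnode, open, openMode, &cookie);
	if (status != B_OK)
		return status;

	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
	if (fd < 0) {
		FS_CALL(vnode, close, cookie);
		FS_CALL(vnode, free_cookie, cookie);
	}
	return fd;
*/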
2831 
2832 
2833 /*!	In-place normalizes \a path. It's otherwise semantically equivalent to
2834 	vfs_normalize_path(). See there for more documentation.
2835 */
2836 static status_t
2837 normalize_path(char* path, size_t pathSize, bool traverseLink, bool kernel)
2838 {
2839 	VNodePutter dirPutter;
2840 	struct vnode* dir = NULL;
2841 	status_t error;
2842 
2843 	for (int i = 0; i < B_MAX_SYMLINKS; i++) {
2844 		// get dir vnode + leaf name
2845 		struct vnode* nextDir;
2846 		char leaf[B_FILE_NAME_LENGTH];
2847 		error = vnode_and_path_to_dir_vnode(dir, path, &nextDir, leaf, kernel);
2848 		if (error != B_OK)
2849 			return error;
2850 
2851 		dir = nextDir;
2852 		strcpy(path, leaf);
2853 		dirPutter.SetTo(dir);
2854 
2855 		// get file vnode, if we shall resolve links
2856 		bool fileExists = false;
2857 		struct vnode* fileVnode;
2858 		VNodePutter fileVnodePutter;
2859 		if (traverseLink) {
2860 			inc_vnode_ref_count(dir);
2861 			if (vnode_path_to_vnode(dir, path, false, 0, kernel, &fileVnode,
2862 					NULL) == B_OK) {
2863 				fileVnodePutter.SetTo(fileVnode);
2864 				fileExists = true;
2865 			}
2866 		}
2867 
2868 		if (!fileExists || !traverseLink || !S_ISLNK(fileVnode->Type())) {
2869 			// we're done -- construct the path
2870 			bool hasLeaf = true;
2871 			if (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0) {
2872 				// special cases "." and ".." -- get the dir, forget the leaf
2873 				inc_vnode_ref_count(dir);
2874 				error = vnode_path_to_vnode(dir, leaf, false, 0, kernel,
2875 					&nextDir, NULL);
2876 				if (error != B_OK)
2877 					return error;
2878 				dir = nextDir;
2879 				dirPutter.SetTo(dir);
2880 				hasLeaf = false;
2881 			}
2882 
2883 			// get the directory path
2884 			error = dir_vnode_to_path(dir, path, B_PATH_NAME_LENGTH, kernel);
2885 			if (error != B_OK)
2886 				return error;
2887 
2888 			// append the leaf name
2889 			if (hasLeaf) {
2890 				// insert a directory separator if this is not the file system
2891 				// root
2892 				if ((strcmp(path, "/") != 0
2893 					&& strlcat(path, "/", pathSize) >= pathSize)
2894 					|| strlcat(path, leaf, pathSize) >= pathSize) {
2895 					return B_NAME_TOO_LONG;
2896 				}
2897 			}
2898 
2899 			return B_OK;
2900 		}
2901 
2902 		// read link
2903 		if (HAS_FS_CALL(fileVnode, read_symlink)) {
2904 			size_t bufferSize = B_PATH_NAME_LENGTH - 1;
2905 			error = FS_CALL(fileVnode, read_symlink, path, &bufferSize);
2906 			if (error != B_OK)
2907 				return error;
2908 			path[bufferSize] = '\0';
2909 		} else
2910 			return B_BAD_VALUE;
2911 	}
2912 
2913 	return B_LINK_LIMIT;
2914 }
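
/* An example for normalize_path() (hypothetical setup): with a cwd of
   "/boot/home" and "lnk" being a symlink to "sub/../file", the input
   "lnk" normalizes to "/boot/home/file" -- each iteration resolves one
   leaf symlink (at most B_MAX_SYMLINKS times), and the final directory is
   turned back into an absolute path via dir_vnode_to_path(). */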
2915 
2916 
2917 static status_t
2918 resolve_covered_parent(struct vnode* parent, dev_t* _device, ino_t* _node,
2919 	struct io_context* ioContext)
2920 {
2921 	// Make sure the IO context root is not bypassed.
2922 	if (parent == ioContext->root) {
2923 		*_device = parent->device;
2924 		*_node = parent->id;
2925 		return B_OK;
2926 	}
2927 
2928 	inc_vnode_ref_count(parent);
2929 		// vnode_path_to_vnode() puts the node
2930 
2931 	// ".." is guaranteed not to be clobbered by this call
2932 	struct vnode* vnode;
2933 	status_t status = vnode_path_to_vnode(parent, (char*)"..", false, 0,
2934 		ioContext, &vnode, NULL);
2935 	if (status == B_OK) {
2936 		*_device = vnode->device;
2937 		*_node = vnode->id;
2938 		put_vnode(vnode);
2939 	}
2940 
2941 	return status;
2942 }
2943 
2944 
2945 #ifdef ADD_DEBUGGER_COMMANDS
2946 
2947 
2948 static void
2949 _dump_advisory_locking(advisory_locking* locking)
2950 {
2951 	if (locking == NULL)
2952 		return;
2953 
2954 	kprintf("   lock:        %" B_PRId32, locking->lock);
2955 	kprintf("   wait_sem:    %" B_PRId32, locking->wait_sem);
2956 
2957 	int32 index = 0;
2958 	LockList::Iterator iterator = locking->locks.GetIterator();
2959 	while (iterator.HasNext()) {
2960 		struct advisory_lock* lock = iterator.Next();
2961 
2962 		kprintf("   [%2" B_PRId32 "] team:   %" B_PRId32 "\n", index++, lock->team);
2963 		kprintf("        start:  %" B_PRIdOFF "\n", lock->start);
2964 		kprintf("        end:    %" B_PRIdOFF "\n", lock->end);
2965 		kprintf("        shared? %s\n", lock->shared ? "yes" : "no");
2966 	}
2967 }
2968 
2969 
2970 static void
2971 _dump_mount(struct fs_mount* mount)
2972 {
2973 	kprintf("MOUNT: %p\n", mount);
2974 	kprintf(" id:            %" B_PRIdDEV "\n", mount->id);
2975 	kprintf(" device_name:   %s\n", mount->device_name);
2976 	kprintf(" root_vnode:    %p\n", mount->root_vnode);
2977 	kprintf(" covers:        %p\n", mount->root_vnode->covers);
2978 	kprintf(" partition:     %p\n", mount->partition);
2979 	kprintf(" lock:          %p\n", &mount->rlock);
2980 	kprintf(" flags:        %s%s\n", mount->unmounting ? " unmounting" : "",
2981 		mount->owns_file_device ? " owns_file_device" : "");
2982 
2983 	fs_volume* volume = mount->volume;
2984 	while (volume != NULL) {
2985 		kprintf(" volume %p:\n", volume);
2986 		kprintf("  layer:            %" B_PRId32 "\n", volume->layer);
2987 		kprintf("  private_volume:   %p\n", volume->private_volume);
2988 		kprintf("  ops:              %p\n", volume->ops);
2989 		kprintf("  file_system:      %p\n", volume->file_system);
2990 		kprintf("  file_system_name: %s\n", volume->file_system_name);
2991 		volume = volume->super_volume;
2992 	}
2993 
2994 	set_debug_variable("_volume", (addr_t)mount->volume->private_volume);
2995 	set_debug_variable("_root", (addr_t)mount->root_vnode);
2996 	set_debug_variable("_covers", (addr_t)mount->root_vnode->covers);
2997 	set_debug_variable("_partition", (addr_t)mount->partition);
2998 }
2999 
3000 
3001 static bool
3002 debug_prepend_vnode_name_to_path(char* buffer, size_t& bufferSize,
3003 	const char* name)
3004 {
3005 	bool insertSlash = buffer[bufferSize] != '\0';
3006 	size_t nameLength = strlen(name);
3007 
3008 	if (bufferSize < nameLength + (insertSlash ? 1 : 0))
3009 		return false;
3010 
3011 	if (insertSlash)
3012 		buffer[--bufferSize] = '/';
3013 
3014 	bufferSize -= nameLength;
3015 	memcpy(buffer + bufferSize, name, nameLength);
3016 
3017 	return true;
3018 }
3019 
3020 
3021 static bool
3022 debug_prepend_vnode_id_to_path(char* buffer, size_t& bufferSize, dev_t devID,
3023 	ino_t nodeID)
3024 {
3025 	if (bufferSize == 0)
3026 		return false;
3027 
3028 	bool insertSlash = buffer[bufferSize] != '\0';
3029 	if (insertSlash)
3030 		buffer[--bufferSize] = '/';
3031 
3032 	size_t size = snprintf(buffer, bufferSize,
3033 		"<%" B_PRIdDEV ",%" B_PRIdINO ">", devID, nodeID);
3034 	if (size >= bufferSize) {
3035 		if (insertSlash)
3036 			bufferSize++;
3037 		return false;
3038 	}
3039 
3040 	if (size < bufferSize)
3041 		memmove(buffer + bufferSize - size, buffer, size);
3042 
3043 	bufferSize -= size;
3044 	return true;
3045 }
3046 
3047 
3048 static char*
3049 debug_resolve_vnode_path(struct vnode* vnode, char* buffer, size_t bufferSize,
3050 	bool& _truncated)
3051 {
3052 	// null-terminate the path
3053 	buffer[--bufferSize] = '\0';
3054 
3055 	while (true) {
3056 		while (vnode->covers != NULL)
3057 			vnode = vnode->covers;
3058 
3059 		if (vnode == sRoot) {
3060 			_truncated = bufferSize == 0;
3061 			if (!_truncated)
3062 				buffer[--bufferSize] = '/';
3063 			return buffer + bufferSize;
3064 		}
3065 
3066 		// resolve the name
3067 		ino_t dirID;
3068 		const char* name = vnode->mount->entry_cache.DebugReverseLookup(
3069 			vnode->id, dirID);
3070 		if (name == NULL) {
3071 			// Failed to resolve the name -- prepend "<dev,node>/".
3072 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3073 				vnode->mount->id, vnode->id);
3074 			return buffer + bufferSize;
3075 		}
3076 
3077 		// prepend the name
3078 		if (!debug_prepend_vnode_name_to_path(buffer, bufferSize, name)) {
3079 			_truncated = true;
3080 			return buffer + bufferSize;
3081 		}
3082 
3083 		// resolve the directory node
3084 		struct vnode* nextVnode = lookup_vnode(vnode->mount->id, dirID);
3085 		if (nextVnode == NULL) {
3086 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3087 				vnode->mount->id, dirID);
3088 			return buffer + bufferSize;
3089 		}
3090 
3091 		vnode = nextVnode;
3092 	}
3093 }
3094 
3095 
3096 static void
3097 _dump_vnode(struct vnode* vnode, bool printPath)
3098 {
3099 	kprintf("VNODE: %p\n", vnode);
3100 	kprintf(" device:        %" B_PRIdDEV "\n", vnode->device);
3101 	kprintf(" id:            %" B_PRIdINO "\n", vnode->id);
3102 	kprintf(" ref_count:     %" B_PRId32 "\n", vnode->ref_count);
3103 	kprintf(" private_node:  %p\n", vnode->private_node);
3104 	kprintf(" mount:         %p\n", vnode->mount);
3105 	kprintf(" covered_by:    %p\n", vnode->covered_by);
3106 	kprintf(" covers:        %p\n", vnode->covers);
3107 	kprintf(" cache:         %p\n", vnode->cache);
3108 	kprintf(" type:          %#" B_PRIx32 "\n", vnode->Type());
3109 	kprintf(" flags:         %s%s%s\n", vnode->IsRemoved() ? "r" : "-",
3110 		vnode->IsBusy() ? "b" : "-", vnode->IsUnpublished() ? "u" : "-");
3111 	kprintf(" advisory_lock: %p\n", vnode->advisory_locking);
3112 
3113 	_dump_advisory_locking(vnode->advisory_locking);
3114 
3115 	if (printPath) {
3116 		void* buffer = debug_malloc(B_PATH_NAME_LENGTH);
3117 		if (buffer != NULL) {
3118 			bool truncated;
3119 			char* path = debug_resolve_vnode_path(vnode, (char*)buffer,
3120 				B_PATH_NAME_LENGTH, truncated);
3121 			if (path != NULL) {
3122 				kprintf(" path:          ");
3123 				if (truncated)
3124 					kputs("<truncated>/");
3125 				kputs(path);
3126 				kputs("\n");
3127 			} else
3128 				kprintf("Failed to resolve vnode path.\n");
3129 
3130 			debug_free(buffer);
3131 		} else
3132 			kprintf("Failed to allocate memory for constructing the path.\n");
3133 	}
3134 
3135 	set_debug_variable("_node", (addr_t)vnode->private_node);
3136 	set_debug_variable("_mount", (addr_t)vnode->mount);
3137 	set_debug_variable("_covered_by", (addr_t)vnode->covered_by);
3138 	set_debug_variable("_covers", (addr_t)vnode->covers);
3139 	set_debug_variable("_adv_lock", (addr_t)vnode->advisory_locking);
3140 }
3141 
3142 
3143 static int
3144 dump_mount(int argc, char** argv)
3145 {
3146 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3147 		kprintf("usage: %s [id|address]\n", argv[0]);
3148 		return 0;
3149 	}
3150 
3151 	ulong val = parse_expression(argv[1]);
3152 	uint32 id = val;
3153 
3154 	struct fs_mount* mount = sMountsTable->Lookup(id);
3155 	if (mount == NULL) {
3156 		if (IS_USER_ADDRESS(id)) {
3157 			kprintf("fs_mount not found\n");
3158 			return 0;
3159 		}
3160 		mount = (fs_mount*)val;
3161 	}
3162 
3163 	_dump_mount(mount);
3164 	return 0;
3165 }
3166 
3167 
3168 static int
3169 dump_mounts(int argc, char** argv)
3170 {
3171 	if (argc != 1) {
3172 		kprintf("usage: %s\n", argv[0]);
3173 		return 0;
3174 	}
3175 
3176 	kprintf("%-*s    id %-*s   %-*s   %-*s   fs_name\n",
3177 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "root",
3178 		B_PRINTF_POINTER_WIDTH, "covers", B_PRINTF_POINTER_WIDTH, "cookie");
3179 
3180 	struct fs_mount* mount;
3181 
3182 	MountTable::Iterator iterator(sMountsTable);
3183 	while (iterator.HasNext()) {
3184 		mount = iterator.Next();
3185 		kprintf("%p%4" B_PRIdDEV " %p %p %p %s\n", mount, mount->id, mount->root_vnode,
3186 			mount->root_vnode->covers, mount->volume->private_volume,
3187 			mount->volume->file_system_name);
3188 
3189 		fs_volume* volume = mount->volume;
3190 		while (volume->super_volume != NULL) {
3191 			volume = volume->super_volume;
3192 			kprintf("                                     %p %s\n",
3193 				volume->private_volume, volume->file_system_name);
3194 		}
3195 	}
3196 
3197 	return 0;
3198 }
3199 
3200 
3201 static int
3202 dump_vnode(int argc, char** argv)
3203 {
3204 	bool printPath = false;
3205 	int argi = 1;
3206 	if (argc >= 2 && strcmp(argv[argi], "-p") == 0) {
3207 		printPath = true;
3208 		argi++;
3209 	}
3210 
3211 	if (argi >= argc || argi + 2 < argc) {
3212 		print_debugger_command_usage(argv[0]);
3213 		return 0;
3214 	}
3215 
3216 	struct vnode* vnode = NULL;
3217 
3218 	if (argi + 1 == argc) {
3219 		vnode = (struct vnode*)parse_expression(argv[argi]);
3220 		if (IS_USER_ADDRESS(vnode)) {
3221 			kprintf("invalid vnode address\n");
3222 			return 0;
3223 		}
3224 		_dump_vnode(vnode, printPath);
3225 		return 0;
3226 	}
3227 
3228 	dev_t device = parse_expression(argv[argi]);
3229 	ino_t id = parse_expression(argv[argi + 1]);
3230 
3231 	VnodeTable::Iterator iterator(sVnodeTable);
3232 	while (iterator.HasNext()) {
3233 		vnode = iterator.Next();
3234 		if (vnode->id != id || vnode->device != device)
3235 			continue;
3236 
3237 		_dump_vnode(vnode, printPath);
3238 	}
3239 
3240 	return 0;
3241 }
3242 
3243 
3244 static int
3245 dump_vnodes(int argc, char** argv)
3246 {
3247 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3248 		kprintf("usage: %s [device]\n", argv[0]);
3249 		return 0;
3250 	}
3251 
3252 	// restrict dumped nodes to a certain device if requested
3253 	dev_t device = parse_expression(argv[1]);
3254 
3255 	struct vnode* vnode;
3256 
3257 	kprintf("%-*s   dev     inode  ref %-*s   %-*s   %-*s   flags\n",
3258 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache",
3259 		B_PRINTF_POINTER_WIDTH, "fs-node", B_PRINTF_POINTER_WIDTH, "locking");
3260 
3261 	VnodeTable::Iterator iterator(sVnodeTable);
3262 	while (iterator.HasNext()) {
3263 		vnode = iterator.Next();
3264 		if (vnode->device != device)
3265 			continue;
3266 
3267 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO "%5" B_PRId32 " %p %p %p %s%s%s\n",
3268 			vnode, vnode->device, vnode->id, vnode->ref_count, vnode->cache,
3269 			vnode->private_node, vnode->advisory_locking,
3270 			vnode->IsRemoved() ? "r" : "-", vnode->IsBusy() ? "b" : "-",
3271 			vnode->IsUnpublished() ? "u" : "-");
3272 	}
3273 
3274 	return 0;
3275 }
3276 
3277 
3278 static int
3279 dump_vnode_caches(int argc, char** argv)
3280 {
3281 	struct vnode* vnode;
3282 
3283 	if (argc > 2 || (argc == 2 && !strcmp(argv[1], "--help"))) {
3284 		kprintf("usage: %s [device]\n", argv[0]);
3285 		return 0;
3286 	}
3287 
3288 	// restrict dumped nodes to a certain device if requested
3289 	dev_t device = -1;
3290 	if (argc > 1)
3291 		device = parse_expression(argv[1]);
3292 
3293 	kprintf("%-*s   dev     inode %-*s       size   pages\n",
3294 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache");
3295 
3296 	VnodeTable::Iterator iterator(sVnodeTable);
3297 	while (iterator.HasNext()) {
3298 		vnode = iterator.Next();
3299 		if (vnode->cache == NULL)
3300 			continue;
3301 		if (device != -1 && vnode->device != device)
3302 			continue;
3303 
3304 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO " %p %8" B_PRIdOFF "%8" B_PRId32 "\n",
3305 			vnode, vnode->device, vnode->id, vnode->cache,
3306 			(vnode->cache->virtual_end + B_PAGE_SIZE - 1) / B_PAGE_SIZE,
3307 			vnode->cache->page_count);
3308 	}
3309 
3310 	return 0;
3311 }
3312 
3313 
3314 int
3315 dump_io_context(int argc, char** argv)
3316 {
3317 	if (argc > 2 || (argc == 2 && !strcmp(argv[1], "--help"))) {
3318 		kprintf("usage: %s [team-id|address]\n", argv[0]);
3319 		return 0;
3320 	}
3321 
3322 	struct io_context* context = NULL;
3323 
3324 	if (argc > 1) {
3325 		ulong num = parse_expression(argv[1]);
3326 		if (IS_KERNEL_ADDRESS(num))
3327 			context = (struct io_context*)num;
3328 		else {
3329 			Team* team = team_get_team_struct_locked(num);
3330 			if (team == NULL) {
3331 				kprintf("could not find team with ID %lu\n", num);
3332 				return 0;
3333 			}
3334 			context = (struct io_context*)team->io_context;
3335 		}
3336 	} else
3337 		context = get_current_io_context(true);
3338 
3339 	kprintf("I/O CONTEXT: %p\n", context);
3340 	kprintf(" root vnode:\t%p\n", context->root);
3341 	kprintf(" cwd vnode:\t%p\n", context->cwd);
3342 	kprintf(" used fds:\t%" B_PRIu32 "\n", context->num_used_fds);
3343 	kprintf(" max fds:\t%" B_PRIu32 "\n", context->table_size);
3344 
3345 	if (context->num_used_fds) {
3346 		kprintf("   no.  type    %*s  ref  open  mode         pos    %*s\n",
3347 			B_PRINTF_POINTER_WIDTH, "ops", B_PRINTF_POINTER_WIDTH, "cookie");
3348 	}
3349 
3350 	for (uint32 i = 0; i < context->table_size; i++) {
3351 		struct file_descriptor* fd = context->fds[i];
3352 		if (fd == NULL)
3353 			continue;
3354 
3355 		kprintf("  %3" B_PRIu32 ":  %4" B_PRId32 "  %p  %3" B_PRId32 "  %4"
3356 			B_PRIu32 "  %4" B_PRIx32 "  %10" B_PRIdOFF "  %p  %s %p\n", i,
3357 			fd->type, fd->ops, fd->ref_count, fd->open_count, fd->open_mode,
3358 			fd->pos, fd->cookie,
3359 			fd->type >= FDTYPE_INDEX && fd->type <= FDTYPE_QUERY
3360 				? "mount" : "vnode",
3361 			fd->u.vnode);
3362 	}
3363 
3364 	kprintf(" used monitors:\t%" B_PRIu32 "\n", context->num_monitors);
3365 	kprintf(" max monitors:\t%" B_PRIu32 "\n", context->max_monitors);
3366 
3367 	set_debug_variable("_cwd", (addr_t)context->cwd);
3368 
3369 	return 0;
3370 }
3371 
3372 
3373 int
3374 dump_vnode_usage(int argc, char** argv)
3375 {
3376 	if (argc != 1) {
3377 		kprintf("usage: %s\n", argv[0]);
3378 		return 0;
3379 	}
3380 
3381 	kprintf("Unused vnodes: %" B_PRIu32 " (max unused %" B_PRIu32 ")\n",
3382 		sUnusedVnodes, kMaxUnusedVnodes);
3383 
3384 	uint32 count = sVnodeTable->CountElements();
3385 
3386 	kprintf("%" B_PRIu32 " vnodes total (%" B_PRIu32 " in use).\n", count,
3387 		count - sUnusedVnodes);
3388 	return 0;
3389 }
3390 
3391 #endif	// ADD_DEBUGGER_COMMANDS
3392 
3393 
3394 /*!	Clears memory specified by an iovec array.
3395 */
3396 static void
3397 zero_iovecs(const iovec* vecs, size_t vecCount, size_t bytes)
3398 {
3399 	for (size_t i = 0; i < vecCount && bytes > 0; i++) {
3400 		size_t length = std::min(vecs[i].iov_len, bytes);
3401 		memset(vecs[i].iov_base, 0, length);
3402 		bytes -= length;
3403 	}
3404 }
3405 
3406 
3407 /*!	Does the dirty work of combining the file_io_vecs with the iovecs
3408 	and calls the file system hooks to read/write the request to disk.
3409 */
3410 static status_t
3411 common_file_io_vec_pages(struct vnode* vnode, void* cookie,
3412 	const file_io_vec* fileVecs, size_t fileVecCount, const iovec* vecs,
3413 	size_t vecCount, uint32* _vecIndex, size_t* _vecOffset, size_t* _numBytes,
3414 	bool doWrite)
3415 {
3416 	if (fileVecCount == 0) {
3417 		// There are no file vecs at this offset, so we're obviously trying
3418 		// to access the file outside of its bounds
3419 		return B_BAD_VALUE;
3420 	}
3421 
3422 	size_t numBytes = *_numBytes;
3423 	uint32 fileVecIndex;
3424 	size_t vecOffset = *_vecOffset;
3425 	uint32 vecIndex = *_vecIndex;
3426 	status_t status;
3427 	size_t size;
3428 
3429 	if (!doWrite && vecOffset == 0) {
3430 		// now directly read the data from the device
3431 		// the first file_io_vec can be read directly
3432 
3433 		if (fileVecs[0].length < (off_t)numBytes)
3434 			size = fileVecs[0].length;
3435 		else
3436 			size = numBytes;
3437 
3438 		if (fileVecs[0].offset >= 0) {
3439 			status = FS_CALL(vnode, read_pages, cookie, fileVecs[0].offset,
3440 				&vecs[vecIndex], vecCount - vecIndex, &size);
3441 		} else {
3442 			// sparse read
3443 			zero_iovecs(&vecs[vecIndex], vecCount - vecIndex, size);
3444 			status = B_OK;
3445 		}
3446 		if (status != B_OK)
3447 			return status;
3448 
3449 		// TODO: this is a work-around for buggy device drivers!
3450 		//	When our own drivers honour the length, we can:
3451 		//	a) also use this direct I/O for writes (otherwise, it would
3452 		//	   overwrite precious data)
3453 		//	b) panic if the term below is true (at least for writes)
3454 		if ((off_t)size > fileVecs[0].length) {
3455 			//dprintf("warning: device driver %p doesn't respect total length "
3456 			//	"in read_pages() call!\n", ref->device);
3457 			size = fileVecs[0].length;
3458 		}
3459 
3460 		ASSERT((off_t)size <= fileVecs[0].length);
3461 
3462 		// If the file portion was contiguous, we're already done now
3463 		if (size == numBytes)
3464 			return B_OK;
3465 
3466 		// if we reached the end of the file, we can return as well
3467 		if ((off_t)size != fileVecs[0].length) {
3468 			*_numBytes = size;
3469 			return B_OK;
3470 		}
3471 
3472 		fileVecIndex = 1;
3473 
3474 		// first, find out where we have to continue in our iovecs
3475 		for (; vecIndex < vecCount; vecIndex++) {
3476 			if (size < vecs[vecIndex].iov_len)
3477 				break;
3478 
3479 			size -= vecs[vecIndex].iov_len;
3480 		}
3481 
3482 		vecOffset = size;
3483 	} else {
3484 		fileVecIndex = 0;
3485 		size = 0;
3486 	}
3487 
3488 	// Too bad, let's process the rest of the file_io_vecs
3489 
3490 	size_t totalSize = size;
3491 	size_t bytesLeft = numBytes - size;
3492 
3493 	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
3494 		const file_io_vec &fileVec = fileVecs[fileVecIndex];
3495 		off_t fileOffset = fileVec.offset;
3496 		off_t fileLeft = min_c(fileVec.length, (off_t)bytesLeft);
3497 
3498 		TRACE(("FILE VEC [%" B_PRIu32 "] length %" B_PRIdOFF "\n", fileVecIndex,
3499 			fileLeft));
3500 
3501 		// process the complete fileVec
3502 		while (fileLeft > 0) {
3503 			iovec tempVecs[MAX_TEMP_IO_VECS];
3504 			uint32 tempCount = 0;
3505 
3506 			// size tracks how much of what is left of the current fileVec
3507 			// (fileLeft) has been assigned to tempVecs
3508 			size = 0;
3509 
3510 			// assign what is left of the current fileVec to the tempVecs
3511 			for (size = 0; (off_t)size < fileLeft && vecIndex < vecCount
3512 					&& tempCount < MAX_TEMP_IO_VECS;) {
3513 				// try to satisfy one iovec per iteration (or as much as
3514 				// possible)
3515 
3516 				// bytes left of the current iovec
3517 				size_t vecLeft = vecs[vecIndex].iov_len - vecOffset;
3518 				if (vecLeft == 0) {
3519 					vecOffset = 0;
3520 					vecIndex++;
3521 					continue;
3522 				}
3523 
3524 				TRACE(("fill vec %" B_PRIu32 ", offset = %lu, size = %lu\n",
3525 					vecIndex, vecOffset, size));
3526 
3527 				// actually available bytes
3528 				size_t tempVecSize = min_c(vecLeft, fileLeft - size);
3529 
3530 				tempVecs[tempCount].iov_base
3531 					= (void*)((addr_t)vecs[vecIndex].iov_base + vecOffset);
3532 				tempVecs[tempCount].iov_len = tempVecSize;
3533 				tempCount++;
3534 
3535 				size += tempVecSize;
3536 				vecOffset += tempVecSize;
3537 			}
3538 
3539 			size_t bytes = size;
3540 
3541 			if (fileOffset == -1) {
3542 				if (doWrite) {
3543 					panic("sparse write attempt: vnode %p", vnode);
3544 					status = B_IO_ERROR;
3545 				} else {
3546 					// sparse read
3547 					zero_iovecs(tempVecs, tempCount, bytes);
3548 					status = B_OK;
3549 				}
3550 			} else if (doWrite) {
3551 				status = FS_CALL(vnode, write_pages, cookie, fileOffset,
3552 					tempVecs, tempCount, &bytes);
3553 			} else {
3554 				status = FS_CALL(vnode, read_pages, cookie, fileOffset,
3555 					tempVecs, tempCount, &bytes);
3556 			}
3557 			if (status != B_OK)
3558 				return status;
3559 
3560 			totalSize += bytes;
3561 			bytesLeft -= size;
3562 			if (fileOffset >= 0)
3563 				fileOffset += size;
3564 			fileLeft -= size;
3565 			//dprintf("-> file left = %Lu\n", fileLeft);
3566 
3567 			if (size != bytes || vecIndex >= vecCount) {
3568 				// there are no more bytes or iovecs, let's bail out
3569 				*_numBytes = totalSize;
3570 				return B_OK;
3571 			}
3572 		}
3573 	}
3574 
3575 	*_vecIndex = vecIndex;
3576 	*_vecOffset = vecOffset;
3577 	*_numBytes = totalSize;
3578 	return B_OK;
3579 }
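
/* A worked (hypothetical) example for common_file_io_vec_pages(): a read
   of 8192 bytes described by the two file_io_vecs

	{ offset = 4096, length = 4096 }, { offset = -1, length = 4096 }

   into a single 8192 byte iovec first reads 4096 bytes directly via
   read_pages(), then, since the second extent is sparse (offset == -1),
   zeroes the remaining 4096 bytes of the iovec instead of touching the
   device. */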
3580 
3581 
3582 static bool
3583 is_user_in_group(gid_t gid)
3584 {
3585 	if (gid == getegid())
3586 		return true;
3587 
3588 	gid_t groups[NGROUPS_MAX];
3589 	int groupCount = getgroups(NGROUPS_MAX, groups);
3590 	for (int i = 0; i < groupCount; i++) {
3591 		if (gid == groups[i])
3592 			return true;
3593 	}
3594 
3595 	return false;
3596 }
3597 
3598 
3599 static status_t
3600 free_io_context(io_context* context)
3601 {
3602 	uint32 i;
3603 
3604 	TIOC(FreeIOContext(context));
3605 
3606 	if (context->root)
3607 		put_vnode(context->root);
3608 
3609 	if (context->cwd)
3610 		put_vnode(context->cwd);
3611 
3612 	mutex_lock(&context->io_mutex);
3613 
3614 	for (i = 0; i < context->table_size; i++) {
3615 		if (struct file_descriptor* descriptor = context->fds[i]) {
3616 			close_fd(descriptor);
3617 			put_fd(descriptor);
3618 		}
3619 	}
3620 
3621 	mutex_destroy(&context->io_mutex);
3622 
3623 	remove_node_monitors(context);
3624 	free(context->fds);
3625 	free(context);
3626 
3627 	return B_OK;
3628 }
3629 
3630 
3631 static status_t
3632 resize_monitor_table(struct io_context* context, const int newSize)
3633 {
3634 	int	status = B_OK;
3635 
3636 	if (newSize <= 0 || newSize > MAX_NODE_MONITORS)
3637 		return B_BAD_VALUE;
3638 
3639 	mutex_lock(&context->io_mutex);
3640 
3641 	if ((size_t)newSize < context->num_monitors) {
3642 		status = B_BUSY;
3643 		goto out;
3644 	}
3645 	context->max_monitors = newSize;
3646 
3647 out:
3648 	mutex_unlock(&context->io_mutex);
3649 	return status;
3650 }
3651 
3652 
3653 //	#pragma mark - public API for file systems
3654 
3655 
3656 extern "C" status_t
3657 new_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3658 	fs_vnode_ops* ops)
3659 {
3660 	FUNCTION(("new_vnode(volume = %p (%" B_PRId32 "), vnodeID = %" B_PRId64
3661 		", node = %p)\n", volume, volume->id, vnodeID, privateNode));
3662 
3663 	if (privateNode == NULL)
3664 		return B_BAD_VALUE;
3665 
3666 	// create the node
3667 	bool nodeCreated;
3668 	struct vnode* vnode;
3669 	status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3670 		nodeCreated);
3671 	if (status != B_OK)
3672 		return status;
3673 
3674 	WriteLocker nodeLocker(sVnodeLock, true);
3675 		// create_new_vnode_and_lock() has locked for us
3676 
3677 	// file system integrity check:
3678 	// test if the vnode already exists and bail out if this is the case!
3679 	if (!nodeCreated) {
3680 		panic("vnode %" B_PRIdDEV ":%" B_PRIdINO " already exists (node = %p, "
3681 			"vnode->node = %p)!", volume->id, vnodeID, privateNode,
3682 			vnode->private_node);
3683 		return B_ERROR;
3684 	}
3685 
3686 	vnode->private_node = privateNode;
3687 	vnode->ops = ops;
3688 	vnode->SetUnpublished(true);
3689 
3690 	TRACE(("returns: %s\n", strerror(status)));
3691 
3692 	return status;
3693 }
3694 
3695 
3696 extern "C" status_t
3697 publish_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3698 	fs_vnode_ops* ops, int type, uint32 flags)
3699 {
3700 	FUNCTION(("publish_vnode()\n"));
3701 
3702 	WriteLocker locker(sVnodeLock);
3703 
3704 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3705 
3706 	bool nodeCreated = false;
3707 	if (vnode == NULL) {
3708 		if (privateNode == NULL)
3709 			return B_BAD_VALUE;
3710 
3711 		// create the node
3712 		locker.Unlock();
3713 			// create_new_vnode_and_lock() will re-lock for us on success
3714 		status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3715 			nodeCreated);
3716 		if (status != B_OK)
3717 			return status;
3718 
3719 		locker.SetTo(sVnodeLock, true);
3720 	}
3721 
3722 	if (nodeCreated) {
3723 		vnode->private_node = privateNode;
3724 		vnode->ops = ops;
3725 		vnode->SetUnpublished(true);
3726 	} else if (vnode->IsBusy() && vnode->IsUnpublished()
3727 		&& vnode->private_node == privateNode && vnode->ops == ops) {
3728 		// already known, but not published
3729 	} else
3730 		return B_BAD_VALUE;
3731 
3732 	bool publishSpecialSubNode = false;
3733 
3734 	vnode->SetType(type);
3735 	vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
3736 	publishSpecialSubNode = is_special_node_type(type)
3737 		&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
3738 
3739 	status_t status = B_OK;
3740 
3741 	// create sub vnodes, if necessary
3742 	if (volume->sub_volume != NULL || publishSpecialSubNode) {
3743 		locker.Unlock();
3744 
3745 		fs_volume* subVolume = volume;
3746 		if (volume->sub_volume != NULL) {
3747 			while (status == B_OK && subVolume->sub_volume != NULL) {
3748 				subVolume = subVolume->sub_volume;
3749 				status = subVolume->ops->create_sub_vnode(subVolume, vnodeID,
3750 					vnode);
3751 			}
3752 		}
3753 
3754 		if (status == B_OK && publishSpecialSubNode)
3755 			status = create_special_sub_node(vnode, flags);
3756 
3757 		if (status != B_OK) {
3758 			// error -- clean up the created sub vnodes
3759 			while (subVolume->super_volume != volume) {
3760 				subVolume = subVolume->super_volume;
3761 				subVolume->ops->delete_sub_vnode(subVolume, vnode);
3762 			}
3763 		}
3764 
3765 		if (status == B_OK) {
3766 			ReadLocker vnodesReadLocker(sVnodeLock);
3767 			AutoLocker<Vnode> nodeLocker(vnode);
3768 			vnode->SetBusy(false);
3769 			vnode->SetUnpublished(false);
3770 		} else {
3771 			locker.Lock();
3772 			sVnodeTable->Remove(vnode);
3773 			remove_vnode_from_mount_list(vnode, vnode->mount);
3774 			free(vnode);
3775 		}
3776 	} else {
3777 		// we still hold the write lock -- mark the node unbusy and published
3778 		vnode->SetBusy(false);
3779 		vnode->SetUnpublished(false);
3780 	}
3781 
3782 	TRACE(("returns: %s\n", strerror(status)));
3783 
3784 	return status;
3785 }
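

// Illustrative sketch: a file system typically pairs the two calls above --
// new_vnode() registers the node busy and unpublished so concurrent lookups
// wait, and publish_vnode() makes it visible once initialization is done.
// "my_inode" and "gMyVnodeOps" are hypothetical placeholders.
#if 0
static status_t
my_fs_register_node(fs_volume* volume, my_inode* inode)
{
	status_t status = new_vnode(volume, inode->id, inode, &gMyVnodeOps);
	if (status != B_OK)
		return status;

	// publish with the node's type bits; on success the vnode loses its
	// busy/unpublished state and becomes visible to lookups
	return publish_vnode(volume, inode->id, inode, &gMyVnodeOps,
		inode->mode & S_IFMT, 0);
}
#endif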
3786 
3787 
3788 extern "C" status_t
3789 get_vnode(fs_volume* volume, ino_t vnodeID, void** _privateNode)
3790 {
3791 	struct vnode* vnode;
3792 
3793 	if (volume == NULL)
3794 		return B_BAD_VALUE;
3795 
3796 	status_t status = get_vnode(volume->id, vnodeID, &vnode, true, true);
3797 	if (status != B_OK)
3798 		return status;
3799 
3800 	// If this is a layered FS, we need to get the node cookie for the requested
3801 	// layer.
3802 	if (HAS_FS_CALL(vnode, get_super_vnode)) {
3803 		fs_vnode resolvedNode;
3804 		status_t status = FS_CALL(vnode, get_super_vnode, volume,
3805 			&resolvedNode);
3806 		if (status != B_OK) {
3807 			panic("get_vnode(): Failed to get super node for vnode %p, "
3808 				"volume: %p", vnode, volume);
3809 			put_vnode(vnode);
3810 			return status;
3811 		}
3812 
3813 		if (_privateNode != NULL)
3814 			*_privateNode = resolvedNode.private_node;
3815 	} else if (_privateNode != NULL)
3816 		*_privateNode = vnode->private_node;
3817 
3818 	return B_OK;
3819 }
3820 
3821 
3822 extern "C" status_t
3823 acquire_vnode(fs_volume* volume, ino_t vnodeID)
3824 {
3825 	struct vnode* vnode;
3826 
3827 	rw_lock_read_lock(&sVnodeLock);
3828 	vnode = lookup_vnode(volume->id, vnodeID);
3829 	rw_lock_read_unlock(&sVnodeLock);
3830 
3831 	if (vnode == NULL)
3832 		return B_BAD_VALUE;
3833 
3834 	inc_vnode_ref_count(vnode);
3835 	return B_OK;
3836 }
3837 
3838 
3839 extern "C" status_t
3840 put_vnode(fs_volume* volume, ino_t vnodeID)
3841 {
3842 	struct vnode* vnode;
3843 
3844 	rw_lock_read_lock(&sVnodeLock);
3845 	vnode = lookup_vnode(volume->id, vnodeID);
3846 	rw_lock_read_unlock(&sVnodeLock);
3847 
3848 	if (vnode == NULL)
3849 		return B_BAD_VALUE;
3850 
3851 	dec_vnode_ref_count(vnode, false, true);
3852 	return B_OK;
3853 }
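

// Illustrative sketch: every successful get_vnode() must eventually be
// balanced by a put_vnode() on the same volume/ID pair. "volume" and
// "inodeID" are hypothetical placeholders.
#if 0
void* privateNode;
if (get_vnode(volume, inodeID, &privateNode) == B_OK) {
	// ... use privateNode ...
	put_vnode(volume, inodeID);
}
#endif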
3854 
3855 
3856 extern "C" status_t
3857 remove_vnode(fs_volume* volume, ino_t vnodeID)
3858 {
3859 	ReadLocker locker(sVnodeLock);
3860 
3861 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3862 	if (vnode == NULL)
3863 		return B_ENTRY_NOT_FOUND;
3864 
3865 	if (vnode->covered_by != NULL || vnode->covers != NULL) {
3866 		// this vnode is in use
3867 		return B_BUSY;
3868 	}
3869 
3870 	vnode->Lock();
3871 
3872 	vnode->SetRemoved(true);
3873 	bool removeUnpublished = false;
3874 
3875 	if (vnode->IsUnpublished()) {
3876 		// prepare the vnode for deletion
3877 		removeUnpublished = true;
3878 		vnode->SetBusy(true);
3879 	}
3880 
3881 	vnode->Unlock();
3882 	locker.Unlock();
3883 
3884 	if (removeUnpublished) {
3885 		// If the vnode hasn't been published yet, we delete it here
3886 		atomic_add(&vnode->ref_count, -1);
3887 		free_vnode(vnode, true);
3888 	}
3889 
3890 	return B_OK;
3891 }
3892 
3893 
3894 extern "C" status_t
3895 unremove_vnode(fs_volume* volume, ino_t vnodeID)
3896 {
3897 	struct vnode* vnode;
3898 
3899 	rw_lock_read_lock(&sVnodeLock);
3900 
3901 	vnode = lookup_vnode(volume->id, vnodeID);
3902 	if (vnode) {
3903 		AutoLocker<Vnode> nodeLocker(vnode);
3904 		vnode->SetRemoved(false);
3905 	}
3906 
3907 	rw_lock_read_unlock(&sVnodeLock);
3908 	return B_OK;
3909 }
3910 
3911 
3912 extern "C" status_t
3913 get_vnode_removed(fs_volume* volume, ino_t vnodeID, bool* _removed)
3914 {
3915 	ReadLocker _(sVnodeLock);
3916 
3917 	if (struct vnode* vnode = lookup_vnode(volume->id, vnodeID)) {
3918 		if (_removed != NULL)
3919 			*_removed = vnode->IsRemoved();
3920 		return B_OK;
3921 	}
3922 
3923 	return B_BAD_VALUE;
3924 }
3925 
3926 
3927 extern "C" fs_volume*
3928 volume_for_vnode(fs_vnode* _vnode)
3929 {
3930 	if (_vnode == NULL)
3931 		return NULL;
3932 
3933 	struct vnode* vnode = static_cast<struct vnode*>(_vnode);
3934 	return vnode->mount->volume;
3935 }
3936 
3937 
3938 extern "C" status_t
3939 check_access_permissions(int accessMode, mode_t mode, gid_t nodeGroupID,
3940 	uid_t nodeUserID)
3941 {
3942 	// get node permissions
3943 	int userPermissions = (mode & S_IRWXU) >> 6;
3944 	int groupPermissions = (mode & S_IRWXG) >> 3;
3945 	int otherPermissions = mode & S_IRWXO;
3946 
3947 	// get the node permissions for this uid/gid
3948 	int permissions = 0;
3949 	uid_t uid = geteuid();
3950 
3951 	if (uid == 0) {
3952 		// user is root
3953 	// root always has read/write permission, but at least one of the
3954 		// X bits must be set for execute permission
3955 		permissions = userPermissions | groupPermissions | otherPermissions
3956 			| S_IROTH | S_IWOTH;
3957 		if (S_ISDIR(mode))
3958 			permissions |= S_IXOTH;
3959 	} else if (uid == nodeUserID) {
3960 		// user is node owner
3961 		permissions = userPermissions;
3962 	} else if (is_user_in_group(nodeGroupID)) {
3963 		// user is in owning group
3964 		permissions = groupPermissions;
3965 	} else {
3966 		// user is one of the others
3967 		permissions = otherPermissions;
3968 	}
3969 
3970 	return (accessMode & ~permissions) == 0 ? B_OK : B_PERMISSION_DENIED;
3971 }
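

// Worked example (illustrative): for a node with mode 0640, owner uid 1000
// and group gid 100, a caller with euid 1000 requesting R_OK | W_OK matches
// the owner bits (rw-), so (accessMode & ~permissions) == 0 and B_OK is
// returned; requesting X_OK yields B_PERMISSION_DENIED, since the owner
// bits lack execute permission.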
3972 
3973 
3974 #if 0
3975 extern "C" status_t
3976 read_pages(int fd, off_t pos, const iovec* vecs, size_t count,
3977 	size_t* _numBytes)
3978 {
3979 	struct file_descriptor* descriptor;
3980 	struct vnode* vnode;
3981 
3982 	descriptor = get_fd_and_vnode(fd, &vnode, true);
3983 	if (descriptor == NULL)
3984 		return B_FILE_ERROR;
3985 
3986 	status_t status = vfs_read_pages(vnode, descriptor->cookie, pos, vecs,
3987 		count, 0, _numBytes);
3988 
3989 	put_fd(descriptor);
3990 	return status;
3991 }
3992 
3993 
3994 extern "C" status_t
3995 write_pages(int fd, off_t pos, const iovec* vecs, size_t count,
3996 	size_t* _numBytes)
3997 {
3998 	struct file_descriptor* descriptor;
3999 	struct vnode* vnode;
4000 
4001 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4002 	if (descriptor == NULL)
4003 		return B_FILE_ERROR;
4004 
4005 	status_t status = vfs_write_pages(vnode, descriptor->cookie, pos, vecs,
4006 		count, 0, _numBytes);
4007 
4008 	put_fd(descriptor);
4009 	return status;
4010 }
4011 #endif
4012 
4013 
4014 extern "C" status_t
4015 read_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4016 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4017 	size_t* _bytes)
4018 {
4019 	struct file_descriptor* descriptor;
4020 	struct vnode* vnode;
4021 
4022 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4023 	if (descriptor == NULL)
4024 		return B_FILE_ERROR;
4025 
4026 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4027 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4028 		false);
4029 
4030 	put_fd(descriptor);
4031 	return status;
4032 }
4033 
4034 
4035 extern "C" status_t
4036 write_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4037 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4038 	size_t* _bytes)
4039 {
4040 	struct file_descriptor* descriptor;
4041 	struct vnode* vnode;
4042 
4043 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4044 	if (descriptor == NULL)
4045 		return B_FILE_ERROR;
4046 
4047 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4048 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4049 		true);
4050 
4051 	put_fd(descriptor);
4052 	return status;
4053 }
4054 
4055 
4056 extern "C" status_t
4057 entry_cache_add(dev_t mountID, ino_t dirID, const char* name, ino_t nodeID)
4058 {
4059 	// lookup mount -- the caller is required to make sure that the mount
4060 	// won't go away
4061 	MutexLocker locker(sMountMutex);
4062 	struct fs_mount* mount = find_mount(mountID);
4063 	if (mount == NULL)
4064 		return B_BAD_VALUE;
4065 	locker.Unlock();
4066 
4067 	return mount->entry_cache.Add(dirID, name, nodeID, false);
4068 }
4069 
4070 
4071 extern "C" status_t
4072 entry_cache_add_missing(dev_t mountID, ino_t dirID, const char* name)
4073 {
4074 	// lookup mount -- the caller is required to make sure that the mount
4075 	// won't go away
4076 	MutexLocker locker(sMountMutex);
4077 	struct fs_mount* mount = find_mount(mountID);
4078 	if (mount == NULL)
4079 		return B_BAD_VALUE;
4080 	locker.Unlock();
4081 
4082 	return mount->entry_cache.Add(dirID, name, -1, true);
4083 }
4084 
4085 
4086 extern "C" status_t
4087 entry_cache_remove(dev_t mountID, ino_t dirID, const char* name)
4088 {
4089 	// lookup mount -- the caller is required to make sure that the mount
4090 	// won't go away
4091 	MutexLocker locker(sMountMutex);
4092 	struct fs_mount* mount = find_mount(mountID);
4093 	if (mount == NULL)
4094 		return B_BAD_VALUE;
4095 	locker.Unlock();
4096 
4097 	return mount->entry_cache.Remove(dirID, name);
4098 }
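

// Illustrative sketch: a file system can use the three functions above to
// shortcut repeated directory lookups. "volume", "dirID", "name" and
// "inode" are hypothetical placeholders.
#if 0
// positive entry: remember that "name" in "dirID" resolves to inode->id
entry_cache_add(volume->id, dirID, name, inode->id);

// negative entry: remember that "name" does not exist in "dirID", so the
// next lookup can fail fast without calling into the file system
entry_cache_add_missing(volume->id, dirID, name);

// invalidate the entry again, e.g. after an unlink or rename
entry_cache_remove(volume->id, dirID, name);
#endif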
4099 
4100 
4101 //	#pragma mark - private VFS API
4102 //	Functions the VFS exports for other parts of the kernel
4103 
4104 
4105 /*! Acquires another reference to the vnode that has to be released
4106 	by calling vfs_put_vnode().
4107 */
4108 void
4109 vfs_acquire_vnode(struct vnode* vnode)
4110 {
4111 	inc_vnode_ref_count(vnode);
4112 }
4113 
4114 
4115 /*! This is currently called from file_cache_create() only.
4116 	It's probably a temporary solution as long as devfs requires that
4117 	fs_read_pages()/fs_write_pages() are called with the standard
4118 	open cookie and not with a device cookie.
4119 	If that's done differently, remove this call; it has no other
4120 	purpose.
4121 */
4122 extern "C" status_t
4123 vfs_get_cookie_from_fd(int fd, void** _cookie)
4124 {
4125 	struct file_descriptor* descriptor;
4126 
4127 	descriptor = get_fd(get_current_io_context(true), fd);
4128 	if (descriptor == NULL)
4129 		return B_FILE_ERROR;
4130 
4131 	*_cookie = descriptor->cookie;
4132 	return B_OK;
4133 }
4134 
4135 
4136 extern "C" status_t
4137 vfs_get_vnode_from_fd(int fd, bool kernel, struct vnode** vnode)
4138 {
4139 	*vnode = get_vnode_from_fd(fd, kernel);
4140 
4141 	if (*vnode == NULL)
4142 		return B_FILE_ERROR;
4143 
4144 	return B_NO_ERROR;
4145 }
4146 
4147 
4148 extern "C" status_t
4149 vfs_get_vnode_from_path(const char* path, bool kernel, struct vnode** _vnode)
4150 {
4151 	TRACE(("vfs_get_vnode_from_path: entry. path = '%s', kernel %d\n",
4152 		path, kernel));
4153 
4154 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
4155 	if (pathBuffer.InitCheck() != B_OK)
4156 		return B_NO_MEMORY;
4157 
4158 	char* buffer = pathBuffer.LockBuffer();
4159 	strlcpy(buffer, path, pathBuffer.BufferSize());
4160 
4161 	struct vnode* vnode;
4162 	status_t status = path_to_vnode(buffer, true, &vnode, NULL, kernel);
4163 	if (status != B_OK)
4164 		return status;
4165 
4166 	*_vnode = vnode;
4167 	return B_OK;
4168 }
4169 
4170 
4171 extern "C" status_t
4172 vfs_get_vnode(dev_t mountID, ino_t vnodeID, bool canWait, struct vnode** _vnode)
4173 {
4174 	struct vnode* vnode = NULL;
4175 
4176 	status_t status = get_vnode(mountID, vnodeID, &vnode, canWait, false);
4177 	if (status != B_OK)
4178 		return status;
4179 
4180 	*_vnode = vnode;
4181 	return B_OK;
4182 }
4183 
4184 
4185 extern "C" status_t
4186 vfs_entry_ref_to_vnode(dev_t mountID, ino_t directoryID,
4187 	const char* name, struct vnode** _vnode)
4188 {
4189 	return entry_ref_to_vnode(mountID, directoryID, name, false, true, _vnode);
4190 }
4191 
4192 
4193 extern "C" void
4194 vfs_vnode_to_node_ref(struct vnode* vnode, dev_t* _mountID, ino_t* _vnodeID)
4195 {
4196 	*_mountID = vnode->device;
4197 	*_vnodeID = vnode->id;
4198 }
4199 
4200 
4201 /*!
4202 	Helper function abstracting the process of "converting" a given
4203 	vnode-pointer to a fs_vnode-pointer.
4204 	Currently only used in bindfs.
4205 */
4206 extern "C" fs_vnode*
4207 vfs_fsnode_for_vnode(struct vnode* vnode)
4208 {
4209 	return vnode;
4210 }
4211 
4212 
4213 /*!
4214 	Calls fs_open() on the given vnode and returns a new
4215 	file descriptor for it
4216 */
4217 int
4218 vfs_open_vnode(struct vnode* vnode, int openMode, bool kernel)
4219 {
4220 	return open_vnode(vnode, openMode, kernel);
4221 }
4222 
4223 
4224 /*!	Looks up a vnode with the given mount and vnode ID.
4225 	Must only be used with "in-use" vnodes as it doesn't grab a reference
4226 	to the node.
4227 	It's currently only used by file_cache_create().
4228 */
4229 extern "C" status_t
4230 vfs_lookup_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode)
4231 {
4232 	rw_lock_read_lock(&sVnodeLock);
4233 	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
4234 	rw_lock_read_unlock(&sVnodeLock);
4235 
4236 	if (vnode == NULL)
4237 		return B_ERROR;
4238 
4239 	*_vnode = vnode;
4240 	return B_OK;
4241 }
4242 
4243 
4244 extern "C" status_t
4245 vfs_get_fs_node_from_path(fs_volume* volume, const char* path,
4246 	bool traverseLeafLink, bool kernel, void** _node)
4247 {
4248 	TRACE(("vfs_get_fs_node_from_path(volume = %p, path = \"%s\", kernel %d)\n",
4249 		volume, path, kernel));
4250 
4251 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
4252 	if (pathBuffer.InitCheck() != B_OK)
4253 		return B_NO_MEMORY;
4254 
4255 	fs_mount* mount;
4256 	status_t status = get_mount(volume->id, &mount);
4257 	if (status != B_OK)
4258 		return status;
4259 
4260 	char* buffer = pathBuffer.LockBuffer();
4261 	strlcpy(buffer, path, pathBuffer.BufferSize());
4262 
4263 	struct vnode* vnode = mount->root_vnode;
4264 
4265 	if (buffer[0] == '/')
4266 		status = path_to_vnode(buffer, traverseLeafLink, &vnode, NULL, kernel);
4267 	else {
4268 		inc_vnode_ref_count(vnode);
4269 			// vnode_path_to_vnode() releases a reference to the starting vnode
4270 		status = vnode_path_to_vnode(vnode, buffer, traverseLeafLink, 0,
4271 			kernel, &vnode, NULL);
4272 	}
4273 
4274 	put_mount(mount);
4275 
4276 	if (status != B_OK)
4277 		return status;
4278 
4279 	if (vnode->device != volume->id) {
4280 		// wrong mount ID - must not gain access on foreign file system nodes
4281 		put_vnode(vnode);
4282 		return B_BAD_VALUE;
4283 	}
4284 
4285 	// Use get_vnode() to resolve the cookie for the right layer.
4286 	status = get_vnode(volume, vnode->id, _node);
4287 	put_vnode(vnode);
4288 
4289 	return status;
4290 }
4291 
4292 
4293 status_t
4294 vfs_read_stat(int fd, const char* path, bool traverseLeafLink,
4295 	struct stat* stat, bool kernel)
4296 {
4297 	status_t status;
4298 
4299 	if (path) {
4300 		// path given: get the stat of the node referred to by (fd, path)
4301 		KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
4302 		if (pathBuffer.InitCheck() != B_OK)
4303 			return B_NO_MEMORY;
4304 
4305 		status = common_path_read_stat(fd, pathBuffer.LockBuffer(),
4306 			traverseLeafLink, stat, kernel);
4307 	} else {
4308 		// no path given: get the FD and use the FD operation
4309 		struct file_descriptor* descriptor
4310 			= get_fd(get_current_io_context(kernel), fd);
4311 		if (descriptor == NULL)
4312 			return B_FILE_ERROR;
4313 
4314 		if (descriptor->ops->fd_read_stat)
4315 			status = descriptor->ops->fd_read_stat(descriptor, stat);
4316 		else
4317 			status = B_UNSUPPORTED;
4318 
4319 		put_fd(descriptor);
4320 	}
4321 
4322 	return status;
4323 }
4324 
4325 
4326 /*!	Finds the full path to the file that contains the module \a moduleName,
4327 	puts it into \a pathBuffer, and returns B_OK for success.
4328 	If \a pathBuffer was too small, it returns \c B_BUFFER_OVERFLOW,
4329 	or \c B_ENTRY_NOT_FOUND if no file could be found.
4330 	\a pathBuffer is clobbered in any case and must not be relied on if this
4331 	function returns unsuccessfully.
4332 	\a basePath and \a pathBuffer must not point to the same space.
4333 */
4334 status_t
4335 vfs_get_module_path(const char* basePath, const char* moduleName,
4336 	char* pathBuffer, size_t bufferSize)
4337 {
4338 	struct vnode* dir;
4339 	struct vnode* file;
4340 	status_t status;
4341 	size_t length;
4342 	char* path;
4343 
4344 	if (bufferSize == 0
4345 		|| strlcpy(pathBuffer, basePath, bufferSize) >= bufferSize)
4346 		return B_BUFFER_OVERFLOW;
4347 
4348 	status = path_to_vnode(pathBuffer, true, &dir, NULL, true);
4349 	if (status != B_OK)
4350 		return status;
4351 
4352 	// the path buffer had been clobbered by the above call
4353 	length = strlcpy(pathBuffer, basePath, bufferSize);
4354 	if (pathBuffer[length - 1] != '/')
4355 		pathBuffer[length++] = '/';
4356 
4357 	path = pathBuffer + length;
4358 	bufferSize -= length;
4359 
4360 	while (moduleName) {
4361 		char* nextPath = strchr(moduleName, '/');
4362 		if (nextPath == NULL)
4363 			length = strlen(moduleName);
4364 		else {
4365 			length = nextPath - moduleName;
4366 			nextPath++;
4367 		}
4368 
4369 		if (length + 1 >= bufferSize) {
4370 			status = B_BUFFER_OVERFLOW;
4371 			goto err;
4372 		}
4373 
4374 		memcpy(path, moduleName, length);
4375 		path[length] = '\0';
4376 		moduleName = nextPath;
4377 
4378 		status = vnode_path_to_vnode(dir, path, true, 0, true, &file, NULL);
4379 		if (status != B_OK) {
4380 			// vnode_path_to_vnode() has already released the reference to dir
4381 			return status;
4382 		}
4383 
4384 		if (S_ISDIR(file->Type())) {
4385 			// go on to the next directory
4386 			path[length] = '/';
4387 			path[length + 1] = '\0';
4388 			path += length + 1;
4389 			bufferSize -= length + 1;
4390 
4391 			dir = file;
4392 		} else if (S_ISREG(file->Type())) {
4393 			// it's a file so it should be what we've searched for
4394 			put_vnode(file);
4395 
4396 			return B_OK;
4397 		} else {
4398 			TRACE(("vfs_get_module_path(): something is strange here: "
4399 				"0x%08" B_PRIx32 "...\n", file->Type()));
4400 			status = B_ERROR;
4401 			dir = file;
4402 			goto err;
4403 		}
4404 	}
4405 
4406 	// if we got here, the moduleName just pointed to a directory, not to
4407 	// a real module - what should we do in this case?
4408 	status = B_ENTRY_NOT_FOUND;
4409 
4410 err:
4411 	put_vnode(dir);
4412 	return status;
4413 }
4414 
4415 
4416 /*!	\brief Normalizes a given path.
4417 
4418 	The path must refer to an existing or non-existing entry in an existing
4419 	directory, that is chopping off the leaf component the remaining path must
4420 	directory; that is, after chopping off the leaf component, the remaining
4420 	path must
4421 
4422 	The returned path will be canonical in that it will be absolute, will not
4423 	contain any "." or ".." components or duplicate occurrences of '/'s,
4424 	and none of the directory components will be symbolic links.
4425 
4426 	Any two paths referring to the same entry will result in the same
4427 	normalized path (well, that is pretty much the definition of `normalized',
4428 	isn't it :-).
4429 
4430 	\param path The path to be normalized.
4431 	\param buffer The buffer into which the normalized path will be written.
4432 		   May be the same one as \a path.
4433 	\param bufferSize The size of \a buffer.
4434 	\param traverseLink If \c true, the function also resolves leaf symlinks.
4435 	\param kernel \c true, if the IO context of the kernel shall be used,
4436 		   otherwise that of the team this thread belongs to. Only relevant,
4437 		   if the path is relative (to get the CWD).
4438 	\return \c B_OK if everything went fine, another error code otherwise.
4439 */
4440 status_t
4441 vfs_normalize_path(const char* path, char* buffer, size_t bufferSize,
4442 	bool traverseLink, bool kernel)
4443 {
4444 	if (!path || !buffer || bufferSize < 1)
4445 		return B_BAD_VALUE;
4446 
4447 	if (path != buffer) {
4448 		if (strlcpy(buffer, path, bufferSize) >= bufferSize)
4449 			return B_BUFFER_OVERFLOW;
4450 	}
4451 
4452 	return normalize_path(buffer, bufferSize, traverseLink, kernel);
4453 }
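

// Illustrative sketch: normalizing in place is explicitly allowed, since
// \a buffer may be the same as \a path. The input path shown is
// hypothetical.
#if 0
char path[B_PATH_NAME_LENGTH] = "/boot/./system/../system/lib";
status_t status = vfs_normalize_path(path, path, sizeof(path), true, true);
	// on success, "path" now reads "/boot/system/lib" (assuming no
	// symlinks are involved along the way)
#endif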
4454 
4455 
4456 /*!	\brief Gets the parent of the passed in node.
4457 
4458 	Gets the parent of the passed in node, and correctly resolves covered
4459 	nodes.
4460 */
4461 extern "C" status_t
4462 vfs_resolve_parent(struct vnode* parent, dev_t* device, ino_t* node)
4463 {
4464 	return resolve_covered_parent(parent, device, node,
4465 		get_current_io_context(true));
4466 }
4467 
4468 
4469 /*!	\brief Creates a special node in the file system.
4470 
4471 	The caller gets a reference to the newly created node (which is passed
4472 	back through \a _createdVnode) and is responsible for releasing it.
4473 
4474 	\param path The path where to create the entry for the node. Can be \c NULL,
4475 		in which case the node is created without an entry in the root FS -- it
4476 		will automatically be deleted when the last reference has been released.
4477 	\param subVnode The definition of the subnode. Can be \c NULL, in which case
4478 		the target file system will just create the node with its standard
4479 		operations. Depending on the type of the node a subnode might be created
4480 		automatically, though.
4481 	\param mode The type and permissions for the node to be created.
4482 	\param flags Flags to be passed to the creating FS.
4483 	\param kernel \c true, if called in the kernel context (relevant only if
4484 		\a path is not \c NULL and not absolute).
4485 	\param _superVnode Pointer to a pre-allocated structure to be filled by the
4486 		file system creating the node, with the private data pointer and
4487 		operations for the super node. Can be \c NULL.
4488 	\param _createdVnode Pointer to pre-allocated storage in which to store the
4489 		pointer to the newly created node.
4490 	\return \c B_OK, if everything went fine, another error code otherwise.
4491 */
4492 status_t
4493 vfs_create_special_node(const char* path, fs_vnode* subVnode, mode_t mode,
4494 	uint32 flags, bool kernel, fs_vnode* _superVnode,
4495 	struct vnode** _createdVnode)
4496 {
4497 	struct vnode* dirNode;
4498 	char _leaf[B_FILE_NAME_LENGTH];
4499 	char* leaf = NULL;
4500 
4501 	if (path) {
4502 		// We've got a path. Get the dir vnode and the leaf name.
4503 		KPath tmpPathBuffer(B_PATH_NAME_LENGTH + 1);
4504 		if (tmpPathBuffer.InitCheck() != B_OK)
4505 			return B_NO_MEMORY;
4506 
4507 		char* tmpPath = tmpPathBuffer.LockBuffer();
4508 		if (strlcpy(tmpPath, path, B_PATH_NAME_LENGTH) >= B_PATH_NAME_LENGTH)
4509 			return B_NAME_TOO_LONG;
4510 
4511 		// get the dir vnode and the leaf name
4512 		leaf = _leaf;
4513 		status_t error = path_to_dir_vnode(tmpPath, &dirNode, leaf, kernel);
4514 		if (error != B_OK)
4515 			return error;
4516 	} else {
4517 		// No path. Create the node in the root FS.
4518 		dirNode = sRoot;
4519 		inc_vnode_ref_count(dirNode);
4520 	}
4521 
4522 	VNodePutter _(dirNode);
4523 
4524 	// check support for creating special nodes
4525 	if (!HAS_FS_CALL(dirNode, create_special_node))
4526 		return B_UNSUPPORTED;
4527 
4528 	// create the node
4529 	fs_vnode superVnode;
4530 	ino_t nodeID;
4531 	status_t status = FS_CALL(dirNode, create_special_node, leaf, subVnode,
4532 		mode, flags, _superVnode != NULL ? _superVnode : &superVnode, &nodeID);
4533 	if (status != B_OK)
4534 		return status;
4535 
4536 	// lookup the node
4537 	rw_lock_read_lock(&sVnodeLock);
4538 	*_createdVnode = lookup_vnode(dirNode->mount->id, nodeID);
4539 	rw_lock_read_unlock(&sVnodeLock);
4540 
4541 	if (*_createdVnode == NULL) {
4542 		panic("vfs_create_special_node(): lookup of node failed");
4543 		return B_ERROR;
4544 	}
4545 
4546 	return B_OK;
4547 }
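

// Illustrative sketch: creating an anonymous FIFO node that lives in the
// root FS and is deleted with its last reference -- roughly what a pipe
// implementation would do (error handling omitted).
#if 0
struct vnode* fifoVnode;
status_t status = vfs_create_special_node(NULL, NULL, S_IFIFO | 0666, 0,
	true, NULL, &fifoVnode);
if (status == B_OK)
	vfs_put_vnode(fifoVnode);
		// dropping the reference right away would delete the node again;
		// a real caller would first hand it to whoever uses the FIFO
#endif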
4548 
4549 
4550 extern "C" void
4551 vfs_put_vnode(struct vnode* vnode)
4552 {
4553 	put_vnode(vnode);
4554 }
4555 
4556 
4557 extern "C" status_t
4558 vfs_get_cwd(dev_t* _mountID, ino_t* _vnodeID)
4559 {
4560 	// Get current working directory from io context
4561 	struct io_context* context = get_current_io_context(false);
4562 	status_t status = B_OK;
4563 
4564 	mutex_lock(&context->io_mutex);
4565 
4566 	if (context->cwd != NULL) {
4567 		*_mountID = context->cwd->device;
4568 		*_vnodeID = context->cwd->id;
4569 	} else
4570 		status = B_ERROR;
4571 
4572 	mutex_unlock(&context->io_mutex);
4573 	return status;
4574 }
4575 
4576 
4577 status_t
4578 vfs_unmount(dev_t mountID, uint32 flags)
4579 {
4580 	return fs_unmount(NULL, mountID, flags, true);
4581 }
4582 
4583 
4584 extern "C" status_t
4585 vfs_disconnect_vnode(dev_t mountID, ino_t vnodeID)
4586 {
4587 	struct vnode* vnode;
4588 
4589 	status_t status = get_vnode(mountID, vnodeID, &vnode, true, true);
4590 	if (status != B_OK)
4591 		return status;
4592 
4593 	disconnect_mount_or_vnode_fds(vnode->mount, vnode);
4594 	put_vnode(vnode);
4595 	return B_OK;
4596 }
4597 
4598 
4599 extern "C" void
4600 vfs_free_unused_vnodes(int32 level)
4601 {
4602 	vnode_low_resource_handler(NULL,
4603 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
4604 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
4605 		level);
4606 }
4607 
4608 
4609 extern "C" bool
4610 vfs_can_page(struct vnode* vnode, void* cookie)
4611 {
4612 	FUNCTION(("vfs_canpage: vnode %p\n", vnode));
4613 
4614 	if (HAS_FS_CALL(vnode, can_page))
4615 		return FS_CALL(vnode, can_page, cookie);
4616 	return false;
4617 }
4618 
4619 
4620 extern "C" status_t
4621 vfs_read_pages(struct vnode* vnode, void* cookie, off_t pos,
4622 	const generic_io_vec* vecs, size_t count, uint32 flags,
4623 	generic_size_t* _numBytes)
4624 {
4625 	FUNCTION(("vfs_read_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4626 		vecs, pos));
4627 
4628 #if VFS_PAGES_IO_TRACING
4629 	generic_size_t bytesRequested = *_numBytes;
4630 #endif
4631 
4632 	IORequest request;
4633 	status_t status = request.Init(pos, vecs, count, *_numBytes, false, flags);
4634 	if (status == B_OK) {
4635 		status = vfs_vnode_io(vnode, cookie, &request);
4636 		if (status == B_OK)
4637 			status = request.Wait();
4638 		*_numBytes = request.TransferredBytes();
4639 	}
4640 
4641 	TPIO(ReadPages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4642 		status, *_numBytes));
4643 
4644 	return status;
4645 }
4646 
4647 
4648 extern "C" status_t
4649 vfs_write_pages(struct vnode* vnode, void* cookie, off_t pos,
4650 	const generic_io_vec* vecs, size_t count, uint32 flags,
4651 	generic_size_t* _numBytes)
4652 {
4653 	FUNCTION(("vfs_write_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4654 		vecs, pos));
4655 
4656 #if VFS_PAGES_IO_TRACING
4657 	generic_size_t bytesRequested = *_numBytes;
4658 #endif
4659 
4660 	IORequest request;
4661 	status_t status = request.Init(pos, vecs, count, *_numBytes, true, flags);
4662 	if (status == B_OK) {
4663 		status = vfs_vnode_io(vnode, cookie, &request);
4664 		if (status == B_OK)
4665 			status = request.Wait();
4666 		*_numBytes = request.TransferredBytes();
4667 	}
4668 
4669 	TPIO(WritePages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4670 		status, *_numBytes));
4671 
4672 	return status;
4673 }
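

// Illustrative sketch: writing one page to a vnode at offset 0 through the
// request-based I/O path above. "vnode", "cookie" and "physicalAddress"
// are hypothetical placeholders; the B_PHYSICAL_IO_REQUEST flag (assumed
// here) marks the vecs as physical addresses.
#if 0
generic_io_vec vec;
vec.base = physicalAddress;
vec.length = B_PAGE_SIZE;

generic_size_t bytes = B_PAGE_SIZE;
status_t status = vfs_write_pages(vnode, cookie, 0, &vec, 1,
	B_PHYSICAL_IO_REQUEST, &bytes);
	// on return, "bytes" holds the number of bytes actually transferred
#endif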
4674 
4675 
4676 /*!	Gets the vnode's VMCache object. If it doesn't have one yet, it will be
4677 	created if \a allocate is \c true.
4678 	On success, the function also grabs a reference to the cache
4679 	it returns.
4680 */
4681 extern "C" status_t
4682 vfs_get_vnode_cache(struct vnode* vnode, VMCache** _cache, bool allocate)
4683 {
4684 	if (vnode->cache != NULL) {
4685 		vnode->cache->AcquireRef();
4686 		*_cache = vnode->cache;
4687 		return B_OK;
4688 	}
4689 
4690 	rw_lock_read_lock(&sVnodeLock);
4691 	vnode->Lock();
4692 
4693 	status_t status = B_OK;
4694 
4695 	// The cache could have been created in the meantime
4696 	if (vnode->cache == NULL) {
4697 		if (allocate) {
4698 			// TODO: actually the vnode needs to be busy already here, or
4699 			//	else this won't work...
4700 			bool wasBusy = vnode->IsBusy();
4701 			vnode->SetBusy(true);
4702 
4703 			vnode->Unlock();
4704 			rw_lock_read_unlock(&sVnodeLock);
4705 
4706 			status = vm_create_vnode_cache(vnode, &vnode->cache);
4707 
4708 			rw_lock_read_lock(&sVnodeLock);
4709 			vnode->Lock();
4710 			vnode->SetBusy(wasBusy);
4711 		} else
4712 			status = B_BAD_VALUE;
4713 	}
4714 
4715 	vnode->Unlock();
4716 	rw_lock_read_unlock(&sVnodeLock);
4717 
4718 	if (status == B_OK) {
4719 		vnode->cache->AcquireRef();
4720 		*_cache = vnode->cache;
4721 	}
4722 
4723 	return status;
4724 }
4725 
4726 
4727 status_t
4728 vfs_get_file_map(struct vnode* vnode, off_t offset, size_t size,
4729 	file_io_vec* vecs, size_t* _count)
4730 {
4731 	FUNCTION(("vfs_get_file_map: vnode %p, vecs %p, offset %" B_PRIdOFF
4732 		", size = %" B_PRIuSIZE "\n", vnode, vecs, offset, size));
4733 
4734 	return FS_CALL(vnode, get_file_map, offset, size, vecs, _count);
4735 }
4736 
4737 
4738 status_t
4739 vfs_stat_vnode(struct vnode* vnode, struct stat* stat)
4740 {
4741 	status_t status = FS_CALL(vnode, read_stat, stat);
4742 
4743 	// fill in the st_dev and st_ino fields
4744 	if (status == B_OK) {
4745 		stat->st_dev = vnode->device;
4746 		stat->st_ino = vnode->id;
4747 		// the rdev field must stay unset for non-special files
4748 		if (!S_ISBLK(stat->st_mode) && !S_ISCHR(stat->st_mode))
4749 			stat->st_rdev = -1;
4750 	}
4751 
4752 	return status;
4753 }
4754 
4755 
4756 status_t
4757 vfs_stat_node_ref(dev_t device, ino_t inode, struct stat* stat)
4758 {
4759 	struct vnode* vnode;
4760 	status_t status = get_vnode(device, inode, &vnode, true, false);
4761 	if (status != B_OK)
4762 		return status;
4763 
4764 	status = vfs_stat_vnode(vnode, stat);
4765 
4766 	put_vnode(vnode);
4767 	return status;
4768 }
4769 
4770 
4771 status_t
4772 vfs_get_vnode_name(struct vnode* vnode, char* name, size_t nameSize)
4773 {
4774 	return get_vnode_name(vnode, NULL, name, nameSize, true);
4775 }
4776 
4777 
4778 status_t
4779 vfs_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
4780 	bool kernel, char* path, size_t pathLength)
4781 {
4782 	struct vnode* vnode;
4783 	status_t status;
4784 
4785 	// filter invalid leaf names
4786 	if (leaf != NULL && (leaf[0] == '\0' || strchr(leaf, '/')))
4787 		return B_BAD_VALUE;
4788 
4789 	// get the vnode matching the dir's node_ref
4790 	if (leaf && (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0)) {
4791 		// special cases "." and "..": we can directly get the vnode of the
4792 		// referenced directory
4793 		status = entry_ref_to_vnode(device, inode, leaf, false, kernel, &vnode);
4794 		leaf = NULL;
4795 	} else
4796 		status = get_vnode(device, inode, &vnode, true, false);
4797 	if (status != B_OK)
4798 		return status;
4799 
4800 	// get the directory path
4801 	status = dir_vnode_to_path(vnode, path, pathLength, kernel);
4802 	put_vnode(vnode);
4803 		// we don't need the vnode anymore
4804 	if (status != B_OK)
4805 		return status;
4806 
4807 	// append the leaf name
4808 	if (leaf) {
4809 		// insert a directory separator if this is not the file system root
4810 		if ((strcmp(path, "/") && strlcat(path, "/", pathLength)
4811 				>= pathLength)
4812 			|| strlcat(path, leaf, pathLength) >= pathLength) {
4813 			return B_NAME_TOO_LONG;
4814 		}
4815 	}
4816 
4817 	return B_OK;
4818 }
4819 
4820 
4821 /*!	If the given descriptor locked its vnode, that lock will be released. */
4822 void
4823 vfs_unlock_vnode_if_locked(struct file_descriptor* descriptor)
4824 {
4825 	struct vnode* vnode = fd_vnode(descriptor);
4826 
4827 	if (vnode != NULL && vnode->mandatory_locked_by == descriptor)
4828 		vnode->mandatory_locked_by = NULL;
4829 }
4830 
4831 
4832 /*!	Closes all file descriptors of the specified I/O context that
4833 	have the O_CLOEXEC flag set.
4834 */
4835 void
4836 vfs_exec_io_context(io_context* context)
4837 {
4838 	uint32 i;
4839 
4840 	for (i = 0; i < context->table_size; i++) {
4841 		mutex_lock(&context->io_mutex);
4842 
4843 		struct file_descriptor* descriptor = context->fds[i];
4844 		bool remove = false;
4845 
4846 		if (descriptor != NULL && fd_close_on_exec(context, i)) {
4847 			context->fds[i] = NULL;
4848 			context->num_used_fds--;
4849 
4850 			remove = true;
4851 		}
4852 
4853 		mutex_unlock(&context->io_mutex);
4854 
4855 		if (remove) {
4856 			close_fd(descriptor);
4857 			put_fd(descriptor);
4858 		}
4859 	}
4860 }
4861 
4862 
4863 /*! Sets up a new io_context structure, and inherits the properties
4864 	of the parent io_context if one is given.
4865 */
4866 io_context*
4867 vfs_new_io_context(io_context* parentContext, bool purgeCloseOnExec)
4868 {
4869 	io_context* context = (io_context*)malloc(sizeof(io_context));
4870 	if (context == NULL)
4871 		return NULL;
4872 
4873 	TIOC(NewIOContext(context, parentContext));
4874 
4875 	memset(context, 0, sizeof(io_context));
4876 	context->ref_count = 1;
4877 
4878 	MutexLocker parentLocker;
4879 
4880 	size_t tableSize;
4881 	if (parentContext != NULL) {
4882 		parentLocker.SetTo(parentContext->io_mutex, false);
4883 		tableSize = parentContext->table_size;
4884 	} else
4885 		tableSize = DEFAULT_FD_TABLE_SIZE;
4886 
4887 	// allocate space for FDs and their close-on-exec flag
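	// The single allocation is laid out as
	//   [file_descriptor* x tableSize][select_info* x tableSize][bitmap],
	// where the trailing bitmap holds one close-on-exec bit per FD,
	// rounded up to whole bytes -- hence the (tableSize + 7) / 8.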
4888 	context->fds = (file_descriptor**)malloc(
4889 		sizeof(struct file_descriptor*) * tableSize
4890 		+ sizeof(struct select_sync*) * tableSize
4891 		+ (tableSize + 7) / 8);
4892 	if (context->fds == NULL) {
4893 		free(context);
4894 		return NULL;
4895 	}
4896 
4897 	context->select_infos = (select_info**)(context->fds + tableSize);
4898 	context->fds_close_on_exec = (uint8*)(context->select_infos + tableSize);
4899 
4900 	memset(context->fds, 0, sizeof(struct file_descriptor*) * tableSize
4901 		+ sizeof(struct select_sync*) * tableSize
4902 		+ (tableSize + 7) / 8);
4903 
4904 	mutex_init(&context->io_mutex, "I/O context");
4905 
4906 	// Copy all parent file descriptors
4907 
4908 	if (parentContext != NULL) {
4909 		size_t i;
4910 
4911 		mutex_lock(&sIOContextRootLock);
4912 		context->root = parentContext->root;
4913 		if (context->root)
4914 			inc_vnode_ref_count(context->root);
4915 		mutex_unlock(&sIOContextRootLock);
4916 
4917 		context->cwd = parentContext->cwd;
4918 		if (context->cwd)
4919 			inc_vnode_ref_count(context->cwd);
4920 
4921 		if (parentContext->inherit_fds) {
4922 			for (i = 0; i < tableSize; i++) {
4923 				struct file_descriptor* descriptor = parentContext->fds[i];
4924 
4925 				if (descriptor != NULL
4926 					&& (descriptor->open_mode & O_DISCONNECTED) == 0) {
4927 					bool closeOnExec = fd_close_on_exec(parentContext, i);
4928 					if (closeOnExec && purgeCloseOnExec)
4929 						continue;
4930 
4931 					TFD(InheritFD(context, i, descriptor, parentContext));
4932 
4933 					context->fds[i] = descriptor;
4934 					context->num_used_fds++;
4935 					atomic_add(&descriptor->ref_count, 1);
4936 					atomic_add(&descriptor->open_count, 1);
4937 
4938 					if (closeOnExec)
4939 						fd_set_close_on_exec(context, i, true);
4940 				}
4941 			}
4942 		}
4943 
4944 		parentLocker.Unlock();
4945 	} else {
4946 		context->root = sRoot;
4947 		context->cwd = sRoot;
4948 
4949 		if (context->root)
4950 			inc_vnode_ref_count(context->root);
4951 
4952 		if (context->cwd)
4953 			inc_vnode_ref_count(context->cwd);
4954 	}
4955 
4956 	context->table_size = tableSize;
4957 	context->inherit_fds = parentContext != NULL;
4958 
4959 	list_init(&context->node_monitors);
4960 	context->max_monitors = DEFAULT_NODE_MONITORS;
4961 
4962 	return context;
4963 }
4964 
4965 
4966 void
4967 vfs_get_io_context(io_context* context)
4968 {
4969 	atomic_add(&context->ref_count, 1);
4970 }
4971 
4972 
4973 void
4974 vfs_put_io_context(io_context* context)
4975 {
4976 	if (atomic_add(&context->ref_count, -1) == 1)
4977 		free_io_context(context);
4978 }
4979 
4980 
4981 status_t
4982 vfs_resize_fd_table(struct io_context* context, uint32 newSize)
4983 {
4984 	if (newSize == 0 || newSize > MAX_FD_TABLE_SIZE)
4985 		return B_BAD_VALUE;
4986 
4987 	TIOC(ResizeIOContext(context, newSize));
4988 
4989 	MutexLocker _(context->io_mutex);
4990 
4991 	uint32 oldSize = context->table_size;
4992 	int oldCloseOnExitBitmapSize = (oldSize + 7) / 8;
4993 	int newCloseOnExitBitmapSize = (newSize + 7) / 8;
4994 
4995 	// If the tables shrink, make sure none of the fds being dropped are in use.
4996 	if (newSize < oldSize) {
4997 		for (uint32 i = oldSize; i-- > newSize;) {
4998 			if (context->fds[i])
4999 				return B_BUSY;
5000 		}
5001 	}
5002 
5003 	// store pointers to the old tables
5004 	file_descriptor** oldFDs = context->fds;
5005 	select_info** oldSelectInfos = context->select_infos;
5006 	uint8* oldCloseOnExecTable = context->fds_close_on_exec;
5007 
5008 	// allocate new tables
5009 	file_descriptor** newFDs = (file_descriptor**)malloc(
5010 		sizeof(struct file_descriptor*) * newSize
5011 		+ sizeof(struct select_sync*) * newSize
5012 		+ newCloseOnExitBitmapSize);
5013 	if (newFDs == NULL)
5014 		return B_NO_MEMORY;
5015 
5016 	context->fds = newFDs;
5017 	context->select_infos = (select_info**)(context->fds + newSize);
5018 	context->fds_close_on_exec = (uint8*)(context->select_infos + newSize);
5019 	context->table_size = newSize;
5020 
5021 	// copy entries from old tables
5022 	uint32 toCopy = min_c(oldSize, newSize);
5023 
5024 	memcpy(context->fds, oldFDs, sizeof(void*) * toCopy);
5025 	memcpy(context->select_infos, oldSelectInfos, sizeof(void*) * toCopy);
5026 	memcpy(context->fds_close_on_exec, oldCloseOnExecTable,
5027 		min_c(oldCloseOnExitBitmapSize, newCloseOnExitBitmapSize));
5028 
5029 	// clear additional entries, if the tables grow
5030 	if (newSize > oldSize) {
5031 		memset(context->fds + oldSize, 0, sizeof(void*) * (newSize - oldSize));
5032 		memset(context->select_infos + oldSize, 0,
5033 			sizeof(void*) * (newSize - oldSize));
5034 		memset(context->fds_close_on_exec + oldCloseOnExitBitmapSize, 0,
5035 			newCloseOnExitBitmapSize - oldCloseOnExitBitmapSize);
5036 	}
5037 
5038 	free(oldFDs);
5039 
5040 	return B_OK;
5041 }
5042 
5043 
5044 /*!	\brief Resolves a vnode to the vnode it is covered by, if any.
5045 
5046 	Given an arbitrary vnode (identified by mount and node ID), the function
5047 	checks whether the vnode is covered by another vnode. If it is, the
5048 	function returns the mount and node ID of the covering vnode. Otherwise
5049 	it simply returns the supplied mount and node ID.
5050 
5051 	In case of error (e.g. the supplied node could not be found) the variables
5052 	for storing the resolved mount and node ID remain untouched and an error
5053 	code is returned.
5054 
5055 	\param mountID The mount ID of the vnode in question.
5056 	\param nodeID The node ID of the vnode in question.
5057 	\param resolvedMountID Pointer to storage for the resolved mount ID.
5058 	\param resolvedNodeID Pointer to storage for the resolved node ID.
5059 	\return
5060 	- \c B_OK, if everything went fine,
5061 	- another error code, if something went wrong.
5062 */
5063 status_t
5064 vfs_resolve_vnode_to_covering_vnode(dev_t mountID, ino_t nodeID,
5065 	dev_t* resolvedMountID, ino_t* resolvedNodeID)
5066 {
5067 	// get the node
5068 	struct vnode* node;
5069 	status_t error = get_vnode(mountID, nodeID, &node, true, false);
5070 	if (error != B_OK)
5071 		return error;
5072 
5073 	// resolve the node
5074 	if (Vnode* coveringNode = get_covering_vnode(node)) {
5075 		put_vnode(node);
5076 		node = coveringNode;
5077 	}
5078 
5079 	// set the return values
5080 	*resolvedMountID = node->device;
5081 	*resolvedNodeID = node->id;
5082 
5083 	put_vnode(node);
5084 
5085 	return B_OK;
5086 }
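

// Example (illustrative): if the directory (dev 1, node 42) serves as the
// mount point of a volume mounted as dev 5, resolving (1, 42) yields the
// root node of that volume, while an uncovered vnode simply resolves to
// itself.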
5087 
5088 
5089 status_t
5090 vfs_get_mount_point(dev_t mountID, dev_t* _mountPointMountID,
5091 	ino_t* _mountPointNodeID)
5092 {
5093 	ReadLocker nodeLocker(sVnodeLock);
5094 	MutexLocker mountLocker(sMountMutex);
5095 
5096 	struct fs_mount* mount = find_mount(mountID);
5097 	if (mount == NULL)
5098 		return B_BAD_VALUE;
5099 
5100 	Vnode* mountPoint = mount->covers_vnode;
5101 
5102 	*_mountPointMountID = mountPoint->device;
5103 	*_mountPointNodeID = mountPoint->id;
5104 
5105 	return B_OK;
5106 }
5107 
5108 
5109 status_t
5110 vfs_bind_mount_directory(dev_t mountID, ino_t nodeID, dev_t coveredMountID,
5111 	ino_t coveredNodeID)
5112 {
5113 	// get the vnodes
5114 	Vnode* vnode;
5115 	status_t error = get_vnode(mountID, nodeID, &vnode, true, false);
5116 	if (error != B_OK)
5117 		return B_BAD_VALUE;
5118 	VNodePutter vnodePutter(vnode);
5119 
5120 	Vnode* coveredVnode;
5121 	error = get_vnode(coveredMountID, coveredNodeID, &coveredVnode, true,
5122 		false);
5123 	if (error != B_OK)
5124 		return B_BAD_VALUE;
5125 	VNodePutter coveredVnodePutter(coveredVnode);
5126 
5127 	// establish the covered/covering links
5128 	WriteLocker locker(sVnodeLock);
5129 
5130 	if (vnode->covers != NULL || coveredVnode->covered_by != NULL
5131 		|| vnode->mount->unmounting || coveredVnode->mount->unmounting) {
5132 		return B_BUSY;
5133 	}
5134 
5135 	vnode->covers = coveredVnode;
5136 	vnode->SetCovering(true);
5137 
5138 	coveredVnode->covered_by = vnode;
5139 	coveredVnode->SetCovered(true);
5140 
5141 	// the vnodes do now reference each other
5142 	inc_vnode_ref_count(vnode);
5143 	inc_vnode_ref_count(coveredVnode);
5144 
5145 	return B_OK;
5146 }
5147 
5148 
5149 int
5150 vfs_getrlimit(int resource, struct rlimit* rlp)
5151 {
5152 	if (!rlp)
5153 		return B_BAD_ADDRESS;
5154 
5155 	switch (resource) {
5156 		case RLIMIT_NOFILE:
5157 		{
5158 			struct io_context* context = get_current_io_context(false);
5159 			MutexLocker _(context->io_mutex);
5160 
5161 			rlp->rlim_cur = context->table_size;
5162 			rlp->rlim_max = MAX_FD_TABLE_SIZE;
5163 			return 0;
5164 		}
5165 
5166 		case RLIMIT_NOVMON:
5167 		{
5168 			struct io_context* context = get_current_io_context(false);
5169 			MutexLocker _(context->io_mutex);
5170 
5171 			rlp->rlim_cur = context->max_monitors;
5172 			rlp->rlim_max = MAX_NODE_MONITORS;
5173 			return 0;
5174 		}
5175 
5176 		default:
5177 			return B_BAD_VALUE;
5178 	}
5179 }
5180 
5181 
5182 int
5183 vfs_setrlimit(int resource, const struct rlimit* rlp)
5184 {
5185 	if (!rlp)
5186 		return B_BAD_ADDRESS;
5187 
5188 	switch (resource) {
5189 		case RLIMIT_NOFILE:
5190 			/* TODO: check getuid() */
5191 			if (rlp->rlim_max != RLIM_SAVED_MAX
5192 				&& rlp->rlim_max != MAX_FD_TABLE_SIZE)
5193 				return B_NOT_ALLOWED;
5194 
5195 			return vfs_resize_fd_table(get_current_io_context(false),
5196 				rlp->rlim_cur);
5197 
5198 		case RLIMIT_NOVMON:
5199 			/* TODO: check getuid() */
5200 			if (rlp->rlim_max != RLIM_SAVED_MAX
5201 				&& rlp->rlim_max != MAX_NODE_MONITORS)
5202 				return B_NOT_ALLOWED;
5203 
5204 			return resize_monitor_table(get_current_io_context(false),
5205 				rlp->rlim_cur);
5206 
5207 		default:
5208 			return B_BAD_VALUE;
5209 	}
5210 }
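

// Illustrative sketch: how userland reaches the two functions above, e.g.
// to grow its own FD table to the permitted maximum.
#if 0
struct rlimit rl;
if (getrlimit(RLIMIT_NOFILE, &rl) == 0) {
	rl.rlim_cur = rl.rlim_max;	// request the maximum table size
	setrlimit(RLIMIT_NOFILE, &rl);
}
#endif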
5211 
5212 
5213 status_t
5214 vfs_init(kernel_args* args)
5215 {
5216 	vnode::StaticInit();
5217 
5218 	sVnodeTable = new(std::nothrow) VnodeTable();
5219 	if (sVnodeTable == NULL || sVnodeTable->Init(VNODE_HASH_TABLE_SIZE) != B_OK)
5220 		panic("vfs_init: error creating vnode hash table\n");
5221 
5222 	struct vnode dummy_vnode;
5223 	list_init_etc(&sUnusedVnodeList, offset_of_member(dummy_vnode, unused_link));
5224 
5225 	struct fs_mount dummyMount;
5226 	sMountsTable = new(std::nothrow) MountTable();
5227 	if (sMountsTable == NULL
5228 			|| sMountsTable->Init(MOUNTS_HASH_TABLE_SIZE) != B_OK)
5229 		panic("vfs_init: error creating mounts hash table\n");
5230 
5231 	node_monitor_init();
5232 
5233 	sRoot = NULL;
5234 
5235 	recursive_lock_init(&sMountOpLock, "vfs_mount_op_lock");
5236 
5237 	if (block_cache_init() != B_OK)
5238 		return B_ERROR;
5239 
5240 #ifdef ADD_DEBUGGER_COMMANDS
5241 	// add some debugger commands
5242 	add_debugger_command_etc("vnode", &dump_vnode,
5243 		"Print info about the specified vnode",
5244 		"[ \"-p\" ] ( <vnode> | <devID> <nodeID> )\n"
5245 		"Prints information about the vnode specified by address <vnode> or\n"
5246 		"<devID>, <vnodeID> pair. If \"-p\" is given, a path of the vnode is\n"
5247 		"constructed and printed. It might not be possible to construct a\n"
5248 		"complete path, though.\n",
5249 		0);
5250 	add_debugger_command("vnodes", &dump_vnodes,
5251 		"list all vnodes (from the specified device)");
5252 	add_debugger_command("vnode_caches", &dump_vnode_caches,
5253 		"list all vnode caches");
5254 	add_debugger_command("mount", &dump_mount,
5255 		"info about the specified fs_mount");
5256 	add_debugger_command("mounts", &dump_mounts, "list all fs_mounts");
5257 	add_debugger_command("io_context", &dump_io_context,
5258 		"info about the I/O context");
5259 	add_debugger_command("vnode_usage", &dump_vnode_usage,
5260 		"info about vnode usage");
5261 #endif
5262 
5263 	register_low_resource_handler(&vnode_low_resource_handler, NULL,
5264 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
5265 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
5266 		0);
5267 
5268 	fifo_init();
5269 	file_map_init();
5270 
5271 	return file_cache_init();
5272 }
5273 
5274 
5275 //	#pragma mark - fd_ops implementations
5276 
5277 
5278 /*!
5279 	Calls fs_open() on the given vnode and returns a new
5280 	file descriptor for it
5281 */
5282 static int
5283 open_vnode(struct vnode* vnode, int openMode, bool kernel)
5284 {
5285 	void* cookie;
5286 	status_t status = FS_CALL(vnode, open, openMode, &cookie);
5287 	if (status != B_OK)
5288 		return status;
5289 
5290 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5291 	if (fd < 0) {
5292 		FS_CALL(vnode, close, cookie);
5293 		FS_CALL(vnode, free_cookie, cookie);
5294 	}
5295 	return fd;
5296 }
5297 
5298 
5299 /*!
5300 	Calls fs create() in the given directory -- or, if the entry already
5301 	exists, opens the node -- and returns a new file descriptor for it
5302 */
5303 static int
5304 create_vnode(struct vnode* directory, const char* name, int openMode,
5305 	int perms, bool kernel)
5306 {
5307 	bool traverse = ((openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0);
5308 	status_t status = B_ERROR;
5309 	struct vnode* vnode;
5310 	void* cookie;
5311 	ino_t newID;
5312 
5313 	// This is somewhat tricky: If the entry already exists, the FS responsible
5314 	// for the directory might not necessarily also be the one responsible for
5315 	// the node the entry refers to (e.g. in case of mount points or FIFOs). So
5316 	// we can actually never call the create() hook without O_EXCL. Instead we
5317 	// try to look the entry up first. If it already exists, we just open the
5318 	// node (unless O_EXCL), otherwise we call create() with O_EXCL. This
5319 	// introduces a race condition, since someone else might have created the
5320 	// entry in the meantime. In that case we rely on the respective FS to
5321 	// return the correct error code, and we retry (up to 3 times).
5322 
5323 	for (int i = 0; i < 3 && status != B_OK; i++) {
5324 		// look the node up
5325 		status = lookup_dir_entry(directory, name, &vnode);
5326 		if (status == B_OK) {
5327 			VNodePutter putter(vnode);
5328 
5329 			if ((openMode & O_EXCL) != 0)
5330 				return B_FILE_EXISTS;
5331 
5332 			// If the node is a symlink, we have to follow it, unless
5333 			// O_NOTRAVERSE is set.
5334 			if (S_ISLNK(vnode->Type()) && traverse) {
5335 				putter.Put();
5336 				char clonedName[B_FILE_NAME_LENGTH + 1];
5337 				if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH)
5338 						>= B_FILE_NAME_LENGTH) {
5339 					return B_NAME_TOO_LONG;
5340 				}
5341 
5342 				inc_vnode_ref_count(directory);
5343 				status = vnode_path_to_vnode(directory, clonedName, true, 0,
5344 					kernel, &vnode, NULL);
5345 				if (status != B_OK)
5346 					return status;
5347 
5348 				putter.SetTo(vnode);
5349 			}
5350 
5351 			if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5352 				return B_LINK_LIMIT;
5353 
5354 			int fd = open_vnode(vnode, openMode & ~O_CREAT, kernel);
5355 			// on success keep the vnode reference for the FD
5356 			if (fd >= 0)
5357 				putter.Detach();
5358 
5359 			return fd;
5360 		}
5361 
5362 		// it doesn't exist yet -- try to create it
5363 
5364 		if (!HAS_FS_CALL(directory, create))
5365 			return B_READ_ONLY_DEVICE;
5366 
5367 		status = FS_CALL(directory, create, name, openMode | O_EXCL, perms,
5368 			&cookie, &newID);
5369 		if (status != B_OK
5370 			&& ((openMode & O_EXCL) != 0 || status != B_FILE_EXISTS)) {
5371 			return status;
5372 		}
5373 	}
5374 
5375 	if (status != B_OK)
5376 		return status;
5377 
5378 	// the node has been created successfully
5379 
5380 	rw_lock_read_lock(&sVnodeLock);
5381 	vnode = lookup_vnode(directory->device, newID);
5382 	rw_lock_read_unlock(&sVnodeLock);
5383 
5384 	if (vnode == NULL) {
5385 		panic("vfs: fs_create() returned success but there is no vnode, "
5386 			"mount ID %" B_PRIdDEV "!\n", directory->device);
5387 		return B_BAD_VALUE;
5388 	}
5389 
5390 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5391 	if (fd >= 0)
5392 		return fd;
5393 
5394 	status = fd;
5395 
5396 	// something went wrong, clean up
5397 
5398 	FS_CALL(vnode, close, cookie);
5399 	FS_CALL(vnode, free_cookie, cookie);
5400 	put_vnode(vnode);
5401 
5402 	FS_CALL(directory, unlink, name);
5403 
5404 	return status;
5405 }
5406 
5407 
5408 /*! Calls fs open_dir() on the given vnode and returns a new
5409 	file descriptor for it
5410 */
5411 static int
5412 open_dir_vnode(struct vnode* vnode, bool kernel)
5413 {
5414 	void* cookie;
5415 	status_t status = FS_CALL(vnode, open_dir, &cookie);
5416 	if (status != B_OK)
5417 		return status;
5418 
5419 	// directory is opened, create a fd
5420 	status = get_new_fd(FDTYPE_DIR, NULL, vnode, cookie, O_CLOEXEC, kernel);
5421 	if (status >= 0)
5422 		return status;
5423 
5424 	FS_CALL(vnode, close_dir, cookie);
5425 	FS_CALL(vnode, free_dir_cookie, cookie);
5426 
5427 	return status;
5428 }
5429 
5430 
5431 /*! Calls fs open_attr_dir() on the given vnode and returns a new
5432 	file descriptor for it.
5433 	Used by attr_dir_open() and attr_dir_open_fd().
5434 */
5435 static int
5436 open_attr_dir_vnode(struct vnode* vnode, bool kernel)
5437 {
5438 	if (!HAS_FS_CALL(vnode, open_attr_dir))
5439 		return B_UNSUPPORTED;
5440 
5441 	void* cookie;
5442 	status_t status = FS_CALL(vnode, open_attr_dir, &cookie);
5443 	if (status != B_OK)
5444 		return status;
5445 
5446 	// directory is opened, create a fd
5447 	status = get_new_fd(FDTYPE_ATTR_DIR, NULL, vnode, cookie, O_CLOEXEC,
5448 		kernel);
5449 	if (status >= 0)
5450 		return status;
5451 
5452 	FS_CALL(vnode, close_attr_dir, cookie);
5453 	FS_CALL(vnode, free_attr_dir_cookie, cookie);
5454 
5455 	return status;
5456 }
5457 
5458 
5459 static int
5460 file_create_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5461 	int openMode, int perms, bool kernel)
5462 {
5463 	FUNCTION(("file_create_entry_ref: name = '%s', omode %x, perms %d, "
5464 		"kernel %d\n", name, openMode, perms, kernel));
5465 
5466 	// get directory to put the new file in
5467 	struct vnode* directory;
5468 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
5469 	if (status != B_OK)
5470 		return status;
5471 
5472 	status = create_vnode(directory, name, openMode, perms, kernel);
5473 	put_vnode(directory);
5474 
5475 	return status;
5476 }
5477 
5478 
5479 static int
5480 file_create(int fd, char* path, int openMode, int perms, bool kernel)
5481 {
5482 	FUNCTION(("file_create: path '%s', omode %x, perms %d, kernel %d\n", path,
5483 		openMode, perms, kernel));
5484 
5485 	// get directory to put the new file in
5486 	char name[B_FILE_NAME_LENGTH];
5487 	struct vnode* directory;
5488 	status_t status = fd_and_path_to_dir_vnode(fd, path, &directory, name,
5489 		kernel);
5490 	if (status < 0)
5491 		return status;
5492 
5493 	status = create_vnode(directory, name, openMode, perms, kernel);
5494 
5495 	put_vnode(directory);
5496 	return status;
5497 }
5498 
5499 
5500 static int
5501 file_open_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5502 	int openMode, bool kernel)
5503 {
5504 	if (name == NULL || *name == '\0')
5505 		return B_BAD_VALUE;
5506 
5507 	FUNCTION(("file_open_entry_ref(ref = (%" B_PRId32 ", %" B_PRId64 ", %s), "
5508 		"openMode = %d)\n", mountID, directoryID, name, openMode));
5509 
5510 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5511 
5512 	// get the vnode matching the entry_ref
5513 	struct vnode* vnode;
5514 	status_t status = entry_ref_to_vnode(mountID, directoryID, name, traverse,
5515 		kernel, &vnode);
5516 	if (status != B_OK)
5517 		return status;
5518 
5519 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5520 		put_vnode(vnode);
5521 		return B_LINK_LIMIT;
5522 	}
5523 
5524 	int newFD = open_vnode(vnode, openMode, kernel);
5525 	if (newFD >= 0) {
5526 		// The vnode reference has been transferred to the FD
5527 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache, mountID,
5528 			directoryID, vnode->id, name);
5529 	} else
5530 		put_vnode(vnode);
5531 
5532 	return newFD;
5533 }
5534 
5535 
5536 static int
5537 file_open(int fd, char* path, int openMode, bool kernel)
5538 {
5539 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5540 
5541 	FUNCTION(("file_open: fd: %d, entry path = '%s', omode %d, kernel %d\n",
5542 		fd, path, openMode, kernel));
5543 
5544 	// get the vnode matching the vnode + path combination
5545 	struct vnode* vnode;
5546 	ino_t parentID;
5547 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode,
5548 		&parentID, kernel);
5549 	if (status != B_OK)
5550 		return status;
5551 
5552 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5553 		put_vnode(vnode);
5554 		return B_LINK_LIMIT;
5555 	}
5556 
5557 	// open the vnode
5558 	int newFD = open_vnode(vnode, openMode, kernel);
5559 	if (newFD >= 0) {
5560 		// The vnode reference has been transferred to the FD
5561 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache,
5562 			vnode->device, parentID, vnode->id, NULL);
5563 	} else
5564 		put_vnode(vnode);
5565 
5566 	return newFD;
5567 }
5568 
5569 
5570 static status_t
5571 file_close(struct file_descriptor* descriptor)
5572 {
5573 	struct vnode* vnode = descriptor->u.vnode;
5574 	status_t status = B_OK;
5575 
5576 	FUNCTION(("file_close(descriptor = %p)\n", descriptor));
5577 
5578 	cache_node_closed(vnode, FDTYPE_FILE, vnode->cache, vnode->device,
5579 		vnode->id);
5580 	if (HAS_FS_CALL(vnode, close)) {
5581 		status = FS_CALL(vnode, close, descriptor->cookie);
5582 	}
5583 
5584 	if (status == B_OK) {
5585 		// remove all outstanding locks for this team
5586 		if (HAS_FS_CALL(vnode, release_lock))
5587 			status = FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
5588 		else
5589 			status = release_advisory_lock(vnode, NULL);
5590 	}
5591 	return status;
5592 }
5593 
5594 
5595 static void
5596 file_free_fd(struct file_descriptor* descriptor)
5597 {
5598 	struct vnode* vnode = descriptor->u.vnode;
5599 
5600 	if (vnode != NULL) {
5601 		FS_CALL(vnode, free_cookie, descriptor->cookie);
5602 		put_vnode(vnode);
5603 	}
5604 }
5605 
5606 
5607 static status_t
5608 file_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
5609 	size_t* length)
5610 {
5611 	struct vnode* vnode = descriptor->u.vnode;
5612 	FUNCTION(("file_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
5613 		pos, length, *length));
5614 
5615 	if (S_ISDIR(vnode->Type()))
5616 		return B_IS_A_DIRECTORY;
5617 
5618 	return FS_CALL(vnode, read, descriptor->cookie, pos, buffer, length);
5619 }
5620 
5621 
5622 static status_t
5623 file_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
5624 	size_t* length)
5625 {
5626 	struct vnode* vnode = descriptor->u.vnode;
5627 	FUNCTION(("file_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
5628 		length));
5629 
5630 	if (S_ISDIR(vnode->Type()))
5631 		return B_IS_A_DIRECTORY;
5632 	if (!HAS_FS_CALL(vnode, write))
5633 		return B_READ_ONLY_DEVICE;
5634 
5635 	return FS_CALL(vnode, write, descriptor->cookie, pos, buffer, length);
5636 }
5637 
5638 
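/*!	Computes and sets the new position of the given file descriptor
	according to \a seekType. FIFOs and sockets are not seekable
	(ESPIPE); for devices whose stat size is 0, SEEK_END falls back to
	the size computed from the B_GET_GEOMETRY ioctl. Overflowing the
	64 bit position yields B_BUFFER_OVERFLOW.
*/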
5639 static off_t
5640 file_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
5641 {
5642 	struct vnode* vnode = descriptor->u.vnode;
5643 	off_t offset;
5644 	bool isDevice = false;
5645 
5646 	FUNCTION(("file_seek(pos = %" B_PRIdOFF ", seekType = %d)\n", pos,
5647 		seekType));
5648 
5649 	// some kinds of files are not seekable
5650 	switch (vnode->Type() & S_IFMT) {
5651 		case S_IFIFO:
5652 		case S_IFSOCK:
5653 			return ESPIPE;
5654 
5655 		// drivers publish block devices as character devices, so accept both
5656 		case S_IFBLK:
5657 		case S_IFCHR:
5658 			isDevice = true;
5659 			break;
5660 		// The Open Group Base Specs don't treat any file types besides
5661 		// pipes, FIFOs, and sockets specially, so we allow seeking all others.
5662 		case S_IFREG:
5663 		case S_IFDIR:
5664 		case S_IFLNK:
5665 			break;
5666 	}
5667 
5668 	switch (seekType) {
5669 		case SEEK_SET:
5670 			offset = 0;
5671 			break;
5672 		case SEEK_CUR:
5673 			offset = descriptor->pos;
5674 			break;
5675 		case SEEK_END:
5676 		{
5677 			// stat() the node
5678 			if (!HAS_FS_CALL(vnode, read_stat))
5679 				return B_UNSUPPORTED;
5680 
5681 			struct stat stat;
5682 			status_t status = FS_CALL(vnode, read_stat, &stat);
5683 			if (status != B_OK)
5684 				return status;
5685 
5686 			offset = stat.st_size;
5687 
5688 			if (offset == 0 && isDevice) {
5689 				// stat() on device drivers usually doesn't report a size
5690 				device_geometry geometry;
5691 
5692 				if (HAS_FS_CALL(vnode, ioctl)) {
5693 					status = FS_CALL(vnode, ioctl, descriptor->cookie,
5694 						B_GET_GEOMETRY, &geometry, sizeof(geometry));
5695 					if (status == B_OK)
5696 						offset = (off_t)geometry.bytes_per_sector
5697 							* geometry.sectors_per_track
5698 							* geometry.cylinder_count
5699 							* geometry.head_count;
5700 				}
5701 			}
5702 
5703 			break;
5704 		}
5705 		default:
5706 			return B_BAD_VALUE;
5707 	}
5708 
5709 	// assumes off_t is 64 bits wide
5710 	if (offset > 0 && LONGLONG_MAX - offset < pos)
5711 		return B_BUFFER_OVERFLOW;
5712 
5713 	pos += offset;
5714 	if (pos < 0)
5715 		return B_BAD_VALUE;
5716 
5717 	return descriptor->pos = pos;
5718 }
5719 
5720 
5721 static status_t
5722 file_select(struct file_descriptor* descriptor, uint8 event,
5723 	struct selectsync* sync)
5724 {
5725 	FUNCTION(("file_select(%p, %u, %p)\n", descriptor, event, sync));
5726 
5727 	struct vnode* vnode = descriptor->u.vnode;
5728 
5729 	// If the FS has no select() hook, notify select() now.
5730 	if (!HAS_FS_CALL(vnode, select))
5731 		return notify_select_event(sync, event);
5732 
5733 	return FS_CALL(vnode, select, descriptor->cookie, event, sync);
5734 }
5735 
5736 
5737 static status_t
5738 file_deselect(struct file_descriptor* descriptor, uint8 event,
5739 	struct selectsync* sync)
5740 {
5741 	struct vnode* vnode = descriptor->u.vnode;
5742 
5743 	if (!HAS_FS_CALL(vnode, deselect))
5744 		return B_OK;
5745 
5746 	return FS_CALL(vnode, deselect, descriptor->cookie, event, sync);
5747 }
5748 
5749 
5750 static status_t
5751 dir_create_entry_ref(dev_t mountID, ino_t parentID, const char* name, int perms,
5752 	bool kernel)
5753 {
5754 	struct vnode* vnode;
5755 	status_t status;
5756 
5757 	if (name == NULL || *name == '\0')
5758 		return B_BAD_VALUE;
5759 
5760 	FUNCTION(("dir_create_entry_ref(dev = %" B_PRId32 ", ino = %" B_PRId64 ", "
5761 		"name = '%s', perms = %d)\n", mountID, parentID, name, perms));
5762 
5763 	status = get_vnode(mountID, parentID, &vnode, true, false);
5764 	if (status != B_OK)
5765 		return status;
5766 
5767 	if (HAS_FS_CALL(vnode, create_dir))
5768 		status = FS_CALL(vnode, create_dir, name, perms);
5769 	else
5770 		status = B_READ_ONLY_DEVICE;
5771 
5772 	put_vnode(vnode);
5773 	return status;
5774 }
5775 
5776 
5777 static status_t
5778 dir_create(int fd, char* path, int perms, bool kernel)
5779 {
5780 	char filename[B_FILE_NAME_LENGTH];
5781 	struct vnode* vnode;
5782 	status_t status;
5783 
5784 	FUNCTION(("dir_create: path '%s', perms %d, kernel %d\n", path, perms,
5785 		kernel));
5786 
5787 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
5788 	if (status < 0)
5789 		return status;
5790 
5791 	if (HAS_FS_CALL(vnode, create_dir)) {
5792 		status = FS_CALL(vnode, create_dir, filename, perms);
5793 	} else
5794 		status = B_READ_ONLY_DEVICE;
5795 
5796 	put_vnode(vnode);
5797 	return status;
5798 }
5799 
5800 
5801 static int
5802 dir_open_entry_ref(dev_t mountID, ino_t parentID, const char* name, bool kernel)
5803 {
5804 	FUNCTION(("dir_open_entry_ref()\n"));
5805 
5806 	if (name && name[0] == '\0')
5807 		return B_BAD_VALUE;
5808 
5809 	// get the vnode matching the entry_ref/node_ref
5810 	struct vnode* vnode;
5811 	status_t status;
5812 	if (name) {
5813 		status = entry_ref_to_vnode(mountID, parentID, name, true, kernel,
5814 			&vnode);
5815 	} else
5816 		status = get_vnode(mountID, parentID, &vnode, true, false);
5817 	if (status != B_OK)
5818 		return status;
5819 
5820 	int newFD = open_dir_vnode(vnode, kernel);
5821 	if (newFD >= 0) {
5822 		// The vnode reference has been transferred to the FD
5823 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, mountID, parentID,
5824 			vnode->id, name);
5825 	} else
5826 		put_vnode(vnode);
5827 
5828 	return newFD;
5829 }
5830 
5831 
5832 static int
5833 dir_open(int fd, char* path, bool kernel)
5834 {
5835 	FUNCTION(("dir_open: fd: %d, entry path = '%s', kernel %d\n", fd, path,
5836 		kernel));
5837 
5838 	// get the vnode matching the vnode + path combination
5839 	struct vnode* vnode = NULL;
5840 	ino_t parentID;
5841 	status_t status = fd_and_path_to_vnode(fd, path, true, &vnode, &parentID,
5842 		kernel);
5843 	if (status != B_OK)
5844 		return status;
5845 
5846 	// open the dir
5847 	int newFD = open_dir_vnode(vnode, kernel);
5848 	if (newFD >= 0) {
5849 		// The vnode reference has been transferred to the FD
5850 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
5851 			parentID, vnode->id, NULL);
5852 	} else
5853 		put_vnode(vnode);
5854 
5855 	return newFD;
5856 }
5857 
5858 
5859 static status_t
5860 dir_close(struct file_descriptor* descriptor)
5861 {
5862 	struct vnode* vnode = descriptor->u.vnode;
5863 
5864 	FUNCTION(("dir_close(descriptor = %p)\n", descriptor));
5865 
5866 	cache_node_closed(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
5867 		vnode->id);
5868 	if (HAS_FS_CALL(vnode, close_dir))
5869 		return FS_CALL(vnode, close_dir, descriptor->cookie);
5870 
5871 	return B_OK;
5872 }
5873 
5874 
5875 static void
5876 dir_free_fd(struct file_descriptor* descriptor)
5877 {
5878 	struct vnode* vnode = descriptor->u.vnode;
5879 
5880 	if (vnode != NULL) {
5881 		FS_CALL(vnode, free_dir_cookie, descriptor->cookie);
5882 		put_vnode(vnode);
5883 	}
5884 }
5885 
5886 
5887 static status_t
5888 dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
5889 	struct dirent* buffer, size_t bufferSize, uint32* _count)
5890 {
5891 	return dir_read(ioContext, descriptor->u.vnode, descriptor->cookie, buffer,
5892 		bufferSize, _count);
5893 }
5894 
5895 
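/*!	Adjusts a dirent returned by a file system for the given I/O
	context: sets the parent device/node IDs, resolves ".." for
	covering directories, and replaces the IDs of covered vnodes with
	those of the topmost vnode covering them.
*/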
5896 static status_t
5897 fix_dirent(struct vnode* parent, struct dirent* entry,
5898 	struct io_context* ioContext)
5899 {
5900 	// set d_pdev and d_pino
5901 	entry->d_pdev = parent->device;
5902 	entry->d_pino = parent->id;
5903 
5904 	// If this is the ".." entry and the directory is covering another vnode,
5905 	// we need to replace d_dev and d_ino with the actual values.
5906 	if (strcmp(entry->d_name, "..") == 0 && parent->IsCovering()) {
5907 		return resolve_covered_parent(parent, &entry->d_dev, &entry->d_ino,
5908 			ioContext);
5909 	}
5910 
5911 	// resolve covered vnodes
5912 	ReadLocker _(&sVnodeLock);
5913 
5914 	struct vnode* vnode = lookup_vnode(entry->d_dev, entry->d_ino);
5915 	if (vnode != NULL && vnode->covered_by != NULL) {
5916 		do {
5917 			vnode = vnode->covered_by;
5918 		} while (vnode->covered_by != NULL);
5919 
5920 		entry->d_dev = vnode->device;
5921 		entry->d_ino = vnode->id;
5922 	}
5923 
5924 	return B_OK;
5925 }
5926 
5927 
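/*!	Reads the next directory entries into \a buffer by calling the FS's
	read_dir() hook and then fixes up each returned dirent via
	fix_dirent(), so mount boundaries become transparent to the caller.
*/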
5928 static status_t
5929 dir_read(struct io_context* ioContext, struct vnode* vnode, void* cookie,
5930 	struct dirent* buffer, size_t bufferSize, uint32* _count)
5931 {
5932 	if (!HAS_FS_CALL(vnode, read_dir))
5933 		return B_UNSUPPORTED;
5934 
5935 	status_t error = FS_CALL(vnode, read_dir, cookie, buffer, bufferSize,
5936 		_count);
5937 	if (error != B_OK)
5938 		return error;
5939 
5940 	// we need to adjust the read dirents
5941 	uint32 count = *_count;
5942 	for (uint32 i = 0; i < count; i++) {
5943 		error = fix_dirent(vnode, buffer, ioContext);
5944 		if (error != B_OK)
5945 			return error;
5946 
5947 		buffer = (struct dirent*)((uint8*)buffer + buffer->d_reclen);
5948 	}
5949 
5950 	return error;
5951 }
5952 
5953 
5954 static status_t
5955 dir_rewind(struct file_descriptor* descriptor)
5956 {
5957 	struct vnode* vnode = descriptor->u.vnode;
5958 
5959 	if (HAS_FS_CALL(vnode, rewind_dir)) {
5960 		return FS_CALL(vnode, rewind_dir, descriptor->cookie);
5961 	}
5962 
5963 	return B_UNSUPPORTED;
5964 }
5965 
5966 
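/*!	Removes the directory specified by the (fd, path) pair. The path is
	normalized first, so that it doesn't end in "/" or "/." - removing
	"." or ".." itself is refused with B_NOT_ALLOWED.
*/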
5967 static status_t
5968 dir_remove(int fd, char* path, bool kernel)
5969 {
5970 	char name[B_FILE_NAME_LENGTH];
5971 	struct vnode* directory;
5972 	status_t status;
5973 
5974 	if (path != NULL) {
5975 		// we need to make sure our path name doesn't end with "/", ".",
5976 		// or ".."
5977 		char* lastSlash;
5978 		while ((lastSlash = strrchr(path, '/')) != NULL) {
5979 			char* leaf = lastSlash + 1;
5980 			if (!strcmp(leaf, ".."))
5981 				return B_NOT_ALLOWED;
5982 
5983 			// skip over multiple consecutive slashes
5984 			while (lastSlash > path && lastSlash[-1] == '/')
5985 				lastSlash--;
5986 
5987 			if (leaf[0] != '\0'
5988 				&& strcmp(leaf, ".") != 0) {
5989 				break;
5990 			}
5991 			// "name/" -> "name", or "name/." -> "name"
5992 			lastSlash[0] = '\0';
5993 		}
5994 
5995 		if (!strcmp(path, ".") || !strcmp(path, ".."))
5996 			return B_NOT_ALLOWED;
5997 	}
5998 
5999 	status = fd_and_path_to_dir_vnode(fd, path, &directory, name, kernel);
6000 	if (status != B_OK)
6001 		return status;
6002 
6003 	if (HAS_FS_CALL(directory, remove_dir))
6004 		status = FS_CALL(directory, remove_dir, name);
6005 	else
6006 		status = B_READ_ONLY_DEVICE;
6007 
6008 	put_vnode(directory);
6009 	return status;
6010 }
6011 
6012 
6013 static status_t
6014 common_ioctl(struct file_descriptor* descriptor, ulong op, void* buffer,
6015 	size_t length)
6016 {
6017 	struct vnode* vnode = descriptor->u.vnode;
6018 
6019 	if (HAS_FS_CALL(vnode, ioctl))
6020 		return FS_CALL(vnode, ioctl, descriptor->cookie, op, buffer, length);
6021 
6022 	return B_DEV_INVALID_IOCTL;
6023 }
6024 
6025 
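/*!	Backend for the fcntl() syscall. Supports the descriptor flag ops
	F_SETFD/F_GETFD, the open mode ops F_SETFL/F_GETFL (restricted to
	O_APPEND and O_NONBLOCK), descriptor duplication via F_DUPFD, and
	advisory locking via F_GETLK/F_SETLK/F_SETLKW.
*/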
6026 static status_t
6027 common_fcntl(int fd, int op, size_t argument, bool kernel)
6028 {
6029 	struct flock flock;
6030 
6031 	FUNCTION(("common_fcntl(fd = %d, op = %d, argument = %lx, %s)\n",
6032 		fd, op, argument, kernel ? "kernel" : "user"));
6033 
6034 	struct file_descriptor* descriptor = get_fd(get_current_io_context(kernel),
6035 		fd);
6036 	if (descriptor == NULL)
6037 		return B_FILE_ERROR;
6038 
6039 	struct vnode* vnode = fd_vnode(descriptor);
6040 
6041 	status_t status = B_OK;
6042 
6043 	if (op == F_SETLK || op == F_SETLKW || op == F_GETLK) {
6044 		if (descriptor->type != FDTYPE_FILE)
6045 			status = B_BAD_VALUE;
6046 		else if (user_memcpy(&flock, (struct flock*)argument,
6047 				sizeof(struct flock)) != B_OK)
6048 			status = B_BAD_ADDRESS;
6049 
6050 		if (status != B_OK) {
6051 			put_fd(descriptor);
6052 			return status;
6053 		}
6054 	}
6055 
6056 	switch (op) {
6057 		case F_SETFD:
6058 		{
6059 			// Set file descriptor flags
6060 			struct io_context* context = get_current_io_context(kernel);
6061 
6062 			// FD_CLOEXEC is the only flag available at this time
6063 			mutex_lock(&context->io_mutex);
6064 			fd_set_close_on_exec(context, fd, (argument & FD_CLOEXEC) != 0);
6065 			mutex_unlock(&context->io_mutex);
6066 
6067 			status = B_OK;
6068 			break;
6069 		}
6070 
6071 		case F_GETFD:
6072 		{
6073 			struct io_context* context = get_current_io_context(kernel);
6074 
6075 			// Get file descriptor flags
6076 			mutex_lock(&context->io_mutex);
6077 			status = fd_close_on_exec(context, fd) ? FD_CLOEXEC : 0;
6078 			mutex_unlock(&context->io_mutex);
6079 			break;
6080 		}
6081 
6082 		case F_SETFL:
6083 			// Set file descriptor open mode
6084 
6085 			// we only accept changes to O_APPEND and O_NONBLOCK
6086 			argument &= O_APPEND | O_NONBLOCK;
6087 			if (descriptor->ops->fd_set_flags != NULL) {
6088 				status = descriptor->ops->fd_set_flags(descriptor, argument);
6089 			} else if (vnode != NULL && HAS_FS_CALL(vnode, set_flags)) {
6090 				status = FS_CALL(vnode, set_flags, descriptor->cookie,
6091 					(int)argument);
6092 			} else
6093 				status = B_UNSUPPORTED;
6094 
6095 			if (status == B_OK) {
6096 				// update this descriptor's open_mode field
6097 				descriptor->open_mode = (descriptor->open_mode
6098 					& ~(O_APPEND | O_NONBLOCK)) | argument;
6099 			}
6100 
6101 			break;
6102 
6103 		case F_GETFL:
6104 			// Get file descriptor open mode
6105 			status = descriptor->open_mode;
6106 			break;
6107 
6108 		case F_DUPFD:
6109 		{
6110 			struct io_context* context = get_current_io_context(kernel);
6111 
6112 			status = new_fd_etc(context, descriptor, (int)argument);
6113 			if (status >= 0) {
6114 				mutex_lock(&context->io_mutex);
6115 				fd_set_close_on_exec(context, status, false);
6116 				mutex_unlock(&context->io_mutex);
6117 
6118 				atomic_add(&descriptor->ref_count, 1);
6119 			}
6120 			break;
6121 		}
6122 
6123 		case F_GETLK:
6124 			if (vnode != NULL) {
6125 				struct flock normalizedLock;
6126 
6127 				memcpy(&normalizedLock, &flock, sizeof(struct flock));
6128 				status = normalize_flock(descriptor, &normalizedLock);
6129 				if (status != B_OK)
6130 					break;
6131 
6132 				if (HAS_FS_CALL(vnode, test_lock)) {
6133 					status = FS_CALL(vnode, test_lock, descriptor->cookie,
6134 						&normalizedLock);
6135 				} else
6136 					status = test_advisory_lock(vnode, &normalizedLock);
6137 				if (status == B_OK) {
6138 					if (normalizedLock.l_type == F_UNLCK) {
6139 						// no conflicting lock found, copy back the same struct
6140 						// we were given except change type to F_UNLCK
6141 						flock.l_type = F_UNLCK;
6142 						status = user_memcpy((struct flock*)argument, &flock,
6143 							sizeof(struct flock));
6144 					} else {
6145 						// a conflicting lock was found, copy back its range and
6146 						// type
6147 						if (normalizedLock.l_len == OFF_MAX)
6148 							normalizedLock.l_len = 0;
6149 
6150 						status = user_memcpy((struct flock*)argument,
6151 							&normalizedLock, sizeof(struct flock));
6152 					}
6153 				}
6154 			} else
6155 				status = B_BAD_VALUE;
6156 			break;
6157 
6158 		case F_SETLK:
6159 		case F_SETLKW:
6160 			status = normalize_flock(descriptor, &flock);
6161 			if (status != B_OK)
6162 				break;
6163 
6164 			if (vnode == NULL) {
6165 				status = B_BAD_VALUE;
6166 			} else if (flock.l_type == F_UNLCK) {
6167 				if (HAS_FS_CALL(vnode, release_lock)) {
6168 					status = FS_CALL(vnode, release_lock, descriptor->cookie,
6169 						&flock);
6170 				} else
6171 					status = release_advisory_lock(vnode, &flock);
6172 			} else {
6173 				// the open mode must match the lock type
6174 				if (((descriptor->open_mode & O_RWMASK) == O_RDONLY
6175 						&& flock.l_type == F_WRLCK)
6176 					|| ((descriptor->open_mode & O_RWMASK) == O_WRONLY
6177 						&& flock.l_type == F_RDLCK))
6178 					status = B_FILE_ERROR;
6179 				else {
6180 					if (HAS_FS_CALL(vnode, acquire_lock)) {
6181 						status = FS_CALL(vnode, acquire_lock,
6182 							descriptor->cookie, &flock, op == F_SETLKW);
6183 					} else {
6184 						status = acquire_advisory_lock(vnode, -1,
6185 							&flock, op == F_SETLKW);
6186 					}
6187 				}
6188 			}
6189 			break;
6190 
6191 		// ToDo: add support for more ops?
6192 
6193 		default:
6194 			status = B_BAD_VALUE;
6195 	}
6196 
6197 	put_fd(descriptor);
6198 	return status;
6199 }
6200 
6201 
6202 static status_t
6203 common_sync(int fd, bool kernel)
6204 {
6205 	struct file_descriptor* descriptor;
6206 	struct vnode* vnode;
6207 	status_t status;
6208 
6209 	FUNCTION(("common_sync: entry. fd %d kernel %d\n", fd, kernel));
6210 
6211 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6212 	if (descriptor == NULL)
6213 		return B_FILE_ERROR;
6214 
6215 	if (HAS_FS_CALL(vnode, fsync))
6216 		status = FS_CALL_NO_PARAMS(vnode, fsync);
6217 	else
6218 		status = B_UNSUPPORTED;
6219 
6220 	put_fd(descriptor);
6221 	return status;
6222 }
6223 
6224 
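/*!	Marks the node the descriptor refers to as mandatorily locked by
	this descriptor. The holder is set with an atomic test-and-set, so
	a concurrently acquired lock results in B_BUSY instead of a race.
*/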
6225 static status_t
6226 common_lock_node(int fd, bool kernel)
6227 {
6228 	struct file_descriptor* descriptor;
6229 	struct vnode* vnode;
6230 
6231 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6232 	if (descriptor == NULL)
6233 		return B_FILE_ERROR;
6234 
6235 	status_t status = B_OK;
6236 
6237 	// We need to set the locking atomically - someone
6238 	// else might set one at the same time
6239 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by, descriptor,
6240 			(file_descriptor*)NULL) != NULL)
6241 		status = B_BUSY;
6242 
6243 	put_fd(descriptor);
6244 	return status;
6245 }
6246 
6247 
6248 static status_t
6249 common_unlock_node(int fd, bool kernel)
6250 {
6251 	struct file_descriptor* descriptor;
6252 	struct vnode* vnode;
6253 
6254 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6255 	if (descriptor == NULL)
6256 		return B_FILE_ERROR;
6257 
6258 	status_t status = B_OK;
6259 
6260 	// We need to clear the locking atomically - someone
6261 	// else might clear it at the same time
6262 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
6263 			(file_descriptor*)NULL, descriptor) != descriptor)
6264 		status = B_BAD_VALUE;
6265 
6266 	put_fd(descriptor);
6267 	return status;
6268 }
6269 
6270 
6271 static status_t
6272 common_read_link(int fd, char* path, char* buffer, size_t* _bufferSize,
6273 	bool kernel)
6274 {
6275 	struct vnode* vnode;
6276 	status_t status;
6277 
6278 	status = fd_and_path_to_vnode(fd, path, false, &vnode, NULL, kernel);
6279 	if (status != B_OK)
6280 		return status;
6281 
6282 	if (HAS_FS_CALL(vnode, read_symlink)) {
6283 		status = FS_CALL(vnode, read_symlink, buffer, _bufferSize);
6284 	} else
6285 		status = B_BAD_VALUE;
6286 
6287 	put_vnode(vnode);
6288 	return status;
6289 }
6290 
6291 
6292 static status_t
6293 common_create_symlink(int fd, char* path, const char* toPath, int mode,
6294 	bool kernel)
6295 {
6296 	// path validity checks have to be in the calling function!
6297 	char name[B_FILE_NAME_LENGTH];
6298 	struct vnode* vnode;
6299 	status_t status;
6300 
6301 	FUNCTION(("common_create_symlink(fd = %d, path = %s, toPath = %s, "
6302 		"mode = %d, kernel = %d)\n", fd, path, toPath, mode, kernel));
6303 
6304 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, name, kernel);
6305 	if (status != B_OK)
6306 		return status;
6307 
6308 	if (HAS_FS_CALL(vnode, create_symlink))
6309 		status = FS_CALL(vnode, create_symlink, name, toPath, mode);
6310 	else {
6311 		status = HAS_FS_CALL(vnode, write)
6312 			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6313 	}
6314 
6315 	put_vnode(vnode);
6316 
6317 	return status;
6318 }
6319 
6320 
6321 static status_t
6322 common_create_link(int pathFD, char* path, int toFD, char* toPath,
6323 	bool traverseLeafLink, bool kernel)
6324 {
6325 	// path validity checks have to be in the calling function!
6326 
6327 	FUNCTION(("common_create_link(path = %s, toPath = %s, kernel = %d)\n", path,
6328 		toPath, kernel));
6329 
6330 	char name[B_FILE_NAME_LENGTH];
6331 	struct vnode* directory;
6332 	status_t status = fd_and_path_to_dir_vnode(pathFD, path, &directory, name,
6333 		kernel);
6334 	if (status != B_OK)
6335 		return status;
6336 
6337 	struct vnode* vnode;
6338 	status = fd_and_path_to_vnode(toFD, toPath, traverseLeafLink, &vnode, NULL,
6339 		kernel);
6340 	if (status != B_OK)
6341 		goto err;
6342 
6343 	if (directory->mount != vnode->mount) {
6344 		status = B_CROSS_DEVICE_LINK;
6345 		goto err1;
6346 	}
6347 
6348 	if (HAS_FS_CALL(directory, link))
6349 		status = FS_CALL(directory, link, name, vnode);
6350 	else
6351 		status = B_READ_ONLY_DEVICE;
6352 
6353 err1:
6354 	put_vnode(vnode);
6355 err:
6356 	put_vnode(directory);
6357 
6358 	return status;
6359 }
6360 
6361 
6362 static status_t
6363 common_unlink(int fd, char* path, bool kernel)
6364 {
6365 	char filename[B_FILE_NAME_LENGTH];
6366 	struct vnode* vnode;
6367 	status_t status;
6368 
6369 	FUNCTION(("common_unlink: fd: %d, path '%s', kernel %d\n", fd, path,
6370 		kernel));
6371 
6372 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
6373 	if (status < 0)
6374 		return status;
6375 
6376 	if (HAS_FS_CALL(vnode, unlink))
6377 		status = FS_CALL(vnode, unlink, filename);
6378 	else
6379 		status = B_READ_ONLY_DEVICE;
6380 
6381 	put_vnode(vnode);
6382 
6383 	return status;
6384 }
6385 
6386 
6387 static status_t
6388 common_access(int fd, char* path, int mode, bool effectiveUserGroup, bool kernel)
6389 {
6390 	struct vnode* vnode;
6391 	status_t status;
6392 
6393 	// TODO: honor effectiveUserGroup argument
6394 
6395 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
6396 	if (status != B_OK)
6397 		return status;
6398 
6399 	if (HAS_FS_CALL(vnode, access))
6400 		status = FS_CALL(vnode, access, mode);
6401 	else
6402 		status = B_OK;
6403 
6404 	put_vnode(vnode);
6405 
6406 	return status;
6407 }
6408 
6409 
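/*!	Renames the entry given by (fd, path) to (newFD, newPath). Both
	entries must live on the same volume (B_CROSS_DEVICE_LINK
	otherwise), and renaming "." or ".." or an entry onto itself is
	refused with B_BAD_VALUE.
*/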
6410 static status_t
6411 common_rename(int fd, char* path, int newFD, char* newPath, bool kernel)
6412 {
6413 	struct vnode* fromVnode;
6414 	struct vnode* toVnode;
6415 	char fromName[B_FILE_NAME_LENGTH];
6416 	char toName[B_FILE_NAME_LENGTH];
6417 	status_t status;
6418 
6419 	FUNCTION(("common_rename(fd = %d, path = %s, newFD = %d, newPath = %s, "
6420 		"kernel = %d)\n", fd, path, newFD, newPath, kernel));
6421 
6422 	status = fd_and_path_to_dir_vnode(fd, path, &fromVnode, fromName, kernel);
6423 	if (status != B_OK)
6424 		return status;
6425 
6426 	status = fd_and_path_to_dir_vnode(newFD, newPath, &toVnode, toName, kernel);
6427 	if (status != B_OK)
6428 		goto err1;
6429 
6430 	if (fromVnode->device != toVnode->device) {
6431 		status = B_CROSS_DEVICE_LINK;
6432 		goto err2;
6433 	}
6434 
6435 	if (fromName[0] == '\0' || toName[0] == '\0'
6436 		|| !strcmp(fromName, ".") || !strcmp(fromName, "..")
6437 		|| !strcmp(toName, ".") || !strcmp(toName, "..")
6438 		|| (fromVnode == toVnode && !strcmp(fromName, toName))) {
6439 		status = B_BAD_VALUE;
6440 		goto err2;
6441 	}
6442 
6443 	if (HAS_FS_CALL(fromVnode, rename))
6444 		status = FS_CALL(fromVnode, rename, fromName, toVnode, toName);
6445 	else
6446 		status = B_READ_ONLY_DEVICE;
6447 
6448 err2:
6449 	put_vnode(toVnode);
6450 err1:
6451 	put_vnode(fromVnode);
6452 
6453 	return status;
6454 }
6455 
6456 
6457 static status_t
6458 common_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6459 {
6460 	struct vnode* vnode = descriptor->u.vnode;
6461 
6462 	FUNCTION(("common_read_stat: stat %p\n", stat));
6463 
6464 	// TODO: remove this once all file systems properly set them!
6465 	stat->st_crtim.tv_nsec = 0;
6466 	stat->st_ctim.tv_nsec = 0;
6467 	stat->st_mtim.tv_nsec = 0;
6468 	stat->st_atim.tv_nsec = 0;
6469 
6470 	return vfs_stat_vnode(vnode, stat);
6471 }
6472 
6473 
6474 static status_t
6475 common_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6476 	int statMask)
6477 {
6478 	struct vnode* vnode = descriptor->u.vnode;
6479 
6480 	FUNCTION(("common_write_stat(vnode = %p, stat = %p, statMask = %d)\n",
6481 		vnode, stat, statMask));
6482 
6483 	if (!HAS_FS_CALL(vnode, write_stat))
6484 		return B_READ_ONLY_DEVICE;
6485 
6486 	return FS_CALL(vnode, write_stat, stat, statMask);
6487 }
6488 
6489 
6490 static status_t
6491 common_path_read_stat(int fd, char* path, bool traverseLeafLink,
6492 	struct stat* stat, bool kernel)
6493 {
6494 	FUNCTION(("common_path_read_stat: fd: %d, path '%s', stat %p,\n", fd, path,
6495 		stat));
6496 
6497 	struct vnode* vnode;
6498 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6499 		NULL, kernel);
6500 	if (status != B_OK)
6501 		return status;
6502 
6503 	status = vfs_stat_vnode(vnode, stat);
6504 
6505 	put_vnode(vnode);
6506 	return status;
6507 }
6508 
6509 
6510 static status_t
6511 common_path_write_stat(int fd, char* path, bool traverseLeafLink,
6512 	const struct stat* stat, int statMask, bool kernel)
6513 {
6514 	FUNCTION(("common_write_stat: fd: %d, path '%s', stat %p, stat_mask %d, "
6515 		"kernel %d\n", fd, path, stat, statMask, kernel));
6516 
6517 	struct vnode* vnode;
6518 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6519 		NULL, kernel);
6520 	if (status != B_OK)
6521 		return status;
6522 
6523 	if (HAS_FS_CALL(vnode, write_stat))
6524 		status = FS_CALL(vnode, write_stat, stat, statMask);
6525 	else
6526 		status = B_READ_ONLY_DEVICE;
6527 
6528 	put_vnode(vnode);
6529 
6530 	return status;
6531 }
6532 
6533 
6534 static int
6535 attr_dir_open(int fd, char* path, bool traverseLeafLink, bool kernel)
6536 {
6537 	FUNCTION(("attr_dir_open(fd = %d, path = '%s', kernel = %d)\n", fd, path,
6538 		kernel));
6539 
6540 	struct vnode* vnode;
6541 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6542 		NULL, kernel);
6543 	if (status != B_OK)
6544 		return status;
6545 
6546 	status = open_attr_dir_vnode(vnode, kernel);
6547 	if (status < 0)
6548 		put_vnode(vnode);
6549 
6550 	return status;
6551 }
6552 
6553 
6554 static status_t
6555 attr_dir_close(struct file_descriptor* descriptor)
6556 {
6557 	struct vnode* vnode = descriptor->u.vnode;
6558 
6559 	FUNCTION(("attr_dir_close(descriptor = %p)\n", descriptor));
6560 
6561 	if (HAS_FS_CALL(vnode, close_attr_dir))
6562 		return FS_CALL(vnode, close_attr_dir, descriptor->cookie);
6563 
6564 	return B_OK;
6565 }
6566 
6567 
6568 static void
6569 attr_dir_free_fd(struct file_descriptor* descriptor)
6570 {
6571 	struct vnode* vnode = descriptor->u.vnode;
6572 
6573 	if (vnode != NULL) {
6574 		FS_CALL(vnode, free_attr_dir_cookie, descriptor->cookie);
6575 		put_vnode(vnode);
6576 	}
6577 }
6578 
6579 
6580 static status_t
6581 attr_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6582 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6583 {
6584 	struct vnode* vnode = descriptor->u.vnode;
6585 
6586 	FUNCTION(("attr_dir_read(descriptor = %p)\n", descriptor));
6587 
6588 	if (HAS_FS_CALL(vnode, read_attr_dir))
6589 		return FS_CALL(vnode, read_attr_dir, descriptor->cookie, buffer,
6590 			bufferSize, _count);
6591 
6592 	return B_UNSUPPORTED;
6593 }
6594 
6595 
6596 static status_t
6597 attr_dir_rewind(struct file_descriptor* descriptor)
6598 {
6599 	struct vnode* vnode = descriptor->u.vnode;
6600 
6601 	FUNCTION(("attr_dir_rewind(descriptor = %p)\n", descriptor));
6602 
6603 	if (HAS_FS_CALL(vnode, rewind_attr_dir))
6604 		return FS_CALL(vnode, rewind_attr_dir, descriptor->cookie);
6605 
6606 	return B_UNSUPPORTED;
6607 }
6608 
6609 
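/*!	Creates and opens an attribute \a name on the node given by the
	(fd, path) pair and returns a new file descriptor for it. If no
	descriptor can be allocated, the freshly created attribute is
	closed and removed again.
*/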
6610 static int
6611 attr_create(int fd, char* path, const char* name, uint32 type,
6612 	int openMode, bool kernel)
6613 {
6614 	if (name == NULL || *name == '\0')
6615 		return B_BAD_VALUE;
6616 
6617 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6618 	struct vnode* vnode;
6619 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6620 		kernel);
6621 	if (status != B_OK)
6622 		return status;
6623 
6624 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6625 		status = B_LINK_LIMIT;
6626 		goto err;
6627 	}
6628 
6629 	if (!HAS_FS_CALL(vnode, create_attr)) {
6630 		status = B_READ_ONLY_DEVICE;
6631 		goto err;
6632 	}
6633 
6634 	void* cookie;
6635 	status = FS_CALL(vnode, create_attr, name, type, openMode, &cookie);
6636 	if (status != B_OK)
6637 		goto err;
6638 
6639 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6640 	if (fd >= 0)
6641 		return fd;
6642 
6643 	status = fd;
6644 
6645 	FS_CALL(vnode, close_attr, cookie);
6646 	FS_CALL(vnode, free_attr_cookie, cookie);
6647 
6648 	FS_CALL(vnode, remove_attr, name);
6649 
6650 err:
6651 	put_vnode(vnode);
6652 
6653 	return status;
6654 }
6655 
6656 
6657 static int
6658 attr_open(int fd, char* path, const char* name, int openMode, bool kernel)
6659 {
6660 	if (name == NULL || *name == '\0')
6661 		return B_BAD_VALUE;
6662 
6663 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6664 	struct vnode* vnode;
6665 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6666 		kernel);
6667 	if (status != B_OK)
6668 		return status;
6669 
6670 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6671 		status = B_LINK_LIMIT;
6672 		goto err;
6673 	}
6674 
6675 	if (!HAS_FS_CALL(vnode, open_attr)) {
6676 		status = B_UNSUPPORTED;
6677 		goto err;
6678 	}
6679 
6680 	void* cookie;
6681 	status = FS_CALL(vnode, open_attr, name, openMode, &cookie);
6682 	if (status != B_OK)
6683 		goto err;
6684 
6685 	// now we only need a file descriptor for this attribute and we're done
6686 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6687 	if (fd >= 0)
6688 		return fd;
6689 
6690 	status = fd;
6691 
6692 	FS_CALL(vnode, close_attr, cookie);
6693 	FS_CALL(vnode, free_attr_cookie, cookie);
6694 
6695 err:
6696 	put_vnode(vnode);
6697 
6698 	return status;
6699 }
6700 
6701 
6702 static status_t
6703 attr_close(struct file_descriptor* descriptor)
6704 {
6705 	struct vnode* vnode = descriptor->u.vnode;
6706 
6707 	FUNCTION(("attr_close(descriptor = %p)\n", descriptor));
6708 
6709 	if (HAS_FS_CALL(vnode, close_attr))
6710 		return FS_CALL(vnode, close_attr, descriptor->cookie);
6711 
6712 	return B_OK;
6713 }
6714 
6715 
6716 static void
6717 attr_free_fd(struct file_descriptor* descriptor)
6718 {
6719 	struct vnode* vnode = descriptor->u.vnode;
6720 
6721 	if (vnode != NULL) {
6722 		FS_CALL(vnode, free_attr_cookie, descriptor->cookie);
6723 		put_vnode(vnode);
6724 	}
6725 }
6726 
6727 
6728 static status_t
6729 attr_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
6730 	size_t* length)
6731 {
6732 	struct vnode* vnode = descriptor->u.vnode;
6733 
6734 	FUNCTION(("attr_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
6735 		pos, length, *length));
6736 
6737 	if (!HAS_FS_CALL(vnode, read_attr))
6738 		return B_UNSUPPORTED;
6739 
6740 	return FS_CALL(vnode, read_attr, descriptor->cookie, pos, buffer, length);
6741 }
6742 
6743 
6744 static status_t
6745 attr_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
6746 	size_t* length)
6747 {
6748 	struct vnode* vnode = descriptor->u.vnode;
6749 
6750 	FUNCTION(("attr_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
6751 		length));
6752 
6753 	if (!HAS_FS_CALL(vnode, write_attr))
6754 		return B_UNSUPPORTED;
6755 
6756 	return FS_CALL(vnode, write_attr, descriptor->cookie, pos, buffer, length);
6757 }
6758 
6759 
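/*!	Like file_seek(), but for attribute descriptors: SEEK_END is based
	on the attribute size reported by the FS's read_attr_stat() hook.
*/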
6760 static off_t
6761 attr_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
6762 {
6763 	off_t offset;
6764 
6765 	switch (seekType) {
6766 		case SEEK_SET:
6767 			offset = 0;
6768 			break;
6769 		case SEEK_CUR:
6770 			offset = descriptor->pos;
6771 			break;
6772 		case SEEK_END:
6773 		{
6774 			struct vnode* vnode = descriptor->u.vnode;
6775 			if (!HAS_FS_CALL(vnode, read_attr_stat))
6776 				return B_UNSUPPORTED;
6777 
6778 			struct stat stat;
6779 			status_t status = FS_CALL(vnode, read_attr_stat, descriptor->cookie,
6780 				&stat);
6781 			if (status != B_OK)
6782 				return status;
6783 
6784 			offset = stat.st_size;
6785 			break;
6786 		}
6787 		default:
6788 			return B_BAD_VALUE;
6789 	}
6790 
6791 	// assumes off_t is 64 bits wide
6792 	if (offset > 0 && LONGLONG_MAX - offset < pos)
6793 		return B_BUFFER_OVERFLOW;
6794 
6795 	pos += offset;
6796 	if (pos < 0)
6797 		return B_BAD_VALUE;
6798 
6799 	return descriptor->pos = pos;
6800 }
6801 
6802 
6803 static status_t
6804 attr_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6805 {
6806 	struct vnode* vnode = descriptor->u.vnode;
6807 
6808 	FUNCTION(("attr_read_stat: stat 0x%p\n", stat));
6809 
6810 	if (!HAS_FS_CALL(vnode, read_attr_stat))
6811 		return B_UNSUPPORTED;
6812 
6813 	return FS_CALL(vnode, read_attr_stat, descriptor->cookie, stat);
6814 }
6815 
6816 
6817 static status_t
6818 attr_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6819 	int statMask)
6820 {
6821 	struct vnode* vnode = descriptor->u.vnode;
6822 
6823 	FUNCTION(("attr_write_stat: stat = %p, statMask %d\n", stat, statMask));
6824 
6825 	if (!HAS_FS_CALL(vnode, write_attr_stat))
6826 		return B_READ_ONLY_DEVICE;
6827 
6828 	return FS_CALL(vnode, write_attr_stat, descriptor->cookie, stat, statMask);
6829 }
6830 
6831 
6832 static status_t
6833 attr_remove(int fd, const char* name, bool kernel)
6834 {
6835 	struct file_descriptor* descriptor;
6836 	struct vnode* vnode;
6837 	status_t status;
6838 
6839 	if (name == NULL || *name == '\0')
6840 		return B_BAD_VALUE;
6841 
6842 	FUNCTION(("attr_remove: fd = %d, name = \"%s\", kernel %d\n", fd, name,
6843 		kernel));
6844 
6845 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6846 	if (descriptor == NULL)
6847 		return B_FILE_ERROR;
6848 
6849 	if (HAS_FS_CALL(vnode, remove_attr))
6850 		status = FS_CALL(vnode, remove_attr, name);
6851 	else
6852 		status = B_READ_ONLY_DEVICE;
6853 
6854 	put_fd(descriptor);
6855 
6856 	return status;
6857 }
6858 
6859 
6860 static status_t
6861 attr_rename(int fromFD, const char* fromName, int toFD, const char* toName,
6862 	bool kernel)
6863 {
6864 	struct file_descriptor* fromDescriptor;
6865 	struct file_descriptor* toDescriptor;
6866 	struct vnode* fromVnode;
6867 	struct vnode* toVnode;
6868 	status_t status;
6869 
6870 	if (fromName == NULL || *fromName == '\0' || toName == NULL
6871 		|| *toName == '\0')
6872 		return B_BAD_VALUE;
6873 
6874 	FUNCTION(("attr_rename: from fd = %d, from name = \"%s\", to fd = %d, to "
6875 		"name = \"%s\", kernel %d\n", fromFD, fromName, toFD, toName, kernel));
6876 
6877 	fromDescriptor = get_fd_and_vnode(fromFD, &fromVnode, kernel);
6878 	if (fromDescriptor == NULL)
6879 		return B_FILE_ERROR;
6880 
6881 	toDescriptor = get_fd_and_vnode(toFD, &toVnode, kernel);
6882 	if (toDescriptor == NULL) {
6883 		status = B_FILE_ERROR;
6884 		goto err;
6885 	}
6886 
6887 	// are the files on the same volume?
6888 	if (fromVnode->device != toVnode->device) {
6889 		status = B_CROSS_DEVICE_LINK;
6890 		goto err1;
6891 	}
6892 
6893 	if (HAS_FS_CALL(fromVnode, rename_attr)) {
6894 		status = FS_CALL(fromVnode, rename_attr, fromName, toVnode, toName);
6895 	} else
6896 		status = B_READ_ONLY_DEVICE;
6897 
6898 err1:
6899 	put_fd(toDescriptor);
6900 err:
6901 	put_fd(fromDescriptor);
6902 
6903 	return status;
6904 }
6905 
6906 
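/*!	Opens the index directory of the volume given by \a mountID and
	returns a new file descriptor for it. On failure the cookie
	obtained from the FS's open_index_dir() hook is released again.
*/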
6907 static int
6908 index_dir_open(dev_t mountID, bool kernel)
6909 {
6910 	struct fs_mount* mount;
6911 	void* cookie;
6912 
6913 	FUNCTION(("index_dir_open(mountID = %" B_PRId32 ", kernel = %d)\n", mountID,
6914 		kernel));
6915 
6916 	status_t status = get_mount(mountID, &mount);
6917 	if (status != B_OK)
6918 		return status;
6919 
6920 	if (!HAS_FS_MOUNT_CALL(mount, open_index_dir)) {
6921 		status = B_UNSUPPORTED;
6922 		goto error;
6923 	}
6924 
6925 	status = FS_MOUNT_CALL(mount, open_index_dir, &cookie);
6926 	if (status != B_OK)
6927 		goto error;
6928 
6929 	// get fd for the index directory
6930 	int fd;
6931 	fd = get_new_fd(FDTYPE_INDEX_DIR, mount, NULL, cookie, O_CLOEXEC, kernel);
6932 	if (fd >= 0)
6933 		return fd;
6934 
6935 	// something went wrong
6936 	FS_MOUNT_CALL(mount, close_index_dir, cookie);
6937 	FS_MOUNT_CALL(mount, free_index_dir_cookie, cookie);
6938 
6939 	status = fd;
6940 
6941 error:
6942 	put_mount(mount);
6943 	return status;
6944 }
6945 
6946 
6947 static status_t
6948 index_dir_close(struct file_descriptor* descriptor)
6949 {
6950 	struct fs_mount* mount = descriptor->u.mount;
6951 
6952 	FUNCTION(("index_dir_close(descriptor = %p)\n", descriptor));
6953 
6954 	if (HAS_FS_MOUNT_CALL(mount, close_index_dir))
6955 		return FS_MOUNT_CALL(mount, close_index_dir, descriptor->cookie);
6956 
6957 	return B_OK;
6958 }
6959 
6960 
6961 static void
6962 index_dir_free_fd(struct file_descriptor* descriptor)
6963 {
6964 	struct fs_mount* mount = descriptor->u.mount;
6965 
6966 	if (mount != NULL) {
6967 		FS_MOUNT_CALL(mount, free_index_dir_cookie, descriptor->cookie);
6968 		put_mount(mount);
6969 	}
6970 }
6971 
6972 
6973 static status_t
6974 index_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6975 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6976 {
6977 	struct fs_mount* mount = descriptor->u.mount;
6978 
6979 	if (HAS_FS_MOUNT_CALL(mount, read_index_dir)) {
6980 		return FS_MOUNT_CALL(mount, read_index_dir, descriptor->cookie, buffer,
6981 			bufferSize, _count);
6982 	}
6983 
6984 	return B_UNSUPPORTED;
6985 }
6986 
6987 
6988 static status_t
6989 index_dir_rewind(struct file_descriptor* descriptor)
6990 {
6991 	struct fs_mount* mount = descriptor->u.mount;
6992 
6993 	if (HAS_FS_MOUNT_CALL(mount, rewind_index_dir))
6994 		return FS_MOUNT_CALL(mount, rewind_index_dir, descriptor->cookie);
6995 
6996 	return B_UNSUPPORTED;
6997 }
6998 
6999 
7000 static status_t
7001 index_create(dev_t mountID, const char* name, uint32 type, uint32 flags,
7002 	bool kernel)
7003 {
7004 	FUNCTION(("index_create(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7005 		mountID, name, kernel));
7006 
7007 	struct fs_mount* mount;
7008 	status_t status = get_mount(mountID, &mount);
7009 	if (status != B_OK)
7010 		return status;
7011 
7012 	if (!HAS_FS_MOUNT_CALL(mount, create_index)) {
7013 		status = B_READ_ONLY_DEVICE;
7014 		goto out;
7015 	}
7016 
7017 	status = FS_MOUNT_CALL(mount, create_index, name, type, flags);
7018 
7019 out:
7020 	put_mount(mount);
7021 	return status;
7022 }
7023 
7024 
7025 #if 0
7026 static status_t
7027 index_read_stat(struct file_descriptor* descriptor, struct stat* stat)
7028 {
7029 	struct vnode* vnode = descriptor->u.vnode;
7030 
7031 	// ToDo: currently unused!
7032 	FUNCTION(("index_read_stat: stat 0x%p\n", stat));
7033 	if (!HAS_FS_CALL(vnode, read_index_stat))
7034 		return B_UNSUPPORTED;
7035 
7036 	return B_UNSUPPORTED;
7037 	//return FS_CALL(vnode, read_index_stat, descriptor->cookie, stat);
7038 }
7039 
7040 
7041 static void
7042 index_free_fd(struct file_descriptor* descriptor)
7043 {
7044 	struct vnode* vnode = descriptor->u.vnode;
7045 
7046 	if (vnode != NULL) {
7047 		FS_CALL(vnode, free_index_cookie, descriptor->cookie);
7048 		put_vnode(vnode);
7049 	}
7050 }
7051 #endif
7052 
7053 
7054 static status_t
7055 index_name_read_stat(dev_t mountID, const char* name, struct stat* stat,
7056 	bool kernel)
7057 {
7058 	FUNCTION(("index_name_read_stat(mountID = %" B_PRId32 ", name = %s, "
7059 		"kernel = %d)\n", mountID, name, kernel));
7060 
7061 	struct fs_mount* mount;
7062 	status_t status = get_mount(mountID, &mount);
7063 	if (status != B_OK)
7064 		return status;
7065 
7066 	if (!HAS_FS_MOUNT_CALL(mount, read_index_stat)) {
7067 		status = B_UNSUPPORTED;
7068 		goto out;
7069 	}
7070 
7071 	status = FS_MOUNT_CALL(mount, read_index_stat, name, stat);
7072 
7073 out:
7074 	put_mount(mount);
7075 	return status;
7076 }
7077 
7078 
7079 static status_t
7080 index_remove(dev_t mountID, const char* name, bool kernel)
7081 {
7082 	FUNCTION(("index_remove(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7083 		mountID, name, kernel));
7084 
7085 	struct fs_mount* mount;
7086 	status_t status = get_mount(mountID, &mount);
7087 	if (status != B_OK)
7088 		return status;
7089 
7090 	if (!HAS_FS_MOUNT_CALL(mount, remove_index)) {
7091 		status = B_READ_ONLY_DEVICE;
7092 		goto out;
7093 	}
7094 
7095 	status = FS_MOUNT_CALL(mount, remove_index, name);
7096 
7097 out:
7098 	put_mount(mount);
7099 	return status;
7100 }
7101 
7102 
7103 /*!	TODO: the query FS API is still pretty much the same as in R5.
7104 		It would be nice if queries would get some more kernel
7105 		support.
7106 		For example, query parsing should be moved into the kernel.
7107 */
7108 static int
7109 query_open(dev_t device, const char* query, uint32 flags, port_id port,
7110 	int32 token, bool kernel)
7111 {
7112 	struct fs_mount* mount;
7113 	void* cookie;
7114 
7115 	FUNCTION(("query_open(device = %" B_PRId32 ", query = \"%s\", kernel = %d)\n",
7116 		device, query, kernel));
7117 
7118 	status_t status = get_mount(device, &mount);
7119 	if (status != B_OK)
7120 		return status;
7121 
7122 	if (!HAS_FS_MOUNT_CALL(mount, open_query)) {
7123 		status = B_UNSUPPORTED;
7124 		goto error;
7125 	}
7126 
7127 	status = FS_MOUNT_CALL(mount, open_query, query, flags, port, token,
7128 		&cookie);
7129 	if (status != B_OK)
7130 		goto error;
7131 
7132 	// get fd for the query
7133 	int fd;
7134 	fd = get_new_fd(FDTYPE_QUERY, mount, NULL, cookie, O_CLOEXEC, kernel);
7135 	if (fd >= 0)
7136 		return fd;
7137 
7138 	// something went wrong
7139 	FS_MOUNT_CALL(mount, close_query, cookie);
7140 	FS_MOUNT_CALL(mount, free_query_cookie, cookie);
7141 
7142 	status = fd;
7143 
7144 error:
7145 	put_mount(mount);
7146 	return status;
7147 }
7148 
7149 
7150 static status_t
7151 query_close(struct file_descriptor* descriptor)
7152 {
7153 	struct fs_mount* mount = descriptor->u.mount;
7154 
7155 	FUNCTION(("query_close(descriptor = %p)\n", descriptor));
7156 
7157 	if (HAS_FS_MOUNT_CALL(mount, close_query))
7158 		return FS_MOUNT_CALL(mount, close_query, descriptor->cookie);
7159 
7160 	return B_OK;
7161 }
7162 
7163 
7164 static void
7165 query_free_fd(struct file_descriptor* descriptor)
7166 {
7167 	struct fs_mount* mount = descriptor->u.mount;
7168 
7169 	if (mount != NULL) {
7170 		FS_MOUNT_CALL(mount, free_query_cookie, descriptor->cookie);
7171 		put_mount(mount);
7172 	}
7173 }
7174 
7175 
7176 static status_t
7177 query_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7178 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7179 {
7180 	struct fs_mount* mount = descriptor->u.mount;
7181 
7182 	if (HAS_FS_MOUNT_CALL(mount, read_query)) {
7183 		return FS_MOUNT_CALL(mount, read_query, descriptor->cookie, buffer,
7184 			bufferSize, _count);
7185 	}
7186 
7187 	return B_UNSUPPORTED;
7188 }
7189 
7190 
7191 static status_t
7192 query_rewind(struct file_descriptor* descriptor)
7193 {
7194 	struct fs_mount* mount = descriptor->u.mount;
7195 
7196 	if (HAS_FS_MOUNT_CALL(mount, rewind_query))
7197 		return FS_MOUNT_CALL(mount, rewind_query, descriptor->cookie);
7198 
7199 	return B_UNSUPPORTED;
7200 }
7201 
7202 
7203 //	#pragma mark - General File System functions
7204 
7205 
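/*!	Mounts the file system \a fsName - or, if NULL, the one the disk
	device manager detects on \a device - at \a path. All layers of the
	(possibly layered) file system get their own fs_volume, each layer's
	mount() hook is called, and finally the new root vnode is linked
	with the vnode it covers. Returns the dev_t of the new mount on
	success.
*/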
7206 static dev_t
7207 fs_mount(char* path, const char* device, const char* fsName, uint32 flags,
7208 	const char* args, bool kernel)
7209 {
7210 	struct ::fs_mount* mount;
7211 	status_t status = B_OK;
7212 	fs_volume* volume = NULL;
7213 	int32 layer = 0;
7214 	Vnode* coveredNode = NULL;
7215 
7216 	FUNCTION(("fs_mount: path = '%s', device = '%s', fs_name = '%s', flags = %#"
7217 		B_PRIx32 ", args = '%s'\n", path, device, fsName, flags, args));
7218 
7219 	// The path is always safe; we just have to make sure that fsName is
7220 	// at least valid - we can't make any assumptions about args, though.
7221 	// A NULL fsName is OK if a device was given and the FS is not virtual;
7222 	// we'll get it from the DDM later.
7223 	if (fsName == NULL) {
7224 		if (!device || flags & B_MOUNT_VIRTUAL_DEVICE)
7225 			return B_BAD_VALUE;
7226 	} else if (fsName[0] == '\0')
7227 		return B_BAD_VALUE;
7228 
7229 	RecursiveLocker mountOpLocker(sMountOpLock);
7230 
7231 	// Helper to delete a newly created file device on failure.
7232 	// Not exactly beautiful, but helps to keep the code below cleaner.
7233 	struct FileDeviceDeleter {
7234 		FileDeviceDeleter() : id(-1) {}
7235 		~FileDeviceDeleter()
7236 		{
7237 			KDiskDeviceManager::Default()->DeleteFileDevice(id);
7238 		}
7239 
7240 		partition_id id;
7241 	} fileDeviceDeleter;
7242 
7243 	// If the file system is not a "virtual" one, the device argument should
7244 	// point to a real file/device (if given at all).
7245 	// get the partition
7246 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7247 	KPartition* partition = NULL;
7248 	KPath normalizedDevice;
7249 	bool newlyCreatedFileDevice = false;
7250 
7251 	if (!(flags & B_MOUNT_VIRTUAL_DEVICE) && device != NULL) {
7252 		// normalize the device path
7253 		status = normalizedDevice.SetTo(device, true);
7254 		if (status != B_OK)
7255 			return status;
7256 
7257 		// get a corresponding partition from the DDM
7258 		partition = ddm->RegisterPartition(normalizedDevice.Path());
7259 		if (partition == NULL) {
7260 			// Partition not found: This either means that the user supplied
7261 			// an invalid path, or that the path refers to an image file. We
7262 			// try to let the DDM create a file device for the path.
7263 			partition_id deviceID = ddm->CreateFileDevice(
7264 				normalizedDevice.Path(), &newlyCreatedFileDevice);
7265 			if (deviceID >= 0) {
7266 				partition = ddm->RegisterPartition(deviceID);
7267 				if (newlyCreatedFileDevice)
7268 					fileDeviceDeleter.id = deviceID;
7269 			}
7270 		}
7271 
7272 		if (!partition) {
7273 			TRACE(("fs_mount(): Partition `%s' not found.\n",
7274 				normalizedDevice.Path()));
7275 			return B_ENTRY_NOT_FOUND;
7276 		}
7277 
7278 		device = normalizedDevice.Path();
7279 			// correct path to file device
7280 	}
7281 	PartitionRegistrar partitionRegistrar(partition, true);
7282 
7283 	// Write lock the partition's device. For the time being, we keep the lock
7284 	// until we're done mounting -- not nice, but it ensures that no one is
7285 	// interfering.
7286 	// TODO: Just mark the partition busy while mounting!
7287 	KDiskDevice* diskDevice = NULL;
7288 	if (partition) {
7289 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7290 		if (!diskDevice) {
7291 			TRACE(("fs_mount(): Failed to lock disk device!\n"));
7292 			return B_ERROR;
7293 		}
7294 	}
7295 
7296 	DeviceWriteLocker writeLocker(diskDevice, true);
7297 		// this takes over the write lock acquired before
7298 
7299 	if (partition != NULL) {
7300 		// make sure that the partition is not busy
7301 		if (partition->IsBusy()) {
7302 			TRACE(("fs_mount(): Partition is busy.\n"));
7303 			return B_BUSY;
7304 		}
7305 
7306 		// if no FS name had been supplied, we get it from the partition
7307 		if (fsName == NULL) {
7308 			KDiskSystem* diskSystem = partition->DiskSystem();
7309 			if (!diskSystem) {
7310 				TRACE(("fs_mount(): No FS name was given, and the DDM didn't "
7311 					"recognize it.\n"));
7312 				return B_BAD_VALUE;
7313 			}
7314 
7315 			if (!diskSystem->IsFileSystem()) {
7316 				TRACE(("fs_mount(): No FS name was given, and the DDM found a "
7317 					"partitioning system.\n"));
7318 				return B_BAD_VALUE;
7319 			}
7320 
7321 			// The disk system name will not change, and the KDiskSystem
7322 			// object will not go away while the disk device is locked (and
7323 			// the partition has a reference to it), so this is safe.
7324 			fsName = diskSystem->Name();
7325 		}
7326 	}
7327 
7328 	mount = new(std::nothrow) (struct ::fs_mount);
7329 	if (mount == NULL)
7330 		return B_NO_MEMORY;
7331 
7332 	mount->device_name = strdup(device);
7333 		// "device" can be NULL
7334 
7335 	status = mount->entry_cache.Init();
7336 	if (status != B_OK)
7337 		goto err1;
7338 
7339 	// initialize structure
7340 	mount->id = sNextMountID++;
7341 	mount->partition = NULL;
7342 	mount->root_vnode = NULL;
7343 	mount->covers_vnode = NULL;
7344 	mount->unmounting = false;
7345 	mount->owns_file_device = false;
7346 	mount->volume = NULL;
7347 
7348 	// build up the volume(s)
7349 	while (true) {
7350 		char* layerFSName = get_file_system_name_for_layer(fsName, layer);
7351 		if (layerFSName == NULL) {
7352 			if (layer == 0) {
7353 				status = B_NO_MEMORY;
7354 				goto err1;
7355 			}
7356 
7357 			break;
7358 		}
7359 		MemoryDeleter layerFSNameDeleter(layerFSName);
7360 
7361 		volume = (fs_volume*)malloc(sizeof(fs_volume));
7362 		if (volume == NULL) {
7363 			status = B_NO_MEMORY;
7364 			goto err1;
7365 		}
7366 
7367 		volume->id = mount->id;
7368 		volume->partition = partition != NULL ? partition->ID() : -1;
7369 		volume->layer = layer++;
7370 		volume->private_volume = NULL;
7371 		volume->ops = NULL;
7372 		volume->sub_volume = NULL;
7373 		volume->super_volume = NULL;
7374 		volume->file_system = NULL;
7375 		volume->file_system_name = NULL;
7376 
7377 		volume->file_system_name = get_file_system_name(layerFSName);
7378 		if (volume->file_system_name == NULL) {
7379 			status = B_NO_MEMORY;
7380 			free(volume);
7381 			goto err1;
7382 		}
7383 
7384 		volume->file_system = get_file_system(layerFSName);
7385 		if (volume->file_system == NULL) {
7386 			status = B_DEVICE_NOT_FOUND;
7387 			free(volume->file_system_name);
7388 			free(volume);
7389 			goto err1;
7390 		}
7391 
7392 		if (mount->volume == NULL)
7393 			mount->volume = volume;
7394 		else {
7395 			volume->super_volume = mount->volume;
7396 			mount->volume->sub_volume = volume;
7397 			mount->volume = volume;
7398 		}
7399 	}
7400 
7401 	// insert mount struct into list before we call FS's mount() function
7402 	// so that vnodes can be created for this mount
7403 	mutex_lock(&sMountMutex);
7404 	sMountsTable->Insert(mount);
7405 	mutex_unlock(&sMountMutex);
7406 
7407 	ino_t rootID;
7408 
7409 	if (!sRoot) {
7410 		// we haven't mounted anything yet
7411 		if (strcmp(path, "/") != 0) {
7412 			status = B_ERROR;
7413 			goto err2;
7414 		}
7415 
7416 		status = mount->volume->file_system->mount(mount->volume, device, flags,
7417 			args, &rootID);
7418 		if (status != 0)
7419 			goto err2;
7420 	} else {
7421 		status = path_to_vnode(path, true, &coveredNode, NULL, kernel);
7422 		if (status != B_OK)
7423 			goto err2;
7424 
7425 		mount->covers_vnode = coveredNode;
7426 
7427 		// make sure coveredNode is a directory
7428 		if (!S_ISDIR(coveredNode->Type())) {
7429 			status = B_NOT_A_DIRECTORY;
7430 			goto err3;
7431 		}
7432 
7433 		if (coveredNode->IsCovered()) {
7434 			// this is already a covered vnode
7435 			status = B_BUSY;
7436 			goto err3;
7437 		}
7438 
7439 		// mount it/them
7440 		fs_volume* volume = mount->volume;
7441 		while (volume) {
7442 			status = volume->file_system->mount(volume, device, flags, args,
7443 				&rootID);
7444 			if (status != B_OK) {
7445 				if (volume->sub_volume)
7446 					goto err4;
7447 				goto err3;
7448 			}
7449 
7450 			volume = volume->super_volume;
7451 		}
7452 
7453 		volume = mount->volume;
7454 		while (volume) {
7455 			if (volume->ops->all_layers_mounted != NULL)
7456 				volume->ops->all_layers_mounted(volume);
7457 			volume = volume->super_volume;
7458 		}
7459 	}
7460 
7461 	// the root node is supposed to be owned by the file system - it must
7462 	// exist at this point
7463 	mount->root_vnode = lookup_vnode(mount->id, rootID);
7464 	if (mount->root_vnode == NULL || mount->root_vnode->ref_count != 1) {
7465 		panic("fs_mount: file system does not own its root node!\n");
7466 		status = B_ERROR;
7467 		goto err4;
7468 	}
7469 
7470 	// set up the links between the root vnode and the vnode it covers
7471 	rw_lock_write_lock(&sVnodeLock);
7472 	if (coveredNode != NULL) {
7473 		if (coveredNode->IsCovered()) {
7474 			// the vnode is covered now
7475 			status = B_BUSY;
7476 			rw_lock_write_unlock(&sVnodeLock);
7477 			goto err4;
7478 		}
7479 
7480 		mount->root_vnode->covers = coveredNode;
7481 		mount->root_vnode->SetCovering(true);
7482 
7483 		coveredNode->covered_by = mount->root_vnode;
7484 		coveredNode->SetCovered(true);
7485 	}
7486 	rw_lock_write_unlock(&sVnodeLock);
7487 
7488 	if (!sRoot) {
7489 		sRoot = mount->root_vnode;
7490 		mutex_lock(&sIOContextRootLock);
7491 		get_current_io_context(true)->root = sRoot;
7492 		mutex_unlock(&sIOContextRootLock);
7493 		inc_vnode_ref_count(sRoot);
7494 	}
7495 
7496 	// supply the partition (if any) with the mount cookie and mark it mounted
7497 	if (partition) {
7498 		partition->SetMountCookie(mount->volume->private_volume);
7499 		partition->SetVolumeID(mount->id);
7500 
7501 		// keep a partition reference as long as the partition is mounted
7502 		partitionRegistrar.Detach();
7503 		mount->partition = partition;
7504 		mount->owns_file_device = newlyCreatedFileDevice;
7505 		fileDeviceDeleter.id = -1;
7506 	}
7507 
7508 	notify_mount(mount->id,
7509 		coveredNode != NULL ? coveredNode->device : -1,
7510 		coveredNode ? coveredNode->id : -1);
7511 
7512 	return mount->id;
7513 
7514 err4:
7515 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7516 err3:
7517 	if (coveredNode != NULL)
7518 		put_vnode(coveredNode);
7519 err2:
7520 	mutex_lock(&sMountMutex);
7521 	sMountsTable->Remove(mount);
7522 	mutex_unlock(&sMountMutex);
7523 err1:
7524 	delete mount;
7525 
7526 	return status;
7527 }
7528 
7529 
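/*!	Unmounts the volume given by \a path (which must name the mount's
	root) or by \a mountID. As long as vnodes of the mount are still in
	use the function fails with B_BUSY, unless B_FORCE_UNMOUNT is given,
	in which case all open file descriptors on the mount are forcibly
	disconnected first.
*/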
7530 static status_t
7531 fs_unmount(char* path, dev_t mountID, uint32 flags, bool kernel)
7532 {
7533 	struct fs_mount* mount;
7534 	status_t err;
7535 
7536 	FUNCTION(("fs_unmount(path '%s', dev %" B_PRId32 ", kernel %d\n", path,
7537 		mountID, kernel));
7538 
7539 	struct vnode* pathVnode = NULL;
7540 	if (path != NULL) {
7541 		err = path_to_vnode(path, true, &pathVnode, NULL, kernel);
7542 		if (err != B_OK)
7543 			return B_ENTRY_NOT_FOUND;
7544 	}
7545 
7546 	RecursiveLocker mountOpLocker(sMountOpLock);
7547 
7548 	// this lock is not strictly necessary, but is taken in the KDEBUG
7549 	// case to keep the ASSERT in find_mount() working.
7550 	KDEBUG_ONLY(mutex_lock(&sMountMutex));
7551 	mount = find_mount(path != NULL ? pathVnode->device : mountID);
7552 	KDEBUG_ONLY(mutex_unlock(&sMountMutex));
7553 	if (mount == NULL) {
7554 		panic("fs_unmount: find_mount() failed on root vnode @%p of mount\n",
7555 			pathVnode);
7556 	}
7557 
7558 	if (path != NULL) {
7559 		put_vnode(pathVnode);
7560 
7561 		if (mount->root_vnode != pathVnode) {
7562 			// not a mount point
7563 			return B_BAD_VALUE;
7564 		}
7565 	}
7566 
7567 	// if the volume is associated with a partition, lock the device of the
7568 	// partition as long as we are unmounting
7569 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7570 	KPartition* partition = mount->partition;
7571 	KDiskDevice* diskDevice = NULL;
7572 	if (partition != NULL) {
7573 		if (partition->Device() == NULL) {
7574 			dprintf("fs_unmount(): There is no device!\n");
7575 			return B_ERROR;
7576 		}
7577 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7578 		if (!diskDevice) {
7579 			TRACE(("fs_unmount(): Failed to lock disk device!\n"));
7580 			return B_ERROR;
7581 		}
7582 	}
7583 	DeviceWriteLocker writeLocker(diskDevice, true);
7584 
7585 	// make sure that the partition is not busy
7586 	if (partition != NULL) {
7587 		if ((flags & B_UNMOUNT_BUSY_PARTITION) == 0 && partition->IsBusy()) {
7588 			TRACE(("fs_unmount(): Partition is busy.\n"));
7589 			return B_BUSY;
7590 		}
7591 	}
7592 
7593 	// grab the vnode lock for writing to keep anyone from creating
7594 	// a vnode while we're figuring out if we can continue
7595 	WriteLocker vnodesWriteLocker(&sVnodeLock);
7596 
7597 	bool disconnectedDescriptors = false;
7598 
7599 	while (true) {
7600 		bool busy = false;
7601 
7602 		// cycle through the list of vnodes associated with this mount and
7603 		// make sure none of them is busy or still referenced
7604 		VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7605 		while (struct vnode* vnode = iterator.Next()) {
7606 			if (vnode->IsBusy()) {
7607 				busy = true;
7608 				break;
7609 			}
7610 
7611 			// check the vnode's ref count -- subtract additional references for
7612 			// covering
7613 			int32 refCount = vnode->ref_count;
7614 			if (vnode->covers != NULL)
7615 				refCount--;
7616 			if (vnode->covered_by != NULL)
7617 				refCount--;
7618 
7619 			if (refCount != 0) {
7620 				// there are still vnodes in use on this mount, so we cannot
7621 				// unmount yet
7622 				busy = true;
7623 				break;
7624 			}
7625 		}
7626 
7627 		if (!busy)
7628 			break;
7629 
7630 		if ((flags & B_FORCE_UNMOUNT) == 0)
7631 			return B_BUSY;
7632 
7633 		if (disconnectedDescriptors) {
7634 			// wait a bit until the last access is finished, and then try again
7635 			vnodesWriteLocker.Unlock();
7636 			snooze(100000);
7637 			// TODO: if there is some kind of bug that prevents the ref counts
7638 			// from getting back to zero, this will fall into an endless loop...
7639 			vnodesWriteLocker.Lock();
7640 			continue;
7641 		}
7642 
7643 		// the file system is still busy - but we're forced to unmount it,
7644 		// so let's disconnect all open file descriptors
7645 
7646 		mount->unmounting = true;
7647 			// prevent new vnodes from being created
7648 
7649 		vnodesWriteLocker.Unlock();
7650 
7651 		disconnect_mount_or_vnode_fds(mount, NULL);
7652 		disconnectedDescriptors = true;
7653 
7654 		vnodesWriteLocker.Lock();
7655 	}
7656 
7657 	// We can safely continue. Mark all of the vnodes busy and put this
7658 	// mount structure into unmounting state. Also undo the vnode
7659 	// covers/covered_by links.
7660 	mount->unmounting = true;
7661 
7662 	VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7663 	while (struct vnode* vnode = iterator.Next()) {
7664 		// Remove all covers/covered_by links from other mounts' nodes to this
7665 		// vnode and adjust the node ref count accordingly. We will release the
7666 		// references to the external vnodes below.
7667 		if (Vnode* coveredNode = vnode->covers) {
7668 			if (Vnode* coveringNode = vnode->covered_by) {
7669 				// We have both covered and covering vnodes, so just remove us
7670 				// from the chain.
7671 				coveredNode->covered_by = coveringNode;
7672 				coveringNode->covers = coveredNode;
7673 				vnode->ref_count -= 2;
7674 
7675 				vnode->covered_by = NULL;
7676 				vnode->covers = NULL;
7677 				vnode->SetCovering(false);
7678 				vnode->SetCovered(false);
7679 			} else {
7680 				// We only have a covered vnode. Remove its link to us.
7681 				coveredNode->covered_by = NULL;
7682 				coveredNode->SetCovered(false);
7683 				vnode->ref_count--;
7684 
7685 				// If the other node is an external vnode, we keep its link
7686 				// around so we can put the reference later on. Otherwise
7687 				// we get rid of it right now.
7688 				if (coveredNode->mount == mount) {
7689 					vnode->covers = NULL;
7690 					coveredNode->ref_count--;
7691 				}
7692 			}
7693 		} else if (Vnode* coveringNode = vnode->covered_by) {
7694 			// We only have a covering vnode. Remove its link to us.
7695 			coveringNode->covers = NULL;
7696 			coveringNode->SetCovering(false);
7697 			vnode->ref_count--;
7698 
7699 			// If the other node is an external vnode, we keep its link
7700 			// around so we can put the reference later on. Otherwise
7701 			// we get rid of it right now.
7702 			if (coveringNode->mount == mount) {
7703 				vnode->covered_by = NULL;
7704 				coveringNode->ref_count--;
7705 			}
7706 		}
7707 
7708 		vnode->SetBusy(true);
7709 		vnode_to_be_freed(vnode);
7710 	}
7711 
7712 	vnodesWriteLocker.Unlock();
7713 
7714 	// Free all vnodes associated with this mount.
7715 	// They will be removed from the mount list by free_vnode(), so
7716 	// we don't have to do this.
7717 	while (struct vnode* vnode = mount->vnodes.Head()) {
7718 		// Put the references to external covered/covering vnodes we kept above.
7719 		if (Vnode* coveredNode = vnode->covers)
7720 			put_vnode(coveredNode);
7721 		if (Vnode* coveringNode = vnode->covered_by)
7722 			put_vnode(coveringNode);
7723 
7724 		free_vnode(vnode, false);
7725 	}
7726 
7727 	// remove the mount structure from the hash table
7728 	mutex_lock(&sMountMutex);
7729 	sMountsTable->Remove(mount);
7730 	mutex_unlock(&sMountMutex);
7731 
7732 	mountOpLocker.Unlock();
7733 
7734 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7735 	notify_unmount(mount->id);
7736 
7737 	// dereference the partition and mark it unmounted
7738 	if (partition) {
7739 		partition->SetVolumeID(-1);
7740 		partition->SetMountCookie(NULL);
7741 
7742 		if (mount->owns_file_device)
7743 			KDiskDeviceManager::Default()->DeleteFileDevice(partition->ID());
7744 		partition->Unregister();
7745 	}
7746 
7747 	delete mount;
7748 	return B_OK;
7749 }
7750 
7751 
7752 static status_t
7753 fs_sync(dev_t device)
7754 {
7755 	struct fs_mount* mount;
7756 	status_t status = get_mount(device, &mount);
7757 	if (status != B_OK)
7758 		return status;
7759 
7760 	struct vnode marker;
7761 	memset(&marker, 0, sizeof(marker));
7762 	marker.SetBusy(true);
7763 	marker.SetRemoved(true);
7764 
7765 	// First, synchronize all file caches
7766 
7767 	while (true) {
7768 		WriteLocker locker(sVnodeLock);
7769 			// Note: That's the easy way, which is probably OK for sync(),
7770 			// since it's a relatively rare call and doesn't need to allow for
7771 			// a lot of concurrency. Using a read lock would be possible, but
7772 			// also more involved, since we would have to lock the individual
7773 			// nodes and take care of the locking order, which we might not
7774 			// want to do while holding fs_mount::rlock.
7775 
7776 		// synchronize access to vnode list
7777 		recursive_lock_lock(&mount->rlock);
7778 
7779 		struct vnode* vnode;
7780 		if (!marker.IsRemoved()) {
7781 			vnode = mount->vnodes.GetNext(&marker);
7782 			mount->vnodes.Remove(&marker);
7783 			marker.SetRemoved(true);
7784 		} else
7785 			vnode = mount->vnodes.First();
7786 
7787 		while (vnode != NULL && (vnode->cache == NULL
7788 			|| vnode->IsRemoved() || vnode->IsBusy())) {
7789 			// TODO: we could track writes (and writable mapped vnodes)
7790 			//	and have a simple flag that we could test for here
7791 			vnode = mount->vnodes.GetNext(vnode);
7792 		}
7793 
7794 		if (vnode != NULL) {
7795 			// insert marker vnode again
7796 			mount->vnodes.Insert(mount->vnodes.GetNext(vnode), &marker);
7797 			marker.SetRemoved(false);
7798 		}
7799 
7800 		recursive_lock_unlock(&mount->rlock);
7801 
7802 		if (vnode == NULL)
7803 			break;
7804 
7805 		vnode = lookup_vnode(mount->id, vnode->id);
7806 		if (vnode == NULL || vnode->IsBusy())
7807 			continue;
7808 
7809 		if (vnode->ref_count == 0) {
7810 			// this vnode has been unused before
7811 			vnode_used(vnode);
7812 		}
7813 		inc_vnode_ref_count(vnode);
7814 
7815 		locker.Unlock();
7816 
7817 		if (vnode->cache != NULL && !vnode->IsRemoved())
7818 			vnode->cache->WriteModified();
7819 
7820 		put_vnode(vnode);
7821 	}
7822 
7823 	// And then, let the file systems do their synchronizing work
7824 
7825 	if (HAS_FS_MOUNT_CALL(mount, sync))
7826 		status = FS_MOUNT_CALL_NO_PARAMS(mount, sync);
7827 
7828 	put_mount(mount);
7829 	return status;
7830 }
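

/*!	Illustrative sketch (not part of this file, with \c mount as in fs_sync()
	above): the marker technique used for iterating the vnode list. A dummy
	node remembers the iteration position, so that the list lock can be
	dropped while a vnode is being processed and iteration can still resume
	safely afterwards.

	\code
	struct vnode marker;
	memset(&marker, 0, sizeof(marker));
	marker.SetBusy(true);		// so no one else will touch it
	marker.SetRemoved(true);	// used to mean: not currently in the list

	while (true) {
		recursive_lock_lock(&mount->rlock);

		struct vnode* vnode;
		if (!marker.IsRemoved()) {
			// resume behind the marker and take it out of the list
			vnode = mount->vnodes.GetNext(&marker);
			mount->vnodes.Remove(&marker);
			marker.SetRemoved(true);
		} else
			vnode = mount->vnodes.First();

		if (vnode != NULL) {
			// remember the position by re-inserting the marker after it
			mount->vnodes.Insert(mount->vnodes.GetNext(vnode), &marker);
			marker.SetRemoved(false);
		}

		recursive_lock_unlock(&mount->rlock);

		if (vnode == NULL)
			break;

		// ... process vnode without holding the list lock ...
	}
	\endcode
*/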
7831 
7832 
7833 static status_t
7834 fs_read_info(dev_t device, struct fs_info* info)
7835 {
7836 	struct fs_mount* mount;
7837 	status_t status = get_mount(device, &mount);
7838 	if (status != B_OK)
7839 		return status;
7840 
7841 	memset(info, 0, sizeof(struct fs_info));
7842 
7843 	if (HAS_FS_MOUNT_CALL(mount, read_fs_info))
7844 		status = FS_MOUNT_CALL(mount, read_fs_info, info);
7845 
7846 	// fill in info the file system doesn't (have to) know about
7847 	if (status == B_OK) {
7848 		info->dev = mount->id;
7849 		info->root = mount->root_vnode->id;
7850 
7851 		fs_volume* volume = mount->volume;
7852 		while (volume->super_volume != NULL)
7853 			volume = volume->super_volume;
7854 
7855 		strlcpy(info->fsh_name, volume->file_system_name,
7856 			sizeof(info->fsh_name));
7857 		if (mount->device_name != NULL) {
7858 			strlcpy(info->device_name, mount->device_name,
7859 				sizeof(info->device_name));
7860 		}
7861 	}
7862 
7863 	// if the call is not supported by the file system, there are still
7864 	// the parts that we filled out ourselves
7865 
7866 	put_mount(mount);
7867 	return status;
7868 }
7869 
7870 
7871 static status_t
7872 fs_write_info(dev_t device, const struct fs_info* info, int mask)
7873 {
7874 	struct fs_mount* mount;
7875 	status_t status = get_mount(device, &mount);
7876 	if (status != B_OK)
7877 		return status;
7878 
7879 	if (HAS_FS_MOUNT_CALL(mount, write_fs_info))
7880 		status = FS_MOUNT_CALL(mount, write_fs_info, info, mask);
7881 	else
7882 		status = B_READ_ONLY_DEVICE;
7883 
7884 	put_mount(mount);
7885 	return status;
7886 }
7887 
7888 
7889 static dev_t
7890 fs_next_device(int32* _cookie)
7891 {
7892 	struct fs_mount* mount = NULL;
7893 	dev_t device = *_cookie;
7894 
7895 	mutex_lock(&sMountMutex);
7896 
7897 	// Since device IDs are assigned sequentially, this algorithm
7898 	// works well enough. It makes sure that the device list
7899 	// returned is sorted, and that no device is skipped when an
7900 	// already visited device is unmounted.
7901 
7902 	while (device < sNextMountID) {
7903 		mount = find_mount(device++);
7904 		if (mount != NULL && mount->volume->private_volume != NULL)
7905 			break;
7906 	}
7907 
7908 	*_cookie = device;
7909 
7910 	if (mount != NULL)
7911 		device = mount->id;
7912 	else
7913 		device = B_BAD_VALUE;
7914 
7915 	mutex_unlock(&sMountMutex);
7916 
7917 	return device;
7918 }
7919 
7920 
7921 ssize_t
7922 fs_read_attr(int fd, const char *attribute, uint32 type, off_t pos,
7923 	void *buffer, size_t readBytes)
7924 {
7925 	int attrFD = attr_open(fd, NULL, attribute, O_RDONLY, true);
7926 	if (attrFD < 0)
7927 		return attrFD;
7928 
7929 	ssize_t bytesRead = _kern_read(attrFD, pos, buffer, readBytes);
7930 
7931 	_kern_close(attrFD);
7932 
7933 	return bytesRead;
7934 }
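

/*!	Illustrative use of fs_read_attr() (hypothetical in-kernel caller with an
	open \c fd; not part of this file): reading a file's MIME type, which is
	conventionally stored in the "BEOS:TYPE" attribute.

	\code
	char mimeType[64];
	ssize_t bytesRead = fs_read_attr(fd, "BEOS:TYPE", B_MIME_STRING_TYPE, 0,
		mimeType, sizeof(mimeType) - 1);
	if (bytesRead >= 0)
		mimeType[bytesRead] = '\0';
			// the attribute contents are not necessarily NUL-terminated
	\endcode
*/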
7935 
7936 
7937 static status_t
7938 get_cwd(char* buffer, size_t size, bool kernel)
7939 {
7940 	// Get current working directory from io context
7941 	struct io_context* context = get_current_io_context(kernel);
7942 	status_t status;
7943 
7944 	FUNCTION(("vfs_get_cwd: buf %p, size %ld\n", buffer, size));
7945 
7946 	mutex_lock(&context->io_mutex);
7947 
7948 	struct vnode* vnode = context->cwd;
7949 	if (vnode)
7950 		inc_vnode_ref_count(vnode);
7951 
7952 	mutex_unlock(&context->io_mutex);
7953 
7954 	if (vnode) {
7955 		status = dir_vnode_to_path(vnode, buffer, size, kernel);
7956 		put_vnode(vnode);
7957 	} else
7958 		status = B_ERROR;
7959 
7960 	return status;
7961 }
7962 
7963 
7964 static status_t
7965 set_cwd(int fd, char* path, bool kernel)
7966 {
7967 	struct io_context* context;
7968 	struct vnode* vnode = NULL;
7969 	struct vnode* oldDirectory;
7970 	status_t status;
7971 
7972 	FUNCTION(("set_cwd: path = \'%s\'\n", path));
7973 
7974 	// Get vnode for passed path, and bail if it failed
7975 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
7976 	if (status < 0)
7977 		return status;
7978 
7979 	if (!S_ISDIR(vnode->Type())) {
7980 		// nope, can't cwd to here
7981 		status = B_NOT_A_DIRECTORY;
7982 		goto err;
7983 	}
7984 
7985 	// We need to have the permission to enter the directory, too
7986 	if (HAS_FS_CALL(vnode, access)) {
7987 		status = FS_CALL(vnode, access, X_OK);
7988 		if (status != B_OK)
7989 			goto err;
7990 	}
7991 
7992 	// Get current io context and lock
7993 	context = get_current_io_context(kernel);
7994 	mutex_lock(&context->io_mutex);
7995 
7996 	// save the old current working directory first
7997 	oldDirectory = context->cwd;
7998 	context->cwd = vnode;
7999 
8000 	mutex_unlock(&context->io_mutex);
8001 
8002 	if (oldDirectory)
8003 		put_vnode(oldDirectory);
8004 
8005 	return B_NO_ERROR;
8006 
8007 err:
8008 	put_vnode(vnode);
8009 	return status;
8010 }
8011 
8012 
8013 //	#pragma mark - kernel mirrored syscalls
8014 
8015 
8016 dev_t
8017 _kern_mount(const char* path, const char* device, const char* fsName,
8018 	uint32 flags, const char* args, size_t argsLength)
8019 {
8020 	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
8021 	if (pathBuffer.InitCheck() != B_OK)
8022 		return B_NO_MEMORY;
8023 
8024 	return fs_mount(pathBuffer.LockBuffer(), device, fsName, flags, args, true);
8025 }
8026 
8027 
8028 status_t
8029 _kern_unmount(const char* path, uint32 flags)
8030 {
8031 	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
8032 	if (pathBuffer.InitCheck() != B_OK)
8033 		return B_NO_MEMORY;
8034 
8035 	return fs_unmount(pathBuffer.LockBuffer(), -1, flags, true);
8036 }
8037 
8038 
8039 status_t
8040 _kern_read_fs_info(dev_t device, struct fs_info* info)
8041 {
8042 	if (info == NULL)
8043 		return B_BAD_VALUE;
8044 
8045 	return fs_read_info(device, info);
8046 }
8047 
8048 
8049 status_t
8050 _kern_write_fs_info(dev_t device, const struct fs_info* info, int mask)
8051 {
8052 	if (info == NULL)
8053 		return B_BAD_VALUE;
8054 
8055 	return fs_write_info(device, info, mask);
8056 }
8057 
8058 
8059 status_t
8060 _kern_sync(void)
8061 {
8062 	// Note: _kern_sync() is also called from _user_sync()
8063 	int32 cookie = 0;
8064 	dev_t device;
8065 	while ((device = next_dev(&cookie)) >= 0) {
8066 		status_t status = fs_sync(device);
8067 		if (status != B_OK && status != B_BAD_VALUE) {
8068 			dprintf("sync: device %" B_PRIdDEV " couldn't sync: %s\n", device,
8069 				strerror(status));
8070 		}
8071 	}
8072 
8073 	return B_OK;
8074 }
8075 
8076 
8077 dev_t
8078 _kern_next_device(int32* _cookie)
8079 {
8080 	return fs_next_device(_cookie);
8081 }
8082 
8083 
8084 status_t
8085 _kern_get_next_fd_info(team_id teamID, uint32* _cookie, fd_info* info,
8086 	size_t infoSize)
8087 {
8088 	if (infoSize != sizeof(fd_info))
8089 		return B_BAD_VALUE;
8090 
8091 	// get the team
8092 	Team* team = Team::Get(teamID);
8093 	if (team == NULL)
8094 		return B_BAD_TEAM_ID;
8095 	BReference<Team> teamReference(team, true);
8096 
8097 	// now that we have a team reference, its I/O context won't go away
8098 	io_context* context = team->io_context;
8099 	MutexLocker contextLocker(context->io_mutex);
8100 
8101 	uint32 slot = *_cookie;
8102 
8103 	struct file_descriptor* descriptor;
8104 	while (slot < context->table_size
8105 		&& (descriptor = context->fds[slot]) == NULL) {
8106 		slot++;
8107 	}
8108 
8109 	if (slot >= context->table_size)
8110 		return B_ENTRY_NOT_FOUND;
8111 
8112 	info->number = slot;
8113 	info->open_mode = descriptor->open_mode;
8114 
8115 	struct vnode* vnode = fd_vnode(descriptor);
8116 	if (vnode != NULL) {
8117 		info->device = vnode->device;
8118 		info->node = vnode->id;
8119 	} else if (descriptor->u.mount != NULL) {
8120 		info->device = descriptor->u.mount->id;
8121 		info->node = -1;
8122 	}
8123 
8124 	*_cookie = slot + 1;
8125 	return B_OK;
8126 }
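

/*!	Illustrative sketch (assuming a valid \c teamID; not part of this file):
	enumerating all open FDs of a team with the cookie based interface above.

	\code
	uint32 cookie = 0;
	fd_info info;
	while (_kern_get_next_fd_info(teamID, &cookie, &info, sizeof(info))
			== B_OK) {
		dprintf("fd %" B_PRId32 ": device %" B_PRIdDEV ", node %" B_PRIdINO
			"\n", info.number, info.device, info.node);
	}
	\endcode
*/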
8127 
8128 
8129 int
8130 _kern_open_entry_ref(dev_t device, ino_t inode, const char* name, int openMode,
8131 	int perms)
8132 {
8133 	if ((openMode & O_CREAT) != 0) {
8134 		return file_create_entry_ref(device, inode, name, openMode, perms,
8135 			true);
8136 	}
8137 
8138 	return file_open_entry_ref(device, inode, name, openMode, true);
8139 }
8140 
8141 
8142 /*!	\brief Opens a node specified by a FD + path pair.
8143 
8144 	At least one of \a fd and \a path must be specified.
8145 	If only \a fd is given, the function opens the node identified by this
8146 	FD. If only a path is given, this path is opened. If both are given and
8147 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8148 	of the directory (!) identified by \a fd.
8149 
8150 	\param fd The FD. May be < 0.
8151 	\param path The absolute or relative path. May be \c NULL.
8152 	\param openMode The open mode.
8153 	\return A FD referring to the newly opened node, or an error code,
8154 			if an error occurs.
8155 */
8156 int
8157 _kern_open(int fd, const char* path, int openMode, int perms)
8158 {
8159 	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
8160 	if (pathBuffer.InitCheck() != B_OK)
8161 		return B_NO_MEMORY;
8162 
8163 	if (openMode & O_CREAT)
8164 		return file_create(fd, pathBuffer.LockBuffer(), openMode, perms, true);
8165 
8166 	return file_open(fd, pathBuffer.LockBuffer(), openMode, true);
8167 }
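

/*!	The FD + path rules described above, spelled out (illustrative sketch
	with a directory FD \c dirFD; not part of this file):

	\code
	// absolute path: dirFD is ignored
	int fd1 = _kern_open(dirFD, "/boot/home/file", O_RDONLY, 0);

	// relative path: resolved relative to the directory behind dirFD
	int fd2 = _kern_open(dirFD, "file", O_RDONLY, 0);

	// no path: opens the node identified by dirFD itself
	int fd3 = _kern_open(dirFD, NULL, O_RDONLY, 0);
	\endcode
*/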
8168 
8169 
8170 /*!	\brief Opens a directory specified by entry_ref or node_ref.
8171 
8172 	The supplied name may be \c NULL, in which case the directory identified
8173 	by \a device and \a inode will be opened. Otherwise \a device and
8174 	\a inode identify the parent directory of the directory to be opened
8175 	and \a name its entry name.
8176 
8177 	\param device If \a name is specified, the ID of the device the parent
8178 		   directory of the directory to be opened resides on, otherwise
8179 		   the device of the directory itself.
8180 	\param inode If \a name is specified, the node ID of the parent
8181 		   directory of the directory to be opened, otherwise the node ID of
8182 		   the directory itself.
8183 	\param name The entry name of the directory to be opened. If \c NULL,
8184 		   the \a device + \a inode pair identify the node to be opened.
8185 	\return The FD of the newly opened directory or an error code, if
8186 			something went wrong.
8187 */
8188 int
8189 _kern_open_dir_entry_ref(dev_t device, ino_t inode, const char* name)
8190 {
8191 	return dir_open_entry_ref(device, inode, name, true);
8192 }
8193 
8194 
8195 /*!	\brief Opens a directory specified by a FD + path pair.
8196 
8197 	At least one of \a fd and \a path must be specified.
8198 	If only \a fd is given, the function opens the directory identified by this
8199 	FD. If only a path is given, this path is opened. If both are given and
8200 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8201 	of the directory (!) identified by \a fd.
8202 
8203 	\param fd The FD. May be < 0.
8204 	\param path The absolute or relative path. May be \c NULL.
8205 	\return A FD referring to the newly opened directory, or an error code,
8206 			if an error occurs.
8207 */
8208 int
8209 _kern_open_dir(int fd, const char* path)
8210 {
8211 	if (path == NULL)
8212 		return dir_open(fd, NULL, true);
8213 
8214 	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
8215 	if (pathBuffer.InitCheck() != B_OK)
8216 		return B_NO_MEMORY;
8217 
8218 	return dir_open(fd, pathBuffer.LockBuffer(), true);
8219 }
8220 
8221 
8222 status_t
8223 _kern_fcntl(int fd, int op, size_t argument)
8224 {
8225 	return common_fcntl(fd, op, argument, true);
8226 }
8227 
8228 
8229 status_t
8230 _kern_fsync(int fd)
8231 {
8232 	return common_sync(fd, true);
8233 }
8234 
8235 
8236 status_t
8237 _kern_lock_node(int fd)
8238 {
8239 	return common_lock_node(fd, true);
8240 }
8241 
8242 
8243 status_t
8244 _kern_unlock_node(int fd)
8245 {
8246 	return common_unlock_node(fd, true);
8247 }
8248 
8249 
8250 status_t
8251 _kern_create_dir_entry_ref(dev_t device, ino_t inode, const char* name,
8252 	int perms)
8253 {
8254 	return dir_create_entry_ref(device, inode, name, perms, true);
8255 }
8256 
8257 
8258 /*!	\brief Creates a directory specified by a FD + path pair.
8259 
8260 	\a path must always be specified (it contains the name of the new directory
8261 	at least). If only a path is given, this path identifies the location at
8262 	which the directory shall be created. If both \a fd and \a path are given
8263 	and the path is absolute, \a fd is ignored; a relative path is reckoned off
8264 	of the directory (!) identified by \a fd.
8265 
8266 	\param fd The FD. May be < 0.
8267 	\param path The absolute or relative path. Must not be \c NULL.
8268 	\param perms The access permissions the new directory shall have.
8269 	\return \c B_OK, if the directory has been created successfully, another
8270 			error code otherwise.
8271 */
8272 status_t
8273 _kern_create_dir(int fd, const char* path, int perms)
8274 {
8275 	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
8276 	if (pathBuffer.InitCheck() != B_OK)
8277 		return B_NO_MEMORY;
8278 
8279 	return dir_create(fd, pathBuffer.LockBuffer(), perms, true);
8280 }
8281 
8282 
8283 status_t
8284 _kern_remove_dir(int fd, const char* path)
8285 {
8286 	if (path) {
8287 		KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
8288 		if (pathBuffer.InitCheck() != B_OK)
8289 			return B_NO_MEMORY;
8290 
8291 		return dir_remove(fd, pathBuffer.LockBuffer(), true);
8292 	}
8293 
8294 	return dir_remove(fd, NULL, true);
8295 }
8296 
8297 
8298 /*!	\brief Reads the contents of a symlink referred to by a FD + path pair.
8299 
8300 	At least one of \a fd and \a path must be specified.
8301 	If only \a fd is given, the symlink to be read is the node
8302 	identified by this FD. If only a path is given, this path identifies the
8303 	symlink to be read. If both are given and the path is absolute, \a fd is
8304 	ignored; a relative path is reckoned off of the directory (!) identified
8305 	by \a fd.
8306 	If this function fails with B_BUFFER_OVERFLOW, the \a _bufferSize pointer
8307 	will still be updated to reflect the required buffer size.
8308 
8309 	\param fd The FD. May be < 0.
8310 	\param path The absolute or relative path. May be \c NULL.
8311 	\param buffer The buffer into which the contents of the symlink shall be
8312 		   written.
8313 	\param _bufferSize A pointer to the size of the supplied buffer.
8314 	\return \c B_OK on success or an appropriate error code.
8315 */
8316 status_t
8317 _kern_read_link(int fd, const char* path, char* buffer, size_t* _bufferSize)
8318 {
8319 	if (path) {
8320 		KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
8321 		if (pathBuffer.InitCheck() != B_OK)
8322 			return B_NO_MEMORY;
8323 
8324 		return common_read_link(fd, pathBuffer.LockBuffer(),
8325 			buffer, _bufferSize, true);
8326 	}
8327 
8328 	return common_read_link(fd, NULL, buffer, _bufferSize, true);
8329 }
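

/*!	Since \c _bufferSize is updated even on failure, a caller can size its
	buffer in two passes (illustrative sketch with \c fd and \c path as
	above; not part of this file):

	\code
	char small[64];
	size_t size = sizeof(small);
	status_t status = _kern_read_link(fd, path, small, &size);
	if (status == B_BUFFER_OVERFLOW) {
		// "size" now holds the length the link actually needs
		char* buffer = (char*)malloc(size);
		if (buffer == NULL)
			return B_NO_MEMORY;
		status = _kern_read_link(fd, path, buffer, &size);
		// ... use buffer ...
		free(buffer);
	}
	\endcode
*/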
8330 
8331 
8332 /*!	\brief Creates a symlink specified by a FD + path pair.
8333 
8334 	\a path must always be specified (it contains the name of the new symlink
8335 	at least). If only a path is given, this path identifies the location at
8336 	which the symlink shall be created. If both \a fd and \a path are given and
8337 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8338 	of the directory (!) identified by \a fd.
8339 
8340 	\param fd The FD. May be < 0.
8341 	\param toPath The path the symlink shall point to. Must not be \c NULL.
8342 	\param mode The access permissions the new symlink shall have.
8343 	\return \c B_OK, if the symlink has been created successfully, another
8344 			error code otherwise.
8345 */
8346 status_t
8347 _kern_create_symlink(int fd, const char* path, const char* toPath, int mode)
8348 {
8349 	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
8350 	if (pathBuffer.InitCheck() != B_OK)
8351 		return B_NO_MEMORY;
8352 
8353 	return common_create_symlink(fd, pathBuffer.LockBuffer(),
8354 		toPath, mode, true);
8355 }
8356 
8357 
8358 status_t
8359 _kern_create_link(int pathFD, const char* path, int toFD, const char* toPath,
8360 	bool traverseLeafLink)
8361 {
8362 	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
8363 	KPath toPathBuffer(toPath, false, B_PATH_NAME_LENGTH + 1);
8364 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
8365 		return B_NO_MEMORY;
8366 
8367 	return common_create_link(pathFD, pathBuffer.LockBuffer(), toFD,
8368 		toPathBuffer.LockBuffer(), traverseLeafLink, true);
8369 }
8370 
8371 
8372 /*!	\brief Removes an entry specified by a FD + path pair from its directory.
8373 
8374 	\a path must always be specified (it contains at least the name of the entry
8375 	to be deleted). If only a path is given, this path identifies the entry
8376 	directly. If both \a fd and \a path are given and the path is absolute,
8377 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8378 	identified by \a fd.
8379 
8380 	\param fd The FD. May be < 0.
8381 	\param path The absolute or relative path. Must not be \c NULL.
8382 	\return \c B_OK, if the entry has been removed successfully, another
8383 			error code otherwise.
8384 */
8385 status_t
8386 _kern_unlink(int fd, const char* path)
8387 {
8388 	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
8389 	if (pathBuffer.InitCheck() != B_OK)
8390 		return B_NO_MEMORY;
8391 
8392 	return common_unlink(fd, pathBuffer.LockBuffer(), true);
8393 }
8394 
8395 
8396 /*!	\brief Moves an entry specified by a FD + path pair to an entry specified
8397 		   by another FD + path pair.
8398 
8399 	\a oldPath and \a newPath must always be specified (they contain at least
8400 	the name of the entry). If only a path is given, this path identifies the
8401 	entry directly. If both a FD and a path are given and the path is absolute,
8402 	the FD is ignored; a relative path is reckoned off of the directory (!)
8403 	identified by the respective FD.
8404 
8405 	\param oldFD The FD of the old location. May be < 0.
8406 	\param oldPath The absolute or relative path of the old location. Must not
8407 		   be \c NULL.
8408 	\param newFD The FD of the new location. May be < 0.
8409 	\param newPath The absolute or relative path of the new location. Must not
8410 		   be \c NULL.
8411 	\return \c B_OK, if the entry has been moved successfully, another
8412 			error code otherwise.
8413 */
8414 status_t
8415 _kern_rename(int oldFD, const char* oldPath, int newFD, const char* newPath)
8416 {
8417 	KPath oldPathBuffer(oldPath, false, B_PATH_NAME_LENGTH + 1);
8418 	KPath newPathBuffer(newPath, false, B_PATH_NAME_LENGTH + 1);
8419 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
8420 		return B_NO_MEMORY;
8421 
8422 	return common_rename(oldFD, oldPathBuffer.LockBuffer(),
8423 		newFD, newPathBuffer.LockBuffer(), true);
8424 }
8425 
8426 
8427 status_t
8428 _kern_access(int fd, const char* path, int mode, bool effectiveUserGroup)
8429 {
8430 	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
8431 	if (pathBuffer.InitCheck() != B_OK)
8432 		return B_NO_MEMORY;
8433 
8434 	return common_access(fd, pathBuffer.LockBuffer(), mode, effectiveUserGroup,
8435 		true);
8436 }
8437 
8438 
8439 /*!	\brief Reads stat data of an entity specified by a FD + path pair.
8440 
8441 	If only \a fd is given, the stat operation associated with the type
8442 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8443 	given, this path identifies the entry for whose node to retrieve the
8444 	stat data. If both \a fd and \a path are given and the path is absolute,
8445 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8446 	identified by \a fd and specifies the entry whose stat data shall be
8447 	retrieved.
8448 
8449 	\param fd The FD. May be < 0.
8450 	\param path The absolute or relative path. May be \c NULL.
8451 	\param traverseLeafLink If \a path is given, \c true specifies that the
8452 		   function shall not stick to symlinks, but traverse them.
8453 	\param stat The buffer the stat data shall be written into.
8454 	\param statSize The size of the supplied stat buffer.
8455 	\return \c B_OK, if the stat data have been read successfully, another
8456 			error code otherwise.
8457 */
8458 status_t
8459 _kern_read_stat(int fd, const char* path, bool traverseLeafLink,
8460 	struct stat* stat, size_t statSize)
8461 {
8462 	struct stat completeStat;
8463 	struct stat* originalStat = NULL;
8464 	status_t status;
8465 
8466 	if (statSize > sizeof(struct stat))
8467 		return B_BAD_VALUE;
8468 
8469 	// this supports different stat extensions
8470 	if (statSize < sizeof(struct stat)) {
8471 		originalStat = stat;
8472 		stat = &completeStat;
8473 	}
8474 
8475 	status = vfs_read_stat(fd, path, traverseLeafLink, stat, true);
8476 
8477 	if (status == B_OK && originalStat != NULL)
8478 		memcpy(originalStat, stat, statSize);
8479 
8480 	return status;
8481 }
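

/*!	The stat extension mechanism above at work (illustrative example; the
	prefix struct is hypothetical): a binary compiled against an older,
	shorter \c struct stat passes its historical size, and only that prefix
	of the data is copied back, so fields appended later are simply not
	reported to it.

	\code
	// hypothetical prefix of struct stat, as an old binary might know it
	struct stat_prefix {
		dev_t	st_dev;
		ino_t	st_ino;
	} prefix;

	// only sizeof(prefix) bytes are copied back into "prefix"
	status_t status = _kern_read_stat(fd, NULL, false,
		(struct stat*)&prefix, sizeof(prefix));
	\endcode
*/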
8482 
8483 
8484 /*!	\brief Writes stat data of an entity specified by a FD + path pair.
8485 
8486 	If only \a fd is given, the stat operation associated with the type
8487 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8488 	given, this path identifies the entry for whose node to write the
8489 	stat data. If both \a fd and \a path are given and the path is absolute,
8490 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8491 	identified by \a fd and specifies the entry whose stat data shall be
8492 	written.
8493 
8494 	\param fd The FD. May be < 0.
8495 	\param path The absolute or relative path. May be \c NULL.
8496 	\param traverseLeafLink If \a path is given, \c true specifies that the
8497 		   function shall not stick to symlinks, but traverse them.
8498 	\param stat The buffer containing the stat data to be written.
8499 	\param statSize The size of the supplied stat buffer.
8500 	\param statMask A mask specifying which parts of the stat data shall be
8501 		   written.
8502 	\return \c B_OK, if the stat data have been written successfully,
8503 			another error code otherwise.
8504 */
8505 status_t
8506 _kern_write_stat(int fd, const char* path, bool traverseLeafLink,
8507 	const struct stat* stat, size_t statSize, int statMask)
8508 {
8509 	struct stat completeStat;
8510 
8511 	if (statSize > sizeof(struct stat))
8512 		return B_BAD_VALUE;
8513 
8514 	// this supports different stat extensions
8515 	if (statSize < sizeof(struct stat)) {
8516 		memset((uint8*)&completeStat + statSize, 0,
8517 			sizeof(struct stat) - statSize);
8518 		memcpy(&completeStat, stat, statSize);
8519 		stat = &completeStat;
8520 	}
8521 
8522 	status_t status;
8523 
8524 	if (path) {
8525 		// path given: write the stat of the node referred to by (fd, path)
8526 		KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
8527 		if (pathBuffer.InitCheck() != B_OK)
8528 			return B_NO_MEMORY;
8529 
8530 		status = common_path_write_stat(fd, pathBuffer.LockBuffer(),
8531 			traverseLeafLink, stat, statMask, true);
8532 	} else {
8533 		// no path given: get the FD and use the FD operation
8534 		struct file_descriptor* descriptor
8535 			= get_fd(get_current_io_context(true), fd);
8536 		if (descriptor == NULL)
8537 			return B_FILE_ERROR;
8538 
8539 		if (descriptor->ops->fd_write_stat)
8540 			status = descriptor->ops->fd_write_stat(descriptor, stat, statMask);
8541 		else
8542 			status = B_UNSUPPORTED;
8543 
8544 		put_fd(descriptor);
8545 	}
8546 
8547 	return status;
8548 }
8549 
8550 
8551 int
8552 _kern_open_attr_dir(int fd, const char* path, bool traverseLeafLink)
8553 {
8554 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
8555 	if (pathBuffer.InitCheck() != B_OK)
8556 		return B_NO_MEMORY;
8557 
8558 	if (path != NULL)
8559 		pathBuffer.SetTo(path);
8560 
8561 	return attr_dir_open(fd, path ? pathBuffer.LockBuffer() : NULL,
8562 		traverseLeafLink, true);
8563 }
8564 
8565 
8566 int
8567 _kern_open_attr(int fd, const char* path, const char* name, uint32 type,
8568 	int openMode)
8569 {
8570 	KPath pathBuffer(path, false, B_PATH_NAME_LENGTH + 1);
8571 	if (pathBuffer.InitCheck() != B_OK)
8572 		return B_NO_MEMORY;
8573 
8574 	if ((openMode & O_CREAT) != 0) {
8575 		return attr_create(fd, pathBuffer.LockBuffer(), name, type, openMode,
8576 			true);
8577 	}
8578 
8579 	return attr_open(fd, pathBuffer.LockBuffer(), name, openMode, true);
8580 }
8581 
8582 
8583 status_t
8584 _kern_remove_attr(int fd, const char* name)
8585 {
8586 	return attr_remove(fd, name, true);
8587 }
8588 
8589 
8590 status_t
8591 _kern_rename_attr(int fromFile, const char* fromName, int toFile,
8592 	const char* toName)
8593 {
8594 	return attr_rename(fromFile, fromName, toFile, toName, true);
8595 }
8596 
8597 
8598 int
8599 _kern_open_index_dir(dev_t device)
8600 {
8601 	return index_dir_open(device, true);
8602 }
8603 
8604 
8605 status_t
8606 _kern_create_index(dev_t device, const char* name, uint32 type, uint32 flags)
8607 {
8608 	return index_create(device, name, type, flags, true);
8609 }
8610 
8611 
8612 status_t
8613 _kern_read_index_stat(dev_t device, const char* name, struct stat* stat)
8614 {
8615 	return index_name_read_stat(device, name, stat, true);
8616 }
8617 
8618 
8619 status_t
8620 _kern_remove_index(dev_t device, const char* name)
8621 {
8622 	return index_remove(device, name, true);
8623 }
8624 
8625 
8626 status_t
8627 _kern_getcwd(char* buffer, size_t size)
8628 {
8629 	TRACE(("_kern_getcwd: buf %p, %ld\n", buffer, size));
8630 
8631 	// Call vfs to get current working directory
8632 	return get_cwd(buffer, size, true);
8633 }
8634 
8635 
8636 status_t
8637 _kern_setcwd(int fd, const char* path)
8638 {
8639 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
8640 	if (pathBuffer.InitCheck() != B_OK)
8641 		return B_NO_MEMORY;
8642 
8643 	if (path != NULL)
8644 		pathBuffer.SetTo(path);
8645 
8646 	return set_cwd(fd, path != NULL ? pathBuffer.LockBuffer() : NULL, true);
8647 }
8648 
8649 
8650 //	#pragma mark - userland syscalls
8651 
8652 
8653 dev_t
8654 _user_mount(const char* userPath, const char* userDevice,
8655 	const char* userFileSystem, uint32 flags, const char* userArgs,
8656 	size_t argsLength)
8657 {
8658 	char fileSystem[B_FILE_NAME_LENGTH];
8659 	KPath path, device;
8660 	char* args = NULL;
8661 	status_t status;
8662 
8663 	if (!IS_USER_ADDRESS(userPath)
8664 		|| !IS_USER_ADDRESS(userFileSystem)
8665 		|| !IS_USER_ADDRESS(userDevice))
8666 		return B_BAD_ADDRESS;
8667 
8668 	if (path.InitCheck() != B_OK || device.InitCheck() != B_OK)
8669 		return B_NO_MEMORY;
8670 
8671 	if (user_strlcpy(path.LockBuffer(), userPath, B_PATH_NAME_LENGTH) < B_OK)
8672 		return B_BAD_ADDRESS;
8673 
8674 	if (userFileSystem != NULL
8675 		&& user_strlcpy(fileSystem, userFileSystem, sizeof(fileSystem)) < B_OK)
8676 		return B_BAD_ADDRESS;
8677 
8678 	if (userDevice != NULL
8679 		&& user_strlcpy(device.LockBuffer(), userDevice, B_PATH_NAME_LENGTH)
8680 			< B_OK)
8681 		return B_BAD_ADDRESS;
8682 
8683 	if (userArgs != NULL && argsLength > 0) {
8684 		// this is a safety restriction
8685 		if (argsLength >= 65536)
8686 			return B_NAME_TOO_LONG;
8687 
8688 		args = (char*)malloc(argsLength + 1);
8689 		if (args == NULL)
8690 			return B_NO_MEMORY;
8691 
8692 		if (user_strlcpy(args, userArgs, argsLength + 1) < B_OK) {
8693 			free(args);
8694 			return B_BAD_ADDRESS;
8695 		}
8696 	}
8697 	path.UnlockBuffer();
8698 	device.UnlockBuffer();
8699 
8700 	status = fs_mount(path.LockBuffer(),
8701 		userDevice != NULL ? device.Path() : NULL,
8702 		userFileSystem ? fileSystem : NULL, flags, args, false);
8703 
8704 	free(args);
8705 	return status;
8706 }
8707 
8708 
8709 status_t
8710 _user_unmount(const char* userPath, uint32 flags)
8711 {
8712 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
8713 	if (pathBuffer.InitCheck() != B_OK)
8714 		return B_NO_MEMORY;
8715 
8716 	char* path = pathBuffer.LockBuffer();
8717 
8718 	if (user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
8719 		return B_BAD_ADDRESS;
8720 
8721 	return fs_unmount(path, -1, flags & ~B_UNMOUNT_BUSY_PARTITION, false);
8722 }
8723 
8724 
8725 status_t
8726 _user_read_fs_info(dev_t device, struct fs_info* userInfo)
8727 {
8728 	struct fs_info info;
8729 	status_t status;
8730 
8731 	if (userInfo == NULL)
8732 		return B_BAD_VALUE;
8733 
8734 	if (!IS_USER_ADDRESS(userInfo))
8735 		return B_BAD_ADDRESS;
8736 
8737 	status = fs_read_info(device, &info);
8738 	if (status != B_OK)
8739 		return status;
8740 
8741 	if (user_memcpy(userInfo, &info, sizeof(struct fs_info)) != B_OK)
8742 		return B_BAD_ADDRESS;
8743 
8744 	return B_OK;
8745 }
8746 
8747 
8748 status_t
8749 _user_write_fs_info(dev_t device, const struct fs_info* userInfo, int mask)
8750 {
8751 	struct fs_info info;
8752 
8753 	if (userInfo == NULL)
8754 		return B_BAD_VALUE;
8755 
8756 	if (!IS_USER_ADDRESS(userInfo)
8757 		|| user_memcpy(&info, userInfo, sizeof(struct fs_info)) != B_OK)
8758 		return B_BAD_ADDRESS;
8759 
8760 	return fs_write_info(device, &info, mask);
8761 }
8762 
8763 
8764 dev_t
8765 _user_next_device(int32* _userCookie)
8766 {
8767 	int32 cookie;
8768 	dev_t device;
8769 
8770 	if (!IS_USER_ADDRESS(_userCookie)
8771 		|| user_memcpy(&cookie, _userCookie, sizeof(int32)) != B_OK)
8772 		return B_BAD_ADDRESS;
8773 
8774 	device = fs_next_device(&cookie);
8775 
8776 	if (device >= B_OK) {
8777 		// update user cookie
8778 		if (user_memcpy(_userCookie, &cookie, sizeof(int32)) != B_OK)
8779 			return B_BAD_ADDRESS;
8780 	}
8781 
8782 	return device;
8783 }
8784 
8785 
8786 status_t
8787 _user_sync(void)
8788 {
8789 	return _kern_sync();
8790 }
8791 
8792 
8793 status_t
8794 _user_get_next_fd_info(team_id team, uint32* userCookie, fd_info* userInfo,
8795 	size_t infoSize)
8796 {
8797 	struct fd_info info;
8798 	uint32 cookie;
8799 
8800 	// only root can do this (or should root's group be enough?)
8801 	if (geteuid() != 0)
8802 		return B_NOT_ALLOWED;
8803 
8804 	if (infoSize != sizeof(fd_info))
8805 		return B_BAD_VALUE;
8806 
8807 	if (!IS_USER_ADDRESS(userCookie) || !IS_USER_ADDRESS(userInfo)
8808 		|| user_memcpy(&cookie, userCookie, sizeof(uint32)) != B_OK)
8809 		return B_BAD_ADDRESS;
8810 
8811 	status_t status = _kern_get_next_fd_info(team, &cookie, &info, infoSize);
8812 	if (status != B_OK)
8813 		return status;
8814 
8815 	if (user_memcpy(userCookie, &cookie, sizeof(uint32)) != B_OK
8816 		|| user_memcpy(userInfo, &info, infoSize) != B_OK)
8817 		return B_BAD_ADDRESS;
8818 
8819 	return status;
8820 }
8821 
8822 
8823 status_t
8824 _user_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
8825 	char* userPath, size_t pathLength)
8826 {
8827 	if (!IS_USER_ADDRESS(userPath))
8828 		return B_BAD_ADDRESS;
8829 
8830 	KPath path(B_PATH_NAME_LENGTH + 1);
8831 	if (path.InitCheck() != B_OK)
8832 		return B_NO_MEMORY;
8833 
8834 	// copy the leaf name onto the stack
8835 	char stackLeaf[B_FILE_NAME_LENGTH];
8836 	if (leaf) {
8837 		if (!IS_USER_ADDRESS(leaf))
8838 			return B_BAD_ADDRESS;
8839 
8840 		int length = user_strlcpy(stackLeaf, leaf, B_FILE_NAME_LENGTH);
8841 		if (length < 0)
8842 			return length;
8843 		if (length >= B_FILE_NAME_LENGTH)
8844 			return B_NAME_TOO_LONG;
8845 
8846 		leaf = stackLeaf;
8847 	}
8848 
8849 	status_t status = vfs_entry_ref_to_path(device, inode, leaf,
8850 		false, path.LockBuffer(), path.BufferSize());
8851 	if (status != B_OK)
8852 		return status;
8853 
8854 	path.UnlockBuffer();
8855 
8856 	int length = user_strlcpy(userPath, path.Path(), pathLength);
8857 	if (length < 0)
8858 		return length;
8859 	if (length >= (int)pathLength)
8860 		return B_BUFFER_OVERFLOW;
8861 
8862 	return B_OK;
8863 }
8864 
8865 
8866 status_t
8867 _user_normalize_path(const char* userPath, bool traverseLink, char* buffer)
8868 {
8869 	if (userPath == NULL || buffer == NULL)
8870 		return B_BAD_VALUE;
8871 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(buffer))
8872 		return B_BAD_ADDRESS;
8873 
8874 	// copy path from userland
8875 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
8876 	if (pathBuffer.InitCheck() != B_OK)
8877 		return B_NO_MEMORY;
8878 	char* path = pathBuffer.LockBuffer();
8879 
8880 	if (user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
8881 		return B_BAD_ADDRESS;
8882 
8883 	status_t error = normalize_path(path, pathBuffer.BufferSize(), traverseLink,
8884 		false);
8885 	if (error != B_OK)
8886 		return error;
8887 
8888 	// copy back to userland
8889 	int len = user_strlcpy(buffer, path, B_PATH_NAME_LENGTH);
8890 	if (len < 0)
8891 		return len;
8892 	if (len >= B_PATH_NAME_LENGTH)
8893 		return B_BUFFER_OVERFLOW;
8894 
8895 	return B_OK;
8896 }
8897 
8898 
8899 int
8900 _user_open_entry_ref(dev_t device, ino_t inode, const char* userName,
8901 	int openMode, int perms)
8902 {
8903 	char name[B_FILE_NAME_LENGTH];
8904 
8905 	if (userName == NULL || device < 0 || inode < 0)
8906 		return B_BAD_VALUE;
8907 	if (!IS_USER_ADDRESS(userName)
8908 		|| user_strlcpy(name, userName, sizeof(name)) < B_OK)
8909 		return B_BAD_ADDRESS;
8910 
8911 	if ((openMode & O_CREAT) != 0) {
8912 		return file_create_entry_ref(device, inode, name, openMode, perms,
8913 			false);
8914 	}
8915 
8916 	return file_open_entry_ref(device, inode, name, openMode, false);
8917 }
8918 
8919 
8920 int
8921 _user_open(int fd, const char* userPath, int openMode, int perms)
8922 {
8923 	KPath path(B_PATH_NAME_LENGTH + 1);
8924 	if (path.InitCheck() != B_OK)
8925 		return B_NO_MEMORY;
8926 
8927 	char* buffer = path.LockBuffer();
8928 
8929 	if (!IS_USER_ADDRESS(userPath)
8930 		|| user_strlcpy(buffer, userPath, B_PATH_NAME_LENGTH) < B_OK)
8931 		return B_BAD_ADDRESS;
8932 
8933 	if ((openMode & O_CREAT) != 0)
8934 		return file_create(fd, buffer, openMode, perms, false);
8935 
8936 	return file_open(fd, buffer, openMode, false);
8937 }
8938 
8939 
8940 int
8941 _user_open_dir_entry_ref(dev_t device, ino_t inode, const char* userName)
8942 {
8943 	if (userName != NULL) {
8944 		char name[B_FILE_NAME_LENGTH];
8945 
8946 		if (!IS_USER_ADDRESS(userName)
8947 			|| user_strlcpy(name, userName, sizeof(name)) < B_OK)
8948 			return B_BAD_ADDRESS;
8949 
8950 		return dir_open_entry_ref(device, inode, name, false);
8951 	}
8952 	return dir_open_entry_ref(device, inode, NULL, false);
8953 }
8954 
8955 
8956 int
8957 _user_open_dir(int fd, const char* userPath)
8958 {
8959 	if (userPath == NULL)
8960 		return dir_open(fd, NULL, false);
8961 
8962 	KPath path(B_PATH_NAME_LENGTH + 1);
8963 	if (path.InitCheck() != B_OK)
8964 		return B_NO_MEMORY;
8965 
8966 	char* buffer = path.LockBuffer();
8967 
8968 	if (!IS_USER_ADDRESS(userPath)
8969 		|| user_strlcpy(buffer, userPath, B_PATH_NAME_LENGTH) < B_OK)
8970 		return B_BAD_ADDRESS;
8971 
8972 	return dir_open(fd, buffer, false);
8973 }
8974 
8975 
8976 /*!	\brief Opens a directory's parent directory and returns the entry name
8977 		   of the former.
8978 
8979 	Aside from the fact that it returns the directory's entry name, this
8980 	method is equivalent to \code _user_open_dir(fd, "..") \endcode. It
8981 	really is equivalent if \a userName is \c NULL.
8982 
8983 	If a name buffer is supplied and the name does not fit the buffer, the
8984 	function fails. A buffer of size \c B_FILE_NAME_LENGTH should be safe.
8985 
8986 	\param fd A FD referring to a directory.
8987 	\param userName Buffer the directory's entry name shall be written into.
8988 		   May be \c NULL.
8989 	\param nameLength Size of the name buffer.
8990 	\return The file descriptor of the opened parent directory, if everything
8991 			went fine, an error code otherwise.
8992 */
8993 int
8994 _user_open_parent_dir(int fd, char* userName, size_t nameLength)
8995 {
8996 	bool kernel = false;
8997 
8998 	if (userName && !IS_USER_ADDRESS(userName))
8999 		return B_BAD_ADDRESS;
9000 
9001 	// open the parent dir
9002 	int parentFD = dir_open(fd, (char*)"..", kernel);
9003 	if (parentFD < 0)
9004 		return parentFD;
9005 	FDCloser fdCloser(parentFD, kernel);
9006 
9007 	if (userName) {
9008 		// get the vnodes
9009 		struct vnode* parentVNode = get_vnode_from_fd(parentFD, kernel);
9010 		struct vnode* dirVNode = get_vnode_from_fd(fd, kernel);
9011 		VNodePutter parentVNodePutter(parentVNode);
9012 		VNodePutter dirVNodePutter(dirVNode);
9013 		if (!parentVNode || !dirVNode)
9014 			return B_FILE_ERROR;
9015 
9016 		// get the vnode name
9017 		char _buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
9018 		struct dirent* buffer = (struct dirent*)_buffer;
9019 		status_t status = get_vnode_name(dirVNode, parentVNode, buffer,
9020 			sizeof(_buffer), get_current_io_context(false));
9021 		if (status != B_OK)
9022 			return status;
9023 
9024 		// copy the name to the userland buffer
9025 		int len = user_strlcpy(userName, buffer->d_name, nameLength);
9026 		if (len < 0)
9027 			return len;
9028 		if (len >= (int)nameLength)
9029 			return B_BUFFER_OVERFLOW;
9030 	}
9031 
9032 	return fdCloser.Detach();
9033 }
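

/*!	Illustrative use (from userland, through the corresponding syscall; not
	part of this file): walking one level up while learning the entry name
	of the directory one came from, e.g. when building a path backwards.

	\code
	char name[B_FILE_NAME_LENGTH];
	int parentFD = _user_open_parent_dir(dirFD, name, sizeof(name));
	if (parentFD >= 0) {
		// parentFD refers to "..", and name holds the entry name of the
		// directory behind dirFD within it
	}
	\endcode
*/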
9034 
9035 
9036 status_t
9037 _user_fcntl(int fd, int op, size_t argument)
9038 {
9039 	status_t status = common_fcntl(fd, op, argument, false);
9040 	if (op == F_SETLKW)
9041 		syscall_restart_handle_post(status);
9042 
9043 	return status;
9044 }
9045 
9046 
9047 status_t
9048 _user_fsync(int fd)
9049 {
9050 	return common_sync(fd, false);
9051 }
9052 
9053 
9054 status_t
9055 _user_flock(int fd, int operation)
9056 {
9057 	FUNCTION(("_user_flock(fd = %d, op = %d)\n", fd, operation));
9058 
9059 	// Check if the operation is valid
9060 	switch (operation & ~LOCK_NB) {
9061 		case LOCK_UN:
9062 		case LOCK_SH:
9063 		case LOCK_EX:
9064 			break;
9065 
9066 		default:
9067 			return B_BAD_VALUE;
9068 	}
9069 
9070 	struct file_descriptor* descriptor;
9071 	struct vnode* vnode;
9072 	descriptor = get_fd_and_vnode(fd, &vnode, false);
9073 	if (descriptor == NULL)
9074 		return B_FILE_ERROR;
9075 
9076 	if (descriptor->type != FDTYPE_FILE) {
9077 		put_fd(descriptor);
9078 		return B_BAD_VALUE;
9079 	}
9080 
9081 	struct flock flock;
9082 	flock.l_start = 0;
9083 	flock.l_len = OFF_MAX;
9084 	flock.l_whence = 0;
9085 	flock.l_type = (operation & LOCK_SH) != 0 ? F_RDLCK : F_WRLCK;
9086 
9087 	status_t status;
9088 	if ((operation & LOCK_UN) != 0)
9089 		status = release_advisory_lock(vnode, &flock);
9090 	else {
9091 		status = acquire_advisory_lock(vnode,
9092 			thread_get_current_thread()->team->session_id, &flock,
9093 			(operation & LOCK_NB) == 0);
9094 	}
9095 
9096 	syscall_restart_handle_post(status);
9097 
9098 	put_fd(descriptor);
9099 	return status;
9100 }
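

/*!	For reference (illustrative, with \c vnode and the session ID as in the
	function above): a userland \code flock(fd, LOCK_EX | LOCK_NB) \endcode
	ends up as a non-blocking advisory write lock spanning the whole file:

	\code
	struct flock flock;
	flock.l_start = 0;
	flock.l_len = OFF_MAX;		// always the whole file
	flock.l_whence = 0;
	flock.l_type = F_WRLCK;		// LOCK_SH would yield F_RDLCK instead
	acquire_advisory_lock(vnode, sessionID, &flock, false);
		// false: LOCK_NB was set, so don't wait for the lock
	\endcode
*/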
9101 
9102 
9103 status_t
9104 _user_lock_node(int fd)
9105 {
9106 	return common_lock_node(fd, false);
9107 }
9108 
9109 
9110 status_t
9111 _user_unlock_node(int fd)
9112 {
9113 	return common_unlock_node(fd, false);
9114 }
9115 
9116 
9117 status_t
9118 _user_create_dir_entry_ref(dev_t device, ino_t inode, const char* userName,
9119 	int perms)
9120 {
9121 	char name[B_FILE_NAME_LENGTH];
9122 	status_t status;
9123 
9124 	if (!IS_USER_ADDRESS(userName))
9125 		return B_BAD_ADDRESS;
9126 
9127 	status = user_strlcpy(name, userName, sizeof(name));
9128 	if (status < 0)
9129 		return status;
9130 
9131 	return dir_create_entry_ref(device, inode, name, perms, false);
9132 }
9133 
9134 
9135 status_t
9136 _user_create_dir(int fd, const char* userPath, int perms)
9137 {
9138 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9139 	if (pathBuffer.InitCheck() != B_OK)
9140 		return B_NO_MEMORY;
9141 
9142 	char* path = pathBuffer.LockBuffer();
9143 
9144 	if (!IS_USER_ADDRESS(userPath)
9145 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9146 		return B_BAD_ADDRESS;
9147 
9148 	return dir_create(fd, path, perms, false);
9149 }
9150 
9151 
9152 status_t
9153 _user_remove_dir(int fd, const char* userPath)
9154 {
9155 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9156 	if (pathBuffer.InitCheck() != B_OK)
9157 		return B_NO_MEMORY;
9158 
9159 	char* path = pathBuffer.LockBuffer();
9160 
9161 	if (userPath != NULL) {
9162 		if (!IS_USER_ADDRESS(userPath)
9163 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9164 			return B_BAD_ADDRESS;
9165 	}
9166 
9167 	return dir_remove(fd, userPath ? path : NULL, false);
9168 }
9169 
9170 
9171 status_t
9172 _user_read_link(int fd, const char* userPath, char* userBuffer,
9173 	size_t* userBufferSize)
9174 {
9175 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1), linkBuffer;
9176 	if (pathBuffer.InitCheck() != B_OK || linkBuffer.InitCheck() != B_OK)
9177 		return B_NO_MEMORY;
9178 
9179 	size_t bufferSize;
9180 
9181 	if (!IS_USER_ADDRESS(userBuffer) || !IS_USER_ADDRESS(userBufferSize)
9182 		|| user_memcpy(&bufferSize, userBufferSize, sizeof(size_t)) != B_OK)
9183 		return B_BAD_ADDRESS;
9184 
9185 	char* path = pathBuffer.LockBuffer();
9186 	char* buffer = linkBuffer.LockBuffer();
9187 
9188 	if (userPath) {
9189 		if (!IS_USER_ADDRESS(userPath)
9190 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9191 			return B_BAD_ADDRESS;
9192 
9193 		if (bufferSize > B_PATH_NAME_LENGTH)
9194 			bufferSize = B_PATH_NAME_LENGTH;
9195 	}
9196 
9197 	status_t status = common_read_link(fd, userPath ? path : NULL, buffer,
9198 		&bufferSize, false);
9199 
9200 	// we also update the bufferSize in case of errors
9201 	// (the real length will be returned in case of B_BUFFER_OVERFLOW)
9202 	if (user_memcpy(userBufferSize, &bufferSize, sizeof(size_t)) != B_OK)
9203 		return B_BAD_ADDRESS;
9204 
9205 	if (status != B_OK)
9206 		return status;
9207 
9208 	if (user_memcpy(userBuffer, buffer, bufferSize) != B_OK)
9209 		return B_BAD_ADDRESS;
9210 
9211 	return B_OK;
9212 }
9213 
9214 
9215 status_t
9216 _user_create_symlink(int fd, const char* userPath, const char* userToPath,
9217 	int mode)
9218 {
9219 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9220 	KPath toPathBuffer(B_PATH_NAME_LENGTH + 1);
9221 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9222 		return B_NO_MEMORY;
9223 
9224 	char* path = pathBuffer.LockBuffer();
9225 	char* toPath = toPathBuffer.LockBuffer();
9226 
9227 	if (!IS_USER_ADDRESS(userPath)
9228 		|| !IS_USER_ADDRESS(userToPath)
9229 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK
9230 		|| user_strlcpy(toPath, userToPath, B_PATH_NAME_LENGTH) < B_OK)
9231 		return B_BAD_ADDRESS;
9232 
9233 	return common_create_symlink(fd, path, toPath, mode, false);
9234 }
9235 
9236 
9237 status_t
9238 _user_create_link(int pathFD, const char* userPath, int toFD,
9239 	const char* userToPath, bool traverseLeafLink)
9240 {
9241 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9242 	KPath toPathBuffer(B_PATH_NAME_LENGTH + 1);
9243 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9244 		return B_NO_MEMORY;
9245 
9246 	char* path = pathBuffer.LockBuffer();
9247 	char* toPath = toPathBuffer.LockBuffer();
9248 
9249 	if (!IS_USER_ADDRESS(userPath)
9250 		|| !IS_USER_ADDRESS(userToPath)
9251 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK
9252 		|| user_strlcpy(toPath, userToPath, B_PATH_NAME_LENGTH) < B_OK)
9253 		return B_BAD_ADDRESS;
9254 
9255 	status_t status = check_path(toPath);
9256 	if (status != B_OK)
9257 		return status;
9258 
9259 	return common_create_link(pathFD, path, toFD, toPath, traverseLeafLink,
9260 		false);
9261 }
9262 
9263 
9264 status_t
9265 _user_unlink(int fd, const char* userPath)
9266 {
9267 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9268 	if (pathBuffer.InitCheck() != B_OK)
9269 		return B_NO_MEMORY;
9270 
9271 	char* path = pathBuffer.LockBuffer();
9272 
9273 	if (!IS_USER_ADDRESS(userPath)
9274 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9275 		return B_BAD_ADDRESS;
9276 
9277 	return common_unlink(fd, path, false);
9278 }
9279 
9280 
9281 status_t
9282 _user_rename(int oldFD, const char* userOldPath, int newFD,
9283 	const char* userNewPath)
9284 {
9285 	KPath oldPathBuffer(B_PATH_NAME_LENGTH + 1);
9286 	KPath newPathBuffer(B_PATH_NAME_LENGTH + 1);
9287 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
9288 		return B_NO_MEMORY;
9289 
9290 	char* oldPath = oldPathBuffer.LockBuffer();
9291 	char* newPath = newPathBuffer.LockBuffer();
9292 
9293 	if (!IS_USER_ADDRESS(userOldPath) || !IS_USER_ADDRESS(userNewPath)
9294 		|| user_strlcpy(oldPath, userOldPath, B_PATH_NAME_LENGTH) < B_OK
9295 		|| user_strlcpy(newPath, userNewPath, B_PATH_NAME_LENGTH) < B_OK)
9296 		return B_BAD_ADDRESS;
9297 
9298 	return common_rename(oldFD, oldPath, newFD, newPath, false);
9299 }
9300 
9301 
9302 status_t
9303 _user_create_fifo(int fd, const char* userPath, mode_t perms)
9304 {
9305 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9306 	if (pathBuffer.InitCheck() != B_OK)
9307 		return B_NO_MEMORY;
9308 
9309 	char* path = pathBuffer.LockBuffer();
9310 
9311 	if (!IS_USER_ADDRESS(userPath)
9312 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK) {
9313 		return B_BAD_ADDRESS;
9314 	}
9315 
9316 	// split into directory vnode and filename path
9317 	char filename[B_FILE_NAME_LENGTH];
9318 	struct vnode* dir;
9319 	status_t status = fd_and_path_to_dir_vnode(fd, path, &dir, filename, false);
9320 	if (status != B_OK)
9321 		return status;
9322 
9323 	VNodePutter _(dir);
9324 
9325 	// the underlying FS needs to support creating FIFOs
9326 	if (!HAS_FS_CALL(dir, create_special_node))
9327 		return B_UNSUPPORTED;
9328 
9329 	// create the entry	-- the FIFO sub node is set up automatically
9330 	fs_vnode superVnode;
9331 	ino_t nodeID;
9332 	status = FS_CALL(dir, create_special_node, filename, NULL,
9333 		S_IFIFO | (perms & S_IUMSK), 0, &superVnode, &nodeID);
9334 
9335 	// create_special_node() acquired a reference for us that we don't need.
9336 	if (status == B_OK)
9337 		put_vnode(dir->mount->volume, nodeID);
9338 
9339 	return status;
9340 }
9341 
9342 
9343 status_t
9344 _user_create_pipe(int* userFDs)
9345 {
9346 	// rootfs should support creating FIFOs, but let's be sure
9347 	if (!HAS_FS_CALL(sRoot, create_special_node))
9348 		return B_UNSUPPORTED;
9349 
9350 	// create the node	-- the FIFO sub node is set up automatically
9351 	fs_vnode superVnode;
9352 	ino_t nodeID;
9353 	status_t status = FS_CALL(sRoot, create_special_node, NULL, NULL,
9354 		S_IFIFO | S_IRUSR | S_IWUSR, 0, &superVnode, &nodeID);
9355 	if (status != B_OK)
9356 		return status;
9357 
9358 	// We've got one reference to the node and need another one.
9359 	struct vnode* vnode;
9360 	status = get_vnode(sRoot->mount->id, nodeID, &vnode, true, false);
9361 	if (status != B_OK) {
9362 		// that should not happen
9363 		dprintf("_user_create_pipe(): Failed to lookup vnode (%" B_PRIdDEV ", "
9364 			"%" B_PRIdINO ")\n", sRoot->mount->id, nodeID);
9365 		return status;
9366 	}
9367 
9368 	// Everything looks good so far. Open two FDs, one for reading and one
9369 	// for writing.
9370 	int fds[2];
9371 	fds[0] = open_vnode(vnode, O_RDONLY, false);
9372 	fds[1] = open_vnode(vnode, O_WRONLY, false);
9373 
9374 	FDCloser closer0(fds[0], false);
9375 	FDCloser closer1(fds[1], false);
9376 
9377 	status = (fds[0] >= 0 ? (fds[1] >= 0 ? B_OK : fds[1]) : fds[0]);
9378 
9379 	// copy FDs to userland
9380 	if (status == B_OK) {
9381 		if (!IS_USER_ADDRESS(userFDs)
9382 			|| user_memcpy(userFDs, fds, sizeof(fds)) != B_OK) {
9383 			status = B_BAD_ADDRESS;
9384 		}
9385 	}
9386 
9387 	// keep FDs, if everything went fine
9388 	// keep the FDs if everything went fine
9389 		closer0.Detach();
9390 		closer1.Detach();
9391 	}
9392 
9393 	return status;
9394 }
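

/*!	This is the backend of the POSIX pipe() function. A userland wrapper
	(illustrative sketch; the actual libroot version may differ in details)
	boils down to:

	\code
	int
	pipe(int streams[2])
	{
		status_t status = _kern_create_pipe(streams);
			// the userland stub of the syscall implemented above
		if (status != B_OK) {
			errno = status;
			return -1;
		}
		return 0;
	}
	\endcode
*/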
9395 
9396 
9397 status_t
9398 _user_access(int fd, const char* userPath, int mode, bool effectiveUserGroup)
9399 {
9400 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9401 	if (pathBuffer.InitCheck() != B_OK)
9402 		return B_NO_MEMORY;
9403 
9404 	char* path = pathBuffer.LockBuffer();
9405 
9406 	if (!IS_USER_ADDRESS(userPath)
9407 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9408 		return B_BAD_ADDRESS;
9409 
9410 	return common_access(fd, path, mode, effectiveUserGroup, false);
9411 }
9412 
9413 
9414 status_t
9415 _user_read_stat(int fd, const char* userPath, bool traverseLink,
9416 	struct stat* userStat, size_t statSize)
9417 {
9418 	struct stat stat;
9419 	status_t status;
9420 
9421 	if (statSize > sizeof(struct stat))
9422 		return B_BAD_VALUE;
9423 
9424 	if (!IS_USER_ADDRESS(userStat))
9425 		return B_BAD_ADDRESS;
9426 
9427 	if (userPath) {
9428 		// path given: get the stat of the node referred to by (fd, path)
9429 		if (!IS_USER_ADDRESS(userPath))
9430 			return B_BAD_ADDRESS;
9431 
9432 		KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9433 		if (pathBuffer.InitCheck() != B_OK)
9434 			return B_NO_MEMORY;
9435 
9436 		char* path = pathBuffer.LockBuffer();
9437 
9438 		ssize_t length = user_strlcpy(path, userPath, B_PATH_NAME_LENGTH);
9439 		if (length < B_OK)
9440 			return length;
9441 		if (length >= B_PATH_NAME_LENGTH)
9442 			return B_NAME_TOO_LONG;
9443 
9444 		status = common_path_read_stat(fd, path, traverseLink, &stat, false);
9445 	} else {
9446 		// no path given: get the FD and use the FD operation
9447 		struct file_descriptor* descriptor
9448 			= get_fd(get_current_io_context(false), fd);
9449 		if (descriptor == NULL)
9450 			return B_FILE_ERROR;
9451 
9452 		if (descriptor->ops->fd_read_stat)
9453 			status = descriptor->ops->fd_read_stat(descriptor, &stat);
9454 		else
9455 			status = B_UNSUPPORTED;
9456 
9457 		put_fd(descriptor);
9458 	}
9459 
9460 	if (status != B_OK)
9461 		return status;
9462 
9463 	return user_memcpy(userStat, &stat, statSize);
9464 }
9465 
9466 
9467 status_t
9468 _user_write_stat(int fd, const char* userPath, bool traverseLeafLink,
9469 	const struct stat* userStat, size_t statSize, int statMask)
9470 {
9471 	if (statSize > sizeof(struct stat))
9472 		return B_BAD_VALUE;
9473 
9474 	struct stat stat;
9475 
9476 	if (!IS_USER_ADDRESS(userStat)
9477 		|| user_memcpy(&stat, userStat, statSize) < B_OK)
9478 		return B_BAD_ADDRESS;
9479 
9480 	// clear additional stat fields
9481 	if (statSize < sizeof(struct stat))
9482 		memset((uint8*)&stat + statSize, 0, sizeof(struct stat) - statSize);
9483 
9484 	status_t status;
9485 
9486 	if (userPath) {
9487 		// path given: write the stat of the node referred to by (fd, path)
9488 		if (!IS_USER_ADDRESS(userPath))
9489 			return B_BAD_ADDRESS;
9490 
9491 		KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9492 		if (pathBuffer.InitCheck() != B_OK)
9493 			return B_NO_MEMORY;
9494 
9495 		char* path = pathBuffer.LockBuffer();
9496 
9497 		ssize_t length = user_strlcpy(path, userPath, B_PATH_NAME_LENGTH);
9498 		if (length < B_OK)
9499 			return length;
9500 		if (length >= B_PATH_NAME_LENGTH)
9501 			return B_NAME_TOO_LONG;
9502 
9503 		status = common_path_write_stat(fd, path, traverseLeafLink, &stat,
9504 			statMask, false);
9505 	} else {
9506 		// no path given: get the FD and use the FD operation
9507 		struct file_descriptor* descriptor
9508 			= get_fd(get_current_io_context(false), fd);
9509 		if (descriptor == NULL)
9510 			return B_FILE_ERROR;
9511 
9512 		if (descriptor->ops->fd_write_stat) {
9513 			status = descriptor->ops->fd_write_stat(descriptor, &stat,
9514 				statMask);
9515 		} else
9516 			status = B_UNSUPPORTED;
9517 
9518 		put_fd(descriptor);
9519 	}
9520 
9521 	return status;
9522 }
9523 
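// Annotation, not part of the original source: statMask selects which of
// the passed-in stat fields are actually applied, so this one syscall can
// back chmod(), chown(), truncate(), utimes(), and friends. A hedged
// chmod() sketch; _kern_write_stat is the assumed stub name and B_STAT_MODE
// the mask bit for st_mode.
int
my_chmod(const char* path, mode_t mode)
{
	struct stat st;
	st.st_mode = mode;
		// the other fields may stay uninitialized, the mask ignores them

	status_t status = _kern_write_stat(-1, path, true, &st, sizeof(st),
		B_STAT_MODE);
	if (status != B_OK) {
		errno = status;
		return -1;
	}
	return 0;
}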
9524 
9525 int
9526 _user_open_attr_dir(int fd, const char* userPath, bool traverseLeafLink)
9527 {
9528 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9529 	if (pathBuffer.InitCheck() != B_OK)
9530 		return B_NO_MEMORY;
9531 
9532 	char* path = pathBuffer.LockBuffer();
9533 
9534 	if (userPath != NULL) {
9535 		if (!IS_USER_ADDRESS(userPath)
9536 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9537 			return B_BAD_ADDRESS;
9538 	}
9539 
9540 	return attr_dir_open(fd, userPath ? path : NULL, traverseLeafLink, false);
9541 }
9542 
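// Annotation, not part of the original source: a sketch of walking a node's
// attribute directory with the public <fs_attr.h> calls that end up in the
// syscall above; each dirent names one attribute of the node.
#include <dirent.h>
#include <fs_attr.h>
#include <stdio.h>

void
list_attr_names(const char* path)
{
	DIR* dir = fs_open_attr_dir(path);
	if (dir == NULL)
		return;

	struct dirent* entry;
	while ((entry = fs_read_attr_dir(dir)) != NULL)
		printf("attribute: %s\n", entry->d_name);

	fs_close_attr_dir(dir);
}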
9543 
9544 ssize_t
9545 _user_read_attr(int fd, const char* userAttribute, off_t pos,
9546 	void* userBuffer, size_t readBytes)
9547 {
	// copy the attribute name to the kernel first; the caller's pointer
	// must not reach the FS layer unchecked
	char attribute[B_FILE_NAME_LENGTH];
	if (!IS_USER_ADDRESS(userAttribute)
		|| user_strlcpy(attribute, userAttribute, B_FILE_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

9548 	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9549 	if (attr < 0)
9550 		return attr;
9551 
9552 	ssize_t bytes = _user_read(attr, pos, userBuffer, readBytes);
9553 	_user_close(attr);
9554 
9555 	return bytes;
9556 }
9557 
9558 
9559 ssize_t
9560 _user_write_attr(int fd, const char* userAttribute, uint32 type, off_t pos,
9561 	const void* buffer, size_t writeBytes)
9562 {
	// copy the attribute name to the kernel first; the caller's pointer
	// must not reach the FS layer unchecked
	char attribute[B_FILE_NAME_LENGTH];
	if (!IS_USER_ADDRESS(userAttribute)
		|| user_strlcpy(attribute, userAttribute, B_FILE_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

9563 	// Try to support the BeOS-typical truncation (a write at position 0
9564 	// replaces the old attribute contents) as well as the position argument.
9565 	int attr = attr_create(fd, NULL, attribute, type,
9566 		O_CREAT | O_WRONLY | (pos != 0 ? 0 : O_TRUNC), false);
9567 	if (attr < 0)
9568 		return attr;
9569 
9570 	ssize_t bytes = _user_write(attr, pos, buffer, writeBytes);
9571 	_user_close(attr);
9572 
9573 	return bytes;
9574 }
9575 
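// Annotation, not part of the original source: the O_TRUNC-at-position-0
// rule above keeps the classic BeOS behavior in which writing an attribute
// at offset 0 replaces its previous contents entirely. A sketch with the
// public <fs_attr.h> counterpart:
#include <fs_attr.h>
#include <string.h>
#include <TypeConstants.h>

void
set_mail_subject(int fd, const char* subject)
{
	// offset 0: the attribute is truncated first, then rewritten
	fs_write_attr(fd, "MAIL:subject", B_STRING_TYPE, 0, subject,
		strlen(subject) + 1);
}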
9576 
9577 status_t
9578 _user_stat_attr(int fd, const char* userAttribute,
	struct attr_info* userAttrInfo)
9579 {
	// validate the output pointer and copy the attribute name to the kernel
	char attribute[B_FILE_NAME_LENGTH];
	if (!IS_USER_ADDRESS(userAttrInfo) || !IS_USER_ADDRESS(userAttribute)
		|| user_strlcpy(attribute, userAttribute, B_FILE_NAME_LENGTH) < B_OK)
		return B_BAD_ADDRESS;

9580 	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9581 	if (attr < 0)
9582 		return attr;
9583 
9584 	struct file_descriptor* descriptor
9585 		= get_fd(get_current_io_context(false), attr);
9586 	if (descriptor == NULL) {
9587 		_user_close(attr);
9588 		return B_FILE_ERROR;
9589 	}
9590 
9591 	struct stat stat;
9592 	status_t status;
9593 	if (descriptor->ops->fd_read_stat)
9594 		status = descriptor->ops->fd_read_stat(descriptor, &stat);
9595 	else
9596 		status = B_UNSUPPORTED;
9597 
9598 	put_fd(descriptor);
9599 	_user_close(attr);
9600 
9601 	if (status == B_OK) {
9602 		attr_info info;
9603 		info.type = stat.st_type;
9604 		info.size = stat.st_size;
9605 
9606 		if (user_memcpy(userAttrInfo, &info, sizeof(struct attr_info)) != B_OK)
9607 			return B_BAD_ADDRESS;
9608 	}
9609 
9610 	return status;
9611 }
9612 
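// Annotation, not part of the original source: the usual pattern around
// attr_info is to stat an attribute of unknown size first and then read
// exactly that many bytes, here via the public <fs_attr.h> calls backed by
// the syscalls above.
#include <fs_attr.h>
#include <stdlib.h>

void*
read_whole_attr(int fd, const char* name, size_t* _size)
{
	attr_info info;
	if (fs_stat_attr(fd, name, &info) != 0)
		return NULL;

	void* buffer = malloc(info.size);
	if (buffer == NULL)
		return NULL;

	ssize_t bytesRead = fs_read_attr(fd, name, info.type, 0, buffer,
		info.size);
	if (bytesRead < 0) {
		free(buffer);
		return NULL;
	}

	*_size = (size_t)bytesRead;
	return buffer;
}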
9613 
9614 int
9615 _user_open_attr(int fd, const char* userPath, const char* userName,
9616 	uint32 type, int openMode)
9617 {
9618 	char name[B_FILE_NAME_LENGTH];
9619 
9620 	if (!IS_USER_ADDRESS(userName)
9621 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9622 		return B_BAD_ADDRESS;
9623 
9624 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9625 	if (pathBuffer.InitCheck() != B_OK)
9626 		return B_NO_MEMORY;
9627 
9628 	char* path = pathBuffer.LockBuffer();
9629 
9630 	if (userPath != NULL) {
9631 		if (!IS_USER_ADDRESS(userPath)
9632 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9633 			return B_BAD_ADDRESS;
9634 	}
9635 
9636 	if ((openMode & O_CREAT) != 0) {
9637 		return attr_create(fd, userPath ? path : NULL, name, type, openMode,
9638 			false);
9639 	}
9640 
9641 	return attr_open(fd, userPath ? path : NULL, name, openMode, false);
9642 }
9643 
9644 
9645 status_t
9646 _user_remove_attr(int fd, const char* userName)
9647 {
9648 	char name[B_FILE_NAME_LENGTH];
9649 
9650 	if (!IS_USER_ADDRESS(userName)
9651 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9652 		return B_BAD_ADDRESS;
9653 
9654 	return attr_remove(fd, name, false);
9655 }
9656 
9657 
9658 status_t
9659 _user_rename_attr(int fromFile, const char* userFromName, int toFile,
9660 	const char* userToName)
9661 {
9662 	if (!IS_USER_ADDRESS(userFromName)
9663 		|| !IS_USER_ADDRESS(userToName))
9664 		return B_BAD_ADDRESS;
9665 
9666 	KPath fromNameBuffer(B_FILE_NAME_LENGTH);
9667 	KPath toNameBuffer(B_FILE_NAME_LENGTH);
9668 	if (fromNameBuffer.InitCheck() != B_OK || toNameBuffer.InitCheck() != B_OK)
9669 		return B_NO_MEMORY;
9670 
9671 	char* fromName = fromNameBuffer.LockBuffer();
9672 	char* toName = toNameBuffer.LockBuffer();
9673 
9674 	if (user_strlcpy(fromName, userFromName, B_FILE_NAME_LENGTH) < B_OK
9675 		|| user_strlcpy(toName, userToName, B_FILE_NAME_LENGTH) < B_OK)
9676 		return B_BAD_ADDRESS;
9677 
9678 	return attr_rename(fromFile, fromName, toFile, toName, false);
9679 }
9680 
9681 
9682 int
9683 _user_open_index_dir(dev_t device)
9684 {
9685 	return index_dir_open(device, false);
9686 }
9687 
9688 
9689 status_t
9690 _user_create_index(dev_t device, const char* userName, uint32 type,
9691 	uint32 flags)
9692 {
9693 	char name[B_FILE_NAME_LENGTH];
9694 
9695 	if (!IS_USER_ADDRESS(userName)
9696 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9697 		return B_BAD_ADDRESS;
9698 
9699 	return index_create(device, name, type, flags, false);
9700 }
9701 
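// Annotation, not part of the original source: a sketch with the public
// <fs_index.h> counterpart. An index lets queries on that attribute walk
// the file system's index structure instead of scanning; note that (at
// least on BFS) only files written after the index was created get indexed
// automatically.
#include <errno.h>
#include <fs_index.h>
#include <TypeConstants.h>

status_t
index_mail_subject(dev_t volume)
{
	if (fs_create_index(volume, "MAIL:subject", B_STRING_TYPE, 0) != 0)
		return errno;	// e.g. B_FILE_EXISTS if the index already exists
	return B_OK;
}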
9702 
9703 status_t
9704 _user_read_index_stat(dev_t device, const char* userName, struct stat* userStat)
9705 {
9706 	char name[B_FILE_NAME_LENGTH];
9707 	struct stat stat;
9708 	status_t status;
9709 
9710 	if (!IS_USER_ADDRESS(userName)
9711 		|| !IS_USER_ADDRESS(userStat)
9712 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9713 		return B_BAD_ADDRESS;
9714 
9715 	status = index_name_read_stat(device, name, &stat, false);
9716 	if (status == B_OK) {
9717 		if (user_memcpy(userStat, &stat, sizeof(stat)) != B_OK)
9718 			return B_BAD_ADDRESS;
9719 	}
9720 
9721 	return status;
9722 }
9723 
9724 
9725 status_t
9726 _user_remove_index(dev_t device, const char* userName)
9727 {
9728 	char name[B_FILE_NAME_LENGTH];
9729 
9730 	if (!IS_USER_ADDRESS(userName)
9731 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9732 		return B_BAD_ADDRESS;
9733 
9734 	return index_remove(device, name, false);
9735 }
9736 
9737 
9738 status_t
9739 _user_getcwd(char* userBuffer, size_t size)
9740 {
9741 	if (size == 0)
9742 		return B_BAD_VALUE;
9743 	if (!IS_USER_ADDRESS(userBuffer))
9744 		return B_BAD_ADDRESS;
9745 
9746 	if (size > kMaxPathLength)
9747 		size = kMaxPathLength;
9748 
9749 	KPath pathBuffer(size);
9750 	if (pathBuffer.InitCheck() != B_OK)
9751 		return B_NO_MEMORY;
9752 
9753 	TRACE(("user_getcwd: buf %p, %" B_PRIuSIZE "\n", userBuffer, size));
9754 
9755 	char* path = pathBuffer.LockBuffer();
9756 
9757 	status_t status = get_cwd(path, size, false);
9758 	if (status != B_OK)
9759 		return status;
9760 
9761 	// Copy back the result
9762 	if (user_strlcpy(userBuffer, path, size) < B_OK)
9763 		return B_BAD_ADDRESS;
9764 
9765 	return status;
9766 }
9767 
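// Annotation, not part of the original source: a sketch of the matching
// userland getcwd(). POSIX wants NULL plus errno on failure; mapping the
// too-small-buffer case to ERANGE and the stub name _kern_getcwd are
// assumptions here.
char*
my_getcwd(char* buffer, size_t size)
{
	status_t status = _kern_getcwd(buffer, size);
	if (status != B_OK) {
		errno = status == B_BUFFER_OVERFLOW ? ERANGE : status;
		return NULL;
	}
	return buffer;
}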
9768 
9769 status_t
9770 _user_setcwd(int fd, const char* userPath)
9771 {
9772 	TRACE(("user_setcwd: path = %p\n", userPath));
9773 
9774 	KPath pathBuffer(B_PATH_NAME_LENGTH);
9775 	if (pathBuffer.InitCheck() != B_OK)
9776 		return B_NO_MEMORY;
9777 
9778 	char* path = pathBuffer.LockBuffer();
9779 
9780 	if (userPath != NULL) {
9781 		if (!IS_USER_ADDRESS(userPath)
9782 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9783 			return B_BAD_ADDRESS;
9784 	}
9785 
9786 	return set_cwd(fd, userPath != NULL ? path : NULL, false);
9787 }
9788 
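// Annotation, not part of the original source: one syscall backs both
// chdir() and fchdir(); a NULL path means "change to the directory the FD
// refers to". The stub name _kern_setcwd is an assumption.
int
my_chdir(const char* path)
{
	status_t status = _kern_setcwd(-1, path);	// -1: resolve against the cwd
	if (status != B_OK) {
		errno = status;
		return -1;
	}
	return 0;
}

int
my_fchdir(int fd)
{
	status_t status = _kern_setcwd(fd, NULL);	// NULL path: use the FD itself
	if (status != B_OK) {
		errno = status;
		return -1;
	}
	return 0;
}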
9789 
9790 status_t
9791 _user_change_root(const char* userPath)
9792 {
9793 	// only root is allowed to chroot()
9794 	if (geteuid() != 0)
9795 		return B_NOT_ALLOWED;
9796 
9797 	// alloc path buffer
9798 	KPath pathBuffer(B_PATH_NAME_LENGTH);
9799 	if (pathBuffer.InitCheck() != B_OK)
9800 		return B_NO_MEMORY;
9801 
9802 	// copy userland path to kernel
9803 	char* path = pathBuffer.LockBuffer();
9804 	if (userPath != NULL) {
9805 		if (!IS_USER_ADDRESS(userPath)
9806 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9807 			return B_BAD_ADDRESS;
9808 	}
9809 
9810 	// get the vnode
9811 	struct vnode* vnode;
9812 	status_t status = path_to_vnode(path, true, &vnode, NULL, false);
9813 	if (status != B_OK)
9814 		return status;
9815 
9816 	// set the new root
9817 	struct io_context* context = get_current_io_context(false);
9818 	mutex_lock(&sIOContextRootLock);
9819 	struct vnode* oldRoot = context->root;
9820 	context->root = vnode;
9821 	mutex_unlock(&sIOContextRootLock);
9822 
9823 	put_vnode(oldRoot);
9824 
9825 	return B_OK;
9826 }
9827 
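// Annotation, not part of the original source: the swap above follows the
// common "replace under the lock, release the old reference after
// unlocking" shape, since put_vnode() may do real work (up to destroying
// the vnode), which should not happen while sIOContextRootLock is held.
// A hypothetical helper spelling out the idiom:
static void
swap_io_context_root(io_context* context, struct vnode* newRoot)
{
	mutex_lock(&sIOContextRootLock);
	struct vnode* oldRoot = context->root;
	context->root = newRoot;
		// newRoot's reference is handed over to the context
	mutex_unlock(&sIOContextRootLock);

	put_vnode(oldRoot);
		// potentially expensive, therefore done without the lock held
}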
9828 
9829 int
9830 _user_open_query(dev_t device, const char* userQuery, size_t queryLength,
9831 	uint32 flags, port_id port, int32 token)
9832 {
9833 	char* query;
9834 
9835 	if (device < 0 || userQuery == NULL || queryLength == 0)
9836 		return B_BAD_VALUE;
9837 
9838 	// safety restriction: avoid overly large kernel allocations
9839 	if (queryLength >= 65536)
9840 		return B_NAME_TOO_LONG;
9841 
9842 	query = (char*)malloc(queryLength + 1);
9843 	if (query == NULL)
9844 		return B_NO_MEMORY;
9845 	if (!IS_USER_ADDRESS(userQuery)
		|| user_strlcpy(query, userQuery, queryLength + 1) < B_OK) {
9846 		free(query);
9847 		return B_BAD_ADDRESS;
9848 	}
9849 
9850 	int fd = query_open(device, query, flags, port, token, false);
9851 
9852 	free(query);
9853 	return fd;
9854 }
9855 
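// Annotation, not part of the original source: queries match against the
// attribute indices created via _user_create_index() above. A sketch with
// the public <fs_query.h> API; passing a valid port and token instead turns
// this into a live query that keeps delivering updates as messages.
#include <dirent.h>
#include <fs_query.h>
#include <stdio.h>

void
list_matching_mail(dev_t volume)
{
	DIR* query = fs_open_query(volume, "MAIL:subject==\"*Haiku*\"", 0);
	if (query == NULL)
		return;

	struct dirent* entry;
	while ((entry = fs_read_query(query)) != NULL)
		printf("match: %s\n", entry->d_name);

	fs_close_query(query);
}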
9856 
9857 #include "vfs_request_io.cpp"
9858