xref: /haiku/src/system/kernel/fs/vfs.cpp (revision 4a55cc230cf7566cadcbb23b1928eefff8aea9a2)
1 /*
2  * Copyright 2005-2013, Ingo Weinhold, ingo_weinhold@gmx.de.
3  * Copyright 2002-2018, Axel Dörfler, axeld@pinc-software.de.
4  * Distributed under the terms of the MIT License.
5  *
6  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7  * Distributed under the terms of the NewOS License.
8  */
9 
10 
11 /*! Virtual File System and File System Interface Layer */
12 
13 
14 #include <ctype.h>
15 #include <fcntl.h>
16 #include <limits.h>
17 #include <stddef.h>
18 #include <stdio.h>
19 #include <string.h>
20 #include <sys/file.h>
21 #include <sys/ioctl.h>
22 #include <sys/resource.h>
23 #include <sys/stat.h>
24 #include <unistd.h>
25 
26 #include <fs_attr.h>
27 #include <fs_info.h>
28 #include <fs_interface.h>
29 #include <fs_volume.h>
30 #include <NodeMonitor.h>
31 #include <OS.h>
32 #include <StorageDefs.h>
33 
34 #include <AutoDeleter.h>
35 #include <block_cache.h>
36 #include <boot/kernel_args.h>
37 #include <debug_heap.h>
38 #include <disk_device_manager/KDiskDevice.h>
39 #include <disk_device_manager/KDiskDeviceManager.h>
40 #include <disk_device_manager/KDiskDeviceUtils.h>
41 #include <disk_device_manager/KDiskSystem.h>
42 #include <fd.h>
43 #include <file_cache.h>
44 #include <fs/node_monitor.h>
45 #include <KPath.h>
46 #include <lock.h>
47 #include <low_resource_manager.h>
48 #include <slab/Slab.h>
49 #include <StackOrHeapArray.h>
50 #include <syscalls.h>
51 #include <syscall_restart.h>
52 #include <tracing.h>
53 #include <util/atomic.h>
54 #include <util/AutoLock.h>
55 #include <util/DoublyLinkedList.h>
56 #include <util/ThreadAutoLock.h>
57 #include <vfs.h>
58 #include <vm/vm.h>
59 #include <vm/VMCache.h>
60 #include <wait_for_objects.h>
61 
62 #include "EntryCache.h"
63 #include "fifo.h"
64 #include "IORequest.h"
65 #include "unused_vnodes.h"
66 #include "vfs_tracing.h"
67 #include "Vnode.h"
68 #include "../cache/vnode_store.h"
69 
70 
71 //#define TRACE_VFS
72 #ifdef TRACE_VFS
73 #	define TRACE(x) dprintf x
74 #	define FUNCTION(x) dprintf x
75 #else
76 #	define TRACE(x) ;
77 #	define FUNCTION(x) ;
78 #endif
79 
80 #define ADD_DEBUGGER_COMMANDS
81 
82 
83 #define HAS_FS_CALL(vnode, op)			(vnode->ops->op != NULL)
84 #define HAS_FS_MOUNT_CALL(mount, op)	(mount->volume->ops->op != NULL)
85 
86 #if KDEBUG
87 #	define FS_CALL(vnode, op, params...) \
88 		( HAS_FS_CALL(vnode, op) ? \
89 			vnode->ops->op(vnode->mount->volume, vnode, params) \
90 			: (panic("FS_CALL: vnode %p op " #op " is NULL", vnode), 0))
91 #	define FS_CALL_NO_PARAMS(vnode, op) \
92 		( HAS_FS_CALL(vnode, op) ? \
93 			vnode->ops->op(vnode->mount->volume, vnode) \
94 			: (panic("FS_CALL_NO_PARAMS: vnode %p op " #op " is NULL", vnode), 0))
95 #	define FS_MOUNT_CALL(mount, op, params...) \
96 		( HAS_FS_MOUNT_CALL(mount, op) ? \
97 			mount->volume->ops->op(mount->volume, params) \
98 			: (panic("FS_MOUNT_CALL: mount %p op " #op " is NULL", mount), 0))
99 #	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
100 		( HAS_FS_MOUNT_CALL(mount, op) ? \
101 			mount->volume->ops->op(mount->volume) \
102 			: (panic("FS_MOUNT_CALL_NO_PARAMS: mount %p op " #op " is NULL", mount), 0))
103 #else
104 #	define FS_CALL(vnode, op, params...) \
105 			vnode->ops->op(vnode->mount->volume, vnode, params)
106 #	define FS_CALL_NO_PARAMS(vnode, op) \
107 			vnode->ops->op(vnode->mount->volume, vnode)
108 #	define FS_MOUNT_CALL(mount, op, params...) \
109 			mount->volume->ops->op(mount->volume, params)
110 #	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
111 			mount->volume->ops->op(mount->volume)
112 #endif
113 
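
// A hypothetical sketch (not compiled, in the spirit of the #if 0 block
// further below) of how the macros above are used: optional hooks are
// guarded with HAS_FS_CALL() before dispatching through FS_CALL();
// normalize_flock() below contains a real instance of this pattern.
#if 0
static status_t
example_read_stat(struct vnode* vnode, struct stat* stat)
{
	if (!HAS_FS_CALL(vnode, read_stat))
		return B_UNSUPPORTED;

	// in the non-KDEBUG case this expands to
	// vnode->ops->read_stat(vnode->mount->volume, vnode, stat)
	return FS_CALL(vnode, read_stat, stat);
}
#endif
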
114 
115 const static size_t kMaxPathLength = 65536;
116 	// The absolute maximum path length (for getcwd()) -- this does not
117 	// depend on PATH_MAX.
118 
119 
120 typedef DoublyLinkedList<vnode> VnodeList;
121 
122 /*!	\brief Structure to manage a mounted file system
123 
124 	Note: The root_vnode and root_vnode->covers fields (what others?) are
125 	initialized in fs_mount() and not changed afterwards. That is, as soon
126 	as the mount is mounted and it is ensured that it won't be unmounted
127 	(e.g. by holding a reference to a vnode of that mount), (read) access
128 	to those fields is always safe, even without additional locking. Moreover,
129 	while mounted the mount holds a reference to the root_vnode->covers vnode,
130 	thus making the access path vnode->mount->root_vnode->covers->mount->...
131 	safe if a reference to vnode is held (note that for the root mount
132 	root_vnode->covers is NULL, though).
133 */
134 struct fs_mount {
135 	fs_mount()
136 		:
137 		volume(NULL),
138 		device_name(NULL)
139 	{
140 		mutex_init(&lock, "mount lock");
141 	}
142 
143 	~fs_mount()
144 	{
145 		mutex_destroy(&lock);
146 		free(device_name);
147 
148 		while (volume) {
149 			fs_volume* superVolume = volume->super_volume;
150 
151 			if (volume->file_system != NULL)
152 				put_module(volume->file_system->info.name);
153 
154 			free(volume->file_system_name);
155 			free(volume);
156 			volume = superVolume;
157 		}
158 	}
159 
160 	struct fs_mount* next;
161 	dev_t			id;
162 	fs_volume*		volume;
163 	char*			device_name;
164 	mutex			lock;	// guards the vnodes list
165 	struct vnode*	root_vnode;
166 	struct vnode*	covers_vnode;	// immutable
167 	KPartition*		partition;
168 	VnodeList		vnodes;
169 	EntryCache		entry_cache;
170 	bool			unmounting;
171 	bool			owns_file_device;
172 };
173 
174 
175 namespace {
176 
177 struct advisory_lock : public DoublyLinkedListLinkImpl<advisory_lock> {
178 	list_link		link;
179 	void*			bound_to;
180 	team_id			team;
181 	pid_t			session;
182 	off_t			start;
183 	off_t			end;
184 	bool			shared;
185 };
186 
187 typedef DoublyLinkedList<advisory_lock> LockList;
188 
189 } // namespace
190 
191 
192 struct advisory_locking {
193 	sem_id			lock;
194 	sem_id			wait_sem;
195 	LockList		locks;
196 
197 	advisory_locking()
198 		:
199 		lock(-1),
200 		wait_sem(-1)
201 	{
202 	}
203 
204 	~advisory_locking()
205 	{
206 		if (lock >= 0)
207 			delete_sem(lock);
208 		if (wait_sem >= 0)
209 			delete_sem(wait_sem);
210 	}
211 };
212 
213 /*!	\brief Guards sMountsTable.
214 
215 	The holder is allowed read/write access to sMountsTable.
216 	Manipulation of the fs_mount structures themselves
217 	(and their destruction) requires different locks though.
218 */
219 static rw_lock sMountLock = RW_LOCK_INITIALIZER("vfs_mount_lock");
220 
221 /*!	\brief Guards mount/unmount operations.
222 
223 	fs_mount() and fs_unmount() hold the lock during their whole operation.
224 	That is, locking it ensures that no FS is mounted/unmounted. In
225 	particular this means that
226 	- sMountsTable will not be modified,
227 	- the fields immutable after initialization of the fs_mount structures in
228 	  sMountsTable will not be modified.
229 
230 	The thread trying to lock the lock must not hold sVnodeLock or
231 	sMountLock.
232 */
233 static recursive_lock sMountOpLock;
234 
235 /*!	\brief Guards sVnodeTable.
236 
237 	The holder is allowed read/write access to sVnodeTable and to
238 	any unbusy vnode in that table, save for the immutable fields (device, id,
239 	private_node, mount), to which only read-only access is allowed.
240 	The mutable fields advisory_locking, mandatory_locked_by, and ref_count, as
241 	well as the busy, removed, unused flags, and the vnode's type can also be
242 	write accessed when holding a read lock to sVnodeLock *and* having the vnode
243 	locked. Write access to covered_by and covers requires write locking
244 	sVnodeLock.
245 
246 	The thread trying to acquire the lock must not hold sMountLock.
247 	You must not hold this lock when calling create_sem(), as this might call
248 	vfs_free_unused_vnodes() and thus cause a deadlock.
249 */
250 static rw_lock sVnodeLock = RW_LOCK_INITIALIZER("vfs_vnode_lock");
251 
252 /*!	\brief Guards io_context::root.
253 
254 	Must be held when setting or getting the io_context::root field.
255 	The only operation allowed while holding this lock besides getting or
256 	setting the field is inc_vnode_ref_count() on io_context::root.
257 */
258 static mutex sIOContextRootLock = MUTEX_INITIALIZER("io_context::root lock");
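
// A summary derived from the lock notes above: the safe acquisition order is
// sMountOpLock, then sVnodeLock, then sMountLock; a vnode's own lock is only
// taken while sVnodeLock is (at least read) locked.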
259 
260 
261 namespace {
262 
263 struct vnode_hash_key {
264 	dev_t	device;
265 	ino_t	vnode;
266 };
267 
268 struct VnodeHash {
269 	typedef vnode_hash_key	KeyType;
270 	typedef	struct vnode	ValueType;
271 
272 #define VHASH(mountid, vnodeid) \
273 	(((uint32)((vnodeid) >> 32) + (uint32)(vnodeid)) ^ (uint32)(mountid))
274 
275 	size_t HashKey(KeyType key) const
276 	{
277 		return VHASH(key.device, key.vnode);
278 	}
279 
280 	size_t Hash(ValueType* vnode) const
281 	{
282 		return VHASH(vnode->device, vnode->id);
283 	}
284 
285 #undef VHASH
286 
287 	bool Compare(KeyType key, ValueType* vnode) const
288 	{
289 		return vnode->device == key.device && vnode->id == key.vnode;
290 	}
291 
292 	ValueType*& GetLink(ValueType* value) const
293 	{
294 		return value->next;
295 	}
296 };
297 
298 typedef BOpenHashTable<VnodeHash> VnodeTable;
299 
300 
301 struct MountHash {
302 	typedef dev_t			KeyType;
303 	typedef	struct fs_mount	ValueType;
304 
305 	size_t HashKey(KeyType key) const
306 	{
307 		return key;
308 	}
309 
310 	size_t Hash(ValueType* mount) const
311 	{
312 		return mount->id;
313 	}
314 
315 	bool Compare(KeyType key, ValueType* mount) const
316 	{
317 		return mount->id == key;
318 	}
319 
320 	ValueType*& GetLink(ValueType* value) const
321 	{
322 		return value->next;
323 	}
324 };
325 
326 typedef BOpenHashTable<MountHash> MountTable;
327 
328 } // namespace
329 
330 
331 object_cache* sPathNameCache;
332 object_cache* sVnodeCache;
333 object_cache* sFileDescriptorCache;
334 
335 #define VNODE_HASH_TABLE_SIZE 1024
336 static VnodeTable* sVnodeTable;
337 static struct vnode* sRoot;
338 
339 #define MOUNTS_HASH_TABLE_SIZE 16
340 static MountTable* sMountsTable;
341 static dev_t sNextMountID = 1;
342 
343 #define MAX_TEMP_IO_VECS 8
344 
345 // How long to wait for busy vnodes (10s = 2000 retries at 5 ms each)
346 #define BUSY_VNODE_RETRIES 2000
347 #define BUSY_VNODE_DELAY 5000
348 
349 mode_t __gUmask = 022;
350 
351 /* function declarations */
352 
353 static void free_unused_vnodes();
354 
355 // file descriptor operation prototypes
356 static status_t file_read(struct file_descriptor* descriptor, off_t pos,
357 	void* buffer, size_t* _bytes);
358 static status_t file_write(struct file_descriptor* descriptor, off_t pos,
359 	const void* buffer, size_t* _bytes);
360 static off_t file_seek(struct file_descriptor* descriptor, off_t pos,
361 	int seekType);
362 static void file_free_fd(struct file_descriptor* descriptor);
363 static status_t file_close(struct file_descriptor* descriptor);
364 static status_t file_select(struct file_descriptor* descriptor, uint8 event,
365 	struct selectsync* sync);
366 static status_t file_deselect(struct file_descriptor* descriptor, uint8 event,
367 	struct selectsync* sync);
368 static status_t dir_read(struct io_context* context,
369 	struct file_descriptor* descriptor, struct dirent* buffer,
370 	size_t bufferSize, uint32* _count);
371 static status_t dir_read(struct io_context* ioContext, struct vnode* vnode,
372 	void* cookie, struct dirent* buffer, size_t bufferSize, uint32* _count);
373 static status_t dir_rewind(struct file_descriptor* descriptor);
374 static void dir_free_fd(struct file_descriptor* descriptor);
375 static status_t dir_close(struct file_descriptor* descriptor);
376 static status_t attr_dir_read(struct io_context* context,
377 	struct file_descriptor* descriptor, struct dirent* buffer,
378 	size_t bufferSize, uint32* _count);
379 static status_t attr_dir_rewind(struct file_descriptor* descriptor);
380 static void attr_dir_free_fd(struct file_descriptor* descriptor);
381 static status_t attr_dir_close(struct file_descriptor* descriptor);
382 static status_t attr_read(struct file_descriptor* descriptor, off_t pos,
383 	void* buffer, size_t* _bytes);
384 static status_t attr_write(struct file_descriptor* descriptor, off_t pos,
385 	const void* buffer, size_t* _bytes);
386 static off_t attr_seek(struct file_descriptor* descriptor, off_t pos,
387 	int seekType);
388 static void attr_free_fd(struct file_descriptor* descriptor);
389 static status_t attr_close(struct file_descriptor* descriptor);
390 static status_t attr_read_stat(struct file_descriptor* descriptor,
391 	struct stat* statData);
392 static status_t attr_write_stat(struct file_descriptor* descriptor,
393 	const struct stat* stat, int statMask);
394 static status_t index_dir_read(struct io_context* context,
395 	struct file_descriptor* descriptor, struct dirent* buffer,
396 	size_t bufferSize, uint32* _count);
397 static status_t index_dir_rewind(struct file_descriptor* descriptor);
398 static void index_dir_free_fd(struct file_descriptor* descriptor);
399 static status_t index_dir_close(struct file_descriptor* descriptor);
400 static status_t query_read(struct io_context* context,
401 	struct file_descriptor* descriptor, struct dirent* buffer,
402 	size_t bufferSize, uint32* _count);
403 static status_t query_rewind(struct file_descriptor* descriptor);
404 static void query_free_fd(struct file_descriptor* descriptor);
405 static status_t query_close(struct file_descriptor* descriptor);
406 
407 static status_t common_ioctl(struct file_descriptor* descriptor, ulong op,
408 	void* buffer, size_t length);
409 static status_t common_read_stat(struct file_descriptor* descriptor,
410 	struct stat* statData);
411 static status_t common_write_stat(struct file_descriptor* descriptor,
412 	const struct stat* statData, int statMask);
413 static status_t common_path_read_stat(int fd, char* path, bool traverseLeafLink,
414 	struct stat* stat, bool kernel);
415 
416 static status_t vnode_path_to_vnode(struct vnode* vnode, char* path,
417 	bool traverseLeafLink, int count, bool kernel,
418 	struct vnode** _vnode, ino_t* _parentID);
419 static status_t dir_vnode_to_path(struct vnode* vnode, char* buffer,
420 	size_t bufferSize, bool kernel);
421 static status_t fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
422 	struct vnode** _vnode, ino_t* _parentID, bool kernel);
423 static void inc_vnode_ref_count(struct vnode* vnode);
424 static status_t dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree,
425 	bool reenter);
426 static inline void put_vnode(struct vnode* vnode);
427 static status_t fs_unmount(char* path, dev_t mountID, uint32 flags,
428 	bool kernel);
429 static int open_vnode(struct vnode* vnode, int openMode, bool kernel);
430 
431 
432 static struct fd_ops sFileOps = {
433 	file_read,
434 	file_write,
435 	file_seek,
436 	common_ioctl,
437 	NULL,		// set_flags
438 	file_select,
439 	file_deselect,
440 	NULL,		// read_dir()
441 	NULL,		// rewind_dir()
442 	common_read_stat,
443 	common_write_stat,
444 	file_close,
445 	file_free_fd
446 };
447 
448 static struct fd_ops sDirectoryOps = {
449 	NULL,		// read()
450 	NULL,		// write()
451 	NULL,		// seek()
452 	common_ioctl,
453 	NULL,		// set_flags
454 	NULL,		// select()
455 	NULL,		// deselect()
456 	dir_read,
457 	dir_rewind,
458 	common_read_stat,
459 	common_write_stat,
460 	dir_close,
461 	dir_free_fd
462 };
463 
464 static struct fd_ops sAttributeDirectoryOps = {
465 	NULL,		// read()
466 	NULL,		// write()
467 	NULL,		// seek()
468 	common_ioctl,
469 	NULL,		// set_flags
470 	NULL,		// select()
471 	NULL,		// deselect()
472 	attr_dir_read,
473 	attr_dir_rewind,
474 	common_read_stat,
475 	common_write_stat,
476 	attr_dir_close,
477 	attr_dir_free_fd
478 };
479 
480 static struct fd_ops sAttributeOps = {
481 	attr_read,
482 	attr_write,
483 	attr_seek,
484 	common_ioctl,
485 	NULL,		// set_flags
486 	NULL,		// select()
487 	NULL,		// deselect()
488 	NULL,		// read_dir()
489 	NULL,		// rewind_dir()
490 	attr_read_stat,
491 	attr_write_stat,
492 	attr_close,
493 	attr_free_fd
494 };
495 
496 static struct fd_ops sIndexDirectoryOps = {
497 	NULL,		// read()
498 	NULL,		// write()
499 	NULL,		// seek()
500 	NULL,		// ioctl()
501 	NULL,		// set_flags
502 	NULL,		// select()
503 	NULL,		// deselect()
504 	index_dir_read,
505 	index_dir_rewind,
506 	NULL,		// read_stat()
507 	NULL,		// write_stat()
508 	index_dir_close,
509 	index_dir_free_fd
510 };
511 
512 #if 0
513 static struct fd_ops sIndexOps = {
514 	NULL,		// read()
515 	NULL,		// write()
516 	NULL,		// seek()
517 	NULL,		// ioctl()
518 	NULL,		// set_flags
519 	NULL,		// select()
520 	NULL,		// deselect()
521 	NULL,		// read_dir()
522 	NULL,		// rewind_dir()
523 	index_read_stat,	// read_stat()
524 	NULL,		// write_stat()
525 	NULL,		// close()
526 	NULL		// free_fd()
527 };
528 #endif
529 
530 static struct fd_ops sQueryOps = {
531 	NULL,		// read()
532 	NULL,		// write()
533 	NULL,		// seek()
534 	NULL,		// ioctl()
535 	NULL,		// set_flags
536 	NULL,		// select()
537 	NULL,		// deselect()
538 	query_read,
539 	query_rewind,
540 	NULL,		// read_stat()
541 	NULL,		// write_stat()
542 	query_close,
543 	query_free_fd
544 };
545 
546 
547 namespace {
548 
549 class VNodePutter {
550 public:
551 	VNodePutter(struct vnode* vnode = NULL) : fVNode(vnode) {}
552 
553 	~VNodePutter()
554 	{
555 		Put();
556 	}
557 
558 	void SetTo(struct vnode* vnode)
559 	{
560 		Put();
561 		fVNode = vnode;
562 	}
563 
564 	void Put()
565 	{
566 		if (fVNode) {
567 			put_vnode(fVNode);
568 			fVNode = NULL;
569 		}
570 	}
571 
572 	struct vnode* Detach()
573 	{
574 		struct vnode* vnode = fVNode;
575 		fVNode = NULL;
576 		return vnode;
577 	}
578 
579 private:
580 	struct vnode* fVNode;
581 };
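

// For illustration only (not compiled): the usual RAII pattern with
// VNodePutter. The reference obtained from get_vnode() is put on every early
// return; Detach() would hand it over to the caller instead.
#if 0
static status_t
example_with_vnode(dev_t device, ino_t id)
{
	struct vnode* vnode;
	status_t status = get_vnode(device, id, &vnode, true, 0);
	if (status != B_OK)
		return status;

	VNodePutter vnodePutter(vnode);
	// ... failure paths below may simply return; the reference is put ...
	return B_OK;
}
#endif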
582 
583 
584 class FDCloser {
585 public:
586 	FDCloser() : fFD(-1), fKernel(true) {}
587 
588 	FDCloser(int fd, bool kernel) : fFD(fd), fKernel(kernel) {}
589 
590 	~FDCloser()
591 	{
592 		Close();
593 	}
594 
595 	void SetTo(int fd, bool kernel)
596 	{
597 		Close();
598 		fFD = fd;
599 		fKernel = kernel;
600 	}
601 
602 	void Close()
603 	{
604 		if (fFD >= 0) {
605 			if (fKernel)
606 				_kern_close(fFD);
607 			else
608 				_user_close(fFD);
609 			fFD = -1;
610 		}
611 	}
612 
613 	int Detach()
614 	{
615 		int fd = fFD;
616 		fFD = -1;
617 		return fd;
618 	}
619 
620 private:
621 	int		fFD;
622 	bool	fKernel;
623 };
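

// Likewise a hypothetical sketch (not compiled): FDCloser closes the
// descriptor on every early error return, while Detach() keeps it open and
// hands it back to the caller on success.
#if 0
static int
example_finish_fd(int fd, bool kernel)
{
	FDCloser fdCloser(fd, kernel);
	if (fd < 0)
		return fd;

	// ... further setup that may fail and just return an error code ...

	return fdCloser.Detach();
}
#endif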
624 
625 } // namespace
626 
627 
628 #if VFS_PAGES_IO_TRACING
629 
630 namespace VFSPagesIOTracing {
631 
632 class PagesIOTraceEntry : public AbstractTraceEntry {
633 protected:
634 	PagesIOTraceEntry(struct vnode* vnode, void* cookie, off_t pos,
635 		const generic_io_vec* vecs, uint32 count, uint32 flags,
636 		generic_size_t bytesRequested, status_t status,
637 		generic_size_t bytesTransferred)
638 		:
639 		fVnode(vnode),
640 		fMountID(vnode->mount->id),
641 		fNodeID(vnode->id),
642 		fCookie(cookie),
643 		fPos(pos),
644 		fCount(count),
645 		fFlags(flags),
646 		fBytesRequested(bytesRequested),
647 		fStatus(status),
648 		fBytesTransferred(bytesTransferred)
649 	{
650 		fVecs = (generic_io_vec*)alloc_tracing_buffer_memcpy(vecs,
651 			sizeof(generic_io_vec) * count, false);
652 	}
653 
654 	void AddDump(TraceOutput& out, const char* mode)
655 	{
656 		out.Print("vfs pages io %5s: vnode: %p (%" B_PRId32 ", %" B_PRId64 "), "
657 			"cookie: %p, pos: %" B_PRIdOFF ", size: %" B_PRIu64 ", vecs: {",
658 			mode, fVnode, fMountID, fNodeID, fCookie, fPos,
659 			(uint64)fBytesRequested);
660 
661 		if (fVecs != NULL) {
662 			for (uint32 i = 0; i < fCount; i++) {
663 				if (i > 0)
664 					out.Print(", ");
665 				out.Print("(%" B_PRIx64 ", %" B_PRIu64 ")", (uint64)fVecs[i].base,
666 					(uint64)fVecs[i].length);
667 			}
668 		}
669 
670 		out.Print("}, flags: %#" B_PRIx32 " -> status: %#" B_PRIx32 ", "
671 			"transferred: %" B_PRIu64, fFlags, fStatus,
672 			(uint64)fBytesTransferred);
673 	}
674 
675 protected:
676 	struct vnode*	fVnode;
677 	dev_t			fMountID;
678 	ino_t			fNodeID;
679 	void*			fCookie;
680 	off_t			fPos;
681 	generic_io_vec*	fVecs;
682 	uint32			fCount;
683 	uint32			fFlags;
684 	generic_size_t	fBytesRequested;
685 	status_t		fStatus;
686 	generic_size_t	fBytesTransferred;
687 };
688 
689 
690 class ReadPages : public PagesIOTraceEntry {
691 public:
692 	ReadPages(struct vnode* vnode, void* cookie, off_t pos,
693 		const generic_io_vec* vecs, uint32 count, uint32 flags,
694 		generic_size_t bytesRequested, status_t status,
695 		generic_size_t bytesTransferred)
696 		:
697 		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
698 			bytesRequested, status, bytesTransferred)
699 	{
700 		Initialized();
701 	}
702 
703 	virtual void AddDump(TraceOutput& out)
704 	{
705 		PagesIOTraceEntry::AddDump(out, "read");
706 	}
707 };
708 
709 
710 class WritePages : public PagesIOTraceEntry {
711 public:
712 	WritePages(struct vnode* vnode, void* cookie, off_t pos,
713 		const generic_io_vec* vecs, uint32 count, uint32 flags,
714 		generic_size_t bytesRequested, status_t status,
715 		generic_size_t bytesTransferred)
716 		:
717 		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
718 			bytesRequested, status, bytesTransferred)
719 	{
720 		Initialized();
721 	}
722 
723 	virtual void AddDump(TraceOutput& out)
724 	{
725 		PagesIOTraceEntry::AddDump(out, "write");
726 	}
727 };
728 
729 }	// namespace VFSPagesIOTracing
730 
731 #	define TPIO(x) new(std::nothrow) VFSPagesIOTracing::x;
732 #else
733 #	define TPIO(x) ;
734 #endif	// VFS_PAGES_IO_TRACING
735 
736 
737 /*! Finds the mounted device (the fs_mount structure) with the given ID.
738 	Note, you must hold sMountLock when you call this function.
739 */
740 static struct fs_mount*
741 find_mount(dev_t id)
742 {
743 	ASSERT_READ_LOCKED_RW_LOCK(&sMountLock);
744 
745 	return sMountsTable->Lookup(id);
746 }
747 
748 
749 static status_t
750 get_mount(dev_t id, struct fs_mount** _mount)
751 {
752 	struct fs_mount* mount;
753 
754 	ReadLocker nodeLocker(sVnodeLock);
755 	ReadLocker mountLocker(sMountLock);
756 
757 	mount = find_mount(id);
758 	if (mount == NULL)
759 		return B_BAD_VALUE;
760 
761 	struct vnode* rootNode = mount->root_vnode;
762 	if (mount->unmounting || rootNode == NULL || rootNode->IsBusy()
763 		|| rootNode->ref_count == 0) {
764 		// might have been called during a mount/unmount operation
765 		return B_BUSY;
766 	}
767 
768 	inc_vnode_ref_count(rootNode);
769 	*_mount = mount;
770 	return B_OK;
771 }
772 
773 
774 static void
775 put_mount(struct fs_mount* mount)
776 {
777 	if (mount)
778 		put_vnode(mount->root_vnode);
779 }
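

// A hypothetical sketch (not compiled): get_mount()/put_mount() bracket
// accesses to a mount. The pair keeps a reference to the mount's root vnode,
// so the file system cannot be unmounted in the meantime.
#if 0
static status_t
example_with_mount(dev_t id)
{
	struct fs_mount* mount;
	status_t status = get_mount(id, &mount);
	if (status != B_OK)
		return status;

	// ... use mount ...

	put_mount(mount);
	return B_OK;
}
#endif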
780 
781 
782 /*!	Tries to open the specified file system module.
783 	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1".
784 	Returns a pointer to the file system module interface, or NULL if it
785 	could not open the module.
786 */
787 static file_system_module_info*
788 get_file_system(const char* fsName)
789 {
790 	char name[B_FILE_NAME_LENGTH];
791 	if (strncmp(fsName, "file_systems/", strlen("file_systems/"))) {
792 		// construct module name if we didn't get one
793 		// (we currently support only one API)
794 		snprintf(name, sizeof(name), "file_systems/%s/v1", fsName);
795 		fsName = NULL;
796 	}
797 
798 	file_system_module_info* info;
799 	if (get_module(fsName ? fsName : name, (module_info**)&info) != B_OK)
800 		return NULL;
801 
802 	return info;
803 }
804 
805 
806 /*!	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1"
807 	and returns a compatible fs_info.fsh_name name ("bfs" in both cases).
808 	The name is allocated for you, and you have to free() it when you're
809 	done with it.
810 	Returns NULL if the required memory is not available.
811 */
812 static char*
813 get_file_system_name(const char* fsName)
814 {
815 	const size_t length = strlen("file_systems/");
816 
817 	if (strncmp(fsName, "file_systems/", length)) {
818 		// the name already seems to be the module's file name
819 		return strdup(fsName);
820 	}
821 
822 	fsName += length;
823 	const char* end = strchr(fsName, '/');
824 	if (end == NULL) {
825 		// this doesn't seem to be a valid name, but well...
826 		return strdup(fsName);
827 	}
828 
829 	// cut off the trailing /v1
830 
831 	char* name = (char*)malloc(end + 1 - fsName);
832 	if (name == NULL)
833 		return NULL;
834 
835 	strlcpy(name, fsName, end + 1 - fsName);
836 	return name;
837 }
838 
839 
840 /*!	Accepts a list of file system names separated by colons, one for each
841 	layer, and returns the file system name for the specified layer.
842 	The name is allocated for you, and you have to free() it when you're
843 	done with it.
844 	Returns NULL if the required memory is not available or if there is no
845 	name for the specified layer.
846 */
847 static char*
848 get_file_system_name_for_layer(const char* fsNames, int32 layer)
849 {
850 	while (layer >= 0) {
851 		const char* end = strchr(fsNames, ':');
852 		if (end == NULL) {
853 			if (layer == 0)
854 				return strdup(fsNames);
855 			return NULL;
856 		}
857 
858 		if (layer == 0) {
859 			char* result = (char*)malloc(end - fsNames + 1);
860 			if (result != NULL)
861 				strlcpy(result, fsNames, end - fsNames + 1);
862 			return result;
863 		}
864 
865 		fsNames = end + 1;
866 		layer--;
867 	}
868 
869 	return NULL;
870 }
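
// Example: for fsNames "bfs:write_overlay", layer 0 yields "bfs", layer 1
// yields "write_overlay", and any higher layer NULL.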
871 
872 
873 static void
874 add_vnode_to_mount_list(struct vnode* vnode, struct fs_mount* mount)
875 {
876 	MutexLocker _(mount->lock);
877 	mount->vnodes.Add(vnode);
878 }
879 
880 
881 static void
882 remove_vnode_from_mount_list(struct vnode* vnode, struct fs_mount* mount)
883 {
884 	MutexLocker _(mount->lock);
885 	mount->vnodes.Remove(vnode);
886 }
887 
888 
889 /*!	\brief Looks up a vnode by mount and node ID in the sVnodeTable.
890 
891 	The caller must hold the sVnodeLock (read lock at least).
892 
893 	\param mountID the mount ID.
894 	\param vnodeID the node ID.
895 
896 	\return The vnode structure, if it was found in the hash table, \c NULL
897 			otherwise.
898 */
899 static struct vnode*
900 lookup_vnode(dev_t mountID, ino_t vnodeID)
901 {
902 	struct vnode_hash_key key;
903 
904 	key.device = mountID;
905 	key.vnode = vnodeID;
906 
907 	return sVnodeTable->Lookup(key);
908 }
909 
910 
911 /*!	\brief Checks whether or not a busy vnode should be waited for (again).
912 
913 	This will also wait for BUSY_VNODE_DELAY before returning if one should
914 	still wait for the vnode to become unbusy.
915 
916 	\return \c true if one should retry, \c false if not.
917 */
918 static bool
919 retry_busy_vnode(int32& tries, dev_t mountID, ino_t vnodeID)
920 {
921 	if (--tries < 0) {
922 		// vnode doesn't seem to become unbusy
923 		dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO
924 			" is not becoming unbusy!\n", mountID, vnodeID);
925 		return false;
926 	}
927 	snooze(BUSY_VNODE_DELAY);
928 	return true;
929 }
930 
931 
932 /*!	Creates a new vnode with the given mount and node ID.
933 	If the node already exists, it is returned instead and no new node is
934 	created. In either case -- but not if an error occurs -- the function write
935 	locks \c sVnodeLock and keeps it locked for the caller when returning. On
936 	error the lock is not held on return.
937 
938 	\param mountID The mount ID.
939 	\param vnodeID The vnode ID.
940 	\param _vnode Will be set to the new vnode on success.
941 	\param _nodeCreated Will be set to \c true when the returned vnode has
942 		been newly created, \c false when it already existed. Will not be
943 		changed on error.
944 	\return \c B_OK, when the vnode was successfully created and inserted or
945 		a node with the given ID was found, \c B_NO_MEMORY or
946 		\c B_ENTRY_NOT_FOUND on error.
947 */
948 static status_t
949 create_new_vnode_and_lock(dev_t mountID, ino_t vnodeID, struct vnode*& _vnode,
950 	bool& _nodeCreated)
951 {
952 	FUNCTION(("create_new_vnode_and_lock()\n"));
953 
954 	struct vnode* vnode = (struct vnode*)object_cache_alloc(sVnodeCache, 0);
955 	if (vnode == NULL)
956 		return B_NO_MEMORY;
957 
958 	// initialize basic values
959 	memset(vnode, 0, sizeof(struct vnode));
960 	vnode->device = mountID;
961 	vnode->id = vnodeID;
962 	vnode->ref_count = 1;
963 	vnode->SetBusy(true);
964 
965 	// look up the node -- it might have been added by someone else in the
966 	// meantime
967 	rw_lock_write_lock(&sVnodeLock);
968 	struct vnode* existingVnode = lookup_vnode(mountID, vnodeID);
969 	if (existingVnode != NULL) {
970 		object_cache_free(sVnodeCache, vnode, 0);
971 		_vnode = existingVnode;
972 		_nodeCreated = false;
973 		return B_OK;
974 	}
975 
976 	// get the mount structure
977 	rw_lock_read_lock(&sMountLock);
978 	vnode->mount = find_mount(mountID);
979 	if (!vnode->mount || vnode->mount->unmounting) {
980 		rw_lock_read_unlock(&sMountLock);
981 		rw_lock_write_unlock(&sVnodeLock);
982 		object_cache_free(sVnodeCache, vnode, 0);
983 		return B_ENTRY_NOT_FOUND;
984 	}
985 
986 	// add the vnode to the mount's node list and the hash table
987 	sVnodeTable->Insert(vnode);
988 	add_vnode_to_mount_list(vnode, vnode->mount);
989 
990 	rw_lock_read_unlock(&sMountLock);
991 
992 	_vnode = vnode;
993 	_nodeCreated = true;
994 
995 	// keep the vnode lock locked
996 	return B_OK;
997 }
998 
999 
1000 /*!	Frees the vnode and all resources it has acquired, and removes
1001 	it from the vnode hash as well as from its mount structure.
1002 	Will also make sure that any cache modifications are written back.
1003 */
1004 static void
1005 free_vnode(struct vnode* vnode, bool reenter)
1006 {
1007 	ASSERT_PRINT(vnode->ref_count == 0 && vnode->IsBusy(), "vnode: %p\n",
1008 		vnode);
1009 	ASSERT_PRINT(vnode->advisory_locking == NULL, "vnode: %p\n", vnode);
1010 
1011 	// write back any changes in this vnode's cache -- but only
1012 	// if the vnode won't be deleted, in which case the changes
1013 	// will be discarded
1014 
1015 	if (!vnode->IsRemoved() && HAS_FS_CALL(vnode, fsync))
1016 		FS_CALL_NO_PARAMS(vnode, fsync);
1017 
1018 	// Note: If this vnode has a cache attached, there will still be two
1019 	// references to that cache at this point. The last one belongs to the vnode
1020 	// itself (cf. vfs_get_vnode_cache()) and one belongs to the node's file
1021 	// cache. Each but the last reference to a cache also includes a reference
1022 	// to the vnode. The file cache, however, released its reference (cf.
1023 	// file_cache_create()), so that this vnode's ref count has a chance to
1024 	// drop to 0 at all. Deleting the file cache now will cause the next-to-last
1025 	// cache reference to be released, which will also release a (no longer
1026 	// existing) vnode reference. To avoid problems, we set the vnode's ref
1027 	// count, so that it will neither become negative nor 0.
1028 	vnode->ref_count = 2;
1029 
1030 	if (!vnode->IsUnpublished()) {
1031 		if (vnode->IsRemoved())
1032 			FS_CALL(vnode, remove_vnode, reenter);
1033 		else
1034 			FS_CALL(vnode, put_vnode, reenter);
1035 	}
1036 
1037 	// If the vnode has a VMCache attached, make sure that it won't try to get
1038 	// another reference via VMVnodeCache::AcquireUnreferencedStoreRef(). As
1039 	// long as the vnode is busy and in the hash, that won't happen, but as
1040 	// soon as we've removed it from the hash, it could reload the vnode -- with
1041 	// a new cache attached!
1042 	if (vnode->cache != NULL && vnode->cache->type == CACHE_TYPE_VNODE)
1043 		((VMVnodeCache*)vnode->cache)->VnodeDeleted();
1044 
1045 	// The file system has removed the resources of the vnode now, so we can
1046 	// make it available again (by removing the busy vnode from the hash).
1047 	rw_lock_write_lock(&sVnodeLock);
1048 	sVnodeTable->Remove(vnode);
1049 	rw_lock_write_unlock(&sVnodeLock);
1050 
1051 	// if we have a VMCache attached, remove it
1052 	if (vnode->cache)
1053 		vnode->cache->ReleaseRef();
1054 
1055 	vnode->cache = NULL;
1056 
1057 	remove_vnode_from_mount_list(vnode, vnode->mount);
1058 
1059 	object_cache_free(sVnodeCache, vnode, 0);
1060 }
1061 
1062 
1063 /*!	\brief Decrements the reference counter of the given vnode and deletes it,
1064 	if the counter dropped to 0.
1065 
1066 	The caller must, of course, own a reference to the vnode to call this
1067 	function.
1068 	The caller must not hold the sVnodeLock or the sMountLock.
1069 
1070 	\param vnode the vnode.
1071 	\param alwaysFree don't move this vnode into the unused list, but really
1072 		   delete it if possible.
1073 	\param reenter \c true, if this function is called (indirectly) from within
1074 		   a file system. This will be passed to file system hooks only.
1075 	\return \c B_OK, if everything went fine, an error code otherwise.
1076 */
1077 static status_t
1078 dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree, bool reenter)
1079 {
1080 	ReadLocker locker(sVnodeLock);
1081 	AutoLocker<Vnode> nodeLocker(vnode);
1082 
1083 	int32 oldRefCount = atomic_add(&vnode->ref_count, -1);
1084 
1085 	ASSERT_PRINT(oldRefCount > 0, "vnode %p\n", vnode);
1086 
1087 	TRACE(("dec_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
1088 		vnode->ref_count));
1089 
1090 	if (oldRefCount != 1)
1091 		return B_OK;
1092 
1093 	if (vnode->IsBusy())
1094 		panic("dec_vnode_ref_count: called on busy vnode %p\n", vnode);
1095 
1096 	bool freeNode = false;
1097 	bool freeUnusedNodes = false;
1098 
1099 	// Just insert the vnode into an unused list if we don't need
1100 	// to delete it
1101 	if (vnode->IsRemoved() || alwaysFree) {
1102 		vnode_to_be_freed(vnode);
1103 		vnode->SetBusy(true);
1104 		freeNode = true;
1105 	} else
1106 		freeUnusedNodes = vnode_unused(vnode);
1107 
1108 	nodeLocker.Unlock();
1109 	locker.Unlock();
1110 
1111 	if (freeNode)
1112 		free_vnode(vnode, reenter);
1113 	else if (freeUnusedNodes)
1114 		free_unused_vnodes();
1115 
1116 	return B_OK;
1117 }
1118 
1119 
1120 /*!	\brief Increments the reference counter of the given vnode.
1121 
1122 	The caller must make sure that the node isn't deleted while this function
1123 	is called. This can be done either:
1124 	- by ensuring that a reference to the node exists and remains in existence,
1125 	  or
1126 	- by holding the vnode's lock (which also requires read locking sVnodeLock)
1127 	  or by holding sVnodeLock write locked.
1128 
1129 	In the second case the caller is responsible for dealing with the ref count
1130 	0 -> 1 transition. That is, 1. this function must not be invoked when the
1131 	node is busy in the first place and 2. vnode_used() must be called for the
1132 	node.
1133 
1134 	\param vnode the vnode.
1135 */
1136 static void
1137 inc_vnode_ref_count(struct vnode* vnode)
1138 {
1139 	atomic_add(&vnode->ref_count, 1);
1140 	TRACE(("inc_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
1141 		vnode->ref_count));
1142 }
1143 
1144 
1145 static bool
1146 is_special_node_type(int type)
1147 {
1148 	// at the moment only FIFOs are supported
1149 	return S_ISFIFO(type);
1150 }
1151 
1152 
1153 static status_t
1154 create_special_sub_node(struct vnode* vnode, uint32 flags)
1155 {
1156 	if (S_ISFIFO(vnode->Type()))
1157 		return create_fifo_vnode(vnode->mount->volume, vnode);
1158 
1159 	return B_BAD_VALUE;
1160 }
1161 
1162 
1163 /*!	\brief Retrieves a vnode for a given mount ID, node ID pair.
1164 
1165 	If the node is not yet in memory, it will be loaded.
1166 
1167 	The caller must not hold the sVnodeLock or the sMountLock.
1168 
1169 	\param mountID the mount ID.
1170 	\param vnodeID the node ID.
1171 	\param _vnode Pointer to a vnode* variable into which the pointer to the
1172 		   retrieved vnode structure shall be written.
1173 	\param reenter \c true, if this function is called (indirectly) from within
1174 		   a file system.
1175 	\return \c B_OK, if everything went fine, an error code otherwise.
1176 */
1177 static status_t
1178 get_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode, bool canWait,
1179 	int reenter)
1180 {
1181 	FUNCTION(("get_vnode: mountid %" B_PRId32 " vnid 0x%" B_PRIx64 " %p\n",
1182 		mountID, vnodeID, _vnode));
1183 
1184 	rw_lock_read_lock(&sVnodeLock);
1185 
1186 	int32 tries = BUSY_VNODE_RETRIES;
1187 restart:
1188 	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
1189 	AutoLocker<Vnode> nodeLocker(vnode);
1190 
1191 	if (vnode && vnode->IsBusy()) {
1192 		nodeLocker.Unlock();
1193 		rw_lock_read_unlock(&sVnodeLock);
1194 		if (!canWait) {
1195 			dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO " is busy!\n",
1196 				mountID, vnodeID);
1197 			return B_BUSY;
1198 		}
1199 		if (!retry_busy_vnode(tries, mountID, vnodeID))
1200 			return B_BUSY;
1201 
1202 		rw_lock_read_lock(&sVnodeLock);
1203 		goto restart;
1204 	}
1205 
1206 	TRACE(("get_vnode: tried to lookup vnode, got %p\n", vnode));
1207 
1208 	status_t status;
1209 
1210 	if (vnode) {
1211 		if (vnode->ref_count == 0) {
1212 			// this vnode has been unused before
1213 			vnode_used(vnode);
1214 		}
1215 		inc_vnode_ref_count(vnode);
1216 
1217 		nodeLocker.Unlock();
1218 		rw_lock_read_unlock(&sVnodeLock);
1219 	} else {
1220 		// we need to create a new vnode and read it in
1221 		rw_lock_read_unlock(&sVnodeLock);
1222 			// unlock -- create_new_vnode_and_lock() write-locks on success
1223 		bool nodeCreated;
1224 		status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
1225 			nodeCreated);
1226 		if (status != B_OK)
1227 			return status;
1228 
1229 		if (!nodeCreated) {
1230 			rw_lock_read_lock(&sVnodeLock);
1231 			rw_lock_write_unlock(&sVnodeLock);
1232 			goto restart;
1233 		}
1234 
1235 		rw_lock_write_unlock(&sVnodeLock);
1236 
1237 		int type;
1238 		uint32 flags;
1239 		status = FS_MOUNT_CALL(vnode->mount, get_vnode, vnodeID, vnode, &type,
1240 			&flags, reenter);
1241 		if (status == B_OK && vnode->private_node == NULL)
1242 			status = B_BAD_VALUE;
1243 
1244 		bool gotNode = status == B_OK;
1245 		bool publishSpecialSubNode = false;
1246 		if (gotNode) {
1247 			vnode->SetType(type);
1248 			publishSpecialSubNode = is_special_node_type(type)
1249 				&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
1250 		}
1251 
1252 		if (gotNode && publishSpecialSubNode)
1253 			status = create_special_sub_node(vnode, flags);
1254 
1255 		if (status != B_OK) {
1256 			if (gotNode)
1257 				FS_CALL(vnode, put_vnode, reenter);
1258 
1259 			rw_lock_write_lock(&sVnodeLock);
1260 			sVnodeTable->Remove(vnode);
1261 			remove_vnode_from_mount_list(vnode, vnode->mount);
1262 			rw_lock_write_unlock(&sVnodeLock);
1263 
1264 			object_cache_free(sVnodeCache, vnode, 0);
1265 			return status;
1266 		}
1267 
1268 		rw_lock_read_lock(&sVnodeLock);
1269 		vnode->Lock();
1270 
1271 		vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
1272 		vnode->SetBusy(false);
1273 
1274 		vnode->Unlock();
1275 		rw_lock_read_unlock(&sVnodeLock);
1276 	}
1277 
1278 	TRACE(("get_vnode: returning %p\n", vnode));
1279 
1280 	*_vnode = vnode;
1281 	return B_OK;
1282 }
1283 
1284 
1285 /*!	\brief Decrements the reference counter of the given vnode and deletes it,
1286 	if the counter dropped to 0.
1287 
1288 	The caller must, of course, own a reference to the vnode to call this
1289 	function.
1290 	The caller must not hold the sVnodeLock or the sMountLock.
1291 
1292 	\param vnode the vnode.
1293 */
1294 static inline void
1295 put_vnode(struct vnode* vnode)
1296 {
1297 	dec_vnode_ref_count(vnode, false, false);
1298 }
1299 
1300 
1301 static void
1302 free_unused_vnodes(int32 level)
1303 {
1304 	unused_vnodes_check_started();
1305 
1306 	if (level == B_NO_LOW_RESOURCE) {
1307 		unused_vnodes_check_done();
1308 		return;
1309 	}
1310 
1311 	flush_hot_vnodes();
1312 
1313 	// determine how many nodes to free
1314 	uint32 count = 1;
1315 	{
1316 		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
1317 
1318 		switch (level) {
1319 			case B_LOW_RESOURCE_NOTE:
1320 				count = sUnusedVnodes / 100;
1321 				break;
1322 			case B_LOW_RESOURCE_WARNING:
1323 				count = sUnusedVnodes / 10;
1324 				break;
1325 			case B_LOW_RESOURCE_CRITICAL:
1326 				count = sUnusedVnodes;
1327 				break;
1328 		}
1329 
1330 		if (count > sUnusedVnodes)
1331 			count = sUnusedVnodes;
1332 	}
1333 
1334 	// Write back the modified pages of some unused vnodes and free them.
1335 
1336 	for (uint32 i = 0; i < count; i++) {
1337 		ReadLocker vnodesReadLocker(sVnodeLock);
1338 
1339 		// get the first node
1340 		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
1341 		struct vnode* vnode = (struct vnode*)list_get_first_item(
1342 			&sUnusedVnodeList);
1343 		unusedVnodesLocker.Unlock();
1344 
1345 		if (vnode == NULL)
1346 			break;
1347 
1348 		// lock the node
1349 		AutoLocker<Vnode> nodeLocker(vnode);
1350 
1351 		// Check whether the node is still unused -- since we only append to the
1352 		// tail of the unused queue, the vnode should still be at its head.
1353 		// Alternatively we could check its ref count for 0 and its busy flag,
1354 		// but if the node is no longer at the head of the queue, it means it
1355 		// has been touched in the meantime, i.e. it is no longer the least
1356 		// recently used unused vnode and we rather don't free it.
1357 		unusedVnodesLocker.Lock();
1358 		if (vnode != list_get_first_item(&sUnusedVnodeList))
1359 			continue;
1360 		unusedVnodesLocker.Unlock();
1361 
1362 		ASSERT(!vnode->IsBusy());
1363 
1364 		// grab a reference
1365 		inc_vnode_ref_count(vnode);
1366 		vnode_used(vnode);
1367 
1368 		// write back changes and free the node
1369 		nodeLocker.Unlock();
1370 		vnodesReadLocker.Unlock();
1371 
1372 		if (vnode->cache != NULL)
1373 			vnode->cache->WriteModified();
1374 
1375 		dec_vnode_ref_count(vnode, true, false);
1376 			// this should free the vnode when it's still unused
1377 	}
1378 
1379 	unused_vnodes_check_done();
1380 }
1381 
1382 
1383 /*!	Gets the vnode the given vnode is covering.
1384 
1385 	The caller must have \c sVnodeLock read-locked at least.
1386 
1387 	The function returns a reference to the retrieved vnode (if any), which
1388 	the caller is responsible for releasing.
1389 
1390 	\param vnode The vnode whose covered node shall be returned.
1391 	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
1392 		vnode.
1393 */
1394 static inline Vnode*
1395 get_covered_vnode_locked(Vnode* vnode)
1396 {
1397 	if (Vnode* coveredNode = vnode->covers) {
1398 		while (coveredNode->covers != NULL)
1399 			coveredNode = coveredNode->covers;
1400 
1401 		inc_vnode_ref_count(coveredNode);
1402 		return coveredNode;
1403 	}
1404 
1405 	return NULL;
1406 }
1407 
1408 
1409 /*!	Gets the vnode the given vnode is covering.
1410 
1411 	The caller must not hold \c sVnodeLock. Note that this implies a race
1412 	condition, since the situation can change at any time.
1413 
1414 	The function returns a reference to the retrieved vnode (if any), the caller
1415 	is responsible to free.
1416 
1417 	\param vnode The vnode whose covered node shall be returned.
1418 	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
1419 		vnode.
1420 */
1421 static inline Vnode*
1422 get_covered_vnode(Vnode* vnode)
1423 {
1424 	if (!vnode->IsCovering())
1425 		return NULL;
1426 
1427 	ReadLocker vnodeReadLocker(sVnodeLock);
1428 	return get_covered_vnode_locked(vnode);
1429 }
1430 
1431 
1432 /*!	Gets the vnode the given vnode is covered by.
1433 
1434 	The caller must have \c sVnodeLock read-locked at least.
1435 
1436 	The function returns a reference to the retrieved vnode (if any), the caller
1437 	is responsible to free.
1438 
1439 	\param vnode The vnode whose covering node shall be returned.
1440 	\return The covering vnode, or \c NULL if the given vnode isn't covered by
1441 		any vnode.
1442 */
1443 static Vnode*
1444 get_covering_vnode_locked(Vnode* vnode)
1445 {
1446 	if (Vnode* coveringNode = vnode->covered_by) {
1447 		while (coveringNode->covered_by != NULL)
1448 			coveringNode = coveringNode->covered_by;
1449 
1450 		inc_vnode_ref_count(coveringNode);
1451 		return coveringNode;
1452 	}
1453 
1454 	return NULL;
1455 }
1456 
1457 
1458 /*!	Gets the vnode the given vnode is covered by.
1459 
1460 	The caller must not hold \c sVnodeLock. Note that this implies a race
1461 	condition, since the situation can change at any time.
1462 
1463 	The function returns a reference to the retrieved vnode (if any), the caller
1464 	is responsible to free.
1465 
1466 	\param vnode The vnode whose covering node shall be returned.
1467 	\return The covering vnode, or \c NULL if the given vnode isn't covered by
1468 		any vnode.
1469 */
1470 static inline Vnode*
1471 get_covering_vnode(Vnode* vnode)
1472 {
1473 	if (!vnode->IsCovered())
1474 		return NULL;
1475 
1476 	ReadLocker vnodeReadLocker(sVnodeLock);
1477 	return get_covering_vnode_locked(vnode);
1478 }
1479 
1480 
1481 static void
1482 free_unused_vnodes()
1483 {
1484 	free_unused_vnodes(
1485 		low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1486 			| B_KERNEL_RESOURCE_ADDRESS_SPACE));
1487 }
1488 
1489 
1490 static void
1491 vnode_low_resource_handler(void* /*data*/, uint32 resources, int32 level)
1492 {
1493 	TRACE(("vnode_low_resource_handler(level = %" B_PRId32 ")\n", level));
1494 
1495 	free_unused_vnodes(level);
1496 }
1497 
1498 
1499 static inline void
1500 put_advisory_locking(struct advisory_locking* locking)
1501 {
1502 	release_sem(locking->lock);
1503 }
1504 
1505 
1506 /*!	Returns the advisory_locking object of the \a vnode in case it
1507 	has one, and locks it.
1508 	You have to call put_advisory_locking() when you're done with
1509 	it.
1510 	Note, you must not have the vnode mutex locked when calling
1511 	this function.
1512 */
1513 static struct advisory_locking*
1514 get_advisory_locking(struct vnode* vnode)
1515 {
1516 	rw_lock_read_lock(&sVnodeLock);
1517 	vnode->Lock();
1518 
1519 	struct advisory_locking* locking = vnode->advisory_locking;
1520 	sem_id lock = locking != NULL ? locking->lock : B_ERROR;
1521 
1522 	vnode->Unlock();
1523 	rw_lock_read_unlock(&sVnodeLock);
1524 
1525 	if (lock >= 0)
1526 		lock = acquire_sem(lock);
1527 	if (lock < 0) {
1528 		// This means the locking has been deleted in the mean time
1529 		// or had never existed in the first place - otherwise, we
1530 		// would get the lock at some point.
1531 		return NULL;
1532 	}
1533 
1534 	return locking;
1535 }
1536 
1537 
1538 /*!	Creates a locked advisory_locking object, and attaches it to the
1539 	given \a vnode.
1540 	Returns B_OK in case of success - also if the vnode got such an
1541 	object from someone else in the mean time, you'll still get this
1542 	one locked then.
1543 */
1544 static status_t
1545 create_advisory_locking(struct vnode* vnode)
1546 {
1547 	if (vnode == NULL)
1548 		return B_FILE_ERROR;
1549 
1550 	ObjectDeleter<advisory_locking> lockingDeleter;
1551 	struct advisory_locking* locking = NULL;
1552 
1553 	while (get_advisory_locking(vnode) == NULL) {
1554 		// no locking object set on the vnode yet, create one
1555 		if (locking == NULL) {
1556 			locking = new(std::nothrow) advisory_locking;
1557 			if (locking == NULL)
1558 				return B_NO_MEMORY;
1559 			lockingDeleter.SetTo(locking);
1560 
1561 			locking->wait_sem = create_sem(0, "advisory lock");
1562 			if (locking->wait_sem < 0)
1563 				return locking->wait_sem;
1564 
1565 			locking->lock = create_sem(0, "advisory locking");
1566 			if (locking->lock < 0)
1567 				return locking->lock;
1568 		}
1569 
1570 		// set our newly created locking object
1571 		ReadLocker _(sVnodeLock);
1572 		AutoLocker<Vnode> nodeLocker(vnode);
1573 		if (vnode->advisory_locking == NULL) {
1574 			vnode->advisory_locking = locking;
1575 			lockingDeleter.Detach();
1576 			return B_OK;
1577 		}
1578 	}
1579 
1580 	// The vnode already had a locking object. That's just as well.
1581 
1582 	return B_OK;
1583 }
1584 
1585 
1586 /*! Returns \c true when either \a flock is \c NULL or the \a flock intersects
1587 	with the advisory_lock \a lock.
1588 */
1589 static bool
1590 advisory_lock_intersects(struct advisory_lock* lock, struct flock* flock)
1591 {
1592 	if (flock == NULL)
1593 		return true;
1594 
1595 	return lock->start <= flock->l_start - 1 + flock->l_len
1596 		&& lock->end >= flock->l_start;
1597 }
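
// Example: a normalized flock with l_start 200 and l_len 100 covers the byte
// range [200, 299]; a lock spanning [250, 350] intersects it, since
// 250 <= 299 and 350 >= 200.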
1598 
1599 
1600 /*!	Tests whether acquiring a lock would block.
1601 */
1602 static status_t
1603 test_advisory_lock(struct vnode* vnode, struct flock* flock)
1604 {
1605 	bool shared = flock->l_type == F_RDLCK;
1606 	flock->l_type = F_UNLCK;
1607 
1608 	struct advisory_locking* locking = get_advisory_locking(vnode);
1609 	if (locking == NULL)
1610 		return B_OK;
1611 
1612 	team_id team = team_get_current_team_id();
1613 	LockList::Iterator iterator = locking->locks.GetIterator();
1614 	while (iterator.HasNext()) {
1615 		struct advisory_lock* lock = iterator.Next();
1616 
1617 		if (lock->team != team && advisory_lock_intersects(lock, flock)) {
1618 			// locks do overlap
1619 			if (!shared || !lock->shared) {
1620 				// collision
1621 				flock->l_type = lock->shared ? F_RDLCK : F_WRLCK;
1622 				flock->l_whence = SEEK_SET;
1623 				flock->l_start = lock->start;
1624 				flock->l_len = lock->end - lock->start + 1;
1625 				flock->l_pid = lock->team;
1626 				break;
1627 			}
1628 		}
1629 	}
1630 
1631 	put_advisory_locking(locking);
1632 	return B_OK;
1633 }
1634 
1635 
1636 /*!	Removes the specified lock, or all locks of the calling team
1637 	if \a flock is NULL.
1638 */
1639 static status_t
1640 release_advisory_lock(struct vnode* vnode, struct io_context* context,
1641 	struct file_descriptor* descriptor, struct flock* flock)
1642 {
1643 	FUNCTION(("release_advisory_lock(vnode = %p, flock = %p)\n", vnode, flock));
1644 
1645 	struct advisory_locking* locking = get_advisory_locking(vnode);
1646 	if (locking == NULL)
1647 		return B_OK;
1648 
1649 	// find matching lock entries
1650 
1651 	LockList::Iterator iterator = locking->locks.GetIterator();
1652 	while (iterator.HasNext()) {
1653 		struct advisory_lock* lock = iterator.Next();
1654 		bool removeLock = false;
1655 
1656 		if (descriptor != NULL && lock->bound_to == descriptor) {
1657 			// Remove flock() locks
1658 			removeLock = true;
1659 		} else if (lock->bound_to == context
1660 				&& advisory_lock_intersects(lock, flock)) {
1661 			// Remove POSIX locks
1662 			bool endsBeyond = false;
1663 			bool startsBefore = false;
1664 			if (flock != NULL) {
1665 				startsBefore = lock->start < flock->l_start;
1666 				endsBeyond = lock->end > flock->l_start - 1 + flock->l_len;
1667 			}
1668 
1669 			if (!startsBefore && !endsBeyond) {
1670 				// lock is completely contained in flock
1671 				removeLock = true;
1672 			} else if (startsBefore && !endsBeyond) {
1673 				// cut the end of the lock
1674 				lock->end = flock->l_start - 1;
1675 			} else if (!startsBefore && endsBeyond) {
1676 				// cut the start of the lock
1677 				lock->start = flock->l_start + flock->l_len;
1678 			} else {
1679 				// divide the lock into two locks
1680 				struct advisory_lock* secondLock = new advisory_lock;
1681 				if (secondLock == NULL) {
1682 					// TODO: we should probably revert the locks we already
1683 					// changed... (ie. allocate upfront)
1684 					put_advisory_locking(locking);
1685 					return B_NO_MEMORY;
1686 				}
1687 
1688 				lock->end = flock->l_start - 1;
1689 
1690 				secondLock->bound_to = context;
1691 				secondLock->team = lock->team;
1692 				secondLock->session = lock->session;
1693 				// values must already be normalized when getting here
1694 				secondLock->start = flock->l_start + flock->l_len;
1695 				secondLock->end = lock->end;
1696 				secondLock->shared = lock->shared;
1697 
1698 				locking->locks.Add(secondLock);
1699 			}
1700 		}
1701 
1702 		if (removeLock) {
1703 			// this lock is no longer used
1704 			iterator.Remove();
1705 			delete lock;
1706 		}
1707 	}
1708 
1709 	bool removeLocking = locking->locks.IsEmpty();
1710 	release_sem_etc(locking->wait_sem, 1, B_RELEASE_ALL);
1711 
1712 	put_advisory_locking(locking);
1713 
1714 	if (removeLocking) {
1715 		// We can remove the whole advisory locking structure; it's no
1716 		// longer used
1717 		locking = get_advisory_locking(vnode);
1718 		if (locking != NULL) {
1719 			ReadLocker locker(sVnodeLock);
1720 			AutoLocker<Vnode> nodeLocker(vnode);
1721 
1722 			// the locking could have been changed in the mean time
1723 			if (locking->locks.IsEmpty()) {
1724 				vnode->advisory_locking = NULL;
1725 				nodeLocker.Unlock();
1726 				locker.Unlock();
1727 
1728 				// we've detached the locking from the vnode, so we can
1729 				// safely delete it
1730 				delete locking;
1731 			} else {
1732 				// the locking is in use again
1733 				nodeLocker.Unlock();
1734 				locker.Unlock();
1735 				release_sem_etc(locking->lock, 1, B_DO_NOT_RESCHEDULE);
1736 			}
1737 		}
1738 	}
1739 
1740 	return B_OK;
1741 }
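
// Example for the splitting case above: releasing the range [40, 59] out of
// a held lock [0, 99] truncates that lock to [0, 39] and inserts a second
// lock [60, 99].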
1742 
1743 
1744 /*!	Acquires an advisory lock for the \a vnode. If \a wait is \c true, it
1745 	will wait for the lock to become available if there are any collisions;
1746 	otherwise it returns \c B_WOULD_BLOCK for flock() style locks and
1747 	\c B_PERMISSION_DENIED for POSIX locks.
1748 
1749 	If \a descriptor is NULL, POSIX semantics are used. Otherwise, BSD
1750 	flock() semantics apply, i.e. all children can unlock the file (we even
1751 	allow parents to remove the lock, in line with what the BSDs are doing).
1752 */
1753 static status_t
1754 acquire_advisory_lock(struct vnode* vnode, io_context* context,
1755 	struct file_descriptor* descriptor, struct flock* flock, bool wait)
1756 {
1757 	FUNCTION(("acquire_advisory_lock(vnode = %p, flock = %p, wait = %s)\n",
1758 		vnode, flock, wait ? "yes" : "no"));
1759 
1760 	bool shared = flock->l_type == F_RDLCK;
1761 	void* boundTo = descriptor != NULL ? (void*)descriptor : (void*)context;
1762 	status_t status = B_OK;
1763 
1764 	// TODO: do deadlock detection!
1765 
1766 	struct advisory_locking* locking;
1767 
1768 	while (true) {
1769 		// if this vnode has an advisory_locking structure attached,
1770 		// lock that one and search for any colliding file lock
1771 		status = create_advisory_locking(vnode);
1772 		if (status != B_OK)
1773 			return status;
1774 
1775 		locking = vnode->advisory_locking;
1776 		team_id team = team_get_current_team_id();
1777 		sem_id waitForLock = -1;
1778 
1779 		// test for collisions
1780 		LockList::Iterator iterator = locking->locks.GetIterator();
1781 		while (iterator.HasNext()) {
1782 			struct advisory_lock* lock = iterator.Next();
1783 
1784 			// TODO: locks from the same team might be joinable!
1785 			if ((lock->team != team || lock->bound_to != boundTo)
1786 					&& advisory_lock_intersects(lock, flock)) {
1787 				// locks do overlap
1788 				if (!shared || !lock->shared) {
1789 					// we need to wait
1790 					waitForLock = locking->wait_sem;
1791 					break;
1792 				}
1793 			}
1794 		}
1795 
1796 		if (waitForLock < 0)
1797 			break;
1798 
1799 		// We need to wait. Do that or fail now, if we've been asked not to.
1800 
1801 		if (!wait) {
1802 			put_advisory_locking(locking);
1803 			return descriptor != NULL ? B_WOULD_BLOCK : B_PERMISSION_DENIED;
1804 		}
1805 
1806 		status = switch_sem_etc(locking->lock, waitForLock, 1,
1807 			B_CAN_INTERRUPT, 0);
1808 		if (status != B_OK && status != B_BAD_SEM_ID)
1809 			return status;
1810 
1811 		// We have been notified, but we need to re-lock the locking object. So
1812 		// go another round...
1813 	}
1814 
1815 	// install new lock
1816 
1817 	struct advisory_lock* lock = new(std::nothrow) advisory_lock;
1818 		// deleted in release_advisory_lock()
1819 	if (lock == NULL) {
1820 		put_advisory_locking(locking);
1821 		return B_NO_MEMORY;
1822 	}
1823 
1824 	lock->bound_to = boundTo;
1825 	lock->team = team_get_current_team_id();
1826 	lock->session = thread_get_current_thread()->team->session_id;
1827 	// values must already be normalized when getting here
1828 	lock->start = flock->l_start;
1829 	lock->end = flock->l_start - 1 + flock->l_len;
1830 	lock->shared = shared;
1831 
1832 	locking->locks.Add(lock);
1833 	put_advisory_locking(locking);
1834 
1835 	return status;
1836 }
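

// A hypothetical sketch (not compiled) of how a POSIX F_SETLKW request maps
// onto the helpers above: normalize the flock first, then acquire with
// wait == true and without a descriptor binding (a non-NULL descriptor
// would request flock() semantics instead).
#if 0
static status_t
example_set_lock_wait(struct file_descriptor* descriptor, struct flock* flock)
{
	status_t status = normalize_flock(descriptor, flock);
	if (status != B_OK)
		return status;

	return acquire_advisory_lock(fd_vnode(descriptor),
		get_current_io_context(false), NULL, flock, true);
}
#endif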
1837 
1838 
1839 /*!	Normalizes the \a flock structure to make it easier to compare the
1840 	structure with others. The l_start and l_len fields are set to absolute
1841 	values according to the l_whence field.
1842 */
1843 static status_t
1844 normalize_flock(struct file_descriptor* descriptor, struct flock* flock)
1845 {
1846 	switch (flock->l_whence) {
1847 		case SEEK_SET:
1848 			break;
1849 		case SEEK_CUR:
1850 			flock->l_start += descriptor->pos;
1851 			break;
1852 		case SEEK_END:
1853 		{
1854 			struct vnode* vnode = descriptor->u.vnode;
1855 			struct stat stat;
1856 			status_t status;
1857 
1858 			if (!HAS_FS_CALL(vnode, read_stat))
1859 				return B_UNSUPPORTED;
1860 
1861 			status = FS_CALL(vnode, read_stat, &stat);
1862 			if (status != B_OK)
1863 				return status;
1864 
1865 			flock->l_start += stat.st_size;
1866 			break;
1867 		}
1868 		default:
1869 			return B_BAD_VALUE;
1870 	}
1871 
1872 	if (flock->l_start < 0)
1873 		flock->l_start = 0;
1874 	if (flock->l_len == 0)
1875 		flock->l_len = OFF_MAX;
1876 
1877 	// don't let the offset and length overflow
1878 	if (flock->l_start > 0 && OFF_MAX - flock->l_start < flock->l_len)
1879 		flock->l_len = OFF_MAX - flock->l_start;
1880 
1881 	if (flock->l_len < 0) {
1882 		// a negative length reverses the region
1883 		flock->l_start += flock->l_len;
1884 		flock->l_len = -flock->l_len;
1885 	}
1886 
1887 	return B_OK;
1888 }
1889 
1890 
1891 static void
1892 replace_vnode_if_disconnected(struct fs_mount* mount,
1893 	struct vnode* vnodeToDisconnect, struct vnode*& vnode,
1894 	struct vnode* fallBack, bool lockRootLock)
1895 {
1896 	struct vnode* givenVnode = vnode;
1897 	bool vnodeReplaced = false;
1898 
1899 	ReadLocker vnodeReadLocker(sVnodeLock);
1900 
1901 	if (lockRootLock)
1902 		mutex_lock(&sIOContextRootLock);
1903 
1904 	while (vnode != NULL && vnode->mount == mount
1905 		&& (vnodeToDisconnect == NULL || vnodeToDisconnect == vnode)) {
1906 		if (vnode->covers != NULL) {
1907 			// redirect the vnode to the covered vnode
1908 			vnode = vnode->covers;
1909 		} else
1910 			vnode = fallBack;
1911 
1912 		vnodeReplaced = true;
1913 	}
1914 
1915 	// If we've replaced the node, grab a reference for the new one.
1916 	if (vnodeReplaced && vnode != NULL)
1917 		inc_vnode_ref_count(vnode);
1918 
1919 	if (lockRootLock)
1920 		mutex_unlock(&sIOContextRootLock);
1921 
1922 	vnodeReadLocker.Unlock();
1923 
1924 	if (vnodeReplaced)
1925 		put_vnode(givenVnode);
1926 }
1927 
1928 
1929 /*!	Disconnects all file descriptors that are associated with the
1930 	\a vnodeToDisconnect, or if this is NULL, all vnodes of the specified
1931 	\a mount object.
1932 
1933 	Note, after you've called this function, there might still be ongoing
	accesses - they won't be interrupted if they were already in progress.
1935 	However, any subsequent access will fail.
1936 
1937 	This is not a cheap function and should be used with care and rarely.
1938 	TODO: there is currently no means to stop a blocking read/write!
1939 */
1940 static void
1941 disconnect_mount_or_vnode_fds(struct fs_mount* mount,
1942 	struct vnode* vnodeToDisconnect)
1943 {
1944 	// iterate over all teams and peek into their file descriptors
1945 	TeamListIterator teamIterator;
1946 	while (Team* team = teamIterator.Next()) {
1947 		BReference<Team> teamReference(team, true);
1948 		TeamLocker teamLocker(team);
1949 
1950 		// lock the I/O context
1951 		io_context* context = team->io_context;
1952 		if (context == NULL)
1953 			continue;
1954 		MutexLocker contextLocker(context->io_mutex);
1955 
1956 		teamLocker.Unlock();
1957 
1958 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->root,
1959 			sRoot, true);
1960 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->cwd,
1961 			sRoot, false);
1962 
1963 		for (uint32 i = 0; i < context->table_size; i++) {
1964 			struct file_descriptor* descriptor = context->fds[i];
1965 			if (descriptor == NULL || (descriptor->open_mode & O_DISCONNECTED) != 0)
1966 				continue;
1967 
1968 			inc_fd_ref_count(descriptor);
1969 
1970 			// if this descriptor points at this mount, we
1971 			// need to disconnect it to be able to unmount
1972 			struct vnode* vnode = fd_vnode(descriptor);
1973 			if (vnodeToDisconnect != NULL) {
1974 				if (vnode == vnodeToDisconnect)
1975 					disconnect_fd(descriptor);
1976 			} else if ((vnode != NULL && vnode->mount == mount)
1977 				|| (vnode == NULL && descriptor->u.mount == mount))
1978 				disconnect_fd(descriptor);
1979 
1980 			put_fd(descriptor);
1981 		}
1982 	}
1983 }
1984 
1985 
1986 /*!	\brief Gets the root node of the current IO context.
1987 	If \a kernel is \c true, the kernel IO context will be used.
1988 	The caller obtains a reference to the returned node.
1989 */
1990 struct vnode*
1991 get_root_vnode(bool kernel)
1992 {
1993 	if (!kernel) {
		// Get the root from the current IO context
1995 		struct io_context* context = get_current_io_context(kernel);
1996 
1997 		mutex_lock(&sIOContextRootLock);
1998 
1999 		struct vnode* root = context->root;
2000 		if (root != NULL)
2001 			inc_vnode_ref_count(root);
2002 
2003 		mutex_unlock(&sIOContextRootLock);
2004 
2005 		if (root != NULL)
2006 			return root;
2007 
2008 		// That should never happen.
2009 		dprintf("get_root_vnode(): IO context for team %" B_PRId32 " doesn't "
2010 			"have a root\n", team_get_current_team_id());
2011 	}
2012 
2013 	inc_vnode_ref_count(sRoot);
2014 	return sRoot;
2015 }
2016 
2017 
2018 /*!	\brief Gets the directory path and leaf name for a given path.
2019 
	The supplied \a path is transformed in place to refer to the directory
	part of the entry identified by the original path, and the leaf name of
	the original entry is written into the buffer \a filename.
2023 	Neither the returned path nor the leaf name can be expected to be
2024 	canonical.
2025 
2026 	\param path The path to be analyzed. Must be able to store at least one
2027 		   additional character.
2028 	\param filename The buffer into which the leaf name will be written.
2029 		   Must be of size B_FILE_NAME_LENGTH at least.
2030 	\return \c B_OK, if everything went fine, \c B_NAME_TOO_LONG, if the leaf
2031 		   name is longer than \c B_FILE_NAME_LENGTH, or \c B_ENTRY_NOT_FOUND,
2032 		   if the given path name is empty.
2033 */
2034 static status_t
2035 get_dir_path_and_leaf(char* path, char* filename)
2036 {
2037 	if (*path == '\0')
2038 		return B_ENTRY_NOT_FOUND;
2039 
2040 	char* last = strrchr(path, '/');
		// '/' is not allowed in file names!
2042 
2043 	FUNCTION(("get_dir_path_and_leaf(path = %s)\n", path));
2044 
2045 	if (last == NULL) {
		// this path is a single segment with no '/' in it,
		// e.g. "foo"
2048 		if (strlcpy(filename, path, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2049 			return B_NAME_TOO_LONG;
2050 
2051 		strcpy(path, ".");
2052 	} else {
2053 		last++;
2054 		if (last[0] == '\0') {
2055 			// special case: the path ends in one or more '/' - remove them
2056 			while (*--last == '/' && last != path);
2057 			last[1] = '\0';
2058 
2059 			if (last == path && last[0] == '/') {
2060 				// This path points to the root of the file system
2061 				strcpy(filename, ".");
2062 				return B_OK;
2063 			}
2064 			for (; last != path && *(last - 1) != '/'; last--);
2065 				// rewind to the start of the leaf before the '/'
2066 		}
2067 
2068 		// normal leaf: replace the leaf portion of the path with a '.'
2069 		if (strlcpy(filename, last, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2070 			return B_NAME_TOO_LONG;
2071 
2072 		last[0] = '.';
2073 		last[1] = '\0';
2074 	}
2075 	return B_OK;
2076 }
2077 
2078 
2079 static status_t
2080 entry_ref_to_vnode(dev_t mountID, ino_t directoryID, const char* name,
2081 	bool traverse, bool kernel, struct vnode** _vnode)
2082 {
2083 	char clonedName[B_FILE_NAME_LENGTH + 1];
2084 	if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2085 		return B_NAME_TOO_LONG;
2086 
2087 	// get the directory vnode and let vnode_path_to_vnode() do the rest
2088 	struct vnode* directory;
2089 
2090 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
2091 	if (status < 0)
2092 		return status;
2093 
2094 	return vnode_path_to_vnode(directory, clonedName, traverse, 0, kernel,
2095 		_vnode, NULL);
2096 }
2097 
2098 
2099 /*!	Looks up the entry with name \a name in the directory represented by \a dir
2100 	and returns the respective vnode.
2101 	On success a reference to the vnode is acquired for the caller.
2102 */
2103 static status_t
2104 lookup_dir_entry(struct vnode* dir, const char* name, struct vnode** _vnode)
2105 {
2106 	ino_t id;
2107 	bool missing;
2108 
2109 	if (dir->mount->entry_cache.Lookup(dir->id, name, id, missing)) {
2110 		return missing ? B_ENTRY_NOT_FOUND
2111 			: get_vnode(dir->device, id, _vnode, true, false);
2112 	}
2113 
2114 	status_t status = FS_CALL(dir, lookup, name, &id);
2115 	if (status != B_OK)
2116 		return status;
2117 
	// The lookup() hook calls get_vnode() or publish_vnode(), so we already
	// have a reference and just need to look the node up.
2120 	rw_lock_read_lock(&sVnodeLock);
2121 	*_vnode = lookup_vnode(dir->device, id);
2122 	rw_lock_read_unlock(&sVnodeLock);
2123 
2124 	if (*_vnode == NULL) {
2125 		panic("lookup_dir_entry(): could not lookup vnode (mountid 0x%" B_PRIx32
2126 			" vnid 0x%" B_PRIx64 ")\n", dir->device, id);
2127 		return B_ENTRY_NOT_FOUND;
2128 	}
2129 
2130 //	ktrace_printf("lookup_dir_entry(): dir: %p (%ld, %lld), name: \"%s\" -> "
2131 //		"%p (%ld, %lld)", dir, dir->mount->id, dir->id, name, *_vnode,
2132 //		(*_vnode)->mount->id, (*_vnode)->id);
2133 
2134 	return B_OK;
2135 }
2136 
2137 
2138 /*!	Returns the vnode for the relative path starting at the specified \a vnode.
2139 	\a path must not be NULL.
2140 	If it returns successfully, \a path contains the name of the last path
2141 	component. This function clobbers the buffer pointed to by \a path only
	if it contains more than one component.
	Note, this reduces the ref_count of the starting \a vnode, whether it
	succeeds or not!
2145 */
2146 static status_t
2147 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2148 	int count, struct io_context* ioContext, struct vnode** _vnode,
2149 	ino_t* _parentID)
2150 {
2151 	status_t status = B_OK;
2152 	ino_t lastParentID = vnode->id;
2153 
2154 	FUNCTION(("vnode_path_to_vnode(vnode = %p, path = %s)\n", vnode, path));
2155 
2156 	if (path == NULL) {
2157 		put_vnode(vnode);
2158 		return B_BAD_VALUE;
2159 	}
2160 
2161 	if (*path == '\0') {
2162 		put_vnode(vnode);
2163 		return B_ENTRY_NOT_FOUND;
2164 	}
2165 
2166 	while (true) {
2167 		struct vnode* nextVnode;
2168 		char* nextPath;
2169 
2170 		TRACE(("vnode_path_to_vnode: top of loop. p = %p, p = '%s'\n", path,
2171 			path));
2172 
2173 		// done?
2174 		if (path[0] == '\0')
2175 			break;
2176 
2177 		// walk to find the next path component ("path" will point to a single
2178 		// path component), and filter out multiple slashes
2179 		for (nextPath = path + 1; *nextPath != '\0' && *nextPath != '/';
2180 				nextPath++);
2181 
2182 		bool directoryFound = false;
2183 		if (*nextPath == '/') {
2184 			directoryFound = true;
2185 			*nextPath = '\0';
2186 			do
2187 				nextPath++;
2188 			while (*nextPath == '/');
2189 		}
2190 
		// If the '..' is at a covering vnode, move to the covered vnode,
		// so we pass the '..' path to the underlying file system.
		// Also prevent breaking out of the root of the IO context.
2194 		if (strcmp("..", path) == 0) {
2195 			if (vnode == ioContext->root) {
2196 				// Attempted prison break! Keep it contained.
2197 				path = nextPath;
2198 				continue;
2199 			}
2200 
2201 			if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2202 				nextVnode = coveredVnode;
2203 				put_vnode(vnode);
2204 				vnode = nextVnode;
2205 			}
2206 		}
2207 
2208 		// check if vnode is really a directory
2209 		if (status == B_OK && !S_ISDIR(vnode->Type()))
2210 			status = B_NOT_A_DIRECTORY;
2211 
2212 		// Check if we have the right to search the current directory vnode.
2213 		// If a file system doesn't have the access() function, we assume that
2214 		// searching a directory is always allowed
2215 		if (status == B_OK && HAS_FS_CALL(vnode, access))
2216 			status = FS_CALL(vnode, access, X_OK);
2217 
2218 		// Tell the filesystem to get the vnode of this path component (if we
2219 		// got the permission from the call above)
2220 		if (status == B_OK)
2221 			status = lookup_dir_entry(vnode, path, &nextVnode);
2222 
2223 		if (status != B_OK) {
2224 			put_vnode(vnode);
2225 			return status;
2226 		}
2227 
2228 		// If the new node is a symbolic link, resolve it (if we've been told
2229 		// to do it)
2230 		if (S_ISLNK(nextVnode->Type())
2231 			&& (traverseLeafLink || directoryFound)) {
2232 			size_t bufferSize;
2233 			char* buffer;
2234 
2235 			TRACE(("traverse link\n"));
2236 
2237 			// it's not exactly nice style using goto in this way, but hey,
2238 			// it works :-/
2239 			if (count + 1 > B_MAX_SYMLINKS) {
2240 				status = B_LINK_LIMIT;
2241 				goto resolve_link_error;
2242 			}
2243 
2244 			bufferSize = B_PATH_NAME_LENGTH;
2245 			buffer = (char*)object_cache_alloc(sPathNameCache, 0);
2246 			if (buffer == NULL) {
2247 				status = B_NO_MEMORY;
2248 				goto resolve_link_error;
2249 			}
2250 
2251 			if (HAS_FS_CALL(nextVnode, read_symlink)) {
2252 				bufferSize--;
2253 				status = FS_CALL(nextVnode, read_symlink, buffer, &bufferSize);
2254 				// null-terminate
2255 				if (status >= 0 && bufferSize < B_PATH_NAME_LENGTH)
2256 					buffer[bufferSize] = '\0';
2257 			} else
2258 				status = B_BAD_VALUE;
2259 
2260 			if (status != B_OK) {
2261 				free(buffer);
2262 
2263 		resolve_link_error:
2264 				put_vnode(vnode);
2265 				put_vnode(nextVnode);
2266 
2267 				return status;
2268 			}
2269 			put_vnode(nextVnode);
2270 
2271 			// Check if we start from the root directory or the current
2272 			// directory ("vnode" still points to that one).
2273 			// Cut off all leading slashes if it's the root directory
2274 			path = buffer;
2275 			bool absoluteSymlink = false;
2276 			if (path[0] == '/') {
2277 				// we don't need the old directory anymore
2278 				put_vnode(vnode);
2279 
2280 				while (*++path == '/')
2281 					;
2282 
2283 				mutex_lock(&sIOContextRootLock);
2284 				vnode = ioContext->root;
2285 				inc_vnode_ref_count(vnode);
2286 				mutex_unlock(&sIOContextRootLock);
2287 
2288 				absoluteSymlink = true;
2289 			}
2290 
2291 			inc_vnode_ref_count(vnode);
			// balance the next recursion - we will decrement the
			// ref_count of the vnode, whether we succeed or not
2294 
2295 			if (absoluteSymlink && *path == '\0') {
2296 				// symlink was just "/"
2297 				nextVnode = vnode;
2298 			} else {
2299 				status = vnode_path_to_vnode(vnode, path, true, count + 1,
2300 					ioContext, &nextVnode, &lastParentID);
2301 			}
2302 
2303 			object_cache_free(sPathNameCache, buffer, 0);
2304 
2305 			if (status != B_OK) {
2306 				put_vnode(vnode);
2307 				return status;
2308 			}
2309 		} else
2310 			lastParentID = vnode->id;
2311 
2312 		// decrease the ref count on the old dir we just looked up into
2313 		put_vnode(vnode);
2314 
2315 		path = nextPath;
2316 		vnode = nextVnode;
2317 
2318 		// see if we hit a covered node
2319 		if (Vnode* coveringNode = get_covering_vnode(vnode)) {
2320 			put_vnode(vnode);
2321 			vnode = coveringNode;
2322 		}
2323 	}
2324 
2325 	*_vnode = vnode;
2326 	if (_parentID)
2327 		*_parentID = lastParentID;
2328 
2329 	return B_OK;
2330 }
2331 
2332 
2333 static status_t
2334 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2335 	int count, bool kernel, struct vnode** _vnode, ino_t* _parentID)
2336 {
2337 	return vnode_path_to_vnode(vnode, path, traverseLeafLink, count,
2338 		get_current_io_context(kernel), _vnode, _parentID);
2339 }
2340 
2341 
2342 static status_t
2343 path_to_vnode(char* path, bool traverseLink, struct vnode** _vnode,
2344 	ino_t* _parentID, bool kernel)
2345 {
2346 	struct vnode* start = NULL;
2347 
2348 	FUNCTION(("path_to_vnode(path = \"%s\")\n", path));
2349 
2350 	if (!path)
2351 		return B_BAD_VALUE;
2352 
2353 	if (*path == '\0')
2354 		return B_ENTRY_NOT_FOUND;
2355 
2356 	// figure out if we need to start at root or at cwd
2357 	if (*path == '/') {
2358 		if (sRoot == NULL) {
2359 			// we're a bit early, aren't we?
2360 			return B_ERROR;
2361 		}
2362 
2363 		while (*++path == '/')
2364 			;
2365 		start = get_root_vnode(kernel);
2366 
2367 		if (*path == '\0') {
2368 			*_vnode = start;
2369 			return B_OK;
2370 		}
2371 
2372 	} else {
2373 		struct io_context* context = get_current_io_context(kernel);
2374 
2375 		mutex_lock(&context->io_mutex);
2376 		start = context->cwd;
2377 		if (start != NULL)
2378 			inc_vnode_ref_count(start);
2379 		mutex_unlock(&context->io_mutex);
2380 
2381 		if (start == NULL)
2382 			return B_ERROR;
2383 	}
2384 
2385 	return vnode_path_to_vnode(start, path, traverseLink, 0, kernel, _vnode,
2386 		_parentID);
2387 }
2388 
2389 
/*! Returns the vnode for the next-to-last segment of the path, and writes
	the last path component into \a filename.
2392 	The path buffer must be able to store at least one additional character.
2393 */
2394 static status_t
2395 path_to_dir_vnode(char* path, struct vnode** _vnode, char* filename,
2396 	bool kernel)
2397 {
2398 	status_t status = get_dir_path_and_leaf(path, filename);
2399 	if (status != B_OK)
2400 		return status;
2401 
2402 	return path_to_vnode(path, true, _vnode, NULL, kernel);
2403 }
2404 
2405 
2406 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2407 		   to by a FD + path pair.
2408 
2409 	\a path must be given in either case. \a fd might be omitted, in which
2410 	case \a path is either an absolute path or one relative to the current
	directory. If both are supplied and \a path is relative, it is reckoned
	off of the directory referred to by \a fd. If \a path is absolute, \a fd
	is ignored.
2414 
2415 	The caller has the responsibility to call put_vnode() on the returned
2416 	directory vnode.
2417 
2418 	\param fd The FD. May be < 0.
2419 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2420 	       is modified by this function. It must have at least room for a
2421 	       string one character longer than the path it contains.
2422 	\param _vnode A pointer to a variable the directory vnode shall be written
2423 		   into.
2424 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2425 		   the leaf name of the specified entry will be written.
2426 	\param kernel \c true, if invoked from inside the kernel, \c false if
2427 		   invoked from userland.
2428 	\return \c B_OK, if everything went fine, another error code otherwise.
2429 */
2430 static status_t
2431 fd_and_path_to_dir_vnode(int fd, char* path, struct vnode** _vnode,
2432 	char* filename, bool kernel)
2433 {
2434 	if (!path)
2435 		return B_BAD_VALUE;
2436 	if (*path == '\0')
2437 		return B_ENTRY_NOT_FOUND;
2438 	if (fd < 0)
2439 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2440 
2441 	status_t status = get_dir_path_and_leaf(path, filename);
2442 	if (status != B_OK)
2443 		return status;
2444 
2445 	return fd_and_path_to_vnode(fd, path, true, _vnode, NULL, kernel);
2446 }
2447 
2448 
2449 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2450 		   to by a vnode + path pair.
2451 
2452 	\a path must be given in either case. \a vnode might be omitted, in which
2453 	case \a path is either an absolute path or one relative to the current
	directory. If both are supplied and \a path is relative, it is reckoned
	off of the directory referred to by \a vnode. If \a path is absolute,
	\a vnode is ignored.
2457 
2458 	The caller has the responsibility to call put_vnode() on the returned
2459 	directory vnode.
2460 
2461 	\param vnode The vnode. May be \c NULL.
2462 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2463 	       is modified by this function. It must have at least room for a
2464 	       string one character longer than the path it contains.
2465 	\param _vnode A pointer to a variable the directory vnode shall be written
2466 		   into.
2467 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2468 		   the leaf name of the specified entry will be written.
2469 	\param kernel \c true, if invoked from inside the kernel, \c false if
2470 		   invoked from userland.
2471 	\return \c B_OK, if everything went fine, another error code otherwise.
2472 */
2473 static status_t
2474 vnode_and_path_to_dir_vnode(struct vnode* vnode, char* path,
2475 	struct vnode** _vnode, char* filename, bool kernel)
2476 {
2477 	if (!path)
2478 		return B_BAD_VALUE;
2479 	if (*path == '\0')
2480 		return B_ENTRY_NOT_FOUND;
2481 	if (vnode == NULL || path[0] == '/')
2482 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2483 
2484 	status_t status = get_dir_path_and_leaf(path, filename);
2485 	if (status != B_OK)
2486 		return status;
2487 
2488 	inc_vnode_ref_count(vnode);
2489 		// vnode_path_to_vnode() always decrements the ref count
2490 
2491 	return vnode_path_to_vnode(vnode, path, true, 0, kernel, _vnode, NULL);
2492 }
2493 
2494 
2495 /*! Returns a vnode's name in the d_name field of a supplied dirent buffer.
2496 */
2497 static status_t
2498 get_vnode_name(struct vnode* vnode, struct vnode* parent, struct dirent* buffer,
2499 	size_t bufferSize, struct io_context* ioContext)
2500 {
2501 	if (bufferSize < sizeof(struct dirent))
2502 		return B_BAD_VALUE;
2503 
2504 	// See if the vnode is covering another vnode and move to the covered
2505 	// vnode so we get the underlying file system
2506 	VNodePutter vnodePutter;
2507 	if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2508 		vnode = coveredVnode;
2509 		vnodePutter.SetTo(vnode);
2510 	}
2511 
2512 	if (HAS_FS_CALL(vnode, get_vnode_name)) {
2513 		// The FS supports getting the name of a vnode.
2514 		if (FS_CALL(vnode, get_vnode_name, buffer->d_name,
2515 			(char*)buffer + bufferSize - buffer->d_name) == B_OK)
2516 			return B_OK;
2517 	}
2518 
2519 	// The FS doesn't support getting the name of a vnode. So we search the
2520 	// parent directory for the vnode, if the caller let us.
2521 
2522 	if (parent == NULL || !HAS_FS_CALL(parent, read_dir))
2523 		return B_UNSUPPORTED;
2524 
2525 	void* cookie;
2526 
2527 	status_t status = FS_CALL(parent, open_dir, &cookie);
2528 	if (status >= B_OK) {
2529 		while (true) {
2530 			uint32 num = 1;
2531 			// We use the FS hook directly instead of dir_read(), since we don't
2532 			// want the entries to be fixed. We have already resolved vnode to
2533 			// the covered node.
2534 			status = FS_CALL(parent, read_dir, cookie, buffer, bufferSize,
2535 				&num);
2536 			if (status != B_OK)
2537 				break;
2538 			if (num == 0) {
2539 				status = B_ENTRY_NOT_FOUND;
2540 				break;
2541 			}
2542 
2543 			if (vnode->id == buffer->d_ino) {
2544 				// found correct entry!
2545 				break;
2546 			}
2547 		}
2548 
2549 		FS_CALL(parent, close_dir, cookie);
2550 		FS_CALL(parent, free_dir_cookie, cookie);
2551 	}
2552 	return status;
2553 }
2554 
2555 
2556 static status_t
2557 get_vnode_name(struct vnode* vnode, struct vnode* parent, char* name,
2558 	size_t nameSize, bool kernel)
2559 {
2560 	char buffer[offsetof(struct dirent, d_name) + B_FILE_NAME_LENGTH + 1];
2561 	struct dirent* dirent = (struct dirent*)buffer;
2562 
2563 	status_t status = get_vnode_name(vnode, parent, dirent, sizeof(buffer),
2564 		get_current_io_context(kernel));
2565 	if (status != B_OK)
2566 		return status;
2567 
2568 	if (strlcpy(name, dirent->d_name, nameSize) >= nameSize)
2569 		return B_BUFFER_OVERFLOW;
2570 
2571 	return B_OK;
2572 }
2573 
2574 
2575 /*!	Gets the full path to a given directory vnode.
2576 	It uses the fs_get_vnode_name() call to get the name of a vnode; if a
2577 	file system doesn't support this call, it will fall back to iterating
2578 	through the parent directory to get the name of the child.
2579 
2580 	To protect against circular loops, it supports a maximum tree depth
2581 	of 256 levels.
2582 
	Note that the path may no longer be correct by the time this function
	returns! It doesn't use any locking to ensure that the returned path is
	still valid, as paths aren't safe anyway: the path to a file can change
	at any time.

	It might be a good idea, though, to check in the calling function whether
	the returned path exists (it's not done here for efficiency reasons).
2589 */
2590 static status_t
2591 dir_vnode_to_path(struct vnode* vnode, char* buffer, size_t bufferSize,
2592 	bool kernel)
2593 {
2594 	FUNCTION(("dir_vnode_to_path(%p, %p, %lu)\n", vnode, buffer, bufferSize));
2595 
2596 	if (vnode == NULL || buffer == NULL || bufferSize == 0)
2597 		return B_BAD_VALUE;
2598 
2599 	if (!S_ISDIR(vnode->Type()))
2600 		return B_NOT_A_DIRECTORY;
2601 
2602 	char* path = buffer;
2603 	int32 insert = bufferSize;
2604 	int32 maxLevel = 256;
2605 	int32 length;
2606 	status_t status = B_OK;
2607 	struct io_context* ioContext = get_current_io_context(kernel);
2608 
2609 	// we don't use get_vnode() here because this call is more
2610 	// efficient and does all we need from get_vnode()
2611 	inc_vnode_ref_count(vnode);
2612 
2613 	path[--insert] = '\0';
2614 		// the path is filled right to left
2615 
2616 	while (true) {
2617 		// If the node is the context's root, bail out. Otherwise resolve mount
2618 		// points.
2619 		if (vnode == ioContext->root)
2620 			break;
2621 
2622 		if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2623 			put_vnode(vnode);
2624 			vnode = coveredVnode;
2625 		}
2626 
2627 		// lookup the parent vnode
2628 		struct vnode* parentVnode;
2629 		status = lookup_dir_entry(vnode, "..", &parentVnode);
2630 		if (status != B_OK)
2631 			goto out;
2632 
2633 		if (parentVnode == vnode) {
2634 			// The caller apparently got their hands on a node outside of their
2635 			// context's root. Now we've hit the global root.
2636 			put_vnode(parentVnode);
2637 			break;
2638 		}
2639 
2640 		// get the node's name
2641 		char nameBuffer[offsetof(struct dirent, d_name) + B_FILE_NAME_LENGTH + 1];
2642 			// also used for fs_read_dir()
2643 		char* name = &((struct dirent*)nameBuffer)->d_name[0];
2644 		status = get_vnode_name(vnode, parentVnode, (struct dirent*)nameBuffer,
2645 			sizeof(nameBuffer), ioContext);
2646 
2647 		// release the current vnode, we only need its parent from now on
2648 		put_vnode(vnode);
2649 		vnode = parentVnode;
2650 
2651 		if (status != B_OK)
2652 			goto out;
2653 
2654 		// TODO: add an explicit check for loops in about 10 levels to do
2655 		// real loop detection
2656 
		// don't go deeper than 'maxLevel' to prevent circular loops
2658 		if (maxLevel-- < 0) {
2659 			status = B_LINK_LIMIT;
2660 			goto out;
2661 		}
2662 
2663 		// add the name in front of the current path
2664 		name[B_FILE_NAME_LENGTH - 1] = '\0';
2665 		length = strlen(name);
2666 		insert -= length;
2667 		if (insert <= 0) {
2668 			status = B_RESULT_NOT_REPRESENTABLE;
2669 			goto out;
2670 		}
2671 		memcpy(path + insert, name, length);
2672 		path[--insert] = '/';
2673 	}
2674 
2675 	// the root dir will result in an empty path: fix it
2676 	if (path[insert] == '\0')
2677 		path[--insert] = '/';
2678 
2679 	TRACE(("  path is: %s\n", path + insert));
2680 
2681 	// move the path to the start of the buffer
2682 	length = bufferSize - insert;
2683 	memmove(buffer, path + insert, length);
2684 
2685 out:
2686 	put_vnode(vnode);
2687 	return status;
2688 }
2689 
2690 
2691 /*!	Checks the length of every path component, and adds a '.'
2692 	if the path ends in a slash.
2693 	The given path buffer must be able to store at least one
2694 	additional character.
2695 */
2696 static status_t
2697 check_path(char* to)
2698 {
2699 	int32 length = 0;
2700 
2701 	// check length of every path component
2702 
2703 	while (*to) {
2704 		char* begin;
2705 		if (*to == '/')
2706 			to++, length++;
2707 
2708 		begin = to;
2709 		while (*to != '/' && *to)
2710 			to++, length++;
2711 
2712 		if (to - begin > B_FILE_NAME_LENGTH)
2713 			return B_NAME_TOO_LONG;
2714 	}
2715 
2716 	if (length == 0)
2717 		return B_ENTRY_NOT_FOUND;
2718 
2719 	// complete path if there is a slash at the end
2720 
2721 	if (*(to - 1) == '/') {
2722 		if (length > B_PATH_NAME_LENGTH - 2)
2723 			return B_NAME_TOO_LONG;
2724 
2725 		to[0] = '.';
2726 		to[1] = '\0';
2727 	}
2728 
2729 	return B_OK;
2730 }
2731 
2732 
2733 static struct file_descriptor*
2734 get_fd_and_vnode(int fd, struct vnode** _vnode, bool kernel)
2735 {
2736 	struct file_descriptor* descriptor
2737 		= get_fd(get_current_io_context(kernel), fd);
2738 	if (descriptor == NULL)
2739 		return NULL;
2740 
2741 	struct vnode* vnode = fd_vnode(descriptor);
2742 	if (vnode == NULL) {
2743 		put_fd(descriptor);
2744 		return NULL;
2745 	}
2746 
2747 	// ToDo: when we can close a file descriptor at any point, investigate
2748 	//	if this is still valid to do (accessing the vnode without ref_count
2749 	//	or locking)
2750 	*_vnode = vnode;
2751 	return descriptor;
2752 }
2753 
2754 
2755 static struct vnode*
2756 get_vnode_from_fd(int fd, bool kernel)
2757 {
2758 	struct file_descriptor* descriptor;
2759 	struct vnode* vnode;
2760 
2761 	descriptor = get_fd(get_current_io_context(kernel), fd);
2762 	if (descriptor == NULL)
2763 		return NULL;
2764 
2765 	vnode = fd_vnode(descriptor);
2766 	if (vnode != NULL)
2767 		inc_vnode_ref_count(vnode);
2768 
2769 	put_fd(descriptor);
2770 	return vnode;
2771 }
2772 
2773 
2774 /*!	Gets the vnode from an FD + path combination. If \a fd is lower than zero,
2775 	only the path will be considered. In this case, the \a path must not be
2776 	NULL.
2777 	If \a fd is a valid file descriptor, \a path may be NULL for directories,
2778 	and should be NULL for files.
2779 */
2780 static status_t
2781 fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
2782 	struct vnode** _vnode, ino_t* _parentID, bool kernel)
2783 {
2784 	if (fd < 0 && !path)
2785 		return B_BAD_VALUE;
2786 
2787 	if (path != NULL && *path == '\0')
2788 		return B_ENTRY_NOT_FOUND;
2789 
2790 	if (fd < 0 || (path != NULL && path[0] == '/')) {
2791 		// no FD or absolute path
2792 		return path_to_vnode(path, traverseLeafLink, _vnode, _parentID, kernel);
2793 	}
2794 
2795 	// FD only, or FD + relative path
2796 	struct vnode* vnode = get_vnode_from_fd(fd, kernel);
2797 	if (vnode == NULL)
2798 		return B_FILE_ERROR;
2799 
2800 	if (path != NULL) {
2801 		return vnode_path_to_vnode(vnode, path, traverseLeafLink, 0, kernel,
2802 			_vnode, _parentID);
2803 	}
2804 
2805 	// there is no relative path to take into account
2806 
2807 	*_vnode = vnode;
2808 	if (_parentID)
2809 		*_parentID = -1;
2810 
2811 	return B_OK;
2812 }
2813 
2814 
2815 static int
2816 get_new_fd(int type, struct fs_mount* mount, struct vnode* vnode,
2817 	void* cookie, int openMode, bool kernel)
2818 {
2819 	struct file_descriptor* descriptor;
2820 	int fd;
2821 
	// If the vnode is locked, we don't allow creating a new file or
	// directory file_descriptor for it
2824 	if (vnode && vnode->mandatory_locked_by != NULL
2825 		&& (type == FDTYPE_FILE || type == FDTYPE_DIR))
2826 		return B_BUSY;
2827 
2828 	descriptor = alloc_fd();
2829 	if (!descriptor)
2830 		return B_NO_MEMORY;
2831 
2832 	if (vnode)
2833 		descriptor->u.vnode = vnode;
2834 	else
2835 		descriptor->u.mount = mount;
2836 	descriptor->cookie = cookie;
2837 
2838 	switch (type) {
2839 		// vnode types
2840 		case FDTYPE_FILE:
2841 			descriptor->ops = &sFileOps;
2842 			break;
2843 		case FDTYPE_DIR:
2844 			descriptor->ops = &sDirectoryOps;
2845 			break;
2846 		case FDTYPE_ATTR:
2847 			descriptor->ops = &sAttributeOps;
2848 			break;
2849 		case FDTYPE_ATTR_DIR:
2850 			descriptor->ops = &sAttributeDirectoryOps;
2851 			break;
2852 
2853 		// mount types
2854 		case FDTYPE_INDEX_DIR:
2855 			descriptor->ops = &sIndexDirectoryOps;
2856 			break;
2857 		case FDTYPE_QUERY:
2858 			descriptor->ops = &sQueryOps;
2859 			break;
2860 
2861 		default:
2862 			panic("get_new_fd() called with unknown type %d\n", type);
2863 			break;
2864 	}
2865 	descriptor->type = type;
2866 	descriptor->open_mode = openMode;
2867 
2868 	io_context* context = get_current_io_context(kernel);
2869 	fd = new_fd(context, descriptor);
2870 	if (fd < 0) {
2871 		descriptor->ops = NULL;
2872 		put_fd(descriptor);
2873 		return B_NO_MORE_FDS;
2874 	}
2875 
2876 	mutex_lock(&context->io_mutex);
2877 	fd_set_close_on_exec(context, fd, (openMode & O_CLOEXEC) != 0);
2878 	mutex_unlock(&context->io_mutex);
2879 
2880 	return fd;
2881 }
2882 
2883 
2884 /*!	In-place normalizes \a path. It's otherwise semantically equivalent to
2885 	vfs_normalize_path(). See there for more documentation.
2886 */
2887 static status_t
2888 normalize_path(char* path, size_t pathSize, bool traverseLink, bool kernel)
2889 {
2890 	VNodePutter dirPutter;
2891 	struct vnode* dir = NULL;
2892 	status_t error;
2893 
2894 	for (int i = 0; i < B_MAX_SYMLINKS; i++) {
2895 		// get dir vnode + leaf name
2896 		struct vnode* nextDir;
2897 		char leaf[B_FILE_NAME_LENGTH];
2898 		error = vnode_and_path_to_dir_vnode(dir, path, &nextDir, leaf, kernel);
2899 		if (error != B_OK)
2900 			return error;
2901 
2902 		dir = nextDir;
2903 		strcpy(path, leaf);
2904 		dirPutter.SetTo(dir);
2905 
2906 		// get file vnode, if we shall resolve links
2907 		bool fileExists = false;
2908 		struct vnode* fileVnode;
2909 		VNodePutter fileVnodePutter;
2910 		if (traverseLink) {
2911 			inc_vnode_ref_count(dir);
2912 			if (vnode_path_to_vnode(dir, path, false, 0, kernel, &fileVnode,
2913 					NULL) == B_OK) {
2914 				fileVnodePutter.SetTo(fileVnode);
2915 				fileExists = true;
2916 			}
2917 		}
2918 
2919 		if (!fileExists || !traverseLink || !S_ISLNK(fileVnode->Type())) {
2920 			// we're done -- construct the path
2921 			bool hasLeaf = true;
2922 			if (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0) {
2923 				// special cases "." and ".." -- get the dir, forget the leaf
2924 				inc_vnode_ref_count(dir);
2925 				error = vnode_path_to_vnode(dir, leaf, false, 0, kernel,
2926 					&nextDir, NULL);
2927 				if (error != B_OK)
2928 					return error;
2929 				dir = nextDir;
2930 				dirPutter.SetTo(dir);
2931 				hasLeaf = false;
2932 			}
2933 
2934 			// get the directory path
2935 			error = dir_vnode_to_path(dir, path, B_PATH_NAME_LENGTH, kernel);
2936 			if (error != B_OK)
2937 				return error;
2938 
2939 			// append the leaf name
2940 			if (hasLeaf) {
2941 				// insert a directory separator if this is not the file system
2942 				// root
2943 				if ((strcmp(path, "/") != 0
2944 					&& strlcat(path, "/", pathSize) >= pathSize)
2945 					|| strlcat(path, leaf, pathSize) >= pathSize) {
2946 					return B_NAME_TOO_LONG;
2947 				}
2948 			}
2949 
2950 			return B_OK;
2951 		}
2952 
2953 		// read link
2954 		if (HAS_FS_CALL(fileVnode, read_symlink)) {
2955 			size_t bufferSize = B_PATH_NAME_LENGTH - 1;
2956 			error = FS_CALL(fileVnode, read_symlink, path, &bufferSize);
2957 			if (error != B_OK)
2958 				return error;
2959 			if (bufferSize < B_PATH_NAME_LENGTH)
2960 				path[bufferSize] = '\0';
2961 		} else
2962 			return B_BAD_VALUE;
2963 	}
2964 
2965 	return B_LINK_LIMIT;
2966 }
2967 
2968 
2969 static status_t
2970 resolve_covered_parent(struct vnode* parent, dev_t* _device, ino_t* _node,
2971 	struct io_context* ioContext)
2972 {
2973 	// Make sure the IO context root is not bypassed.
2974 	if (parent == ioContext->root) {
2975 		*_device = parent->device;
2976 		*_node = parent->id;
2977 		return B_OK;
2978 	}
2979 
2980 	inc_vnode_ref_count(parent);
2981 		// vnode_path_to_vnode() puts the node
2982 
2983 	// ".." is guaranteed not to be clobbered by this call
2984 	struct vnode* vnode;
2985 	status_t status = vnode_path_to_vnode(parent, (char*)"..", false, 0,
2986 		ioContext, &vnode, NULL);
2987 	if (status == B_OK) {
2988 		*_device = vnode->device;
2989 		*_node = vnode->id;
2990 		put_vnode(vnode);
2991 	}
2992 
2993 	return status;
2994 }
2995 
2996 
2997 #ifdef ADD_DEBUGGER_COMMANDS
2998 
2999 
3000 static void
3001 _dump_advisory_locking(advisory_locking* locking)
3002 {
3003 	if (locking == NULL)
3004 		return;
3005 
3006 	kprintf("   lock:        %" B_PRId32, locking->lock);
3007 	kprintf("   wait_sem:    %" B_PRId32, locking->wait_sem);
3008 
3009 	int32 index = 0;
3010 	LockList::Iterator iterator = locking->locks.GetIterator();
3011 	while (iterator.HasNext()) {
3012 		struct advisory_lock* lock = iterator.Next();
3013 
3014 		kprintf("   [%2" B_PRId32 "] team:   %" B_PRId32 "\n", index++, lock->team);
3015 		kprintf("        start:  %" B_PRIdOFF "\n", lock->start);
3016 		kprintf("        end:    %" B_PRIdOFF "\n", lock->end);
3017 		kprintf("        shared? %s\n", lock->shared ? "yes" : "no");
3018 	}
3019 }
3020 
3021 
3022 static void
3023 _dump_mount(struct fs_mount* mount)
3024 {
3025 	kprintf("MOUNT: %p\n", mount);
3026 	kprintf(" id:            %" B_PRIdDEV "\n", mount->id);
3027 	kprintf(" device_name:   %s\n", mount->device_name);
3028 	kprintf(" root_vnode:    %p\n", mount->root_vnode);
3029 	kprintf(" covers:        %p\n", mount->root_vnode->covers);
3030 	kprintf(" partition:     %p\n", mount->partition);
3031 	kprintf(" lock:          %p\n", &mount->lock);
3032 	kprintf(" flags:        %s%s\n", mount->unmounting ? " unmounting" : "",
3033 		mount->owns_file_device ? " owns_file_device" : "");
3034 
3035 	fs_volume* volume = mount->volume;
3036 	while (volume != NULL) {
3037 		kprintf(" volume %p:\n", volume);
3038 		kprintf("  layer:            %" B_PRId32 "\n", volume->layer);
3039 		kprintf("  private_volume:   %p\n", volume->private_volume);
3040 		kprintf("  ops:              %p\n", volume->ops);
3041 		kprintf("  file_system:      %p\n", volume->file_system);
3042 		kprintf("  file_system_name: %s\n", volume->file_system_name);
3043 		volume = volume->super_volume;
3044 	}
3045 
3046 	set_debug_variable("_volume", (addr_t)mount->volume->private_volume);
3047 	set_debug_variable("_root", (addr_t)mount->root_vnode);
3048 	set_debug_variable("_covers", (addr_t)mount->root_vnode->covers);
3049 	set_debug_variable("_partition", (addr_t)mount->partition);
3050 }
3051 
3052 
3053 static bool
3054 debug_prepend_vnode_name_to_path(char* buffer, size_t& bufferSize,
3055 	const char* name)
3056 {
3057 	bool insertSlash = buffer[bufferSize] != '\0';
3058 	size_t nameLength = strlen(name);
3059 
3060 	if (bufferSize < nameLength + (insertSlash ? 1 : 0))
3061 		return false;
3062 
3063 	if (insertSlash)
3064 		buffer[--bufferSize] = '/';
3065 
3066 	bufferSize -= nameLength;
3067 	memcpy(buffer + bufferSize, name, nameLength);
3068 
3069 	return true;
3070 }
3071 
3072 
3073 static bool
3074 debug_prepend_vnode_id_to_path(char* buffer, size_t& bufferSize, dev_t devID,
3075 	ino_t nodeID)
3076 {
3077 	if (bufferSize == 0)
3078 		return false;
3079 
3080 	bool insertSlash = buffer[bufferSize] != '\0';
3081 	if (insertSlash)
3082 		buffer[--bufferSize] = '/';
3083 
3084 	size_t size = snprintf(buffer, bufferSize,
3085 		"<%" B_PRIdDEV ",%" B_PRIdINO ">", devID, nodeID);
3086 	if (size > bufferSize) {
3087 		if (insertSlash)
3088 			bufferSize++;
3089 		return false;
3090 	}
3091 
3092 	if (size < bufferSize)
3093 		memmove(buffer + bufferSize - size, buffer, size);
3094 
3095 	bufferSize -= size;
3096 	return true;
3097 }
3098 
3099 
3100 static char*
3101 debug_resolve_vnode_path(struct vnode* vnode, char* buffer, size_t bufferSize,
3102 	bool& _truncated)
3103 {
3104 	// null-terminate the path
3105 	buffer[--bufferSize] = '\0';
3106 
3107 	while (true) {
3108 		while (vnode->covers != NULL)
3109 			vnode = vnode->covers;
3110 
3111 		if (vnode == sRoot) {
3112 			_truncated = bufferSize == 0;
3113 			if (!_truncated)
3114 				buffer[--bufferSize] = '/';
3115 			return buffer + bufferSize;
3116 		}
3117 
3118 		// resolve the name
3119 		ino_t dirID;
3120 		const char* name = vnode->mount->entry_cache.DebugReverseLookup(
3121 			vnode->id, dirID);
3122 		if (name == NULL) {
3123 			// Failed to resolve the name -- prepend "<dev,node>/".
3124 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3125 				vnode->mount->id, vnode->id);
3126 			return buffer + bufferSize;
3127 		}
3128 
3129 		// prepend the name
3130 		if (!debug_prepend_vnode_name_to_path(buffer, bufferSize, name)) {
3131 			_truncated = true;
3132 			return buffer + bufferSize;
3133 		}
3134 
3135 		// resolve the directory node
3136 		struct vnode* nextVnode = lookup_vnode(vnode->mount->id, dirID);
3137 		if (nextVnode == NULL) {
3138 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3139 				vnode->mount->id, dirID);
3140 			return buffer + bufferSize;
3141 		}
3142 
3143 		vnode = nextVnode;
3144 	}
3145 }
3146 
3147 
3148 static void
3149 _dump_vnode(struct vnode* vnode, bool printPath)
3150 {
3151 	kprintf("VNODE: %p\n", vnode);
3152 	kprintf(" device:        %" B_PRIdDEV "\n", vnode->device);
3153 	kprintf(" id:            %" B_PRIdINO "\n", vnode->id);
3154 	kprintf(" ref_count:     %" B_PRId32 "\n", vnode->ref_count);
3155 	kprintf(" private_node:  %p\n", vnode->private_node);
3156 	kprintf(" mount:         %p\n", vnode->mount);
3157 	kprintf(" covered_by:    %p\n", vnode->covered_by);
3158 	kprintf(" covers:        %p\n", vnode->covers);
3159 	kprintf(" cache:         %p\n", vnode->cache);
3160 	kprintf(" type:          %#" B_PRIx32 "\n", vnode->Type());
3161 	kprintf(" flags:         %s%s%s\n", vnode->IsRemoved() ? "r" : "-",
3162 		vnode->IsBusy() ? "b" : "-", vnode->IsUnpublished() ? "u" : "-");
3163 	kprintf(" advisory_lock: %p\n", vnode->advisory_locking);
3164 
3165 	_dump_advisory_locking(vnode->advisory_locking);
3166 
3167 	if (printPath) {
3168 		void* buffer = debug_malloc(B_PATH_NAME_LENGTH);
3169 		if (buffer != NULL) {
3170 			bool truncated;
3171 			char* path = debug_resolve_vnode_path(vnode, (char*)buffer,
3172 				B_PATH_NAME_LENGTH, truncated);
3173 			if (path != NULL) {
3174 				kprintf(" path:          ");
3175 				if (truncated)
3176 					kputs("<truncated>/");
3177 				kputs(path);
3178 				kputs("\n");
3179 			} else
3180 				kprintf("Failed to resolve vnode path.\n");
3181 
3182 			debug_free(buffer);
3183 		} else
3184 			kprintf("Failed to allocate memory for constructing the path.\n");
3185 	}
3186 
3187 	set_debug_variable("_node", (addr_t)vnode->private_node);
3188 	set_debug_variable("_mount", (addr_t)vnode->mount);
3189 	set_debug_variable("_covered_by", (addr_t)vnode->covered_by);
3190 	set_debug_variable("_covers", (addr_t)vnode->covers);
3191 	set_debug_variable("_adv_lock", (addr_t)vnode->advisory_locking);
3192 }
3193 
3194 
3195 static int
3196 dump_mount(int argc, char** argv)
3197 {
3198 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3199 		kprintf("usage: %s [id|address]\n", argv[0]);
3200 		return 0;
3201 	}
3202 
3203 	ulong val = parse_expression(argv[1]);
3204 	uint32 id = val;
3205 
3206 	struct fs_mount* mount = sMountsTable->Lookup(id);
3207 	if (mount == NULL) {
3208 		if (IS_USER_ADDRESS(id)) {
3209 			kprintf("fs_mount not found\n");
3210 			return 0;
3211 		}
3212 		mount = (fs_mount*)val;
3213 	}
3214 
3215 	_dump_mount(mount);
3216 	return 0;
3217 }
3218 
3219 
3220 static int
3221 dump_mounts(int argc, char** argv)
3222 {
3223 	if (argc != 1) {
3224 		kprintf("usage: %s\n", argv[0]);
3225 		return 0;
3226 	}
3227 
3228 	kprintf("%-*s    id %-*s   %-*s   %-*s   fs_name\n",
3229 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "root",
3230 		B_PRINTF_POINTER_WIDTH, "covers", B_PRINTF_POINTER_WIDTH, "cookie");
3231 
3232 	struct fs_mount* mount;
3233 
3234 	MountTable::Iterator iterator(sMountsTable);
3235 	while (iterator.HasNext()) {
3236 		mount = iterator.Next();
3237 		kprintf("%p%4" B_PRIdDEV " %p %p %p %s\n", mount, mount->id, mount->root_vnode,
3238 			mount->root_vnode->covers, mount->volume->private_volume,
3239 			mount->volume->file_system_name);
3240 
3241 		fs_volume* volume = mount->volume;
3242 		while (volume->super_volume != NULL) {
3243 			volume = volume->super_volume;
3244 			kprintf("                                     %p %s\n",
3245 				volume->private_volume, volume->file_system_name);
3246 		}
3247 	}
3248 
3249 	return 0;
3250 }
3251 
3252 
3253 static int
3254 dump_vnode(int argc, char** argv)
3255 {
3256 	bool printPath = false;
3257 	int argi = 1;
3258 	if (argc >= 2 && strcmp(argv[argi], "-p") == 0) {
3259 		printPath = true;
3260 		argi++;
3261 	}
3262 
3263 	if (argi >= argc || argi + 2 < argc) {
3264 		print_debugger_command_usage(argv[0]);
3265 		return 0;
3266 	}
3267 
3268 	struct vnode* vnode = NULL;
3269 
3270 	if (argi + 1 == argc) {
3271 		vnode = (struct vnode*)parse_expression(argv[argi]);
3272 		if (IS_USER_ADDRESS(vnode)) {
3273 			kprintf("invalid vnode address\n");
3274 			return 0;
3275 		}
3276 		_dump_vnode(vnode, printPath);
3277 		return 0;
3278 	}
3279 
3280 	dev_t device = parse_expression(argv[argi]);
3281 	ino_t id = parse_expression(argv[argi + 1]);
3282 
3283 	VnodeTable::Iterator iterator(sVnodeTable);
3284 	while (iterator.HasNext()) {
3285 		vnode = iterator.Next();
3286 		if (vnode->id != id || vnode->device != device)
3287 			continue;
3288 
3289 		_dump_vnode(vnode, printPath);
3290 	}
3291 
3292 	return 0;
3293 }
3294 
3295 
3296 static int
3297 dump_vnodes(int argc, char** argv)
3298 {
3299 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3300 		kprintf("usage: %s [device]\n", argv[0]);
3301 		return 0;
3302 	}
3303 
3304 	// restrict dumped nodes to a certain device if requested
3305 	dev_t device = parse_expression(argv[1]);
3306 
3307 	struct vnode* vnode;
3308 
3309 	kprintf("%-*s   dev     inode  ref %-*s   %-*s   %-*s   flags\n",
3310 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache",
3311 		B_PRINTF_POINTER_WIDTH, "fs-node", B_PRINTF_POINTER_WIDTH, "locking");
3312 
3313 	VnodeTable::Iterator iterator(sVnodeTable);
3314 	while (iterator.HasNext()) {
3315 		vnode = iterator.Next();
3316 		if (vnode->device != device)
3317 			continue;
3318 
3319 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO "%5" B_PRId32 " %p %p %p %s%s%s\n",
3320 			vnode, vnode->device, vnode->id, vnode->ref_count, vnode->cache,
3321 			vnode->private_node, vnode->advisory_locking,
3322 			vnode->IsRemoved() ? "r" : "-", vnode->IsBusy() ? "b" : "-",
3323 			vnode->IsUnpublished() ? "u" : "-");
3324 	}
3325 
3326 	return 0;
3327 }
3328 
3329 
3330 static int
3331 dump_vnode_caches(int argc, char** argv)
3332 {
3333 	struct vnode* vnode;
3334 
3335 	if (argc > 2 || !strcmp(argv[1], "--help")) {
3336 		kprintf("usage: %s [device]\n", argv[0]);
3337 		return 0;
3338 	}
3339 
3340 	// restrict dumped nodes to a certain device if requested
3341 	dev_t device = -1;
3342 	if (argc > 1)
3343 		device = parse_expression(argv[1]);
3344 
3345 	kprintf("%-*s   dev     inode %-*s       size   pages\n",
3346 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache");
3347 
3348 	VnodeTable::Iterator iterator(sVnodeTable);
3349 	while (iterator.HasNext()) {
3350 		vnode = iterator.Next();
3351 		if (vnode->cache == NULL)
3352 			continue;
3353 		if (device != -1 && vnode->device != device)
3354 			continue;
3355 
3356 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO " %p %8" B_PRIdOFF "%8" B_PRId32 "\n",
3357 			vnode, vnode->device, vnode->id, vnode->cache,
3358 			(vnode->cache->virtual_end + B_PAGE_SIZE - 1) / B_PAGE_SIZE,
3359 			vnode->cache->page_count);
3360 	}
3361 
3362 	return 0;
3363 }
3364 
3365 
3366 int
3367 dump_io_context(int argc, char** argv)
3368 {
3369 	if (argc > 2 || !strcmp(argv[1], "--help")) {
3370 		kprintf("usage: %s [team-id|address]\n", argv[0]);
3371 		return 0;
3372 	}
3373 
3374 	struct io_context* context = NULL;
3375 
3376 	if (argc > 1) {
3377 		ulong num = parse_expression(argv[1]);
3378 		if (IS_KERNEL_ADDRESS(num))
3379 			context = (struct io_context*)num;
3380 		else {
3381 			Team* team = team_get_team_struct_locked(num);
3382 			if (team == NULL) {
3383 				kprintf("could not find team with ID %lu\n", num);
3384 				return 0;
3385 			}
3386 			context = (struct io_context*)team->io_context;
3387 		}
3388 	} else
3389 		context = get_current_io_context(true);
3390 
3391 	kprintf("I/O CONTEXT: %p\n", context);
3392 	kprintf(" root vnode:\t%p\n", context->root);
3393 	kprintf(" cwd vnode:\t%p\n", context->cwd);
3394 	kprintf(" used fds:\t%" B_PRIu32 "\n", context->num_used_fds);
3395 	kprintf(" max fds:\t%" B_PRIu32 "\n", context->table_size);
3396 
3397 	if (context->num_used_fds) {
3398 		kprintf("   no.  type    %*s  ref  open  mode         pos    %*s\n",
3399 			B_PRINTF_POINTER_WIDTH, "ops", B_PRINTF_POINTER_WIDTH, "cookie");
3400 	}
3401 
3402 	for (uint32 i = 0; i < context->table_size; i++) {
3403 		struct file_descriptor* fd = context->fds[i];
3404 		if (fd == NULL)
3405 			continue;
3406 
3407 		kprintf("  %3" B_PRIu32 ":  %4" B_PRId32 "  %p  %3" B_PRId32 "  %4"
3408 			B_PRIu32 "  %4" B_PRIx32 "  %10" B_PRIdOFF "  %p  %s %p\n", i,
3409 			fd->type, fd->ops, fd->ref_count, fd->open_count, fd->open_mode,
3410 			fd->pos, fd->cookie,
3411 			fd->type >= FDTYPE_INDEX && fd->type <= FDTYPE_QUERY
3412 				? "mount" : "vnode",
3413 			fd->u.vnode);
3414 	}
3415 
3416 	kprintf(" used monitors:\t%" B_PRIu32 "\n", context->num_monitors);
3417 	kprintf(" max monitors:\t%" B_PRIu32 "\n", context->max_monitors);
3418 
3419 	set_debug_variable("_cwd", (addr_t)context->cwd);
3420 
3421 	return 0;
3422 }
3423 
3424 
3425 int
3426 dump_vnode_usage(int argc, char** argv)
3427 {
3428 	if (argc != 1) {
3429 		kprintf("usage: %s\n", argv[0]);
3430 		return 0;
3431 	}
3432 
3433 	kprintf("Unused vnodes: %" B_PRIu32 " (max unused %" B_PRIu32 ")\n",
3434 		sUnusedVnodes, kMaxUnusedVnodes);
3435 
3436 	uint32 count = sVnodeTable->CountElements();
3437 
3438 	kprintf("%" B_PRIu32 " vnodes total (%" B_PRIu32 " in use).\n", count,
3439 		count - sUnusedVnodes);
3440 	return 0;
3441 }
3442 
3443 #endif	// ADD_DEBUGGER_COMMANDS
3444 
3445 
3446 /*!	Clears memory specified by an iovec array.
3447 */
3448 static void
3449 zero_iovecs(const iovec* vecs, size_t vecCount, size_t bytes)
3450 {
3451 	for (size_t i = 0; i < vecCount && bytes > 0; i++) {
3452 		size_t length = std::min(vecs[i].iov_len, bytes);
3453 		memset(vecs[i].iov_base, 0, length);
3454 		bytes -= length;
3455 	}
3456 }
3457 
3458 
3459 /*!	Does the dirty work of combining the file_io_vecs with the iovecs
3460 	and calls the file system hooks to read/write the request to disk.
3461 */
3462 static status_t
3463 common_file_io_vec_pages(struct vnode* vnode, void* cookie,
3464 	const file_io_vec* fileVecs, size_t fileVecCount, const iovec* vecs,
3465 	size_t vecCount, uint32* _vecIndex, size_t* _vecOffset, size_t* _numBytes,
3466 	bool doWrite)
3467 {
3468 	if (fileVecCount == 0) {
3469 		// There are no file vecs at this offset, so we're obviously trying
3470 		// to access the file outside of its bounds
3471 		return B_BAD_VALUE;
3472 	}
3473 
3474 	size_t numBytes = *_numBytes;
3475 	uint32 fileVecIndex;
3476 	size_t vecOffset = *_vecOffset;
3477 	uint32 vecIndex = *_vecIndex;
3478 	status_t status;
3479 	size_t size;
3480 
3481 	if (!doWrite && vecOffset == 0) {
3482 		// now directly read the data from the device
3483 		// the first file_io_vec can be read directly
3484 
3485 		if (fileVecs[0].length < (off_t)numBytes)
3486 			size = fileVecs[0].length;
3487 		else
3488 			size = numBytes;
3489 
3490 		if (fileVecs[0].offset >= 0) {
3491 			status = FS_CALL(vnode, read_pages, cookie, fileVecs[0].offset,
3492 				&vecs[vecIndex], vecCount - vecIndex, &size);
3493 		} else {
3494 			// sparse read
3495 			zero_iovecs(&vecs[vecIndex], vecCount - vecIndex, size);
3496 			status = B_OK;
3497 		}
3498 		if (status != B_OK)
3499 			return status;
3500 
3501 		// TODO: this is a work-around for buggy device drivers!
3502 		//	When our own drivers honour the length, we can:
3503 		//	a) also use this direct I/O for writes (otherwise, it would
3504 		//	   overwrite precious data)
3505 		//	b) panic if the term below is true (at least for writes)
3506 		if ((off_t)size > fileVecs[0].length) {
3507 			//dprintf("warning: device driver %p doesn't respect total length "
3508 			//	"in read_pages() call!\n", ref->device);
3509 			size = fileVecs[0].length;
3510 		}
3511 
3512 		ASSERT((off_t)size <= fileVecs[0].length);
3513 
3514 		// If the file portion was contiguous, we're already done now
3515 		if (size == numBytes)
3516 			return B_OK;
3517 
3518 		// if we reached the end of the file, we can return as well
3519 		if ((off_t)size != fileVecs[0].length) {
3520 			*_numBytes = size;
3521 			return B_OK;
3522 		}
3523 
3524 		fileVecIndex = 1;
3525 
3526 		// first, find out where we have to continue in our iovecs
3527 		for (; vecIndex < vecCount; vecIndex++) {
3528 			if (size < vecs[vecIndex].iov_len)
3529 				break;
3530 
3531 			size -= vecs[vecIndex].iov_len;
3532 		}
3533 
3534 		vecOffset = size;
3535 	} else {
3536 		fileVecIndex = 0;
3537 		size = 0;
3538 	}
3539 
3540 	// Too bad, let's process the rest of the file_io_vecs
3541 
3542 	size_t totalSize = size;
3543 	size_t bytesLeft = numBytes - size;
3544 
3545 	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
3546 		const file_io_vec &fileVec = fileVecs[fileVecIndex];
3547 		off_t fileOffset = fileVec.offset;
3548 		off_t fileLeft = min_c(fileVec.length, (off_t)bytesLeft);
3549 
3550 		TRACE(("FILE VEC [%" B_PRIu32 "] length %" B_PRIdOFF "\n", fileVecIndex,
3551 			fileLeft));
3552 
3553 		// process the complete fileVec
3554 		while (fileLeft > 0) {
3555 			iovec tempVecs[MAX_TEMP_IO_VECS];
3556 			uint32 tempCount = 0;
3557 
3558 			// size tracks how much of what is left of the current fileVec
3559 			// (fileLeft) has been assigned to tempVecs
3560 			size = 0;
3561 
3562 			// assign what is left of the current fileVec to the tempVecs
3563 			for (size = 0; (off_t)size < fileLeft && vecIndex < vecCount
3564 					&& tempCount < MAX_TEMP_IO_VECS;) {
3565 				// try to satisfy one iovec per iteration (or as much as
3566 				// possible)
3567 
3568 				// bytes left of the current iovec
3569 				size_t vecLeft = vecs[vecIndex].iov_len - vecOffset;
3570 				if (vecLeft == 0) {
3571 					vecOffset = 0;
3572 					vecIndex++;
3573 					continue;
3574 				}
3575 
3576 				TRACE(("fill vec %" B_PRIu32 ", offset = %lu, size = %lu\n",
3577 					vecIndex, vecOffset, size));
3578 
3579 				// actually available bytes
3580 				size_t tempVecSize = min_c(vecLeft, fileLeft - size);
3581 
3582 				tempVecs[tempCount].iov_base
3583 					= (void*)((addr_t)vecs[vecIndex].iov_base + vecOffset);
3584 				tempVecs[tempCount].iov_len = tempVecSize;
3585 				tempCount++;
3586 
3587 				size += tempVecSize;
3588 				vecOffset += tempVecSize;
3589 			}
3590 
3591 			size_t bytes = size;
3592 
3593 			if (fileOffset == -1) {
3594 				if (doWrite) {
3595 					panic("sparse write attempt: vnode %p", vnode);
3596 					status = B_IO_ERROR;
3597 				} else {
3598 					// sparse read
3599 					zero_iovecs(tempVecs, tempCount, bytes);
3600 					status = B_OK;
3601 				}
3602 			} else if (doWrite) {
3603 				status = FS_CALL(vnode, write_pages, cookie, fileOffset,
3604 					tempVecs, tempCount, &bytes);
3605 			} else {
3606 				status = FS_CALL(vnode, read_pages, cookie, fileOffset,
3607 					tempVecs, tempCount, &bytes);
3608 			}
3609 			if (status != B_OK)
3610 				return status;
3611 
3612 			totalSize += bytes;
3613 			bytesLeft -= size;
3614 			if (fileOffset >= 0)
3615 				fileOffset += size;
3616 			fileLeft -= size;
3617 			//dprintf("-> file left = %Lu\n", fileLeft);
3618 
3619 			if (size != bytes || vecIndex >= vecCount) {
3620 				// there are no more bytes or iovecs, let's bail out
3621 				*_numBytes = totalSize;
3622 				return B_OK;
3623 			}
3624 		}
3625 	}
3626 
3627 	*_vecIndex = vecIndex;
3628 	*_vecOffset = vecOffset;
3629 	*_numBytes = totalSize;
3630 	return B_OK;
3631 }
3632 
3633 
3634 static bool
3635 is_user_in_group(gid_t gid)
3636 {
3637 	if (gid == getegid())
3638 		return true;
3639 
3640 	gid_t groups[NGROUPS_MAX];
3641 	int groupCount = getgroups(NGROUPS_MAX, groups);
3642 	for (int i = 0; i < groupCount; i++) {
3643 		if (gid == groups[i])
3644 			return true;
3645 	}
3646 
3647 	return false;
3648 }
3649 
3650 
3651 static status_t
3652 free_io_context(io_context* context)
3653 {
3654 	uint32 i;
3655 
3656 	TIOC(FreeIOContext(context));
3657 
3658 	if (context->root)
3659 		put_vnode(context->root);
3660 
3661 	if (context->cwd)
3662 		put_vnode(context->cwd);
3663 
3664 	mutex_lock(&context->io_mutex);
3665 
3666 	for (i = 0; i < context->table_size; i++) {
3667 		if (struct file_descriptor* descriptor = context->fds[i]) {
3668 			close_fd(context, descriptor);
3669 			put_fd(descriptor);
3670 		}
3671 	}
3672 
3673 	mutex_destroy(&context->io_mutex);
3674 
3675 	remove_node_monitors(context);
3676 	free(context->fds);
3677 	free(context);
3678 
3679 	return B_OK;
3680 }
3681 
3682 
3683 static status_t
3684 resize_monitor_table(struct io_context* context, const int newSize)
3685 {
3686 	int	status = B_OK;
3687 
3688 	if (newSize <= 0 || newSize > MAX_NODE_MONITORS)
3689 		return B_BAD_VALUE;
3690 
3691 	mutex_lock(&context->io_mutex);
3692 
3693 	if ((size_t)newSize < context->num_monitors) {
3694 		status = B_BUSY;
3695 		goto out;
3696 	}
3697 	context->max_monitors = newSize;
3698 
3699 out:
3700 	mutex_unlock(&context->io_mutex);
3701 	return status;
3702 }
3703 
3704 
3705 //	#pragma mark - public API for file systems
3706 
3707 
3708 extern "C" status_t
3709 new_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3710 	fs_vnode_ops* ops)
3711 {
3712 	FUNCTION(("new_vnode(volume = %p (%" B_PRId32 "), vnodeID = %" B_PRId64
3713 		", node = %p)\n", volume, volume->id, vnodeID, privateNode));
3714 
3715 	if (privateNode == NULL)
3716 		return B_BAD_VALUE;
3717 
3718 	int32 tries = BUSY_VNODE_RETRIES;
3719 restart:
3720 	// create the node
3721 	bool nodeCreated;
3722 	struct vnode* vnode;
3723 	status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3724 		nodeCreated);
3725 	if (status != B_OK)
3726 		return status;
3727 
3728 	WriteLocker nodeLocker(sVnodeLock, true);
3729 		// create_new_vnode_and_lock() has locked for us
3730 
3731 	if (!nodeCreated && vnode->IsBusy()) {
3732 		nodeLocker.Unlock();
3733 		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3734 			return B_BUSY;
3735 		goto restart;
3736 	}
3737 
3738 	// file system integrity check:
3739 	// test if the vnode already exists and bail out if this is the case!
3740 	if (!nodeCreated) {
3741 		panic("vnode %" B_PRIdDEV ":%" B_PRIdINO " already exists (node = %p, "
3742 			"vnode->node = %p)!", volume->id, vnodeID, privateNode,
3743 			vnode->private_node);
3744 		return B_ERROR;
3745 	}
3746 
3747 	vnode->private_node = privateNode;
3748 	vnode->ops = ops;
3749 	vnode->SetUnpublished(true);
3750 
3751 	TRACE(("returns: %s\n", strerror(status)));
3752 
3753 	return status;
3754 }
3755 
3756 
3757 extern "C" status_t
3758 publish_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3759 	fs_vnode_ops* ops, int type, uint32 flags)
3760 {
3761 	FUNCTION(("publish_vnode()\n"));
3762 
3763 	int32 tries = BUSY_VNODE_RETRIES;
3764 restart:
3765 	WriteLocker locker(sVnodeLock);
3766 
3767 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3768 
3769 	bool nodeCreated = false;
3770 	if (vnode == NULL) {
3771 		if (privateNode == NULL)
3772 			return B_BAD_VALUE;
3773 
3774 		// create the node
3775 		locker.Unlock();
3776 			// create_new_vnode_and_lock() will re-lock for us on success
3777 		status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3778 			nodeCreated);
3779 		if (status != B_OK)
3780 			return status;
3781 
3782 		locker.SetTo(sVnodeLock, true);
3783 	}
3784 
3785 	if (nodeCreated) {
3786 		vnode->private_node = privateNode;
3787 		vnode->ops = ops;
3788 		vnode->SetUnpublished(true);
3789 	} else if (vnode->IsBusy() && vnode->IsUnpublished()
3790 		&& vnode->private_node == privateNode && vnode->ops == ops) {
3791 		// already known, but not published
3792 	} else if (vnode->IsBusy()) {
3793 		locker.Unlock();
3794 		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3795 			return B_BUSY;
3796 		goto restart;
3797 	} else
3798 		return B_BAD_VALUE;
3799 
	vnode->SetType(type);
	vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
	bool publishSpecialSubNode = is_special_node_type(type)
		&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
3806 
3807 	status_t status = B_OK;
3808 
3809 	// create sub vnodes, if necessary
3810 	if (volume->sub_volume != NULL || publishSpecialSubNode) {
3811 		locker.Unlock();
3812 
3813 		fs_volume* subVolume = volume;
3814 		if (volume->sub_volume != NULL) {
3815 			while (status == B_OK && subVolume->sub_volume != NULL) {
3816 				subVolume = subVolume->sub_volume;
3817 				status = subVolume->ops->create_sub_vnode(subVolume, vnodeID,
3818 					vnode);
3819 			}
3820 		}
3821 
3822 		if (status == B_OK && publishSpecialSubNode)
3823 			status = create_special_sub_node(vnode, flags);
3824 
3825 		if (status != B_OK) {
3826 			// error -- clean up the created sub vnodes
3827 			while (subVolume->super_volume != volume) {
3828 				subVolume = subVolume->super_volume;
3829 				subVolume->ops->delete_sub_vnode(subVolume, vnode);
3830 			}
3831 		}
3832 
3833 		if (status == B_OK) {
3834 			ReadLocker vnodesReadLocker(sVnodeLock);
3835 			AutoLocker<Vnode> nodeLocker(vnode);
3836 			vnode->SetBusy(false);
3837 			vnode->SetUnpublished(false);
3838 		} else {
3839 			locker.Lock();
3840 			sVnodeTable->Remove(vnode);
3841 			remove_vnode_from_mount_list(vnode, vnode->mount);
3842 			object_cache_free(sVnodeCache, vnode, 0);
3843 		}
3844 	} else {
3845 		// we still hold the write lock -- mark the node unbusy and published
3846 		vnode->SetBusy(false);
3847 		vnode->SetUnpublished(false);
3848 	}
3849 
3850 	TRACE(("returns: %s\n", strerror(status)));
3851 
3852 	return status;
3853 }
3854 
3855 
3856 extern "C" status_t
3857 get_vnode(fs_volume* volume, ino_t vnodeID, void** _privateNode)
3858 {
3859 	struct vnode* vnode;
3860 
3861 	if (volume == NULL)
3862 		return B_BAD_VALUE;
3863 
3864 	status_t status = get_vnode(volume->id, vnodeID, &vnode, true, true);
3865 	if (status != B_OK)
3866 		return status;
3867 
3868 	// If this is a layered FS, we need to get the node cookie for the requested
3869 	// layer.
3870 	if (HAS_FS_CALL(vnode, get_super_vnode)) {
3871 		fs_vnode resolvedNode;
3872 		status_t status = FS_CALL(vnode, get_super_vnode, volume,
3873 			&resolvedNode);
3874 		if (status != B_OK) {
3875 			panic("get_vnode(): Failed to get super node for vnode %p, "
3876 				"volume: %p", vnode, volume);
3877 			put_vnode(vnode);
3878 			return status;
3879 		}
3880 
3881 		if (_privateNode != NULL)
3882 			*_privateNode = resolvedNode.private_node;
3883 	} else if (_privateNode != NULL)
3884 		*_privateNode = vnode->private_node;
3885 
3886 	return B_OK;
3887 }
3888 
3889 
3890 extern "C" status_t
3891 acquire_vnode(fs_volume* volume, ino_t vnodeID)
3892 {
3893 	struct vnode* vnode;
3894 
3895 	rw_lock_read_lock(&sVnodeLock);
3896 	vnode = lookup_vnode(volume->id, vnodeID);
3897 	rw_lock_read_unlock(&sVnodeLock);
3898 
3899 	if (vnode == NULL)
3900 		return B_BAD_VALUE;
3901 
3902 	inc_vnode_ref_count(vnode);
3903 	return B_OK;
3904 }
3905 
3906 
3907 extern "C" status_t
3908 put_vnode(fs_volume* volume, ino_t vnodeID)
3909 {
3910 	struct vnode* vnode;
3911 
3912 	rw_lock_read_lock(&sVnodeLock);
3913 	vnode = lookup_vnode(volume->id, vnodeID);
3914 	rw_lock_read_unlock(&sVnodeLock);
3915 
3916 	if (vnode == NULL)
3917 		return B_BAD_VALUE;
3918 
3919 	dec_vnode_ref_count(vnode, false, true);
3920 	return B_OK;
3921 }
3922 
3923 
3924 extern "C" status_t
3925 remove_vnode(fs_volume* volume, ino_t vnodeID)
3926 {
3927 	ReadLocker locker(sVnodeLock);
3928 
3929 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3930 	if (vnode == NULL)
3931 		return B_ENTRY_NOT_FOUND;
3932 
3933 	if (vnode->covered_by != NULL || vnode->covers != NULL) {
3934 		// this vnode is in use
3935 		return B_BUSY;
3936 	}
3937 
3938 	vnode->Lock();
3939 
3940 	vnode->SetRemoved(true);
3941 	bool removeUnpublished = false;
3942 
3943 	if (vnode->IsUnpublished()) {
3944 		// prepare the vnode for deletion
3945 		removeUnpublished = true;
3946 		vnode->SetBusy(true);
3947 	}
3948 
3949 	vnode->Unlock();
3950 	locker.Unlock();
3951 
3952 	if (removeUnpublished) {
3953 		// If the vnode hasn't been published yet, we delete it here
3954 		atomic_add(&vnode->ref_count, -1);
3955 		free_vnode(vnode, true);
3956 	}
3957 
3958 	return B_OK;
3959 }
3960 
3961 
3962 extern "C" status_t
3963 unremove_vnode(fs_volume* volume, ino_t vnodeID)
3964 {
3965 	struct vnode* vnode;
3966 
3967 	rw_lock_read_lock(&sVnodeLock);
3968 
3969 	vnode = lookup_vnode(volume->id, vnodeID);
3970 	if (vnode) {
3971 		AutoLocker<Vnode> nodeLocker(vnode);
3972 		vnode->SetRemoved(false);
3973 	}
3974 
3975 	rw_lock_read_unlock(&sVnodeLock);
3976 	return B_OK;
3977 }
3978 
3979 
3980 extern "C" status_t
3981 get_vnode_removed(fs_volume* volume, ino_t vnodeID, bool* _removed)
3982 {
3983 	ReadLocker _(sVnodeLock);
3984 
3985 	if (struct vnode* vnode = lookup_vnode(volume->id, vnodeID)) {
3986 		if (_removed != NULL)
3987 			*_removed = vnode->IsRemoved();
3988 		return B_OK;
3989 	}
3990 
3991 	return B_BAD_VALUE;
3992 }
3993 
3994 
3995 extern "C" fs_volume*
3996 volume_for_vnode(fs_vnode* _vnode)
3997 {
3998 	if (_vnode == NULL)
3999 		return NULL;
4000 
4001 	struct vnode* vnode = static_cast<struct vnode*>(_vnode);
4002 	return vnode->mount->volume;
4003 }
4004 
4005 
4006 extern "C" status_t
4007 check_access_permissions(int accessMode, mode_t mode, gid_t nodeGroupID,
4008 	uid_t nodeUserID)
4009 {
4010 	// get node permissions
4011 	int userPermissions = (mode & S_IRWXU) >> 6;
4012 	int groupPermissions = (mode & S_IRWXG) >> 3;
4013 	int otherPermissions = mode & S_IRWXO;
4014 
4015 	// get the node permissions for this uid/gid
4016 	int permissions = 0;
4017 	uid_t uid = geteuid();
4018 
4019 	if (uid == 0) {
4020 		// user is root
4021 		// root has always read/write permission, but at least one of the
4022 		// X bits must be set for execute permission
4023 		permissions = userPermissions | groupPermissions | otherPermissions
4024 			| S_IROTH | S_IWOTH;
4025 		if (S_ISDIR(mode))
4026 			permissions |= S_IXOTH;
4027 	} else if (uid == nodeUserID) {
4028 		// user is node owner
4029 		permissions = userPermissions;
4030 	} else if (is_user_in_group(nodeGroupID)) {
4031 		// user is in owning group
4032 		permissions = groupPermissions;
4033 	} else {
4034 		// user is one of the others
4035 		permissions = otherPermissions;
4036 	}
4037 
4038 	return (accessMode & ~permissions) == 0 ? B_OK : B_PERMISSION_DENIED;
4039 }
4040 
4041 
4042 #if 0
4043 extern "C" status_t
4044 read_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4045 	size_t* _numBytes)
4046 {
4047 	struct file_descriptor* descriptor;
4048 	struct vnode* vnode;
4049 
4050 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4051 	if (descriptor == NULL)
4052 		return B_FILE_ERROR;
4053 
4054 	status_t status = vfs_read_pages(vnode, descriptor->cookie, pos, vecs,
4055 		count, 0, _numBytes);
4056 
4057 	put_fd(descriptor);
4058 	return status;
4059 }
4060 
4061 
4062 extern "C" status_t
4063 write_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4064 	size_t* _numBytes)
4065 {
4066 	struct file_descriptor* descriptor;
4067 	struct vnode* vnode;
4068 
4069 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4070 	if (descriptor == NULL)
4071 		return B_FILE_ERROR;
4072 
4073 	status_t status = vfs_write_pages(vnode, descriptor->cookie, pos, vecs,
4074 		count, 0, _numBytes);
4075 
4076 	put_fd(descriptor);
4077 	return status;
4078 }
4079 #endif
4080 
4081 
4082 extern "C" status_t
4083 read_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4084 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4085 	size_t* _bytes)
4086 {
4087 	struct file_descriptor* descriptor;
4088 	struct vnode* vnode;
4089 
4090 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4091 	if (descriptor == NULL)
4092 		return B_FILE_ERROR;
4093 
4094 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4095 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4096 		false);
4097 
4098 	put_fd(descriptor);
4099 	return status;
4100 }
4101 
4102 
4103 extern "C" status_t
4104 write_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4105 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4106 	size_t* _bytes)
4107 {
4108 	struct file_descriptor* descriptor;
4109 	struct vnode* vnode;
4110 
4111 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4112 	if (descriptor == NULL)
4113 		return B_FILE_ERROR;
4114 
4115 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4116 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4117 		true);
4118 
4119 	put_fd(descriptor);
4120 	return status;
4121 }
4122 
4123 
4124 extern "C" status_t
4125 entry_cache_add(dev_t mountID, ino_t dirID, const char* name, ino_t nodeID)
4126 {
4127 	// lookup mount -- the caller is required to make sure that the mount
4128 	// won't go away
4129 	ReadLocker locker(sMountLock);
4130 	struct fs_mount* mount = find_mount(mountID);
4131 	if (mount == NULL)
4132 		return B_BAD_VALUE;
4133 	locker.Unlock();
4134 
4135 	return mount->entry_cache.Add(dirID, name, nodeID, false);
4136 }
4137 
4138 
4139 extern "C" status_t
4140 entry_cache_add_missing(dev_t mountID, ino_t dirID, const char* name)
4141 {
4142 	// lookup mount -- the caller is required to make sure that the mount
4143 	// won't go away
4144 	ReadLocker locker(sMountLock);
4145 	struct fs_mount* mount = find_mount(mountID);
4146 	if (mount == NULL)
4147 		return B_BAD_VALUE;
4148 	locker.Unlock();
4149 
4150 	return mount->entry_cache.Add(dirID, name, -1, true);
4151 }
4152 
4153 
4154 extern "C" status_t
4155 entry_cache_remove(dev_t mountID, ino_t dirID, const char* name)
4156 {
4157 	// lookup mount -- the caller is required to make sure that the mount
4158 	// won't go away
4159 	ReadLocker locker(sMountLock);
4160 	struct fs_mount* mount = find_mount(mountID);
4161 	if (mount == NULL)
4162 		return B_BAD_VALUE;
4163 	locker.Unlock();
4164 
4165 	return mount->entry_cache.Remove(dirID, name);
4166 }
4167 
4168 
4169 //	#pragma mark - private VFS API
4170 //	Functions the VFS exports for other parts of the kernel
4171 
4172 
4173 /*! Acquires another reference to the vnode that has to be released
4174 	by calling vfs_put_vnode().
4175 */
4176 void
4177 vfs_acquire_vnode(struct vnode* vnode)
4178 {
4179 	inc_vnode_ref_count(vnode);
4180 }
4181 
4182 
4183 /*! This is currently called from file_cache_create() only.
4184 	It's probably a temporary solution as long as devfs requires that
4185 	fs_read_pages()/fs_write_pages() are called with the standard
4186 	open cookie and not with a device cookie.
4187 	If that's done differently, remove this call; it has no other
4188 	purpose.
4189 */
4190 extern "C" status_t
4191 vfs_get_cookie_from_fd(int fd, void** _cookie)
4192 {
4193 	struct file_descriptor* descriptor;
4194 
4195 	descriptor = get_fd(get_current_io_context(true), fd);
4196 	if (descriptor == NULL)
4197 		return B_FILE_ERROR;
4198 
4199 	*_cookie = descriptor->cookie;
4200 	return B_OK;
4201 }
4202 
4203 
4204 extern "C" status_t
4205 vfs_get_vnode_from_fd(int fd, bool kernel, struct vnode** vnode)
4206 {
4207 	*vnode = get_vnode_from_fd(fd, kernel);
4208 
4209 	if (*vnode == NULL)
4210 		return B_FILE_ERROR;
4211 
	return B_OK;
4213 }
4214 
4215 
4216 extern "C" status_t
4217 vfs_get_vnode_from_path(const char* path, bool kernel, struct vnode** _vnode)
4218 {
4219 	TRACE(("vfs_get_vnode_from_path: entry. path = '%s', kernel %d\n",
4220 		path, kernel));
4221 
4222 	KPath pathBuffer;
4223 	if (pathBuffer.InitCheck() != B_OK)
4224 		return B_NO_MEMORY;
4225 
4226 	char* buffer = pathBuffer.LockBuffer();
4227 	strlcpy(buffer, path, pathBuffer.BufferSize());
4228 
4229 	struct vnode* vnode;
4230 	status_t status = path_to_vnode(buffer, true, &vnode, NULL, kernel);
4231 	if (status != B_OK)
4232 		return status;
4233 
4234 	*_vnode = vnode;
4235 	return B_OK;
4236 }
4237 
4238 
4239 extern "C" status_t
4240 vfs_get_vnode(dev_t mountID, ino_t vnodeID, bool canWait, struct vnode** _vnode)
4241 {
4242 	struct vnode* vnode = NULL;
4243 
4244 	status_t status = get_vnode(mountID, vnodeID, &vnode, canWait, false);
4245 	if (status != B_OK)
4246 		return status;
4247 
4248 	*_vnode = vnode;
4249 	return B_OK;
4250 }
4251 
4252 
4253 extern "C" status_t
4254 vfs_entry_ref_to_vnode(dev_t mountID, ino_t directoryID,
4255 	const char* name, struct vnode** _vnode)
4256 {
4257 	return entry_ref_to_vnode(mountID, directoryID, name, false, true, _vnode);
4258 }
4259 
4260 
4261 extern "C" void
4262 vfs_vnode_to_node_ref(struct vnode* vnode, dev_t* _mountID, ino_t* _vnodeID)
4263 {
4264 	*_mountID = vnode->device;
4265 	*_vnodeID = vnode->id;
4266 }
4267 
4268 
4269 /*!
4270 	Helper function abstracting the process of "converting" a given
4271 	vnode-pointer to a fs_vnode-pointer.
4272 	Currently only used in bindfs.
4273 */
4274 extern "C" fs_vnode*
4275 vfs_fsnode_for_vnode(struct vnode* vnode)
4276 {
4277 	return vnode;
4278 }
4279 
4280 
4281 /*!
4282 	Calls fs_open() on the given vnode and returns a new
4283 	file descriptor for it
4284 */
4285 int
4286 vfs_open_vnode(struct vnode* vnode, int openMode, bool kernel)
4287 {
4288 	return open_vnode(vnode, openMode, kernel);
4289 }
4290 
4291 
4292 /*!	Looks up a vnode with the given mount and vnode ID.
4293 	Must only be used with "in-use" vnodes as it doesn't grab a reference
4294 	to the node.
	It's currently only used by file_cache_create().
4296 */
4297 extern "C" status_t
4298 vfs_lookup_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode)
4299 {
4300 	rw_lock_read_lock(&sVnodeLock);
4301 	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
4302 	rw_lock_read_unlock(&sVnodeLock);
4303 
4304 	if (vnode == NULL)
4305 		return B_ERROR;
4306 
4307 	*_vnode = vnode;
4308 	return B_OK;
4309 }
4310 
4311 
4312 extern "C" status_t
4313 vfs_get_fs_node_from_path(fs_volume* volume, const char* path,
4314 	bool traverseLeafLink, bool kernel, void** _node)
4315 {
4316 	TRACE(("vfs_get_fs_node_from_path(volume = %p, path = \"%s\", kernel %d)\n",
4317 		volume, path, kernel));
4318 
4319 	KPath pathBuffer;
4320 	if (pathBuffer.InitCheck() != B_OK)
4321 		return B_NO_MEMORY;
4322 
4323 	fs_mount* mount;
4324 	status_t status = get_mount(volume->id, &mount);
4325 	if (status != B_OK)
4326 		return status;
4327 
4328 	char* buffer = pathBuffer.LockBuffer();
4329 	strlcpy(buffer, path, pathBuffer.BufferSize());
4330 
4331 	struct vnode* vnode = mount->root_vnode;
4332 
4333 	if (buffer[0] == '/')
4334 		status = path_to_vnode(buffer, traverseLeafLink, &vnode, NULL, kernel);
4335 	else {
4336 		inc_vnode_ref_count(vnode);
4337 			// vnode_path_to_vnode() releases a reference to the starting vnode
4338 		status = vnode_path_to_vnode(vnode, buffer, traverseLeafLink, 0,
4339 			kernel, &vnode, NULL);
4340 	}
4341 
4342 	put_mount(mount);
4343 
4344 	if (status != B_OK)
4345 		return status;
4346 
4347 	if (vnode->device != volume->id) {
		// wrong mount ID -- must not gain access to foreign file system nodes
4349 		put_vnode(vnode);
4350 		return B_BAD_VALUE;
4351 	}
4352 
4353 	// Use get_vnode() to resolve the cookie for the right layer.
4354 	status = get_vnode(volume, vnode->id, _node);
4355 	put_vnode(vnode);
4356 
4357 	return status;
4358 }
4359 
4360 
4361 status_t
4362 vfs_read_stat(int fd, const char* path, bool traverseLeafLink,
4363 	struct stat* stat, bool kernel)
4364 {
4365 	status_t status;
4366 
4367 	if (path != NULL) {
4368 		// path given: get the stat of the node referred to by (fd, path)
4369 		KPath pathBuffer(path);
4370 		if (pathBuffer.InitCheck() != B_OK)
4371 			return B_NO_MEMORY;
4372 
4373 		status = common_path_read_stat(fd, pathBuffer.LockBuffer(),
4374 			traverseLeafLink, stat, kernel);
4375 	} else {
4376 		// no path given: get the FD and use the FD operation
4377 		struct file_descriptor* descriptor
4378 			= get_fd(get_current_io_context(kernel), fd);
4379 		if (descriptor == NULL)
4380 			return B_FILE_ERROR;
4381 
4382 		if (descriptor->ops->fd_read_stat)
4383 			status = descriptor->ops->fd_read_stat(descriptor, stat);
4384 		else
4385 			status = B_UNSUPPORTED;
4386 
4387 		put_fd(descriptor);
4388 	}
4389 
4390 	return status;
4391 }
4392 
4393 
4394 /*!	Finds the full path to the file that contains the module \a moduleName,
4395 	puts it into \a pathBuffer, and returns B_OK for success.
	If \a pathBuffer was too small, it returns \c B_BUFFER_OVERFLOW, and
	\c B_ENTRY_NOT_FOUND if no file could be found.
	\a pathBuffer is clobbered in any case and must not be relied on if this
	function returns unsuccessfully.
4400 	\a basePath and \a pathBuffer must not point to the same space.
4401 */
4402 status_t
4403 vfs_get_module_path(const char* basePath, const char* moduleName,
4404 	char* pathBuffer, size_t bufferSize)
4405 {
4406 	struct vnode* dir;
4407 	struct vnode* file;
4408 	status_t status;
4409 	size_t length;
4410 	char* path;
4411 
4412 	if (bufferSize == 0
4413 		|| strlcpy(pathBuffer, basePath, bufferSize) >= bufferSize)
4414 		return B_BUFFER_OVERFLOW;
4415 
4416 	status = path_to_vnode(pathBuffer, true, &dir, NULL, true);
4417 	if (status != B_OK)
4418 		return status;
4419 
	// the path buffer has been clobbered by the above call
4421 	length = strlcpy(pathBuffer, basePath, bufferSize);
4422 	if (pathBuffer[length - 1] != '/')
4423 		pathBuffer[length++] = '/';
4424 
4425 	path = pathBuffer + length;
4426 	bufferSize -= length;
4427 
4428 	while (moduleName) {
4429 		char* nextPath = strchr(moduleName, '/');
4430 		if (nextPath == NULL)
4431 			length = strlen(moduleName);
4432 		else {
4433 			length = nextPath - moduleName;
4434 			nextPath++;
4435 		}
4436 
4437 		if (length + 1 >= bufferSize) {
4438 			status = B_BUFFER_OVERFLOW;
4439 			goto err;
4440 		}
4441 
4442 		memcpy(path, moduleName, length);
4443 		path[length] = '\0';
4444 		moduleName = nextPath;
4445 
4446 		status = vnode_path_to_vnode(dir, path, true, 0, true, &file, NULL);
4447 		if (status != B_OK) {
4448 			// vnode_path_to_vnode() has already released the reference to dir
4449 			return status;
4450 		}
4451 
4452 		if (S_ISDIR(file->Type())) {
			// go to the next directory
4454 			path[length] = '/';
4455 			path[length + 1] = '\0';
4456 			path += length + 1;
4457 			bufferSize -= length + 1;
4458 
4459 			dir = file;
4460 		} else if (S_ISREG(file->Type())) {
4461 			// it's a file so it should be what we've searched for
4462 			put_vnode(file);
4463 
4464 			return B_OK;
4465 		} else {
4466 			TRACE(("vfs_get_module_path(): something is strange here: "
4467 				"0x%08" B_PRIx32 "...\n", file->Type()));
4468 			status = B_ERROR;
4469 			dir = file;
4470 			goto err;
4471 		}
4472 	}
4473 
4474 	// if we got here, the moduleName just pointed to a directory, not to
4475 	// a real module - what should we do in this case?
4476 	status = B_ENTRY_NOT_FOUND;
4477 
4478 err:
4479 	put_vnode(dir);
4480 	return status;
4481 }
4482 
4483 
4484 /*!	\brief Normalizes a given path.
4485 
4486 	The path must refer to an existing or non-existing entry in an existing
4487 	directory, that is chopping off the leaf component the remaining path must
4488 	refer to an existing directory.
4489 
	The returned path will be canonical in that it will be absolute, will not
	contain any "." or ".." components or duplicate occurrences of '/'s,
	and none of the directory components will be symbolic links.

	Any two paths referring to the same entry will result in the same
	normalized path (well, that is pretty much the definition of `normalized',
	isn't it :-).
4497 
4498 	\param path The path to be normalized.
4499 	\param buffer The buffer into which the normalized path will be written.
4500 		   May be the same one as \a path.
4501 	\param bufferSize The size of \a buffer.
4502 	\param traverseLink If \c true, the function also resolves leaf symlinks.
4503 	\param kernel \c true, if the IO context of the kernel shall be used,
4504 		   otherwise that of the team this thread belongs to. Only relevant,
4505 		   if the path is relative (to get the CWD).
4506 	\return \c B_OK if everything went fine, another error code otherwise.
4507 */
4508 status_t
4509 vfs_normalize_path(const char* path, char* buffer, size_t bufferSize,
4510 	bool traverseLink, bool kernel)
4511 {
4512 	if (!path || !buffer || bufferSize < 1)
4513 		return B_BAD_VALUE;
4514 
4515 	if (path != buffer) {
4516 		if (strlcpy(buffer, path, bufferSize) >= bufferSize)
4517 			return B_BUFFER_OVERFLOW;
4518 	}
4519 
4520 	return normalize_path(buffer, bufferSize, traverseLink, kernel);
4521 }
4522 
4523 
4524 /*!	\brief Gets the parent of the passed in node.
4525 
4526 	Gets the parent of the passed in node, and correctly resolves covered
4527 	nodes.
4528 */
4529 extern "C" status_t
4530 vfs_resolve_parent(struct vnode* parent, dev_t* device, ino_t* node)
4531 {
4532 	return resolve_covered_parent(parent, device, node,
4533 		get_current_io_context(true));
4534 }
4535 
4536 
4537 /*!	\brief Creates a special node in the file system.
4538 
4539 	The caller gets a reference to the newly created node (which is passed
4540 	back through \a _createdVnode) and is responsible for releasing it.
4541 
4542 	\param path The path where to create the entry for the node. Can be \c NULL,
4543 		in which case the node is created without an entry in the root FS -- it
4544 		will automatically be deleted when the last reference has been released.
4545 	\param subVnode The definition of the subnode. Can be \c NULL, in which case
4546 		the target file system will just create the node with its standard
4547 		operations. Depending on the type of the node a subnode might be created
4548 		automatically, though.
4549 	\param mode The type and permissions for the node to be created.
4550 	\param flags Flags to be passed to the creating FS.
4551 	\param kernel \c true, if called in the kernel context (relevant only if
4552 		\a path is not \c NULL and not absolute).
4553 	\param _superVnode Pointer to a pre-allocated structure to be filled by the
4554 		file system creating the node, with the private data pointer and
4555 		operations for the super node. Can be \c NULL.
	\param _createdVnode Pointer to pre-allocated storage where to store the
4557 		pointer to the newly created node.
4558 	\return \c B_OK, if everything went fine, another error code otherwise.
4559 */
4560 status_t
4561 vfs_create_special_node(const char* path, fs_vnode* subVnode, mode_t mode,
4562 	uint32 flags, bool kernel, fs_vnode* _superVnode,
4563 	struct vnode** _createdVnode)
4564 {
4565 	struct vnode* dirNode;
4566 	char _leaf[B_FILE_NAME_LENGTH];
4567 	char* leaf = NULL;
4568 
4569 	if (path) {
4570 		// We've got a path. Get the dir vnode and the leaf name.
4571 		KPath tmpPathBuffer;
4572 		if (tmpPathBuffer.InitCheck() != B_OK)
4573 			return B_NO_MEMORY;
4574 
4575 		char* tmpPath = tmpPathBuffer.LockBuffer();
4576 		if (strlcpy(tmpPath, path, B_PATH_NAME_LENGTH) >= B_PATH_NAME_LENGTH)
4577 			return B_NAME_TOO_LONG;
4578 
4579 		// get the dir vnode and the leaf name
4580 		leaf = _leaf;
4581 		status_t error = path_to_dir_vnode(tmpPath, &dirNode, leaf, kernel);
4582 		if (error != B_OK)
4583 			return error;
4584 	} else {
4585 		// No path. Create the node in the root FS.
4586 		dirNode = sRoot;
4587 		inc_vnode_ref_count(dirNode);
4588 	}
4589 
4590 	VNodePutter _(dirNode);
4591 
4592 	// check support for creating special nodes
4593 	if (!HAS_FS_CALL(dirNode, create_special_node))
4594 		return B_UNSUPPORTED;
4595 
4596 	// create the node
4597 	fs_vnode superVnode;
4598 	ino_t nodeID;
4599 	status_t status = FS_CALL(dirNode, create_special_node, leaf, subVnode,
4600 		mode, flags, _superVnode != NULL ? _superVnode : &superVnode, &nodeID);
4601 	if (status != B_OK)
4602 		return status;
4603 
4604 	// lookup the node
4605 	rw_lock_read_lock(&sVnodeLock);
4606 	*_createdVnode = lookup_vnode(dirNode->mount->id, nodeID);
4607 	rw_lock_read_unlock(&sVnodeLock);
4608 
4609 	if (*_createdVnode == NULL) {
4610 		panic("vfs_create_special_node(): lookup of node failed");
4611 		return B_ERROR;
4612 	}
4613 
4614 	return B_OK;
4615 }
4616 
4617 
4618 extern "C" void
4619 vfs_put_vnode(struct vnode* vnode)
4620 {
4621 	put_vnode(vnode);
4622 }
4623 
4624 
4625 extern "C" status_t
4626 vfs_get_cwd(dev_t* _mountID, ino_t* _vnodeID)
4627 {
4628 	// Get current working directory from io context
4629 	struct io_context* context = get_current_io_context(false);
4630 	status_t status = B_OK;
4631 
4632 	mutex_lock(&context->io_mutex);
4633 
4634 	if (context->cwd != NULL) {
4635 		*_mountID = context->cwd->device;
4636 		*_vnodeID = context->cwd->id;
4637 	} else
4638 		status = B_ERROR;
4639 
4640 	mutex_unlock(&context->io_mutex);
4641 	return status;
4642 }
4643 
4644 
4645 status_t
4646 vfs_unmount(dev_t mountID, uint32 flags)
4647 {
4648 	return fs_unmount(NULL, mountID, flags, true);
4649 }
4650 
4651 
4652 extern "C" status_t
4653 vfs_disconnect_vnode(dev_t mountID, ino_t vnodeID)
4654 {
4655 	struct vnode* vnode;
4656 
4657 	status_t status = get_vnode(mountID, vnodeID, &vnode, true, true);
4658 	if (status != B_OK)
4659 		return status;
4660 
4661 	disconnect_mount_or_vnode_fds(vnode->mount, vnode);
4662 	put_vnode(vnode);
4663 	return B_OK;
4664 }
4665 
4666 
4667 extern "C" void
4668 vfs_free_unused_vnodes(int32 level)
4669 {
4670 	vnode_low_resource_handler(NULL,
4671 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
4672 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
4673 		level);
4674 }
4675 
4676 
4677 extern "C" bool
4678 vfs_can_page(struct vnode* vnode, void* cookie)
4679 {
4680 	FUNCTION(("vfs_canpage: vnode %p\n", vnode));
4681 
4682 	if (HAS_FS_CALL(vnode, can_page))
4683 		return FS_CALL(vnode, can_page, cookie);
4684 	return false;
4685 }
4686 
4687 
4688 extern "C" status_t
4689 vfs_read_pages(struct vnode* vnode, void* cookie, off_t pos,
4690 	const generic_io_vec* vecs, size_t count, uint32 flags,
4691 	generic_size_t* _numBytes)
4692 {
4693 	FUNCTION(("vfs_read_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4694 		vecs, pos));
4695 
4696 #if VFS_PAGES_IO_TRACING
4697 	generic_size_t bytesRequested = *_numBytes;
4698 #endif
4699 
4700 	IORequest request;
4701 	status_t status = request.Init(pos, vecs, count, *_numBytes, false, flags);
4702 	if (status == B_OK) {
4703 		status = vfs_vnode_io(vnode, cookie, &request);
4704 		if (status == B_OK)
4705 			status = request.Wait();
4706 		*_numBytes = request.TransferredBytes();
4707 	}
4708 
4709 	TPIO(ReadPages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4710 		status, *_numBytes));
4711 
4712 	return status;
4713 }
4714 
4715 
4716 extern "C" status_t
4717 vfs_write_pages(struct vnode* vnode, void* cookie, off_t pos,
4718 	const generic_io_vec* vecs, size_t count, uint32 flags,
4719 	generic_size_t* _numBytes)
4720 {
4721 	FUNCTION(("vfs_write_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4722 		vecs, pos));
4723 
4724 #if VFS_PAGES_IO_TRACING
4725 	generic_size_t bytesRequested = *_numBytes;
4726 #endif
4727 
4728 	IORequest request;
4729 	status_t status = request.Init(pos, vecs, count, *_numBytes, true, flags);
4730 	if (status == B_OK) {
4731 		status = vfs_vnode_io(vnode, cookie, &request);
4732 		if (status == B_OK)
4733 			status = request.Wait();
4734 		*_numBytes = request.TransferredBytes();
4735 	}
4736 
4737 	TPIO(WritePages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4738 		status, *_numBytes));
4739 
4740 	return status;
4741 }
4742 
4743 
4744 /*!	Gets the vnode's VMCache object. If it didn't have one, it will be
4745 	created if \a allocate is \c true.
4746 	In case it's successful, it will also grab a reference to the cache
4747 	it returns.
4748 */
4749 extern "C" status_t
4750 vfs_get_vnode_cache(struct vnode* vnode, VMCache** _cache, bool allocate)
4751 {
4752 	if (vnode->cache != NULL) {
4753 		vnode->cache->AcquireRef();
4754 		*_cache = vnode->cache;
4755 		return B_OK;
4756 	}
4757 
4758 	rw_lock_read_lock(&sVnodeLock);
4759 	vnode->Lock();
4760 
4761 	status_t status = B_OK;
4762 
4763 	// The cache could have been created in the meantime
4764 	if (vnode->cache == NULL) {
4765 		if (allocate) {
4766 			// TODO: actually the vnode needs to be busy already here, or
4767 			//	else this won't work...
4768 			bool wasBusy = vnode->IsBusy();
4769 			vnode->SetBusy(true);
4770 
4771 			vnode->Unlock();
4772 			rw_lock_read_unlock(&sVnodeLock);
4773 
4774 			status = vm_create_vnode_cache(vnode, &vnode->cache);
4775 
4776 			rw_lock_read_lock(&sVnodeLock);
4777 			vnode->Lock();
4778 			vnode->SetBusy(wasBusy);
4779 		} else
4780 			status = B_BAD_VALUE;
4781 	}
4782 
4783 	vnode->Unlock();
4784 	rw_lock_read_unlock(&sVnodeLock);
4785 
4786 	if (status == B_OK) {
4787 		vnode->cache->AcquireRef();
4788 		*_cache = vnode->cache;
4789 	}
4790 
4791 	return status;
4792 }
4793 
4794 
4795 /*!	Sets the vnode's VMCache object, for subsystems that want to manage
4796 	their own.
	On success, a reference to the given cache is acquired (it is then held
	by the vnode).
4799 */
4800 extern "C" status_t
4801 vfs_set_vnode_cache(struct vnode* vnode, VMCache* _cache)
4802 {
4803 	rw_lock_read_lock(&sVnodeLock);
4804 	vnode->Lock();
4805 
4806 	status_t status = B_OK;
4807 	if (vnode->cache != NULL) {
4808 		status = B_NOT_ALLOWED;
4809 	} else {
4810 		vnode->cache = _cache;
4811 		_cache->AcquireRef();
4812 	}
4813 
4814 	vnode->Unlock();
4815 	rw_lock_read_unlock(&sVnodeLock);
4816 	return status;
4817 }
4818 
4819 
4820 status_t
4821 vfs_get_file_map(struct vnode* vnode, off_t offset, size_t size,
4822 	file_io_vec* vecs, size_t* _count)
4823 {
4824 	FUNCTION(("vfs_get_file_map: vnode %p, vecs %p, offset %" B_PRIdOFF
4825 		", size = %" B_PRIuSIZE "\n", vnode, vecs, offset, size));
4826 
4827 	return FS_CALL(vnode, get_file_map, offset, size, vecs, _count);
4828 }
4829 
4830 
4831 status_t
4832 vfs_stat_vnode(struct vnode* vnode, struct stat* stat)
4833 {
4834 	status_t status = FS_CALL(vnode, read_stat, stat);
4835 
4836 	// fill in the st_dev and st_ino fields
4837 	if (status == B_OK) {
4838 		stat->st_dev = vnode->device;
4839 		stat->st_ino = vnode->id;
4840 		// the rdev field must stay unset for non-special files
4841 		if (!S_ISBLK(stat->st_mode) && !S_ISCHR(stat->st_mode))
4842 			stat->st_rdev = -1;
4843 	}
4844 
4845 	return status;
4846 }
4847 
4848 
4849 status_t
4850 vfs_stat_node_ref(dev_t device, ino_t inode, struct stat* stat)
4851 {
4852 	struct vnode* vnode;
4853 	status_t status = get_vnode(device, inode, &vnode, true, false);
4854 	if (status != B_OK)
4855 		return status;
4856 
4857 	status = vfs_stat_vnode(vnode, stat);
4858 
4859 	put_vnode(vnode);
4860 	return status;
4861 }
4862 
4863 
4864 status_t
4865 vfs_get_vnode_name(struct vnode* vnode, char* name, size_t nameSize)
4866 {
4867 	return get_vnode_name(vnode, NULL, name, nameSize, true);
4868 }
4869 
4870 
4871 status_t
4872 vfs_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
4873 	bool kernel, char* path, size_t pathLength)
4874 {
4875 	struct vnode* vnode;
4876 	status_t status;
4877 
4878 	// filter invalid leaf names
4879 	if (leaf != NULL && (leaf[0] == '\0' || strchr(leaf, '/')))
4880 		return B_BAD_VALUE;
4881 
4882 	// get the vnode matching the dir's node_ref
4883 	if (leaf && (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0)) {
4884 		// special cases "." and "..": we can directly get the vnode of the
4885 		// referenced directory
4886 		status = entry_ref_to_vnode(device, inode, leaf, false, kernel, &vnode);
4887 		leaf = NULL;
4888 	} else
4889 		status = get_vnode(device, inode, &vnode, true, false);
4890 	if (status != B_OK)
4891 		return status;
4892 
4893 	// get the directory path
4894 	status = dir_vnode_to_path(vnode, path, pathLength, kernel);
4895 	put_vnode(vnode);
4896 		// we don't need the vnode anymore
4897 	if (status != B_OK)
4898 		return status;
4899 
4900 	// append the leaf name
4901 	if (leaf) {
4902 		// insert a directory separator if this is not the file system root
4903 		if ((strcmp(path, "/") && strlcat(path, "/", pathLength)
4904 				>= pathLength)
4905 			|| strlcat(path, leaf, pathLength) >= pathLength) {
4906 			return B_NAME_TOO_LONG;
4907 		}
4908 	}
4909 
4910 	return B_OK;
4911 }
4912 
4913 
4914 /*!	If the given descriptor locked its vnode, that lock will be released. */
4915 void
4916 vfs_unlock_vnode_if_locked(struct file_descriptor* descriptor)
4917 {
4918 	struct vnode* vnode = fd_vnode(descriptor);
4919 
4920 	if (vnode != NULL && vnode->mandatory_locked_by == descriptor)
4921 		vnode->mandatory_locked_by = NULL;
4922 }
4923 
4924 
4925 /*!	Releases any POSIX locks on the file descriptor. */
4926 status_t
4927 vfs_release_posix_lock(io_context* context, struct file_descriptor* descriptor)
4928 {
4929 	struct vnode* vnode = descriptor->u.vnode;
4930 	if (vnode == NULL)
4931 		return B_OK;
4932 
4933 	if (HAS_FS_CALL(vnode, release_lock))
4934 		return FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
4935 
4936 	return release_advisory_lock(vnode, context, NULL, NULL);
4937 }
4938 
4939 
4940 /*!	Closes all file descriptors of the specified I/O context that
4941 	have the O_CLOEXEC flag set.
4942 */
4943 void
4944 vfs_exec_io_context(io_context* context)
4945 {
4946 	uint32 i;
4947 
4948 	for (i = 0; i < context->table_size; i++) {
4949 		mutex_lock(&context->io_mutex);
4950 
4951 		struct file_descriptor* descriptor = context->fds[i];
4952 		bool remove = false;
4953 
4954 		if (descriptor != NULL && fd_close_on_exec(context, i)) {
4955 			context->fds[i] = NULL;
4956 			context->num_used_fds--;
4957 
4958 			remove = true;
4959 		}
4960 
4961 		mutex_unlock(&context->io_mutex);
4962 
4963 		if (remove) {
4964 			close_fd(context, descriptor);
4965 			put_fd(descriptor);
4966 		}
4967 	}
4968 }
4969 
4970 
/*! Sets up a new io_context structure, and inherits the properties
	of the parent io_context if one is given.
4973 */
4974 io_context*
4975 vfs_new_io_context(io_context* parentContext, bool purgeCloseOnExec)
4976 {
4977 	io_context* context = (io_context*)malloc(sizeof(io_context));
4978 	if (context == NULL)
4979 		return NULL;
4980 
4981 	TIOC(NewIOContext(context, parentContext));
4982 
4983 	memset(context, 0, sizeof(io_context));
4984 	context->ref_count = 1;
4985 
4986 	MutexLocker parentLocker;
4987 
4988 	size_t tableSize;
4989 	if (parentContext != NULL) {
4990 		parentLocker.SetTo(parentContext->io_mutex, false);
4991 		tableSize = parentContext->table_size;
4992 	} else
4993 		tableSize = DEFAULT_FD_TABLE_SIZE;
4994 
4995 	// allocate space for FDs and their close-on-exec flag
4996 	context->fds = (file_descriptor**)malloc(
4997 		sizeof(struct file_descriptor*) * tableSize
4998 		+ sizeof(struct select_info**) * tableSize
4999 		+ (tableSize + 7) / 8);
5000 	if (context->fds == NULL) {
5001 		free(context);
5002 		return NULL;
5003 	}
5004 
5005 	context->select_infos = (select_info**)(context->fds + tableSize);
5006 	context->fds_close_on_exec = (uint8*)(context->select_infos + tableSize);
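
	// Layout of the single allocation made above -- one malloc() serves
	// all three tables:
	//	[ file_descriptor* x tableSize ][ select_info* x tableSize ]
	//	[ close-on-exec bitmap: one bit per FD, (tableSize + 7) / 8 bytes ]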
5007 
5008 	memset(context->fds, 0, sizeof(struct file_descriptor*) * tableSize
5009 		+ sizeof(struct select_info**) * tableSize
5010 		+ (tableSize + 7) / 8);
5011 
5012 	mutex_init(&context->io_mutex, "I/O context");
5013 
5014 	// Copy all parent file descriptors
5015 
5016 	if (parentContext != NULL) {
5017 		size_t i;
5018 
5019 		mutex_lock(&sIOContextRootLock);
5020 		context->root = parentContext->root;
5021 		if (context->root)
5022 			inc_vnode_ref_count(context->root);
5023 		mutex_unlock(&sIOContextRootLock);
5024 
5025 		context->cwd = parentContext->cwd;
5026 		if (context->cwd)
5027 			inc_vnode_ref_count(context->cwd);
5028 
5029 		if (parentContext->inherit_fds) {
5030 			for (i = 0; i < tableSize; i++) {
5031 				struct file_descriptor* descriptor = parentContext->fds[i];
5032 
5033 				if (descriptor != NULL
5034 					&& (descriptor->open_mode & O_DISCONNECTED) == 0) {
5035 					bool closeOnExec = fd_close_on_exec(parentContext, i);
5036 					if (closeOnExec && purgeCloseOnExec)
5037 						continue;
5038 
5039 					TFD(InheritFD(context, i, descriptor, parentContext));
5040 
5041 					context->fds[i] = descriptor;
5042 					context->num_used_fds++;
5043 					atomic_add(&descriptor->ref_count, 1);
5044 					atomic_add(&descriptor->open_count, 1);
5045 
5046 					if (closeOnExec)
5047 						fd_set_close_on_exec(context, i, true);
5048 				}
5049 			}
5050 		}
5051 
5052 		parentLocker.Unlock();
5053 	} else {
5054 		context->root = sRoot;
5055 		context->cwd = sRoot;
5056 
5057 		if (context->root)
5058 			inc_vnode_ref_count(context->root);
5059 
5060 		if (context->cwd)
5061 			inc_vnode_ref_count(context->cwd);
5062 	}
5063 
5064 	context->table_size = tableSize;
5065 	context->inherit_fds = parentContext != NULL;
5066 
5067 	list_init(&context->node_monitors);
5068 	context->max_monitors = DEFAULT_NODE_MONITORS;
5069 
5070 	return context;
5071 }
5072 
5073 
5074 void
5075 vfs_get_io_context(io_context* context)
5076 {
5077 	atomic_add(&context->ref_count, 1);
5078 }
5079 
5080 
5081 void
5082 vfs_put_io_context(io_context* context)
5083 {
5084 	if (atomic_add(&context->ref_count, -1) == 1)
5085 		free_io_context(context);
5086 }
5087 
5088 
5089 status_t
5090 vfs_resize_fd_table(struct io_context* context, uint32 newSize)
5091 {
5092 	if (newSize == 0 || newSize > MAX_FD_TABLE_SIZE)
5093 		return B_BAD_VALUE;
5094 
5095 	TIOC(ResizeIOContext(context, newSize));
5096 
5097 	MutexLocker _(context->io_mutex);
5098 
5099 	uint32 oldSize = context->table_size;
5100 	int oldCloseOnExitBitmapSize = (oldSize + 7) / 8;
5101 	int newCloseOnExitBitmapSize = (newSize + 7) / 8;
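	// one bit per FD: e.g. 128 FDs need a 16 byte bitmap, 129 need 17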
5102 
5103 	// If the tables shrink, make sure none of the fds being dropped are in use.
5104 	if (newSize < oldSize) {
5105 		for (uint32 i = oldSize; i-- > newSize;) {
5106 			if (context->fds[i])
5107 				return B_BUSY;
5108 		}
5109 	}
5110 
5111 	// store pointers to the old tables
5112 	file_descriptor** oldFDs = context->fds;
5113 	select_info** oldSelectInfos = context->select_infos;
5114 	uint8* oldCloseOnExecTable = context->fds_close_on_exec;
5115 
5116 	// allocate new tables
5117 	file_descriptor** newFDs = (file_descriptor**)malloc(
5118 		sizeof(struct file_descriptor*) * newSize
		+ sizeof(struct select_info**) * newSize
5120 		+ newCloseOnExitBitmapSize);
5121 	if (newFDs == NULL)
5122 		return B_NO_MEMORY;
5123 
5124 	context->fds = newFDs;
5125 	context->select_infos = (select_info**)(context->fds + newSize);
5126 	context->fds_close_on_exec = (uint8*)(context->select_infos + newSize);
5127 	context->table_size = newSize;
5128 
5129 	// copy entries from old tables
5130 	uint32 toCopy = min_c(oldSize, newSize);
5131 
5132 	memcpy(context->fds, oldFDs, sizeof(void*) * toCopy);
5133 	memcpy(context->select_infos, oldSelectInfos, sizeof(void*) * toCopy);
5134 	memcpy(context->fds_close_on_exec, oldCloseOnExecTable,
5135 		min_c(oldCloseOnExitBitmapSize, newCloseOnExitBitmapSize));
5136 
5137 	// clear additional entries, if the tables grow
5138 	if (newSize > oldSize) {
5139 		memset(context->fds + oldSize, 0, sizeof(void*) * (newSize - oldSize));
5140 		memset(context->select_infos + oldSize, 0,
5141 			sizeof(void*) * (newSize - oldSize));
5142 		memset(context->fds_close_on_exec + oldCloseOnExitBitmapSize, 0,
5143 			newCloseOnExitBitmapSize - oldCloseOnExitBitmapSize);
5144 	}
5145 
5146 	free(oldFDs);
5147 
5148 	return B_OK;
5149 }
5150 
5151 
5152 /*!	\brief Resolves a vnode to the vnode it is covered by, if any.
5153 
5154 	Given an arbitrary vnode (identified by mount and node ID), the function
	checks whether the vnode is covered by another vnode. If it is, the
5156 	function returns the mount and node ID of the covering vnode. Otherwise
5157 	it simply returns the supplied mount and node ID.
5158 
5159 	In case of error (e.g. the supplied node could not be found) the variables
5160 	for storing the resolved mount and node ID remain untouched and an error
5161 	code is returned.
5162 
5163 	\param mountID The mount ID of the vnode in question.
5164 	\param nodeID The node ID of the vnode in question.
5165 	\param resolvedMountID Pointer to storage for the resolved mount ID.
5166 	\param resolvedNodeID Pointer to storage for the resolved node ID.
5167 	\return
5168 	- \c B_OK, if everything went fine,
5169 	- another error code, if something went wrong.
5170 */
5171 status_t
5172 vfs_resolve_vnode_to_covering_vnode(dev_t mountID, ino_t nodeID,
5173 	dev_t* resolvedMountID, ino_t* resolvedNodeID)
5174 {
5175 	// get the node
5176 	struct vnode* node;
5177 	status_t error = get_vnode(mountID, nodeID, &node, true, false);
5178 	if (error != B_OK)
5179 		return error;
5180 
5181 	// resolve the node
5182 	if (Vnode* coveringNode = get_covering_vnode(node)) {
5183 		put_vnode(node);
5184 		node = coveringNode;
5185 	}
5186 
5187 	// set the return values
5188 	*resolvedMountID = node->device;
5189 	*resolvedNodeID = node->id;
5190 
5191 	put_vnode(node);
5192 
5193 	return B_OK;
5194 }
5195 
5196 
5197 status_t
5198 vfs_get_mount_point(dev_t mountID, dev_t* _mountPointMountID,
5199 	ino_t* _mountPointNodeID)
5200 {
5201 	ReadLocker nodeLocker(sVnodeLock);
5202 	ReadLocker mountLocker(sMountLock);
5203 
5204 	struct fs_mount* mount = find_mount(mountID);
5205 	if (mount == NULL)
5206 		return B_BAD_VALUE;
5207 
5208 	Vnode* mountPoint = mount->covers_vnode;
5209 
5210 	*_mountPointMountID = mountPoint->device;
5211 	*_mountPointNodeID = mountPoint->id;
5212 
5213 	return B_OK;
5214 }
5215 
5216 
5217 status_t
5218 vfs_bind_mount_directory(dev_t mountID, ino_t nodeID, dev_t coveredMountID,
5219 	ino_t coveredNodeID)
5220 {
5221 	// get the vnodes
5222 	Vnode* vnode;
5223 	status_t error = get_vnode(mountID, nodeID, &vnode, true, false);
5224 	if (error != B_OK)
5225 		return B_BAD_VALUE;
5226 	VNodePutter vnodePutter(vnode);
5227 
5228 	Vnode* coveredVnode;
5229 	error = get_vnode(coveredMountID, coveredNodeID, &coveredVnode, true,
5230 		false);
5231 	if (error != B_OK)
5232 		return B_BAD_VALUE;
5233 	VNodePutter coveredVnodePutter(coveredVnode);
5234 
5235 	// establish the covered/covering links
5236 	WriteLocker locker(sVnodeLock);
5237 
5238 	if (vnode->covers != NULL || coveredVnode->covered_by != NULL
5239 		|| vnode->mount->unmounting || coveredVnode->mount->unmounting) {
5240 		return B_BUSY;
5241 	}
5242 
5243 	vnode->covers = coveredVnode;
5244 	vnode->SetCovering(true);
5245 
5246 	coveredVnode->covered_by = vnode;
5247 	coveredVnode->SetCovered(true);
5248 
5249 	// the vnodes do now reference each other
5250 	inc_vnode_ref_count(vnode);
5251 	inc_vnode_ref_count(coveredVnode);
5252 
5253 	return B_OK;
5254 }
5255 
5256 
5257 int
5258 vfs_getrlimit(int resource, struct rlimit* rlp)
5259 {
5260 	if (!rlp)
5261 		return B_BAD_ADDRESS;
5262 
5263 	switch (resource) {
5264 		case RLIMIT_NOFILE:
5265 		{
5266 			struct io_context* context = get_current_io_context(false);
5267 			MutexLocker _(context->io_mutex);
5268 
5269 			rlp->rlim_cur = context->table_size;
5270 			rlp->rlim_max = MAX_FD_TABLE_SIZE;
5271 			return 0;
5272 		}
5273 
5274 		case RLIMIT_NOVMON:
5275 		{
5276 			struct io_context* context = get_current_io_context(false);
5277 			MutexLocker _(context->io_mutex);
5278 
5279 			rlp->rlim_cur = context->max_monitors;
5280 			rlp->rlim_max = MAX_NODE_MONITORS;
5281 			return 0;
5282 		}
5283 
5284 		default:
5285 			return B_BAD_VALUE;
5286 	}
5287 }
5288 
5289 
5290 int
5291 vfs_setrlimit(int resource, const struct rlimit* rlp)
5292 {
5293 	if (!rlp)
5294 		return B_BAD_ADDRESS;
5295 
5296 	switch (resource) {
5297 		case RLIMIT_NOFILE:
5298 			/* TODO: check getuid() */
5299 			if (rlp->rlim_max != RLIM_SAVED_MAX
5300 				&& rlp->rlim_max != MAX_FD_TABLE_SIZE)
5301 				return B_NOT_ALLOWED;
5302 
5303 			return vfs_resize_fd_table(get_current_io_context(false),
5304 				rlp->rlim_cur);
5305 
5306 		case RLIMIT_NOVMON:
5307 			/* TODO: check getuid() */
5308 			if (rlp->rlim_max != RLIM_SAVED_MAX
5309 				&& rlp->rlim_max != MAX_NODE_MONITORS)
5310 				return B_NOT_ALLOWED;
5311 
5312 			return resize_monitor_table(get_current_io_context(false),
5313 				rlp->rlim_cur);
5314 
5315 		default:
5316 			return B_BAD_VALUE;
5317 	}
5318 }
5319 
5320 
5321 status_t
5322 vfs_init(kernel_args* args)
5323 {
5324 	vnode::StaticInit();
5325 
5326 	sVnodeTable = new(std::nothrow) VnodeTable();
5327 	if (sVnodeTable == NULL || sVnodeTable->Init(VNODE_HASH_TABLE_SIZE) != B_OK)
5328 		panic("vfs_init: error creating vnode hash table\n");
5329 
5330 	struct vnode dummy_vnode;
5331 	list_init_etc(&sUnusedVnodeList, offset_of_member(dummy_vnode, unused_link));
5332 
5334 	sMountsTable = new(std::nothrow) MountTable();
5335 	if (sMountsTable == NULL
5336 			|| sMountsTable->Init(MOUNTS_HASH_TABLE_SIZE) != B_OK)
5337 		panic("vfs_init: error creating mounts hash table\n");
5338 
5339 	sPathNameCache = create_object_cache("vfs path names",
5340 		B_PATH_NAME_LENGTH + 1, 8, NULL, NULL, NULL);
5341 	if (sPathNameCache == NULL)
5342 		panic("vfs_init: error creating path name object_cache\n");
5343 
5344 	sVnodeCache = create_object_cache("vfs vnodes",
5345 		sizeof(struct vnode), 8, NULL, NULL, NULL);
5346 	if (sVnodeCache == NULL)
5347 		panic("vfs_init: error creating vnode object_cache\n");
5348 
5349 	sFileDescriptorCache = create_object_cache("vfs fds",
5350 		sizeof(file_descriptor), 8, NULL, NULL, NULL);
5351 	if (sFileDescriptorCache == NULL)
5352 		panic("vfs_init: error creating file descriptor object_cache\n");
5353 
5354 	node_monitor_init();
5355 
5356 	sRoot = NULL;
5357 
5358 	recursive_lock_init(&sMountOpLock, "vfs_mount_op_lock");
5359 
5360 	if (block_cache_init() != B_OK)
5361 		return B_ERROR;
5362 
5363 #ifdef ADD_DEBUGGER_COMMANDS
5364 	// add some debugger commands
5365 	add_debugger_command_etc("vnode", &dump_vnode,
5366 		"Print info about the specified vnode",
5367 		"[ \"-p\" ] ( <vnode> | <devID> <nodeID> )\n"
5368 		"Prints information about the vnode specified by address <vnode> or\n"
5369 		"<devID>, <vnodeID> pair. If \"-p\" is given, a path of the vnode is\n"
5370 		"constructed and printed. It might not be possible to construct a\n"
5371 		"complete path, though.\n",
5372 		0);
5373 	add_debugger_command("vnodes", &dump_vnodes,
5374 		"list all vnodes (from the specified device)");
5375 	add_debugger_command("vnode_caches", &dump_vnode_caches,
5376 		"list all vnode caches");
5377 	add_debugger_command("mount", &dump_mount,
5378 		"info about the specified fs_mount");
5379 	add_debugger_command("mounts", &dump_mounts, "list all fs_mounts");
5380 	add_debugger_command("io_context", &dump_io_context,
5381 		"info about the I/O context");
5382 	add_debugger_command("vnode_usage", &dump_vnode_usage,
5383 		"info about vnode usage");
5384 #endif
5385 
5386 	register_low_resource_handler(&vnode_low_resource_handler, NULL,
5387 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
5388 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
5389 		0);
5390 
5391 	fifo_init();
5392 	file_map_init();
5393 
5394 	return file_cache_init();
5395 }
5396 
5397 
5398 //	#pragma mark - fd_ops implementations
5399 
5400 
5401 /*!
5402 	Calls fs_open() on the given vnode and returns a new
5403 	file descriptor for it
5404 */
5405 static int
5406 open_vnode(struct vnode* vnode, int openMode, bool kernel)
5407 {
5408 	void* cookie;
5409 	status_t status = FS_CALL(vnode, open, openMode, &cookie);
5410 	if (status != B_OK)
5411 		return status;
5412 
5413 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5414 	if (fd < 0) {
5415 		FS_CALL(vnode, close, cookie);
5416 		FS_CALL(vnode, free_cookie, cookie);
5417 	}
5418 	return fd;
5419 }
5420 
5421 
5422 /*!
	Creates an entry \a name in the given \a directory (using the FS's
	create() hook) and returns a new file descriptor for the resulting node.
5425 */
5426 static int
5427 create_vnode(struct vnode* directory, const char* name, int openMode,
5428 	int perms, bool kernel)
5429 {
5430 	bool traverse = ((openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0);
5431 	status_t status = B_ERROR;
5432 	struct vnode* vnode;
5433 	void* cookie;
5434 	ino_t newID;
5435 
5436 	// This is somewhat tricky: If the entry already exists, the FS responsible
5437 	// for the directory might not necessarily also be the one responsible for
5438 	// the node the entry refers to (e.g. in case of mount points or FIFOs). So
5439 	// we can actually never call the create() hook without O_EXCL. Instead we
5440 	// try to look the entry up first. If it already exists, we just open the
5441 	// node (unless O_EXCL), otherwise we call create() with O_EXCL. This
5442 	// introduces a race condition, since someone else might have created the
	// entry in the meantime. We hope the respective FS returns the correct
	// error code, in which case we retry the lookup (up to 3 times).
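	//
	// Race sketch: thread A's lookup fails, thread B creates the entry,
	// then thread A's create(O_EXCL) fails with B_FILE_EXISTS -- the next
	// iteration's lookup finds B's entry and we simply open it (unless the
	// caller itself passed O_EXCL).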
5445 
5446 	for (int i = 0; i < 3 && status != B_OK; i++) {
5447 		// look the node up
5448 		status = lookup_dir_entry(directory, name, &vnode);
5449 		if (status == B_OK) {
5450 			VNodePutter putter(vnode);
5451 
5452 			if ((openMode & O_EXCL) != 0)
5453 				return B_FILE_EXISTS;
5454 
5455 			// If the node is a symlink, we have to follow it, unless
5456 			// O_NOTRAVERSE is set.
5457 			if (S_ISLNK(vnode->Type()) && traverse) {
5458 				putter.Put();
5459 				char clonedName[B_FILE_NAME_LENGTH + 1];
5460 				if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH)
5461 						>= B_FILE_NAME_LENGTH) {
5462 					return B_NAME_TOO_LONG;
5463 				}
5464 
5465 				inc_vnode_ref_count(directory);
5466 				status = vnode_path_to_vnode(directory, clonedName, true, 0,
5467 					kernel, &vnode, NULL);
5468 				if (status != B_OK)
5469 					return status;
5470 
5471 				putter.SetTo(vnode);
5472 			}
5473 
5474 			if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5475 				return B_LINK_LIMIT;
5476 
5477 			int fd = open_vnode(vnode, openMode & ~O_CREAT, kernel);
5478 			// on success keep the vnode reference for the FD
5479 			if (fd >= 0)
5480 				putter.Detach();
5481 
5482 			return fd;
5483 		}
5484 
5485 		// it doesn't exist yet -- try to create it
5486 
5487 		if (!HAS_FS_CALL(directory, create))
5488 			return B_READ_ONLY_DEVICE;
5489 
5490 		status = FS_CALL(directory, create, name, openMode | O_EXCL, perms,
5491 			&cookie, &newID);
5492 		if (status != B_OK
5493 			&& ((openMode & O_EXCL) != 0 || status != B_FILE_EXISTS)) {
5494 			return status;
5495 		}
5496 	}
5497 
5498 	if (status != B_OK)
5499 		return status;
5500 
5501 	// the node has been created successfully
5502 
5503 	rw_lock_read_lock(&sVnodeLock);
5504 	vnode = lookup_vnode(directory->device, newID);
5505 	rw_lock_read_unlock(&sVnodeLock);
5506 
5507 	if (vnode == NULL) {
5508 		panic("vfs: fs_create() returned success but there is no vnode, "
5509 			"mount ID %" B_PRIdDEV "!\n", directory->device);
5510 		return B_BAD_VALUE;
5511 	}
5512 
5513 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5514 	if (fd >= 0)
5515 		return fd;
5516 
5517 	status = fd;
5518 
5519 	// something went wrong, clean up
5520 
5521 	FS_CALL(vnode, close, cookie);
5522 	FS_CALL(vnode, free_cookie, cookie);
5523 	put_vnode(vnode);
5524 
5525 	FS_CALL(directory, unlink, name);
5526 
5527 	return status;
5528 }
5529 
5530 
5531 /*! Calls fs open_dir() on the given vnode and returns a new
5532 	file descriptor for it
5533 */
5534 static int
5535 open_dir_vnode(struct vnode* vnode, bool kernel)
5536 {
5537 	if (!HAS_FS_CALL(vnode, open_dir))
5538 		return B_UNSUPPORTED;
5539 
5540 	void* cookie;
5541 	status_t status = FS_CALL(vnode, open_dir, &cookie);
5542 	if (status != B_OK)
5543 		return status;
5544 
5545 	// directory is opened, create a fd
5546 	status = get_new_fd(FDTYPE_DIR, NULL, vnode, cookie, O_CLOEXEC, kernel);
5547 	if (status >= 0)
5548 		return status;
5549 
5550 	FS_CALL(vnode, close_dir, cookie);
5551 	FS_CALL(vnode, free_dir_cookie, cookie);
5552 
5553 	return status;
5554 }
5555 
5556 
5557 /*! Calls fs open_attr_dir() on the given vnode and returns a new
5558 	file descriptor for it.
5559 	Used by attr_dir_open(), and attr_dir_open_fd().
5560 */
5561 static int
5562 open_attr_dir_vnode(struct vnode* vnode, bool kernel)
5563 {
5564 	if (!HAS_FS_CALL(vnode, open_attr_dir))
5565 		return B_UNSUPPORTED;
5566 
5567 	void* cookie;
5568 	status_t status = FS_CALL(vnode, open_attr_dir, &cookie);
5569 	if (status != B_OK)
5570 		return status;
5571 
5572 	// directory is opened, create a fd
5573 	status = get_new_fd(FDTYPE_ATTR_DIR, NULL, vnode, cookie, O_CLOEXEC,
5574 		kernel);
5575 	if (status >= 0)
5576 		return status;
5577 
5578 	FS_CALL(vnode, close_attr_dir, cookie);
5579 	FS_CALL(vnode, free_attr_dir_cookie, cookie);
5580 
5581 	return status;
5582 }
5583 
5584 
5585 static int
5586 file_create_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5587 	int openMode, int perms, bool kernel)
5588 {
5589 	FUNCTION(("file_create_entry_ref: name = '%s', omode %x, perms %d, "
5590 		"kernel %d\n", name, openMode, perms, kernel));
5591 
5592 	// get directory to put the new file in
5593 	struct vnode* directory;
5594 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
5595 	if (status != B_OK)
5596 		return status;
5597 
5598 	status = create_vnode(directory, name, openMode, perms, kernel);
5599 	put_vnode(directory);
5600 
5601 	return status;
5602 }
5603 
5604 
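/*!	Creates and opens a file at the path relative to the given FD.
	\return the new FD on success, an error code otherwise.
*/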
5605 static int
5606 file_create(int fd, char* path, int openMode, int perms, bool kernel)
5607 {
5608 	FUNCTION(("file_create: path '%s', omode %x, perms %d, kernel %d\n", path,
5609 		openMode, perms, kernel));
5610 
5611 	// get directory to put the new file in
5612 	char name[B_FILE_NAME_LENGTH];
5613 	struct vnode* directory;
5614 	status_t status = fd_and_path_to_dir_vnode(fd, path, &directory, name,
5615 		kernel);
5616 	if (status < 0)
5617 		return status;
5618 
5619 	status = create_vnode(directory, name, openMode, perms, kernel);
5620 
5621 	put_vnode(directory);
5622 	return status;
5623 }
5624 
5625 
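/*!	Opens the file at the given entry_ref. Symlinks are traversed
	unless O_NOTRAVERSE or O_NOFOLLOW is given; with O_NOFOLLOW a
	symlink leaf fails with B_LINK_LIMIT.
	\return the new FD on success, an error code otherwise.
*/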
5626 static int
5627 file_open_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5628 	int openMode, bool kernel)
5629 {
5630 	if (name == NULL || *name == '\0')
5631 		return B_BAD_VALUE;
5632 
5633 	FUNCTION(("file_open_entry_ref(ref = (%" B_PRId32 ", %" B_PRId64 ", %s), "
5634 		"openMode = %d)\n", mountID, directoryID, name, openMode));
5635 
5636 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5637 
5638 	// get the vnode matching the entry_ref
5639 	struct vnode* vnode;
5640 	status_t status = entry_ref_to_vnode(mountID, directoryID, name, traverse,
5641 		kernel, &vnode);
5642 	if (status != B_OK)
5643 		return status;
5644 
5645 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5646 		put_vnode(vnode);
5647 		return B_LINK_LIMIT;
5648 	}
5649 
5650 	int newFD = open_vnode(vnode, openMode, kernel);
5651 	if (newFD >= 0) {
5652 		// The vnode reference has been transferred to the FD
5653 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache, mountID,
5654 			directoryID, vnode->id, name);
5655 	} else
5656 		put_vnode(vnode);
5657 
5658 	return newFD;
5659 }
5660 
5661 
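/*!	Opens the file at the path relative to the given FD, with the
	same symlink semantics as file_open_entry_ref().
	\return the new FD on success, an error code otherwise.
*/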
5662 static int
5663 file_open(int fd, char* path, int openMode, bool kernel)
5664 {
5665 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5666 
5667 	FUNCTION(("file_open: fd: %d, entry path = '%s', omode %d, kernel %d\n",
5668 		fd, path, openMode, kernel));
5669 
5670 	// get the vnode matching the vnode + path combination
5671 	struct vnode* vnode;
5672 	ino_t parentID;
5673 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode,
5674 		&parentID, kernel);
5675 	if (status != B_OK)
5676 		return status;
5677 
5678 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5679 		put_vnode(vnode);
5680 		return B_LINK_LIMIT;
5681 	}
5682 
5683 	// open the vnode
5684 	int newFD = open_vnode(vnode, openMode, kernel);
5685 	if (newFD >= 0) {
5686 		// The vnode reference has been transferred to the FD
5687 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache,
5688 			vnode->device, parentID, vnode->id, NULL);
5689 	} else
5690 		put_vnode(vnode);
5691 
5692 	return newFD;
5693 }
5694 
5695 
5696 static status_t
5697 file_close(struct file_descriptor* descriptor)
5698 {
5699 	struct vnode* vnode = descriptor->u.vnode;
5700 	status_t status = B_OK;
5701 
5702 	FUNCTION(("file_close(descriptor = %p)\n", descriptor));
5703 
5704 	cache_node_closed(vnode, FDTYPE_FILE, vnode->cache, vnode->device,
5705 		vnode->id);
5706 	if (HAS_FS_CALL(vnode, close)) {
5707 		status = FS_CALL(vnode, close, descriptor->cookie);
5708 	}
5709 
5710 	if (status == B_OK) {
5711 		// remove all outstanding locks for this team
5712 		if (HAS_FS_CALL(vnode, release_lock))
5713 			status = FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
5714 		else
5715 			status = release_advisory_lock(vnode, NULL, descriptor, NULL);
5716 	}
5717 	return status;
5718 }
5719 
5720 
5721 static void
5722 file_free_fd(struct file_descriptor* descriptor)
5723 {
5724 	struct vnode* vnode = descriptor->u.vnode;
5725 
5726 	if (vnode != NULL) {
5727 		FS_CALL(vnode, free_cookie, descriptor->cookie);
5728 		put_vnode(vnode);
5729 	}
5730 }
5731 
5732 
5733 static status_t
5734 file_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
5735 	size_t* length)
5736 {
5737 	struct vnode* vnode = descriptor->u.vnode;
5738 	FUNCTION(("file_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
5739 		pos, length, *length));
5740 
5741 	if (S_ISDIR(vnode->Type()))
5742 		return B_IS_A_DIRECTORY;
5743 
5744 	return FS_CALL(vnode, read, descriptor->cookie, pos, buffer, length);
5745 }
5746 
5747 
5748 static status_t
5749 file_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
5750 	size_t* length)
5751 {
5752 	struct vnode* vnode = descriptor->u.vnode;
5753 	FUNCTION(("file_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
5754 		length));
5755 
5756 	if (S_ISDIR(vnode->Type()))
5757 		return B_IS_A_DIRECTORY;
5758 	if (!HAS_FS_CALL(vnode, write))
5759 		return B_READ_ONLY_DEVICE;
5760 
5761 	return FS_CALL(vnode, write, descriptor->cookie, pos, buffer, length);
5762 }
5763 
5764 
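/*!	Computes the new file position for the lseek() family. FIFOs and
	sockets are not seekable (ESPIPE). SEEK_DATA/SEEK_HOLE are first
	tried as FIOSEEKDATA/FIOSEEKHOLE ioctls and otherwise emulated via
	stat(); for devices reporting a size of 0, SEEK_END falls back to
	the size computed from B_GET_GEOMETRY.
*/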
5765 static off_t
5766 file_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
5767 {
5768 	struct vnode* vnode = descriptor->u.vnode;
5769 	off_t offset;
5770 	bool isDevice = false;
5771 
5772 	FUNCTION(("file_seek(pos = %" B_PRIdOFF ", seekType = %d)\n", pos,
5773 		seekType));
5774 
5775 	// some kinds of files are not seekable
5776 	switch (vnode->Type() & S_IFMT) {
5777 		case S_IFIFO:
5778 		case S_IFSOCK:
5779 			return ESPIPE;
5780 
5781 		// drivers publish block devices as character devices, so check both
5782 		case S_IFBLK:
5783 		case S_IFCHR:
5784 			isDevice = true;
5785 			break;
5786 		// The Open Group Base Specs don't single out any file types besides
5787 		// pipes, FIFOs, and sockets, so we allow seeking all other types.
5788 		case S_IFREG:
5789 		case S_IFDIR:
5790 		case S_IFLNK:
5791 			break;
5792 	}
5793 
5794 	switch (seekType) {
5795 		case SEEK_SET:
5796 			offset = 0;
5797 			break;
5798 		case SEEK_CUR:
5799 			offset = descriptor->pos;
5800 			break;
5801 		case SEEK_END:
5802 		{
5803 			// stat() the node
5804 			if (!HAS_FS_CALL(vnode, read_stat))
5805 				return B_UNSUPPORTED;
5806 
5807 			struct stat stat;
5808 			status_t status = FS_CALL(vnode, read_stat, &stat);
5809 			if (status != B_OK)
5810 				return status;
5811 
5812 			offset = stat.st_size;
5813 
5814 			if (offset == 0 && isDevice) {
5815 				// stat() on drivers doesn't report a size; compute it from the geometry
5816 				device_geometry geometry;
5817 
5818 				if (HAS_FS_CALL(vnode, ioctl)) {
5819 					status = FS_CALL(vnode, ioctl, descriptor->cookie,
5820 						B_GET_GEOMETRY, &geometry, sizeof(geometry));
5821 					if (status == B_OK)
5822 						offset = (off_t)geometry.bytes_per_sector
5823 							* geometry.sectors_per_track
5824 							* geometry.cylinder_count
5825 							* geometry.head_count;
5826 				}
5827 			}
5828 
5829 			break;
5830 		}
5831 		case SEEK_DATA:
5832 		case SEEK_HOLE:
5833 		{
5834 			status_t status = B_BAD_VALUE;
5835 			if (HAS_FS_CALL(vnode, ioctl)) {
5836 				offset = pos;
5837 				status = FS_CALL(vnode, ioctl, descriptor->cookie,
5838 					seekType == SEEK_DATA ? FIOSEEKDATA : FIOSEEKHOLE,
5839 					&offset, sizeof(offset));
5840 				if (status == B_OK) {
5841 					if (offset > pos)
5842 						offset -= pos;
5843 					break;
5844 				}
5845 			}
5846 			if (status != B_BAD_VALUE && status != B_DEV_INVALID_IOCTL)
5847 				return status;
5848 
5849 			// fall back to a basic implementation using stat()
5850 			if (!HAS_FS_CALL(vnode, read_stat) || isDevice)
5851 				return B_BAD_VALUE;
5852 
5853 			struct stat stat;
5854 			status = FS_CALL(vnode, read_stat, &stat);
5855 			if (status != B_OK)
5856 				return status;
5857 
5858 			off_t end = stat.st_size;
5859 			if (pos >= end)
5860 				return ENXIO;
5861 			offset = seekType == SEEK_HOLE ? end - pos : 0;
5862 			break;
5863 		}
5864 		default:
5865 			return B_BAD_VALUE;
5866 	}
5867 
5868 	// assumes off_t is 64 bits wide
5869 	if (offset > 0 && LONGLONG_MAX - offset < pos)
5870 		return B_BUFFER_OVERFLOW;
5871 
5872 	pos += offset;
5873 	if (pos < 0)
5874 		return B_BAD_VALUE;
5875 
5876 	return descriptor->pos = pos;
5877 }
5878 
5879 
5880 static status_t
5881 file_select(struct file_descriptor* descriptor, uint8 event,
5882 	struct selectsync* sync)
5883 {
5884 	FUNCTION(("file_select(%p, %u, %p)\n", descriptor, event, sync));
5885 
5886 	struct vnode* vnode = descriptor->u.vnode;
5887 
5888 	// If the FS has no select() hook, notify select() now.
5889 	if (!HAS_FS_CALL(vnode, select)) {
5890 		if (!SELECT_TYPE_IS_OUTPUT_ONLY(event))
5891 			return notify_select_event(sync, event);
5892 		else
5893 			return B_OK;
5894 	}
5895 
5896 	return FS_CALL(vnode, select, descriptor->cookie, event, sync);
5897 }
5898 
5899 
5900 static status_t
5901 file_deselect(struct file_descriptor* descriptor, uint8 event,
5902 	struct selectsync* sync)
5903 {
5904 	struct vnode* vnode = descriptor->u.vnode;
5905 
5906 	if (!HAS_FS_CALL(vnode, deselect))
5907 		return B_OK;
5908 
5909 	return FS_CALL(vnode, deselect, descriptor->cookie, event, sync);
5910 }
5911 
5912 
5913 static status_t
5914 dir_create_entry_ref(dev_t mountID, ino_t parentID, const char* name, int perms,
5915 	bool kernel)
5916 {
5917 	struct vnode* vnode;
5918 	status_t status;
5919 
5920 	if (name == NULL || *name == '\0')
5921 		return B_BAD_VALUE;
5922 
5923 	FUNCTION(("dir_create_entry_ref(dev = %" B_PRId32 ", ino = %" B_PRId64 ", "
5924 		"name = '%s', perms = %d)\n", mountID, parentID, name, perms));
5925 
5926 	status = get_vnode(mountID, parentID, &vnode, true, false);
5927 	if (status != B_OK)
5928 		return status;
5929 
5930 	if (HAS_FS_CALL(vnode, create_dir))
5931 		status = FS_CALL(vnode, create_dir, name, perms);
5932 	else
5933 		status = B_READ_ONLY_DEVICE;
5934 
5935 	put_vnode(vnode);
5936 	return status;
5937 }
5938 
5939 
5940 static status_t
5941 dir_create(int fd, char* path, int perms, bool kernel)
5942 {
5943 	char filename[B_FILE_NAME_LENGTH];
5944 	struct vnode* vnode;
5945 	status_t status;
5946 
5947 	FUNCTION(("dir_create: path '%s', perms %d, kernel %d\n", path, perms,
5948 		kernel));
5949 
5950 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
5951 	if (status < 0)
5952 		return status;
5953 
5954 	if (HAS_FS_CALL(vnode, create_dir)) {
5955 		status = FS_CALL(vnode, create_dir, filename, perms);
5956 	} else
5957 		status = B_READ_ONLY_DEVICE;
5958 
5959 	put_vnode(vnode);
5960 	return status;
5961 }
5962 
5963 
5964 static int
5965 dir_open_entry_ref(dev_t mountID, ino_t parentID, const char* name, bool kernel)
5966 {
5967 	FUNCTION(("dir_open_entry_ref()\n"));
5968 
5969 	if (name && name[0] == '\0')
5970 		return B_BAD_VALUE;
5971 
5972 	// get the vnode matching the entry_ref/node_ref
5973 	struct vnode* vnode;
5974 	status_t status;
5975 	if (name) {
5976 		status = entry_ref_to_vnode(mountID, parentID, name, true, kernel,
5977 			&vnode);
5978 	} else
5979 		status = get_vnode(mountID, parentID, &vnode, true, false);
5980 	if (status != B_OK)
5981 		return status;
5982 
5983 	int newFD = open_dir_vnode(vnode, kernel);
5984 	if (newFD >= 0) {
5985 		// The vnode reference has been transferred to the FD
5986 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, mountID, parentID,
5987 			vnode->id, name);
5988 	} else
5989 		put_vnode(vnode);
5990 
5991 	return newFD;
5992 }
5993 
5994 
5995 static int
5996 dir_open(int fd, char* path, bool kernel)
5997 {
5998 	FUNCTION(("dir_open: fd: %d, entry path = '%s', kernel %d\n", fd, path,
5999 		kernel));
6000 
6001 	// get the vnode matching the vnode + path combination
6002 	struct vnode* vnode = NULL;
6003 	ino_t parentID;
6004 	status_t status = fd_and_path_to_vnode(fd, path, true, &vnode, &parentID,
6005 		kernel);
6006 	if (status != B_OK)
6007 		return status;
6008 
6009 	// open the dir
6010 	int newFD = open_dir_vnode(vnode, kernel);
6011 	if (newFD >= 0) {
6012 		// The vnode reference has been transferred to the FD
6013 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
6014 			parentID, vnode->id, NULL);
6015 	} else
6016 		put_vnode(vnode);
6017 
6018 	return newFD;
6019 }
6020 
6021 
6022 static status_t
6023 dir_close(struct file_descriptor* descriptor)
6024 {
6025 	struct vnode* vnode = descriptor->u.vnode;
6026 
6027 	FUNCTION(("dir_close(descriptor = %p)\n", descriptor));
6028 
6029 	cache_node_closed(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
6030 		vnode->id);
6031 	if (HAS_FS_CALL(vnode, close_dir))
6032 		return FS_CALL(vnode, close_dir, descriptor->cookie);
6033 
6034 	return B_OK;
6035 }
6036 
6037 
6038 static void
6039 dir_free_fd(struct file_descriptor* descriptor)
6040 {
6041 	struct vnode* vnode = descriptor->u.vnode;
6042 
6043 	if (vnode != NULL) {
6044 		FS_CALL(vnode, free_dir_cookie, descriptor->cookie);
6045 		put_vnode(vnode);
6046 	}
6047 }
6048 
6049 
6050 static status_t
6051 dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6052 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6053 {
6054 	return dir_read(ioContext, descriptor->u.vnode, descriptor->cookie, buffer,
6055 		bufferSize, _count);
6056 }
6057 
6058 
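/*!	Fixes up a dirent as returned by the FS: sets d_pdev/d_pino to the
	parent's IDs, resolves the ".." entry of a covering directory via
	the covered hierarchy, and replaces the IDs of covered vnodes with
	those of the topmost vnode covering them.
*/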
6059 static status_t
6060 fix_dirent(struct vnode* parent, struct dirent* entry,
6061 	struct io_context* ioContext)
6062 {
6063 	// set d_pdev and d_pino
6064 	entry->d_pdev = parent->device;
6065 	entry->d_pino = parent->id;
6066 
6067 	// If this is the ".." entry and the directory is covering another vnode,
6068 	// we need to replace d_dev and d_ino with the actual values.
6069 	if (strcmp(entry->d_name, "..") == 0 && parent->IsCovering()) {
6070 		return resolve_covered_parent(parent, &entry->d_dev, &entry->d_ino,
6071 			ioContext);
6072 	}
6073 
6074 	// resolve covered vnodes
6075 	ReadLocker _(&sVnodeLock);
6076 
6077 	struct vnode* vnode = lookup_vnode(entry->d_dev, entry->d_ino);
6078 	if (vnode != NULL && vnode->covered_by != NULL) {
6079 		do {
6080 			vnode = vnode->covered_by;
6081 		} while (vnode->covered_by != NULL);
6082 
6083 		entry->d_dev = vnode->device;
6084 		entry->d_ino = vnode->id;
6085 	}
6086 
6087 	return B_OK;
6088 }
6089 
6090 
6091 static status_t
6092 dir_read(struct io_context* ioContext, struct vnode* vnode, void* cookie,
6093 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6094 {
6095 	if (!HAS_FS_CALL(vnode, read_dir))
6096 		return B_UNSUPPORTED;
6097 
6098 	status_t error = FS_CALL(vnode, read_dir, cookie, buffer, bufferSize,
6099 		_count);
6100 	if (error != B_OK)
6101 		return error;
6102 
6103 	// we need to adjust the read dirents
6104 	uint32 count = *_count;
6105 	for (uint32 i = 0; i < count; i++) {
6106 		error = fix_dirent(vnode, buffer, ioContext);
6107 		if (error != B_OK)
6108 			return error;
6109 
6110 		buffer = (struct dirent*)((uint8*)buffer + buffer->d_reclen);
6111 	}
6112 
6113 	return error;
6114 }
6115 
6116 
6117 static status_t
6118 dir_rewind(struct file_descriptor* descriptor)
6119 {
6120 	struct vnode* vnode = descriptor->u.vnode;
6121 
6122 	if (HAS_FS_CALL(vnode, rewind_dir)) {
6123 		return FS_CALL(vnode, rewind_dir, descriptor->cookie);
6124 	}
6125 
6126 	return B_UNSUPPORTED;
6127 }
6128 
6129 
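/*!	Removes the directory at the given FD/path pair. Trailing slashes
	and "." components are stripped off first; removing "." or ".."
	itself is refused with B_NOT_ALLOWED.
*/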
6130 static status_t
6131 dir_remove(int fd, char* path, bool kernel)
6132 {
6133 	char name[B_FILE_NAME_LENGTH];
6134 	struct vnode* directory;
6135 	status_t status;
6136 
6137 	if (path != NULL) {
6138 		// we need to make sure our path name doesn't end with "/", ".",
6139 		// or ".."
6140 		char* lastSlash;
6141 		while ((lastSlash = strrchr(path, '/')) != NULL) {
6142 			char* leaf = lastSlash + 1;
6143 			if (!strcmp(leaf, ".."))
6144 				return B_NOT_ALLOWED;
6145 
6146 			// skip over multiple consecutive slashes
6147 			while (lastSlash > path && lastSlash[-1] == '/')
6148 				lastSlash--;
6149 
6150 			if (leaf[0] != '\0'
6151 				&& strcmp(leaf, ".") != 0) {
6152 				break;
6153 			}
6154 			// "name/" -> "name", or "name/." -> "name"
6155 			lastSlash[0] = '\0';
6156 		}
6157 
6158 		if (!strcmp(path, ".") || !strcmp(path, ".."))
6159 			return B_NOT_ALLOWED;
6160 	}
6161 
6162 	status = fd_and_path_to_dir_vnode(fd, path, &directory, name, kernel);
6163 	if (status != B_OK)
6164 		return status;
6165 
6166 	if (HAS_FS_CALL(directory, remove_dir))
6167 		status = FS_CALL(directory, remove_dir, name);
6168 	else
6169 		status = B_READ_ONLY_DEVICE;
6170 
6171 	put_vnode(directory);
6172 	return status;
6173 }
6174 
6175 
6176 static status_t
6177 common_ioctl(struct file_descriptor* descriptor, ulong op, void* buffer,
6178 	size_t length)
6179 {
6180 	struct vnode* vnode = descriptor->u.vnode;
6181 
6182 	if (HAS_FS_CALL(vnode, ioctl))
6183 		return FS_CALL(vnode, ioctl, descriptor->cookie, op, buffer, length);
6184 
6185 	return B_DEV_INVALID_IOCTL;
6186 }
6187 
6188 
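/*!	Back end for fcntl(): handles FD flags, open mode changes, FD
	duplication, and POSIX advisory locking. For the locking ops the
	flock argument is copied in first (via user_memcpy() for userland
	callers) before being normalized.
*/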
6189 static status_t
6190 common_fcntl(int fd, int op, size_t argument, bool kernel)
6191 {
6192 	struct flock flock;
6193 
6194 	FUNCTION(("common_fcntl(fd = %d, op = %d, argument = %lx, %s)\n",
6195 		fd, op, argument, kernel ? "kernel" : "user"));
6196 
6197 	struct io_context* context = get_current_io_context(kernel);
6198 
6199 	struct file_descriptor* descriptor = get_fd(context, fd);
6200 	if (descriptor == NULL)
6201 		return B_FILE_ERROR;
6202 
6203 	struct vnode* vnode = fd_vnode(descriptor);
6204 
6205 	status_t status = B_OK;
6206 
6207 	if (op == F_SETLK || op == F_SETLKW || op == F_GETLK) {
6208 		if (descriptor->type != FDTYPE_FILE)
6209 			status = B_BAD_VALUE;
6210 		else if (kernel)
6211 			memcpy(&flock, (struct flock*)argument, sizeof(struct flock));
6212 		else if (user_memcpy(&flock, (struct flock*)argument,
6213 				sizeof(struct flock)) != B_OK)
6214 			status = B_BAD_ADDRESS;
6215 		if (status != B_OK) {
6216 			put_fd(descriptor);
6217 			return status;
6218 		}
6219 	}
6220 
6221 	switch (op) {
6222 		case F_SETFD:
6223 		{
6224 			// Set file descriptor flags
6225 
6226 			// O_CLOEXEC is the only flag available at this time
6227 			mutex_lock(&context->io_mutex);
6228 			fd_set_close_on_exec(context, fd, (argument & FD_CLOEXEC) != 0);
6229 			mutex_unlock(&context->io_mutex);
6230 
6231 			status = B_OK;
6232 			break;
6233 		}
6234 
6235 		case F_GETFD:
6236 		{
6237 			// Get file descriptor flags
6238 			mutex_lock(&context->io_mutex);
6239 			status = fd_close_on_exec(context, fd) ? FD_CLOEXEC : 0;
6240 			mutex_unlock(&context->io_mutex);
6241 			break;
6242 		}
6243 
6244 		case F_SETFL:
6245 			// Set file descriptor open mode
6246 
6247 			// we only accept changes to O_APPEND and O_NONBLOCK
6248 			argument &= O_APPEND | O_NONBLOCK;
6249 			if (descriptor->ops->fd_set_flags != NULL) {
6250 				status = descriptor->ops->fd_set_flags(descriptor, argument);
6251 			} else if (vnode != NULL && HAS_FS_CALL(vnode, set_flags)) {
6252 				status = FS_CALL(vnode, set_flags, descriptor->cookie,
6253 					(int)argument);
6254 			} else
6255 				status = B_UNSUPPORTED;
6256 
6257 			if (status == B_OK) {
6258 				// update this descriptor's open_mode field
6259 				descriptor->open_mode = (descriptor->open_mode
6260 					& ~(O_APPEND | O_NONBLOCK)) | argument;
6261 			}
6262 
6263 			break;
6264 
6265 		case F_GETFL:
6266 			// Get file descriptor open mode
6267 			status = descriptor->open_mode;
6268 			break;
6269 
6270 		case F_DUPFD:
6271 		case F_DUPFD_CLOEXEC:
6272 		{
6273 			status = new_fd_etc(context, descriptor, (int)argument);
6274 			if (status >= 0) {
6275 				mutex_lock(&context->io_mutex);
6276 				fd_set_close_on_exec(context, status, op == F_DUPFD_CLOEXEC);
6277 				mutex_unlock(&context->io_mutex);
6278 
6279 				atomic_add(&descriptor->ref_count, 1);
6280 			}
6281 			break;
6282 		}
6283 
6284 		case F_GETLK:
6285 			if (vnode != NULL) {
6286 				struct flock normalizedLock;
6287 
6288 				memcpy(&normalizedLock, &flock, sizeof(struct flock));
6289 				status = normalize_flock(descriptor, &normalizedLock);
6290 				if (status != B_OK)
6291 					break;
6292 
6293 				if (HAS_FS_CALL(vnode, test_lock)) {
6294 					status = FS_CALL(vnode, test_lock, descriptor->cookie,
6295 						&normalizedLock);
6296 				} else
6297 					status = test_advisory_lock(vnode, &normalizedLock);
6298 				if (status == B_OK) {
6299 					if (normalizedLock.l_type == F_UNLCK) {
6300 						// no conflicting lock found, copy back the same struct
6301 						// we were given except change type to F_UNLCK
6302 						flock.l_type = F_UNLCK;
6303 						if (kernel) {
6304 							memcpy((struct flock*)argument, &flock,
6305 								sizeof(struct flock));
6306 						} else {
6307 							status = user_memcpy((struct flock*)argument,
6308 								&flock, sizeof(struct flock));
6309 						}
6310 					} else {
6311 						// a conflicting lock was found, copy back its range and
6312 						// type
6313 						if (normalizedLock.l_len == OFF_MAX)
6314 							normalizedLock.l_len = 0;
6315 
6316 						if (kernel) {
6317 							memcpy((struct flock*)argument,
6318 								&normalizedLock, sizeof(struct flock));
6319 						} else {
6320 							status = user_memcpy((struct flock*)argument,
6321 								&normalizedLock, sizeof(struct flock));
6322 						}
6323 					}
6324 				}
6325 			} else
6326 				status = B_BAD_VALUE;
6327 			break;
6328 
6329 		case F_SETLK:
6330 		case F_SETLKW:
6331 			status = normalize_flock(descriptor, &flock);
6332 			if (status != B_OK)
6333 				break;
6334 
6335 			if (vnode == NULL) {
6336 				status = B_BAD_VALUE;
6337 			} else if (flock.l_type == F_UNLCK) {
6338 				if (HAS_FS_CALL(vnode, release_lock)) {
6339 					status = FS_CALL(vnode, release_lock, descriptor->cookie,
6340 						&flock);
6341 				} else {
6342 					status = release_advisory_lock(vnode, context, NULL,
6343 						&flock);
6344 				}
6345 			} else {
6346 				// the open mode must match the lock type
6347 				if (((descriptor->open_mode & O_RWMASK) == O_RDONLY
6348 						&& flock.l_type == F_WRLCK)
6349 					|| ((descriptor->open_mode & O_RWMASK) == O_WRONLY
6350 						&& flock.l_type == F_RDLCK))
6351 					status = B_FILE_ERROR;
6352 				else {
6353 					if (HAS_FS_CALL(vnode, acquire_lock)) {
6354 						status = FS_CALL(vnode, acquire_lock,
6355 							descriptor->cookie, &flock, op == F_SETLKW);
6356 					} else {
6357 						status = acquire_advisory_lock(vnode, context, NULL,
6358 							&flock, op == F_SETLKW);
6359 					}
6360 				}
6361 			}
6362 			break;
6363 
6364 		// ToDo: add support for more ops?
6365 
6366 		default:
6367 			status = B_BAD_VALUE;
6368 	}
6369 
6370 	put_fd(descriptor);
6371 	return status;
6372 }
6373 
6374 
6375 static status_t
6376 common_sync(int fd, bool kernel)
6377 {
6378 	struct file_descriptor* descriptor;
6379 	struct vnode* vnode;
6380 	status_t status;
6381 
6382 	FUNCTION(("common_sync: entry. fd %d kernel %d\n", fd, kernel));
6383 
6384 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6385 	if (descriptor == NULL)
6386 		return B_FILE_ERROR;
6387 
6388 	if (HAS_FS_CALL(vnode, fsync))
6389 		status = FS_CALL_NO_PARAMS(vnode, fsync);
6390 	else
6391 		status = B_UNSUPPORTED;
6392 
6393 	put_fd(descriptor);
6394 	return status;
6395 }
6396 
6397 
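/*!	Locks the node referred to by the FD for this descriptor. The
	check-and-set is done atomically, since another descriptor might
	attempt to lock the node at the same time.
*/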
6398 static status_t
6399 common_lock_node(int fd, bool kernel)
6400 {
6401 	struct file_descriptor* descriptor;
6402 	struct vnode* vnode;
6403 
6404 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6405 	if (descriptor == NULL)
6406 		return B_FILE_ERROR;
6407 
6408 	status_t status = B_OK;
6409 
6410 	// We need to set the locking atomically - someone
6411 	// else might set one at the same time
6412 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by, descriptor,
6413 			(file_descriptor*)NULL) != NULL)
6414 		status = B_BUSY;
6415 
6416 	put_fd(descriptor);
6417 	return status;
6418 }
6419 
6420 
6421 static status_t
6422 common_unlock_node(int fd, bool kernel)
6423 {
6424 	struct file_descriptor* descriptor;
6425 	struct vnode* vnode;
6426 
6427 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6428 	if (descriptor == NULL)
6429 		return B_FILE_ERROR;
6430 
6431 	status_t status = B_OK;
6432 
6433 	// We need to clear the lock atomically - someone
6434 	// else might set one at the same time
6435 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
6436 			(file_descriptor*)NULL, descriptor) != descriptor)
6437 		status = B_BAD_VALUE;
6438 
6439 	put_fd(descriptor);
6440 	return status;
6441 }
6442 
6443 
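/*!	Preallocates storage for the given byte range of the file referred
	to by the FD. Only regular files are supported: FIFOs and sockets
	yield ESPIPE, other special nodes B_DEVICE_NOT_FOUND.
*/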
6444 static status_t
6445 common_preallocate(int fd, off_t offset, off_t length, bool kernel)
6446 {
6447 	struct file_descriptor* descriptor;
6448 	struct vnode* vnode;
6449 
6450 	if (offset < 0 || length == 0)
6451 		return B_BAD_VALUE;
6452 	if (offset > OFF_MAX - length)
6453 		return B_FILE_TOO_LARGE;
6454 
6455 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6456 	if (descriptor == NULL || (descriptor->open_mode & O_RWMASK) == O_RDONLY)
6457 		return B_FILE_ERROR;
6458 
6459 	switch (vnode->Type() & S_IFMT) {
6460 		case S_IFIFO:
6461 		case S_IFSOCK:
6462 			return ESPIPE;
6463 
6464 		case S_IFBLK:
6465 		case S_IFCHR:
6466 		case S_IFDIR:
6467 		case S_IFLNK:
6468 			return B_DEVICE_NOT_FOUND;
6469 
6470 		case S_IFREG:
6471 			break;
6472 	}
6473 
6474 	status_t status = B_OK;
6475 	if (HAS_FS_CALL(vnode, preallocate)) {
6476 		status = FS_CALL(vnode, preallocate, offset, length);
6477 	} else {
6478 		status = HAS_FS_CALL(vnode, write)
6479 			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6480 	}
6481 
6482 	return status;
6483 }
6484 
6485 
6486 static status_t
6487 common_read_link(int fd, char* path, char* buffer, size_t* _bufferSize,
6488 	bool kernel)
6489 {
6490 	struct vnode* vnode;
6491 	status_t status;
6492 
6493 	status = fd_and_path_to_vnode(fd, path, false, &vnode, NULL, kernel);
6494 	if (status != B_OK)
6495 		return status;
6496 
6497 	if (HAS_FS_CALL(vnode, read_symlink)) {
6498 		status = FS_CALL(vnode, read_symlink, buffer, _bufferSize);
6499 	} else
6500 		status = B_BAD_VALUE;
6501 
6502 	put_vnode(vnode);
6503 	return status;
6504 }
6505 
6506 
6507 static status_t
6508 common_create_symlink(int fd, char* path, const char* toPath, int mode,
6509 	bool kernel)
6510 {
6511 	// path validity checks have to be in the calling function!
6512 	char name[B_FILE_NAME_LENGTH];
6513 	struct vnode* vnode;
6514 	status_t status;
6515 
6516 	FUNCTION(("common_create_symlink(fd = %d, path = %s, toPath = %s, "
6517 		"mode = %d, kernel = %d)\n", fd, path, toPath, mode, kernel));
6518 
6519 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, name, kernel);
6520 	if (status != B_OK)
6521 		return status;
6522 
6523 	if (HAS_FS_CALL(vnode, create_symlink))
6524 		status = FS_CALL(vnode, create_symlink, name, toPath, mode);
6525 	else {
6526 		status = HAS_FS_CALL(vnode, write)
6527 			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6528 	}
6529 
6530 	put_vnode(vnode);
6531 
6532 	return status;
6533 }
6534 
6535 
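/*!	Creates a hard link at pathFD/path pointing to the node at
	toFD/toPath. Both entries must reside on the same mount, or
	B_CROSS_DEVICE_LINK is returned.
*/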
6536 static status_t
6537 common_create_link(int pathFD, char* path, int toFD, char* toPath,
6538 	bool traverseLeafLink, bool kernel)
6539 {
6540 	// path validity checks have to be in the calling function!
6541 
6542 	FUNCTION(("common_create_link(path = %s, toPath = %s, kernel = %d)\n", path,
6543 		toPath, kernel));
6544 
6545 	char name[B_FILE_NAME_LENGTH];
6546 	struct vnode* directory;
6547 	status_t status = fd_and_path_to_dir_vnode(pathFD, path, &directory, name,
6548 		kernel);
6549 	if (status != B_OK)
6550 		return status;
6551 
6552 	struct vnode* vnode;
6553 	status = fd_and_path_to_vnode(toFD, toPath, traverseLeafLink, &vnode, NULL,
6554 		kernel);
6555 	if (status != B_OK)
6556 		goto err;
6557 
6558 	if (directory->mount != vnode->mount) {
6559 		status = B_CROSS_DEVICE_LINK;
6560 		goto err1;
6561 	}
6562 
6563 	if (HAS_FS_CALL(directory, link))
6564 		status = FS_CALL(directory, link, name, vnode);
6565 	else
6566 		status = B_READ_ONLY_DEVICE;
6567 
6568 err1:
6569 	put_vnode(vnode);
6570 err:
6571 	put_vnode(directory);
6572 
6573 	return status;
6574 }
6575 
6576 
6577 static status_t
6578 common_unlink(int fd, char* path, bool kernel)
6579 {
6580 	char filename[B_FILE_NAME_LENGTH];
6581 	struct vnode* vnode;
6582 	status_t status;
6583 
6584 	FUNCTION(("common_unlink: fd: %d, path '%s', kernel %d\n", fd, path,
6585 		kernel));
6586 
6587 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
6588 	if (status < 0)
6589 		return status;
6590 
6591 	if (HAS_FS_CALL(vnode, unlink))
6592 		status = FS_CALL(vnode, unlink, filename);
6593 	else
6594 		status = B_READ_ONLY_DEVICE;
6595 
6596 	put_vnode(vnode);
6597 
6598 	return status;
6599 }
6600 
6601 
6602 static status_t
6603 common_access(int fd, char* path, int mode, bool effectiveUserGroup, bool kernel)
6604 {
6605 	struct vnode* vnode;
6606 	status_t status;
6607 
6608 	// TODO: honor effectiveUserGroup argument
6609 
6610 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
6611 	if (status != B_OK)
6612 		return status;
6613 
6614 	if (HAS_FS_CALL(vnode, access))
6615 		status = FS_CALL(vnode, access, mode);
6616 	else
6617 		status = B_OK;
6618 
6619 	put_vnode(vnode);
6620 
6621 	return status;
6622 }
6623 
6624 
6625 static status_t
6626 common_rename(int fd, char* path, int newFD, char* newPath, bool kernel)
6627 {
6628 	struct vnode* fromVnode;
6629 	struct vnode* toVnode;
6630 	char fromName[B_FILE_NAME_LENGTH];
6631 	char toName[B_FILE_NAME_LENGTH];
6632 	status_t status;
6633 
6634 	FUNCTION(("common_rename(fd = %d, path = %s, newFD = %d, newPath = %s, "
6635 		"kernel = %d)\n", fd, path, newFD, newPath, kernel));
6636 
6637 	status = fd_and_path_to_dir_vnode(fd, path, &fromVnode, fromName, kernel);
6638 	if (status != B_OK)
6639 		return status;
6640 
6641 	status = fd_and_path_to_dir_vnode(newFD, newPath, &toVnode, toName, kernel);
6642 	if (status != B_OK)
6643 		goto err1;
6644 
6645 	if (fromVnode->device != toVnode->device) {
6646 		status = B_CROSS_DEVICE_LINK;
6647 		goto err2;
6648 	}
6649 
6650 	if (fromName[0] == '\0' || toName[0] == '\0'
6651 		|| !strcmp(fromName, ".") || !strcmp(fromName, "..")
6652 		|| !strcmp(toName, ".") || !strcmp(toName, "..")
6653 		|| (fromVnode == toVnode && !strcmp(fromName, toName))) {
6654 		status = B_BAD_VALUE;
6655 		goto err2;
6656 	}
6657 
6658 	if (HAS_FS_CALL(fromVnode, rename))
6659 		status = FS_CALL(fromVnode, rename, fromName, toVnode, toName);
6660 	else
6661 		status = B_READ_ONLY_DEVICE;
6662 
6663 err2:
6664 	put_vnode(toVnode);
6665 err1:
6666 	put_vnode(fromVnode);
6667 
6668 	return status;
6669 }
6670 
6671 
6672 static status_t
6673 common_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6674 {
6675 	struct vnode* vnode = descriptor->u.vnode;
6676 
6677 	FUNCTION(("common_read_stat: stat %p\n", stat));
6678 
6679 	// TODO: remove this once all file systems properly set them!
6680 	stat->st_crtim.tv_nsec = 0;
6681 	stat->st_ctim.tv_nsec = 0;
6682 	stat->st_mtim.tv_nsec = 0;
6683 	stat->st_atim.tv_nsec = 0;
6684 
6685 	return vfs_stat_vnode(vnode, stat);
6686 }
6687 
6688 
6689 static status_t
6690 common_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6691 	int statMask)
6692 {
6693 	struct vnode* vnode = descriptor->u.vnode;
6694 
6695 	FUNCTION(("common_write_stat(vnode = %p, stat = %p, statMask = %d)\n",
6696 		vnode, stat, statMask));
6697 
6698 	if ((descriptor->open_mode & O_RWMASK) == O_RDONLY
6699 		&& (statMask & B_STAT_SIZE) != 0) {
6700 		return B_BAD_VALUE;
6701 	}
6702 
6703 	if (!HAS_FS_CALL(vnode, write_stat))
6704 		return B_READ_ONLY_DEVICE;
6705 
6706 	return FS_CALL(vnode, write_stat, stat, statMask);
6707 }
6708 
6709 
6710 static status_t
6711 common_path_read_stat(int fd, char* path, bool traverseLeafLink,
6712 	struct stat* stat, bool kernel)
6713 {
6714 	FUNCTION(("common_path_read_stat: fd: %d, path '%s', stat %p,\n", fd, path,
6715 		stat));
6716 
6717 	struct vnode* vnode;
6718 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6719 		NULL, kernel);
6720 	if (status != B_OK)
6721 		return status;
6722 
6723 	status = vfs_stat_vnode(vnode, stat);
6724 
6725 	put_vnode(vnode);
6726 	return status;
6727 }
6728 
6729 
6730 static status_t
6731 common_path_write_stat(int fd, char* path, bool traverseLeafLink,
6732 	const struct stat* stat, int statMask, bool kernel)
6733 {
6734 	FUNCTION(("common_write_stat: fd: %d, path '%s', stat %p, stat_mask %d, "
6735 		"kernel %d\n", fd, path, stat, statMask, kernel));
6736 
6737 	struct vnode* vnode;
6738 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6739 		NULL, kernel);
6740 	if (status != B_OK)
6741 		return status;
6742 
6743 	if (HAS_FS_CALL(vnode, write_stat))
6744 		status = FS_CALL(vnode, write_stat, stat, statMask);
6745 	else
6746 		status = B_READ_ONLY_DEVICE;
6747 
6748 	put_vnode(vnode);
6749 
6750 	return status;
6751 }
6752 
6753 
6754 static int
6755 attr_dir_open(int fd, char* path, bool traverseLeafLink, bool kernel)
6756 {
6757 	FUNCTION(("attr_dir_open(fd = %d, path = '%s', kernel = %d)\n", fd, path,
6758 		kernel));
6759 
6760 	struct vnode* vnode;
6761 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6762 		NULL, kernel);
6763 	if (status != B_OK)
6764 		return status;
6765 
6766 	status = open_attr_dir_vnode(vnode, kernel);
6767 	if (status < 0)
6768 		put_vnode(vnode);
6769 
6770 	return status;
6771 }
6772 
6773 
6774 static status_t
6775 attr_dir_close(struct file_descriptor* descriptor)
6776 {
6777 	struct vnode* vnode = descriptor->u.vnode;
6778 
6779 	FUNCTION(("attr_dir_close(descriptor = %p)\n", descriptor));
6780 
6781 	if (HAS_FS_CALL(vnode, close_attr_dir))
6782 		return FS_CALL(vnode, close_attr_dir, descriptor->cookie);
6783 
6784 	return B_OK;
6785 }
6786 
6787 
6788 static void
6789 attr_dir_free_fd(struct file_descriptor* descriptor)
6790 {
6791 	struct vnode* vnode = descriptor->u.vnode;
6792 
6793 	if (vnode != NULL) {
6794 		FS_CALL(vnode, free_attr_dir_cookie, descriptor->cookie);
6795 		put_vnode(vnode);
6796 	}
6797 }
6798 
6799 
6800 static status_t
6801 attr_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6802 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6803 {
6804 	struct vnode* vnode = descriptor->u.vnode;
6805 
6806 	FUNCTION(("attr_dir_read(descriptor = %p)\n", descriptor));
6807 
6808 	if (HAS_FS_CALL(vnode, read_attr_dir))
6809 		return FS_CALL(vnode, read_attr_dir, descriptor->cookie, buffer,
6810 			bufferSize, _count);
6811 
6812 	return B_UNSUPPORTED;
6813 }
6814 
6815 
6816 static status_t
6817 attr_dir_rewind(struct file_descriptor* descriptor)
6818 {
6819 	struct vnode* vnode = descriptor->u.vnode;
6820 
6821 	FUNCTION(("attr_dir_rewind(descriptor = %p)\n", descriptor));
6822 
6823 	if (HAS_FS_CALL(vnode, rewind_attr_dir))
6824 		return FS_CALL(vnode, rewind_attr_dir, descriptor->cookie);
6825 
6826 	return B_UNSUPPORTED;
6827 }
6828 
6829 
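/*!	Creates (and opens) an attribute with the given name and type on
	the node at fd/path. If no FD can be allocated afterwards, the
	newly created attribute is removed again.
	\return the new FD on success, an error code otherwise.
*/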
6830 static int
6831 attr_create(int fd, char* path, const char* name, uint32 type,
6832 	int openMode, bool kernel)
6833 {
6834 	if (name == NULL || *name == '\0')
6835 		return B_BAD_VALUE;
6836 
6837 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6838 	struct vnode* vnode;
6839 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6840 		kernel);
6841 	if (status != B_OK)
6842 		return status;
6843 
6844 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6845 		status = B_LINK_LIMIT;
6846 		goto err;
6847 	}
6848 
6849 	if (!HAS_FS_CALL(vnode, create_attr)) {
6850 		status = B_READ_ONLY_DEVICE;
6851 		goto err;
6852 	}
6853 
6854 	void* cookie;
6855 	status = FS_CALL(vnode, create_attr, name, type, openMode, &cookie);
6856 	if (status != B_OK)
6857 		goto err;
6858 
6859 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6860 	if (fd >= 0)
6861 		return fd;
6862 
6863 	status = fd;
6864 
6865 	FS_CALL(vnode, close_attr, cookie);
6866 	FS_CALL(vnode, free_attr_cookie, cookie);
6867 
6868 	FS_CALL(vnode, remove_attr, name);
6869 
6870 err:
6871 	put_vnode(vnode);
6872 
6873 	return status;
6874 }
6875 
6876 
6877 static int
6878 attr_open(int fd, char* path, const char* name, int openMode, bool kernel)
6879 {
6880 	if (name == NULL || *name == '\0')
6881 		return B_BAD_VALUE;
6882 
6883 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6884 	struct vnode* vnode;
6885 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6886 		kernel);
6887 	if (status != B_OK)
6888 		return status;
6889 
6890 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6891 		status = B_LINK_LIMIT;
6892 		goto err;
6893 	}
6894 
6895 	if (!HAS_FS_CALL(vnode, open_attr)) {
6896 		status = B_UNSUPPORTED;
6897 		goto err;
6898 	}
6899 
6900 	void* cookie;
6901 	status = FS_CALL(vnode, open_attr, name, openMode, &cookie);
6902 	if (status != B_OK)
6903 		goto err;
6904 
6905 	// now we only need a file descriptor for this attribute and we're done
6906 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6907 	if (fd >= 0)
6908 		return fd;
6909 
6910 	status = fd;
6911 
6912 	FS_CALL(vnode, close_attr, cookie);
6913 	FS_CALL(vnode, free_attr_cookie, cookie);
6914 
6915 err:
6916 	put_vnode(vnode);
6917 
6918 	return status;
6919 }
6920 
6921 
6922 static status_t
6923 attr_close(struct file_descriptor* descriptor)
6924 {
6925 	struct vnode* vnode = descriptor->u.vnode;
6926 
6927 	FUNCTION(("attr_close(descriptor = %p)\n", descriptor));
6928 
6929 	if (HAS_FS_CALL(vnode, close_attr))
6930 		return FS_CALL(vnode, close_attr, descriptor->cookie);
6931 
6932 	return B_OK;
6933 }
6934 
6935 
6936 static void
6937 attr_free_fd(struct file_descriptor* descriptor)
6938 {
6939 	struct vnode* vnode = descriptor->u.vnode;
6940 
6941 	if (vnode != NULL) {
6942 		FS_CALL(vnode, free_attr_cookie, descriptor->cookie);
6943 		put_vnode(vnode);
6944 	}
6945 }
6946 
6947 
6948 static status_t
6949 attr_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
6950 	size_t* length)
6951 {
6952 	struct vnode* vnode = descriptor->u.vnode;
6953 
6954 	FUNCTION(("attr_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
6955 		pos, length, *length));
6956 
6957 	if (!HAS_FS_CALL(vnode, read_attr))
6958 		return B_UNSUPPORTED;
6959 
6960 	return FS_CALL(vnode, read_attr, descriptor->cookie, pos, buffer, length);
6961 }
6962 
6963 
6964 static status_t
6965 attr_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
6966 	size_t* length)
6967 {
6968 	struct vnode* vnode = descriptor->u.vnode;
6969 
6970 	FUNCTION(("attr_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
6971 		length));
6972 
6973 	if (!HAS_FS_CALL(vnode, write_attr))
6974 		return B_UNSUPPORTED;
6975 
6976 	return FS_CALL(vnode, write_attr, descriptor->cookie, pos, buffer, length);
6977 }
6978 
6979 
6980 static off_t
6981 attr_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
6982 {
6983 	off_t offset;
6984 
6985 	switch (seekType) {
6986 		case SEEK_SET:
6987 			offset = 0;
6988 			break;
6989 		case SEEK_CUR:
6990 			offset = descriptor->pos;
6991 			break;
6992 		case SEEK_END:
6993 		{
6994 			struct vnode* vnode = descriptor->u.vnode;
6995 			if (!HAS_FS_CALL(vnode, read_attr_stat))
6996 				return B_UNSUPPORTED;
6997 
6998 			struct stat stat;
6999 			status_t status = FS_CALL(vnode, read_attr_stat, descriptor->cookie,
7000 				&stat);
7001 			if (status != B_OK)
7002 				return status;
7003 
7004 			offset = stat.st_size;
7005 			break;
7006 		}
7007 		default:
7008 			return B_BAD_VALUE;
7009 	}
7010 
7011 	// assumes off_t is 64 bits wide
7012 	if (offset > 0 && LONGLONG_MAX - offset < pos)
7013 		return B_BUFFER_OVERFLOW;
7014 
7015 	pos += offset;
7016 	if (pos < 0)
7017 		return B_BAD_VALUE;
7018 
7019 	return descriptor->pos = pos;
7020 }
7021 
7022 
7023 static status_t
7024 attr_read_stat(struct file_descriptor* descriptor, struct stat* stat)
7025 {
7026 	struct vnode* vnode = descriptor->u.vnode;
7027 
7028 	FUNCTION(("attr_read_stat: stat 0x%p\n", stat));
7029 
7030 	if (!HAS_FS_CALL(vnode, read_attr_stat))
7031 		return B_UNSUPPORTED;
7032 
7033 	return FS_CALL(vnode, read_attr_stat, descriptor->cookie, stat);
7034 }
7035 
7036 
7037 static status_t
7038 attr_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
7039 	int statMask)
7040 {
7041 	struct vnode* vnode = descriptor->u.vnode;
7042 
7043 	FUNCTION(("attr_write_stat: stat = %p, statMask %d\n", stat, statMask));
7044 
7045 	if (!HAS_FS_CALL(vnode, write_attr_stat))
7046 		return B_READ_ONLY_DEVICE;
7047 
7048 	return FS_CALL(vnode, write_attr_stat, descriptor->cookie, stat, statMask);
7049 }
7050 
7051 
7052 static status_t
7053 attr_remove(int fd, const char* name, bool kernel)
7054 {
7055 	struct file_descriptor* descriptor;
7056 	struct vnode* vnode;
7057 	status_t status;
7058 
7059 	if (name == NULL || *name == '\0')
7060 		return B_BAD_VALUE;
7061 
7062 	FUNCTION(("attr_remove: fd = %d, name = \"%s\", kernel %d\n", fd, name,
7063 		kernel));
7064 
7065 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
7066 	if (descriptor == NULL)
7067 		return B_FILE_ERROR;
7068 
7069 	if (HAS_FS_CALL(vnode, remove_attr))
7070 		status = FS_CALL(vnode, remove_attr, name);
7071 	else
7072 		status = B_READ_ONLY_DEVICE;
7073 
7074 	put_fd(descriptor);
7075 
7076 	return status;
7077 }
7078 
7079 
7080 static status_t
7081 attr_rename(int fromFD, const char* fromName, int toFD, const char* toName,
7082 	bool kernel)
7083 {
7084 	struct file_descriptor* fromDescriptor;
7085 	struct file_descriptor* toDescriptor;
7086 	struct vnode* fromVnode;
7087 	struct vnode* toVnode;
7088 	status_t status;
7089 
7090 	if (fromName == NULL || *fromName == '\0' || toName == NULL
7091 		|| *toName == '\0')
7092 		return B_BAD_VALUE;
7093 
7094 	FUNCTION(("attr_rename: from fd = %d, from name = \"%s\", to fd = %d, to "
7095 		"name = \"%s\", kernel %d\n", fromFD, fromName, toFD, toName, kernel));
7096 
7097 	fromDescriptor = get_fd_and_vnode(fromFD, &fromVnode, kernel);
7098 	if (fromDescriptor == NULL)
7099 		return B_FILE_ERROR;
7100 
7101 	toDescriptor = get_fd_and_vnode(toFD, &toVnode, kernel);
7102 	if (toDescriptor == NULL) {
7103 		status = B_FILE_ERROR;
7104 		goto err;
7105 	}
7106 
7107 	// are the files on the same volume?
7108 	if (fromVnode->device != toVnode->device) {
7109 		status = B_CROSS_DEVICE_LINK;
7110 		goto err1;
7111 	}
7112 
7113 	if (HAS_FS_CALL(fromVnode, rename_attr)) {
7114 		status = FS_CALL(fromVnode, rename_attr, fromName, toVnode, toName);
7115 	} else
7116 		status = B_READ_ONLY_DEVICE;
7117 
7118 err1:
7119 	put_fd(toDescriptor);
7120 err:
7121 	put_fd(fromDescriptor);
7122 
7123 	return status;
7124 }
7125 
7126 
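/*!	Opens the index directory of the volume specified by mountID.
	\return the new FD on success, an error code otherwise.
*/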
7127 static int
7128 index_dir_open(dev_t mountID, bool kernel)
7129 {
7130 	struct fs_mount* mount;
7131 	void* cookie;
7132 
7133 	FUNCTION(("index_dir_open(mountID = %" B_PRId32 ", kernel = %d)\n", mountID,
7134 		kernel));
7135 
7136 	status_t status = get_mount(mountID, &mount);
7137 	if (status != B_OK)
7138 		return status;
7139 
7140 	if (!HAS_FS_MOUNT_CALL(mount, open_index_dir)) {
7141 		status = B_UNSUPPORTED;
7142 		goto error;
7143 	}
7144 
7145 	status = FS_MOUNT_CALL(mount, open_index_dir, &cookie);
7146 	if (status != B_OK)
7147 		goto error;
7148 
7149 	// get fd for the index directory
7150 	int fd;
7151 	fd = get_new_fd(FDTYPE_INDEX_DIR, mount, NULL, cookie, O_CLOEXEC, kernel);
7152 	if (fd >= 0)
7153 		return fd;
7154 
7155 	// something went wrong
7156 	FS_MOUNT_CALL(mount, close_index_dir, cookie);
7157 	FS_MOUNT_CALL(mount, free_index_dir_cookie, cookie);
7158 
7159 	status = fd;
7160 
7161 error:
7162 	put_mount(mount);
7163 	return status;
7164 }
7165 
7166 
7167 static status_t
7168 index_dir_close(struct file_descriptor* descriptor)
7169 {
7170 	struct fs_mount* mount = descriptor->u.mount;
7171 
7172 	FUNCTION(("index_dir_close(descriptor = %p)\n", descriptor));
7173 
7174 	if (HAS_FS_MOUNT_CALL(mount, close_index_dir))
7175 		return FS_MOUNT_CALL(mount, close_index_dir, descriptor->cookie);
7176 
7177 	return B_OK;
7178 }
7179 
7180 
7181 static void
7182 index_dir_free_fd(struct file_descriptor* descriptor)
7183 {
7184 	struct fs_mount* mount = descriptor->u.mount;
7185 
7186 	if (mount != NULL) {
7187 		FS_MOUNT_CALL(mount, free_index_dir_cookie, descriptor->cookie);
7188 		put_mount(mount);
7189 	}
7190 }
7191 
7192 
7193 static status_t
7194 index_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7195 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7196 {
7197 	struct fs_mount* mount = descriptor->u.mount;
7198 
7199 	if (HAS_FS_MOUNT_CALL(mount, read_index_dir)) {
7200 		return FS_MOUNT_CALL(mount, read_index_dir, descriptor->cookie, buffer,
7201 			bufferSize, _count);
7202 	}
7203 
7204 	return B_UNSUPPORTED;
7205 }
7206 
7207 
7208 static status_t
7209 index_dir_rewind(struct file_descriptor* descriptor)
7210 {
7211 	struct fs_mount* mount = descriptor->u.mount;
7212 
7213 	if (HAS_FS_MOUNT_CALL(mount, rewind_index_dir))
7214 		return FS_MOUNT_CALL(mount, rewind_index_dir, descriptor->cookie);
7215 
7216 	return B_UNSUPPORTED;
7217 }
7218 
7219 
7220 static status_t
7221 index_create(dev_t mountID, const char* name, uint32 type, uint32 flags,
7222 	bool kernel)
7223 {
7224 	FUNCTION(("index_create(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7225 		mountID, name, kernel));
7226 
7227 	struct fs_mount* mount;
7228 	status_t status = get_mount(mountID, &mount);
7229 	if (status != B_OK)
7230 		return status;
7231 
7232 	if (!HAS_FS_MOUNT_CALL(mount, create_index)) {
7233 		status = B_READ_ONLY_DEVICE;
7234 		goto out;
7235 	}
7236 
7237 	status = FS_MOUNT_CALL(mount, create_index, name, type, flags);
7238 
7239 out:
7240 	put_mount(mount);
7241 	return status;
7242 }
7243 
7244 
7245 #if 0
7246 static status_t
7247 index_read_stat(struct file_descriptor* descriptor, struct stat* stat)
7248 {
7249 	struct vnode* vnode = descriptor->u.vnode;
7250 
7251 	// ToDo: currently unused!
7252 	FUNCTION(("index_read_stat: stat 0x%p\n", stat));
7253 	if (!HAS_FS_CALL(vnode, read_index_stat))
7254 		return B_UNSUPPORTED;
7255 
7256 	return B_UNSUPPORTED;
7257 	//return FS_CALL(vnode, read_index_stat, descriptor->cookie, stat);
7258 }
7259 
7260 
7261 static void
7262 index_free_fd(struct file_descriptor* descriptor)
7263 {
7264 	struct vnode* vnode = descriptor->u.vnode;
7265 
7266 	if (vnode != NULL) {
7267 		FS_CALL(vnode, free_index_cookie, descriptor->cookie);
7268 		put_vnode(vnode);
7269 	}
7270 }
7271 #endif
7272 
7273 
7274 static status_t
7275 index_name_read_stat(dev_t mountID, const char* name, struct stat* stat,
7276 	bool kernel)
7277 {
7278 	FUNCTION(("index_name_read_stat(mountID = %" B_PRId32 ", name = %s, "
7279 		"kernel = %d)\n", mountID, name, kernel));
7280 
7281 	struct fs_mount* mount;
7282 	status_t status = get_mount(mountID, &mount);
7283 	if (status != B_OK)
7284 		return status;
7285 
7286 	if (!HAS_FS_MOUNT_CALL(mount, read_index_stat)) {
7287 		status = B_UNSUPPORTED;
7288 		goto out;
7289 	}
7290 
7291 	status = FS_MOUNT_CALL(mount, read_index_stat, name, stat);
7292 
7293 out:
7294 	put_mount(mount);
7295 	return status;
7296 }
7297 
7298 
7299 static status_t
7300 index_remove(dev_t mountID, const char* name, bool kernel)
7301 {
7302 	FUNCTION(("index_remove(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7303 		mountID, name, kernel));
7304 
7305 	struct fs_mount* mount;
7306 	status_t status = get_mount(mountID, &mount);
7307 	if (status != B_OK)
7308 		return status;
7309 
7310 	if (!HAS_FS_MOUNT_CALL(mount, remove_index)) {
7311 		status = B_READ_ONLY_DEVICE;
7312 		goto out;
7313 	}
7314 
7315 	status = FS_MOUNT_CALL(mount, remove_index, name);
7316 
7317 out:
7318 	put_mount(mount);
7319 	return status;
7320 }
7321 
7322 
7323 /*!	TODO: the query FS API is still pretty much the same as in R5.
7324 		It would be nice if file systems got some more kernel support
7325 		for queries.
7326 		For example, query parsing should be moved into the kernel.
7327 */
7328 static int
7329 query_open(dev_t device, const char* query, uint32 flags, port_id port,
7330 	int32 token, bool kernel)
7331 {
7332 	struct fs_mount* mount;
7333 	void* cookie;
7334 
7335 	FUNCTION(("query_open(device = %" B_PRId32 ", query = \"%s\", kernel = %d)\n",
7336 		device, query, kernel));
7337 
7338 	status_t status = get_mount(device, &mount);
7339 	if (status != B_OK)
7340 		return status;
7341 
7342 	if (!HAS_FS_MOUNT_CALL(mount, open_query)) {
7343 		status = B_UNSUPPORTED;
7344 		goto error;
7345 	}
7346 
7347 	status = FS_MOUNT_CALL(mount, open_query, query, flags, port, token,
7348 		&cookie);
7349 	if (status != B_OK)
7350 		goto error;
7351 
7352 	// get fd for the query
7353 	int fd;
7354 	fd = get_new_fd(FDTYPE_QUERY, mount, NULL, cookie, O_CLOEXEC, kernel);
7355 	if (fd >= 0)
7356 		return fd;
7357 
7358 	status = fd;
7359 
7360 	// something went wrong
7361 	FS_MOUNT_CALL(mount, close_query, cookie);
7362 	FS_MOUNT_CALL(mount, free_query_cookie, cookie);
7363 
7364 error:
7365 	put_mount(mount);
7366 	return status;
7367 }
7368 
7369 
7370 static status_t
7371 query_close(struct file_descriptor* descriptor)
7372 {
7373 	struct fs_mount* mount = descriptor->u.mount;
7374 
7375 	FUNCTION(("query_close(descriptor = %p)\n", descriptor));
7376 
7377 	if (HAS_FS_MOUNT_CALL(mount, close_query))
7378 		return FS_MOUNT_CALL(mount, close_query, descriptor->cookie);
7379 
7380 	return B_OK;
7381 }
7382 
7383 
7384 static void
7385 query_free_fd(struct file_descriptor* descriptor)
7386 {
7387 	struct fs_mount* mount = descriptor->u.mount;
7388 
7389 	if (mount != NULL) {
7390 		FS_MOUNT_CALL(mount, free_query_cookie, descriptor->cookie);
7391 		put_mount(mount);
7392 	}
7393 }
7394 
7395 
7396 static status_t
7397 query_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7398 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7399 {
7400 	struct fs_mount* mount = descriptor->u.mount;
7401 
7402 	if (HAS_FS_MOUNT_CALL(mount, read_query)) {
7403 		return FS_MOUNT_CALL(mount, read_query, descriptor->cookie, buffer,
7404 			bufferSize, _count);
7405 	}
7406 
7407 	return B_UNSUPPORTED;
7408 }
7409 
7410 
7411 static status_t
7412 query_rewind(struct file_descriptor* descriptor)
7413 {
7414 	struct fs_mount* mount = descriptor->u.mount;
7415 
7416 	if (HAS_FS_MOUNT_CALL(mount, rewind_query))
7417 		return FS_MOUNT_CALL(mount, rewind_query, descriptor->cookie);
7418 
7419 	return B_UNSUPPORTED;
7420 }
7421 
7422 
7423 //	#pragma mark - General File System functions
7424 
7425 
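/*!	Mounts the file system fsName (or, if NULL, the one the disk
	device manager detects on the device) at the given path, building
	a stack of layered volumes if the FS name specifies layers.
	\return the dev_t of the new mount on success, an error code
		otherwise.
*/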
7426 static dev_t
7427 fs_mount(char* path, const char* device, const char* fsName, uint32 flags,
7428 	const char* args, bool kernel)
7429 {
7430 	struct ::fs_mount* mount;
7431 	status_t status = B_OK;
7432 	fs_volume* volume = NULL;
7433 	int32 layer = 0;
7434 	Vnode* coveredNode = NULL;
7435 
7436 	FUNCTION(("fs_mount: path = '%s', device = '%s', fs_name = '%s', flags = %#"
7437 		B_PRIx32 ", args = '%s'\n", path, device, fsName, flags, args));
7438 
7439 	// The path is always safe; we just have to do a basic sanity check on
7440 	// fsName - we can't make any assumptions about args, though.
7441 	// A NULL fsName is OK if a device was given and the FS is not virtual.
7442 	// We'll get it from the DDM later.
7443 	if (fsName == NULL) {
7444 		if (!device || flags & B_MOUNT_VIRTUAL_DEVICE)
7445 			return B_BAD_VALUE;
7446 	} else if (fsName[0] == '\0')
7447 		return B_BAD_VALUE;
7448 
7449 	RecursiveLocker mountOpLocker(sMountOpLock);
7450 
7451 	// Helper to delete a newly created file device on failure.
7452 	// Not exactly beautiful, but helps to keep the code below cleaner.
7453 	struct FileDeviceDeleter {
7454 		FileDeviceDeleter() : id(-1) {}
7455 		~FileDeviceDeleter()
7456 		{
7457 			KDiskDeviceManager::Default()->DeleteFileDevice(id);
7458 		}
7459 
7460 		partition_id id;
7461 	} fileDeviceDeleter;
7462 
7463 	// If the file system is not a "virtual" one, the device argument should
7464 	// point to a real file/device (if given at all).
7465 	// get the partition
7466 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7467 	KPartition* partition = NULL;
7468 	KPath normalizedDevice;
7469 	bool newlyCreatedFileDevice = false;
7470 
7471 	if (!(flags & B_MOUNT_VIRTUAL_DEVICE) && device != NULL) {
7472 		// normalize the device path
7473 		status = normalizedDevice.SetTo(device, true);
7474 		if (status != B_OK)
7475 			return status;
7476 
7477 		// get a corresponding partition from the DDM
7478 		partition = ddm->RegisterPartition(normalizedDevice.Path());
7479 		if (partition == NULL) {
7480 			// Partition not found: This either means the user supplied
7481 			// an invalid path, or the path refers to an image file. We try
7482 			// to let the DDM create a file device for the path.
7483 			partition_id deviceID = ddm->CreateFileDevice(
7484 				normalizedDevice.Path(), &newlyCreatedFileDevice);
7485 			if (deviceID >= 0) {
7486 				partition = ddm->RegisterPartition(deviceID);
7487 				if (newlyCreatedFileDevice)
7488 					fileDeviceDeleter.id = deviceID;
7489 			}
7490 		}
7491 
7492 		if (!partition) {
7493 			TRACE(("fs_mount(): Partition `%s' not found.\n",
7494 				normalizedDevice.Path()));
7495 			return B_ENTRY_NOT_FOUND;
7496 		}
7497 
7498 		device = normalizedDevice.Path();
7499 			// correct path to file device
7500 	}
7501 	PartitionRegistrar partitionRegistrar(partition, true);
7502 
7503 	// Write lock the partition's device. For the time being, we keep the lock
7504 	// until we're done mounting -- not nice, but it ensures that no one is
7505 	// interfering.
7506 	// TODO: Just mark the partition busy while mounting!
7507 	KDiskDevice* diskDevice = NULL;
7508 	if (partition) {
7509 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7510 		if (!diskDevice) {
7511 			TRACE(("fs_mount(): Failed to lock disk device!\n"));
7512 			return B_ERROR;
7513 		}
7514 	}
7515 
7516 	DeviceWriteLocker writeLocker(diskDevice, true);
7517 		// this takes over the write lock acquired before
7518 
7519 	if (partition != NULL) {
7520 		// make sure that the partition is not busy
7521 		if (partition->IsBusy()) {
7522 			TRACE(("fs_mount(): Partition is busy.\n"));
7523 			return B_BUSY;
7524 		}
7525 
7526 		// if no FS name had been supplied, we get it from the partition
7527 		if (fsName == NULL) {
7528 			KDiskSystem* diskSystem = partition->DiskSystem();
7529 			if (!diskSystem) {
7530 				TRACE(("fs_mount(): No FS name was given, and the DDM didn't "
7531 					"recognize it.\n"));
7532 				return B_BAD_VALUE;
7533 			}
7534 
7535 			if (!diskSystem->IsFileSystem()) {
7536 				TRACE(("fs_mount(): No FS name was given, and the DDM found a "
7537 					"partitioning system.\n"));
7538 				return B_BAD_VALUE;
7539 			}
7540 
7541 			// The disk system name will not change, and the KDiskSystem
7542 			// object will not go away while the disk device is locked (and
7543 			// the partition has a reference to it), so this is safe.
7544 			fsName = diskSystem->Name();
7545 		}
7546 	}
7547 
7548 	mount = new(std::nothrow) (struct ::fs_mount);
7549 	if (mount == NULL)
7550 		return B_NO_MEMORY;
7551 
7552 	mount->device_name = strdup(device);
7553 		// "device" can be NULL
7554 
7555 	status = mount->entry_cache.Init();
7556 	if (status != B_OK)
7557 		goto err1;
7558 
7559 	// initialize structure
7560 	mount->id = sNextMountID++;
7561 	mount->partition = NULL;
7562 	mount->root_vnode = NULL;
7563 	mount->covers_vnode = NULL;
7564 	mount->unmounting = false;
7565 	mount->owns_file_device = false;
7566 	mount->volume = NULL;
7567 
7568 	// build up the volume(s)
7569 	while (true) {
7570 		char* layerFSName = get_file_system_name_for_layer(fsName, layer);
7571 		if (layerFSName == NULL) {
7572 			if (layer == 0) {
7573 				status = B_NO_MEMORY;
7574 				goto err1;
7575 			}
7576 
7577 			break;
7578 		}
7579 		MemoryDeleter layerFSNameDeleter(layerFSName);
7580 
7581 		volume = (fs_volume*)malloc(sizeof(fs_volume));
7582 		if (volume == NULL) {
7583 			status = B_NO_MEMORY;
7584 			goto err1;
7585 		}
7586 
7587 		volume->id = mount->id;
7588 		volume->partition = partition != NULL ? partition->ID() : -1;
7589 		volume->layer = layer++;
7590 		volume->private_volume = NULL;
7591 		volume->ops = NULL;
7592 		volume->sub_volume = NULL;
7593 		volume->super_volume = NULL;
7594 		volume->file_system = NULL;
7595 		volume->file_system_name = NULL;
7596 
7597 		volume->file_system_name = get_file_system_name(layerFSName);
7598 		if (volume->file_system_name == NULL) {
7599 			status = B_NO_MEMORY;
7600 			free(volume);
7601 			goto err1;
7602 		}
7603 
7604 		volume->file_system = get_file_system(layerFSName);
7605 		if (volume->file_system == NULL) {
7606 			status = B_DEVICE_NOT_FOUND;
7607 			free(volume->file_system_name);
7608 			free(volume);
7609 			goto err1;
7610 		}
7611 
7612 		if (mount->volume == NULL)
7613 			mount->volume = volume;
7614 		else {
7615 			volume->super_volume = mount->volume;
7616 			mount->volume->sub_volume = volume;
7617 			mount->volume = volume;
7618 		}
7619 	}
7620 
7621 	// insert mount struct into list before we call FS's mount() function
7622 	// so that vnodes can be created for this mount
7623 	rw_lock_write_lock(&sMountLock);
7624 	sMountsTable->Insert(mount);
7625 	rw_lock_write_unlock(&sMountLock);
7626 
7627 	ino_t rootID;
7628 
7629 	if (!sRoot) {
7630 		// we haven't mounted anything yet
7631 		if (strcmp(path, "/") != 0) {
7632 			status = B_ERROR;
7633 			goto err2;
7634 		}
7635 
7636 		status = mount->volume->file_system->mount(mount->volume, device, flags,
7637 			args, &rootID);
7638 		if (status != B_OK || mount->volume->ops == NULL)
7639 			goto err2;
7640 	} else {
7641 		status = path_to_vnode(path, true, &coveredNode, NULL, kernel);
7642 		if (status != B_OK)
7643 			goto err2;
7644 
7645 		mount->covers_vnode = coveredNode;
7646 
7647 		// make sure coveredNode is a directory
7648 		if (!S_ISDIR(coveredNode->Type())) {
7649 			status = B_NOT_A_DIRECTORY;
7650 			goto err3;
7651 		}
7652 
7653 		if (coveredNode->IsCovered()) {
7654 			// this is already a covered vnode
7655 			status = B_BUSY;
7656 			goto err3;
7657 		}
7658 
7659 		// mount it/them
7660 		fs_volume* volume = mount->volume;
7661 		while (volume) {
7662 			status = volume->file_system->mount(volume, device, flags, args,
7663 				&rootID);
7664 			if (status != B_OK || volume->ops == NULL) {
7665 				if (status == B_OK && volume->ops == NULL)
7666 					panic("fs_mount: mount() succeeded but ops is NULL!");
7667 				if (volume->sub_volume)
7668 					goto err4;
7669 				goto err3;
7670 			}
7671 
7672 			volume = volume->super_volume;
7673 		}
7674 
7675 		volume = mount->volume;
7676 		while (volume) {
7677 			if (volume->ops->all_layers_mounted != NULL)
7678 				volume->ops->all_layers_mounted(volume);
7679 			volume = volume->super_volume;
7680 		}
7681 	}
7682 
7683 	// the root node is supposed to be owned by the file system - it must
7684 	// exist at this point
7685 	mount->root_vnode = lookup_vnode(mount->id, rootID);
7686 	if (mount->root_vnode == NULL || mount->root_vnode->ref_count != 1) {
7687 		panic("fs_mount: file system does not own its root node!\n");
7688 		status = B_ERROR;
7689 		goto err4;
7690 	}
7691 
7692 	// set up the links between the root vnode and the vnode it covers
7693 	rw_lock_write_lock(&sVnodeLock);
7694 	if (coveredNode != NULL) {
7695 		if (coveredNode->IsCovered()) {
7696 			// the vnode got covered in the meantime
7697 			status = B_BUSY;
7698 			rw_lock_write_unlock(&sVnodeLock);
7699 			goto err4;
7700 		}
7701 
7702 		mount->root_vnode->covers = coveredNode;
7703 		mount->root_vnode->SetCovering(true);
7704 
7705 		coveredNode->covered_by = mount->root_vnode;
7706 		coveredNode->SetCovered(true);
7707 	}
7708 	rw_lock_write_unlock(&sVnodeLock);
7709 
7710 	if (!sRoot) {
7711 		sRoot = mount->root_vnode;
7712 		mutex_lock(&sIOContextRootLock);
7713 		get_current_io_context(true)->root = sRoot;
7714 		mutex_unlock(&sIOContextRootLock);
7715 		inc_vnode_ref_count(sRoot);
7716 	}
7717 
7718 	// supply the partition (if any) with the mount cookie and mark it mounted
7719 	if (partition) {
7720 		partition->SetMountCookie(mount->volume->private_volume);
7721 		partition->SetVolumeID(mount->id);
7722 
7723 		// keep a partition reference as long as the partition is mounted
7724 		partitionRegistrar.Detach();
7725 		mount->partition = partition;
7726 		mount->owns_file_device = newlyCreatedFileDevice;
7727 		fileDeviceDeleter.id = -1;
7728 	}
7729 
7730 	notify_mount(mount->id,
7731 		coveredNode != NULL ? coveredNode->device : -1,
7732 		coveredNode ? coveredNode->id : -1);
7733 
7734 	return mount->id;
7735 
7736 err4:
7737 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7738 err3:
7739 	if (coveredNode != NULL)
7740 		put_vnode(coveredNode);
7741 err2:
7742 	rw_lock_write_lock(&sMountLock);
7743 	sMountsTable->Remove(mount);
7744 	rw_lock_write_unlock(&sMountLock);
7745 err1:
7746 	delete mount;
7747 
7748 	return status;
7749 }
7750 
7751 
7752 static status_t
7753 fs_unmount(char* path, dev_t mountID, uint32 flags, bool kernel)
7754 {
7755 	struct fs_mount* mount;
7756 	status_t err;
7757 
7758 	FUNCTION(("fs_unmount(path '%s', dev %" B_PRId32 ", kernel %d\n", path,
7759 		mountID, kernel));
7760 
7761 	struct vnode* pathVnode = NULL;
7762 	if (path != NULL) {
7763 		err = path_to_vnode(path, true, &pathVnode, NULL, kernel);
7764 		if (err != B_OK)
7765 			return B_ENTRY_NOT_FOUND;
7766 	}
7767 
7768 	RecursiveLocker mountOpLocker(sMountOpLock);
7769 	ReadLocker mountLocker(sMountLock);
7770 
7771 	mount = find_mount(path != NULL ? pathVnode->device : mountID);
7772 	if (mount == NULL) {
7773 		panic("fs_unmount: find_mount() failed on root vnode @%p of mount\n",
7774 			pathVnode);
7775 	}
7776 
7777 	mountLocker.Unlock();
7778 
7779 	if (path != NULL) {
7780 		put_vnode(pathVnode);
7781 
7782 		if (mount->root_vnode != pathVnode) {
7783 			// not a mount point
7784 			return B_BAD_VALUE;
7785 		}
7786 	}
7787 
7788 	// if the volume is associated with a partition, lock the device of the
7789 	// partition as long as we are unmounting
7790 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7791 	KPartition* partition = mount->partition;
7792 	KDiskDevice* diskDevice = NULL;
7793 	if (partition != NULL) {
7794 		if (partition->Device() == NULL) {
7795 			dprintf("fs_unmount(): There is no device!\n");
7796 			return B_ERROR;
7797 		}
7798 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7799 		if (!diskDevice) {
7800 			TRACE(("fs_unmount(): Failed to lock disk device!\n"));
7801 			return B_ERROR;
7802 		}
7803 	}
7804 	DeviceWriteLocker writeLocker(diskDevice, true);
7805 
7806 	// make sure that the partition is not busy
7807 	if (partition != NULL) {
7808 		if ((flags & B_UNMOUNT_BUSY_PARTITION) == 0 && partition->IsBusy()) {
7809 			TRACE(("fs_unmount(): Partition is busy.\n"));
7810 			return B_BUSY;
7811 		}
7812 	}
7813 
7814 	// grab the vnode master mutex to keep someone from creating
7815 	// a vnode while we're figuring out if we can continue
7816 	WriteLocker vnodesWriteLocker(&sVnodeLock);
7817 
7818 	bool disconnectedDescriptors = false;
7819 
7820 	while (true) {
7821 		bool busy = false;
7822 
7823 		// cycle through the list of vnodes associated with this mount and
7824 		// make sure none of them is busy or still referenced
7825 		VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7826 		while (struct vnode* vnode = iterator.Next()) {
7827 			if (vnode->IsBusy()) {
7828 				busy = true;
7829 				break;
7830 			}
7831 
7832 			// check the vnode's ref count -- subtract additional references for
7833 			// covering
7834 			int32 refCount = vnode->ref_count;
7835 			if (vnode->covers != NULL)
7836 				refCount--;
7837 			if (vnode->covered_by != NULL)
7838 				refCount--;
7839 
7840 			if (refCount != 0) {
7841 				// there are still vnodes in use on this mount, so we cannot
7842 				// unmount yet
7843 				busy = true;
7844 				break;
7845 			}
7846 		}
7847 
7848 		if (!busy)
7849 			break;
7850 
7851 		if ((flags & B_FORCE_UNMOUNT) == 0)
7852 			return B_BUSY;
7853 
7854 		if (disconnectedDescriptors) {
7855 			// wait a bit until the last access is finished, and then try again
7856 			vnodesWriteLocker.Unlock();
7857 			snooze(100000);
7858 			// TODO: if there is some kind of bug that prevents the ref counts
7859 			// from getting back to zero, this will fall into an endless loop...
7860 			vnodesWriteLocker.Lock();
7861 			continue;
7862 		}
7863 
7864 		// the file system is still busy - but we're forced to unmount it,
7865 		// so let's disconnect all open file descriptors
7866 
7867 		mount->unmounting = true;
7868 			// prevent new vnodes from being created
7869 
7870 		vnodesWriteLocker.Unlock();
7871 
7872 		disconnect_mount_or_vnode_fds(mount, NULL);
7873 		disconnectedDescriptors = true;
7874 
7875 		vnodesWriteLocker.Lock();
7876 	}
7877 
7878 	// We can safely continue. Mark all of the vnodes busy and this mount
7879 	// structure in unmounting state. Also undo the vnode covers/covered_by
7880 	// links.
7881 	mount->unmounting = true;
7882 
7883 	VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7884 	while (struct vnode* vnode = iterator.Next()) {
7885 		// Remove all covers/covered_by links from other mounts' nodes to this
7886 		// vnode and adjust the node ref count accordingly. We will release the
7887 		// references to the external vnodes below.
7888 		if (Vnode* coveredNode = vnode->covers) {
7889 			if (Vnode* coveringNode = vnode->covered_by) {
7890 				// We have both covered and covering vnodes, so just remove us
7891 				// from the chain.
7892 				coveredNode->covered_by = coveringNode;
7893 				coveringNode->covers = coveredNode;
7894 				vnode->ref_count -= 2;
7895 
7896 				vnode->covered_by = NULL;
7897 				vnode->covers = NULL;
7898 				vnode->SetCovering(false);
7899 				vnode->SetCovered(false);
7900 			} else {
7901 				// We only have a covered vnode. Remove its link to us.
7902 				coveredNode->covered_by = NULL;
7903 				coveredNode->SetCovered(false);
7904 				vnode->ref_count--;
7905 
7906 				// If the other node is an external vnode, we keep its link
7907 				// around so we can put the reference later on. Otherwise
7908 				// we get rid of it right now.
7909 				if (coveredNode->mount == mount) {
7910 					vnode->covers = NULL;
7911 					coveredNode->ref_count--;
7912 				}
7913 			}
7914 		} else if (Vnode* coveringNode = vnode->covered_by) {
7915 			// We only have a covering vnode. Remove its link to us.
7916 			coveringNode->covers = NULL;
7917 			coveringNode->SetCovering(false);
7918 			vnode->ref_count--;
7919 
7920 			// If the other node is an external vnode, we keep its link
7921 			// around so we can put the reference later on. Otherwise
7922 			// we get rid of it right now.
7923 			if (coveringNode->mount == mount) {
7924 				vnode->covered_by = NULL;
7925 				coveringNode->ref_count--;
7926 			}
7927 		}
7928 
7929 		vnode->SetBusy(true);
7930 		vnode_to_be_freed(vnode);
7931 	}
7932 
7933 	vnodesWriteLocker.Unlock();
7934 
7935 	// Free all vnodes associated with this mount.
7936 	// They will be removed from the mount list by free_vnode(), so
7937 	// we don't have to do it here.
7938 	while (struct vnode* vnode = mount->vnodes.Head()) {
7939 		// Put the references to external covered/covering vnodes we kept above.
7940 		if (Vnode* coveredNode = vnode->covers)
7941 			put_vnode(coveredNode);
7942 		if (Vnode* coveringNode = vnode->covered_by)
7943 			put_vnode(coveringNode);
7944 
7945 		free_vnode(vnode, false);
7946 	}
7947 
7948 	// remove the mount structure from the hash table
7949 	rw_lock_write_lock(&sMountLock);
7950 	sMountsTable->Remove(mount);
7951 	rw_lock_write_unlock(&sMountLock);
7952 
7953 	mountOpLocker.Unlock();
7954 
7955 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7956 	notify_unmount(mount->id);
7957 
7958 	// dereference the partition and mark it unmounted
7959 	if (partition) {
7960 		partition->SetVolumeID(-1);
7961 		partition->SetMountCookie(NULL);
7962 
7963 		if (mount->owns_file_device)
7964 			KDiskDeviceManager::Default()->DeleteFileDevice(partition->ID());
7965 		partition->Unregister();
7966 	}
7967 
7968 	delete mount;
7969 	return B_OK;
7970 }
7971 
7972 
7973 static status_t
7974 fs_sync(dev_t device)
7975 {
7976 	struct fs_mount* mount;
7977 	status_t status = get_mount(device, &mount);
7978 	if (status != B_OK)
7979 		return status;
7980 
7981 	struct vnode marker;
7982 	memset(&marker, 0, sizeof(marker));
7983 	marker.SetBusy(true);
7984 	marker.SetRemoved(true);
7985 
7986 	// First, synchronize all file caches
7987 
7988 	while (true) {
7989 		WriteLocker locker(sVnodeLock);
7990 			// Note: That's the easy way, which is probably OK for sync(),
7991 			// since it's a relatively rare call and doesn't need to allow for
7992 			// a lot of concurrency. Using a read lock would be possible, but
7993 			// also more involved, since we would have to lock the individual
7994 			// nodes and take care of the locking order, which we might not
7995 			// want to do while holding fs_mount::lock.
7996 
7997 		// synchronize access to vnode list
7998 		mutex_lock(&mount->lock);
7999 
8000 		struct vnode* vnode;
8001 		if (!marker.IsRemoved()) {
8002 			vnode = mount->vnodes.GetNext(&marker);
8003 			mount->vnodes.Remove(&marker);
8004 			marker.SetRemoved(true);
8005 		} else
8006 			vnode = mount->vnodes.First();
8007 
8008 		while (vnode != NULL && (vnode->cache == NULL
8009 			|| vnode->IsRemoved() || vnode->IsBusy())) {
8010 			// TODO: we could track writes (and writable mapped vnodes)
8011 			//	and have a simple flag that we could test for here
8012 			vnode = mount->vnodes.GetNext(vnode);
8013 		}
8014 
8015 		if (vnode != NULL) {
8016 			// insert marker vnode again
8017 			mount->vnodes.Insert(mount->vnodes.GetNext(vnode), &marker);
8018 			marker.SetRemoved(false);
8019 		}
8020 
8021 		mutex_unlock(&mount->lock);
8022 
8023 		if (vnode == NULL)
8024 			break;
8025 
8026 		vnode = lookup_vnode(mount->id, vnode->id);
8027 		if (vnode == NULL || vnode->IsBusy())
8028 			continue;
8029 
8030 		if (vnode->ref_count == 0) {
8031 			// this vnode has been unused before
8032 			vnode_used(vnode);
8033 		}
8034 		inc_vnode_ref_count(vnode);
8035 
8036 		locker.Unlock();
8037 
8038 		if (vnode->cache != NULL && !vnode->IsRemoved())
8039 			vnode->cache->WriteModified();
8040 
8041 		put_vnode(vnode);
8042 	}
8043 
8044 	// Let the file systems do their synchronizing work
8045 	if (HAS_FS_MOUNT_CALL(mount, sync))
8046 		status = FS_MOUNT_CALL_NO_PARAMS(mount, sync);
8047 
8048 	// Finally, flush the underlying device's write cache (if possible).
8049 	if (mount->partition != NULL && mount->partition->Device() != NULL)
8050 		ioctl(mount->partition->Device()->FD(), B_FLUSH_DRIVE_CACHE);
8051 
8052 	put_mount(mount);
8053 	return status;
8054 }
8055 
8056 
8057 static status_t
8058 fs_read_info(dev_t device, struct fs_info* info)
8059 {
8060 	struct fs_mount* mount;
8061 	status_t status = get_mount(device, &mount);
8062 	if (status != B_OK)
8063 		return status;
8064 
8065 	memset(info, 0, sizeof(struct fs_info));
8066 
8067 	if (HAS_FS_MOUNT_CALL(mount, read_fs_info))
8068 		status = FS_MOUNT_CALL(mount, read_fs_info, info);
8069 
8070 	// fill in info the file system doesn't (have to) know about
8071 	if (status == B_OK) {
8072 		info->dev = mount->id;
8073 		info->root = mount->root_vnode->id;
8074 
8075 		fs_volume* volume = mount->volume;
8076 		while (volume->super_volume != NULL)
8077 			volume = volume->super_volume;
8078 
8079 		strlcpy(info->fsh_name, volume->file_system_name,
8080 			sizeof(info->fsh_name));
8081 		if (mount->device_name != NULL) {
8082 			strlcpy(info->device_name, mount->device_name,
8083 				sizeof(info->device_name));
8084 		}
8085 	}
8086 
8087 	// even if the call is not supported by the file system, the parts
8088 	// that we filled out ourselves are still valid
8089 
8090 	put_mount(mount);
8091 	return status;
8092 }
8093 
8094 
8095 static status_t
8096 fs_write_info(dev_t device, const struct fs_info* info, int mask)
8097 {
8098 	struct fs_mount* mount;
8099 	status_t status = get_mount(device, &mount);
8100 	if (status != B_OK)
8101 		return status;
8102 
8103 	if (HAS_FS_MOUNT_CALL(mount, write_fs_info))
8104 		status = FS_MOUNT_CALL(mount, write_fs_info, info, mask);
8105 	else
8106 		status = B_READ_ONLY_DEVICE;
8107 
8108 	put_mount(mount);
8109 	return status;
8110 }
8111 
8112 
8113 static dev_t
8114 fs_next_device(int32* _cookie)
8115 {
8116 	struct fs_mount* mount = NULL;
8117 	dev_t device = *_cookie;
8118 
8119 	rw_lock_read_lock(&sMountLock);
8120 
8121 	// Since device IDs are assigned sequentially, this algorithm
8122 	// works well enough. It makes sure that the device list
8123 	// returned is sorted, and that no device is skipped when an
8124 	// already visited device gets unmounted.
8125 
8126 	while (device < sNextMountID) {
8127 		mount = find_mount(device++);
8128 		if (mount != NULL && mount->volume->private_volume != NULL)
8129 			break;
8130 	}
8131 
8132 	*_cookie = device;
8133 
8134 	if (mount != NULL)
8135 		device = mount->id;
8136 	else
8137 		device = B_BAD_VALUE;
8138 
8139 	rw_lock_read_unlock(&sMountLock);
8140 
8141 	return device;
8142 }
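
// A minimal iteration sketch (hypothetical caller) over all mounted volumes,
// using the cookie protocol fs_next_device() implements: the cookie is simply
// the next device ID to try.
//
//	int32 cookie = 0;
//	dev_t device;
//	while ((device = fs_next_device(&cookie)) >= 0) {
//		// "device" is the ID of the next mounted volume, in ascending order
//	}
//	// the loop terminates once fs_next_device() returns B_BAD_VALUE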
8143 
8144 
8145 ssize_t
8146 fs_read_attr(int fd, const char *attribute, uint32 type, off_t pos,
8147 	void *buffer, size_t readBytes)
8148 {
8149 	int attrFD = attr_open(fd, NULL, attribute, O_RDONLY, true);
8150 	if (attrFD < 0)
8151 		return attrFD;
8152 
8153 	ssize_t bytesRead = _kern_read(attrFD, pos, buffer, readBytes);
8154 
8155 	_kern_close(attrFD);
8156 
8157 	return bytesRead;
8158 }
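
// Usage sketch (hypothetical; the attribute name is only illustrative):
// reading a file's "BEOS:TYPE" attribute via the helper above. Note that the
// implementation opens, reads, and closes the attribute in one go and does
// not interpret the "type" argument.
//
//	char mimeType[B_FILE_NAME_LENGTH];
//	ssize_t bytesRead = fs_read_attr(fd, "BEOS:TYPE", B_MIME_STRING_TYPE, 0,
//		mimeType, sizeof(mimeType));
//	if (bytesRead < 0)
//		; // opening or reading failed; bytesRead holds the error code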
8159 
8160 
8161 static status_t
8162 get_cwd(char* buffer, size_t size, bool kernel)
8163 {
8164 	// Get current working directory from io context
8165 	struct io_context* context = get_current_io_context(kernel);
8166 	status_t status;
8167 
8168 	FUNCTION(("vfs_get_cwd: buf %p, size %ld\n", buffer, size));
8169 
8170 	mutex_lock(&context->io_mutex);
8171 
8172 	struct vnode* vnode = context->cwd;
8173 	if (vnode)
8174 		inc_vnode_ref_count(vnode);
8175 
8176 	mutex_unlock(&context->io_mutex);
8177 
8178 	if (vnode) {
8179 		status = dir_vnode_to_path(vnode, buffer, size, kernel);
8180 		put_vnode(vnode);
8181 	} else
8182 		status = B_ERROR;
8183 
8184 	return status;
8185 }
8186 
8187 
8188 static status_t
8189 set_cwd(int fd, char* path, bool kernel)
8190 {
8191 	struct io_context* context;
8192 	struct vnode* vnode = NULL;
8193 	struct vnode* oldDirectory;
8194 	status_t status;
8195 
8196 	FUNCTION(("set_cwd: path = \'%s\'\n", path));
8197 
8198 	// Get vnode for passed path, and bail if it failed
8199 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
8200 	if (status < 0)
8201 		return status;
8202 
8203 	if (!S_ISDIR(vnode->Type())) {
8204 		// nope, can't cwd to here
8205 		status = B_NOT_A_DIRECTORY;
8206 		goto err;
8207 	}
8208 
8209 	// We need to have the permission to enter the directory, too
8210 	if (HAS_FS_CALL(vnode, access)) {
8211 		status = FS_CALL(vnode, access, X_OK);
8212 		if (status != B_OK)
8213 			goto err;
8214 	}
8215 
8216 	// Get current io context and lock
8217 	context = get_current_io_context(kernel);
8218 	mutex_lock(&context->io_mutex);
8219 
8220 	// save the old current working directory first
8221 	oldDirectory = context->cwd;
8222 	context->cwd = vnode;
8223 
8224 	mutex_unlock(&context->io_mutex);
8225 
8226 	if (oldDirectory)
8227 		put_vnode(oldDirectory);
8228 
8229 	return B_NO_ERROR;
8230 
8231 err:
8232 	put_vnode(vnode);
8233 	return status;
8234 }
8235 
8236 
8237 static status_t
8238 user_copy_name(char* to, const char* from, size_t length)
8239 {
8240 	ssize_t len = user_strlcpy(to, from, length);
8241 	if (len < 0)
8242 		return len;
8243 	if (len >= (ssize_t)length)
8244 		return B_NAME_TOO_LONG;
8245 	return B_OK;
8246 }
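
// The helper's contract at a glance (illustrative sketch): a negative return
// is the user_strlcpy() error, B_NAME_TOO_LONG means the source did not fit
// the buffer, and B_OK guarantees a NUL-terminated copy.
//
//	char name[B_FILE_NAME_LENGTH];
//	status_t status = user_copy_name(name, userName, sizeof(name));
//	if (status == B_NAME_TOO_LONG)
//		; // userName was longer than sizeof(name) - 1 characters
//	else if (status != B_OK)
//		; // copying from userland faulted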
8247 
8248 
8249 //	#pragma mark - kernel mirrored syscalls
8250 
8251 
8252 dev_t
8253 _kern_mount(const char* path, const char* device, const char* fsName,
8254 	uint32 flags, const char* args, size_t argsLength)
8255 {
8256 	KPath pathBuffer(path);
8257 	if (pathBuffer.InitCheck() != B_OK)
8258 		return B_NO_MEMORY;
8259 
8260 	return fs_mount(pathBuffer.LockBuffer(), device, fsName, flags, args, true);
8261 }
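
// Hypothetical kernel-side usage (the mount point, device path, and FS name
// are made up for illustration):
//
//	dev_t volume = _kern_mount("/mnt", "/dev/disk/usb/0/raw", "bfs", 0,
//		NULL, 0);
//	if (volume < 0)
//		; // mounting failed; "volume" holds the error code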
8262 
8263 
8264 status_t
8265 _kern_unmount(const char* path, uint32 flags)
8266 {
8267 	KPath pathBuffer(path);
8268 	if (pathBuffer.InitCheck() != B_OK)
8269 		return B_NO_MEMORY;
8270 
8271 	return fs_unmount(pathBuffer.LockBuffer(), -1, flags, true);
8272 }
8273 
8274 
8275 status_t
8276 _kern_read_fs_info(dev_t device, struct fs_info* info)
8277 {
8278 	if (info == NULL)
8279 		return B_BAD_VALUE;
8280 
8281 	return fs_read_info(device, info);
8282 }
8283 
8284 
8285 status_t
8286 _kern_write_fs_info(dev_t device, const struct fs_info* info, int mask)
8287 {
8288 	if (info == NULL)
8289 		return B_BAD_VALUE;
8290 
8291 	return fs_write_info(device, info, mask);
8292 }
8293 
8294 
8295 status_t
8296 _kern_sync(void)
8297 {
8298 	// Note: _kern_sync() is also called from _user_sync()
8299 	int32 cookie = 0;
8300 	dev_t device;
8301 	while ((device = next_dev(&cookie)) >= 0) {
8302 		status_t status = fs_sync(device);
8303 		if (status != B_OK && status != B_BAD_VALUE) {
8304 			dprintf("sync: device %" B_PRIdDEV " couldn't sync: %s\n", device,
8305 				strerror(status));
8306 		}
8307 	}
8308 
8309 	return B_OK;
8310 }
8311 
8312 
8313 dev_t
8314 _kern_next_device(int32* _cookie)
8315 {
8316 	return fs_next_device(_cookie);
8317 }
8318 
8319 
8320 status_t
8321 _kern_get_next_fd_info(team_id teamID, uint32* _cookie, fd_info* info,
8322 	size_t infoSize)
8323 {
8324 	if (infoSize != sizeof(fd_info))
8325 		return B_BAD_VALUE;
8326 
8327 	// get the team
8328 	Team* team = Team::Get(teamID);
8329 	if (team == NULL)
8330 		return B_BAD_TEAM_ID;
8331 	BReference<Team> teamReference(team, true);
8332 
8333 	// now that we have a team reference, its I/O context won't go away
8334 	io_context* context = team->io_context;
8335 	MutexLocker contextLocker(context->io_mutex);
8336 
8337 	uint32 slot = *_cookie;
8338 
8339 	struct file_descriptor* descriptor;
8340 	while (slot < context->table_size
8341 		&& (descriptor = context->fds[slot]) == NULL) {
8342 		slot++;
8343 	}
8344 
8345 	if (slot >= context->table_size)
8346 		return B_ENTRY_NOT_FOUND;
8347 
8348 	info->number = slot;
8349 	info->open_mode = descriptor->open_mode;
8350 
8351 	struct vnode* vnode = fd_vnode(descriptor);
8352 	if (vnode != NULL) {
8353 		info->device = vnode->device;
8354 		info->node = vnode->id;
8355 	} else if (descriptor->u.mount != NULL) {
8356 		info->device = descriptor->u.mount->id;
8357 		info->node = -1;
8358 	}
8359 
8360 	*_cookie = slot + 1;
8361 	return B_OK;
8362 }
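
// Iteration sketch (hypothetical; assumes "teamID" names an existing team):
// the cookie is simply the next FD slot to inspect, so starting from 0
// enumerates all open descriptors of the team.
//
//	uint32 cookie = 0;
//	fd_info info;
//	while (_kern_get_next_fd_info(teamID, &cookie, &info, sizeof(info))
//			== B_OK) {
//		dprintf("fd %" B_PRId32 ": device %" B_PRIdDEV "\n", info.number,
//			info.device);
//	}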
8363 
8364 
8365 int
8366 _kern_open_entry_ref(dev_t device, ino_t inode, const char* name, int openMode,
8367 	int perms)
8368 {
8369 	if ((openMode & O_CREAT) != 0) {
8370 		return file_create_entry_ref(device, inode, name, openMode, perms,
8371 			true);
8372 	}
8373 
8374 	return file_open_entry_ref(device, inode, name, openMode, true);
8375 }
8376 
8377 
8378 /*!	\brief Opens a node specified by a FD + path pair.
8379 
8380 	At least one of \a fd and \a path must be specified.
8381 	If only \a fd is given, the function opens the node identified by this
8382 	FD. If only a path is given, this path is opened. If both are given and
8383 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8384 	of the directory (!) identified by \a fd.
8385 
8386 	\param fd The FD. May be < 0.
8387 	\param path The absolute or relative path. May be \c NULL.
8388 	\param openMode The open mode.
8389 	\return A FD referring to the newly opened node, or an error code,
8390 			if an error occurs.
8391 */
8392 int
8393 _kern_open(int fd, const char* path, int openMode, int perms)
8394 {
8395 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8396 	if (pathBuffer.InitCheck() != B_OK)
8397 		return B_NO_MEMORY;
8398 
8399 	if ((openMode & O_CREAT) != 0)
8400 		return file_create(fd, pathBuffer.LockBuffer(), openMode, perms, true);
8401 
8402 	return file_open(fd, pathBuffer.LockBuffer(), openMode, true);
8403 }
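
// Usage sketch (hypothetical path): creating the file if necessary and
// opening it read/write, mirroring what open(2) provides to userland.
//
//	int fd = _kern_open(-1, "/boot/home/example", O_RDWR | O_CREAT, 0644);
//	if (fd >= 0)
//		_kern_close(fd);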
8404 
8405 
8406 /*!	\brief Opens a directory specified by entry_ref or node_ref.
8407 
8408 	The supplied name may be \c NULL, in which case the directory identified
8409 	by \a device and \a inode will be opened. Otherwise \a device and
8410 	\a inode identify the parent directory of the directory to be opened
8411 	and \a name its entry name.
8412 
8413 	\param device If \a name is specified the ID of the device the parent
8414 		   directory of the directory to be opened resides on, otherwise
8415 		   the device of the directory itself.
8416 	\param inode If \a name is specified the node ID of the parent
8417 		   directory of the directory to be opened, otherwise the node ID of the
8418 		   directory itself.
8419 	\param name The entry name of the directory to be opened. If \c NULL,
8420 		   the \a device + \a inode pair identify the node to be opened.
8421 	\return The FD of the newly opened directory or an error code, if
8422 			something went wrong.
8423 */
8424 int
8425 _kern_open_dir_entry_ref(dev_t device, ino_t inode, const char* name)
8426 {
8427 	return dir_open_entry_ref(device, inode, name, true);
8428 }
8429 
8430 
8431 /*!	\brief Opens a directory specified by a FD + path pair.
8432 
8433 	At least one of \a fd and \a path must be specified.
8434 	If only \a fd is given, the function opens the directory identified by this
8435 	FD. If only a path is given, this path is opened. If both are given and
8436 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8437 	of the directory (!) identified by \a fd.
8438 
8439 	\param fd The FD. May be < 0.
8440 	\param path The absolute or relative path. May be \c NULL.
8441 	\return A FD referring to the newly opened directory, or an error code,
8442 			if an error occurs.
8443 */
8444 int
8445 _kern_open_dir(int fd, const char* path)
8446 {
8447 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8448 	if (pathBuffer.InitCheck() != B_OK)
8449 		return B_NO_MEMORY;
8450 
8451 	return dir_open(fd, pathBuffer.LockBuffer(), true);
8452 }
8453 
8454 
8455 status_t
8456 _kern_fcntl(int fd, int op, size_t argument)
8457 {
8458 	return common_fcntl(fd, op, argument, true);
8459 }
8460 
8461 
8462 status_t
8463 _kern_fsync(int fd)
8464 {
8465 	return common_sync(fd, true);
8466 }
8467 
8468 
8469 status_t
8470 _kern_lock_node(int fd)
8471 {
8472 	return common_lock_node(fd, true);
8473 }
8474 
8475 
8476 status_t
8477 _kern_unlock_node(int fd)
8478 {
8479 	return common_unlock_node(fd, true);
8480 }
8481 
8482 
8483 status_t
8484 _kern_preallocate(int fd, off_t offset, off_t length)
8485 {
8486 	return common_preallocate(fd, offset, length, true);
8487 }
8488 
8489 
8490 status_t
8491 _kern_create_dir_entry_ref(dev_t device, ino_t inode, const char* name,
8492 	int perms)
8493 {
8494 	return dir_create_entry_ref(device, inode, name, perms, true);
8495 }
8496 
8497 
8498 /*!	\brief Creates a directory specified by a FD + path pair.
8499 
8500 	\a path must always be specified (it contains the name of the new directory
8501 	at least). If only a path is given, this path identifies the location at
8502 	which the directory shall be created. If both \a fd and \a path are given
8503 	and the path is absolute, \a fd is ignored; a relative path is reckoned off
8504 	of the directory (!) identified by \a fd.
8505 
8506 	\param fd The FD. May be < 0.
8507 	\param path The absolute or relative path. Must not be \c NULL.
8508 	\param perms The access permissions the new directory shall have.
8509 	\return \c B_OK, if the directory has been created successfully, another
8510 			error code otherwise.
8511 */
8512 status_t
8513 _kern_create_dir(int fd, const char* path, int perms)
8514 {
8515 	KPath pathBuffer(path, KPath::DEFAULT);
8516 	if (pathBuffer.InitCheck() != B_OK)
8517 		return B_NO_MEMORY;
8518 
8519 	return dir_create(fd, pathBuffer.LockBuffer(), perms, true);
8520 }
8521 
8522 
8523 status_t
8524 _kern_remove_dir(int fd, const char* path)
8525 {
8526 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8527 	if (pathBuffer.InitCheck() != B_OK)
8528 		return B_NO_MEMORY;
8529 
8530 	return dir_remove(fd, pathBuffer.LockBuffer(), true);
8531 }
8532 
8533 
8534 /*!	\brief Reads the contents of a symlink referred to by a FD + path pair.
8535 
8536 	At least one of \a fd and \a path must be specified.
8537 	If only \a fd is given, the symlink to be read is the node
8538 	identified by this FD. If only a path is given, this path identifies the
8539 	symlink to be read. If both are given and the path is absolute, \a fd is
8540 	ignored; a relative path is reckoned off of the directory (!) identified
8541 	by \a fd.
8542 	If this function fails with B_BUFFER_OVERFLOW, the \a _bufferSize pointer
8543 	will still be updated to reflect the required buffer size.
8544 
8545 	\param fd The FD. May be < 0.
8546 	\param path The absolute or relative path. May be \c NULL.
8547 	\param buffer The buffer into which the contents of the symlink shall be
8548 		   written.
8549 	\param _bufferSize A pointer to the size of the supplied buffer.
8550 	\return The length of the link on success or an appropriate error code.
8551 */
8552 status_t
8553 _kern_read_link(int fd, const char* path, char* buffer, size_t* _bufferSize)
8554 {
8555 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8556 	if (pathBuffer.InitCheck() != B_OK)
8557 		return B_NO_MEMORY;
8558 
8559 	return common_read_link(fd, pathBuffer.LockBuffer(),
8560 		buffer, _bufferSize, true);
8561 }
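
// Sketch of the B_BUFFER_OVERFLOW protocol described above (the link path is
// hypothetical): on overflow, the size pointed to by \a _bufferSize reports
// how large the buffer would have to be.
//
//	char buffer[B_PATH_NAME_LENGTH];
//	size_t bufferSize = sizeof(buffer);
//	status_t status = _kern_read_link(-1, "/boot/home/link", buffer,
//		&bufferSize);
//	if (status == B_BUFFER_OVERFLOW)
//		; // bufferSize now holds the required size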
8562 
8563 
8564 /*!	\brief Creates a symlink specified by a FD + path pair.
8565 
8566 	\a path must always be specified (it contains the name of the new symlink
8567 	at least). If only a path is given, this path identifies the location at
8568 	which the symlink shall be created. If both \a fd and \a path are given and
8569 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8570 	of the directory (!) identified by \a fd.
8571 
8572 	\param fd The FD. May be < 0.
8573 	\param toPath The path the symlink shall point to. Must not be \c NULL.
8574 	\param mode The access permissions the new symlink shall have.
8575 	\return \c B_OK, if the symlink has been created successfully, another
8576 			error code otherwise.
8577 */
8578 status_t
8579 _kern_create_symlink(int fd, const char* path, const char* toPath, int mode)
8580 {
8581 	KPath pathBuffer(path);
8582 	if (pathBuffer.InitCheck() != B_OK)
8583 		return B_NO_MEMORY;
8584 
8585 	return common_create_symlink(fd, pathBuffer.LockBuffer(),
8586 		toPath, mode, true);
8587 }
8588 
8589 
8590 status_t
8591 _kern_create_link(int pathFD, const char* path, int toFD, const char* toPath,
8592 	bool traverseLeafLink)
8593 {
8594 	KPath pathBuffer(path);
8595 	KPath toPathBuffer(toPath);
8596 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
8597 		return B_NO_MEMORY;
8598 
8599 	return common_create_link(pathFD, pathBuffer.LockBuffer(), toFD,
8600 		toPathBuffer.LockBuffer(), traverseLeafLink, true);
8601 }
8602 
8603 
8604 /*!	\brief Removes an entry specified by a FD + path pair from its directory.
8605 
8606 	\a path must always be specified (it contains at least the name of the entry
8607 	to be deleted). If only a path is given, this path identifies the entry
8608 	directly. If both \a fd and \a path are given and the path is absolute,
8609 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8610 	identified by \a fd.
8611 
8612 	\param fd The FD. May be < 0.
8613 	\param path The absolute or relative path. Must not be \c NULL.
8614 	\return \c B_OK, if the entry has been removed successfully, another
8615 			error code otherwise.
8616 */
8617 status_t
8618 _kern_unlink(int fd, const char* path)
8619 {
8620 	KPath pathBuffer(path);
8621 	if (pathBuffer.InitCheck() != B_OK)
8622 		return B_NO_MEMORY;
8623 
8624 	return common_unlink(fd, pathBuffer.LockBuffer(), true);
8625 }
8626 
8627 
8628 /*!	\brief Moves an entry specified by a FD + path pair to an entry specified
8629 		   by another FD + path pair.
8630 
8631 	\a oldPath and \a newPath must always be specified (they contain at least
8632 	the name of the entry). If only a path is given, this path identifies the
8633 	entry directly. If both a FD and a path are given and the path is absolute,
8634 	the FD is ignored; a relative path is reckoned off of the directory (!)
8635 	identified by the respective FD.
8636 
8637 	\param oldFD The FD of the old location. May be < 0.
8638 	\param oldPath The absolute or relative path of the old location. Must not
8639 		   be \c NULL.
8640 	\param newFD The FD of the new location. May be < 0.
8641 	\param newPath The absolute or relative path of the new location. Must not
8642 		   be \c NULL.
8643 	\return \c B_OK, if the entry has been moved successfully, another
8644 			error code otherwise.
8645 */
8646 status_t
8647 _kern_rename(int oldFD, const char* oldPath, int newFD, const char* newPath)
8648 {
8649 	KPath oldPathBuffer(oldPath);
8650 	KPath newPathBuffer(newPath);
8651 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
8652 		return B_NO_MEMORY;
8653 
8654 	return common_rename(oldFD, oldPathBuffer.LockBuffer(),
8655 		newFD, newPathBuffer.LockBuffer(), true);
8656 }
8657 
8658 
8659 status_t
8660 _kern_access(int fd, const char* path, int mode, bool effectiveUserGroup)
8661 {
8662 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8663 	if (pathBuffer.InitCheck() != B_OK)
8664 		return B_NO_MEMORY;
8665 
8666 	return common_access(fd, pathBuffer.LockBuffer(), mode, effectiveUserGroup,
8667 		true);
8668 }
8669 
8670 
8671 /*!	\brief Reads stat data of an entity specified by a FD + path pair.
8672 
8673 	If only \a fd is given, the stat operation associated with the type
8674 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8675 	given, this path identifies the entry for whose node to retrieve the
8676 	stat data. If both \a fd and \a path are given and the path is absolute,
8677 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8678 	identified by \a fd and specifies the entry whose stat data shall be
8679 	retrieved.
8680 
8681 	\param fd The FD. May be < 0.
8682 	\param path The absolute or relative path. Must not be \c NULL.
8683 	\param traverseLeafLink If \a path is given, \c true specifies that the
8684 		   function shall not stick to symlinks, but traverse them.
8685 	\param stat The buffer the stat data shall be written into.
8686 	\param statSize The size of the supplied stat buffer.
8687 	\return \c B_OK, if the stat data has been read successfully, another
8688 			error code otherwise.
8689 */
8690 status_t
8691 _kern_read_stat(int fd, const char* path, bool traverseLeafLink,
8692 	struct stat* stat, size_t statSize)
8693 {
8694 	struct stat completeStat;
8695 	struct stat* originalStat = NULL;
8696 	status_t status;
8697 
8698 	if (statSize > sizeof(struct stat))
8699 		return B_BAD_VALUE;
8700 
8701 	// this supports different stat extensions
8702 	if (statSize < sizeof(struct stat)) {
8703 		originalStat = stat;
8704 		stat = &completeStat;
8705 	}
8706 
8707 	status = vfs_read_stat(fd, path, traverseLeafLink, stat, true);
8708 
8709 	if (status == B_OK && originalStat != NULL)
8710 		memcpy(originalStat, stat, statSize);
8711 
8712 	return status;
8713 }
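
// The statSize handling above lets callers compiled against an older,
// smaller "struct stat" keep working: the full stat is read internally and
// only the leading statSize bytes are copied back. A current caller simply
// passes the full size (sketch):
//
//	struct stat st;
//	status_t status = _kern_read_stat(fd, NULL, false, &st, sizeof(st));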
8714 
8715 
8716 /*!	\brief Writes stat data of an entity specified by a FD + path pair.
8717 
8718 	If only \a fd is given, the stat operation associated with the type
8719 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8720 	given, this path identifies the entry for whose node to write the
8721 	stat data. If both \a fd and \a path are given and the path is absolute,
8722 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8723 	identified by \a fd and specifies the entry whose stat data shall be
8724 	written.
8725 
8726 	\param fd The FD. May be < 0.
8727 	\param path The absolute or relative path. May be \c NULL.
8728 	\param traverseLeafLink If \a path is given, \c true specifies that the
8729 		   function shall not stick to symlinks, but traverse them.
8730 	\param stat The buffer containing the stat data to be written.
8731 	\param statSize The size of the supplied stat buffer.
8732 	\param statMask A mask specifying which parts of the stat data shall be
8733 		   written.
8734 	\return \c B_OK, if the stat data has been written successfully,
8735 			another error code otherwise.
8736 */
8737 status_t
8738 _kern_write_stat(int fd, const char* path, bool traverseLeafLink,
8739 	const struct stat* stat, size_t statSize, int statMask)
8740 {
8741 	struct stat completeStat;
8742 
8743 	if (statSize > sizeof(struct stat))
8744 		return B_BAD_VALUE;
8745 
8746 	// this supports different stat extensions
8747 	if (statSize < sizeof(struct stat)) {
8748 		memset((uint8*)&completeStat + statSize, 0,
8749 			sizeof(struct stat) - statSize);
8750 		memcpy(&completeStat, stat, statSize);
8751 		stat = &completeStat;
8752 	}
8753 
8754 	status_t status;
8755 
8756 	if (path != NULL) {
8757 		// path given: write the stat of the node referred to by (fd, path)
8758 		KPath pathBuffer(path);
8759 		if (pathBuffer.InitCheck() != B_OK)
8760 			return B_NO_MEMORY;
8761 
8762 		status = common_path_write_stat(fd, pathBuffer.LockBuffer(),
8763 			traverseLeafLink, stat, statMask, true);
8764 	} else {
8765 		// no path given: get the FD and use the FD operation
8766 		struct file_descriptor* descriptor
8767 			= get_fd(get_current_io_context(true), fd);
8768 		if (descriptor == NULL)
8769 			return B_FILE_ERROR;
8770 
8771 		if (descriptor->ops->fd_write_stat)
8772 			status = descriptor->ops->fd_write_stat(descriptor, stat, statMask);
8773 		else
8774 			status = B_UNSUPPORTED;
8775 
8776 		put_fd(descriptor);
8777 	}
8778 
8779 	return status;
8780 }
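
// Write-side counterpart (sketch): statMask selects which fields are
// applied, e.g. B_STAT_MODE to change only the permission bits of the node
// referred to by an open FD. Fields not covered by the mask may be left
// uninitialized.
//
//	struct stat st;
//	st.st_mode = 0640;
//	status_t status = _kern_write_stat(fd, NULL, false, &st, sizeof(st),
//		B_STAT_MODE);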
8781 
8782 
8783 int
8784 _kern_open_attr_dir(int fd, const char* path, bool traverseLeafLink)
8785 {
8786 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8787 	if (pathBuffer.InitCheck() != B_OK)
8788 		return B_NO_MEMORY;
8789 
8790 	return attr_dir_open(fd, pathBuffer.LockBuffer(), traverseLeafLink, true);
8791 }
8792 
8793 
8794 int
8795 _kern_open_attr(int fd, const char* path, const char* name, uint32 type,
8796 	int openMode)
8797 {
8798 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8799 	if (pathBuffer.InitCheck() != B_OK)
8800 		return B_NO_MEMORY;
8801 
8802 	if ((openMode & O_CREAT) != 0) {
8803 		return attr_create(fd, pathBuffer.LockBuffer(), name, type, openMode,
8804 			true);
8805 	}
8806 
8807 	return attr_open(fd, pathBuffer.LockBuffer(), name, openMode, true);
8808 }
8809 
8810 
8811 status_t
8812 _kern_remove_attr(int fd, const char* name)
8813 {
8814 	return attr_remove(fd, name, true);
8815 }
8816 
8817 
8818 status_t
8819 _kern_rename_attr(int fromFile, const char* fromName, int toFile,
8820 	const char* toName)
8821 {
8822 	return attr_rename(fromFile, fromName, toFile, toName, true);
8823 }
8824 
8825 
8826 int
8827 _kern_open_index_dir(dev_t device)
8828 {
8829 	return index_dir_open(device, true);
8830 }
8831 
8832 
8833 status_t
8834 _kern_create_index(dev_t device, const char* name, uint32 type, uint32 flags)
8835 {
8836 	return index_create(device, name, type, flags, true);
8837 }
8838 
8839 
8840 status_t
8841 _kern_read_index_stat(dev_t device, const char* name, struct stat* stat)
8842 {
8843 	return index_name_read_stat(device, name, stat, true);
8844 }
8845 
8846 
8847 status_t
8848 _kern_remove_index(dev_t device, const char* name)
8849 {
8850 	return index_remove(device, name, true);
8851 }
8852 
8853 
8854 status_t
8855 _kern_getcwd(char* buffer, size_t size)
8856 {
8857 	TRACE(("_kern_getcwd: buf %p, %ld\n", buffer, size));
8858 
8859 	// Call vfs to get current working directory
8860 	return get_cwd(buffer, size, true);
8861 }
8862 
8863 
8864 status_t
8865 _kern_setcwd(int fd, const char* path)
8866 {
8867 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8868 	if (pathBuffer.InitCheck() != B_OK)
8869 		return B_NO_MEMORY;
8870 
8871 	return set_cwd(fd, pathBuffer.LockBuffer(), true);
8872 }
8873 
8874 
8875 //	#pragma mark - userland syscalls
8876 
8877 
8878 dev_t
8879 _user_mount(const char* userPath, const char* userDevice,
8880 	const char* userFileSystem, uint32 flags, const char* userArgs,
8881 	size_t argsLength)
8882 {
8883 	char fileSystem[B_FILE_NAME_LENGTH];
8884 	KPath path, device;
8885 	char* args = NULL;
8886 	status_t status;
8887 
8888 	if (!IS_USER_ADDRESS(userPath))
8889 		return B_BAD_ADDRESS;
8890 
8891 	if (path.InitCheck() != B_OK || device.InitCheck() != B_OK)
8892 		return B_NO_MEMORY;
8893 
8894 	status = user_copy_name(path.LockBuffer(), userPath,
8895 		B_PATH_NAME_LENGTH);
8896 	if (status != B_OK)
8897 		return status;
8898 	path.UnlockBuffer();
8899 
8900 	if (userFileSystem != NULL) {
8901 		if (!IS_USER_ADDRESS(userFileSystem))
8902 			return B_BAD_ADDRESS;
8903 
8904 		status = user_copy_name(fileSystem, userFileSystem, sizeof(fileSystem));
8905 		if (status != B_OK)
8906 			return status;
8907 	}
8908 
8909 	if (userDevice != NULL) {
8910 		if (!IS_USER_ADDRESS(userDevice))
8911 			return B_BAD_ADDRESS;
8912 
8913 		status = user_copy_name(device.LockBuffer(), userDevice,
8914 			B_PATH_NAME_LENGTH);
8915 		if (status != B_OK)
8916 			return status;
8917 		device.UnlockBuffer();
8918 	}
8919 
8920 	if (userArgs != NULL && argsLength > 0) {
8921 		if (!IS_USER_ADDRESS(userArgs))
8922 			return B_BAD_ADDRESS;
8923 
8924 		// this is a safety restriction
8925 		if (argsLength >= 65536)
8926 			return B_NAME_TOO_LONG;
8927 
8928 		args = (char*)malloc(argsLength + 1);
8929 		if (args == NULL)
8930 			return B_NO_MEMORY;
8931 
8932 		status = user_copy_name(args, userArgs, argsLength + 1);
8933 		if (status != B_OK) {
8934 			free(args);
8935 			return status;
8936 		}
8937 	}
8938 
8939 	status = fs_mount(path.LockBuffer(),
8940 		userDevice != NULL ? device.Path() : NULL,
8941 		userFileSystem ? fileSystem : NULL, flags, args, false);
8942 
8943 	free(args);
8944 	return status;
8945 }
8946 
8947 
8948 status_t
8949 _user_unmount(const char* userPath, uint32 flags)
8950 {
8951 	if (!IS_USER_ADDRESS(userPath))
8952 		return B_BAD_ADDRESS;
8953 
8954 	KPath pathBuffer;
8955 	if (pathBuffer.InitCheck() != B_OK)
8956 		return B_NO_MEMORY;
8957 
8958 	char* path = pathBuffer.LockBuffer();
8959 
8960 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
8961 	if (status != B_OK)
8962 		return status;
8963 
8964 	return fs_unmount(path, -1, flags & ~B_UNMOUNT_BUSY_PARTITION, false);
8965 }
8966 
8967 
8968 status_t
8969 _user_read_fs_info(dev_t device, struct fs_info* userInfo)
8970 {
8971 	struct fs_info info;
8972 	status_t status;
8973 
8974 	if (userInfo == NULL)
8975 		return B_BAD_VALUE;
8976 
8977 	if (!IS_USER_ADDRESS(userInfo))
8978 		return B_BAD_ADDRESS;
8979 
8980 	status = fs_read_info(device, &info);
8981 	if (status != B_OK)
8982 		return status;
8983 
8984 	if (user_memcpy(userInfo, &info, sizeof(struct fs_info)) != B_OK)
8985 		return B_BAD_ADDRESS;
8986 
8987 	return B_OK;
8988 }
8989 
8990 
8991 status_t
8992 _user_write_fs_info(dev_t device, const struct fs_info* userInfo, int mask)
8993 {
8994 	struct fs_info info;
8995 
8996 	if (userInfo == NULL)
8997 		return B_BAD_VALUE;
8998 
8999 	if (!IS_USER_ADDRESS(userInfo)
9000 		|| user_memcpy(&info, userInfo, sizeof(struct fs_info)) != B_OK)
9001 		return B_BAD_ADDRESS;
9002 
9003 	return fs_write_info(device, &info, mask);
9004 }
9005 
9006 
9007 dev_t
9008 _user_next_device(int32* _userCookie)
9009 {
9010 	int32 cookie;
9011 	dev_t device;
9012 
9013 	if (!IS_USER_ADDRESS(_userCookie)
9014 		|| user_memcpy(&cookie, _userCookie, sizeof(int32)) != B_OK)
9015 		return B_BAD_ADDRESS;
9016 
9017 	device = fs_next_device(&cookie);
9018 
9019 	if (device >= B_OK) {
9020 		// update user cookie
9021 		if (user_memcpy(_userCookie, &cookie, sizeof(int32)) != B_OK)
9022 			return B_BAD_ADDRESS;
9023 	}
9024 
9025 	return device;
9026 }
9027 
9028 
9029 status_t
9030 _user_sync(void)
9031 {
9032 	return _kern_sync();
9033 }
9034 
9035 
9036 status_t
9037 _user_get_next_fd_info(team_id team, uint32* userCookie, fd_info* userInfo,
9038 	size_t infoSize)
9039 {
9040 	struct fd_info info;
9041 	uint32 cookie;
9042 
9043 	// only root can do this
9044 	if (geteuid() != 0)
9045 		return B_NOT_ALLOWED;
9046 
9047 	if (infoSize != sizeof(fd_info))
9048 		return B_BAD_VALUE;
9049 
9050 	if (!IS_USER_ADDRESS(userCookie) || !IS_USER_ADDRESS(userInfo)
9051 		|| user_memcpy(&cookie, userCookie, sizeof(uint32)) != B_OK)
9052 		return B_BAD_ADDRESS;
9053 
9054 	status_t status = _kern_get_next_fd_info(team, &cookie, &info, infoSize);
9055 	if (status != B_OK)
9056 		return status;
9057 
9058 	if (user_memcpy(userCookie, &cookie, sizeof(uint32)) != B_OK
9059 		|| user_memcpy(userInfo, &info, infoSize) != B_OK)
9060 		return B_BAD_ADDRESS;
9061 
9062 	return status;
9063 }
9064 
9065 
9066 status_t
9067 _user_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
9068 	char* userPath, size_t pathLength)
9069 {
9070 	if (!IS_USER_ADDRESS(userPath))
9071 		return B_BAD_ADDRESS;
9072 
9073 	KPath path;
9074 	if (path.InitCheck() != B_OK)
9075 		return B_NO_MEMORY;
9076 
9077 	// copy the leaf name onto the stack
9078 	char stackLeaf[B_FILE_NAME_LENGTH];
9079 	if (leaf != NULL) {
9080 		if (!IS_USER_ADDRESS(leaf))
9081 			return B_BAD_ADDRESS;
9082 
9083 		int status = user_copy_name(stackLeaf, leaf, B_FILE_NAME_LENGTH);
9084 		if (status != B_OK)
9085 			return status;
9086 
9087 		leaf = stackLeaf;
9088 	}
9089 
9090 	status_t status = vfs_entry_ref_to_path(device, inode, leaf,
9091 		false, path.LockBuffer(), path.BufferSize());
9092 	if (status != B_OK)
9093 		return status;
9094 
9095 	path.UnlockBuffer();
9096 
9097 	int length = user_strlcpy(userPath, path.Path(), pathLength);
9098 	if (length < 0)
9099 		return length;
9100 	if (length >= (int)pathLength)
9101 		return B_BUFFER_OVERFLOW;
9102 
9103 	return B_OK;
9104 }
9105 
9106 
9107 status_t
9108 _user_normalize_path(const char* userPath, bool traverseLink, char* buffer)
9109 {
9110 	if (userPath == NULL || buffer == NULL)
9111 		return B_BAD_VALUE;
9112 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(buffer))
9113 		return B_BAD_ADDRESS;
9114 
9115 	// copy path from userland
9116 	KPath pathBuffer;
9117 	if (pathBuffer.InitCheck() != B_OK)
9118 		return B_NO_MEMORY;
9119 	char* path = pathBuffer.LockBuffer();
9120 
9121 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9122 	if (status != B_OK)
9123 		return status;
9124 
9125 	status_t error = normalize_path(path, pathBuffer.BufferSize(), traverseLink,
9126 		false);
9127 	if (error != B_OK)
9128 		return error;
9129 
9130 	// copy back to userland
9131 	int len = user_strlcpy(buffer, path, B_PATH_NAME_LENGTH);
9132 	if (len < 0)
9133 		return len;
9134 	if (len >= B_PATH_NAME_LENGTH)
9135 		return B_BUFFER_OVERFLOW;
9136 
9137 	return B_OK;
9138 }
9139 
9140 
9141 int
9142 _user_open_entry_ref(dev_t device, ino_t inode, const char* userName,
9143 	int openMode, int perms)
9144 {
9145 	char name[B_FILE_NAME_LENGTH];
9146 
9147 	if (userName == NULL || device < 0 || inode < 0)
9148 		return B_BAD_VALUE;
9149 	if (!IS_USER_ADDRESS(userName))
9150 		return B_BAD_ADDRESS;
9151 	status_t status = user_copy_name(name, userName, sizeof(name));
9152 	if (status != B_OK)
9153 		return status;
9154 
9155 	if ((openMode & O_CREAT) != 0) {
9156 		return file_create_entry_ref(device, inode, name, openMode, perms,
9157 			false);
9158 	}
9159 
9160 	return file_open_entry_ref(device, inode, name, openMode, false);
9161 }
9162 
9163 
9164 int
9165 _user_open(int fd, const char* userPath, int openMode, int perms)
9166 {
9167 	KPath path;
9168 	if (path.InitCheck() != B_OK)
9169 		return B_NO_MEMORY;
9170 
9171 	char* buffer = path.LockBuffer();
9172 
9173 	if (!IS_USER_ADDRESS(userPath))
9174 		return B_BAD_ADDRESS;
9175 	status_t status = user_copy_name(buffer, userPath, B_PATH_NAME_LENGTH);
9176 	if (status != B_OK)
9177 		return status;
9178 
9179 	if ((openMode & O_CREAT) != 0)
9180 		return file_create(fd, buffer, openMode, perms, false);
9181 
9182 	return file_open(fd, buffer, openMode, false);
9183 }
9184 
9185 
9186 int
9187 _user_open_dir_entry_ref(dev_t device, ino_t inode, const char* userName)
9188 {
9189 	if (userName != NULL) {
9190 		char name[B_FILE_NAME_LENGTH];
9191 
9192 		if (!IS_USER_ADDRESS(userName))
9193 			return B_BAD_ADDRESS;
9194 		status_t status = user_copy_name(name, userName, sizeof(name));
9195 		if (status != B_OK)
9196 			return status;
9197 
9198 		return dir_open_entry_ref(device, inode, name, false);
9199 	}
9200 	return dir_open_entry_ref(device, inode, NULL, false);
9201 }
9202 
9203 
9204 int
9205 _user_open_dir(int fd, const char* userPath)
9206 {
9207 	if (userPath == NULL)
9208 		return dir_open(fd, NULL, false);
9209 
9210 	KPath path;
9211 	if (path.InitCheck() != B_OK)
9212 		return B_NO_MEMORY;
9213 
9214 	char* buffer = path.LockBuffer();
9215 
9216 	if (!IS_USER_ADDRESS(userPath))
9217 		return B_BAD_ADDRESS;
9218 	status_t status = user_copy_name(buffer, userPath, B_PATH_NAME_LENGTH);
9219 	if (status != B_OK)
9220 		return status;
9221 
9222 	return dir_open(fd, buffer, false);
9223 }
9224 
9225 
9226 /*!	\brief Opens a directory's parent directory and returns the entry name
9227 		   of the former.
9228 
9229 	Aside from returning the directory's entry name, this method is
9230 	equivalent to \code _user_open_dir(fd, "..") \endcode. It really is
9231 	equivalent if \a userName is \c NULL.
9232 
9233 	If a name buffer is supplied and the name does not fit the buffer, the
9234 	function fails. A buffer of size \c B_FILE_NAME_LENGTH should be safe.
9235 
9236 	\param fd A FD referring to a directory.
9237 	\param userName Buffer the directory's entry name shall be written into.
9238 		   May be \c NULL.
9239 	\param nameLength Size of the name buffer.
9240 	\return The file descriptor of the opened parent directory, if everything
9241 			went fine, an error code otherwise.
9242 */
9243 int
9244 _user_open_parent_dir(int fd, char* userName, size_t nameLength)
9245 {
9246 	bool kernel = false;
9247 
9248 	if (userName && !IS_USER_ADDRESS(userName))
9249 		return B_BAD_ADDRESS;
9250 
9251 	// open the parent dir
9252 	int parentFD = dir_open(fd, (char*)"..", kernel);
9253 	if (parentFD < 0)
9254 		return parentFD;
9255 	FDCloser fdCloser(parentFD, kernel);
9256 
9257 	if (userName) {
9258 		// get the vnodes
9259 		struct vnode* parentVNode = get_vnode_from_fd(parentFD, kernel);
9260 		struct vnode* dirVNode = get_vnode_from_fd(fd, kernel);
9261 		VNodePutter parentVNodePutter(parentVNode);
9262 		VNodePutter dirVNodePutter(dirVNode);
9263 		if (!parentVNode || !dirVNode)
9264 			return B_FILE_ERROR;
9265 
9266 		// get the vnode name
9267 		char _buffer[offsetof(struct dirent, d_name) + B_FILE_NAME_LENGTH + 1];
9268 		struct dirent* buffer = (struct dirent*)_buffer;
9269 		status_t status = get_vnode_name(dirVNode, parentVNode, buffer,
9270 			sizeof(_buffer), get_current_io_context(false));
9271 		if (status != B_OK)
9272 			return status;
9273 
9274 		// copy the name to the userland buffer
9275 		int len = user_strlcpy(userName, buffer->d_name, nameLength);
9276 		if (len < 0)
9277 			return len;
9278 		if (len >= (int)nameLength)
9279 			return B_BUFFER_OVERFLOW;
9280 	}
9281 
9282 	return fdCloser.Detach();
9283 }
9284 
9285 
9286 status_t
9287 _user_fcntl(int fd, int op, size_t argument)
9288 {
9289 	status_t status = common_fcntl(fd, op, argument, false);
9290 	if (op == F_SETLKW)
9291 		syscall_restart_handle_post(status);
9292 
9293 	return status;
9294 }
9295 
9296 
9297 status_t
9298 _user_fsync(int fd)
9299 {
9300 	return common_sync(fd, false);
9301 }
9302 
9303 
9304 status_t
9305 _user_flock(int fd, int operation)
9306 {
9307 	FUNCTION(("_user_fcntl(fd = %d, op = %d)\n", fd, operation));
9308 
9309 	// Check if the operation is valid
9310 	switch (operation & ~LOCK_NB) {
9311 		case LOCK_UN:
9312 		case LOCK_SH:
9313 		case LOCK_EX:
9314 			break;
9315 
9316 		default:
9317 			return B_BAD_VALUE;
9318 	}
9319 
9320 	struct file_descriptor* descriptor;
9321 	struct vnode* vnode;
9322 	descriptor = get_fd_and_vnode(fd, &vnode, false);
9323 	if (descriptor == NULL)
9324 		return B_FILE_ERROR;
9325 
9326 	if (descriptor->type != FDTYPE_FILE) {
9327 		put_fd(descriptor);
9328 		return B_BAD_VALUE;
9329 	}
9330 
9331 	struct flock flock;
9332 	flock.l_start = 0;
9333 	flock.l_len = OFF_MAX;
9334 	flock.l_whence = 0;
9335 	flock.l_type = (operation & LOCK_SH) != 0 ? F_RDLCK : F_WRLCK;
9336 
9337 	status_t status;
9338 	if ((operation & LOCK_UN) != 0) {
9339 		if (HAS_FS_CALL(vnode, release_lock))
9340 			status = FS_CALL(vnode, release_lock, descriptor->cookie, &flock);
9341 		else
9342 			status = release_advisory_lock(vnode, NULL, descriptor, &flock);
9343 	} else {
9344 		if (HAS_FS_CALL(vnode, acquire_lock)) {
9345 			status = FS_CALL(vnode, acquire_lock, descriptor->cookie, &flock,
9346 				(operation & LOCK_NB) == 0);
9347 		} else {
9348 			status = acquire_advisory_lock(vnode, NULL, descriptor, &flock,
9349 				(operation & LOCK_NB) == 0);
9350 		}
9351 	}
9352 
9353 	syscall_restart_handle_post(status);
9354 
9355 	put_fd(descriptor);
9356 	return status;
9357 }
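
// The mapping above means that flock(fd, LOCK_EX | LOCK_NB) is handled like
// a non-blocking exclusive advisory lock over the entire file (sketch):
//
//	struct flock flock = {};
//	flock.l_type = F_WRLCK;		// LOCK_EX
//	flock.l_start = 0;			// whole file,
//	flock.l_len = OFF_MAX;		// regardless of its size
//	// wait == false, since LOCK_NB suppresses blocking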
9358 
9359 
9360 status_t
9361 _user_lock_node(int fd)
9362 {
9363 	return common_lock_node(fd, false);
9364 }
9365 
9366 
9367 status_t
9368 _user_unlock_node(int fd)
9369 {
9370 	return common_unlock_node(fd, false);
9371 }
9372 
9373 
9374 status_t
9375 _user_preallocate(int fd, off_t offset, off_t length)
9376 {
9377 	return common_preallocate(fd, offset, length, false);
9378 }
9379 
9380 
9381 status_t
9382 _user_create_dir_entry_ref(dev_t device, ino_t inode, const char* userName,
9383 	int perms)
9384 {
9385 	char name[B_FILE_NAME_LENGTH];
9386 	status_t status;
9387 
9388 	if (!IS_USER_ADDRESS(userName))
9389 		return B_BAD_ADDRESS;
9390 
9391 	status = user_copy_name(name, userName, sizeof(name));
9392 	if (status != B_OK)
9393 		return status;
9394 
9395 	return dir_create_entry_ref(device, inode, name, perms, false);
9396 }
9397 
9398 
9399 status_t
9400 _user_create_dir(int fd, const char* userPath, int perms)
9401 {
9402 	KPath pathBuffer;
9403 	if (pathBuffer.InitCheck() != B_OK)
9404 		return B_NO_MEMORY;
9405 
9406 	char* path = pathBuffer.LockBuffer();
9407 
9408 	if (!IS_USER_ADDRESS(userPath))
9409 		return B_BAD_ADDRESS;
9410 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9411 	if (status != B_OK)
9412 		return status;
9413 
9414 	return dir_create(fd, path, perms, false);
9415 }
9416 
9417 
9418 status_t
9419 _user_remove_dir(int fd, const char* userPath)
9420 {
9421 	KPath pathBuffer;
9422 	if (pathBuffer.InitCheck() != B_OK)
9423 		return B_NO_MEMORY;
9424 
9425 	char* path = pathBuffer.LockBuffer();
9426 
9427 	if (userPath != NULL) {
9428 		if (!IS_USER_ADDRESS(userPath))
9429 			return B_BAD_ADDRESS;
9430 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9431 		if (status != B_OK)
9432 			return status;
9433 	}
9434 
9435 	return dir_remove(fd, userPath ? path : NULL, false);
9436 }
9437 
9438 
9439 status_t
9440 _user_read_link(int fd, const char* userPath, char* userBuffer,
9441 	size_t* userBufferSize)
9442 {
9443 	KPath pathBuffer, linkBuffer;
9444 	if (pathBuffer.InitCheck() != B_OK || linkBuffer.InitCheck() != B_OK)
9445 		return B_NO_MEMORY;
9446 
9447 	size_t bufferSize;
9448 
9449 	if (!IS_USER_ADDRESS(userBuffer) || !IS_USER_ADDRESS(userBufferSize)
9450 		|| user_memcpy(&bufferSize, userBufferSize, sizeof(size_t)) != B_OK)
9451 		return B_BAD_ADDRESS;
9452 
9453 	char* path = pathBuffer.LockBuffer();
9454 	char* buffer = linkBuffer.LockBuffer();
9455 
9456 	if (userPath) {
9457 		if (!IS_USER_ADDRESS(userPath))
9458 			return B_BAD_ADDRESS;
9459 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9460 		if (status != B_OK)
9461 			return status;
9462 
9463 		if (bufferSize > B_PATH_NAME_LENGTH)
9464 			bufferSize = B_PATH_NAME_LENGTH;
9465 	}
9466 
9467 	size_t newBufferSize = bufferSize;
9468 	status_t status = common_read_link(fd, userPath ? path : NULL, buffer,
9469 		&newBufferSize, false);
9470 
9471 	// we also update the bufferSize in case of errors
9472 	// (the real length will be returned in case of B_BUFFER_OVERFLOW)
9473 	if (user_memcpy(userBufferSize, &newBufferSize, sizeof(size_t)) != B_OK)
9474 		return B_BAD_ADDRESS;
9475 
9476 	if (status != B_OK)
9477 		return status;
9478 
9479 	bufferSize = min_c(newBufferSize, bufferSize);
9480 	if (user_memcpy(userBuffer, buffer, bufferSize) != B_OK)
9481 		return B_BAD_ADDRESS;
9482 
9483 	return B_OK;
9484 }
9485 
9486 
9487 status_t
9488 _user_create_symlink(int fd, const char* userPath, const char* userToPath,
9489 	int mode)
9490 {
9491 	KPath pathBuffer;
9492 	KPath toPathBuffer;
9493 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9494 		return B_NO_MEMORY;
9495 
9496 	char* path = pathBuffer.LockBuffer();
9497 	char* toPath = toPathBuffer.LockBuffer();
9498 
9499 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(userToPath))
9500 		return B_BAD_ADDRESS;
9501 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9502 	if (status != B_OK)
9503 		return status;
9504 	status = user_copy_name(toPath, userToPath, B_PATH_NAME_LENGTH);
9505 	if (status != B_OK)
9506 		return status;
9507 
9508 	return common_create_symlink(fd, path, toPath, mode, false);
9509 }
9510 
9511 
9512 status_t
9513 _user_create_link(int pathFD, const char* userPath, int toFD,
9514 	const char* userToPath, bool traverseLeafLink)
9515 {
9516 	KPath pathBuffer;
9517 	KPath toPathBuffer;
9518 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9519 		return B_NO_MEMORY;
9520 
9521 	char* path = pathBuffer.LockBuffer();
9522 	char* toPath = toPathBuffer.LockBuffer();
9523 
9524 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(userToPath))
9525 		return B_BAD_ADDRESS;
9526 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9527 	if (status != B_OK)
9528 		return status;
9529 	status = user_copy_name(toPath, userToPath, B_PATH_NAME_LENGTH);
9530 	if (status != B_OK)
9531 		return status;
9532 
9533 	status = check_path(toPath);
9534 	if (status != B_OK)
9535 		return status;
9536 
9537 	return common_create_link(pathFD, path, toFD, toPath, traverseLeafLink,
9538 		false);
9539 }
9540 
9541 
9542 status_t
9543 _user_unlink(int fd, const char* userPath)
9544 {
9545 	KPath pathBuffer;
9546 	if (pathBuffer.InitCheck() != B_OK)
9547 		return B_NO_MEMORY;
9548 
9549 	char* path = pathBuffer.LockBuffer();
9550 
9551 	if (!IS_USER_ADDRESS(userPath))
9552 		return B_BAD_ADDRESS;
9553 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9554 	if (status != B_OK)
9555 		return status;
9556 
9557 	return common_unlink(fd, path, false);
9558 }
9559 
9560 
9561 status_t
9562 _user_rename(int oldFD, const char* userOldPath, int newFD,
9563 	const char* userNewPath)
9564 {
9565 	KPath oldPathBuffer;
9566 	KPath newPathBuffer;
9567 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
9568 		return B_NO_MEMORY;
9569 
9570 	char* oldPath = oldPathBuffer.LockBuffer();
9571 	char* newPath = newPathBuffer.LockBuffer();
9572 
9573 	if (!IS_USER_ADDRESS(userOldPath) || !IS_USER_ADDRESS(userNewPath))
9574 		return B_BAD_ADDRESS;
9575 	status_t status = user_copy_name(oldPath, userOldPath, B_PATH_NAME_LENGTH);
9576 	if (status != B_OK)
9577 		return status;
9578 	status = user_copy_name(newPath, userNewPath, B_PATH_NAME_LENGTH);
9579 	if (status != B_OK)
9580 		return status;
9581 
9582 	return common_rename(oldFD, oldPath, newFD, newPath, false);
9583 }
9584 
9585 
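/*!	\brief Creates a FIFO (named pipe) at the location given by \a fd and
	\a userPath.
	The path is split into parent directory and leaf name; the parent's
	file system must implement the create_special_node() hook, otherwise
	\c B_UNSUPPORTED is returned.
*/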
9586 status_t
9587 _user_create_fifo(int fd, const char* userPath, mode_t perms)
9588 {
9589 	KPath pathBuffer;
9590 	if (pathBuffer.InitCheck() != B_OK)
9591 		return B_NO_MEMORY;
9592 
9593 	char* path = pathBuffer.LockBuffer();
9594 
9595 	if (!IS_USER_ADDRESS(userPath))
9596 		return B_BAD_ADDRESS;
9597 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9598 	if (status != B_OK)
9599 		return status;
9600 
9601 	// split into directory vnode and filename path
9602 	char filename[B_FILE_NAME_LENGTH];
9603 	struct vnode* dir;
9604 	status = fd_and_path_to_dir_vnode(fd, path, &dir, filename, false);
9605 	if (status != B_OK)
9606 		return status;
9607 
9608 	VNodePutter _(dir);
9609 
9610 	// the underlying FS needs to support creating FIFOs
9611 	if (!HAS_FS_CALL(dir, create_special_node))
9612 		return B_UNSUPPORTED;
9613 
9614 	// create the entry -- the FIFO sub node is set up automatically
9615 	fs_vnode superVnode;
9616 	ino_t nodeID;
9617 	status = FS_CALL(dir, create_special_node, filename, NULL,
9618 		S_IFIFO | (perms & S_IUMSK), 0, &superVnode, &nodeID);
9619 
9620 	// create_special_node() acquired a reference for us that we don't need.
9621 	if (status == B_OK)
9622 		put_vnode(dir->mount->volume, nodeID);
9623 
9624 	return status;
9625 }
9626 
9627 
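/*!	\brief Creates an anonymous pipe and opens both of its ends.
	On success two FDs are copied to \a userFDs: index 0 is open for
	reading, index 1 for writing. A minimal userland sketch (assuming the
	usual _kern_create_pipe() stub name for this syscall):
	\code
		int fds[2];
		if (_kern_create_pipe(fds) == B_OK) {
			// fds[0] is the read end, fds[1] the write end
		}
	\endcode
*/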
9628 status_t
9629 _user_create_pipe(int* userFDs)
9630 {
9631 	// rootfs should support creating FIFOs, but let's be sure
9632 	if (!HAS_FS_CALL(sRoot, create_special_node))
9633 		return B_UNSUPPORTED;
9634 
9635 	// create the node -- the FIFO sub node is set up automatically
9636 	fs_vnode superVnode;
9637 	ino_t nodeID;
9638 	status_t status = FS_CALL(sRoot, create_special_node, NULL, NULL,
9639 		S_IFIFO | S_IRUSR | S_IWUSR, 0, &superVnode, &nodeID);
9640 	if (status != B_OK)
9641 		return status;
9642 
9643 	// We've got one reference to the node and need another one.
9644 	struct vnode* vnode;
9645 	status = get_vnode(sRoot->mount->id, nodeID, &vnode, true, false);
9646 	if (status != B_OK) {
9647 		// that should not happen
9648 		dprintf("_user_create_pipe(): Failed to lookup vnode (%" B_PRIdDEV ", "
9649 			"%" B_PRIdINO ")\n", sRoot->mount->id, nodeID);
9650 		return status;
9651 	}
9652 
9653 	// Everything looks good so far. Open two FDs, one for reading and one
9654 	// for writing.
9655 	int fds[2];
9656 	fds[0] = open_vnode(vnode, O_RDONLY, false);
9657 	fds[1] = open_vnode(vnode, O_WRONLY, false);
9658 
9659 	FDCloser closer0(fds[0], false);
9660 	FDCloser closer1(fds[1], false);
9661 
9662 	status = (fds[0] >= 0 ? (fds[1] >= 0 ? B_OK : fds[1]) : fds[0]);
9663 
9664 	// copy FDs to userland
9665 	if (status == B_OK) {
9666 		if (!IS_USER_ADDRESS(userFDs)
9667 			|| user_memcpy(userFDs, fds, sizeof(fds)) != B_OK) {
9668 			status = B_BAD_ADDRESS;
9669 		}
9670 	}
9671 
9672 	// keep the FDs if everything went fine
9673 	if (status == B_OK) {
9674 		closer0.Detach();
9675 		closer1.Detach();
9676 	}
9677 
9678 	return status;
9679 }
9680 
9681 
9682 status_t
9683 _user_access(int fd, const char* userPath, int mode, bool effectiveUserGroup)
9684 {
9685 	KPath pathBuffer;
9686 	if (pathBuffer.InitCheck() != B_OK)
9687 		return B_NO_MEMORY;
9688 
9689 	char* path = pathBuffer.LockBuffer();
9690 
9691 	if (!IS_USER_ADDRESS(userPath))
9692 		return B_BAD_ADDRESS;
9693 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9694 	if (status != B_OK)
9695 		return status;
9696 
9697 	return common_access(fd, path, mode, effectiveUserGroup, false);
9698 }
9699 
9700 
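/*!	\brief Syscall entry behind the stat() family of functions.
	If \a userPath is given, the stat of the node referred to by
	(\a fd, path) is read; otherwise the FD's own fd_read_stat() hook is
	used. \a statSize lets callers compiled against a smaller struct stat
	receive only the prefix they know about; larger sizes are rejected
	with \c B_BAD_VALUE.
*/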
9701 status_t
9702 _user_read_stat(int fd, const char* userPath, bool traverseLink,
9703 	struct stat* userStat, size_t statSize)
9704 {
9705 	struct stat stat = {0};
9706 	status_t status;
9707 
9708 	if (statSize > sizeof(struct stat))
9709 		return B_BAD_VALUE;
9710 
9711 	if (!IS_USER_ADDRESS(userStat))
9712 		return B_BAD_ADDRESS;
9713 
9714 	if (userPath != NULL) {
9715 		// path given: get the stat of the node referred to by (fd, path)
9716 		if (!IS_USER_ADDRESS(userPath))
9717 			return B_BAD_ADDRESS;
9718 
9719 		KPath pathBuffer;
9720 		if (pathBuffer.InitCheck() != B_OK)
9721 			return B_NO_MEMORY;
9722 
9723 		char* path = pathBuffer.LockBuffer();
9724 
9725 		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9726 		if (status != B_OK)
9727 			return status;
9728 
9729 		status = common_path_read_stat(fd, path, traverseLink, &stat, false);
9730 	} else {
9731 		// no path given: get the FD and use the FD operation
9732 		struct file_descriptor* descriptor
9733 			= get_fd(get_current_io_context(false), fd);
9734 		if (descriptor == NULL)
9735 			return B_FILE_ERROR;
9736 
9737 		if (descriptor->ops->fd_read_stat)
9738 			status = descriptor->ops->fd_read_stat(descriptor, &stat);
9739 		else
9740 			status = B_UNSUPPORTED;
9741 
9742 		put_fd(descriptor);
9743 	}
9744 
9745 	if (status != B_OK)
9746 		return status;
9747 
9748 	return user_memcpy(userStat, &stat, statSize);
9749 }
9750 
9751 
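/*!	\brief Syscall entry behind the stat-writing functions (chmod(),
	chown(), and friends).
	Only the fields selected by \a statMask are written. As with
	_user_read_stat(), \a statSize supports callers using a smaller
	struct stat; the kernel copy's remaining bytes are zero-filled.
*/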
9752 status_t
9753 _user_write_stat(int fd, const char* userPath, bool traverseLeafLink,
9754 	const struct stat* userStat, size_t statSize, int statMask)
9755 {
9756 	if (statSize > sizeof(struct stat))
9757 		return B_BAD_VALUE;
9758 
9759 	struct stat stat;
9760 
9761 	if (!IS_USER_ADDRESS(userStat)
9762 		|| user_memcpy(&stat, userStat, statSize) < B_OK)
9763 		return B_BAD_ADDRESS;
9764 
9765 	// clear additional stat fields
9766 	if (statSize < sizeof(struct stat))
9767 		memset((uint8*)&stat + statSize, 0, sizeof(struct stat) - statSize);
9768 
9769 	status_t status;
9770 
9771 	if (userPath != NULL) {
9772 		// path given: write the stat of the node referred to by (fd, path)
9773 		if (!IS_USER_ADDRESS(userPath))
9774 			return B_BAD_ADDRESS;
9775 
9776 		KPath pathBuffer;
9777 		if (pathBuffer.InitCheck() != B_OK)
9778 			return B_NO_MEMORY;
9779 
9780 		char* path = pathBuffer.LockBuffer();
9781 
9782 		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9783 		if (status != B_OK)
9784 			return status;
9785 
9786 		status = common_path_write_stat(fd, path, traverseLeafLink, &stat,
9787 			statMask, false);
9788 	} else {
9789 		// no path given: get the FD and use the FD operation
9790 		struct file_descriptor* descriptor
9791 			= get_fd(get_current_io_context(false), fd);
9792 		if (descriptor == NULL)
9793 			return B_FILE_ERROR;
9794 
9795 		if (descriptor->ops->fd_write_stat) {
9796 			status = descriptor->ops->fd_write_stat(descriptor, &stat,
9797 				statMask);
9798 		} else
9799 			status = B_UNSUPPORTED;
9800 
9801 		put_fd(descriptor);
9802 	}
9803 
9804 	return status;
9805 }
9806 
9807 
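/*!	\brief Opens the attribute directory of a node.
	If \a userPath is \c NULL, the node referred to by \a fd itself is
	used.
*/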
9808 int
9809 _user_open_attr_dir(int fd, const char* userPath, bool traverseLeafLink)
9810 {
9811 	KPath pathBuffer;
9812 	if (pathBuffer.InitCheck() != B_OK)
9813 		return B_NO_MEMORY;
9814 
9815 	char* path = pathBuffer.LockBuffer();
9816 
9817 	if (userPath != NULL) {
9818 		if (!IS_USER_ADDRESS(userPath))
9819 			return B_BAD_ADDRESS;
9820 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9821 		if (status != B_OK)
9822 			return status;
9823 	}
9824 
9825 	return attr_dir_open(fd, userPath ? path : NULL, traverseLeafLink, false);
9826 }
9827 
9828 
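/*!	\brief Reads up to \a readBytes bytes from attribute \a userAttribute
	of the node given by \a fd, starting at offset \a pos.
	Implemented by opening the attribute read-only and reading through the
	regular FD path; the attribute FD is closed again before returning.
*/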
9829 ssize_t
9830 _user_read_attr(int fd, const char* userAttribute, off_t pos, void* userBuffer,
9831 	size_t readBytes)
9832 {
9833 	char attribute[B_FILE_NAME_LENGTH];
9834 
9835 	if (userAttribute == NULL)
9836 		return B_BAD_VALUE;
9837 	if (!IS_USER_ADDRESS(userAttribute))
9838 		return B_BAD_ADDRESS;
9839 	status_t status = user_copy_name(attribute, userAttribute, sizeof(attribute));
9840 	if (status != B_OK)
9841 		return status;
9842 
9843 	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9844 	if (attr < 0)
9845 		return attr;
9846 
9847 	ssize_t bytes = _user_read(attr, pos, userBuffer, readBytes);
9848 	_user_close(attr);
9849 
9850 	return bytes;
9851 }
9852 
9853 
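/*!	\brief Writes \a writeBytes bytes to attribute \a userAttribute of the
	node given by \a fd, creating the attribute if necessary.
	Matching BeOS semantics, the attribute is truncated on open, but only
	if writing starts at position 0.
*/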
9854 ssize_t
9855 _user_write_attr(int fd, const char* userAttribute, uint32 type, off_t pos,
9856 	const void* buffer, size_t writeBytes)
9857 {
9858 	char attribute[B_FILE_NAME_LENGTH];
9859 
9860 	if (userAttribute == NULL)
9861 		return B_BAD_VALUE;
9862 	if (!IS_USER_ADDRESS(userAttribute))
9863 		return B_BAD_ADDRESS;
9864 	status_t status = user_copy_name(attribute, userAttribute, sizeof(attribute));
9865 	if (status != B_OK)
9866 		return status;
9867 
9868 	// Support the BeOS typical truncation semantics as well as the position
9869 	// argument: the attribute is only truncated when writing starts at 0.
9870 	int attr = attr_create(fd, NULL, attribute, type,
9871 		O_CREAT | O_WRONLY | (pos != 0 ? 0 : O_TRUNC), false);
9872 	if (attr < 0)
9873 		return attr;
9874 
9875 	ssize_t bytes = _user_write(attr, pos, buffer, writeBytes);
9876 	_user_close(attr);
9877 
9878 	return bytes;
9879 }
9880 
9881 
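/*!	\brief Fills in \a userAttrInfo with the type and size of attribute
	\a userAttribute of the node given by \a fd.
	Implemented by opening the attribute and querying its stat via the
	descriptor's fd_read_stat() hook.
*/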
9882 status_t
9883 _user_stat_attr(int fd, const char* userAttribute,
9884 	struct attr_info* userAttrInfo)
9885 {
9886 	char attribute[B_FILE_NAME_LENGTH];
9887 
9888 	if (userAttribute == NULL || userAttrInfo == NULL)
9889 		return B_BAD_VALUE;
9890 	if (!IS_USER_ADDRESS(userAttribute) || !IS_USER_ADDRESS(userAttrInfo))
9891 		return B_BAD_ADDRESS;
9892 	status_t status = user_copy_name(attribute, userAttribute,
9893 		sizeof(attribute));
9894 	if (status != B_OK)
9895 		return status;
9896 
9897 	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9898 	if (attr < 0)
9899 		return attr;
9900 
9901 	struct file_descriptor* descriptor
9902 		= get_fd(get_current_io_context(false), attr);
9903 	if (descriptor == NULL) {
9904 		_user_close(attr);
9905 		return B_FILE_ERROR;
9906 	}
9907 
9908 	struct stat stat;
9909 	if (descriptor->ops->fd_read_stat)
9910 		status = descriptor->ops->fd_read_stat(descriptor, &stat);
9911 	else
9912 		status = B_UNSUPPORTED;
9913 
9914 	put_fd(descriptor);
9915 	_user_close(attr);
9916 
9917 	if (status == B_OK) {
9918 		attr_info info;
9919 		info.type = stat.st_type;
9920 		info.size = stat.st_size;
9921 
9922 		if (user_memcpy(userAttrInfo, &info, sizeof(struct attr_info)) != B_OK)
9923 			return B_BAD_ADDRESS;
9924 	}
9925 
9926 	return status;
9927 }
9928 
9929 
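/*!	\brief Opens -- or, if \a openMode contains \c O_CREAT, creates --
	attribute \a userName of the node specified by \a fd and \a userPath.
	If \a userPath is \c NULL, the node referred to by \a fd itself is
	used. \a type only matters when the attribute is created.
*/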
9930 int
9931 _user_open_attr(int fd, const char* userPath, const char* userName,
9932 	uint32 type, int openMode)
9933 {
9934 	char name[B_FILE_NAME_LENGTH];
9935 
9936 	if (!IS_USER_ADDRESS(userName))
9937 		return B_BAD_ADDRESS;
9938 	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9939 	if (status != B_OK)
9940 		return status;
9941 
9942 	KPath pathBuffer;
9943 	if (pathBuffer.InitCheck() != B_OK)
9944 		return B_NO_MEMORY;
9945 
9946 	char* path = pathBuffer.LockBuffer();
9947 
9948 	if (userPath != NULL) {
9949 		if (!IS_USER_ADDRESS(userPath))
9950 			return B_BAD_ADDRESS;
9951 		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9952 		if (status != B_OK)
9953 			return status;
9954 	}
9955 
9956 	if ((openMode & O_CREAT) != 0) {
9957 		return attr_create(fd, userPath ? path : NULL, name, type, openMode,
9958 			false);
9959 	}
9960 
9961 	return attr_open(fd, userPath ? path : NULL, name, openMode, false);
9962 }
9963 
9964 
9965 status_t
9966 _user_remove_attr(int fd, const char* userName)
9967 {
9968 	char name[B_FILE_NAME_LENGTH];
9969 
9970 	if (!IS_USER_ADDRESS(userName))
9971 		return B_BAD_ADDRESS;
9972 	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9973 	if (status != B_OK)
9974 		return status;
9975 
9976 	return attr_remove(fd, name, false);
9977 }
9978 
9979 
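/*!	\brief Renames attribute \a userFromName of the node given by
	\a fromFile to \a userToName on the node given by \a toFile.
*/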
9980 status_t
9981 _user_rename_attr(int fromFile, const char* userFromName, int toFile,
9982 	const char* userToName)
9983 {
9984 	if (!IS_USER_ADDRESS(userFromName)
9985 		|| !IS_USER_ADDRESS(userToName))
9986 		return B_BAD_ADDRESS;
9987 
9988 	KPath fromNameBuffer(B_FILE_NAME_LENGTH);
9989 	KPath toNameBuffer(B_FILE_NAME_LENGTH);
9990 	if (fromNameBuffer.InitCheck() != B_OK || toNameBuffer.InitCheck() != B_OK)
9991 		return B_NO_MEMORY;
9992 
9993 	char* fromName = fromNameBuffer.LockBuffer();
9994 	char* toName = toNameBuffer.LockBuffer();
9995 
9996 	status_t status = user_copy_name(fromName, userFromName, B_FILE_NAME_LENGTH);
9997 	if (status != B_OK)
9998 		return status;
9999 	status = user_copy_name(toName, userToName, B_FILE_NAME_LENGTH);
10000 	if (status != B_OK)
10001 		return status;
10002 
10003 	return attr_rename(fromFile, fromName, toFile, toName, false);
10004 }
10005 
10006 
10007 int
10008 _user_open_index_dir(dev_t device)
10009 {
10010 	return index_dir_open(device, false);
10011 }
10012 
10013 
10014 status_t
10015 _user_create_index(dev_t device, const char* userName, uint32 type,
10016 	uint32 flags)
10017 {
10018 	char name[B_FILE_NAME_LENGTH];
10019 
10020 	if (!IS_USER_ADDRESS(userName))
10021 		return B_BAD_ADDRESS;
10022 	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
10023 	if (status != B_OK)
10024 		return status;
10025 
10026 	return index_create(device, name, type, flags, false);
10027 }
10028 
10029 
10030 status_t
10031 _user_read_index_stat(dev_t device, const char* userName, struct stat* userStat)
10032 {
10033 	char name[B_FILE_NAME_LENGTH];
10034 	struct stat stat = {0};
10035 	status_t status;
10036 
10037 	if (!IS_USER_ADDRESS(userName) || !IS_USER_ADDRESS(userStat))
10038 		return B_BAD_ADDRESS;
10039 	status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
10040 	if (status != B_OK)
10041 		return status;
10042 
10043 	status = index_name_read_stat(device, name, &stat, false);
10044 	if (status == B_OK) {
10045 		if (user_memcpy(userStat, &stat, sizeof(stat)) != B_OK)
10046 			return B_BAD_ADDRESS;
10047 	}
10048 
10049 	return status;
10050 }
10051 
10052 
10053 status_t
10054 _user_remove_index(dev_t device, const char* userName)
10055 {
10056 	char name[B_FILE_NAME_LENGTH];
10057 
10058 	if (!IS_USER_ADDRESS(userName))
10059 		return B_BAD_ADDRESS;
10060 	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
10061 	if (status != B_OK)
10062 		return status;
10063 
10064 	return index_remove(device, name, false);
10065 }
10066 
10067 
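/*!	\brief Copies the current working directory's path to \a userBuffer.
	At most \a size bytes (capped to kMaxPathLength) are copied, including
	the terminating null character.
*/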
10068 status_t
10069 _user_getcwd(char* userBuffer, size_t size)
10070 {
10071 	if (size == 0)
10072 		return B_BAD_VALUE;
10073 	if (!IS_USER_ADDRESS(userBuffer))
10074 		return B_BAD_ADDRESS;
10075 
10076 	if (size > kMaxPathLength)
10077 		size = kMaxPathLength;
10078 
10079 	KPath pathBuffer(size);
10080 	if (pathBuffer.InitCheck() != B_OK)
10081 		return B_NO_MEMORY;
10082 
10083 	TRACE(("user_getcwd: buf %p, %" B_PRIuSIZE "\n", userBuffer, size));
10084 
10085 	char* path = pathBuffer.LockBuffer();
10086 
10087 	status_t status = get_cwd(path, size, false);
10088 	if (status != B_OK)
10089 		return status;
10090 
10091 	// Copy back the result
10092 	if (user_strlcpy(userBuffer, path, size) < B_OK)
10093 		return B_BAD_ADDRESS;
10094 
10095 	return status;
10096 }
10097 
10098 
10099 status_t
10100 _user_setcwd(int fd, const char* userPath)
10101 {
10102 	TRACE(("user_setcwd: path = %p\n", userPath));
10103 
10104 	KPath pathBuffer;
10105 	if (pathBuffer.InitCheck() != B_OK)
10106 		return B_NO_MEMORY;
10107 
10108 	char* path = pathBuffer.LockBuffer();
10109 
10110 	if (userPath != NULL) {
10111 		if (!IS_USER_ADDRESS(userPath))
10112 			return B_BAD_ADDRESS;
10113 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
10114 		if (status != B_OK)
10115 			return status;
10116 	}
10117 
10118 	return set_cwd(fd, userPath != NULL ? path : NULL, false);
10119 }
10120 
10121 
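/*!	\brief Syscall entry behind chroot(); only the root user may call it.
	The new root is resolved with symlink traversal and swapped into the
	I/O context under sIOContextRootLock; the reference to the previous
	root vnode is released afterwards.
*/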
10122 status_t
10123 _user_change_root(const char* userPath)
10124 {
10125 	// only root is allowed to chroot()
10126 	if (geteuid() != 0)
10127 		return B_NOT_ALLOWED;
10128 
10129 	// alloc path buffer
10130 	KPath pathBuffer;
10131 	if (pathBuffer.InitCheck() != B_OK)
10132 		return B_NO_MEMORY;
10133 
10134 	// copy userland path to kernel
10135 	char* path = pathBuffer.LockBuffer();
10136 	if (userPath != NULL) {
10137 		if (!IS_USER_ADDRESS(userPath))
10138 			return B_BAD_ADDRESS;
10139 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
10140 		if (status != B_OK)
10141 			return status;
10142 	}
10143 
10144 	// get the vnode
10145 	struct vnode* vnode;
10146 	status_t status = path_to_vnode(path, true, &vnode, NULL, false);
10147 	if (status != B_OK)
10148 		return status;
10149 
10150 	// set the new root
10151 	struct io_context* context = get_current_io_context(false);
10152 	mutex_lock(&sIOContextRootLock);
10153 	struct vnode* oldRoot = context->root;
10154 	context->root = vnode;
10155 	mutex_unlock(&sIOContextRootLock);
10156 
10157 	put_vnode(oldRoot);
10158 
10159 	return B_OK;
10160 }
10161 
10162 
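/*!	\brief Opens a file system query on \a device.
	For live queries, \a port and \a token identify where entry update
	notifications should be sent. Query strings are limited to 64 KiB as a
	safety restriction.
*/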
10163 int
10164 _user_open_query(dev_t device, const char* userQuery, size_t queryLength,
10165 	uint32 flags, port_id port, int32 token)
10166 {
10167 	if (device < 0 || userQuery == NULL || queryLength == 0)
10168 		return B_BAD_VALUE;
10169 
10170 	if (!IS_USER_ADDRESS(userQuery))
10171 		return B_BAD_ADDRESS;
10172 
10173 	// this is a safety restriction
10174 	if (queryLength >= 65536)
10175 		return B_NAME_TOO_LONG;
10176 
10177 	BStackOrHeapArray<char, 128> query(queryLength + 1);
10178 	if (!query.IsValid())
10179 		return B_NO_MEMORY;
10180 
10181 	if (user_strlcpy(query, userQuery, queryLength + 1) < B_OK)
10182 		return B_BAD_ADDRESS;
10183 
10184 	return query_open(device, query, flags, port, token, false);
10185 }
10186 
10187 
10188 #include "vfs_request_io.cpp"
10189