xref: /haiku/src/system/kernel/fs/vfs.cpp (revision df3ac004ba00d875be84ec7853864b739a2292bf)
1 /*
2  * Copyright 2005-2013, Ingo Weinhold, ingo_weinhold@gmx.de.
3  * Copyright 2002-2017, Axel Dörfler, axeld@pinc-software.de.
4  * Distributed under the terms of the MIT License.
5  *
6  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7  * Distributed under the terms of the NewOS License.
8  */
9 
10 
11 /*! Virtual File System and File System Interface Layer */
12 
13 
14 #include <ctype.h>
15 #include <fcntl.h>
16 #include <limits.h>
17 #include <stddef.h>
18 #include <stdio.h>
19 #include <string.h>
20 #include <sys/file.h>
21 #include <sys/resource.h>
22 #include <sys/stat.h>
23 #include <unistd.h>
24 
25 #include <fs_attr.h>
26 #include <fs_info.h>
27 #include <fs_interface.h>
28 #include <fs_volume.h>
29 #include <OS.h>
30 #include <StorageDefs.h>
31 
32 #include <AutoDeleter.h>
33 #include <block_cache.h>
34 #include <boot/kernel_args.h>
35 #include <debug_heap.h>
36 #include <disk_device_manager/KDiskDevice.h>
37 #include <disk_device_manager/KDiskDeviceManager.h>
38 #include <disk_device_manager/KDiskDeviceUtils.h>
39 #include <disk_device_manager/KDiskSystem.h>
40 #include <fd.h>
41 #include <file_cache.h>
42 #include <fs/node_monitor.h>
43 #include <KPath.h>
44 #include <lock.h>
45 #include <low_resource_manager.h>
46 #include <syscalls.h>
47 #include <syscall_restart.h>
48 #include <tracing.h>
49 #include <util/atomic.h>
50 #include <util/AutoLock.h>
51 #include <util/DoublyLinkedList.h>
52 #include <vfs.h>
53 #include <vm/vm.h>
54 #include <vm/VMCache.h>
55 #include <wait_for_objects.h>
56 
57 #include "EntryCache.h"
58 #include "fifo.h"
59 #include "IORequest.h"
60 #include "unused_vnodes.h"
61 #include "vfs_tracing.h"
62 #include "Vnode.h"
63 #include "../cache/vnode_store.h"
64 
65 
66 //#define TRACE_VFS
67 #ifdef TRACE_VFS
68 #	define TRACE(x) dprintf x
69 #	define FUNCTION(x) dprintf x
70 #else
71 #	define TRACE(x) ;
72 #	define FUNCTION(x) ;
73 #endif
74 
75 #define ADD_DEBUGGER_COMMANDS
76 
77 
78 #define HAS_FS_CALL(vnode, op)			(vnode->ops->op != NULL)
79 #define HAS_FS_MOUNT_CALL(mount, op)	(mount->volume->ops->op != NULL)
80 
81 #if KDEBUG
82 #	define FS_CALL(vnode, op, params...) \
83 		( HAS_FS_CALL(vnode, op) ? \
84 			vnode->ops->op(vnode->mount->volume, vnode, params) \
85 			: (panic("FS_CALL op " #op " is NULL"), 0))
86 #	define FS_CALL_NO_PARAMS(vnode, op) \
87 		( HAS_FS_CALL(vnode, op) ? \
88 			vnode->ops->op(vnode->mount->volume, vnode) \
89 			: (panic("FS_CALL_NO_PARAMS op " #op " is NULL"), 0))
90 #	define FS_MOUNT_CALL(mount, op, params...) \
91 		( HAS_FS_MOUNT_CALL(mount, op) ? \
92 			mount->volume->ops->op(mount->volume, params) \
93 			: (panic("FS_MOUNT_CALL op " #op " is NULL"), 0))
94 #	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
95 		( HAS_FS_MOUNT_CALL(mount, op) ? \
96 			mount->volume->ops->op(mount->volume) \
97 			: (panic("FS_MOUNT_CALL_NO_PARAMS op " #op " is NULL"), 0))
98 #else
99 #	define FS_CALL(vnode, op, params...) \
100 			vnode->ops->op(vnode->mount->volume, vnode, params)
101 #	define FS_CALL_NO_PARAMS(vnode, op) \
102 			vnode->ops->op(vnode->mount->volume, vnode)
103 #	define FS_MOUNT_CALL(mount, op, params...) \
104 			mount->volume->ops->op(mount->volume, params)
105 #	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
106 			mount->volume->ops->op(mount->volume)
107 #endif
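
// Illustrative sketch (not from the original source): a stat request on a
// vnode dispatches into the owning file system like this:
//
//	if (HAS_FS_CALL(vnode, read_stat))
//		status = FS_CALL(vnode, read_stat, &stat);
//	else
//		status = B_UNSUPPORTED;
//
// In non-KDEBUG builds FS_CALL(vnode, read_stat, &stat) expands to
// vnode->ops->read_stat(vnode->mount->volume, vnode, &stat); with KDEBUG,
// a NULL hook panics instead of silently crashing.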
108 
109 
110 const static size_t kMaxPathLength = 65536;
	// The absolute maximum path length (for getcwd()); this does not
	// depend on PATH_MAX
113 
114 
115 typedef DoublyLinkedList<vnode> VnodeList;
116 
/*!	\brief Structure to manage a mounted file system

	Note: The root_vnode and root_vnode->covers fields (what others?) are
	initialized in fs_mount() and not changed afterwards. That is, as soon
	as the mount is mounted and it is ensured that it won't be unmounted
	(e.g. by holding a reference to a vnode of that mount), (read) access
	to those fields is always safe, even without additional locking. Moreover,
	while mounted, the mount holds a reference to the root_vnode->covers vnode,
	thus making the access path vnode->mount->root_vnode->covers->mount->...
	safe if a reference to vnode is held (note that for the root mount
	root_vnode->covers is NULL, though).
*/
129 struct fs_mount {
130 	fs_mount()
131 		:
132 		volume(NULL),
133 		device_name(NULL)
134 	{
135 		recursive_lock_init(&rlock, "mount rlock");
136 	}
137 
138 	~fs_mount()
139 	{
140 		recursive_lock_destroy(&rlock);
141 		free(device_name);
142 
143 		while (volume) {
144 			fs_volume* superVolume = volume->super_volume;
145 
146 			if (volume->file_system != NULL)
147 				put_module(volume->file_system->info.name);
148 
149 			free(volume->file_system_name);
150 			free(volume);
151 			volume = superVolume;
152 		}
153 	}
154 
155 	struct fs_mount* next;
156 	dev_t			id;
157 	fs_volume*		volume;
158 	char*			device_name;
159 	recursive_lock	rlock;	// guards the vnodes list
160 		// TODO: Make this a mutex! It is never used recursively.
161 	struct vnode*	root_vnode;
162 	struct vnode*	covers_vnode;	// immutable
163 	KPartition*		partition;
164 	VnodeList		vnodes;
165 	EntryCache		entry_cache;
166 	bool			unmounting;
167 	bool			owns_file_device;
168 };
169 
170 
171 namespace {
172 
173 struct advisory_lock : public DoublyLinkedListLinkImpl<advisory_lock> {
174 	list_link		link;
175 	team_id			team;
176 	pid_t			session;
177 	off_t			start;
178 	off_t			end;
179 	bool			shared;
180 };
181 
182 typedef DoublyLinkedList<advisory_lock> LockList;
183 
184 } // namespace
185 
186 
187 struct advisory_locking {
188 	sem_id			lock;
189 	sem_id			wait_sem;
190 	LockList		locks;
191 
192 	advisory_locking()
193 		:
194 		lock(-1),
195 		wait_sem(-1)
196 	{
197 	}
198 
199 	~advisory_locking()
200 	{
201 		if (lock >= 0)
202 			delete_sem(lock);
203 		if (wait_sem >= 0)
204 			delete_sem(wait_sem);
205 	}
206 };
207 
208 /*!	\brief Guards sMountsTable.
209 
	The holder is allowed read/write access to sMountsTable.
211 	Manipulation of the fs_mount structures themselves
212 	(and their destruction) requires different locks though.
213 */
214 static mutex sMountMutex = MUTEX_INITIALIZER("vfs_mount_lock");
215 
216 /*!	\brief Guards mount/unmount operations.
217 
	fs_mount() and fs_unmount() hold the lock during their whole operation.
	That is, locking the lock ensures that no FS is mounted/unmounted. In
	particular this means that
	- sMountsTable will not be modified,
	- the fields of the fs_mount structures in sMountsTable that are immutable
	  after initialization will not be modified.
224 
225 	The thread trying to lock the lock must not hold sVnodeLock or
226 	sMountMutex.
227 */
228 static recursive_lock sMountOpLock;
229 
230 /*!	\brief Guards sVnodeTable.
231 
	The holder is allowed read/write access to sVnodeTable and to
	any unbusy vnode in that table, save for the immutable fields (device, id,
	private_node, mount), to which only read-only access is allowed.
	The mutable fields advisory_locking, mandatory_locked_by, and ref_count, as
	well as the busy, removed, and unused flags, and the vnode's type can also
	be written while holding a read lock on sVnodeLock *and* having the vnode
	locked. Write access to covered_by and covers requires write-locking
	sVnodeLock.
240 
241 	The thread trying to acquire the lock must not hold sMountMutex.
242 	You must not hold this lock when calling create_sem(), as this might call
243 	vfs_free_unused_vnodes() and thus cause a deadlock.
244 */
245 static rw_lock sVnodeLock = RW_LOCK_INITIALIZER("vfs_vnode_lock");
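
// Illustrative sketch (not from the original source): the canonical nesting
// when touching a vnode's mutable fields is to read-lock sVnodeLock first
// and then take the per-vnode lock, as e.g. get_advisory_locking() below
// does:
//
//	rw_lock_read_lock(&sVnodeLock);
//	vnode->Lock();
//	// ... access e.g. vnode->advisory_locking ...
//	vnode->Unlock();
//	rw_lock_read_unlock(&sVnodeLock);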
246 
247 /*!	\brief Guards io_context::root.
248 
249 	Must be held when setting or getting the io_context::root field.
250 	The only operation allowed while holding this lock besides getting or
251 	setting the field is inc_vnode_ref_count() on io_context::root.
252 */
253 static mutex sIOContextRootLock = MUTEX_INITIALIZER("io_context::root lock");
254 
255 
256 namespace {
257 
258 struct vnode_hash_key {
259 	dev_t	device;
260 	ino_t	vnode;
261 };
262 
263 struct VnodeHash {
264 	typedef vnode_hash_key	KeyType;
265 	typedef	struct vnode	ValueType;
266 
267 #define VHASH(mountid, vnodeid) \
268 	(((uint32)((vnodeid) >> 32) + (uint32)(vnodeid)) ^ (uint32)(mountid))
269 
270 	size_t HashKey(KeyType key) const
271 	{
272 		return VHASH(key.device, key.vnode);
273 	}
274 
275 	size_t Hash(ValueType* vnode) const
276 	{
277 		return VHASH(vnode->device, vnode->id);
278 	}
279 
280 #undef VHASH
281 
282 	bool Compare(KeyType key, ValueType* vnode) const
283 	{
284 		return vnode->device == key.device && vnode->id == key.vnode;
285 	}
286 
287 	ValueType*& GetLink(ValueType* value) const
288 	{
289 		return value->next;
290 	}
291 };
292 
293 typedef BOpenHashTable<VnodeHash> VnodeTable;
294 
295 
296 struct MountHash {
297 	typedef dev_t			KeyType;
298 	typedef	struct fs_mount	ValueType;
299 
300 	size_t HashKey(KeyType key) const
301 	{
302 		return key;
303 	}
304 
305 	size_t Hash(ValueType* mount) const
306 	{
307 		return mount->id;
308 	}
309 
310 	bool Compare(KeyType key, ValueType* mount) const
311 	{
312 		return mount->id == key;
313 	}
314 
315 	ValueType*& GetLink(ValueType* value) const
316 	{
317 		return value->next;
318 	}
319 };
320 
321 typedef BOpenHashTable<MountHash> MountTable;
322 
323 } // namespace
324 
325 
326 #define VNODE_HASH_TABLE_SIZE 1024
327 static VnodeTable* sVnodeTable;
328 static struct vnode* sRoot;
329 
330 #define MOUNTS_HASH_TABLE_SIZE 16
331 static MountTable* sMountsTable;
332 static dev_t sNextMountID = 1;
333 
334 #define MAX_TEMP_IO_VECS 8
335 
// How long to wait for busy vnodes (2000 retries * 5000 us delay = 10 s total)
337 #define BUSY_VNODE_RETRIES 2000
338 #define BUSY_VNODE_DELAY 5000
339 
340 mode_t __gUmask = 022;
341 
342 /* function declarations */
343 
344 static void free_unused_vnodes();
345 
346 // file descriptor operation prototypes
347 static status_t file_read(struct file_descriptor* descriptor, off_t pos,
348 	void* buffer, size_t* _bytes);
349 static status_t file_write(struct file_descriptor* descriptor, off_t pos,
350 	const void* buffer, size_t* _bytes);
351 static off_t file_seek(struct file_descriptor* descriptor, off_t pos,
352 	int seekType);
353 static void file_free_fd(struct file_descriptor* descriptor);
354 static status_t file_close(struct file_descriptor* descriptor);
355 static status_t file_select(struct file_descriptor* descriptor, uint8 event,
356 	struct selectsync* sync);
357 static status_t file_deselect(struct file_descriptor* descriptor, uint8 event,
358 	struct selectsync* sync);
359 static status_t dir_read(struct io_context* context,
360 	struct file_descriptor* descriptor, struct dirent* buffer,
361 	size_t bufferSize, uint32* _count);
362 static status_t dir_read(struct io_context* ioContext, struct vnode* vnode,
363 	void* cookie, struct dirent* buffer, size_t bufferSize, uint32* _count);
364 static status_t dir_rewind(struct file_descriptor* descriptor);
365 static void dir_free_fd(struct file_descriptor* descriptor);
366 static status_t dir_close(struct file_descriptor* descriptor);
367 static status_t attr_dir_read(struct io_context* context,
368 	struct file_descriptor* descriptor, struct dirent* buffer,
369 	size_t bufferSize, uint32* _count);
370 static status_t attr_dir_rewind(struct file_descriptor* descriptor);
371 static void attr_dir_free_fd(struct file_descriptor* descriptor);
372 static status_t attr_dir_close(struct file_descriptor* descriptor);
373 static status_t attr_read(struct file_descriptor* descriptor, off_t pos,
374 	void* buffer, size_t* _bytes);
375 static status_t attr_write(struct file_descriptor* descriptor, off_t pos,
376 	const void* buffer, size_t* _bytes);
377 static off_t attr_seek(struct file_descriptor* descriptor, off_t pos,
378 	int seekType);
379 static void attr_free_fd(struct file_descriptor* descriptor);
380 static status_t attr_close(struct file_descriptor* descriptor);
381 static status_t attr_read_stat(struct file_descriptor* descriptor,
382 	struct stat* statData);
383 static status_t attr_write_stat(struct file_descriptor* descriptor,
384 	const struct stat* stat, int statMask);
385 static status_t index_dir_read(struct io_context* context,
386 	struct file_descriptor* descriptor, struct dirent* buffer,
387 	size_t bufferSize, uint32* _count);
388 static status_t index_dir_rewind(struct file_descriptor* descriptor);
389 static void index_dir_free_fd(struct file_descriptor* descriptor);
390 static status_t index_dir_close(struct file_descriptor* descriptor);
391 static status_t query_read(struct io_context* context,
392 	struct file_descriptor* descriptor, struct dirent* buffer,
393 	size_t bufferSize, uint32* _count);
394 static status_t query_rewind(struct file_descriptor* descriptor);
395 static void query_free_fd(struct file_descriptor* descriptor);
396 static status_t query_close(struct file_descriptor* descriptor);
397 
398 static status_t common_ioctl(struct file_descriptor* descriptor, ulong op,
399 	void* buffer, size_t length);
400 static status_t common_read_stat(struct file_descriptor* descriptor,
401 	struct stat* statData);
402 static status_t common_write_stat(struct file_descriptor* descriptor,
403 	const struct stat* statData, int statMask);
404 static status_t common_path_read_stat(int fd, char* path, bool traverseLeafLink,
405 	struct stat* stat, bool kernel);
406 
407 static status_t vnode_path_to_vnode(struct vnode* vnode, char* path,
408 	bool traverseLeafLink, int count, bool kernel,
409 	struct vnode** _vnode, ino_t* _parentID);
410 static status_t dir_vnode_to_path(struct vnode* vnode, char* buffer,
411 	size_t bufferSize, bool kernel);
412 static status_t fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
413 	struct vnode** _vnode, ino_t* _parentID, bool kernel);
414 static void inc_vnode_ref_count(struct vnode* vnode);
415 static status_t dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree,
416 	bool reenter);
417 static inline void put_vnode(struct vnode* vnode);
418 static status_t fs_unmount(char* path, dev_t mountID, uint32 flags,
419 	bool kernel);
420 static int open_vnode(struct vnode* vnode, int openMode, bool kernel);
421 
422 
423 static struct fd_ops sFileOps = {
424 	file_read,
425 	file_write,
426 	file_seek,
427 	common_ioctl,
428 	NULL,		// set_flags
429 	file_select,
430 	file_deselect,
431 	NULL,		// read_dir()
432 	NULL,		// rewind_dir()
433 	common_read_stat,
434 	common_write_stat,
435 	file_close,
436 	file_free_fd
437 };
438 
439 static struct fd_ops sDirectoryOps = {
440 	NULL,		// read()
441 	NULL,		// write()
442 	NULL,		// seek()
443 	common_ioctl,
444 	NULL,		// set_flags
445 	NULL,		// select()
446 	NULL,		// deselect()
447 	dir_read,
448 	dir_rewind,
449 	common_read_stat,
450 	common_write_stat,
451 	dir_close,
452 	dir_free_fd
453 };
454 
455 static struct fd_ops sAttributeDirectoryOps = {
456 	NULL,		// read()
457 	NULL,		// write()
458 	NULL,		// seek()
459 	common_ioctl,
460 	NULL,		// set_flags
461 	NULL,		// select()
462 	NULL,		// deselect()
463 	attr_dir_read,
464 	attr_dir_rewind,
465 	common_read_stat,
466 	common_write_stat,
467 	attr_dir_close,
468 	attr_dir_free_fd
469 };
470 
471 static struct fd_ops sAttributeOps = {
472 	attr_read,
473 	attr_write,
474 	attr_seek,
475 	common_ioctl,
476 	NULL,		// set_flags
477 	NULL,		// select()
478 	NULL,		// deselect()
479 	NULL,		// read_dir()
480 	NULL,		// rewind_dir()
481 	attr_read_stat,
482 	attr_write_stat,
483 	attr_close,
484 	attr_free_fd
485 };
486 
487 static struct fd_ops sIndexDirectoryOps = {
488 	NULL,		// read()
489 	NULL,		// write()
490 	NULL,		// seek()
491 	NULL,		// ioctl()
492 	NULL,		// set_flags
493 	NULL,		// select()
494 	NULL,		// deselect()
495 	index_dir_read,
496 	index_dir_rewind,
497 	NULL,		// read_stat()
498 	NULL,		// write_stat()
499 	index_dir_close,
500 	index_dir_free_fd
501 };
502 
503 #if 0
504 static struct fd_ops sIndexOps = {
505 	NULL,		// read()
506 	NULL,		// write()
507 	NULL,		// seek()
508 	NULL,		// ioctl()
509 	NULL,		// set_flags
510 	NULL,		// select()
511 	NULL,		// deselect()
512 	NULL,		// dir_read()
513 	NULL,		// dir_rewind()
514 	index_read_stat,	// read_stat()
515 	NULL,		// write_stat()
516 	NULL,		// dir_close()
517 	NULL		// free_fd()
518 };
519 #endif
520 
521 static struct fd_ops sQueryOps = {
522 	NULL,		// read()
523 	NULL,		// write()
524 	NULL,		// seek()
525 	NULL,		// ioctl()
526 	NULL,		// set_flags
527 	NULL,		// select()
528 	NULL,		// deselect()
529 	query_read,
530 	query_rewind,
531 	NULL,		// read_stat()
532 	NULL,		// write_stat()
533 	query_close,
534 	query_free_fd
535 };
536 
537 
538 namespace {
539 
540 class VNodePutter {
541 public:
542 	VNodePutter(struct vnode* vnode = NULL) : fVNode(vnode) {}
543 
544 	~VNodePutter()
545 	{
546 		Put();
547 	}
548 
549 	void SetTo(struct vnode* vnode)
550 	{
551 		Put();
552 		fVNode = vnode;
553 	}
554 
555 	void Put()
556 	{
557 		if (fVNode) {
558 			put_vnode(fVNode);
559 			fVNode = NULL;
560 		}
561 	}
562 
563 	struct vnode* Detach()
564 	{
565 		struct vnode* vnode = fVNode;
566 		fVNode = NULL;
567 		return vnode;
568 	}
569 
570 private:
571 	struct vnode* fVNode;
572 };
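
// Usage sketch (illustrative only): VNodePutter ties a vnode reference to a
// scope so that every return path releases it:
//
//	struct vnode* vnode;
//	status_t status = get_vnode(mountID, vnodeID, &vnode, true, 0);
//	if (status != B_OK)
//		return status;
//	VNodePutter vnodePutter(vnode);
//	// ... put_vnode() runs automatically on scope exit, unless the
//	// reference is handed on via Detach().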
573 
574 
575 class FDCloser {
576 public:
577 	FDCloser() : fFD(-1), fKernel(true) {}
578 
579 	FDCloser(int fd, bool kernel) : fFD(fd), fKernel(kernel) {}
580 
581 	~FDCloser()
582 	{
583 		Close();
584 	}
585 
586 	void SetTo(int fd, bool kernel)
587 	{
588 		Close();
589 		fFD = fd;
590 		fKernel = kernel;
591 	}
592 
593 	void Close()
594 	{
595 		if (fFD >= 0) {
596 			if (fKernel)
597 				_kern_close(fFD);
598 			else
599 				_user_close(fFD);
600 			fFD = -1;
601 		}
602 	}
603 
604 	int Detach()
605 	{
606 		int fd = fFD;
607 		fFD = -1;
608 		return fd;
609 	}
610 
611 private:
612 	int		fFD;
613 	bool	fKernel;
614 };
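
// Usage sketch (illustrative only): FDCloser plays the same role for file
// descriptors, closing them via _kern_close()/_user_close() on scope exit:
//
//	FDCloser fdCloser(fd, kernel);
//	// ... on success, keep the descriptor open and pass it on:
//	return fdCloser.Detach();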
615 
616 } // namespace
617 
618 
619 #if VFS_PAGES_IO_TRACING
620 
621 namespace VFSPagesIOTracing {
622 
623 class PagesIOTraceEntry : public AbstractTraceEntry {
624 protected:
625 	PagesIOTraceEntry(struct vnode* vnode, void* cookie, off_t pos,
626 		const generic_io_vec* vecs, uint32 count, uint32 flags,
627 		generic_size_t bytesRequested, status_t status,
628 		generic_size_t bytesTransferred)
629 		:
630 		fVnode(vnode),
631 		fMountID(vnode->mount->id),
632 		fNodeID(vnode->id),
633 		fCookie(cookie),
634 		fPos(pos),
635 		fCount(count),
636 		fFlags(flags),
637 		fBytesRequested(bytesRequested),
638 		fStatus(status),
639 		fBytesTransferred(bytesTransferred)
640 	{
641 		fVecs = (generic_io_vec*)alloc_tracing_buffer_memcpy(vecs,
642 			sizeof(generic_io_vec) * count, false);
643 	}
644 
645 	void AddDump(TraceOutput& out, const char* mode)
646 	{
647 		out.Print("vfs pages io %5s: vnode: %p (%" B_PRId32 ", %" B_PRId64 "), "
648 			"cookie: %p, pos: %" B_PRIdOFF ", size: %" B_PRIu64 ", vecs: {",
649 			mode, fVnode, fMountID, fNodeID, fCookie, fPos,
650 			(uint64)fBytesRequested);
651 
652 		if (fVecs != NULL) {
653 			for (uint32 i = 0; i < fCount; i++) {
654 				if (i > 0)
655 					out.Print(", ");
656 				out.Print("(%" B_PRIx64 ", %" B_PRIu64 ")", (uint64)fVecs[i].base,
657 					(uint64)fVecs[i].length);
658 			}
659 		}
660 
661 		out.Print("}, flags: %#" B_PRIx32 " -> status: %#" B_PRIx32 ", "
662 			"transferred: %" B_PRIu64, fFlags, fStatus,
663 			(uint64)fBytesTransferred);
664 	}
665 
666 protected:
667 	struct vnode*	fVnode;
668 	dev_t			fMountID;
669 	ino_t			fNodeID;
670 	void*			fCookie;
671 	off_t			fPos;
672 	generic_io_vec*	fVecs;
673 	uint32			fCount;
674 	uint32			fFlags;
675 	generic_size_t	fBytesRequested;
676 	status_t		fStatus;
677 	generic_size_t	fBytesTransferred;
678 };
679 
680 
681 class ReadPages : public PagesIOTraceEntry {
682 public:
683 	ReadPages(struct vnode* vnode, void* cookie, off_t pos,
684 		const generic_io_vec* vecs, uint32 count, uint32 flags,
685 		generic_size_t bytesRequested, status_t status,
686 		generic_size_t bytesTransferred)
687 		:
688 		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
689 			bytesRequested, status, bytesTransferred)
690 	{
691 		Initialized();
692 	}
693 
694 	virtual void AddDump(TraceOutput& out)
695 	{
696 		PagesIOTraceEntry::AddDump(out, "read");
697 	}
698 };
699 
700 
701 class WritePages : public PagesIOTraceEntry {
702 public:
703 	WritePages(struct vnode* vnode, void* cookie, off_t pos,
704 		const generic_io_vec* vecs, uint32 count, uint32 flags,
705 		generic_size_t bytesRequested, status_t status,
706 		generic_size_t bytesTransferred)
707 		:
708 		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
709 			bytesRequested, status, bytesTransferred)
710 	{
711 		Initialized();
712 	}
713 
714 	virtual void AddDump(TraceOutput& out)
715 	{
716 		PagesIOTraceEntry::AddDump(out, "write");
717 	}
718 };
719 
720 }	// namespace VFSPagesIOTracing
721 
722 #	define TPIO(x) new(std::nothrow) VFSPagesIOTracing::x;
723 #else
724 #	define TPIO(x) ;
725 #endif	// VFS_PAGES_IO_TRACING
726 
727 
728 /*! Finds the mounted device (the fs_mount structure) with the given ID.
	Note, you must hold the sMountMutex lock when you call this function.
730 */
731 static struct fs_mount*
732 find_mount(dev_t id)
733 {
734 	ASSERT_LOCKED_MUTEX(&sMountMutex);
735 
736 	return sMountsTable->Lookup(id);
737 }
738 
739 
740 static status_t
741 get_mount(dev_t id, struct fs_mount** _mount)
742 {
743 	struct fs_mount* mount;
744 
745 	ReadLocker nodeLocker(sVnodeLock);
746 	MutexLocker mountLocker(sMountMutex);
747 
748 	mount = find_mount(id);
749 	if (mount == NULL)
750 		return B_BAD_VALUE;
751 
752 	struct vnode* rootNode = mount->root_vnode;
753 	if (mount->unmounting || rootNode == NULL || rootNode->IsBusy()
754 		|| rootNode->ref_count == 0) {
755 		// might have been called during a mount/unmount operation
756 		return B_BUSY;
757 	}
758 
759 	inc_vnode_ref_count(rootNode);
760 	*_mount = mount;
761 	return B_OK;
762 }
763 
764 
765 static void
766 put_mount(struct fs_mount* mount)
767 {
768 	if (mount)
769 		put_vnode(mount->root_vnode);
770 }
771 
772 
773 /*!	Tries to open the specified file system module.
774 	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1".
	Returns a pointer to the file system module interface, or NULL if
	the module could not be opened.
777 */
778 static file_system_module_info*
779 get_file_system(const char* fsName)
780 {
781 	char name[B_FILE_NAME_LENGTH];
782 	if (strncmp(fsName, "file_systems/", strlen("file_systems/"))) {
783 		// construct module name if we didn't get one
784 		// (we currently support only one API)
785 		snprintf(name, sizeof(name), "file_systems/%s/v1", fsName);
786 		fsName = NULL;
787 	}
788 
789 	file_system_module_info* info;
790 	if (get_module(fsName ? fsName : name, (module_info**)&info) != B_OK)
791 		return NULL;
792 
793 	return info;
794 }
795 
796 
797 /*!	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1"
798 	and returns a compatible fs_info.fsh_name name ("bfs" in both cases).
799 	The name is allocated for you, and you have to free() it when you're
800 	done with it.
801 	Returns NULL if the required memory is not available.
802 */
803 static char*
804 get_file_system_name(const char* fsName)
805 {
806 	const size_t length = strlen("file_systems/");
807 
808 	if (strncmp(fsName, "file_systems/", length)) {
809 		// the name already seems to be the module's file name
810 		return strdup(fsName);
811 	}
812 
813 	fsName += length;
814 	const char* end = strchr(fsName, '/');
815 	if (end == NULL) {
816 		// this doesn't seem to be a valid name, but well...
817 		return strdup(fsName);
818 	}
819 
820 	// cut off the trailing /v1
821 
822 	char* name = (char*)malloc(end + 1 - fsName);
823 	if (name == NULL)
824 		return NULL;
825 
826 	strlcpy(name, fsName, end + 1 - fsName);
827 	return name;
828 }
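
// Example (illustrative): both get_file_system_name("bfs") and
// get_file_system_name("file_systems/bfs/v1") return a malloc()ed "bfs",
// which the caller must free().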
829 
830 
/*!	Accepts a list of file system names separated by a colon, one for each
	layer, and returns the file system name for the specified layer.
833 	The name is allocated for you, and you have to free() it when you're
834 	done with it.
835 	Returns NULL if the required memory is not available or if there is no
836 	name for the specified layer.
837 */
838 static char*
839 get_file_system_name_for_layer(const char* fsNames, int32 layer)
840 {
841 	while (layer >= 0) {
842 		const char* end = strchr(fsNames, ':');
843 		if (end == NULL) {
844 			if (layer == 0)
845 				return strdup(fsNames);
846 			return NULL;
847 		}
848 
		if (layer == 0) {
			size_t length = end - fsNames + 1;
			char* result = (char*)malloc(length);
			if (result == NULL)
				return NULL;
			strlcpy(result, fsNames, length);
			return result;
		}
855 
856 		fsNames = end + 1;
857 		layer--;
858 	}
859 
860 	return NULL;
861 }
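
// Example (illustrative, with hypothetical layer names): for fsNames
// "bfs:write_overlay", layer 0 yields "bfs", layer 1 yields "write_overlay",
// and any higher layer yields NULL.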
862 
863 
864 static void
865 add_vnode_to_mount_list(struct vnode* vnode, struct fs_mount* mount)
866 {
867 	RecursiveLocker _(mount->rlock);
868 	mount->vnodes.Add(vnode);
869 }
870 
871 
872 static void
873 remove_vnode_from_mount_list(struct vnode* vnode, struct fs_mount* mount)
874 {
875 	RecursiveLocker _(mount->rlock);
876 	mount->vnodes.Remove(vnode);
877 }
878 
879 
880 /*!	\brief Looks up a vnode by mount and node ID in the sVnodeTable.
881 
882 	The caller must hold the sVnodeLock (read lock at least).
883 
884 	\param mountID the mount ID.
885 	\param vnodeID the node ID.
886 
887 	\return The vnode structure, if it was found in the hash table, \c NULL
888 			otherwise.
889 */
890 static struct vnode*
891 lookup_vnode(dev_t mountID, ino_t vnodeID)
892 {
893 	struct vnode_hash_key key;
894 
895 	key.device = mountID;
896 	key.vnode = vnodeID;
897 
898 	return sVnodeTable->Lookup(key);
899 }
900 
901 
902 /*!	\brief Checks whether or not a busy vnode should be waited for (again).
903 
	This will also wait for BUSY_VNODE_DELAY before returning if one should
	still wait for the vnode to become unbusy.
906 
907 	\return \c true if one should retry, \c false if not.
908 */
909 static bool
910 retry_busy_vnode(int32& tries, dev_t mountID, ino_t vnodeID)
911 {
912 	if (--tries < 0) {
913 		// vnode doesn't seem to become unbusy
914 		dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO
915 			" is not becoming unbusy!\n", mountID, vnodeID);
916 		return false;
917 	}
918 	snooze(BUSY_VNODE_DELAY);
919 	return true;
920 }
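
// Usage sketch (mirrors get_vnode() below): callers keep a retry budget and
// re-check the vnode until it is no longer busy or the budget runs out:
//
//	int32 tries = BUSY_VNODE_RETRIES;
//	while (/* vnode is busy */) {
//		if (!retry_busy_vnode(tries, mountID, vnodeID))
//			return B_BUSY;
//		// ... look the vnode up again ...
//	}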
921 
922 
923 /*!	Creates a new vnode with the given mount and node ID.
924 	If the node already exists, it is returned instead and no new node is
	created. In either case -- though not if an error occurs -- the function
	write-locks \c sVnodeLock and keeps it locked for the caller when
	returning. On error the lock is not held on return.
928 
929 	\param mountID The mount ID.
930 	\param vnodeID The vnode ID.
931 	\param _vnode Will be set to the new vnode on success.
932 	\param _nodeCreated Will be set to \c true when the returned vnode has
933 		been newly created, \c false when it already existed. Will not be
934 		changed on error.
935 	\return \c B_OK, when the vnode was successfully created and inserted or
936 		a node with the given ID was found, \c B_NO_MEMORY or
937 		\c B_ENTRY_NOT_FOUND on error.
938 */
939 static status_t
940 create_new_vnode_and_lock(dev_t mountID, ino_t vnodeID, struct vnode*& _vnode,
941 	bool& _nodeCreated)
942 {
943 	FUNCTION(("create_new_vnode_and_lock()\n"));
944 
945 	struct vnode* vnode = (struct vnode*)malloc(sizeof(struct vnode));
946 	if (vnode == NULL)
947 		return B_NO_MEMORY;
948 
949 	// initialize basic values
950 	memset(vnode, 0, sizeof(struct vnode));
951 	vnode->device = mountID;
952 	vnode->id = vnodeID;
953 	vnode->ref_count = 1;
954 	vnode->SetBusy(true);
955 
956 	// look up the node -- it might have been added by someone else in the
957 	// meantime
958 	rw_lock_write_lock(&sVnodeLock);
959 	struct vnode* existingVnode = lookup_vnode(mountID, vnodeID);
960 	if (existingVnode != NULL) {
961 		free(vnode);
962 		_vnode = existingVnode;
963 		_nodeCreated = false;
964 		return B_OK;
965 	}
966 
967 	// get the mount structure
968 	mutex_lock(&sMountMutex);
969 	vnode->mount = find_mount(mountID);
970 	if (!vnode->mount || vnode->mount->unmounting) {
971 		mutex_unlock(&sMountMutex);
972 		rw_lock_write_unlock(&sVnodeLock);
973 		free(vnode);
974 		return B_ENTRY_NOT_FOUND;
975 	}
976 
977 	// add the vnode to the mount's node list and the hash table
978 	sVnodeTable->Insert(vnode);
979 	add_vnode_to_mount_list(vnode, vnode->mount);
980 
981 	mutex_unlock(&sMountMutex);
982 
983 	_vnode = vnode;
984 	_nodeCreated = true;
985 
986 	// keep the vnode lock locked
987 	return B_OK;
988 }
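
// Caller-side sketch (mirrors get_vnode() below): on success the caller owns
// the write lock on sVnodeLock and must release it itself:
//
//	bool nodeCreated;
//	struct vnode* vnode;
//	if (create_new_vnode_and_lock(mountID, vnodeID, vnode, nodeCreated)
//			== B_OK) {
//		// ... set up the still-busy vnode ...
//		rw_lock_write_unlock(&sVnodeLock);
//	}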
989 
990 
991 /*!	Frees the vnode and all resources it has acquired, and removes
992 	it from the vnode hash as well as from its mount structure.
993 	Will also make sure that any cache modifications are written back.
994 */
995 static void
996 free_vnode(struct vnode* vnode, bool reenter)
997 {
998 	ASSERT_PRINT(vnode->ref_count == 0 && vnode->IsBusy(), "vnode: %p\n",
999 		vnode);
1000 
1001 	// write back any changes in this vnode's cache -- but only
1002 	// if the vnode won't be deleted, in which case the changes
1003 	// will be discarded
1004 
1005 	if (!vnode->IsRemoved() && HAS_FS_CALL(vnode, fsync))
1006 		FS_CALL_NO_PARAMS(vnode, fsync);
1007 
	// Note: If this vnode has a cache attached, there will still be two
	// references to that cache at this point. The last one belongs to the vnode
	// itself (cf. vfs_get_vnode_cache()) and one belongs to the node's file
	// cache. Every reference to a cache but the last also implies a reference
	// to the vnode. The file cache, however, has already released its reference
	// (cf. file_cache_create()), so that this vnode's ref count had a chance to
	// drop to 0 at all. Deleting the file cache now will cause the next-to-last
	// cache reference to be released, which will also release a (no longer
	// existing) vnode reference. To avoid problems, we set the vnode's ref
	// count, so that it will neither become negative nor 0.
1018 	vnode->ref_count = 2;
1019 
1020 	if (!vnode->IsUnpublished()) {
1021 		if (vnode->IsRemoved())
1022 			FS_CALL(vnode, remove_vnode, reenter);
1023 		else
1024 			FS_CALL(vnode, put_vnode, reenter);
1025 	}
1026 
1027 	// If the vnode has a VMCache attached, make sure that it won't try to get
1028 	// another reference via VMVnodeCache::AcquireUnreferencedStoreRef(). As
1029 	// long as the vnode is busy and in the hash, that won't happen, but as
1030 	// soon as we've removed it from the hash, it could reload the vnode -- with
1031 	// a new cache attached!
1032 	if (vnode->cache != NULL)
1033 		((VMVnodeCache*)vnode->cache)->VnodeDeleted();
1034 
1035 	// The file system has removed the resources of the vnode now, so we can
1036 	// make it available again (by removing the busy vnode from the hash).
1037 	rw_lock_write_lock(&sVnodeLock);
1038 	sVnodeTable->Remove(vnode);
1039 	rw_lock_write_unlock(&sVnodeLock);
1040 
1041 	// if we have a VMCache attached, remove it
1042 	if (vnode->cache)
1043 		vnode->cache->ReleaseRef();
1044 
1045 	vnode->cache = NULL;
1046 
1047 	remove_vnode_from_mount_list(vnode, vnode->mount);
1048 
1049 	free(vnode);
1050 }
1051 
1052 
/*!	\brief Decrements the reference counter of the given vnode and deletes it
	if the counter drops to 0.
1055 
1056 	The caller must, of course, own a reference to the vnode to call this
1057 	function.
1058 	The caller must not hold the sVnodeLock or the sMountMutex.
1059 
1060 	\param vnode the vnode.
1061 	\param alwaysFree don't move this vnode into the unused list, but really
1062 		   delete it if possible.
1063 	\param reenter \c true, if this function is called (indirectly) from within
1064 		   a file system. This will be passed to file system hooks only.
1065 	\return \c B_OK, if everything went fine, an error code otherwise.
1066 */
1067 static status_t
1068 dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree, bool reenter)
1069 {
1070 	ReadLocker locker(sVnodeLock);
1071 	AutoLocker<Vnode> nodeLocker(vnode);
1072 
1073 	int32 oldRefCount = atomic_add(&vnode->ref_count, -1);
1074 
1075 	ASSERT_PRINT(oldRefCount > 0, "vnode %p\n", vnode);
1076 
1077 	TRACE(("dec_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
1078 		vnode->ref_count));
1079 
1080 	if (oldRefCount != 1)
1081 		return B_OK;
1082 
1083 	if (vnode->IsBusy())
1084 		panic("dec_vnode_ref_count: called on busy vnode %p\n", vnode);
1085 
1086 	bool freeNode = false;
1087 	bool freeUnusedNodes = false;
1088 
1089 	// Just insert the vnode into an unused list if we don't need
1090 	// to delete it
1091 	if (vnode->IsRemoved() || alwaysFree) {
1092 		vnode_to_be_freed(vnode);
1093 		vnode->SetBusy(true);
1094 		freeNode = true;
1095 	} else
1096 		freeUnusedNodes = vnode_unused(vnode);
1097 
1098 	nodeLocker.Unlock();
1099 	locker.Unlock();
1100 
1101 	if (freeNode)
1102 		free_vnode(vnode, reenter);
1103 	else if (freeUnusedNodes)
1104 		free_unused_vnodes();
1105 
1106 	return B_OK;
1107 }
1108 
1109 
1110 /*!	\brief Increments the reference counter of the given vnode.
1111 
1112 	The caller must make sure that the node isn't deleted while this function
1113 	is called. This can be done either:
1114 	- by ensuring that a reference to the node exists and remains in existence,
1115 	  or
1116 	- by holding the vnode's lock (which also requires read locking sVnodeLock)
1117 	  or by holding sVnodeLock write locked.
1118 
	In the second case the caller is responsible for dealing with the ref count
	0 -> 1 transition: 1. this function must not be invoked when the
	node is busy in the first place, and 2. vnode_used() must be called for the
	node.
1123 
1124 	\param vnode the vnode.
1125 */
1126 static void
1127 inc_vnode_ref_count(struct vnode* vnode)
1128 {
1129 	atomic_add(&vnode->ref_count, 1);
1130 	TRACE(("inc_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
1131 		vnode->ref_count));
1132 }
1133 
1134 
1135 static bool
1136 is_special_node_type(int type)
1137 {
1138 	// at the moment only FIFOs are supported
1139 	return S_ISFIFO(type);
1140 }
1141 
1142 
1143 static status_t
1144 create_special_sub_node(struct vnode* vnode, uint32 flags)
1145 {
1146 	if (S_ISFIFO(vnode->Type()))
1147 		return create_fifo_vnode(vnode->mount->volume, vnode);
1148 
1149 	return B_BAD_VALUE;
1150 }
1151 
1152 
1153 /*!	\brief Retrieves a vnode for a given mount ID, node ID pair.
1154 
1155 	If the node is not yet in memory, it will be loaded.
1156 
1157 	The caller must not hold the sVnodeLock or the sMountMutex.
1158 
1159 	\param mountID the mount ID.
1160 	\param vnodeID the node ID.
1161 	\param _vnode Pointer to a vnode* variable into which the pointer to the
1162 		   retrieved vnode structure shall be written.
	\param canWait \c true, if the function is allowed to wait (and retry),
		   should the vnode currently be busy.
	\param reenter \c true, if this function is called (indirectly) from within
		   a file system.
	\return \c B_OK, if everything went fine, an error code otherwise.
1166 */
1167 static status_t
1168 get_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode, bool canWait,
1169 	int reenter)
1170 {
1171 	FUNCTION(("get_vnode: mountid %" B_PRId32 " vnid 0x%" B_PRIx64 " %p\n",
1172 		mountID, vnodeID, _vnode));
1173 
1174 	rw_lock_read_lock(&sVnodeLock);
1175 
1176 	int32 tries = BUSY_VNODE_RETRIES;
1177 restart:
1178 	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
1179 	AutoLocker<Vnode> nodeLocker(vnode);
1180 
1181 	if (vnode && vnode->IsBusy()) {
1182 		nodeLocker.Unlock();
1183 		rw_lock_read_unlock(&sVnodeLock);
1184 		if (!canWait) {
1185 			dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO " is busy!\n",
1186 				mountID, vnodeID);
1187 			return B_BUSY;
1188 		}
1189 		if (!retry_busy_vnode(tries, mountID, vnodeID))
1190 			return B_BUSY;
1191 
1192 		rw_lock_read_lock(&sVnodeLock);
1193 		goto restart;
1194 	}
1195 
1196 	TRACE(("get_vnode: tried to lookup vnode, got %p\n", vnode));
1197 
1198 	status_t status;
1199 
1200 	if (vnode) {
1201 		if (vnode->ref_count == 0) {
1202 			// this vnode has been unused before
1203 			vnode_used(vnode);
1204 		}
1205 		inc_vnode_ref_count(vnode);
1206 
1207 		nodeLocker.Unlock();
1208 		rw_lock_read_unlock(&sVnodeLock);
1209 	} else {
1210 		// we need to create a new vnode and read it in
1211 		rw_lock_read_unlock(&sVnodeLock);
1212 			// unlock -- create_new_vnode_and_lock() write-locks on success
1213 		bool nodeCreated;
1214 		status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
1215 			nodeCreated);
1216 		if (status != B_OK)
1217 			return status;
1218 
1219 		if (!nodeCreated) {
1220 			rw_lock_read_lock(&sVnodeLock);
1221 			rw_lock_write_unlock(&sVnodeLock);
1222 			goto restart;
1223 		}
1224 
1225 		rw_lock_write_unlock(&sVnodeLock);
1226 
1227 		int type;
1228 		uint32 flags;
1229 		status = FS_MOUNT_CALL(vnode->mount, get_vnode, vnodeID, vnode, &type,
1230 			&flags, reenter);
1231 		if (status == B_OK && vnode->private_node == NULL)
1232 			status = B_BAD_VALUE;
1233 
1234 		bool gotNode = status == B_OK;
1235 		bool publishSpecialSubNode = false;
1236 		if (gotNode) {
1237 			vnode->SetType(type);
1238 			publishSpecialSubNode = is_special_node_type(type)
1239 				&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
1240 		}
1241 
1242 		if (gotNode && publishSpecialSubNode)
1243 			status = create_special_sub_node(vnode, flags);
1244 
1245 		if (status != B_OK) {
1246 			if (gotNode)
1247 				FS_CALL(vnode, put_vnode, reenter);
1248 
1249 			rw_lock_write_lock(&sVnodeLock);
1250 			sVnodeTable->Remove(vnode);
1251 			remove_vnode_from_mount_list(vnode, vnode->mount);
1252 			rw_lock_write_unlock(&sVnodeLock);
1253 
1254 			free(vnode);
1255 			return status;
1256 		}
1257 
1258 		rw_lock_read_lock(&sVnodeLock);
1259 		vnode->Lock();
1260 
1261 		vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
1262 		vnode->SetBusy(false);
1263 
1264 		vnode->Unlock();
1265 		rw_lock_read_unlock(&sVnodeLock);
1266 	}
1267 
1268 	TRACE(("get_vnode: returning %p\n", vnode));
1269 
1270 	*_vnode = vnode;
1271 	return B_OK;
1272 }
1273 
1274 
/*!	\brief Decrements the reference counter of the given vnode and deletes it
	if the counter drops to 0.
1277 
1278 	The caller must, of course, own a reference to the vnode to call this
1279 	function.
1280 	The caller must not hold the sVnodeLock or the sMountMutex.
1281 
1282 	\param vnode the vnode.
1283 */
1284 static inline void
1285 put_vnode(struct vnode* vnode)
1286 {
1287 	dec_vnode_ref_count(vnode, false, false);
1288 }
1289 
1290 
1291 static void
1292 free_unused_vnodes(int32 level)
1293 {
1294 	unused_vnodes_check_started();
1295 
1296 	if (level == B_NO_LOW_RESOURCE) {
1297 		unused_vnodes_check_done();
1298 		return;
1299 	}
1300 
1301 	flush_hot_vnodes();
1302 
1303 	// determine how many nodes to free
1304 	uint32 count = 1;
1305 	{
1306 		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
1307 
1308 		switch (level) {
1309 			case B_LOW_RESOURCE_NOTE:
1310 				count = sUnusedVnodes / 100;
1311 				break;
1312 			case B_LOW_RESOURCE_WARNING:
1313 				count = sUnusedVnodes / 10;
1314 				break;
1315 			case B_LOW_RESOURCE_CRITICAL:
1316 				count = sUnusedVnodes;
1317 				break;
1318 		}
1319 
1320 		if (count > sUnusedVnodes)
1321 			count = sUnusedVnodes;
1322 	}
1323 
1324 	// Write back the modified pages of some unused vnodes and free them.
1325 
1326 	for (uint32 i = 0; i < count; i++) {
1327 		ReadLocker vnodesReadLocker(sVnodeLock);
1328 
1329 		// get the first node
1330 		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
1331 		struct vnode* vnode = (struct vnode*)list_get_first_item(
1332 			&sUnusedVnodeList);
1333 		unusedVnodesLocker.Unlock();
1334 
1335 		if (vnode == NULL)
1336 			break;
1337 
1338 		// lock the node
1339 		AutoLocker<Vnode> nodeLocker(vnode);
1340 
1341 		// Check whether the node is still unused -- since we only append to the
1342 		// tail of the unused queue, the vnode should still be at its head.
1343 		// Alternatively we could check its ref count for 0 and its busy flag,
1344 		// but if the node is no longer at the head of the queue, it means it
1345 		// has been touched in the meantime, i.e. it is no longer the least
		// recently used unused vnode and we'd rather not free it.
1347 		unusedVnodesLocker.Lock();
1348 		if (vnode != list_get_first_item(&sUnusedVnodeList))
1349 			continue;
1350 		unusedVnodesLocker.Unlock();
1351 
1352 		ASSERT(!vnode->IsBusy());
1353 
1354 		// grab a reference
1355 		inc_vnode_ref_count(vnode);
1356 		vnode_used(vnode);
1357 
1358 		// write back changes and free the node
1359 		nodeLocker.Unlock();
1360 		vnodesReadLocker.Unlock();
1361 
1362 		if (vnode->cache != NULL)
1363 			vnode->cache->WriteModified();
1364 
1365 		dec_vnode_ref_count(vnode, true, false);
1366 			// this should free the vnode when it's still unused
1367 	}
1368 
1369 	unused_vnodes_check_done();
1370 }
1371 
1372 
1373 /*!	Gets the vnode the given vnode is covering.
1374 
1375 	The caller must have \c sVnodeLock read-locked at least.
1376 
	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it (cf. put_vnode()).
1379 
1380 	\param vnode The vnode whose covered node shall be returned.
1381 	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
1382 		vnode.
1383 */
1384 static inline Vnode*
1385 get_covered_vnode_locked(Vnode* vnode)
1386 {
1387 	if (Vnode* coveredNode = vnode->covers) {
1388 		while (coveredNode->covers != NULL)
1389 			coveredNode = coveredNode->covers;
1390 
1391 		inc_vnode_ref_count(coveredNode);
1392 		return coveredNode;
1393 	}
1394 
1395 	return NULL;
1396 }
1397 
1398 
1399 /*!	Gets the vnode the given vnode is covering.
1400 
1401 	The caller must not hold \c sVnodeLock. Note that this implies a race
1402 	condition, since the situation can change at any time.
1403 
	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it (cf. put_vnode()).
1406 
1407 	\param vnode The vnode whose covered node shall be returned.
1408 	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
1409 		vnode.
1410 */
1411 static inline Vnode*
1412 get_covered_vnode(Vnode* vnode)
1413 {
1414 	if (!vnode->IsCovering())
1415 		return NULL;
1416 
1417 	ReadLocker vnodeReadLocker(sVnodeLock);
1418 	return get_covered_vnode_locked(vnode);
1419 }
1420 
1421 
1422 /*!	Gets the vnode the given vnode is covered by.
1423 
1424 	The caller must have \c sVnodeLock read-locked at least.
1425 
	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it (cf. put_vnode()).
1428 
1429 	\param vnode The vnode whose covering node shall be returned.
1430 	\return The covering vnode, or \c NULL if the given vnode isn't covered by
1431 		any vnode.
1432 */
1433 static Vnode*
1434 get_covering_vnode_locked(Vnode* vnode)
1435 {
1436 	if (Vnode* coveringNode = vnode->covered_by) {
1437 		while (coveringNode->covered_by != NULL)
1438 			coveringNode = coveringNode->covered_by;
1439 
1440 		inc_vnode_ref_count(coveringNode);
1441 		return coveringNode;
1442 	}
1443 
1444 	return NULL;
1445 }
1446 
1447 
1448 /*!	Gets the vnode the given vnode is covered by.
1449 
1450 	The caller must not hold \c sVnodeLock. Note that this implies a race
1451 	condition, since the situation can change at any time.
1452 
	The function returns a reference to the retrieved vnode (if any); the
	caller is responsible for releasing it (cf. put_vnode()).
1455 
1456 	\param vnode The vnode whose covering node shall be returned.
1457 	\return The covering vnode, or \c NULL if the given vnode isn't covered by
1458 		any vnode.
1459 */
1460 static inline Vnode*
1461 get_covering_vnode(Vnode* vnode)
1462 {
1463 	if (!vnode->IsCovered())
1464 		return NULL;
1465 
1466 	ReadLocker vnodeReadLocker(sVnodeLock);
1467 	return get_covering_vnode_locked(vnode);
1468 }
1469 
1470 
1471 static void
1472 free_unused_vnodes()
1473 {
1474 	free_unused_vnodes(
1475 		low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1476 			| B_KERNEL_RESOURCE_ADDRESS_SPACE));
1477 }
1478 
1479 
1480 static void
1481 vnode_low_resource_handler(void* /*data*/, uint32 resources, int32 level)
1482 {
1483 	TRACE(("vnode_low_resource_handler(level = %" B_PRId32 ")\n", level));
1484 
1485 	free_unused_vnodes(level);
1486 }
1487 
1488 
1489 static inline void
1490 put_advisory_locking(struct advisory_locking* locking)
1491 {
1492 	release_sem(locking->lock);
1493 }
1494 
1495 
1496 /*!	Returns the advisory_locking object of the \a vnode in case it
1497 	has one, and locks it.
1498 	You have to call put_advisory_locking() when you're done with
1499 	it.
1500 	Note, you must not have the vnode mutex locked when calling
1501 	this function.
1502 */
1503 static struct advisory_locking*
1504 get_advisory_locking(struct vnode* vnode)
1505 {
1506 	rw_lock_read_lock(&sVnodeLock);
1507 	vnode->Lock();
1508 
1509 	struct advisory_locking* locking = vnode->advisory_locking;
1510 	sem_id lock = locking != NULL ? locking->lock : B_ERROR;
1511 
1512 	vnode->Unlock();
1513 	rw_lock_read_unlock(&sVnodeLock);
1514 
1515 	if (lock >= 0)
1516 		lock = acquire_sem(lock);
1517 	if (lock < 0) {
		// This means the locking has been deleted in the meantime
1519 		// or had never existed in the first place - otherwise, we
1520 		// would get the lock at some point.
1521 		return NULL;
1522 	}
1523 
1524 	return locking;
1525 }
1526 
1527 
1528 /*!	Creates a locked advisory_locking object, and attaches it to the
1529 	given \a vnode.
	Returns B_OK in case of success; if the vnode got such an object
	from someone else in the meantime, you'll get that one locked
	instead.
1533 */
1534 static status_t
1535 create_advisory_locking(struct vnode* vnode)
1536 {
1537 	if (vnode == NULL)
1538 		return B_FILE_ERROR;
1539 
1540 	ObjectDeleter<advisory_locking> lockingDeleter;
1541 	struct advisory_locking* locking = NULL;
1542 
1543 	while (get_advisory_locking(vnode) == NULL) {
1544 		// no locking object set on the vnode yet, create one
1545 		if (locking == NULL) {
1546 			locking = new(std::nothrow) advisory_locking;
1547 			if (locking == NULL)
1548 				return B_NO_MEMORY;
1549 			lockingDeleter.SetTo(locking);
1550 
1551 			locking->wait_sem = create_sem(0, "advisory lock");
1552 			if (locking->wait_sem < 0)
1553 				return locking->wait_sem;
1554 
1555 			locking->lock = create_sem(0, "advisory locking");
1556 			if (locking->lock < 0)
1557 				return locking->lock;
1558 		}
1559 
1560 		// set our newly created locking object
1561 		ReadLocker _(sVnodeLock);
1562 		AutoLocker<Vnode> nodeLocker(vnode);
1563 		if (vnode->advisory_locking == NULL) {
1564 			vnode->advisory_locking = locking;
1565 			lockingDeleter.Detach();
1566 			return B_OK;
1567 		}
1568 	}
1569 
1570 	// The vnode already had a locking object. That's just as well.
1571 
1572 	return B_OK;
1573 }
1574 
1575 
/*! Returns \c true when either \a flock is \c NULL or \a flock intersects
	with the advisory_lock \a lock.
*/
1579 static bool
1580 advisory_lock_intersects(struct advisory_lock* lock, struct flock* flock)
1581 {
1582 	if (flock == NULL)
1583 		return true;
1584 
1585 	return lock->start <= flock->l_start - 1 + flock->l_len
1586 		&& lock->end >= flock->l_start;
1587 }
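
// Worked example (illustrative): a flock with l_start = 100 and l_len = 50
// covers bytes 100..149. An advisory_lock with start = 149 and end = 200
// intersects it (149 <= 149 && 200 >= 100), while one with start = 150
// does not, since lock ranges use inclusive end offsets.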
1588 
1589 
1590 /*!	Tests whether acquiring a lock would block.
1591 */
1592 static status_t
1593 test_advisory_lock(struct vnode* vnode, struct flock* flock)
1594 {
	const bool sharedRequest = flock->l_type == F_RDLCK;
		// remember the requested type; l_type is overwritten for the answer
	flock->l_type = F_UNLCK;
1596 
1597 	struct advisory_locking* locking = get_advisory_locking(vnode);
1598 	if (locking == NULL)
1599 		return B_OK;
1600 
1601 	team_id team = team_get_current_team_id();
1602 
1603 	LockList::Iterator iterator = locking->locks.GetIterator();
1604 	while (iterator.HasNext()) {
1605 		struct advisory_lock* lock = iterator.Next();
1606 
		if (lock->team != team && advisory_lock_intersects(lock, flock)) {
			// locks do overlap
			if (!sharedRequest || !lock->shared) {
1610 				// collision
1611 				flock->l_type = lock->shared ? F_RDLCK : F_WRLCK;
1612 				flock->l_whence = SEEK_SET;
1613 				flock->l_start = lock->start;
1614 				flock->l_len = lock->end - lock->start + 1;
1615 				flock->l_pid = lock->team;
1616 				break;
1617 			}
1618 		}
1619 	}
1620 
1621 	put_advisory_locking(locking);
1622 	return B_OK;
1623 }
1624 
1625 
1626 /*!	Removes the specified lock, or all locks of the calling team
1627 	if \a flock is NULL.
1628 */
1629 static status_t
1630 release_advisory_lock(struct vnode* vnode, struct flock* flock)
1631 {
1632 	FUNCTION(("release_advisory_lock(vnode = %p, flock = %p)\n", vnode, flock));
1633 
1634 	struct advisory_locking* locking = get_advisory_locking(vnode);
1635 	if (locking == NULL)
1636 		return B_OK;
1637 
1638 	// TODO: use the thread ID instead??
1639 	team_id team = team_get_current_team_id();
1640 	pid_t session = thread_get_current_thread()->team->session_id;
1641 
1642 	// find matching lock entries
1643 
1644 	LockList::Iterator iterator = locking->locks.GetIterator();
1645 	while (iterator.HasNext()) {
1646 		struct advisory_lock* lock = iterator.Next();
1647 		bool removeLock = false;
1648 
1649 		if (lock->session == session)
1650 			removeLock = true;
1651 		else if (lock->team == team && advisory_lock_intersects(lock, flock)) {
1652 			bool endsBeyond = false;
1653 			bool startsBefore = false;
1654 			if (flock != NULL) {
1655 				startsBefore = lock->start < flock->l_start;
1656 				endsBeyond = lock->end > flock->l_start - 1 + flock->l_len;
1657 			}
1658 
1659 			if (!startsBefore && !endsBeyond) {
1660 				// lock is completely contained in flock
1661 				removeLock = true;
1662 			} else if (startsBefore && !endsBeyond) {
1663 				// cut the end of the lock
1664 				lock->end = flock->l_start - 1;
1665 			} else if (!startsBefore && endsBeyond) {
1666 				// cut the start of the lock
1667 				lock->start = flock->l_start + flock->l_len;
1668 			} else {
1669 				// divide the lock into two locks
				// allocate with malloc() to match acquire_advisory_lock()
				// and the free() on removal below
				struct advisory_lock* secondLock = (struct advisory_lock*)
					malloc(sizeof(struct advisory_lock));
				if (secondLock == NULL) {
					// TODO: we should probably revert the locks we already
					// changed... (ie. allocate upfront)
					put_advisory_locking(locking);
					return B_NO_MEMORY;
				}

				secondLock->team = lock->team;
				secondLock->session = lock->session;
				// values must already be normalized when getting here
				secondLock->start = flock->l_start + flock->l_len;
				secondLock->end = lock->end;
					// copy the original end before truncating the first part
				secondLock->shared = lock->shared;

				lock->end = flock->l_start - 1;
1686 
1687 				locking->locks.Add(secondLock);
1688 			}
1689 		}
1690 
1691 		if (removeLock) {
1692 			// this lock is no longer used
1693 			iterator.Remove();
1694 			free(lock);
1695 		}
1696 	}
1697 
1698 	bool removeLocking = locking->locks.IsEmpty();
1699 	release_sem_etc(locking->wait_sem, 1, B_RELEASE_ALL);
1700 
1701 	put_advisory_locking(locking);
1702 
1703 	if (removeLocking) {
1704 		// We can remove the whole advisory locking structure; it's no
1705 		// longer used
1706 		locking = get_advisory_locking(vnode);
1707 		if (locking != NULL) {
1708 			ReadLocker locker(sVnodeLock);
1709 			AutoLocker<Vnode> nodeLocker(vnode);
1710 
			// the locking could have been changed in the meantime
1712 			if (locking->locks.IsEmpty()) {
1713 				vnode->advisory_locking = NULL;
1714 				nodeLocker.Unlock();
1715 				locker.Unlock();
1716 
1717 				// we've detached the locking from the vnode, so we can
1718 				// safely delete it
1719 				delete locking;
1720 			} else {
1721 				// the locking is in use again
1722 				nodeLocker.Unlock();
1723 				locker.Unlock();
1724 				release_sem_etc(locking->lock, 1, B_DO_NOT_RESCHEDULE);
1725 			}
1726 		}
1727 	}
1728 
1729 	return B_OK;
1730 }
1731 
1732 
/*!	Acquires an advisory lock for the \a vnode. If \a wait is \c true, it
	will wait for the lock to become available if there are any collisions;
	otherwise it fails right away, with B_PERMISSION_DENIED for POSIX-style
	locks and B_WOULD_BLOCK for flock()-style locks.

	If \a session is -1, POSIX semantics are used for this lock. Otherwise,
	BSD flock() semantics are used, that is, all children can unlock the file
	in question (we even allow parents to remove the lock, though that
	seems to be in line with what the BSDs are doing).
*/
1742 static status_t
1743 acquire_advisory_lock(struct vnode* vnode, pid_t session, struct flock* flock,
1744 	bool wait)
1745 {
1746 	FUNCTION(("acquire_advisory_lock(vnode = %p, flock = %p, wait = %s)\n",
1747 		vnode, flock, wait ? "yes" : "no"));
1748 
1749 	bool shared = flock->l_type == F_RDLCK;
1750 	status_t status = B_OK;
1751 
1752 	// TODO: do deadlock detection!
1753 
1754 	struct advisory_locking* locking;
1755 
1756 	while (true) {
1757 		// if this vnode has an advisory_locking structure attached,
1758 		// lock that one and search for any colliding file lock
1759 		status = create_advisory_locking(vnode);
1760 		if (status != B_OK)
1761 			return status;
1762 
1763 		locking = vnode->advisory_locking;
1764 		team_id team = team_get_current_team_id();
1765 		sem_id waitForLock = -1;
1766 
1767 		// test for collisions
1768 		LockList::Iterator iterator = locking->locks.GetIterator();
1769 		while (iterator.HasNext()) {
1770 			struct advisory_lock* lock = iterator.Next();
1771 
1772 			// TODO: locks from the same team might be joinable!
1773 			if (lock->team != team && advisory_lock_intersects(lock, flock)) {
1774 				// locks do overlap
1775 				if (!shared || !lock->shared) {
1776 					// we need to wait
1777 					waitForLock = locking->wait_sem;
1778 					break;
1779 				}
1780 			}
1781 		}
1782 
1783 		if (waitForLock < 0)
1784 			break;
1785 
1786 		// We need to wait. Do that or fail now, if we've been asked not to.
1787 
1788 		if (!wait) {
1789 			put_advisory_locking(locking);
1790 			return session != -1 ? B_WOULD_BLOCK : B_PERMISSION_DENIED;
1791 		}
1792 
1793 		status = switch_sem_etc(locking->lock, waitForLock, 1,
1794 			B_CAN_INTERRUPT, 0);
1795 		if (status != B_OK && status != B_BAD_SEM_ID)
1796 			return status;
1797 
1798 		// We have been notified, but we need to re-lock the locking object. So
1799 		// go another round...
1800 	}
1801 
1802 	// install new lock
1803 
1804 	struct advisory_lock* lock = (struct advisory_lock*)malloc(
1805 		sizeof(struct advisory_lock));
1806 	if (lock == NULL) {
1807 		put_advisory_locking(locking);
1808 		return B_NO_MEMORY;
1809 	}
1810 
1811 	lock->team = team_get_current_team_id();
1812 	lock->session = session;
1813 	// values must already be normalized when getting here
1814 	lock->start = flock->l_start;
1815 	lock->end = flock->l_start - 1 + flock->l_len;
1816 	lock->shared = shared;
1817 
1818 	locking->locks.Add(lock);
1819 	put_advisory_locking(locking);
1820 
1821 	return status;
1822 }
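
// Illustrative mapping (an assumption based on the doc comment above): a
// POSIX fcntl(F_SETLKW)-style request would arrive here as
// acquire_advisory_lock(vnode, -1, &flock, true), while a BSD flock()-style
// caller passes its session ID and, when not waiting, gets B_WOULD_BLOCK
// instead of B_PERMISSION_DENIED on a collision.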
1823 
1824 
1825 /*!	Normalizes the \a flock structure to make it easier to compare the
1826 	structure with others. The l_start and l_len fields are set to absolute
1827 	values according to the l_whence field.
1828 */
1829 static status_t
1830 normalize_flock(struct file_descriptor* descriptor, struct flock* flock)
1831 {
1832 	switch (flock->l_whence) {
1833 		case SEEK_SET:
1834 			break;
1835 		case SEEK_CUR:
1836 			flock->l_start += descriptor->pos;
1837 			break;
1838 		case SEEK_END:
1839 		{
1840 			struct vnode* vnode = descriptor->u.vnode;
1841 			struct stat stat;
1842 			status_t status;
1843 
1844 			if (!HAS_FS_CALL(vnode, read_stat))
1845 				return B_UNSUPPORTED;
1846 
1847 			status = FS_CALL(vnode, read_stat, &stat);
1848 			if (status != B_OK)
1849 				return status;
1850 
1851 			flock->l_start += stat.st_size;
1852 			break;
1853 		}
1854 		default:
1855 			return B_BAD_VALUE;
1856 	}
1857 
1858 	if (flock->l_start < 0)
1859 		flock->l_start = 0;
1860 	if (flock->l_len == 0)
1861 		flock->l_len = OFF_MAX;
1862 
1863 	// don't let the offset and length overflow
1864 	if (flock->l_start > 0 && OFF_MAX - flock->l_start < flock->l_len)
1865 		flock->l_len = OFF_MAX - flock->l_start;
1866 
1867 	if (flock->l_len < 0) {
1868 		// a negative length reverses the region
1869 		flock->l_start += flock->l_len;
1870 		flock->l_len = -flock->l_len;
1871 	}
1872 
1873 	return B_OK;
1874 }
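
// Worked example (illustrative): with SEEK_CUR, descriptor->pos = 200 and
// l_start = -50, l_start becomes 150. An l_len of 0 expands to OFF_MAX
// ("lock to the end of the file"). A negative length such as l_start = 150,
// l_len = -100 is reversed into l_start = 50, l_len = 100.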
1875 
1876 
1877 static void
1878 replace_vnode_if_disconnected(struct fs_mount* mount,
1879 	struct vnode* vnodeToDisconnect, struct vnode*& vnode,
1880 	struct vnode* fallBack, bool lockRootLock)
1881 {
1882 	struct vnode* givenVnode = vnode;
1883 	bool vnodeReplaced = false;
1884 
1885 	ReadLocker vnodeReadLocker(sVnodeLock);
1886 
1887 	if (lockRootLock)
1888 		mutex_lock(&sIOContextRootLock);
1889 
1890 	while (vnode != NULL && vnode->mount == mount
1891 		&& (vnodeToDisconnect == NULL || vnodeToDisconnect == vnode)) {
1892 		if (vnode->covers != NULL) {
1893 			// redirect the vnode to the covered vnode
1894 			vnode = vnode->covers;
1895 		} else
1896 			vnode = fallBack;
1897 
1898 		vnodeReplaced = true;
1899 	}
1900 
1901 	// If we've replaced the node, grab a reference for the new one.
1902 	if (vnodeReplaced && vnode != NULL)
1903 		inc_vnode_ref_count(vnode);
1904 
1905 	if (lockRootLock)
1906 		mutex_unlock(&sIOContextRootLock);
1907 
1908 	vnodeReadLocker.Unlock();
1909 
1910 	if (vnodeReplaced)
1911 		put_vnode(givenVnode);
1912 }
1913 
1914 
1915 /*!	Disconnects all file descriptors that are associated with the
1916 	\a vnodeToDisconnect, or if this is NULL, all vnodes of the specified
1917 	\a mount object.
1918 
	Note that after you've called this function there might still be ongoing
	accesses - they won't be interrupted if they were already in progress.
1921 	However, any subsequent access will fail.
1922 
1923 	This is not a cheap function and should be used with care and rarely.
1924 	TODO: there is currently no means to stop a blocking read/write!
1925 */
1926 static void
1927 disconnect_mount_or_vnode_fds(struct fs_mount* mount,
1928 	struct vnode* vnodeToDisconnect)
1929 {
1930 	// iterate over all teams and peek into their file descriptors
1931 	TeamListIterator teamIterator;
1932 	while (Team* team = teamIterator.Next()) {
1933 		BReference<Team> teamReference(team, true);
1934 		TeamLocker teamLocker(team);
1935 
1936 		// lock the I/O context
1937 		io_context* context = team->io_context;
1938 		if (context == NULL)
1939 			continue;
1940 		MutexLocker contextLocker(context->io_mutex);
1941 
1942 		teamLocker.Unlock();
1943 
1944 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->root,
1945 			sRoot, true);
1946 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->cwd,
1947 			sRoot, false);
1948 
1949 		for (uint32 i = 0; i < context->table_size; i++) {
1950 			if (struct file_descriptor* descriptor = context->fds[i]) {
1951 				inc_fd_ref_count(descriptor);
1952 
1953 				// if this descriptor points at this mount, we
1954 				// need to disconnect it to be able to unmount
1955 				struct vnode* vnode = fd_vnode(descriptor);
1956 				if (vnodeToDisconnect != NULL) {
1957 					if (vnode == vnodeToDisconnect)
1958 						disconnect_fd(descriptor);
1959 				} else if ((vnode != NULL && vnode->mount == mount)
1960 					|| (vnode == NULL && descriptor->u.mount == mount))
1961 					disconnect_fd(descriptor);
1962 
1963 				put_fd(descriptor);
1964 			}
1965 		}
1966 	}
1967 }
1968 
1969 
1970 /*!	\brief Gets the root node of the current IO context.
1971 	If \a kernel is \c true, the kernel IO context will be used.
1972 	The caller obtains a reference to the returned node.
1973 */
1974 struct vnode*
1975 get_root_vnode(bool kernel)
1976 {
1977 	if (!kernel) {
		// Get the root of the current team's IO context
1979 		struct io_context* context = get_current_io_context(kernel);
1980 
1981 		mutex_lock(&sIOContextRootLock);
1982 
1983 		struct vnode* root = context->root;
1984 		if (root != NULL)
1985 			inc_vnode_ref_count(root);
1986 
1987 		mutex_unlock(&sIOContextRootLock);
1988 
1989 		if (root != NULL)
1990 			return root;
1991 
1992 		// That should never happen.
1993 		dprintf("get_root_vnode(): IO context for team %" B_PRId32 " doesn't "
1994 			"have a root\n", team_get_current_team_id());
1995 	}
1996 
1997 	inc_vnode_ref_count(sRoot);
1998 	return sRoot;
1999 }
2000 
2001 
2002 /*!	\brief Gets the directory path and leaf name for a given path.
2003 
2004 	The supplied \a path is transformed to refer to the directory part of
2005 	the entry identified by the original path, and into the buffer \a filename
2006 	the leaf name of the original entry is written.
2007 	Neither the returned path nor the leaf name can be expected to be
2008 	canonical.
2009 
2010 	\param path The path to be analyzed. Must be able to store at least one
2011 		   additional character.
2012 	\param filename The buffer into which the leaf name will be written.
2013 		   Must be of size B_FILE_NAME_LENGTH at least.
2014 	\return \c B_OK, if everything went fine, \c B_NAME_TOO_LONG, if the leaf
2015 		   name is longer than \c B_FILE_NAME_LENGTH, or \c B_ENTRY_NOT_FOUND,
2016 		   if the given path name is empty.
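	For example (illustrative): "/a/b/c" is transformed into "/a/b/." with
	leaf name "c", "foo" into "." with leaf name "foo", and "a/b/" into
	"a/." with leaf name "b".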
2017 */
2018 static status_t
2019 get_dir_path_and_leaf(char* path, char* filename)
2020 {
2021 	if (*path == '\0')
2022 		return B_ENTRY_NOT_FOUND;
2023 
2024 	char* last = strrchr(path, '/');
2025 		// '/' are not allowed in file names!
2026 
2027 	FUNCTION(("get_dir_path_and_leaf(path = %s)\n", path));
2028 
2029 	if (last == NULL) {
		// this path is a single segment with no '/' in it
2031 		// ex. "foo"
2032 		if (strlcpy(filename, path, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2033 			return B_NAME_TOO_LONG;
2034 
2035 		strcpy(path, ".");
2036 	} else {
2037 		last++;
2038 		if (last[0] == '\0') {
2039 			// special case: the path ends in one or more '/' - remove them
2040 			while (*--last == '/' && last != path);
2041 			last[1] = '\0';
2042 
2043 			if (last == path && last[0] == '/') {
2044 				// This path points to the root of the file system
2045 				strcpy(filename, ".");
2046 				return B_OK;
2047 			}
2048 			for (; last != path && *(last - 1) != '/'; last--);
2049 				// rewind to the start of the leaf before the '/'
2050 		}
2051 
2052 		// normal leaf: replace the leaf portion of the path with a '.'
2053 		if (strlcpy(filename, last, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2054 			return B_NAME_TOO_LONG;
2055 
2056 		last[0] = '.';
2057 		last[1] = '\0';
2058 	}
2059 	return B_OK;
2060 }
2061 
2062 
2063 static status_t
2064 entry_ref_to_vnode(dev_t mountID, ino_t directoryID, const char* name,
2065 	bool traverse, bool kernel, struct vnode** _vnode)
2066 {
2067 	char clonedName[B_FILE_NAME_LENGTH + 1];
2068 	if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2069 		return B_NAME_TOO_LONG;
2070 
2071 	// get the directory vnode and let vnode_path_to_vnode() do the rest
2072 	struct vnode* directory;
2073 
2074 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
2075 	if (status < 0)
2076 		return status;
2077 
2078 	return vnode_path_to_vnode(directory, clonedName, traverse, 0, kernel,
2079 		_vnode, NULL);
2080 }
2081 
2082 
2083 /*!	Looks up the entry with name \a name in the directory represented by \a dir
2084 	and returns the respective vnode.
2085 	On success a reference to the vnode is acquired for the caller.
2086 */
2087 static status_t
2088 lookup_dir_entry(struct vnode* dir, const char* name, struct vnode** _vnode)
2089 {
2090 	ino_t id;
2091 	bool missing;
2092 
2093 	if (dir->mount->entry_cache.Lookup(dir->id, name, id, missing)) {
2094 		return missing ? B_ENTRY_NOT_FOUND
2095 			: get_vnode(dir->device, id, _vnode, true, false);
2096 	}
2097 
2098 	status_t status = FS_CALL(dir, lookup, name, &id);
2099 	if (status != B_OK)
2100 		return status;
2101 
	// The lookup() hook calls get_vnode() or publish_vnode(), so we already
	// have a reference and just need to look the node up.
2104 	rw_lock_read_lock(&sVnodeLock);
2105 	*_vnode = lookup_vnode(dir->device, id);
2106 	rw_lock_read_unlock(&sVnodeLock);
2107 
2108 	if (*_vnode == NULL) {
2109 		panic("lookup_dir_entry(): could not lookup vnode (mountid 0x%" B_PRIx32
2110 			" vnid 0x%" B_PRIx64 ")\n", dir->device, id);
2111 		return B_ENTRY_NOT_FOUND;
2112 	}
2113 
2114 //	ktrace_printf("lookup_dir_entry(): dir: %p (%ld, %lld), name: \"%s\" -> "
2115 //		"%p (%ld, %lld)", dir, dir->mount->id, dir->id, name, *_vnode,
2116 //		(*_vnode)->mount->id, (*_vnode)->id);
2117 
2118 	return B_OK;
2119 }
2120 
2121 
2122 /*!	Returns the vnode for the relative path starting at the specified \a vnode.
2123 	\a path must not be NULL.
	If it returns successfully, \a path contains the name of the last path
	component. This function clobbers the buffer pointed to by \a path only
	if it contains more than one component.
	Note that this function always decrements the ref_count of the starting
	\a vnode, whether it succeeds or fails!
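	For example (illustrative): given the path "a/b/link" where "link" is a
	symlink, the vnode of the symlink itself is returned if
	\a traverseLeafLink is \c false, while its target is resolved (following
	up to \c B_MAX_SYMLINKS nested links) if it is \c true.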
2129 */
2130 static status_t
2131 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2132 	int count, struct io_context* ioContext, struct vnode** _vnode,
2133 	ino_t* _parentID)
2134 {
2135 	status_t status = B_OK;
2136 	ino_t lastParentID = vnode->id;
2137 
2138 	FUNCTION(("vnode_path_to_vnode(vnode = %p, path = %s)\n", vnode, path));
2139 
2140 	if (path == NULL) {
2141 		put_vnode(vnode);
2142 		return B_BAD_VALUE;
2143 	}
2144 
2145 	if (*path == '\0') {
2146 		put_vnode(vnode);
2147 		return B_ENTRY_NOT_FOUND;
2148 	}
2149 
2150 	while (true) {
2151 		struct vnode* nextVnode;
2152 		char* nextPath;
2153 
		TRACE(("vnode_path_to_vnode: top of loop. path = %p, path = '%s'\n",
			path, path));
2156 
2157 		// done?
2158 		if (path[0] == '\0')
2159 			break;
2160 
2161 		// walk to find the next path component ("path" will point to a single
2162 		// path component), and filter out multiple slashes
2163 		for (nextPath = path + 1; *nextPath != '\0' && *nextPath != '/';
2164 				nextPath++);
2165 
2166 		if (*nextPath == '/') {
2167 			*nextPath = '\0';
2168 			do
2169 				nextPath++;
2170 			while (*nextPath == '/');
2171 		}
2172 
		// If the current component is ".." and we are at a covering vnode,
		// move to the covered vnode, so the ".." is passed to the
		// underlying file system. Also prevent breaking out of the root of
		// the IO context.
2176 		if (strcmp("..", path) == 0) {
2177 			if (vnode == ioContext->root) {
2178 				// Attempted prison break! Keep it contained.
2179 				path = nextPath;
2180 				continue;
2181 			}
2182 
2183 			if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2184 				nextVnode = coveredVnode;
2185 				put_vnode(vnode);
2186 				vnode = nextVnode;
2187 			}
2188 		}
2189 
2190 		// check if vnode is really a directory
2191 		if (status == B_OK && !S_ISDIR(vnode->Type()))
2192 			status = B_NOT_A_DIRECTORY;
2193 
2194 		// Check if we have the right to search the current directory vnode.
2195 		// If a file system doesn't have the access() function, we assume that
2196 		// searching a directory is always allowed
2197 		if (status == B_OK && HAS_FS_CALL(vnode, access))
2198 			status = FS_CALL(vnode, access, X_OK);
2199 
2200 		// Tell the filesystem to get the vnode of this path component (if we
2201 		// got the permission from the call above)
2202 		if (status == B_OK)
2203 			status = lookup_dir_entry(vnode, path, &nextVnode);
2204 
2205 		if (status != B_OK) {
2206 			put_vnode(vnode);
2207 			return status;
2208 		}
2209 
2210 		// If the new node is a symbolic link, resolve it (if we've been told
2211 		// to do it)
2212 		if (S_ISLNK(nextVnode->Type())
2213 			&& (traverseLeafLink || nextPath[0] != '\0')) {
2214 			size_t bufferSize;
2215 			char* buffer;
2216 
2217 			TRACE(("traverse link\n"));
2218 
2219 			// it's not exactly nice style using goto in this way, but hey,
2220 			// it works :-/
2221 			if (count + 1 > B_MAX_SYMLINKS) {
2222 				status = B_LINK_LIMIT;
2223 				goto resolve_link_error;
2224 			}
2225 
2226 			buffer = (char*)malloc(bufferSize = B_PATH_NAME_LENGTH);
2227 			if (buffer == NULL) {
2228 				status = B_NO_MEMORY;
2229 				goto resolve_link_error;
2230 			}
2231 
2232 			if (HAS_FS_CALL(nextVnode, read_symlink)) {
2233 				bufferSize--;
2234 				status = FS_CALL(nextVnode, read_symlink, buffer, &bufferSize);
2235 				// null-terminate
2236 				if (status >= 0)
2237 					buffer[bufferSize] = '\0';
2238 			} else
2239 				status = B_BAD_VALUE;
2240 
2241 			if (status != B_OK) {
2242 				free(buffer);
2243 
2244 		resolve_link_error:
2245 				put_vnode(vnode);
2246 				put_vnode(nextVnode);
2247 
2248 				return status;
2249 			}
2250 			put_vnode(nextVnode);
2251 
2252 			// Check if we start from the root directory or the current
2253 			// directory ("vnode" still points to that one).
2254 			// Cut off all leading slashes if it's the root directory
2255 			path = buffer;
2256 			bool absoluteSymlink = false;
2257 			if (path[0] == '/') {
2258 				// we don't need the old directory anymore
2259 				put_vnode(vnode);
2260 
2261 				while (*++path == '/')
2262 					;
2263 
2264 				mutex_lock(&sIOContextRootLock);
2265 				vnode = ioContext->root;
2266 				inc_vnode_ref_count(vnode);
2267 				mutex_unlock(&sIOContextRootLock);
2268 
2269 				absoluteSymlink = true;
2270 			}
2271 
2272 			inc_vnode_ref_count(vnode);
2273 				// balance the next recursion - we will decrement the
2274 				// ref_count of the vnode, no matter if we succeeded or not
2275 
2276 			if (absoluteSymlink && *path == '\0') {
2277 				// symlink was just "/"
2278 				nextVnode = vnode;
2279 			} else {
2280 				status = vnode_path_to_vnode(vnode, path, true, count + 1,
2281 					ioContext, &nextVnode, &lastParentID);
2282 			}
2283 
2284 			free(buffer);
2285 
2286 			if (status != B_OK) {
2287 				put_vnode(vnode);
2288 				return status;
2289 			}
2290 		} else
2291 			lastParentID = vnode->id;
2292 
2293 		// decrease the ref count on the old dir we just looked up into
2294 		put_vnode(vnode);
2295 
2296 		path = nextPath;
2297 		vnode = nextVnode;
2298 
2299 		// see if we hit a covered node
2300 		if (Vnode* coveringNode = get_covering_vnode(vnode)) {
2301 			put_vnode(vnode);
2302 			vnode = coveringNode;
2303 		}
2304 	}
2305 
2306 	*_vnode = vnode;
2307 	if (_parentID)
2308 		*_parentID = lastParentID;
2309 
2310 	return B_OK;
2311 }
2312 
2313 
2314 static status_t
2315 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2316 	int count, bool kernel, struct vnode** _vnode, ino_t* _parentID)
2317 {
2318 	return vnode_path_to_vnode(vnode, path, traverseLeafLink, count,
2319 		get_current_io_context(kernel), _vnode, _parentID);
2320 }
2321 
2322 
2323 static status_t
2324 path_to_vnode(char* path, bool traverseLink, struct vnode** _vnode,
2325 	ino_t* _parentID, bool kernel)
2326 {
2327 	struct vnode* start = NULL;
2328 
2329 	FUNCTION(("path_to_vnode(path = \"%s\")\n", path));
2330 
2331 	if (!path)
2332 		return B_BAD_VALUE;
2333 
2334 	if (*path == '\0')
2335 		return B_ENTRY_NOT_FOUND;
2336 
2337 	// figure out if we need to start at root or at cwd
2338 	if (*path == '/') {
2339 		if (sRoot == NULL) {
2340 			// we're a bit early, aren't we?
2341 			return B_ERROR;
2342 		}
2343 
2344 		while (*++path == '/')
2345 			;
2346 		start = get_root_vnode(kernel);
2347 
2348 		if (*path == '\0') {
2349 			*_vnode = start;
2350 			return B_OK;
2351 		}
2352 
2353 	} else {
2354 		struct io_context* context = get_current_io_context(kernel);
2355 
2356 		mutex_lock(&context->io_mutex);
2357 		start = context->cwd;
2358 		if (start != NULL)
2359 			inc_vnode_ref_count(start);
2360 		mutex_unlock(&context->io_mutex);
2361 
2362 		if (start == NULL)
2363 			return B_ERROR;
2364 	}
2365 
2366 	return vnode_path_to_vnode(start, path, traverseLink, 0, kernel, _vnode,
2367 		_parentID);
2368 }
2369 
2370 
/*! Returns the vnode of the next to last segment of the path, and returns
	the last path component in \a filename.
2373 	The path buffer must be able to store at least one additional character.
2374 */
2375 static status_t
2376 path_to_dir_vnode(char* path, struct vnode** _vnode, char* filename,
2377 	bool kernel)
2378 {
2379 	status_t status = get_dir_path_and_leaf(path, filename);
2380 	if (status != B_OK)
2381 		return status;
2382 
2383 	return path_to_vnode(path, true, _vnode, NULL, kernel);
2384 }
2385 
2386 
2387 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2388 		   to by a FD + path pair.
2389 
2390 	\a path must be given in either case. \a fd might be omitted, in which
2391 	case \a path is either an absolute path or one relative to the current
	directory. If both are supplied and \a path is relative, it is reckoned
	off of the directory referred to by \a fd. If \a path is absolute,
	\a fd is ignored.
2395 
2396 	The caller has the responsibility to call put_vnode() on the returned
2397 	directory vnode.
2398 
2399 	\param fd The FD. May be < 0.
2400 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2401 	       is modified by this function. It must have at least room for a
2402 	       string one character longer than the path it contains.
2403 	\param _vnode A pointer to a variable the directory vnode shall be written
2404 		   into.
2405 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2406 		   the leaf name of the specified entry will be written.
2407 	\param kernel \c true, if invoked from inside the kernel, \c false if
2408 		   invoked from userland.
2409 	\return \c B_OK, if everything went fine, another error code otherwise.
2410 */
2411 static status_t
2412 fd_and_path_to_dir_vnode(int fd, char* path, struct vnode** _vnode,
2413 	char* filename, bool kernel)
2414 {
2415 	if (!path)
2416 		return B_BAD_VALUE;
2417 	if (*path == '\0')
2418 		return B_ENTRY_NOT_FOUND;
2419 	if (fd < 0)
2420 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2421 
2422 	status_t status = get_dir_path_and_leaf(path, filename);
2423 	if (status != B_OK)
2424 		return status;
2425 
2426 	return fd_and_path_to_vnode(fd, path, true, _vnode, NULL, kernel);
2427 }
2428 
2429 
2430 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2431 		   to by a vnode + path pair.
2432 
2433 	\a path must be given in either case. \a vnode might be omitted, in which
2434 	case \a path is either an absolute path or one relative to the current
	directory. If both are supplied and \a path is relative, it is reckoned
	off of the directory referred to by \a vnode. If \a path is absolute,
	\a vnode is ignored.
2438 
2439 	The caller has the responsibility to call put_vnode() on the returned
2440 	directory vnode.
2441 
2442 	\param vnode The vnode. May be \c NULL.
2443 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2444 	       is modified by this function. It must have at least room for a
2445 	       string one character longer than the path it contains.
2446 	\param _vnode A pointer to a variable the directory vnode shall be written
2447 		   into.
2448 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2449 		   the leaf name of the specified entry will be written.
2450 	\param kernel \c true, if invoked from inside the kernel, \c false if
2451 		   invoked from userland.
2452 	\return \c B_OK, if everything went fine, another error code otherwise.
2453 */
2454 static status_t
2455 vnode_and_path_to_dir_vnode(struct vnode* vnode, char* path,
2456 	struct vnode** _vnode, char* filename, bool kernel)
2457 {
2458 	if (!path)
2459 		return B_BAD_VALUE;
2460 	if (*path == '\0')
2461 		return B_ENTRY_NOT_FOUND;
2462 	if (vnode == NULL || path[0] == '/')
2463 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2464 
2465 	status_t status = get_dir_path_and_leaf(path, filename);
2466 	if (status != B_OK)
2467 		return status;
2468 
2469 	inc_vnode_ref_count(vnode);
2470 		// vnode_path_to_vnode() always decrements the ref count
2471 
2472 	return vnode_path_to_vnode(vnode, path, true, 0, kernel, _vnode, NULL);
2473 }
2474 
2475 
2476 /*! Returns a vnode's name in the d_name field of a supplied dirent buffer.
2477 */
2478 static status_t
2479 get_vnode_name(struct vnode* vnode, struct vnode* parent, struct dirent* buffer,
2480 	size_t bufferSize, struct io_context* ioContext)
2481 {
2482 	if (bufferSize < sizeof(struct dirent))
2483 		return B_BAD_VALUE;
2484 
2485 	// See if the vnode is covering another vnode and move to the covered
2486 	// vnode so we get the underlying file system
2487 	VNodePutter vnodePutter;
2488 	if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2489 		vnode = coveredVnode;
2490 		vnodePutter.SetTo(vnode);
2491 	}
2492 
2493 	if (HAS_FS_CALL(vnode, get_vnode_name)) {
2494 		// The FS supports getting the name of a vnode.
2495 		if (FS_CALL(vnode, get_vnode_name, buffer->d_name,
2496 			(char*)buffer + bufferSize - buffer->d_name) == B_OK)
2497 			return B_OK;
2498 	}
2499 
2500 	// The FS doesn't support getting the name of a vnode. So we search the
2501 	// parent directory for the vnode, if the caller let us.
2502 
2503 	if (parent == NULL || !HAS_FS_CALL(parent, read_dir))
2504 		return B_UNSUPPORTED;
2505 
2506 	void* cookie;
2507 
2508 	status_t status = FS_CALL(parent, open_dir, &cookie);
2509 	if (status >= B_OK) {
2510 		while (true) {
2511 			uint32 num = 1;
2512 			// We use the FS hook directly instead of dir_read(), since we don't
2513 			// want the entries to be fixed. We have already resolved vnode to
2514 			// the covered node.
2515 			status = FS_CALL(parent, read_dir, cookie, buffer, bufferSize,
2516 				&num);
2517 			if (status != B_OK)
2518 				break;
2519 			if (num == 0) {
2520 				status = B_ENTRY_NOT_FOUND;
2521 				break;
2522 			}
2523 
2524 			if (vnode->id == buffer->d_ino) {
2525 				// found correct entry!
2526 				break;
2527 			}
2528 		}
2529 
2530 		FS_CALL(parent, close_dir, cookie);
2531 		FS_CALL(parent, free_dir_cookie, cookie);
2532 	}
2533 	return status;
2534 }
2535 
2536 
2537 static status_t
2538 get_vnode_name(struct vnode* vnode, struct vnode* parent, char* name,
2539 	size_t nameSize, bool kernel)
2540 {
2541 	char buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2542 	struct dirent* dirent = (struct dirent*)buffer;
2543 
2544 	status_t status = get_vnode_name(vnode, parent, dirent, sizeof(buffer),
2545 		get_current_io_context(kernel));
2546 	if (status != B_OK)
2547 		return status;
2548 
2549 	if (strlcpy(name, dirent->d_name, nameSize) >= nameSize)
2550 		return B_BUFFER_OVERFLOW;
2551 
2552 	return B_OK;
2553 }
2554 
2555 
2556 /*!	Gets the full path to a given directory vnode.
2557 	It uses the fs_get_vnode_name() call to get the name of a vnode; if a
2558 	file system doesn't support this call, it will fall back to iterating
2559 	through the parent directory to get the name of the child.
2560 
2561 	To protect against circular loops, it supports a maximum tree depth
2562 	of 256 levels.
2563 
	Note that the path may no longer be correct by the time this function
	returns! It doesn't use any locking to guarantee that the returned path
	is correct, as paths aren't stable anyway: the path to a file can change
	at any time.

	It might be a good idea, though, for the calling function to check
	whether the returned path actually exists (it's not done here for
	efficiency reasons).
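
	For example (illustrative): for a vnode at "/boot/home" the loop below
	first writes "home" at the end of the buffer, then "boot", prepending a
	'/' before each component, i.e. the path is assembled right to left.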
2570 */
2571 static status_t
2572 dir_vnode_to_path(struct vnode* vnode, char* buffer, size_t bufferSize,
2573 	bool kernel)
2574 {
2575 	FUNCTION(("dir_vnode_to_path(%p, %p, %lu)\n", vnode, buffer, bufferSize));
2576 
2577 	if (vnode == NULL || buffer == NULL || bufferSize == 0)
2578 		return B_BAD_VALUE;
2579 
2580 	if (!S_ISDIR(vnode->Type()))
2581 		return B_NOT_A_DIRECTORY;
2582 
2583 	char* path = buffer;
2584 	int32 insert = bufferSize;
2585 	int32 maxLevel = 256;
2586 	int32 length;
2587 	status_t status = B_OK;
2588 	struct io_context* ioContext = get_current_io_context(kernel);
2589 
2590 	// we don't use get_vnode() here because this call is more
2591 	// efficient and does all we need from get_vnode()
2592 	inc_vnode_ref_count(vnode);
2593 
2594 	path[--insert] = '\0';
2595 		// the path is filled right to left
2596 
2597 	while (true) {
2598 		// If the node is the context's root, bail out. Otherwise resolve mount
2599 		// points.
2600 		if (vnode == ioContext->root)
2601 			break;
2602 
2603 		if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2604 			put_vnode(vnode);
2605 			vnode = coveredVnode;
2606 		}
2607 
2608 		// lookup the parent vnode
2609 		struct vnode* parentVnode;
2610 		status = lookup_dir_entry(vnode, "..", &parentVnode);
2611 		if (status != B_OK)
2612 			goto out;
2613 
2614 		if (parentVnode == vnode) {
2615 			// The caller apparently got their hands on a node outside of their
2616 			// context's root. Now we've hit the global root.
2617 			put_vnode(parentVnode);
2618 			break;
2619 		}
2620 
2621 		// get the node's name
2622 		char nameBuffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
2623 			// also used for fs_read_dir()
2624 		char* name = &((struct dirent*)nameBuffer)->d_name[0];
2625 		status = get_vnode_name(vnode, parentVnode, (struct dirent*)nameBuffer,
2626 			sizeof(nameBuffer), ioContext);
2627 
2628 		// release the current vnode, we only need its parent from now on
2629 		put_vnode(vnode);
2630 		vnode = parentVnode;
2631 
2632 		if (status != B_OK)
2633 			goto out;
2634 
2635 		// TODO: add an explicit check for loops in about 10 levels to do
2636 		// real loop detection
2637 
		// don't go deeper than 'maxLevel' levels, to prevent circular loops
2639 		if (maxLevel-- < 0) {
2640 			status = B_LINK_LIMIT;
2641 			goto out;
2642 		}
2643 
2644 		// add the name in front of the current path
2645 		name[B_FILE_NAME_LENGTH - 1] = '\0';
2646 		length = strlen(name);
2647 		insert -= length;
2648 		if (insert <= 0) {
2649 			status = B_RESULT_NOT_REPRESENTABLE;
2650 			goto out;
2651 		}
2652 		memcpy(path + insert, name, length);
2653 		path[--insert] = '/';
2654 	}
2655 
2656 	// the root dir will result in an empty path: fix it
2657 	if (path[insert] == '\0')
2658 		path[--insert] = '/';
2659 
2660 	TRACE(("  path is: %s\n", path + insert));
2661 
2662 	// move the path to the start of the buffer
2663 	length = bufferSize - insert;
2664 	memmove(buffer, path + insert, length);
2665 
2666 out:
2667 	put_vnode(vnode);
2668 	return status;
2669 }
2670 
2671 
2672 /*!	Checks the length of every path component, and adds a '.'
2673 	if the path ends in a slash.
2674 	The given path buffer must be able to store at least one
2675 	additional character.
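	For example (illustrative): "a/b/" is completed to "a/b/.", while "a/b"
	is left unchanged.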
2676 */
2677 static status_t
2678 check_path(char* to)
2679 {
2680 	int32 length = 0;
2681 
2682 	// check length of every path component
2683 
2684 	while (*to) {
2685 		char* begin;
2686 		if (*to == '/')
2687 			to++, length++;
2688 
2689 		begin = to;
2690 		while (*to != '/' && *to)
2691 			to++, length++;
2692 
2693 		if (to - begin > B_FILE_NAME_LENGTH)
2694 			return B_NAME_TOO_LONG;
2695 	}
2696 
2697 	if (length == 0)
2698 		return B_ENTRY_NOT_FOUND;
2699 
2700 	// complete path if there is a slash at the end
2701 
2702 	if (*(to - 1) == '/') {
2703 		if (length > B_PATH_NAME_LENGTH - 2)
2704 			return B_NAME_TOO_LONG;
2705 
2706 		to[0] = '.';
2707 		to[1] = '\0';
2708 	}
2709 
2710 	return B_OK;
2711 }
2712 
2713 
2714 static struct file_descriptor*
2715 get_fd_and_vnode(int fd, struct vnode** _vnode, bool kernel)
2716 {
2717 	struct file_descriptor* descriptor
2718 		= get_fd(get_current_io_context(kernel), fd);
2719 	if (descriptor == NULL)
2720 		return NULL;
2721 
2722 	struct vnode* vnode = fd_vnode(descriptor);
2723 	if (vnode == NULL) {
2724 		put_fd(descriptor);
2725 		return NULL;
2726 	}
2727 
	// TODO: when we can close a file descriptor at any point, investigate
2729 	//	if this is still valid to do (accessing the vnode without ref_count
2730 	//	or locking)
2731 	*_vnode = vnode;
2732 	return descriptor;
2733 }
2734 
2735 
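/*!	Returns the vnode the given FD refers to with an extra reference
	acquired for the caller, or \c NULL if the FD is invalid or does not
	refer to a vnode.
*/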
2736 static struct vnode*
2737 get_vnode_from_fd(int fd, bool kernel)
2738 {
2739 	struct file_descriptor* descriptor;
2740 	struct vnode* vnode;
2741 
2742 	descriptor = get_fd(get_current_io_context(kernel), fd);
2743 	if (descriptor == NULL)
2744 		return NULL;
2745 
2746 	vnode = fd_vnode(descriptor);
2747 	if (vnode != NULL)
2748 		inc_vnode_ref_count(vnode);
2749 
2750 	put_fd(descriptor);
2751 	return vnode;
2752 }
2753 
2754 
2755 /*!	Gets the vnode from an FD + path combination. If \a fd is lower than zero,
2756 	only the path will be considered. In this case, the \a path must not be
2757 	NULL.
2758 	If \a fd is a valid file descriptor, \a path may be NULL for directories,
2759 	and should be NULL for files.
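	For example (illustrative): given an FD referring to a directory and
	the relative path "sub/file", the vnode of "file" within that directory
	is returned; given \a fd < 0 and the path "/tmp/file", the path alone
	is resolved.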
2760 */
2761 static status_t
2762 fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
2763 	struct vnode** _vnode, ino_t* _parentID, bool kernel)
2764 {
2765 	if (fd < 0 && !path)
2766 		return B_BAD_VALUE;
2767 
2768 	if (path != NULL && *path == '\0')
2769 		return B_ENTRY_NOT_FOUND;
2770 
2771 	if (fd < 0 || (path != NULL && path[0] == '/')) {
2772 		// no FD or absolute path
2773 		return path_to_vnode(path, traverseLeafLink, _vnode, _parentID, kernel);
2774 	}
2775 
2776 	// FD only, or FD + relative path
2777 	struct vnode* vnode = get_vnode_from_fd(fd, kernel);
2778 	if (vnode == NULL)
2779 		return B_FILE_ERROR;
2780 
2781 	if (path != NULL) {
2782 		return vnode_path_to_vnode(vnode, path, traverseLeafLink, 0, kernel,
2783 			_vnode, _parentID);
2784 	}
2785 
2786 	// there is no relative path to take into account
2787 
2788 	*_vnode = vnode;
2789 	if (_parentID)
2790 		*_parentID = -1;
2791 
2792 	return B_OK;
2793 }
2794 
2795 
2796 static int
2797 get_new_fd(int type, struct fs_mount* mount, struct vnode* vnode,
2798 	void* cookie, int openMode, bool kernel)
2799 {
2800 	struct file_descriptor* descriptor;
2801 	int fd;
2802 
2803 	// If the vnode is locked, we don't allow creating a new file/directory
2804 	// file_descriptor for it
2805 	if (vnode && vnode->mandatory_locked_by != NULL
2806 		&& (type == FDTYPE_FILE || type == FDTYPE_DIR))
2807 		return B_BUSY;
2808 
2809 	descriptor = alloc_fd();
2810 	if (!descriptor)
2811 		return B_NO_MEMORY;
2812 
2813 	if (vnode)
2814 		descriptor->u.vnode = vnode;
2815 	else
2816 		descriptor->u.mount = mount;
2817 	descriptor->cookie = cookie;
2818 
2819 	switch (type) {
2820 		// vnode types
2821 		case FDTYPE_FILE:
2822 			descriptor->ops = &sFileOps;
2823 			break;
2824 		case FDTYPE_DIR:
2825 			descriptor->ops = &sDirectoryOps;
2826 			break;
2827 		case FDTYPE_ATTR:
2828 			descriptor->ops = &sAttributeOps;
2829 			break;
2830 		case FDTYPE_ATTR_DIR:
2831 			descriptor->ops = &sAttributeDirectoryOps;
2832 			break;
2833 
2834 		// mount types
2835 		case FDTYPE_INDEX_DIR:
2836 			descriptor->ops = &sIndexDirectoryOps;
2837 			break;
2838 		case FDTYPE_QUERY:
2839 			descriptor->ops = &sQueryOps;
2840 			break;
2841 
2842 		default:
2843 			panic("get_new_fd() called with unknown type %d\n", type);
2844 			break;
2845 	}
2846 	descriptor->type = type;
2847 	descriptor->open_mode = openMode;
2848 
2849 	io_context* context = get_current_io_context(kernel);
2850 	fd = new_fd(context, descriptor);
2851 	if (fd < 0) {
2852 		free(descriptor);
2853 		return B_NO_MORE_FDS;
2854 	}
2855 
2856 	mutex_lock(&context->io_mutex);
2857 	fd_set_close_on_exec(context, fd, (openMode & O_CLOEXEC) != 0);
2858 	mutex_unlock(&context->io_mutex);
2859 
2860 	return fd;
2861 }
2862 
2863 
2864 /*!	In-place normalizes \a path. It's otherwise semantically equivalent to
2865 	vfs_normalize_path(). See there for more documentation.
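	For example (illustrative, assuming no symlinks are involved),
	"/boot/./home/../home" is normalized to "/boot/home".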
2866 */
2867 static status_t
2868 normalize_path(char* path, size_t pathSize, bool traverseLink, bool kernel)
2869 {
2870 	VNodePutter dirPutter;
2871 	struct vnode* dir = NULL;
2872 	status_t error;
2873 
2874 	for (int i = 0; i < B_MAX_SYMLINKS; i++) {
2875 		// get dir vnode + leaf name
2876 		struct vnode* nextDir;
2877 		char leaf[B_FILE_NAME_LENGTH];
2878 		error = vnode_and_path_to_dir_vnode(dir, path, &nextDir, leaf, kernel);
2879 		if (error != B_OK)
2880 			return error;
2881 
2882 		dir = nextDir;
2883 		strcpy(path, leaf);
2884 		dirPutter.SetTo(dir);
2885 
2886 		// get file vnode, if we shall resolve links
2887 		bool fileExists = false;
2888 		struct vnode* fileVnode;
2889 		VNodePutter fileVnodePutter;
2890 		if (traverseLink) {
2891 			inc_vnode_ref_count(dir);
2892 			if (vnode_path_to_vnode(dir, path, false, 0, kernel, &fileVnode,
2893 					NULL) == B_OK) {
2894 				fileVnodePutter.SetTo(fileVnode);
2895 				fileExists = true;
2896 			}
2897 		}
2898 
2899 		if (!fileExists || !traverseLink || !S_ISLNK(fileVnode->Type())) {
2900 			// we're done -- construct the path
2901 			bool hasLeaf = true;
2902 			if (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0) {
2903 				// special cases "." and ".." -- get the dir, forget the leaf
2904 				inc_vnode_ref_count(dir);
2905 				error = vnode_path_to_vnode(dir, leaf, false, 0, kernel,
2906 					&nextDir, NULL);
2907 				if (error != B_OK)
2908 					return error;
2909 				dir = nextDir;
2910 				dirPutter.SetTo(dir);
2911 				hasLeaf = false;
2912 			}
2913 
2914 			// get the directory path
2915 			error = dir_vnode_to_path(dir, path, B_PATH_NAME_LENGTH, kernel);
2916 			if (error != B_OK)
2917 				return error;
2918 
2919 			// append the leaf name
2920 			if (hasLeaf) {
2921 				// insert a directory separator if this is not the file system
2922 				// root
2923 				if ((strcmp(path, "/") != 0
2924 					&& strlcat(path, "/", pathSize) >= pathSize)
2925 					|| strlcat(path, leaf, pathSize) >= pathSize) {
2926 					return B_NAME_TOO_LONG;
2927 				}
2928 			}
2929 
2930 			return B_OK;
2931 		}
2932 
2933 		// read link
2934 		if (HAS_FS_CALL(fileVnode, read_symlink)) {
2935 			size_t bufferSize = B_PATH_NAME_LENGTH - 1;
2936 			error = FS_CALL(fileVnode, read_symlink, path, &bufferSize);
2937 			if (error != B_OK)
2938 				return error;
2939 			path[bufferSize] = '\0';
2940 		} else
2941 			return B_BAD_VALUE;
2942 	}
2943 
2944 	return B_LINK_LIMIT;
2945 }
2946 
2947 
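/*!	Determines the parent ("..") of the directory \a parent and returns its
	device and node IDs. If \a parent is the root of the IO context, its own
	IDs are returned instead, so the context's root cannot be bypassed.
*/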
2948 static status_t
2949 resolve_covered_parent(struct vnode* parent, dev_t* _device, ino_t* _node,
2950 	struct io_context* ioContext)
2951 {
2952 	// Make sure the IO context root is not bypassed.
2953 	if (parent == ioContext->root) {
2954 		*_device = parent->device;
2955 		*_node = parent->id;
2956 		return B_OK;
2957 	}
2958 
2959 	inc_vnode_ref_count(parent);
2960 		// vnode_path_to_vnode() puts the node
2961 
2962 	// ".." is guaranteed not to be clobbered by this call
2963 	struct vnode* vnode;
2964 	status_t status = vnode_path_to_vnode(parent, (char*)"..", false, 0,
2965 		ioContext, &vnode, NULL);
2966 	if (status == B_OK) {
2967 		*_device = vnode->device;
2968 		*_node = vnode->id;
2969 		put_vnode(vnode);
2970 	}
2971 
2972 	return status;
2973 }
2974 
2975 
2976 #ifdef ADD_DEBUGGER_COMMANDS
2977 
2978 
2979 static void
2980 _dump_advisory_locking(advisory_locking* locking)
2981 {
2982 	if (locking == NULL)
2983 		return;
2984 
	kprintf("   lock:        %" B_PRId32, locking->lock);
	kprintf("   wait_sem:    %" B_PRId32 "\n", locking->wait_sem);
2987 
2988 	int32 index = 0;
2989 	LockList::Iterator iterator = locking->locks.GetIterator();
2990 	while (iterator.HasNext()) {
2991 		struct advisory_lock* lock = iterator.Next();
2992 
2993 		kprintf("   [%2" B_PRId32 "] team:   %" B_PRId32 "\n", index++, lock->team);
2994 		kprintf("        start:  %" B_PRIdOFF "\n", lock->start);
2995 		kprintf("        end:    %" B_PRIdOFF "\n", lock->end);
2996 		kprintf("        shared? %s\n", lock->shared ? "yes" : "no");
2997 	}
2998 }
2999 
3000 
3001 static void
3002 _dump_mount(struct fs_mount* mount)
3003 {
3004 	kprintf("MOUNT: %p\n", mount);
3005 	kprintf(" id:            %" B_PRIdDEV "\n", mount->id);
3006 	kprintf(" device_name:   %s\n", mount->device_name);
3007 	kprintf(" root_vnode:    %p\n", mount->root_vnode);
3008 	kprintf(" covers:        %p\n", mount->root_vnode->covers);
3009 	kprintf(" partition:     %p\n", mount->partition);
3010 	kprintf(" lock:          %p\n", &mount->rlock);
	kprintf(" flags:         %s%s\n", mount->unmounting ? " unmounting" : "",
3012 		mount->owns_file_device ? " owns_file_device" : "");
3013 
3014 	fs_volume* volume = mount->volume;
3015 	while (volume != NULL) {
3016 		kprintf(" volume %p:\n", volume);
3017 		kprintf("  layer:            %" B_PRId32 "\n", volume->layer);
3018 		kprintf("  private_volume:   %p\n", volume->private_volume);
3019 		kprintf("  ops:              %p\n", volume->ops);
3020 		kprintf("  file_system:      %p\n", volume->file_system);
3021 		kprintf("  file_system_name: %s\n", volume->file_system_name);
3022 		volume = volume->super_volume;
3023 	}
3024 
3025 	set_debug_variable("_volume", (addr_t)mount->volume->private_volume);
3026 	set_debug_variable("_root", (addr_t)mount->root_vnode);
3027 	set_debug_variable("_covers", (addr_t)mount->root_vnode->covers);
3028 	set_debug_variable("_partition", (addr_t)mount->partition);
3029 }
3030 
3031 
3032 static bool
3033 debug_prepend_vnode_name_to_path(char* buffer, size_t& bufferSize,
3034 	const char* name)
3035 {
3036 	bool insertSlash = buffer[bufferSize] != '\0';
3037 	size_t nameLength = strlen(name);
3038 
3039 	if (bufferSize < nameLength + (insertSlash ? 1 : 0))
3040 		return false;
3041 
3042 	if (insertSlash)
3043 		buffer[--bufferSize] = '/';
3044 
3045 	bufferSize -= nameLength;
3046 	memcpy(buffer + bufferSize, name, nameLength);
3047 
3048 	return true;
3049 }
3050 
3051 
3052 static bool
3053 debug_prepend_vnode_id_to_path(char* buffer, size_t& bufferSize, dev_t devID,
3054 	ino_t nodeID)
3055 {
3056 	if (bufferSize == 0)
3057 		return false;
3058 
3059 	bool insertSlash = buffer[bufferSize] != '\0';
3060 	if (insertSlash)
3061 		buffer[--bufferSize] = '/';
3062 
3063 	size_t size = snprintf(buffer, bufferSize,
3064 		"<%" B_PRIdDEV ",%" B_PRIdINO ">", devID, nodeID);
3065 	if (size > bufferSize) {
3066 		if (insertSlash)
3067 			bufferSize++;
3068 		return false;
3069 	}
3070 
3071 	if (size < bufferSize)
3072 		memmove(buffer + bufferSize - size, buffer, size);
3073 
3074 	bufferSize -= size;
3075 	return true;
3076 }
3077 
3078 
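/*!	Resolves the path of \a vnode for debugging purposes, using only the
	entry cache (i.e. without locking or doing I/O). The path is assembled
	right to left in \a buffer; the returned pointer points into \a buffer.
	Components whose names cannot be resolved are represented by their
	"<dev,node>" IDs. \a _truncated is set if the buffer was too small to
	hold the complete path.
*/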
3079 static char*
3080 debug_resolve_vnode_path(struct vnode* vnode, char* buffer, size_t bufferSize,
3081 	bool& _truncated)
3082 {
3083 	// null-terminate the path
3084 	buffer[--bufferSize] = '\0';
3085 
3086 	while (true) {
3087 		while (vnode->covers != NULL)
3088 			vnode = vnode->covers;
3089 
3090 		if (vnode == sRoot) {
3091 			_truncated = bufferSize == 0;
3092 			if (!_truncated)
3093 				buffer[--bufferSize] = '/';
3094 			return buffer + bufferSize;
3095 		}
3096 
3097 		// resolve the name
3098 		ino_t dirID;
3099 		const char* name = vnode->mount->entry_cache.DebugReverseLookup(
3100 			vnode->id, dirID);
3101 		if (name == NULL) {
3102 			// Failed to resolve the name -- prepend "<dev,node>/".
3103 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3104 				vnode->mount->id, vnode->id);
3105 			return buffer + bufferSize;
3106 		}
3107 
3108 		// prepend the name
3109 		if (!debug_prepend_vnode_name_to_path(buffer, bufferSize, name)) {
3110 			_truncated = true;
3111 			return buffer + bufferSize;
3112 		}
3113 
3114 		// resolve the directory node
3115 		struct vnode* nextVnode = lookup_vnode(vnode->mount->id, dirID);
3116 		if (nextVnode == NULL) {
3117 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3118 				vnode->mount->id, dirID);
3119 			return buffer + bufferSize;
3120 		}
3121 
3122 		vnode = nextVnode;
3123 	}
3124 }
3125 
3126 
3127 static void
3128 _dump_vnode(struct vnode* vnode, bool printPath)
3129 {
3130 	kprintf("VNODE: %p\n", vnode);
3131 	kprintf(" device:        %" B_PRIdDEV "\n", vnode->device);
3132 	kprintf(" id:            %" B_PRIdINO "\n", vnode->id);
3133 	kprintf(" ref_count:     %" B_PRId32 "\n", vnode->ref_count);
3134 	kprintf(" private_node:  %p\n", vnode->private_node);
3135 	kprintf(" mount:         %p\n", vnode->mount);
3136 	kprintf(" covered_by:    %p\n", vnode->covered_by);
3137 	kprintf(" covers:        %p\n", vnode->covers);
3138 	kprintf(" cache:         %p\n", vnode->cache);
3139 	kprintf(" type:          %#" B_PRIx32 "\n", vnode->Type());
3140 	kprintf(" flags:         %s%s%s\n", vnode->IsRemoved() ? "r" : "-",
3141 		vnode->IsBusy() ? "b" : "-", vnode->IsUnpublished() ? "u" : "-");
3142 	kprintf(" advisory_lock: %p\n", vnode->advisory_locking);
3143 
3144 	_dump_advisory_locking(vnode->advisory_locking);
3145 
3146 	if (printPath) {
3147 		void* buffer = debug_malloc(B_PATH_NAME_LENGTH);
3148 		if (buffer != NULL) {
3149 			bool truncated;
3150 			char* path = debug_resolve_vnode_path(vnode, (char*)buffer,
3151 				B_PATH_NAME_LENGTH, truncated);
3152 			if (path != NULL) {
3153 				kprintf(" path:          ");
3154 				if (truncated)
3155 					kputs("<truncated>/");
3156 				kputs(path);
3157 				kputs("\n");
3158 			} else
3159 				kprintf("Failed to resolve vnode path.\n");
3160 
3161 			debug_free(buffer);
3162 		} else
3163 			kprintf("Failed to allocate memory for constructing the path.\n");
3164 	}
3165 
3166 	set_debug_variable("_node", (addr_t)vnode->private_node);
3167 	set_debug_variable("_mount", (addr_t)vnode->mount);
3168 	set_debug_variable("_covered_by", (addr_t)vnode->covered_by);
3169 	set_debug_variable("_covers", (addr_t)vnode->covers);
3170 	set_debug_variable("_adv_lock", (addr_t)vnode->advisory_locking);
3171 }
3172 
3173 
3174 static int
3175 dump_mount(int argc, char** argv)
3176 {
3177 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3178 		kprintf("usage: %s [id|address]\n", argv[0]);
3179 		return 0;
3180 	}
3181 
3182 	ulong val = parse_expression(argv[1]);
3183 	uint32 id = val;
3184 
3185 	struct fs_mount* mount = sMountsTable->Lookup(id);
3186 	if (mount == NULL) {
3187 		if (IS_USER_ADDRESS(id)) {
3188 			kprintf("fs_mount not found\n");
3189 			return 0;
3190 		}
3191 		mount = (fs_mount*)val;
3192 	}
3193 
3194 	_dump_mount(mount);
3195 	return 0;
3196 }
3197 
3198 
3199 static int
3200 dump_mounts(int argc, char** argv)
3201 {
3202 	if (argc != 1) {
3203 		kprintf("usage: %s\n", argv[0]);
3204 		return 0;
3205 	}
3206 
3207 	kprintf("%-*s    id %-*s   %-*s   %-*s   fs_name\n",
3208 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "root",
3209 		B_PRINTF_POINTER_WIDTH, "covers", B_PRINTF_POINTER_WIDTH, "cookie");
3210 
3211 	struct fs_mount* mount;
3212 
3213 	MountTable::Iterator iterator(sMountsTable);
3214 	while (iterator.HasNext()) {
3215 		mount = iterator.Next();
3216 		kprintf("%p%4" B_PRIdDEV " %p %p %p %s\n", mount, mount->id, mount->root_vnode,
3217 			mount->root_vnode->covers, mount->volume->private_volume,
3218 			mount->volume->file_system_name);
3219 
3220 		fs_volume* volume = mount->volume;
3221 		while (volume->super_volume != NULL) {
3222 			volume = volume->super_volume;
3223 			kprintf("                                     %p %s\n",
3224 				volume->private_volume, volume->file_system_name);
3225 		}
3226 	}
3227 
3228 	return 0;
3229 }
3230 
3231 
3232 static int
3233 dump_vnode(int argc, char** argv)
3234 {
3235 	bool printPath = false;
3236 	int argi = 1;
3237 	if (argc >= 2 && strcmp(argv[argi], "-p") == 0) {
3238 		printPath = true;
3239 		argi++;
3240 	}
3241 
3242 	if (argi >= argc || argi + 2 < argc) {
3243 		print_debugger_command_usage(argv[0]);
3244 		return 0;
3245 	}
3246 
3247 	struct vnode* vnode = NULL;
3248 
3249 	if (argi + 1 == argc) {
3250 		vnode = (struct vnode*)parse_expression(argv[argi]);
3251 		if (IS_USER_ADDRESS(vnode)) {
3252 			kprintf("invalid vnode address\n");
3253 			return 0;
3254 		}
3255 		_dump_vnode(vnode, printPath);
3256 		return 0;
3257 	}
3258 
3259 	dev_t device = parse_expression(argv[argi]);
3260 	ino_t id = parse_expression(argv[argi + 1]);
3261 
3262 	VnodeTable::Iterator iterator(sVnodeTable);
3263 	while (iterator.HasNext()) {
3264 		vnode = iterator.Next();
3265 		if (vnode->id != id || vnode->device != device)
3266 			continue;
3267 
3268 		_dump_vnode(vnode, printPath);
3269 	}
3270 
3271 	return 0;
3272 }
3273 
3274 
3275 static int
3276 dump_vnodes(int argc, char** argv)
3277 {
3278 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3279 		kprintf("usage: %s [device]\n", argv[0]);
3280 		return 0;
3281 	}
3282 
3283 	// restrict dumped nodes to a certain device if requested
3284 	dev_t device = parse_expression(argv[1]);
3285 
3286 	struct vnode* vnode;
3287 
3288 	kprintf("%-*s   dev     inode  ref %-*s   %-*s   %-*s   flags\n",
3289 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache",
3290 		B_PRINTF_POINTER_WIDTH, "fs-node", B_PRINTF_POINTER_WIDTH, "locking");
3291 
3292 	VnodeTable::Iterator iterator(sVnodeTable);
3293 	while (iterator.HasNext()) {
3294 		vnode = iterator.Next();
3295 		if (vnode->device != device)
3296 			continue;
3297 
3298 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO "%5" B_PRId32 " %p %p %p %s%s%s\n",
3299 			vnode, vnode->device, vnode->id, vnode->ref_count, vnode->cache,
3300 			vnode->private_node, vnode->advisory_locking,
3301 			vnode->IsRemoved() ? "r" : "-", vnode->IsBusy() ? "b" : "-",
3302 			vnode->IsUnpublished() ? "u" : "-");
3303 	}
3304 
3305 	return 0;
3306 }
3307 
3308 
3309 static int
3310 dump_vnode_caches(int argc, char** argv)
3311 {
3312 	struct vnode* vnode;
3313 
	if (argc > 2 || (argc == 2 && !strcmp(argv[1], "--help"))) {
3315 		kprintf("usage: %s [device]\n", argv[0]);
3316 		return 0;
3317 	}
3318 
3319 	// restrict dumped nodes to a certain device if requested
3320 	dev_t device = -1;
3321 	if (argc > 1)
3322 		device = parse_expression(argv[1]);
3323 
3324 	kprintf("%-*s   dev     inode %-*s       size   pages\n",
3325 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache");
3326 
3327 	VnodeTable::Iterator iterator(sVnodeTable);
3328 	while (iterator.HasNext()) {
3329 		vnode = iterator.Next();
3330 		if (vnode->cache == NULL)
3331 			continue;
3332 		if (device != -1 && vnode->device != device)
3333 			continue;
3334 
3335 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO " %p %8" B_PRIdOFF "%8" B_PRId32 "\n",
3336 			vnode, vnode->device, vnode->id, vnode->cache,
3337 			(vnode->cache->virtual_end + B_PAGE_SIZE - 1) / B_PAGE_SIZE,
3338 			vnode->cache->page_count);
3339 	}
3340 
3341 	return 0;
3342 }
3343 
3344 
3345 int
3346 dump_io_context(int argc, char** argv)
3347 {
	if (argc > 2 || (argc == 2 && !strcmp(argv[1], "--help"))) {
3349 		kprintf("usage: %s [team-id|address]\n", argv[0]);
3350 		return 0;
3351 	}
3352 
3353 	struct io_context* context = NULL;
3354 
3355 	if (argc > 1) {
3356 		ulong num = parse_expression(argv[1]);
3357 		if (IS_KERNEL_ADDRESS(num))
3358 			context = (struct io_context*)num;
3359 		else {
3360 			Team* team = team_get_team_struct_locked(num);
3361 			if (team == NULL) {
3362 				kprintf("could not find team with ID %lu\n", num);
3363 				return 0;
3364 			}
3365 			context = (struct io_context*)team->io_context;
3366 		}
3367 	} else
3368 		context = get_current_io_context(true);
3369 
3370 	kprintf("I/O CONTEXT: %p\n", context);
3371 	kprintf(" root vnode:\t%p\n", context->root);
3372 	kprintf(" cwd vnode:\t%p\n", context->cwd);
3373 	kprintf(" used fds:\t%" B_PRIu32 "\n", context->num_used_fds);
3374 	kprintf(" max fds:\t%" B_PRIu32 "\n", context->table_size);
3375 
3376 	if (context->num_used_fds) {
3377 		kprintf("   no.  type    %*s  ref  open  mode         pos    %*s\n",
3378 			B_PRINTF_POINTER_WIDTH, "ops", B_PRINTF_POINTER_WIDTH, "cookie");
3379 	}
3380 
3381 	for (uint32 i = 0; i < context->table_size; i++) {
3382 		struct file_descriptor* fd = context->fds[i];
3383 		if (fd == NULL)
3384 			continue;
3385 
3386 		kprintf("  %3" B_PRIu32 ":  %4" B_PRId32 "  %p  %3" B_PRId32 "  %4"
3387 			B_PRIu32 "  %4" B_PRIx32 "  %10" B_PRIdOFF "  %p  %s %p\n", i,
3388 			fd->type, fd->ops, fd->ref_count, fd->open_count, fd->open_mode,
3389 			fd->pos, fd->cookie,
3390 			fd->type >= FDTYPE_INDEX && fd->type <= FDTYPE_QUERY
3391 				? "mount" : "vnode",
3392 			fd->u.vnode);
3393 	}
3394 
3395 	kprintf(" used monitors:\t%" B_PRIu32 "\n", context->num_monitors);
3396 	kprintf(" max monitors:\t%" B_PRIu32 "\n", context->max_monitors);
3397 
3398 	set_debug_variable("_cwd", (addr_t)context->cwd);
3399 
3400 	return 0;
3401 }
3402 
3403 
3404 int
3405 dump_vnode_usage(int argc, char** argv)
3406 {
3407 	if (argc != 1) {
3408 		kprintf("usage: %s\n", argv[0]);
3409 		return 0;
3410 	}
3411 
3412 	kprintf("Unused vnodes: %" B_PRIu32 " (max unused %" B_PRIu32 ")\n",
3413 		sUnusedVnodes, kMaxUnusedVnodes);
3414 
3415 	uint32 count = sVnodeTable->CountElements();
3416 
3417 	kprintf("%" B_PRIu32 " vnodes total (%" B_PRIu32 " in use).\n", count,
3418 		count - sUnusedVnodes);
3419 	return 0;
3420 }
3421 
3422 #endif	// ADD_DEBUGGER_COMMANDS
3423 
3424 
/*!	Clears the memory specified by an iovec array. At most \a bytes of it
	are zeroed, walking the vecs in order.
3426 */
3427 static void
3428 zero_iovecs(const iovec* vecs, size_t vecCount, size_t bytes)
3429 {
3430 	for (size_t i = 0; i < vecCount && bytes > 0; i++) {
3431 		size_t length = std::min(vecs[i].iov_len, bytes);
3432 		memset(vecs[i].iov_base, 0, length);
3433 		bytes -= length;
3434 	}
3435 }
3436 
3437 
3438 /*!	Does the dirty work of combining the file_io_vecs with the iovecs
3439 	and calls the file system hooks to read/write the request to disk.
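
	Illustrative example: an 8192 byte request backed by the two on-disk
	extents {offset 1024, length 4096} and {offset 65536, length 4096} is
	split into one read_pages()/write_pages() call per extent, with the
	iovecs advanced accordingly. A file_io_vec offset of -1 denotes a
	sparse region: it is zeroed on reads and must never occur on writes.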
3440 */
3441 static status_t
3442 common_file_io_vec_pages(struct vnode* vnode, void* cookie,
3443 	const file_io_vec* fileVecs, size_t fileVecCount, const iovec* vecs,
3444 	size_t vecCount, uint32* _vecIndex, size_t* _vecOffset, size_t* _numBytes,
3445 	bool doWrite)
3446 {
3447 	if (fileVecCount == 0) {
3448 		// There are no file vecs at this offset, so we're obviously trying
3449 		// to access the file outside of its bounds
3450 		return B_BAD_VALUE;
3451 	}
3452 
3453 	size_t numBytes = *_numBytes;
3454 	uint32 fileVecIndex;
3455 	size_t vecOffset = *_vecOffset;
3456 	uint32 vecIndex = *_vecIndex;
3457 	status_t status;
3458 	size_t size;
3459 
3460 	if (!doWrite && vecOffset == 0) {
3461 		// now directly read the data from the device
3462 		// the first file_io_vec can be read directly
3463 
3464 		if (fileVecs[0].length < (off_t)numBytes)
3465 			size = fileVecs[0].length;
3466 		else
3467 			size = numBytes;
3468 
3469 		if (fileVecs[0].offset >= 0) {
3470 			status = FS_CALL(vnode, read_pages, cookie, fileVecs[0].offset,
3471 				&vecs[vecIndex], vecCount - vecIndex, &size);
3472 		} else {
3473 			// sparse read
3474 			zero_iovecs(&vecs[vecIndex], vecCount - vecIndex, size);
3475 			status = B_OK;
3476 		}
3477 		if (status != B_OK)
3478 			return status;
3479 
3480 		// TODO: this is a work-around for buggy device drivers!
3481 		//	When our own drivers honour the length, we can:
3482 		//	a) also use this direct I/O for writes (otherwise, it would
3483 		//	   overwrite precious data)
3484 		//	b) panic if the term below is true (at least for writes)
3485 		if ((off_t)size > fileVecs[0].length) {
3486 			//dprintf("warning: device driver %p doesn't respect total length "
3487 			//	"in read_pages() call!\n", ref->device);
3488 			size = fileVecs[0].length;
3489 		}
3490 
3491 		ASSERT((off_t)size <= fileVecs[0].length);
3492 
3493 		// If the file portion was contiguous, we're already done now
3494 		if (size == numBytes)
3495 			return B_OK;
3496 
3497 		// if we reached the end of the file, we can return as well
3498 		if ((off_t)size != fileVecs[0].length) {
3499 			*_numBytes = size;
3500 			return B_OK;
3501 		}
3502 
3503 		fileVecIndex = 1;
3504 
3505 		// first, find out where we have to continue in our iovecs
3506 		for (; vecIndex < vecCount; vecIndex++) {
3507 			if (size < vecs[vecIndex].iov_len)
3508 				break;
3509 
3510 			size -= vecs[vecIndex].iov_len;
3511 		}
3512 
3513 		vecOffset = size;
3514 	} else {
3515 		fileVecIndex = 0;
3516 		size = 0;
3517 	}
3518 
3519 	// Too bad, let's process the rest of the file_io_vecs
3520 
3521 	size_t totalSize = size;
3522 	size_t bytesLeft = numBytes - size;
3523 
3524 	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
3525 		const file_io_vec &fileVec = fileVecs[fileVecIndex];
3526 		off_t fileOffset = fileVec.offset;
3527 		off_t fileLeft = min_c(fileVec.length, (off_t)bytesLeft);
3528 
3529 		TRACE(("FILE VEC [%" B_PRIu32 "] length %" B_PRIdOFF "\n", fileVecIndex,
3530 			fileLeft));
3531 
3532 		// process the complete fileVec
3533 		while (fileLeft > 0) {
3534 			iovec tempVecs[MAX_TEMP_IO_VECS];
3535 			uint32 tempCount = 0;
3536 
3537 			// size tracks how much of what is left of the current fileVec
3538 			// (fileLeft) has been assigned to tempVecs
3539 			size = 0;
3540 
3541 			// assign what is left of the current fileVec to the tempVecs
3542 			for (size = 0; (off_t)size < fileLeft && vecIndex < vecCount
3543 					&& tempCount < MAX_TEMP_IO_VECS;) {
3544 				// try to satisfy one iovec per iteration (or as much as
3545 				// possible)
3546 
3547 				// bytes left of the current iovec
3548 				size_t vecLeft = vecs[vecIndex].iov_len - vecOffset;
3549 				if (vecLeft == 0) {
3550 					vecOffset = 0;
3551 					vecIndex++;
3552 					continue;
3553 				}
3554 
3555 				TRACE(("fill vec %" B_PRIu32 ", offset = %lu, size = %lu\n",
3556 					vecIndex, vecOffset, size));
3557 
3558 				// actually available bytes
3559 				size_t tempVecSize = min_c(vecLeft, fileLeft - size);
3560 
3561 				tempVecs[tempCount].iov_base
3562 					= (void*)((addr_t)vecs[vecIndex].iov_base + vecOffset);
3563 				tempVecs[tempCount].iov_len = tempVecSize;
3564 				tempCount++;
3565 
3566 				size += tempVecSize;
3567 				vecOffset += tempVecSize;
3568 			}
3569 
3570 			size_t bytes = size;
3571 
3572 			if (fileOffset == -1) {
3573 				if (doWrite) {
3574 					panic("sparse write attempt: vnode %p", vnode);
3575 					status = B_IO_ERROR;
3576 				} else {
3577 					// sparse read
3578 					zero_iovecs(tempVecs, tempCount, bytes);
3579 					status = B_OK;
3580 				}
3581 			} else if (doWrite) {
3582 				status = FS_CALL(vnode, write_pages, cookie, fileOffset,
3583 					tempVecs, tempCount, &bytes);
3584 			} else {
3585 				status = FS_CALL(vnode, read_pages, cookie, fileOffset,
3586 					tempVecs, tempCount, &bytes);
3587 			}
3588 			if (status != B_OK)
3589 				return status;
3590 
3591 			totalSize += bytes;
3592 			bytesLeft -= size;
3593 			if (fileOffset >= 0)
3594 				fileOffset += size;
3595 			fileLeft -= size;
3596 			//dprintf("-> file left = %Lu\n", fileLeft);
3597 
3598 			if (size != bytes || vecIndex >= vecCount) {
3599 				// there are no more bytes or iovecs, let's bail out
3600 				*_numBytes = totalSize;
3601 				return B_OK;
3602 			}
3603 		}
3604 	}
3605 
3606 	*_vecIndex = vecIndex;
3607 	*_vecOffset = vecOffset;
3608 	*_numBytes = totalSize;
3609 	return B_OK;
3610 }
3611 
3612 
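/*!	Returns whether the calling team's effective user is a member of the
	group \a gid.
*/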
3613 static bool
3614 is_user_in_group(gid_t gid)
3615 {
3616 	if (gid == getegid())
3617 		return true;
3618 
3619 	gid_t groups[NGROUPS_MAX];
3620 	int groupCount = getgroups(NGROUPS_MAX, groups);
3621 	for (int i = 0; i < groupCount; i++) {
3622 		if (gid == groups[i])
3623 			return true;
3624 	}
3625 
3626 	return false;
3627 }
3628 
3629 
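/*!	Releases the root and cwd references of the given IO \a context, closes
	all of its file descriptors, removes its node monitors, and finally
	frees the context itself.
*/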
3630 static status_t
3631 free_io_context(io_context* context)
3632 {
3633 	uint32 i;
3634 
3635 	TIOC(FreeIOContext(context));
3636 
3637 	if (context->root)
3638 		put_vnode(context->root);
3639 
3640 	if (context->cwd)
3641 		put_vnode(context->cwd);
3642 
3643 	mutex_lock(&context->io_mutex);
3644 
3645 	for (i = 0; i < context->table_size; i++) {
3646 		if (struct file_descriptor* descriptor = context->fds[i]) {
3647 			close_fd(descriptor);
3648 			put_fd(descriptor);
3649 		}
3650 	}
3651 
3652 	mutex_destroy(&context->io_mutex);
3653 
3654 	remove_node_monitors(context);
3655 	free(context->fds);
3656 	free(context);
3657 
3658 	return B_OK;
3659 }
3660 
3661 
3662 static status_t
3663 resize_monitor_table(struct io_context* context, const int newSize)
3664 {
3665 	int	status = B_OK;
3666 
3667 	if (newSize <= 0 || newSize > MAX_NODE_MONITORS)
3668 		return B_BAD_VALUE;
3669 
3670 	mutex_lock(&context->io_mutex);
3671 
3672 	if ((size_t)newSize < context->num_monitors) {
3673 		status = B_BUSY;
3674 		goto out;
3675 	}
3676 	context->max_monitors = newSize;
3677 
3678 out:
3679 	mutex_unlock(&context->io_mutex);
3680 	return status;
3681 }
3682 
3683 
3684 //	#pragma mark - public API for file systems
3685 
3686 
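/*!	Creates a new, unpublished vnode for (\a volume, \a vnodeID) with the
	given private node and ops. The node must subsequently be made visible
	via publish_vnode(). If a vnode with these IDs already exists, this is
	considered a file system error and results in a panic.
*/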
3687 extern "C" status_t
3688 new_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3689 	fs_vnode_ops* ops)
3690 {
3691 	FUNCTION(("new_vnode(volume = %p (%" B_PRId32 "), vnodeID = %" B_PRId64
3692 		", node = %p)\n", volume, volume->id, vnodeID, privateNode));
3693 
3694 	if (privateNode == NULL)
3695 		return B_BAD_VALUE;
3696 
3697 	int32 tries = BUSY_VNODE_RETRIES;
3698 restart:
3699 	// create the node
3700 	bool nodeCreated;
3701 	struct vnode* vnode;
3702 	status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3703 		nodeCreated);
3704 	if (status != B_OK)
3705 		return status;
3706 
3707 	WriteLocker nodeLocker(sVnodeLock, true);
3708 		// create_new_vnode_and_lock() has locked for us
3709 
3710 	if (!nodeCreated && vnode->IsBusy()) {
3711 		nodeLocker.Unlock();
3712 		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3713 			return B_BUSY;
3714 		goto restart;
3715 	}
3716 
3717 	// file system integrity check:
3718 	// test if the vnode already exists and bail out if this is the case!
3719 	if (!nodeCreated) {
3720 		panic("vnode %" B_PRIdDEV ":%" B_PRIdINO " already exists (node = %p, "
3721 			"vnode->node = %p)!", volume->id, vnodeID, privateNode,
3722 			vnode->private_node);
3723 		return B_ERROR;
3724 	}
3725 
3726 	vnode->private_node = privateNode;
3727 	vnode->ops = ops;
3728 	vnode->SetUnpublished(true);
3729 
3730 	TRACE(("returns: %s\n", strerror(status)));
3731 
3732 	return status;
3733 }
3734 
3735 
3736 extern "C" status_t
3737 publish_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3738 	fs_vnode_ops* ops, int type, uint32 flags)
3739 {
3740 	FUNCTION(("publish_vnode()\n"));
3741 
3742 	int32 tries = BUSY_VNODE_RETRIES;
3743 restart:
3744 	WriteLocker locker(sVnodeLock);
3745 
3746 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3747 
3748 	bool nodeCreated = false;
3749 	if (vnode == NULL) {
3750 		if (privateNode == NULL)
3751 			return B_BAD_VALUE;
3752 
3753 		// create the node
3754 		locker.Unlock();
3755 			// create_new_vnode_and_lock() will re-lock for us on success
3756 		status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3757 			nodeCreated);
3758 		if (status != B_OK)
3759 			return status;
3760 
3761 		locker.SetTo(sVnodeLock, true);
3762 	}
3763 
3764 	if (nodeCreated) {
3765 		vnode->private_node = privateNode;
3766 		vnode->ops = ops;
3767 		vnode->SetUnpublished(true);
3768 	} else if (vnode->IsBusy() && vnode->IsUnpublished()
3769 		&& vnode->private_node == privateNode && vnode->ops == ops) {
3770 		// already known, but not published
3771 	} else if (vnode->IsBusy()) {
3772 		locker.Unlock();
3773 		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3774 			return B_BUSY;
3775 		goto restart;
3776 	} else
3777 		return B_BAD_VALUE;
3778 
3779 	bool publishSpecialSubNode = false;
3780 
3781 	vnode->SetType(type);
3782 	vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
3783 	publishSpecialSubNode = is_special_node_type(type)
3784 		&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
3785 
3786 	status_t status = B_OK;
3787 
3788 	// create sub vnodes, if necessary
3789 	if (volume->sub_volume != NULL || publishSpecialSubNode) {
3790 		locker.Unlock();
3791 
3792 		fs_volume* subVolume = volume;
3793 		if (volume->sub_volume != NULL) {
3794 			while (status == B_OK && subVolume->sub_volume != NULL) {
3795 				subVolume = subVolume->sub_volume;
3796 				status = subVolume->ops->create_sub_vnode(subVolume, vnodeID,
3797 					vnode);
3798 			}
3799 		}
3800 
3801 		if (status == B_OK && publishSpecialSubNode)
3802 			status = create_special_sub_node(vnode, flags);
3803 
3804 		if (status != B_OK) {
3805 			// error -- clean up the created sub vnodes
3806 			while (subVolume->super_volume != volume) {
3807 				subVolume = subVolume->super_volume;
3808 				subVolume->ops->delete_sub_vnode(subVolume, vnode);
3809 			}
3810 		}
3811 
3812 		if (status == B_OK) {
3813 			ReadLocker vnodesReadLocker(sVnodeLock);
3814 			AutoLocker<Vnode> nodeLocker(vnode);
3815 			vnode->SetBusy(false);
3816 			vnode->SetUnpublished(false);
3817 		} else {
3818 			locker.Lock();
3819 			sVnodeTable->Remove(vnode);
3820 			remove_vnode_from_mount_list(vnode, vnode->mount);
3821 			free(vnode);
3822 		}
3823 	} else {
3824 		// we still hold the write lock -- mark the node unbusy and published
3825 		vnode->SetBusy(false);
3826 		vnode->SetUnpublished(false);
3827 	}
3828 
3829 	TRACE(("returns: %s\n", strerror(status)));
3830 
3831 	return status;
3832 }
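
/*	Illustrative sketch of how a file system typically pairs new_vnode() with
	a later publish_vnode(): the node is reserved (busy and unpublished)
	while it is still being set up, and only published once complete. The
	names my_fs_create(), my_inode and gMyVnodeOps below are hypothetical:

		static status_t
		my_fs_create(fs_volume* volume, fs_vnode* dir, const char* name,
			int openMode, int perms, void** _cookie, ino_t* _newVnodeID)
		{
			my_inode* inode = my_inode_allocate(name, perms);
			if (inode == NULL)
				return B_NO_MEMORY;

			// reserve the vnode -- it stays busy and unpublished for now
			status_t status = new_vnode(volume, inode->id, inode,
				&gMyVnodeOps);
			if (status != B_OK)
				return status;

			// ... write the directory entry, create the open cookie, etc. ...

			// make the node visible to the rest of the system
			status = publish_vnode(volume, inode->id, inode, &gMyVnodeOps,
				S_IFREG, 0);
			if (status == B_OK)
				*_newVnodeID = inode->id;
			return status;
		}
*/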
3833 
3834 
3835 extern "C" status_t
3836 get_vnode(fs_volume* volume, ino_t vnodeID, void** _privateNode)
3837 {
3838 	struct vnode* vnode;
3839 
3840 	if (volume == NULL)
3841 		return B_BAD_VALUE;
3842 
3843 	status_t status = get_vnode(volume->id, vnodeID, &vnode, true, true);
3844 	if (status != B_OK)
3845 		return status;
3846 
3847 	// If this is a layered FS, we need to get the node cookie for the requested
3848 	// layer.
3849 	if (HAS_FS_CALL(vnode, get_super_vnode)) {
3850 		fs_vnode resolvedNode;
3851 		status_t status = FS_CALL(vnode, get_super_vnode, volume,
3852 			&resolvedNode);
3853 		if (status != B_OK) {
3854 			panic("get_vnode(): Failed to get super node for vnode %p, "
3855 				"volume: %p", vnode, volume);
3856 			put_vnode(vnode);
3857 			return status;
3858 		}
3859 
3860 		if (_privateNode != NULL)
3861 			*_privateNode = resolvedNode.private_node;
3862 	} else if (_privateNode != NULL)
3863 		*_privateNode = vnode->private_node;
3864 
3865 	return B_OK;
3866 }
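
/*	Typical use: a file system's lookup hook resolves a name to an inode ID
	and then acquires a reference via get_vnode(), which invokes the volume's
	get_vnode() hook if the node isn't in memory yet. A sketch with
	hypothetical names (my_fs_lookup(), my_dir_find()):

		static status_t
		my_fs_lookup(fs_volume* volume, fs_vnode* dir, const char* name,
			ino_t* _id)
		{
			status_t status = my_dir_find(dir->private_node, name, _id);
			if (status != B_OK)
				return status;

			void* privateNode;
			return get_vnode(volume, *_id, &privateNode);
		}
*/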
3867 
3868 
3869 extern "C" status_t
3870 acquire_vnode(fs_volume* volume, ino_t vnodeID)
3871 {
3872 	struct vnode* vnode;
3873 
3874 	rw_lock_read_lock(&sVnodeLock);
3875 	vnode = lookup_vnode(volume->id, vnodeID);
3876 	rw_lock_read_unlock(&sVnodeLock);
3877 
3878 	if (vnode == NULL)
3879 		return B_BAD_VALUE;
3880 
3881 	inc_vnode_ref_count(vnode);
3882 	return B_OK;
3883 }
3884 
3885 
3886 extern "C" status_t
3887 put_vnode(fs_volume* volume, ino_t vnodeID)
3888 {
3889 	struct vnode* vnode;
3890 
3891 	rw_lock_read_lock(&sVnodeLock);
3892 	vnode = lookup_vnode(volume->id, vnodeID);
3893 	rw_lock_read_unlock(&sVnodeLock);
3894 
3895 	if (vnode == NULL)
3896 		return B_BAD_VALUE;
3897 
3898 	dec_vnode_ref_count(vnode, false, true);
3899 	return B_OK;
3900 }
3901 
3902 
3903 extern "C" status_t
3904 remove_vnode(fs_volume* volume, ino_t vnodeID)
3905 {
3906 	ReadLocker locker(sVnodeLock);
3907 
3908 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3909 	if (vnode == NULL)
3910 		return B_ENTRY_NOT_FOUND;
3911 
3912 	if (vnode->covered_by != NULL || vnode->covers != NULL) {
3913 		// this vnode is in use
3914 		return B_BUSY;
3915 	}
3916 
3917 	vnode->Lock();
3918 
3919 	vnode->SetRemoved(true);
3920 	bool removeUnpublished = false;
3921 
3922 	if (vnode->IsUnpublished()) {
3923 		// prepare the vnode for deletion
3924 		removeUnpublished = true;
3925 		vnode->SetBusy(true);
3926 	}
3927 
3928 	vnode->Unlock();
3929 	locker.Unlock();
3930 
3931 	if (removeUnpublished) {
3932 		// If the vnode hasn't been published yet, we delete it here
3933 		atomic_add(&vnode->ref_count, -1);
3934 		free_vnode(vnode, true);
3935 	}
3936 
3937 	return B_OK;
3938 }
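
/*	Typical use: a file system's unlink hook removes the directory entry and
	then calls remove_vnode() so the node is deleted once its last reference
	is released. Hypothetical sketch (my_fs_unlink(), my_dir_remove_entry()):

		static status_t
		my_fs_unlink(fs_volume* volume, fs_vnode* dir, const char* name)
		{
			ino_t id;
			status_t status = my_dir_remove_entry(dir->private_node, name,
				&id);
			if (status == B_OK)
				status = remove_vnode(volume, id);
			return status;
		}
*/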
3939 
3940 
3941 extern "C" status_t
3942 unremove_vnode(fs_volume* volume, ino_t vnodeID)
3943 {
3944 	struct vnode* vnode;
3945 
3946 	rw_lock_read_lock(&sVnodeLock);
3947 
3948 	vnode = lookup_vnode(volume->id, vnodeID);
3949 	if (vnode) {
3950 		AutoLocker<Vnode> nodeLocker(vnode);
3951 		vnode->SetRemoved(false);
3952 	}
3953 
3954 	rw_lock_read_unlock(&sVnodeLock);
3955 	return B_OK;
3956 }
3957 
3958 
3959 extern "C" status_t
3960 get_vnode_removed(fs_volume* volume, ino_t vnodeID, bool* _removed)
3961 {
3962 	ReadLocker _(sVnodeLock);
3963 
3964 	if (struct vnode* vnode = lookup_vnode(volume->id, vnodeID)) {
3965 		if (_removed != NULL)
3966 			*_removed = vnode->IsRemoved();
3967 		return B_OK;
3968 	}
3969 
3970 	return B_BAD_VALUE;
3971 }
3972 
3973 
3974 extern "C" fs_volume*
3975 volume_for_vnode(fs_vnode* _vnode)
3976 {
3977 	if (_vnode == NULL)
3978 		return NULL;
3979 
3980 	struct vnode* vnode = static_cast<struct vnode*>(_vnode);
3981 	return vnode->mount->volume;
3982 }
3983 
3984 
3985 extern "C" status_t
3986 check_access_permissions(int accessMode, mode_t mode, gid_t nodeGroupID,
3987 	uid_t nodeUserID)
3988 {
3989 	// get node permissions
3990 	int userPermissions = (mode & S_IRWXU) >> 6;
3991 	int groupPermissions = (mode & S_IRWXG) >> 3;
3992 	int otherPermissions = mode & S_IRWXO;
3993 
3994 	// get the node permissions for this uid/gid
3995 	int permissions = 0;
3996 	uid_t uid = geteuid();
3997 
3998 	if (uid == 0) {
3999 		// user is root
4000 		// root always has read/write permission, but at least one of the
4001 		// X bits must be set for execute permission
4002 		permissions = userPermissions | groupPermissions | otherPermissions
4003 			| S_IROTH | S_IWOTH;
4004 		if (S_ISDIR(mode))
4005 			permissions |= S_IXOTH;
4006 	} else if (uid == nodeUserID) {
4007 		// user is node owner
4008 		permissions = userPermissions;
4009 	} else if (is_user_in_group(nodeGroupID)) {
4010 		// user is in owning group
4011 		permissions = groupPermissions;
4012 	} else {
4013 		// user is one of the others
4014 		permissions = otherPermissions;
4015 	}
4016 
4017 	return (accessMode & ~permissions) == 0 ? B_OK : B_PERMISSION_DENIED;
4018 }
4019 
4020 
4021 #if 0
4022 extern "C" status_t
4023 read_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4024 	size_t* _numBytes)
4025 {
4026 	struct file_descriptor* descriptor;
4027 	struct vnode* vnode;
4028 
4029 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4030 	if (descriptor == NULL)
4031 		return B_FILE_ERROR;
4032 
4033 	status_t status = vfs_read_pages(vnode, descriptor->cookie, pos, vecs,
4034 		count, 0, _numBytes);
4035 
4036 	put_fd(descriptor);
4037 	return status;
4038 }
4039 
4040 
4041 extern "C" status_t
4042 write_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4043 	size_t* _numBytes)
4044 {
4045 	struct file_descriptor* descriptor;
4046 	struct vnode* vnode;
4047 
4048 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4049 	if (descriptor == NULL)
4050 		return B_FILE_ERROR;
4051 
4052 	status_t status = vfs_write_pages(vnode, descriptor->cookie, pos, vecs,
4053 		count, 0, _numBytes);
4054 
4055 	put_fd(descriptor);
4056 	return status;
4057 }
4058 #endif
4059 
4060 
4061 extern "C" status_t
4062 read_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4063 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4064 	size_t* _bytes)
4065 {
4066 	struct file_descriptor* descriptor;
4067 	struct vnode* vnode;
4068 
4069 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4070 	if (descriptor == NULL)
4071 		return B_FILE_ERROR;
4072 
4073 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4074 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4075 		false);
4076 
4077 	put_fd(descriptor);
4078 	return status;
4079 }
4080 
4081 
4082 extern "C" status_t
4083 write_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4084 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4085 	size_t* _bytes)
4086 {
4087 	struct file_descriptor* descriptor;
4088 	struct vnode* vnode;
4089 
4090 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4091 	if (descriptor == NULL)
4092 		return B_FILE_ERROR;
4093 
4094 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4095 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4096 		true);
4097 
4098 	put_fd(descriptor);
4099 	return status;
4100 }
4101 
4102 
4103 extern "C" status_t
4104 entry_cache_add(dev_t mountID, ino_t dirID, const char* name, ino_t nodeID)
4105 {
4106 	// lookup mount -- the caller is required to make sure that the mount
4107 	// won't go away
4108 	MutexLocker locker(sMountMutex);
4109 	struct fs_mount* mount = find_mount(mountID);
4110 	if (mount == NULL)
4111 		return B_BAD_VALUE;
4112 	locker.Unlock();
4113 
4114 	return mount->entry_cache.Add(dirID, name, nodeID, false);
4115 }
4116 
4117 
4118 extern "C" status_t
4119 entry_cache_add_missing(dev_t mountID, ino_t dirID, const char* name)
4120 {
4121 	// lookup mount -- the caller is required to make sure that the mount
4122 	// won't go away
4123 	MutexLocker locker(sMountMutex);
4124 	struct fs_mount* mount = find_mount(mountID);
4125 	if (mount == NULL)
4126 		return B_BAD_VALUE;
4127 	locker.Unlock();
4128 
4129 	return mount->entry_cache.Add(dirID, name, -1, true);
4130 }
4131 
4132 
4133 extern "C" status_t
4134 entry_cache_remove(dev_t mountID, ino_t dirID, const char* name)
4135 {
4136 	// lookup mount -- the caller is required to make sure that the mount
4137 	// won't go away
4138 	MutexLocker locker(sMountMutex);
4139 	struct fs_mount* mount = find_mount(mountID);
4140 	if (mount == NULL)
4141 		return B_BAD_VALUE;
4142 	locker.Unlock();
4143 
4144 	return mount->entry_cache.Remove(dirID, name);
4145 }
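
/*	File systems use these three hooks to keep the VFS entry cache coherent
	with their own directory modifications. For instance, a hypothetical
	unlink path might end with

		entry_cache_remove(volume->id, dirID, name);

	while a lookup that found nothing can cache the negative result, so the
	next lookup fails without calling into the FS:

		entry_cache_add_missing(volume->id, dirID, name);
*/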
4146 
4147 
4148 //	#pragma mark - private VFS API
4149 //	Functions the VFS exports for other parts of the kernel
4150 
4151 
4152 /*! Acquires another reference to the vnode that has to be released
4153 	by calling vfs_put_vnode().
4154 */
4155 void
4156 vfs_acquire_vnode(struct vnode* vnode)
4157 {
4158 	inc_vnode_ref_count(vnode);
4159 }
4160 
4161 
4162 /*! This is currently called from file_cache_create() only.
4163 	It's probably a temporary solution as long as devfs requires that
4164 	fs_read_pages()/fs_write_pages() are called with the standard
4165 	open cookie and not with a device cookie.
4166 	If that's done differently, remove this call; it has no other
4167 	purpose.
4168 */
4169 extern "C" status_t
4170 vfs_get_cookie_from_fd(int fd, void** _cookie)
4171 {
4172 	struct file_descriptor* descriptor;
4173 
4174 	descriptor = get_fd(get_current_io_context(true), fd);
4175 	if (descriptor == NULL)
4176 		return B_FILE_ERROR;
4177 
4178 	*_cookie = descriptor->cookie;
4179 	return B_OK;
4180 }
4181 
4182 
4183 extern "C" status_t
4184 vfs_get_vnode_from_fd(int fd, bool kernel, struct vnode** vnode)
4185 {
4186 	*vnode = get_vnode_from_fd(fd, kernel);
4187 
4188 	if (*vnode == NULL)
4189 		return B_FILE_ERROR;
4190 
4191 	return B_OK;
4192 }
4193 
4194 
4195 extern "C" status_t
4196 vfs_get_vnode_from_path(const char* path, bool kernel, struct vnode** _vnode)
4197 {
4198 	TRACE(("vfs_get_vnode_from_path: entry. path = '%s', kernel %d\n",
4199 		path, kernel));
4200 
4201 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
4202 	if (pathBuffer.InitCheck() != B_OK)
4203 		return B_NO_MEMORY;
4204 
4205 	char* buffer = pathBuffer.LockBuffer();
4206 	strlcpy(buffer, path, pathBuffer.BufferSize());
4207 
4208 	struct vnode* vnode;
4209 	status_t status = path_to_vnode(buffer, true, &vnode, NULL, kernel);
4210 	if (status != B_OK)
4211 		return status;
4212 
4213 	*_vnode = vnode;
4214 	return B_OK;
4215 }
4216 
4217 
4218 extern "C" status_t
4219 vfs_get_vnode(dev_t mountID, ino_t vnodeID, bool canWait, struct vnode** _vnode)
4220 {
4221 	struct vnode* vnode = NULL;
4222 
4223 	status_t status = get_vnode(mountID, vnodeID, &vnode, canWait, false);
4224 	if (status != B_OK)
4225 		return status;
4226 
4227 	*_vnode = vnode;
4228 	return B_OK;
4229 }
4230 
4231 
4232 extern "C" status_t
4233 vfs_entry_ref_to_vnode(dev_t mountID, ino_t directoryID,
4234 	const char* name, struct vnode** _vnode)
4235 {
4236 	return entry_ref_to_vnode(mountID, directoryID, name, false, true, _vnode);
4237 }
4238 
4239 
4240 extern "C" void
4241 vfs_vnode_to_node_ref(struct vnode* vnode, dev_t* _mountID, ino_t* _vnodeID)
4242 {
4243 	*_mountID = vnode->device;
4244 	*_vnodeID = vnode->id;
4245 }
4246 
4247 
4248 /*!
4249 	Helper function abstracting the process of "converting" a given
4250 	vnode-pointer to a fs_vnode-pointer.
4251 	Currently only used in bindfs.
4252 */
4253 extern "C" fs_vnode*
4254 vfs_fsnode_for_vnode(struct vnode* vnode)
4255 {
4256 	return vnode;
4257 }
4258 
4259 
4260 /*!
4261 	Calls fs_open() on the given vnode and returns a new
4262 	file descriptor for it
4263 */
4264 int
4265 vfs_open_vnode(struct vnode* vnode, int openMode, bool kernel)
4266 {
4267 	return open_vnode(vnode, openMode, kernel);
4268 }
4269 
4270 
4271 /*!	Looks up a vnode with the given mount and vnode ID.
4272 	Must only be used with "in-use" vnodes as it doesn't grab a reference
4273 	to the node.
4274 	It's currently only used by file_cache_create().
4275 */
4276 extern "C" status_t
4277 vfs_lookup_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode)
4278 {
4279 	rw_lock_read_lock(&sVnodeLock);
4280 	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
4281 	rw_lock_read_unlock(&sVnodeLock);
4282 
4283 	if (vnode == NULL)
4284 		return B_ERROR;
4285 
4286 	*_vnode = vnode;
4287 	return B_OK;
4288 }
4289 
4290 
4291 extern "C" status_t
4292 vfs_get_fs_node_from_path(fs_volume* volume, const char* path,
4293 	bool traverseLeafLink, bool kernel, void** _node)
4294 {
4295 	TRACE(("vfs_get_fs_node_from_path(volume = %p, path = \"%s\", kernel %d)\n",
4296 		volume, path, kernel));
4297 
4298 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
4299 	if (pathBuffer.InitCheck() != B_OK)
4300 		return B_NO_MEMORY;
4301 
4302 	fs_mount* mount;
4303 	status_t status = get_mount(volume->id, &mount);
4304 	if (status != B_OK)
4305 		return status;
4306 
4307 	char* buffer = pathBuffer.LockBuffer();
4308 	strlcpy(buffer, path, pathBuffer.BufferSize());
4309 
4310 	struct vnode* vnode = mount->root_vnode;
4311 
4312 	if (buffer[0] == '/')
4313 		status = path_to_vnode(buffer, traverseLeafLink, &vnode, NULL, kernel);
4314 	else {
4315 		inc_vnode_ref_count(vnode);
4316 			// vnode_path_to_vnode() releases a reference to the starting vnode
4317 		status = vnode_path_to_vnode(vnode, buffer, traverseLeafLink, 0,
4318 			kernel, &vnode, NULL);
4319 	}
4320 
4321 	put_mount(mount);
4322 
4323 	if (status != B_OK)
4324 		return status;
4325 
4326 	if (vnode->device != volume->id) {
4327 		// wrong mount ID - must not gain access on foreign file system nodes
4328 		put_vnode(vnode);
4329 		return B_BAD_VALUE;
4330 	}
4331 
4332 	// Use get_vnode() to resolve the cookie for the right layer.
4333 	status = get_vnode(volume, vnode->id, _node);
4334 	put_vnode(vnode);
4335 
4336 	return status;
4337 }
4338 
4339 
4340 status_t
4341 vfs_read_stat(int fd, const char* path, bool traverseLeafLink,
4342 	struct stat* stat, bool kernel)
4343 {
4344 	status_t status;
4345 
4346 	if (path != NULL) {
4347 		// path given: get the stat of the node referred to by (fd, path)
4348 		KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
4349 		if (pathBuffer.InitCheck() != B_OK)
4350 			return B_NO_MEMORY;
4351 
4352 		status = common_path_read_stat(fd, pathBuffer.LockBuffer(),
4353 			traverseLeafLink, stat, kernel);
4354 	} else {
4355 		// no path given: get the FD and use the FD operation
4356 		struct file_descriptor* descriptor
4357 			= get_fd(get_current_io_context(kernel), fd);
4358 		if (descriptor == NULL)
4359 			return B_FILE_ERROR;
4360 
4361 		if (descriptor->ops->fd_read_stat)
4362 			status = descriptor->ops->fd_read_stat(descriptor, stat);
4363 		else
4364 			status = B_UNSUPPORTED;
4365 
4366 		put_fd(descriptor);
4367 	}
4368 
4369 	return status;
4370 }
4371 
4372 
4373 /*!	Finds the full path to the file that contains the module \a moduleName,
4374 	puts it into \a pathBuffer, and returns B_OK for success.
4375 	If \a pathBuffer was too small, it returns \c B_BUFFER_OVERFLOW,
4376 	\c B_ENTRY_NOT_FOUNT if no file could be found.
4377 	\a pathBuffer is clobbered in any case and must not be relied on if this
4378 	functions returns unsuccessfully.
4379 	\a basePath and \a pathBuffer must not point to the same space.
4380 */
4381 status_t
4382 vfs_get_module_path(const char* basePath, const char* moduleName,
4383 	char* pathBuffer, size_t bufferSize)
4384 {
4385 	struct vnode* dir;
4386 	struct vnode* file;
4387 	status_t status;
4388 	size_t length;
4389 	char* path;
4390 
4391 	if (bufferSize == 0
4392 		|| strlcpy(pathBuffer, basePath, bufferSize) >= bufferSize)
4393 		return B_BUFFER_OVERFLOW;
4394 
4395 	status = path_to_vnode(pathBuffer, true, &dir, NULL, true);
4396 	if (status != B_OK)
4397 		return status;
4398 
4399 	// the path buffer had been clobbered by the above call
4400 	length = strlcpy(pathBuffer, basePath, bufferSize);
4401 	if (pathBuffer[length - 1] != '/')
4402 		pathBuffer[length++] = '/';
4403 
4404 	path = pathBuffer + length;
4405 	bufferSize -= length;
4406 
4407 	while (moduleName) {
4408 		char* nextPath = strchr(moduleName, '/');
4409 		if (nextPath == NULL)
4410 			length = strlen(moduleName);
4411 		else {
4412 			length = nextPath - moduleName;
4413 			nextPath++;
4414 		}
4415 
4416 		if (length + 1 >= bufferSize) {
4417 			status = B_BUFFER_OVERFLOW;
4418 			goto err;
4419 		}
4420 
4421 		memcpy(path, moduleName, length);
4422 		path[length] = '\0';
4423 		moduleName = nextPath;
4424 
4425 		status = vnode_path_to_vnode(dir, path, true, 0, true, &file, NULL);
4426 		if (status != B_OK) {
4427 			// vnode_path_to_vnode() has already released the reference to dir
4428 			return status;
4429 		}
4430 
4431 		if (S_ISDIR(file->Type())) {
4432 			// goto the next directory
4433 			path[length] = '/';
4434 			path[length + 1] = '\0';
4435 			path += length + 1;
4436 			bufferSize -= length + 1;
4437 
4438 			dir = file;
4439 		} else if (S_ISREG(file->Type())) {
4440 			// it's a file so it should be what we've searched for
4441 			put_vnode(file);
4442 
4443 			return B_OK;
4444 		} else {
4445 			TRACE(("vfs_get_module_path(): something is strange here: "
4446 				"0x%08" B_PRIx32 "...\n", file->Type()));
4447 			status = B_ERROR;
4448 			dir = file;
4449 			goto err;
4450 		}
4451 	}
4452 
4453 	// if we got here, the moduleName just pointed to a directory, not to
4454 	// a real module - what should we do in this case?
4455 	status = B_ENTRY_NOT_FOUND;
4456 
4457 err:
4458 	put_vnode(dir);
4459 	return status;
4460 }
4461 
4462 
4463 /*!	\brief Normalizes a given path.
4464 
4465 	The path must refer to an existing or non-existing entry in an existing
4466 	directory, that is chopping off the leaf component the remaining path must
4467 	refer to an existing directory.
4468 
4469 	The returned will be canonical in that it will be absolute, will not
4470 	contain any "." or ".." components or duplicate occurrences of '/'s,
4471 	and none of the directory components will by symbolic links.
4472 
4473 	Any two paths referring to the same entry, will result in the same
4474 	normalized path (well, that is pretty much the definition of `normalized',
4475 	isn't it :-).
4476 
4477 	\param path The path to be normalized.
4478 	\param buffer The buffer into which the normalized path will be written.
4479 		   May be the same one as \a path.
4480 	\param bufferSize The size of \a buffer.
4481 	\param traverseLink If \c true, the function also resolves leaf symlinks.
4482 	\param kernel \c true, if the IO context of the kernel shall be used,
4483 		   otherwise that of the team this thread belongs to. Only relevant,
4484 		   if the path is relative (to get the CWD).
4485 	\return \c B_OK if everything went fine, another error code otherwise.
4486 */
4487 status_t
4488 vfs_normalize_path(const char* path, char* buffer, size_t bufferSize,
4489 	bool traverseLink, bool kernel)
4490 {
4491 	if (!path || !buffer || bufferSize < 1)
4492 		return B_BAD_VALUE;
4493 
4494 	if (path != buffer) {
4495 		if (strlcpy(buffer, path, bufferSize) >= bufferSize)
4496 			return B_BUFFER_OVERFLOW;
4497 	}
4498 
4499 	return normalize_path(buffer, bufferSize, traverseLink, kernel);
4500 }
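
/*	Example: normalizing a path in place, which works because \a buffer may
	be identical to \a path:

		char path[B_PATH_NAME_LENGTH];
		strlcpy(path, "/boot/home/../home/./Desktop", sizeof(path));
		if (vfs_normalize_path(path, path, sizeof(path), true, true) == B_OK)
			dprintf("normalized: %s\n", path);
				// -> "/boot/home/Desktop"
*/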
4501 
4502 
4503 /*!	\brief Gets the parent of the passed in node.
4504 
4505 	Gets the parent of the passed in node, and correctly resolves covered
4506 	nodes.
4507 */
4508 extern "C" status_t
4509 vfs_resolve_parent(struct vnode* parent, dev_t* device, ino_t* node)
4510 {
4511 	return resolve_covered_parent(parent, device, node,
4512 		get_current_io_context(true));
4513 }
4514 
4515 
4516 /*!	\brief Creates a special node in the file system.
4517 
4518 	The caller gets a reference to the newly created node (which is passed
4519 	back through \a _createdVnode) and is responsible for releasing it.
4520 
4521 	\param path The path where to create the entry for the node. Can be \c NULL,
4522 		in which case the node is created without an entry in the root FS -- it
4523 		will automatically be deleted when the last reference has been released.
4524 	\param subVnode The definition of the subnode. Can be \c NULL, in which case
4525 		the target file system will just create the node with its standard
4526 		operations. Depending on the type of the node a subnode might be created
4527 		automatically, though.
4528 	\param mode The type and permissions for the node to be created.
4529 	\param flags Flags to be passed to the creating FS.
4530 	\param kernel \c true, if called in the kernel context (relevant only if
4531 		\a path is not \c NULL and not absolute).
4532 	\param _superVnode Pointer to a pre-allocated structure to be filled by the
4533 		file system creating the node, with the private data pointer and
4534 		operations for the super node. Can be \c NULL.
4535 	\param _createdVnode Pointer to pre-allocated storage where to store the
4536 		pointer to the newly created node.
4537 	\return \c B_OK, if everything went fine, another error code otherwise.
4538 */
4539 status_t
4540 vfs_create_special_node(const char* path, fs_vnode* subVnode, mode_t mode,
4541 	uint32 flags, bool kernel, fs_vnode* _superVnode,
4542 	struct vnode** _createdVnode)
4543 {
4544 	struct vnode* dirNode;
4545 	char _leaf[B_FILE_NAME_LENGTH];
4546 	char* leaf = NULL;
4547 
4548 	if (path) {
4549 		// We've got a path. Get the dir vnode and the leaf name.
4550 		KPath tmpPathBuffer(B_PATH_NAME_LENGTH + 1);
4551 		if (tmpPathBuffer.InitCheck() != B_OK)
4552 			return B_NO_MEMORY;
4553 
4554 		char* tmpPath = tmpPathBuffer.LockBuffer();
4555 		if (strlcpy(tmpPath, path, B_PATH_NAME_LENGTH) >= B_PATH_NAME_LENGTH)
4556 			return B_NAME_TOO_LONG;
4557 
4558 		// get the dir vnode and the leaf name
4559 		leaf = _leaf;
4560 		status_t error = path_to_dir_vnode(tmpPath, &dirNode, leaf, kernel);
4561 		if (error != B_OK)
4562 			return error;
4563 	} else {
4564 		// No path. Create the node in the root FS.
4565 		dirNode = sRoot;
4566 		inc_vnode_ref_count(dirNode);
4567 	}
4568 
4569 	VNodePutter _(dirNode);
4570 
4571 	// check support for creating special nodes
4572 	if (!HAS_FS_CALL(dirNode, create_special_node))
4573 		return B_UNSUPPORTED;
4574 
4575 	// create the node
4576 	fs_vnode superVnode;
4577 	ino_t nodeID;
4578 	status_t status = FS_CALL(dirNode, create_special_node, leaf, subVnode,
4579 		mode, flags, _superVnode != NULL ? _superVnode : &superVnode, &nodeID);
4580 	if (status != B_OK)
4581 		return status;
4582 
4583 	// lookup the node
4584 	rw_lock_read_lock(&sVnodeLock);
4585 	*_createdVnode = lookup_vnode(dirNode->mount->id, nodeID);
4586 	rw_lock_read_unlock(&sVnodeLock);
4587 
4588 	if (*_createdVnode == NULL) {
4589 		panic("vfs_create_special_node(): lookup of node failed");
4590 		return B_ERROR;
4591 	}
4592 
4593 	return B_OK;
4594 }
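
/*	Example: creating an unnamed FIFO node -- with a \c NULL path it gets no
	entry in the root FS and vanishes with its last reference. Sketch only:

		struct vnode* node;
		status_t status = vfs_create_special_node(NULL, NULL,
			S_IFIFO | 0600, 0, true, NULL, &node);
		if (status == B_OK) {
			// ... use the node ...
			vfs_put_vnode(node);
		}
*/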
4595 
4596 
4597 extern "C" void
4598 vfs_put_vnode(struct vnode* vnode)
4599 {
4600 	put_vnode(vnode);
4601 }
4602 
4603 
4604 extern "C" status_t
4605 vfs_get_cwd(dev_t* _mountID, ino_t* _vnodeID)
4606 {
4607 	// Get current working directory from io context
4608 	struct io_context* context = get_current_io_context(false);
4609 	status_t status = B_OK;
4610 
4611 	mutex_lock(&context->io_mutex);
4612 
4613 	if (context->cwd != NULL) {
4614 		*_mountID = context->cwd->device;
4615 		*_vnodeID = context->cwd->id;
4616 	} else
4617 		status = B_ERROR;
4618 
4619 	mutex_unlock(&context->io_mutex);
4620 	return status;
4621 }
4622 
4623 
4624 status_t
4625 vfs_unmount(dev_t mountID, uint32 flags)
4626 {
4627 	return fs_unmount(NULL, mountID, flags, true);
4628 }
4629 
4630 
4631 extern "C" status_t
4632 vfs_disconnect_vnode(dev_t mountID, ino_t vnodeID)
4633 {
4634 	struct vnode* vnode;
4635 
4636 	status_t status = get_vnode(mountID, vnodeID, &vnode, true, true);
4637 	if (status != B_OK)
4638 		return status;
4639 
4640 	disconnect_mount_or_vnode_fds(vnode->mount, vnode);
4641 	put_vnode(vnode);
4642 	return B_OK;
4643 }
4644 
4645 
4646 extern "C" void
4647 vfs_free_unused_vnodes(int32 level)
4648 {
4649 	vnode_low_resource_handler(NULL,
4650 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
4651 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
4652 		level);
4653 }
4654 
4655 
4656 extern "C" bool
4657 vfs_can_page(struct vnode* vnode, void* cookie)
4658 {
4659 	FUNCTION(("vfs_canpage: vnode %p\n", vnode));
4660 
4661 	if (HAS_FS_CALL(vnode, can_page))
4662 		return FS_CALL(vnode, can_page, cookie);
4663 	return false;
4664 }
4665 
4666 
4667 extern "C" status_t
4668 vfs_read_pages(struct vnode* vnode, void* cookie, off_t pos,
4669 	const generic_io_vec* vecs, size_t count, uint32 flags,
4670 	generic_size_t* _numBytes)
4671 {
4672 	FUNCTION(("vfs_read_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4673 		vecs, pos));
4674 
4675 #if VFS_PAGES_IO_TRACING
4676 	generic_size_t bytesRequested = *_numBytes;
4677 #endif
4678 
4679 	IORequest request;
4680 	status_t status = request.Init(pos, vecs, count, *_numBytes, false, flags);
4681 	if (status == B_OK) {
4682 		status = vfs_vnode_io(vnode, cookie, &request);
4683 		if (status == B_OK)
4684 			status = request.Wait();
4685 		*_numBytes = request.TransferredBytes();
4686 	}
4687 
4688 	TPIO(ReadPages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4689 		status, *_numBytes));
4690 
4691 	return status;
4692 }
4693 
4694 
4695 extern "C" status_t
4696 vfs_write_pages(struct vnode* vnode, void* cookie, off_t pos,
4697 	const generic_io_vec* vecs, size_t count, uint32 flags,
4698 	generic_size_t* _numBytes)
4699 {
4700 	FUNCTION(("vfs_write_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4701 		vecs, pos));
4702 
4703 #if VFS_PAGES_IO_TRACING
4704 	generic_size_t bytesRequested = *_numBytes;
4705 #endif
4706 
4707 	IORequest request;
4708 	status_t status = request.Init(pos, vecs, count, *_numBytes, true, flags);
4709 	if (status == B_OK) {
4710 		status = vfs_vnode_io(vnode, cookie, &request);
4711 		if (status == B_OK)
4712 			status = request.Wait();
4713 		*_numBytes = request.TransferredBytes();
4714 	}
4715 
4716 	TPIO(WritePages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4717 		status, *_numBytes));
4718 
4719 	return status;
4720 }
4721 
4722 
4723 /*!	Gets the vnode's VMCache object. If it doesn't have one yet, it will be
4724 	created if \a allocate is \c true.
4725 	In case it's successful, it will also grab a reference to the cache
4726 	it returns.
4727 */
4728 extern "C" status_t
4729 vfs_get_vnode_cache(struct vnode* vnode, VMCache** _cache, bool allocate)
4730 {
4731 	if (vnode->cache != NULL) {
4732 		vnode->cache->AcquireRef();
4733 		*_cache = vnode->cache;
4734 		return B_OK;
4735 	}
4736 
4737 	rw_lock_read_lock(&sVnodeLock);
4738 	vnode->Lock();
4739 
4740 	status_t status = B_OK;
4741 
4742 	// The cache could have been created in the meantime
4743 	if (vnode->cache == NULL) {
4744 		if (allocate) {
4745 			// TODO: actually the vnode needs to be busy already here, or
4746 			//	else this won't work...
4747 			bool wasBusy = vnode->IsBusy();
4748 			vnode->SetBusy(true);
4749 
4750 			vnode->Unlock();
4751 			rw_lock_read_unlock(&sVnodeLock);
4752 
4753 			status = vm_create_vnode_cache(vnode, &vnode->cache);
4754 
4755 			rw_lock_read_lock(&sVnodeLock);
4756 			vnode->Lock();
4757 			vnode->SetBusy(wasBusy);
4758 		} else
4759 			status = B_BAD_VALUE;
4760 	}
4761 
4762 	vnode->Unlock();
4763 	rw_lock_read_unlock(&sVnodeLock);
4764 
4765 	if (status == B_OK) {
4766 		vnode->cache->AcquireRef();
4767 		*_cache = vnode->cache;
4768 	}
4769 
4770 	return status;
4771 }
4772 
4773 
4774 status_t
4775 vfs_get_file_map(struct vnode* vnode, off_t offset, size_t size,
4776 	file_io_vec* vecs, size_t* _count)
4777 {
4778 	FUNCTION(("vfs_get_file_map: vnode %p, vecs %p, offset %" B_PRIdOFF
4779 		", size = %" B_PRIuSIZE "\n", vnode, vecs, offset, size));
4780 
4781 	return FS_CALL(vnode, get_file_map, offset, size, vecs, _count);
4782 }
4783 
4784 
4785 status_t
4786 vfs_stat_vnode(struct vnode* vnode, struct stat* stat)
4787 {
4788 	status_t status = FS_CALL(vnode, read_stat, stat);
4789 
4790 	// fill in the st_dev and st_ino fields
4791 	if (status == B_OK) {
4792 		stat->st_dev = vnode->device;
4793 		stat->st_ino = vnode->id;
4794 		// the rdev field must stay unset for non-special files
4795 		if (!S_ISBLK(stat->st_mode) && !S_ISCHR(stat->st_mode))
4796 			stat->st_rdev = -1;
4797 	}
4798 
4799 	return status;
4800 }
4801 
4802 
4803 status_t
4804 vfs_stat_node_ref(dev_t device, ino_t inode, struct stat* stat)
4805 {
4806 	struct vnode* vnode;
4807 	status_t status = get_vnode(device, inode, &vnode, true, false);
4808 	if (status != B_OK)
4809 		return status;
4810 
4811 	status = vfs_stat_vnode(vnode, stat);
4812 
4813 	put_vnode(vnode);
4814 	return status;
4815 }
4816 
4817 
4818 status_t
4819 vfs_get_vnode_name(struct vnode* vnode, char* name, size_t nameSize)
4820 {
4821 	return get_vnode_name(vnode, NULL, name, nameSize, true);
4822 }
4823 
4824 
4825 status_t
4826 vfs_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
4827 	bool kernel, char* path, size_t pathLength)
4828 {
4829 	struct vnode* vnode;
4830 	status_t status;
4831 
4832 	// filter invalid leaf names
4833 	if (leaf != NULL && (leaf[0] == '\0' || strchr(leaf, '/')))
4834 		return B_BAD_VALUE;
4835 
4836 	// get the vnode matching the dir's node_ref
4837 	if (leaf && (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0)) {
4838 		// special cases "." and "..": we can directly get the vnode of the
4839 		// referenced directory
4840 		status = entry_ref_to_vnode(device, inode, leaf, false, kernel, &vnode);
4841 		leaf = NULL;
4842 	} else
4843 		status = get_vnode(device, inode, &vnode, true, false);
4844 	if (status != B_OK)
4845 		return status;
4846 
4847 	// get the directory path
4848 	status = dir_vnode_to_path(vnode, path, pathLength, kernel);
4849 	put_vnode(vnode);
4850 		// we don't need the vnode anymore
4851 	if (status != B_OK)
4852 		return status;
4853 
4854 	// append the leaf name
4855 	if (leaf) {
4856 		// insert a directory separator if this is not the file system root
4857 		if ((strcmp(path, "/") && strlcat(path, "/", pathLength)
4858 				>= pathLength)
4859 			|| strlcat(path, leaf, pathLength) >= pathLength) {
4860 			return B_NAME_TOO_LONG;
4861 		}
4862 	}
4863 
4864 	return B_OK;
4865 }
4866 
4867 
4868 /*!	If the given descriptor locked its vnode, that lock will be released. */
4869 void
4870 vfs_unlock_vnode_if_locked(struct file_descriptor* descriptor)
4871 {
4872 	struct vnode* vnode = fd_vnode(descriptor);
4873 
4874 	if (vnode != NULL && vnode->mandatory_locked_by == descriptor)
4875 		vnode->mandatory_locked_by = NULL;
4876 }
4877 
4878 
4879 /*!	Closes all file descriptors of the specified I/O context that
4880 	have the O_CLOEXEC flag set.
4881 */
4882 void
4883 vfs_exec_io_context(io_context* context)
4884 {
4885 	uint32 i;
4886 
4887 	for (i = 0; i < context->table_size; i++) {
4888 		mutex_lock(&context->io_mutex);
4889 
4890 		struct file_descriptor* descriptor = context->fds[i];
4891 		bool remove = false;
4892 
4893 		if (descriptor != NULL && fd_close_on_exec(context, i)) {
4894 			context->fds[i] = NULL;
4895 			context->num_used_fds--;
4896 
4897 			remove = true;
4898 		}
4899 
4900 		mutex_unlock(&context->io_mutex);
4901 
4902 		if (remove) {
4903 			close_fd(descriptor);
4904 			put_fd(descriptor);
4905 		}
4906 	}
4907 }
4908 
4909 
4910 /*! Sets up a new io_context structure, and inherits the properties
4911 	of the parent io_context if it is given.
4912 */
4913 io_context*
4914 vfs_new_io_context(io_context* parentContext, bool purgeCloseOnExec)
4915 {
4916 	io_context* context = (io_context*)malloc(sizeof(io_context));
4917 	if (context == NULL)
4918 		return NULL;
4919 
4920 	TIOC(NewIOContext(context, parentContext));
4921 
4922 	memset(context, 0, sizeof(io_context));
4923 	context->ref_count = 1;
4924 
4925 	MutexLocker parentLocker;
4926 
4927 	size_t tableSize;
4928 	if (parentContext != NULL) {
4929 		parentLocker.SetTo(parentContext->io_mutex, false);
4930 		tableSize = parentContext->table_size;
4931 	} else
4932 		tableSize = DEFAULT_FD_TABLE_SIZE;
4933 
4934 	// allocate space for FDs and their close-on-exec flag
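	// A single allocation holds three consecutive arrays: the FD table, the
	// select_info table, and a bitmap with one close-on-exec bit per FD,
	// rounded up to whole bytes -- hence the "(tableSize + 7) / 8".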
4935 	context->fds = (file_descriptor**)malloc(
4936 		sizeof(struct file_descriptor*) * tableSize
4937 		+ sizeof(struct select_sync*) * tableSize
4938 		+ (tableSize + 7) / 8);
4939 	if (context->fds == NULL) {
4940 		free(context);
4941 		return NULL;
4942 	}
4943 
4944 	context->select_infos = (select_info**)(context->fds + tableSize);
4945 	context->fds_close_on_exec = (uint8*)(context->select_infos + tableSize);
4946 
4947 	memset(context->fds, 0, sizeof(struct file_descriptor*) * tableSize
4948 		+ sizeof(struct select_sync*) * tableSize
4949 		+ (tableSize + 7) / 8);
4950 
4951 	mutex_init(&context->io_mutex, "I/O context");
4952 
4953 	// Copy all parent file descriptors
4954 
4955 	if (parentContext != NULL) {
4956 		size_t i;
4957 
4958 		mutex_lock(&sIOContextRootLock);
4959 		context->root = parentContext->root;
4960 		if (context->root)
4961 			inc_vnode_ref_count(context->root);
4962 		mutex_unlock(&sIOContextRootLock);
4963 
4964 		context->cwd = parentContext->cwd;
4965 		if (context->cwd)
4966 			inc_vnode_ref_count(context->cwd);
4967 
4968 		if (parentContext->inherit_fds) {
4969 			for (i = 0; i < tableSize; i++) {
4970 				struct file_descriptor* descriptor = parentContext->fds[i];
4971 
4972 				if (descriptor != NULL
4973 					&& (descriptor->open_mode & O_DISCONNECTED) == 0) {
4974 					bool closeOnExec = fd_close_on_exec(parentContext, i);
4975 					if (closeOnExec && purgeCloseOnExec)
4976 						continue;
4977 
4978 					TFD(InheritFD(context, i, descriptor, parentContext));
4979 
4980 					context->fds[i] = descriptor;
4981 					context->num_used_fds++;
4982 					atomic_add(&descriptor->ref_count, 1);
4983 					atomic_add(&descriptor->open_count, 1);
4984 
4985 					if (closeOnExec)
4986 						fd_set_close_on_exec(context, i, true);
4987 				}
4988 			}
4989 		}
4990 
4991 		parentLocker.Unlock();
4992 	} else {
4993 		context->root = sRoot;
4994 		context->cwd = sRoot;
4995 
4996 		if (context->root)
4997 			inc_vnode_ref_count(context->root);
4998 
4999 		if (context->cwd)
5000 			inc_vnode_ref_count(context->cwd);
5001 	}
5002 
5003 	context->table_size = tableSize;
5004 	context->inherit_fds = parentContext != NULL;
5005 
5006 	list_init(&context->node_monitors);
5007 	context->max_monitors = DEFAULT_NODE_MONITORS;
5008 
5009 	return context;
5010 }
5011 
5012 
5013 void
5014 vfs_get_io_context(io_context* context)
5015 {
5016 	atomic_add(&context->ref_count, 1);
5017 }
5018 
5019 
5020 void
5021 vfs_put_io_context(io_context* context)
5022 {
5023 	if (atomic_add(&context->ref_count, -1) == 1)
5024 		free_io_context(context);
5025 }
5026 
5027 
5028 status_t
5029 vfs_resize_fd_table(struct io_context* context, uint32 newSize)
5030 {
5031 	if (newSize == 0 || newSize > MAX_FD_TABLE_SIZE)
5032 		return B_BAD_VALUE;
5033 
5034 	TIOC(ResizeIOContext(context, newSize));
5035 
5036 	MutexLocker _(context->io_mutex);
5037 
5038 	uint32 oldSize = context->table_size;
5039 	int oldCloseOnExitBitmapSize = (oldSize + 7) / 8;
5040 	int newCloseOnExitBitmapSize = (newSize + 7) / 8;
5041 
5042 	// If the tables shrink, make sure none of the fds being dropped are in use.
5043 	if (newSize < oldSize) {
5044 		for (uint32 i = oldSize; i-- > newSize;) {
5045 			if (context->fds[i])
5046 				return B_BUSY;
5047 		}
5048 	}
5049 
5050 	// store pointers to the old tables
5051 	file_descriptor** oldFDs = context->fds;
5052 	select_info** oldSelectInfos = context->select_infos;
5053 	uint8* oldCloseOnExecTable = context->fds_close_on_exec;
5054 
5055 	// allocate new tables
5056 	file_descriptor** newFDs = (file_descriptor**)malloc(
5057 		sizeof(struct file_descriptor*) * newSize
5058 		+ sizeof(struct select_sync*) * newSize
5059 		+ newCloseOnExitBitmapSize);
5060 	if (newFDs == NULL)
5061 		return B_NO_MEMORY;
5062 
5063 	context->fds = newFDs;
5064 	context->select_infos = (select_info**)(context->fds + newSize);
5065 	context->fds_close_on_exec = (uint8*)(context->select_infos + newSize);
5066 	context->table_size = newSize;
5067 
5068 	// copy entries from old tables
5069 	uint32 toCopy = min_c(oldSize, newSize);
5070 
5071 	memcpy(context->fds, oldFDs, sizeof(void*) * toCopy);
5072 	memcpy(context->select_infos, oldSelectInfos, sizeof(void*) * toCopy);
5073 	memcpy(context->fds_close_on_exec, oldCloseOnExecTable,
5074 		min_c(oldCloseOnExitBitmapSize, newCloseOnExitBitmapSize));
5075 
5076 	// clear additional entries, if the tables grow
5077 	if (newSize > oldSize) {
5078 		memset(context->fds + oldSize, 0, sizeof(void*) * (newSize - oldSize));
5079 		memset(context->select_infos + oldSize, 0,
5080 			sizeof(void*) * (newSize - oldSize));
5081 		memset(context->fds_close_on_exec + oldCloseOnExitBitmapSize, 0,
5082 			newCloseOnExitBitmapSize - oldCloseOnExitBitmapSize);
5083 	}
5084 
5085 	free(oldFDs);
5086 
5087 	return B_OK;
5088 }
5089 
5090 
5091 /*!	\brief Resolves a vnode to the vnode it is covered by, if any.
5092 
5093 	Given an arbitrary vnode (identified by mount and node ID), the function
5094 	checks whether the vnode is covered by another vnode. If it is, the
5095 	function returns the mount and node ID of the covering vnode. Otherwise
5096 	it simply returns the supplied mount and node ID.
5097 
5098 	In case of error (e.g. the supplied node could not be found) the variables
5099 	for storing the resolved mount and node ID remain untouched and an error
5100 	code is returned.
5101 
5102 	\param mountID The mount ID of the vnode in question.
5103 	\param nodeID The node ID of the vnode in question.
5104 	\param resolvedMountID Pointer to storage for the resolved mount ID.
5105 	\param resolvedNodeID Pointer to storage for the resolved node ID.
5106 	\return
5107 	- \c B_OK, if everything went fine,
5108 	- another error code, if something went wrong.
5109 */
5110 status_t
5111 vfs_resolve_vnode_to_covering_vnode(dev_t mountID, ino_t nodeID,
5112 	dev_t* resolvedMountID, ino_t* resolvedNodeID)
5113 {
5114 	// get the node
5115 	struct vnode* node;
5116 	status_t error = get_vnode(mountID, nodeID, &node, true, false);
5117 	if (error != B_OK)
5118 		return error;
5119 
5120 	// resolve the node
5121 	if (Vnode* coveringNode = get_covering_vnode(node)) {
5122 		put_vnode(node);
5123 		node = coveringNode;
5124 	}
5125 
5126 	// set the return values
5127 	*resolvedMountID = node->device;
5128 	*resolvedNodeID = node->id;
5129 
5130 	put_vnode(node);
5131 
5132 	return B_OK;
5133 }
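
/*	Example: for a directory that serves as a mount point this yields the
	root of the volume mounted on top of it; for an uncovered vnode the
	supplied IDs are returned unchanged:

		dev_t device;
		ino_t node;
		if (vfs_resolve_vnode_to_covering_vnode(mountID, nodeID, &device,
				&node) == B_OK) {
			// (device, node) now identify the visible (covering) vnode
		}
*/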
5134 
5135 
5136 status_t
5137 vfs_get_mount_point(dev_t mountID, dev_t* _mountPointMountID,
5138 	ino_t* _mountPointNodeID)
5139 {
5140 	ReadLocker nodeLocker(sVnodeLock);
5141 	MutexLocker mountLocker(sMountMutex);
5142 
5143 	struct fs_mount* mount = find_mount(mountID);
5144 	if (mount == NULL)
5145 		return B_BAD_VALUE;
5146 
5147 	Vnode* mountPoint = mount->covers_vnode;
5148 
5149 	*_mountPointMountID = mountPoint->device;
5150 	*_mountPointNodeID = mountPoint->id;
5151 
5152 	return B_OK;
5153 }
5154 
5155 
5156 status_t
5157 vfs_bind_mount_directory(dev_t mountID, ino_t nodeID, dev_t coveredMountID,
5158 	ino_t coveredNodeID)
5159 {
5160 	// get the vnodes
5161 	Vnode* vnode;
5162 	status_t error = get_vnode(mountID, nodeID, &vnode, true, false);
5163 	if (error != B_OK)
5164 		return B_BAD_VALUE;
5165 	VNodePutter vnodePutter(vnode);
5166 
5167 	Vnode* coveredVnode;
5168 	error = get_vnode(coveredMountID, coveredNodeID, &coveredVnode, true,
5169 		false);
5170 	if (error != B_OK)
5171 		return B_BAD_VALUE;
5172 	VNodePutter coveredVnodePutter(coveredVnode);
5173 
5174 	// establish the covered/covering links
5175 	WriteLocker locker(sVnodeLock);
5176 
5177 	if (vnode->covers != NULL || coveredVnode->covered_by != NULL
5178 		|| vnode->mount->unmounting || coveredVnode->mount->unmounting) {
5179 		return B_BUSY;
5180 	}
5181 
5182 	vnode->covers = coveredVnode;
5183 	vnode->SetCovering(true);
5184 
5185 	coveredVnode->covered_by = vnode;
5186 	coveredVnode->SetCovered(true);
5187 
5188 	// the vnodes do now reference each other
5189 	inc_vnode_ref_count(vnode);
5190 	inc_vnode_ref_count(coveredVnode);
5191 
5192 	return B_OK;
5193 }
5194 
5195 
5196 int
5197 vfs_getrlimit(int resource, struct rlimit* rlp)
5198 {
5199 	if (!rlp)
5200 		return B_BAD_ADDRESS;
5201 
5202 	switch (resource) {
5203 		case RLIMIT_NOFILE:
5204 		{
5205 			struct io_context* context = get_current_io_context(false);
5206 			MutexLocker _(context->io_mutex);
5207 
5208 			rlp->rlim_cur = context->table_size;
5209 			rlp->rlim_max = MAX_FD_TABLE_SIZE;
5210 			return 0;
5211 		}
5212 
5213 		case RLIMIT_NOVMON:
5214 		{
5215 			struct io_context* context = get_current_io_context(false);
5216 			MutexLocker _(context->io_mutex);
5217 
5218 			rlp->rlim_cur = context->max_monitors;
5219 			rlp->rlim_max = MAX_NODE_MONITORS;
5220 			return 0;
5221 		}
5222 
5223 		default:
5224 			return B_BAD_VALUE;
5225 	}
5226 }
5227 
5228 
5229 int
5230 vfs_setrlimit(int resource, const struct rlimit* rlp)
5231 {
5232 	if (!rlp)
5233 		return B_BAD_ADDRESS;
5234 
5235 	switch (resource) {
5236 		case RLIMIT_NOFILE:
5237 			/* TODO: check getuid() */
5238 			if (rlp->rlim_max != RLIM_SAVED_MAX
5239 				&& rlp->rlim_max != MAX_FD_TABLE_SIZE)
5240 				return B_NOT_ALLOWED;
5241 
5242 			return vfs_resize_fd_table(get_current_io_context(false),
5243 				rlp->rlim_cur);
5244 
5245 		case RLIMIT_NOVMON:
5246 			/* TODO: check getuid() */
5247 			if (rlp->rlim_max != RLIM_SAVED_MAX
5248 				&& rlp->rlim_max != MAX_NODE_MONITORS)
5249 				return B_NOT_ALLOWED;
5250 
5251 			return resize_monitor_table(get_current_io_context(false),
5252 				rlp->rlim_cur);
5253 
5254 		default:
5255 			return B_BAD_VALUE;
5256 	}
5257 }
5258 
5259 
5260 status_t
5261 vfs_init(kernel_args* args)
5262 {
5263 	vnode::StaticInit();
5264 
5265 	sVnodeTable = new(std::nothrow) VnodeTable();
5266 	if (sVnodeTable == NULL || sVnodeTable->Init(VNODE_HASH_TABLE_SIZE) != B_OK)
5267 		panic("vfs_init: error creating vnode hash table\n");
5268 
5269 	struct vnode dummy_vnode;
5270 	list_init_etc(&sUnusedVnodeList, offset_of_member(dummy_vnode, unused_link));
5271 
5272 	struct fs_mount dummyMount;
5273 	sMountsTable = new(std::nothrow) MountTable();
5274 	if (sMountsTable == NULL
5275 			|| sMountsTable->Init(MOUNTS_HASH_TABLE_SIZE) != B_OK)
5276 		panic("vfs_init: error creating mounts hash table\n");
5277 
5278 	node_monitor_init();
5279 
5280 	sRoot = NULL;
5281 
5282 	recursive_lock_init(&sMountOpLock, "vfs_mount_op_lock");
5283 
5284 	if (block_cache_init() != B_OK)
5285 		return B_ERROR;
5286 
5287 #ifdef ADD_DEBUGGER_COMMANDS
5288 	// add some debugger commands
5289 	add_debugger_command_etc("vnode", &dump_vnode,
5290 		"Print info about the specified vnode",
5291 		"[ \"-p\" ] ( <vnode> | <devID> <nodeID> )\n"
5292 		"Prints information about the vnode specified by address <vnode> or\n"
5293 		"<devID>, <vnodeID> pair. If \"-p\" is given, a path of the vnode is\n"
5294 		"constructed and printed. It might not be possible to construct a\n"
5295 		"complete path, though.\n",
5296 		0);
5297 	add_debugger_command("vnodes", &dump_vnodes,
5298 		"list all vnodes (from the specified device)");
5299 	add_debugger_command("vnode_caches", &dump_vnode_caches,
5300 		"list all vnode caches");
5301 	add_debugger_command("mount", &dump_mount,
5302 		"info about the specified fs_mount");
5303 	add_debugger_command("mounts", &dump_mounts, "list all fs_mounts");
5304 	add_debugger_command("io_context", &dump_io_context,
5305 		"info about the I/O context");
5306 	add_debugger_command("vnode_usage", &dump_vnode_usage,
5307 		"info about vnode usage");
5308 #endif
5309 
5310 	register_low_resource_handler(&vnode_low_resource_handler, NULL,
5311 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
5312 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
5313 		0);
5314 
5315 	fifo_init();
5316 	file_map_init();
5317 
5318 	return file_cache_init();
5319 }
5320 
5321 
5322 //	#pragma mark - fd_ops implementations
5323 
5324 
5325 /*!
5326 	Calls fs_open() on the given vnode and returns a new
5327 	file descriptor for it
5328 */
5329 static int
5330 open_vnode(struct vnode* vnode, int openMode, bool kernel)
5331 {
5332 	void* cookie;
5333 	status_t status = FS_CALL(vnode, open, openMode, &cookie);
5334 	if (status != B_OK)
5335 		return status;
5336 
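	// On success the new FD takes over the caller's vnode reference; on
	// failure only the cookie is cleaned up here -- the caller still owns
	// its reference (cf. file_open()).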
5337 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5338 	if (fd < 0) {
5339 		FS_CALL(vnode, close, cookie);
5340 		FS_CALL(vnode, free_cookie, cookie);
5341 	}
5342 	return fd;
5343 }
5344 
5345 
5346 /*!
5347 	Creates (or, unless O_EXCL is given, opens an existing) entry \a name
5348 	in \a directory and returns a new file descriptor for it
5349 */
5350 static int
5351 create_vnode(struct vnode* directory, const char* name, int openMode,
5352 	int perms, bool kernel)
5353 {
5354 	bool traverse = ((openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0);
5355 	status_t status = B_ERROR;
5356 	struct vnode* vnode;
5357 	void* cookie;
5358 	ino_t newID;
5359 
5360 	// This is somewhat tricky: If the entry already exists, the FS responsible
5361 	// for the directory might not necessarily also be the one responsible for
5362 	// the node the entry refers to (e.g. in case of mount points or FIFOs). So
5363 	// we can actually never call the create() hook without O_EXCL. Instead we
5364 	// try to look the entry up first. If it already exists, we just open the
5365 	// node (unless O_EXCL), otherwise we call create() with O_EXCL. This
5366 	// introduces a race condition, since someone else might have created the
5367 	// entry in the meantime. We hope the respective FS returns the correct
5368 	// error code, in which case we retry (up to 3 times).
5369 
5370 	for (int i = 0; i < 3 && status != B_OK; i++) {
5371 		// look the node up
5372 		status = lookup_dir_entry(directory, name, &vnode);
5373 		if (status == B_OK) {
5374 			VNodePutter putter(vnode);
5375 
5376 			if ((openMode & O_EXCL) != 0)
5377 				return B_FILE_EXISTS;
5378 
5379 			// If the node is a symlink, we have to follow it, unless
5380 			// O_NOTRAVERSE is set.
5381 			if (S_ISLNK(vnode->Type()) && traverse) {
5382 				putter.Put();
5383 				char clonedName[B_FILE_NAME_LENGTH + 1];
5384 				if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH)
5385 						>= B_FILE_NAME_LENGTH) {
5386 					return B_NAME_TOO_LONG;
5387 				}
5388 
5389 				inc_vnode_ref_count(directory);
5390 				status = vnode_path_to_vnode(directory, clonedName, true, 0,
5391 					kernel, &vnode, NULL);
5392 				if (status != B_OK)
5393 					return status;
5394 
5395 				putter.SetTo(vnode);
5396 			}
5397 
5398 			if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5399 				return B_LINK_LIMIT;
5400 
5401 			int fd = open_vnode(vnode, openMode & ~O_CREAT, kernel);
5402 			// on success keep the vnode reference for the FD
5403 			if (fd >= 0)
5404 				putter.Detach();
5405 
5406 			return fd;
5407 		}
5408 
5409 		// it doesn't exist yet -- try to create it
5410 
5411 		if (!HAS_FS_CALL(directory, create))
5412 			return B_READ_ONLY_DEVICE;
5413 
5414 		status = FS_CALL(directory, create, name, openMode | O_EXCL, perms,
5415 			&cookie, &newID);
5416 		if (status != B_OK
5417 			&& ((openMode & O_EXCL) != 0 || status != B_FILE_EXISTS)) {
5418 			return status;
5419 		}
5420 	}
5421 
5422 	if (status != B_OK)
5423 		return status;
5424 
5425 	// the node has been created successfully
5426 
5427 	rw_lock_read_lock(&sVnodeLock);
5428 	vnode = lookup_vnode(directory->device, newID);
5429 	rw_lock_read_unlock(&sVnodeLock);
5430 
5431 	if (vnode == NULL) {
5432 		panic("vfs: fs_create() returned success but there is no vnode, "
5433 			"mount ID %" B_PRIdDEV "!\n", directory->device);
5434 		return B_BAD_VALUE;
5435 	}
5436 
5437 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5438 	if (fd >= 0)
5439 		return fd;
5440 
5441 	status = fd;
5442 
5443 	// something went wrong, clean up
5444 
5445 	FS_CALL(vnode, close, cookie);
5446 	FS_CALL(vnode, free_cookie, cookie);
5447 	put_vnode(vnode);
5448 
5449 	FS_CALL(directory, unlink, name);
5450 
5451 	return status;
5452 }
5453 
5454 
5455 /*! Calls fs open_dir() on the given vnode and returns a new
5456 	file descriptor for it
5457 */
5458 static int
5459 open_dir_vnode(struct vnode* vnode, bool kernel)
5460 {
5461 	void* cookie;
5462 	status_t status = FS_CALL(vnode, open_dir, &cookie);
5463 	if (status != B_OK)
5464 		return status;
5465 
5466 	// directory is opened, create a fd
5467 	status = get_new_fd(FDTYPE_DIR, NULL, vnode, cookie, O_CLOEXEC, kernel);
5468 	if (status >= 0)
5469 		return status;
5470 
5471 	FS_CALL(vnode, close_dir, cookie);
5472 	FS_CALL(vnode, free_dir_cookie, cookie);
5473 
5474 	return status;
5475 }
5476 
5477 
5478 /*! Calls fs open_attr_dir() on the given vnode and returns a new
5479 	file descriptor for it.
5480 	Used by attr_dir_open(), and attr_dir_open_fd().
5481 */
5482 static int
5483 open_attr_dir_vnode(struct vnode* vnode, bool kernel)
5484 {
5485 	if (!HAS_FS_CALL(vnode, open_attr_dir))
5486 		return B_UNSUPPORTED;
5487 
5488 	void* cookie;
5489 	status_t status = FS_CALL(vnode, open_attr_dir, &cookie);
5490 	if (status != B_OK)
5491 		return status;
5492 
5493 	// directory is opened, create a fd
5494 	status = get_new_fd(FDTYPE_ATTR_DIR, NULL, vnode, cookie, O_CLOEXEC,
5495 		kernel);
5496 	if (status >= 0)
5497 		return status;
5498 
5499 	FS_CALL(vnode, close_attr_dir, cookie);
5500 	FS_CALL(vnode, free_attr_dir_cookie, cookie);
5501 
5502 	return status;
5503 }
5504 
5505 
5506 static int
5507 file_create_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5508 	int openMode, int perms, bool kernel)
5509 {
5510 	FUNCTION(("file_create_entry_ref: name = '%s', omode %x, perms %d, "
5511 		"kernel %d\n", name, openMode, perms, kernel));
5512 
5513 	// get directory to put the new file in
5514 	struct vnode* directory;
5515 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
5516 	if (status != B_OK)
5517 		return status;
5518 
5519 	status = create_vnode(directory, name, openMode, perms, kernel);
5520 	put_vnode(directory);
5521 
5522 	return status;
5523 }
5524 
5525 
5526 static int
5527 file_create(int fd, char* path, int openMode, int perms, bool kernel)
5528 {
5529 	FUNCTION(("file_create: path '%s', omode %x, perms %d, kernel %d\n", path,
5530 		openMode, perms, kernel));
5531 
5532 	// get directory to put the new file in
5533 	char name[B_FILE_NAME_LENGTH];
5534 	struct vnode* directory;
5535 	status_t status = fd_and_path_to_dir_vnode(fd, path, &directory, name,
5536 		kernel);
5537 	if (status < 0)
5538 		return status;
5539 
5540 	status = create_vnode(directory, name, openMode, perms, kernel);
5541 
5542 	put_vnode(directory);
5543 	return status;
5544 }
5545 
5546 
5547 static int
5548 file_open_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5549 	int openMode, bool kernel)
5550 {
5551 	if (name == NULL || *name == '\0')
5552 		return B_BAD_VALUE;
5553 
5554 	FUNCTION(("file_open_entry_ref(ref = (%" B_PRId32 ", %" B_PRId64 ", %s), "
5555 		"openMode = %d)\n", mountID, directoryID, name, openMode));
5556 
5557 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5558 
5559 	// get the vnode matching the entry_ref
5560 	struct vnode* vnode;
5561 	status_t status = entry_ref_to_vnode(mountID, directoryID, name, traverse,
5562 		kernel, &vnode);
5563 	if (status != B_OK)
5564 		return status;
5565 
5566 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5567 		put_vnode(vnode);
5568 		return B_LINK_LIMIT;
5569 	}
5570 
5571 	int newFD = open_vnode(vnode, openMode, kernel);
5572 	if (newFD >= 0) {
5573 		// The vnode reference has been transferred to the FD
5574 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache, mountID,
5575 			directoryID, vnode->id, name);
5576 	} else
5577 		put_vnode(vnode);
5578 
5579 	return newFD;
5580 }
5581 
5582 
5583 static int
5584 file_open(int fd, char* path, int openMode, bool kernel)
5585 {
5586 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5587 
5588 	FUNCTION(("file_open: fd: %d, entry path = '%s', omode %d, kernel %d\n",
5589 		fd, path, openMode, kernel));
5590 
5591 	// get the vnode matching the vnode + path combination
5592 	struct vnode* vnode;
5593 	ino_t parentID;
5594 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode,
5595 		&parentID, kernel);
5596 	if (status != B_OK)
5597 		return status;
5598 
5599 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5600 		put_vnode(vnode);
5601 		return B_LINK_LIMIT;
5602 	}
5603 
5604 	// open the vnode
5605 	int newFD = open_vnode(vnode, openMode, kernel);
5606 	if (newFD >= 0) {
5607 		// The vnode reference has been transferred to the FD
5608 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache,
5609 			vnode->device, parentID, vnode->id, NULL);
5610 	} else
5611 		put_vnode(vnode);
5612 
5613 	return newFD;
5614 }
5615 
5616 
5617 static status_t
5618 file_close(struct file_descriptor* descriptor)
5619 {
5620 	struct vnode* vnode = descriptor->u.vnode;
5621 	status_t status = B_OK;
5622 
5623 	FUNCTION(("file_close(descriptor = %p)\n", descriptor));
5624 
5625 	cache_node_closed(vnode, FDTYPE_FILE, vnode->cache, vnode->device,
5626 		vnode->id);
5627 	if (HAS_FS_CALL(vnode, close)) {
5628 		status = FS_CALL(vnode, close, descriptor->cookie);
5629 	}
5630 
5631 	if (status == B_OK) {
5632 		// remove all outstanding locks for this team
5633 		if (HAS_FS_CALL(vnode, release_lock))
5634 			status = FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
5635 		else
5636 			status = release_advisory_lock(vnode, NULL);
5637 	}
5638 	return status;
5639 }
5640 
5641 
5642 static void
5643 file_free_fd(struct file_descriptor* descriptor)
5644 {
5645 	struct vnode* vnode = descriptor->u.vnode;
5646 
5647 	if (vnode != NULL) {
5648 		FS_CALL(vnode, free_cookie, descriptor->cookie);
5649 		put_vnode(vnode);
5650 	}
5651 }
5652 
5653 
5654 static status_t
5655 file_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
5656 	size_t* length)
5657 {
5658 	struct vnode* vnode = descriptor->u.vnode;
5659 	FUNCTION(("file_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
5660 		pos, length, *length));
5661 
5662 	if (S_ISDIR(vnode->Type()))
5663 		return B_IS_A_DIRECTORY;
5664 
5665 	return FS_CALL(vnode, read, descriptor->cookie, pos, buffer, length);
5666 }
5667 
5668 
5669 static status_t
5670 file_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
5671 	size_t* length)
5672 {
5673 	struct vnode* vnode = descriptor->u.vnode;
5674 	FUNCTION(("file_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
5675 		length));
5676 
5677 	if (S_ISDIR(vnode->Type()))
5678 		return B_IS_A_DIRECTORY;
5679 	if (!HAS_FS_CALL(vnode, write))
5680 		return B_READ_ONLY_DEVICE;
5681 
5682 	return FS_CALL(vnode, write, descriptor->cookie, pos, buffer, length);
5683 }
5684 
5685 
5686 static off_t
5687 file_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
5688 {
5689 	struct vnode* vnode = descriptor->u.vnode;
5690 	off_t offset;
5691 	bool isDevice = false;
5692 
5693 	FUNCTION(("file_seek(pos = %" B_PRIdOFF ", seekType = %d)\n", pos,
5694 		seekType));
5695 
5696 	// some kinds of files are not seekable
5697 	switch (vnode->Type() & S_IFMT) {
5698 		case S_IFIFO:
5699 		case S_IFSOCK:
5700 			return ESPIPE;
5701 
5702 		// drivers publish block devices as character devices, so handle both
5703 		case S_IFBLK:
5704 		case S_IFCHR:
5705 			isDevice = true;
5706 			break;
5707 		// The Open Group Base Specs don't single out any file types besides
5708 		// pipes, FIFOs, and sockets, so we allow seeking all the others.
5709 		case S_IFREG:
5710 		case S_IFDIR:
5711 		case S_IFLNK:
5712 			break;
5713 	}
5714 
5715 	switch (seekType) {
5716 		case SEEK_SET:
5717 			offset = 0;
5718 			break;
5719 		case SEEK_CUR:
5720 			offset = descriptor->pos;
5721 			break;
5722 		case SEEK_END:
5723 		{
5724 			// stat() the node
5725 			if (!HAS_FS_CALL(vnode, read_stat))
5726 				return B_UNSUPPORTED;
5727 
5728 			struct stat stat;
5729 			status_t status = FS_CALL(vnode, read_stat, &stat);
5730 			if (status != B_OK)
5731 				return status;
5732 
5733 			offset = stat.st_size;
5734 
5735 			if (offset == 0 && isDevice) {
5736 				// stat() on regular drivers doesn't report size
5737 				device_geometry geometry;
5738 
5739 				if (HAS_FS_CALL(vnode, ioctl)) {
5740 					status = FS_CALL(vnode, ioctl, descriptor->cookie,
5741 						B_GET_GEOMETRY, &geometry, sizeof(geometry));
5742 					if (status == B_OK)
5743 						offset = (off_t)geometry.bytes_per_sector
5744 							* geometry.sectors_per_track
5745 							* geometry.cylinder_count
5746 							* geometry.head_count;
5747 				}
5748 			}
5749 
5750 			break;
5751 		}
5752 		default:
5753 			return B_BAD_VALUE;
5754 	}
5755 
5756 	// assumes off_t is 64 bits wide
5757 	if (offset > 0 && LONGLONG_MAX - offset < pos)
5758 		return B_BUFFER_OVERFLOW;
5759 
5760 	pos += offset;
5761 	if (pos < 0)
5762 		return B_BAD_VALUE;
5763 
5764 	return descriptor->pos = pos;
5765 }
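
/*!	SEEK_END on device descriptors cannot rely on st_size, which is why
	file_seek() falls back to B_GET_GEOMETRY above. A minimal userland
	sketch (the device path is an assumption for illustration):

		int deviceFD = open("/dev/disk/virtual/0/raw", O_RDONLY);
		off_t capacity = lseek(deviceFD, 0, SEEK_END);
			// capacity == bytes_per_sector * sectors_per_track
			//	* cylinder_count * head_count
*/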
5766 
5767 
5768 static status_t
5769 file_select(struct file_descriptor* descriptor, uint8 event,
5770 	struct selectsync* sync)
5771 {
5772 	FUNCTION(("file_select(%p, %u, %p)\n", descriptor, event, sync));
5773 
5774 	struct vnode* vnode = descriptor->u.vnode;
5775 
5776 	// If the FS has no select() hook, notify select() now.
5777 	if (!HAS_FS_CALL(vnode, select)) {
5778 		if (!SELECT_TYPE_IS_OUTPUT_ONLY(event))
5779 			return notify_select_event(sync, event);
5780 		else
5781 			return B_OK;
5782 	}
5783 
5784 	return FS_CALL(vnode, select, descriptor->cookie, event, sync);
5785 }
5786 
5787 
5788 static status_t
5789 file_deselect(struct file_descriptor* descriptor, uint8 event,
5790 	struct selectsync* sync)
5791 {
5792 	struct vnode* vnode = descriptor->u.vnode;
5793 
5794 	if (!HAS_FS_CALL(vnode, deselect))
5795 		return B_OK;
5796 
5797 	return FS_CALL(vnode, deselect, descriptor->cookie, event, sync);
5798 }
5799 
5800 
5801 static status_t
5802 dir_create_entry_ref(dev_t mountID, ino_t parentID, const char* name, int perms,
5803 	bool kernel)
5804 {
5805 	struct vnode* vnode;
5806 	status_t status;
5807 
5808 	if (name == NULL || *name == '\0')
5809 		return B_BAD_VALUE;
5810 
5811 	FUNCTION(("dir_create_entry_ref(dev = %" B_PRId32 ", ino = %" B_PRId64 ", "
5812 		"name = '%s', perms = %d)\n", mountID, parentID, name, perms));
5813 
5814 	status = get_vnode(mountID, parentID, &vnode, true, false);
5815 	if (status != B_OK)
5816 		return status;
5817 
5818 	if (HAS_FS_CALL(vnode, create_dir))
5819 		status = FS_CALL(vnode, create_dir, name, perms);
5820 	else
5821 		status = B_READ_ONLY_DEVICE;
5822 
5823 	put_vnode(vnode);
5824 	return status;
5825 }
5826 
5827 
5828 static status_t
5829 dir_create(int fd, char* path, int perms, bool kernel)
5830 {
5831 	char filename[B_FILE_NAME_LENGTH];
5832 	struct vnode* vnode;
5833 	status_t status;
5834 
5835 	FUNCTION(("dir_create: path '%s', perms %d, kernel %d\n", path, perms,
5836 		kernel));
5837 
5838 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
5839 	if (status < 0)
5840 		return status;
5841 
5842 	if (HAS_FS_CALL(vnode, create_dir)) {
5843 		status = FS_CALL(vnode, create_dir, filename, perms);
5844 	} else
5845 		status = B_READ_ONLY_DEVICE;
5846 
5847 	put_vnode(vnode);
5848 	return status;
5849 }
5850 
5851 
5852 static int
5853 dir_open_entry_ref(dev_t mountID, ino_t parentID, const char* name, bool kernel)
5854 {
5855 	FUNCTION(("dir_open_entry_ref()\n"));
5856 
5857 	if (name && name[0] == '\0')
5858 		return B_BAD_VALUE;
5859 
5860 	// get the vnode matching the entry_ref/node_ref
5861 	struct vnode* vnode;
5862 	status_t status;
5863 	if (name) {
5864 		status = entry_ref_to_vnode(mountID, parentID, name, true, kernel,
5865 			&vnode);
5866 	} else
5867 		status = get_vnode(mountID, parentID, &vnode, true, false);
5868 	if (status != B_OK)
5869 		return status;
5870 
5871 	int newFD = open_dir_vnode(vnode, kernel);
5872 	if (newFD >= 0) {
5873 		// The vnode reference has been transferred to the FD
5874 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, mountID, parentID,
5875 			vnode->id, name);
5876 	} else
5877 		put_vnode(vnode);
5878 
5879 	return newFD;
5880 }
5881 
5882 
5883 static int
5884 dir_open(int fd, char* path, bool kernel)
5885 {
5886 	FUNCTION(("dir_open: fd: %d, entry path = '%s', kernel %d\n", fd, path,
5887 		kernel));
5888 
5889 	// get the vnode matching the FD + path combination
5890 	struct vnode* vnode = NULL;
5891 	ino_t parentID;
5892 	status_t status = fd_and_path_to_vnode(fd, path, true, &vnode, &parentID,
5893 		kernel);
5894 	if (status != B_OK)
5895 		return status;
5896 
5897 	// open the dir
5898 	int newFD = open_dir_vnode(vnode, kernel);
5899 	if (newFD >= 0) {
5900 		// The vnode reference has been transferred to the FD
5901 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
5902 			parentID, vnode->id, NULL);
5903 	} else
5904 		put_vnode(vnode);
5905 
5906 	return newFD;
5907 }
5908 
5909 
5910 static status_t
5911 dir_close(struct file_descriptor* descriptor)
5912 {
5913 	struct vnode* vnode = descriptor->u.vnode;
5914 
5915 	FUNCTION(("dir_close(descriptor = %p)\n", descriptor));
5916 
5917 	cache_node_closed(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
5918 		vnode->id);
5919 	if (HAS_FS_CALL(vnode, close_dir))
5920 		return FS_CALL(vnode, close_dir, descriptor->cookie);
5921 
5922 	return B_OK;
5923 }
5924 
5925 
5926 static void
5927 dir_free_fd(struct file_descriptor* descriptor)
5928 {
5929 	struct vnode* vnode = descriptor->u.vnode;
5930 
5931 	if (vnode != NULL) {
5932 		FS_CALL(vnode, free_dir_cookie, descriptor->cookie);
5933 		put_vnode(vnode);
5934 	}
5935 }
5936 
5937 
5938 static status_t
5939 dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
5940 	struct dirent* buffer, size_t bufferSize, uint32* _count)
5941 {
5942 	return dir_read(ioContext, descriptor->u.vnode, descriptor->cookie, buffer,
5943 		bufferSize, _count);
5944 }
5945 
5946 
5947 static status_t
5948 fix_dirent(struct vnode* parent, struct dirent* entry,
5949 	struct io_context* ioContext)
5950 {
5951 	// set d_pdev and d_pino
5952 	entry->d_pdev = parent->device;
5953 	entry->d_pino = parent->id;
5954 
5955 	// If this is the ".." entry and the directory is covering another
5956 	// vnode, we need to replace d_dev and d_ino with the actual values.
5957 	if (strcmp(entry->d_name, "..") == 0 && parent->IsCovering()) {
5958 		return resolve_covered_parent(parent, &entry->d_dev, &entry->d_ino,
5959 			ioContext);
5960 	}
5961 
5962 	// resolve covered vnodes
5963 	ReadLocker _(&sVnodeLock);
5964 
5965 	struct vnode* vnode = lookup_vnode(entry->d_dev, entry->d_ino);
5966 	if (vnode != NULL && vnode->covered_by != NULL) {
5967 		do {
5968 			vnode = vnode->covered_by;
5969 		} while (vnode->covered_by != NULL);
5970 
5971 		entry->d_dev = vnode->device;
5972 		entry->d_ino = vnode->id;
5973 	}
5974 
5975 	return B_OK;
5976 }
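
/*!	What the resolution above achieves: if an entry in a directory listing
	is itself a mount point, lookup_vnode() returns the covered directory,
	and walking its covered_by chain to the top yields the root vnode of
	the volume mounted there - the device/inode pair that a subsequent
	stat() on the entry would report.
*/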
5977 
5978 
5979 static status_t
5980 dir_read(struct io_context* ioContext, struct vnode* vnode, void* cookie,
5981 	struct dirent* buffer, size_t bufferSize, uint32* _count)
5982 {
5983 	if (!HAS_FS_CALL(vnode, read_dir))
5984 		return B_UNSUPPORTED;
5985 
5986 	status_t error = FS_CALL(vnode, read_dir, cookie, buffer, bufferSize,
5987 		_count);
5988 	if (error != B_OK)
5989 		return error;
5990 
5991 	// we need to adjust the read dirents
5992 	uint32 count = *_count;
5993 	for (uint32 i = 0; i < count; i++) {
5994 		error = fix_dirent(vnode, buffer, ioContext);
5995 		if (error != B_OK)
5996 			return error;
5997 
5998 		buffer = (struct dirent*)((uint8*)buffer + buffer->d_reclen);
5999 	}
6000 
6001 	return error;
6002 }
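
/*!	read_dir() fills the buffer with a packed sequence of variable-length
	records; the fix-up loop above advances by d_reclen exactly the way a
	consumer of the buffer would (sketch):

		struct dirent* entry = buffer;
		for (uint32 i = 0; i < count; i++) {
			// ... use entry->d_dev, entry->d_ino, entry->d_name ...
			entry = (struct dirent*)((uint8*)entry + entry->d_reclen);
		}
*/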
6003 
6004 
6005 static status_t
6006 dir_rewind(struct file_descriptor* descriptor)
6007 {
6008 	struct vnode* vnode = descriptor->u.vnode;
6009 
6010 	if (HAS_FS_CALL(vnode, rewind_dir)) {
6011 		return FS_CALL(vnode, rewind_dir, descriptor->cookie);
6012 	}
6013 
6014 	return B_UNSUPPORTED;
6015 }
6016 
6017 
6018 static status_t
6019 dir_remove(int fd, char* path, bool kernel)
6020 {
6021 	char name[B_FILE_NAME_LENGTH];
6022 	struct vnode* directory;
6023 	status_t status;
6024 
6025 	if (path != NULL) {
6026 		// we need to make sure our path name doesn't end with "/", ".",
6027 		// or ".."
6028 		char* lastSlash;
6029 		while ((lastSlash = strrchr(path, '/')) != NULL) {
6030 			char* leaf = lastSlash + 1;
6031 			if (!strcmp(leaf, ".."))
6032 				return B_NOT_ALLOWED;
6033 
6034 			// omit multiple slashes
6035 			while (lastSlash > path && lastSlash[-1] == '/')
6036 				lastSlash--;
6037 
6038 			if (leaf[0] != '\0'
6039 				&& strcmp(leaf, ".") != 0) {
6040 				break;
6041 			}
6042 			// "name/" -> "name", or "name/." -> "name"
6043 			lastSlash[0] = '\0';
6044 		}
6045 
6046 		if (!strcmp(path, ".") || !strcmp(path, ".."))
6047 			return B_NOT_ALLOWED;
6048 	}
6049 
6050 	status = fd_and_path_to_dir_vnode(fd, path, &directory, name, kernel);
6051 	if (status != B_OK)
6052 		return status;
6053 
6054 	if (HAS_FS_CALL(directory, remove_dir))
6055 		status = FS_CALL(directory, remove_dir, name);
6056 	else
6057 		status = B_READ_ONLY_DEVICE;
6058 
6059 	put_vnode(directory);
6060 	return status;
6061 }
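
/*!	Examples of the trailing-component normalization performed by
	dir_remove() above (sketch):

		"foo/bar/"   -> "foo/bar"
		"foo/bar//." -> "foo/bar"
		"foo/bar/.." -> rejected with B_NOT_ALLOWED
		"." or ".."  -> rejected with B_NOT_ALLOWED
*/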
6062 
6063 
6064 static status_t
6065 common_ioctl(struct file_descriptor* descriptor, ulong op, void* buffer,
6066 	size_t length)
6067 {
6068 	struct vnode* vnode = descriptor->u.vnode;
6069 
6070 	if (HAS_FS_CALL(vnode, ioctl))
6071 		return FS_CALL(vnode, ioctl, descriptor->cookie, op, buffer, length);
6072 
6073 	return B_DEV_INVALID_IOCTL;
6074 }
6075 
6076 
6077 static status_t
6078 common_fcntl(int fd, int op, size_t argument, bool kernel)
6079 {
6080 	struct flock flock;
6081 
6082 	FUNCTION(("common_fcntl(fd = %d, op = %d, argument = %lx, %s)\n",
6083 		fd, op, argument, kernel ? "kernel" : "user"));
6084 
6085 	struct file_descriptor* descriptor = get_fd(get_current_io_context(kernel),
6086 		fd);
6087 	if (descriptor == NULL)
6088 		return B_FILE_ERROR;
6089 
6090 	struct vnode* vnode = fd_vnode(descriptor);
6091 
6092 	status_t status = B_OK;
6093 
6094 	if (op == F_SETLK || op == F_SETLKW || op == F_GETLK) {
6095 		if (descriptor->type != FDTYPE_FILE)
6096 			status = B_BAD_VALUE;
6097 		else if (user_memcpy(&flock, (struct flock*)argument,
6098 				sizeof(struct flock)) != B_OK)
6099 			status = B_BAD_ADDRESS;
6100 
6101 		if (status != B_OK) {
6102 			put_fd(descriptor);
6103 			return status;
6104 		}
6105 	}
6106 
6107 	switch (op) {
6108 		case F_SETFD:
6109 		{
6110 			struct io_context* context = get_current_io_context(kernel);
6111 			// Set file descriptor flags
6112 
6113 			// FD_CLOEXEC is the only flag available at this time
6114 			mutex_lock(&context->io_mutex);
6115 			fd_set_close_on_exec(context, fd, (argument & FD_CLOEXEC) != 0);
6116 			mutex_unlock(&context->io_mutex);
6117 
6118 			status = B_OK;
6119 			break;
6120 		}
6121 
6122 		case F_GETFD:
6123 		{
6124 			struct io_context* context = get_current_io_context(kernel);
6125 
6126 			// Get file descriptor flags
6127 			mutex_lock(&context->io_mutex);
6128 			status = fd_close_on_exec(context, fd) ? FD_CLOEXEC : 0;
6129 			mutex_unlock(&context->io_mutex);
6130 			break;
6131 		}
6132 
6133 		case F_SETFL:
6134 			// Set file descriptor open mode
6135 
6136 			// we only accept changes to O_APPEND and O_NONBLOCK
6137 			argument &= O_APPEND | O_NONBLOCK;
6138 			if (descriptor->ops->fd_set_flags != NULL) {
6139 				status = descriptor->ops->fd_set_flags(descriptor, argument);
6140 			} else if (vnode != NULL && HAS_FS_CALL(vnode, set_flags)) {
6141 				status = FS_CALL(vnode, set_flags, descriptor->cookie,
6142 					(int)argument);
6143 			} else
6144 				status = B_UNSUPPORTED;
6145 
6146 			if (status == B_OK) {
6147 				// update this descriptor's open_mode field
6148 				descriptor->open_mode = (descriptor->open_mode
6149 					& ~(O_APPEND | O_NONBLOCK)) | argument;
6150 			}
6151 
6152 			break;
6153 
6154 		case F_GETFL:
6155 			// Get file descriptor open mode
6156 			status = descriptor->open_mode;
6157 			break;
6158 
6159 		case F_DUPFD:
6160 		case F_DUPFD_CLOEXEC:
6161 		{
6162 			struct io_context* context = get_current_io_context(kernel);
6163 
6164 			status = new_fd_etc(context, descriptor, (int)argument);
6165 			if (status >= 0) {
6166 				mutex_lock(&context->io_mutex);
6167 				fd_set_close_on_exec(context, fd, op == F_DUPFD_CLOEXEC);
6168 				mutex_unlock(&context->io_mutex);
6169 
6170 				atomic_add(&descriptor->ref_count, 1);
6171 			}
6172 			break;
6173 		}
6174 
6175 		case F_GETLK:
6176 			if (vnode != NULL) {
6177 				struct flock normalizedLock;
6178 
6179 				memcpy(&normalizedLock, &flock, sizeof(struct flock));
6180 				status = normalize_flock(descriptor, &normalizedLock);
6181 				if (status != B_OK)
6182 					break;
6183 
6184 				if (HAS_FS_CALL(vnode, test_lock)) {
6185 					status = FS_CALL(vnode, test_lock, descriptor->cookie,
6186 						&normalizedLock);
6187 				} else
6188 					status = test_advisory_lock(vnode, &normalizedLock);
6189 				if (status == B_OK) {
6190 					if (normalizedLock.l_type == F_UNLCK) {
6191 						// no conflicting lock found, copy back the same struct
6192 						// we were given except change type to F_UNLCK
6193 						flock.l_type = F_UNLCK;
6194 						status = user_memcpy((struct flock*)argument, &flock,
6195 							sizeof(struct flock));
6196 					} else {
6197 						// a conflicting lock was found, copy back its range and
6198 						// type
6199 						if (normalizedLock.l_len == OFF_MAX)
6200 							normalizedLock.l_len = 0;
6201 
6202 						status = user_memcpy((struct flock*)argument,
6203 							&normalizedLock, sizeof(struct flock));
6204 					}
6205 				}
6206 			} else
6207 				status = B_BAD_VALUE;
6208 			break;
6209 
6210 		case F_SETLK:
6211 		case F_SETLKW:
6212 			status = normalize_flock(descriptor, &flock);
6213 			if (status != B_OK)
6214 				break;
6215 
6216 			if (vnode == NULL) {
6217 				status = B_BAD_VALUE;
6218 			} else if (flock.l_type == F_UNLCK) {
6219 				if (HAS_FS_CALL(vnode, release_lock)) {
6220 					status = FS_CALL(vnode, release_lock, descriptor->cookie,
6221 						&flock);
6222 				} else
6223 					status = release_advisory_lock(vnode, &flock);
6224 			} else {
6225 				// the open mode must match the lock type
6226 				if (((descriptor->open_mode & O_RWMASK) == O_RDONLY
6227 						&& flock.l_type == F_WRLCK)
6228 					|| ((descriptor->open_mode & O_RWMASK) == O_WRONLY
6229 						&& flock.l_type == F_RDLCK))
6230 					status = B_FILE_ERROR;
6231 				else {
6232 					if (HAS_FS_CALL(vnode, acquire_lock)) {
6233 						status = FS_CALL(vnode, acquire_lock,
6234 							descriptor->cookie, &flock, op == F_SETLKW);
6235 					} else {
6236 						status = acquire_advisory_lock(vnode, -1,
6237 							&flock, op == F_SETLKW);
6238 					}
6239 				}
6240 			}
6241 			break;
6242 
6243 		// ToDo: add support for more ops?
6244 
6245 		default:
6246 			status = B_BAD_VALUE;
6247 	}
6248 
6249 	put_fd(descriptor);
6250 	return status;
6251 }
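
/*!	A minimal userland sketch of the F_SETLK/F_SETLKW path above (the
	descriptor "fd" and the locked range are assumptions for illustration):

		struct flock lock = {};
		lock.l_type = F_WRLCK;		// requires an FD opened for writing
		lock.l_whence = SEEK_SET;
		lock.l_start = 0;
		lock.l_len = 0;				// 0 extends the lock to the end of file
		int result = fcntl(fd, F_SETLKW, &lock);
			// blocks until the FS hook or acquire_advisory_lock() grants
			// the lock; 0 on success
*/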
6252 
6253 
6254 static status_t
6255 common_sync(int fd, bool kernel)
6256 {
6257 	struct file_descriptor* descriptor;
6258 	struct vnode* vnode;
6259 	status_t status;
6260 
6261 	FUNCTION(("common_sync: entry. fd %d kernel %d\n", fd, kernel));
6262 
6263 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6264 	if (descriptor == NULL)
6265 		return B_FILE_ERROR;
6266 
6267 	if (HAS_FS_CALL(vnode, fsync))
6268 		status = FS_CALL_NO_PARAMS(vnode, fsync);
6269 	else
6270 		status = B_UNSUPPORTED;
6271 
6272 	put_fd(descriptor);
6273 	return status;
6274 }
6275 
6276 
6277 static status_t
6278 common_lock_node(int fd, bool kernel)
6279 {
6280 	struct file_descriptor* descriptor;
6281 	struct vnode* vnode;
6282 
6283 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6284 	if (descriptor == NULL)
6285 		return B_FILE_ERROR;
6286 
6287 	status_t status = B_OK;
6288 
6289 	// We need to set the locking atomically - someone
6290 	// else might set one at the same time
6291 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by, descriptor,
6292 			(file_descriptor*)NULL) != NULL)
6293 		status = B_BUSY;
6294 
6295 	put_fd(descriptor);
6296 	return status;
6297 }
6298 
6299 
6300 static status_t
6301 common_unlock_node(int fd, bool kernel)
6302 {
6303 	struct file_descriptor* descriptor;
6304 	struct vnode* vnode;
6305 
6306 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6307 	if (descriptor == NULL)
6308 		return B_FILE_ERROR;
6309 
6310 	status_t status = B_OK;
6311 
6312 	// We need to clear the lock atomically - someone
6313 	// else might set one at the same time
6314 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
6315 			(file_descriptor*)NULL, descriptor) != descriptor)
6316 		status = B_BAD_VALUE;
6317 
6318 	put_fd(descriptor);
6319 	return status;
6320 }
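
/*!	Both helpers above hinge on a single compare-and-swap on
	vnode::mandatory_locked_by: locking succeeds only if the field was NULL,
	unlocking only if it still holds the caller's descriptor. The equivalent
	logic, ignoring atomicity (sketch):

		// lock:	if (locked_by != NULL) return B_BUSY;
		//			locked_by = descriptor;
		// unlock:	if (locked_by != descriptor) return B_BAD_VALUE;
		//			locked_by = NULL;
*/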
6321 
6322 
6323 static status_t
6324 common_read_link(int fd, char* path, char* buffer, size_t* _bufferSize,
6325 	bool kernel)
6326 {
6327 	struct vnode* vnode;
6328 	status_t status;
6329 
6330 	status = fd_and_path_to_vnode(fd, path, false, &vnode, NULL, kernel);
6331 	if (status != B_OK)
6332 		return status;
6333 
6334 	if (HAS_FS_CALL(vnode, read_symlink)) {
6335 		status = FS_CALL(vnode, read_symlink, buffer, _bufferSize);
6336 	} else
6337 		status = B_BAD_VALUE;
6338 
6339 	put_vnode(vnode);
6340 	return status;
6341 }
6342 
6343 
6344 static status_t
6345 common_create_symlink(int fd, char* path, const char* toPath, int mode,
6346 	bool kernel)
6347 {
6348 	// path validity checks have to be in the calling function!
6349 	char name[B_FILE_NAME_LENGTH];
6350 	struct vnode* vnode;
6351 	status_t status;
6352 
6353 	FUNCTION(("common_create_symlink(fd = %d, path = %s, toPath = %s, "
6354 		"mode = %d, kernel = %d)\n", fd, path, toPath, mode, kernel));
6355 
6356 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, name, kernel);
6357 	if (status != B_OK)
6358 		return status;
6359 
6360 	if (HAS_FS_CALL(vnode, create_symlink))
6361 		status = FS_CALL(vnode, create_symlink, name, toPath, mode);
6362 	else {
6363 		status = HAS_FS_CALL(vnode, write)
6364 			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6365 	}
6366 
6367 	put_vnode(vnode);
6368 
6369 	return status;
6370 }
6371 
6372 
6373 static status_t
6374 common_create_link(int pathFD, char* path, int toFD, char* toPath,
6375 	bool traverseLeafLink, bool kernel)
6376 {
6377 	// path validity checks have to be in the calling function!
6378 
6379 	FUNCTION(("common_create_link(path = %s, toPath = %s, kernel = %d)\n", path,
6380 		toPath, kernel));
6381 
6382 	char name[B_FILE_NAME_LENGTH];
6383 	struct vnode* directory;
6384 	status_t status = fd_and_path_to_dir_vnode(pathFD, path, &directory, name,
6385 		kernel);
6386 	if (status != B_OK)
6387 		return status;
6388 
6389 	struct vnode* vnode;
6390 	status = fd_and_path_to_vnode(toFD, toPath, traverseLeafLink, &vnode, NULL,
6391 		kernel);
6392 	if (status != B_OK)
6393 		goto err;
6394 
6395 	if (directory->mount != vnode->mount) {
6396 		status = B_CROSS_DEVICE_LINK;
6397 		goto err1;
6398 	}
6399 
6400 	if (HAS_FS_CALL(directory, link))
6401 		status = FS_CALL(directory, link, name, vnode);
6402 	else
6403 		status = B_READ_ONLY_DEVICE;
6404 
6405 err1:
6406 	put_vnode(vnode);
6407 err:
6408 	put_vnode(directory);
6409 
6410 	return status;
6411 }
6412 
6413 
6414 static status_t
6415 common_unlink(int fd, char* path, bool kernel)
6416 {
6417 	char filename[B_FILE_NAME_LENGTH];
6418 	struct vnode* vnode;
6419 	status_t status;
6420 
6421 	FUNCTION(("common_unlink: fd: %d, path '%s', kernel %d\n", fd, path,
6422 		kernel));
6423 
6424 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
6425 	if (status < 0)
6426 		return status;
6427 
6428 	if (HAS_FS_CALL(vnode, unlink))
6429 		status = FS_CALL(vnode, unlink, filename);
6430 	else
6431 		status = B_READ_ONLY_DEVICE;
6432 
6433 	put_vnode(vnode);
6434 
6435 	return status;
6436 }
6437 
6438 
6439 static status_t
6440 common_access(int fd, char* path, int mode, bool effectiveUserGroup, bool kernel)
6441 {
6442 	struct vnode* vnode;
6443 	status_t status;
6444 
6445 	// TODO: honor effectiveUserGroup argument
6446 
6447 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
6448 	if (status != B_OK)
6449 		return status;
6450 
6451 	if (HAS_FS_CALL(vnode, access))
6452 		status = FS_CALL(vnode, access, mode);
6453 	else
6454 		status = B_OK;
6455 
6456 	put_vnode(vnode);
6457 
6458 	return status;
6459 }
6460 
6461 
6462 static status_t
6463 common_rename(int fd, char* path, int newFD, char* newPath, bool kernel)
6464 {
6465 	struct vnode* fromVnode;
6466 	struct vnode* toVnode;
6467 	char fromName[B_FILE_NAME_LENGTH];
6468 	char toName[B_FILE_NAME_LENGTH];
6469 	status_t status;
6470 
6471 	FUNCTION(("common_rename(fd = %d, path = %s, newFD = %d, newPath = %s, "
6472 		"kernel = %d)\n", fd, path, newFD, newPath, kernel));
6473 
6474 	status = fd_and_path_to_dir_vnode(fd, path, &fromVnode, fromName, kernel);
6475 	if (status != B_OK)
6476 		return status;
6477 
6478 	status = fd_and_path_to_dir_vnode(newFD, newPath, &toVnode, toName, kernel);
6479 	if (status != B_OK)
6480 		goto err1;
6481 
6482 	if (fromVnode->device != toVnode->device) {
6483 		status = B_CROSS_DEVICE_LINK;
6484 		goto err2;
6485 	}
6486 
6487 	if (fromName[0] == '\0' || toName[0] == '\0'
6488 		|| !strcmp(fromName, ".") || !strcmp(fromName, "..")
6489 		|| !strcmp(toName, ".") || !strcmp(toName, "..")
6490 		|| (fromVnode == toVnode && !strcmp(fromName, toName))) {
6491 		status = B_BAD_VALUE;
6492 		goto err2;
6493 	}
6494 
6495 	if (HAS_FS_CALL(fromVnode, rename))
6496 		status = FS_CALL(fromVnode, rename, fromName, toVnode, toName);
6497 	else
6498 		status = B_READ_ONLY_DEVICE;
6499 
6500 err2:
6501 	put_vnode(toVnode);
6502 err1:
6503 	put_vnode(fromVnode);
6504 
6505 	return status;
6506 }
6507 
6508 
6509 static status_t
6510 common_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6511 {
6512 	struct vnode* vnode = descriptor->u.vnode;
6513 
6514 	FUNCTION(("common_read_stat: stat %p\n", stat));
6515 
6516 	// TODO: remove this once all file systems properly set them!
6517 	stat->st_crtim.tv_nsec = 0;
6518 	stat->st_ctim.tv_nsec = 0;
6519 	stat->st_mtim.tv_nsec = 0;
6520 	stat->st_atim.tv_nsec = 0;
6521 
6522 	return vfs_stat_vnode(vnode, stat);
6523 }
6524 
6525 
6526 static status_t
6527 common_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6528 	int statMask)
6529 {
6530 	struct vnode* vnode = descriptor->u.vnode;
6531 
6532 	FUNCTION(("common_write_stat(vnode = %p, stat = %p, statMask = %d)\n",
6533 		vnode, stat, statMask));
6534 
6535 	if (!HAS_FS_CALL(vnode, write_stat))
6536 		return B_READ_ONLY_DEVICE;
6537 
6538 	return FS_CALL(vnode, write_stat, stat, statMask);
6539 }
6540 
6541 
6542 static status_t
6543 common_path_read_stat(int fd, char* path, bool traverseLeafLink,
6544 	struct stat* stat, bool kernel)
6545 {
6546 	FUNCTION(("common_path_read_stat: fd: %d, path '%s', stat %p\n", fd, path,
6547 		stat));
6548 
6549 	struct vnode* vnode;
6550 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6551 		NULL, kernel);
6552 	if (status != B_OK)
6553 		return status;
6554 
6555 	status = vfs_stat_vnode(vnode, stat);
6556 
6557 	put_vnode(vnode);
6558 	return status;
6559 }
6560 
6561 
6562 static status_t
6563 common_path_write_stat(int fd, char* path, bool traverseLeafLink,
6564 	const struct stat* stat, int statMask, bool kernel)
6565 {
6566 	FUNCTION(("common_path_write_stat: fd: %d, path '%s', stat %p, stat_mask %d, "
6567 		"kernel %d\n", fd, path, stat, statMask, kernel));
6568 
6569 	struct vnode* vnode;
6570 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6571 		NULL, kernel);
6572 	if (status != B_OK)
6573 		return status;
6574 
6575 	if (HAS_FS_CALL(vnode, write_stat))
6576 		status = FS_CALL(vnode, write_stat, stat, statMask);
6577 	else
6578 		status = B_READ_ONLY_DEVICE;
6579 
6580 	put_vnode(vnode);
6581 
6582 	return status;
6583 }
6584 
6585 
6586 static int
6587 attr_dir_open(int fd, char* path, bool traverseLeafLink, bool kernel)
6588 {
6589 	FUNCTION(("attr_dir_open(fd = %d, path = '%s', kernel = %d)\n", fd, path,
6590 		kernel));
6591 
6592 	struct vnode* vnode;
6593 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6594 		NULL, kernel);
6595 	if (status != B_OK)
6596 		return status;
6597 
6598 	status = open_attr_dir_vnode(vnode, kernel);
6599 	if (status < 0)
6600 		put_vnode(vnode);
6601 
6602 	return status;
6603 }
6604 
6605 
6606 static status_t
6607 attr_dir_close(struct file_descriptor* descriptor)
6608 {
6609 	struct vnode* vnode = descriptor->u.vnode;
6610 
6611 	FUNCTION(("attr_dir_close(descriptor = %p)\n", descriptor));
6612 
6613 	if (HAS_FS_CALL(vnode, close_attr_dir))
6614 		return FS_CALL(vnode, close_attr_dir, descriptor->cookie);
6615 
6616 	return B_OK;
6617 }
6618 
6619 
6620 static void
6621 attr_dir_free_fd(struct file_descriptor* descriptor)
6622 {
6623 	struct vnode* vnode = descriptor->u.vnode;
6624 
6625 	if (vnode != NULL) {
6626 		FS_CALL(vnode, free_attr_dir_cookie, descriptor->cookie);
6627 		put_vnode(vnode);
6628 	}
6629 }
6630 
6631 
6632 static status_t
6633 attr_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6634 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6635 {
6636 	struct vnode* vnode = descriptor->u.vnode;
6637 
6638 	FUNCTION(("attr_dir_read(descriptor = %p)\n", descriptor));
6639 
6640 	if (HAS_FS_CALL(vnode, read_attr_dir))
6641 		return FS_CALL(vnode, read_attr_dir, descriptor->cookie, buffer,
6642 			bufferSize, _count);
6643 
6644 	return B_UNSUPPORTED;
6645 }
6646 
6647 
6648 static status_t
6649 attr_dir_rewind(struct file_descriptor* descriptor)
6650 {
6651 	struct vnode* vnode = descriptor->u.vnode;
6652 
6653 	FUNCTION(("attr_dir_rewind(descriptor = %p)\n", descriptor));
6654 
6655 	if (HAS_FS_CALL(vnode, rewind_attr_dir))
6656 		return FS_CALL(vnode, rewind_attr_dir, descriptor->cookie);
6657 
6658 	return B_UNSUPPORTED;
6659 }
6660 
6661 
6662 static int
6663 attr_create(int fd, char* path, const char* name, uint32 type,
6664 	int openMode, bool kernel)
6665 {
6666 	if (name == NULL || *name == '\0')
6667 		return B_BAD_VALUE;
6668 
6669 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6670 	struct vnode* vnode;
6671 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6672 		kernel);
6673 	if (status != B_OK)
6674 		return status;
6675 
6676 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6677 		status = B_LINK_LIMIT;
6678 		goto err;
6679 	}
6680 
6681 	if (!HAS_FS_CALL(vnode, create_attr)) {
6682 		status = B_READ_ONLY_DEVICE;
6683 		goto err;
6684 	}
6685 
6686 	void* cookie;
6687 	status = FS_CALL(vnode, create_attr, name, type, openMode, &cookie);
6688 	if (status != B_OK)
6689 		goto err;
6690 
6691 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6692 	if (fd >= 0)
6693 		return fd;
6694 
6695 	status = fd;
6696 
6697 	FS_CALL(vnode, close_attr, cookie);
6698 	FS_CALL(vnode, free_attr_cookie, cookie);
6699 
6700 	FS_CALL(vnode, remove_attr, name);
6701 
6702 err:
6703 	put_vnode(vnode);
6704 
6705 	return status;
6706 }
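
/*!	Error handling note for attr_create(): if get_new_fd() fails after
	create_attr() succeeded, the attribute is closed, its cookie freed, and
	the attribute removed again, so a failed call does not leave a
	half-created attribute behind.
*/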
6707 
6708 
6709 static int
6710 attr_open(int fd, char* path, const char* name, int openMode, bool kernel)
6711 {
6712 	if (name == NULL || *name == '\0')
6713 		return B_BAD_VALUE;
6714 
6715 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6716 	struct vnode* vnode;
6717 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6718 		kernel);
6719 	if (status != B_OK)
6720 		return status;
6721 
6722 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6723 		status = B_LINK_LIMIT;
6724 		goto err;
6725 	}
6726 
6727 	if (!HAS_FS_CALL(vnode, open_attr)) {
6728 		status = B_UNSUPPORTED;
6729 		goto err;
6730 	}
6731 
6732 	void* cookie;
6733 	status = FS_CALL(vnode, open_attr, name, openMode, &cookie);
6734 	if (status != B_OK)
6735 		goto err;
6736 
6737 	// now we only need a file descriptor for this attribute and we're done
6738 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6739 	if (fd >= 0)
6740 		return fd;
6741 
6742 	status = fd;
6743 
6744 	FS_CALL(vnode, close_attr, cookie);
6745 	FS_CALL(vnode, free_attr_cookie, cookie);
6746 
6747 err:
6748 	put_vnode(vnode);
6749 
6750 	return status;
6751 }
6752 
6753 
6754 static status_t
6755 attr_close(struct file_descriptor* descriptor)
6756 {
6757 	struct vnode* vnode = descriptor->u.vnode;
6758 
6759 	FUNCTION(("attr_close(descriptor = %p)\n", descriptor));
6760 
6761 	if (HAS_FS_CALL(vnode, close_attr))
6762 		return FS_CALL(vnode, close_attr, descriptor->cookie);
6763 
6764 	return B_OK;
6765 }
6766 
6767 
6768 static void
6769 attr_free_fd(struct file_descriptor* descriptor)
6770 {
6771 	struct vnode* vnode = descriptor->u.vnode;
6772 
6773 	if (vnode != NULL) {
6774 		FS_CALL(vnode, free_attr_cookie, descriptor->cookie);
6775 		put_vnode(vnode);
6776 	}
6777 }
6778 
6779 
6780 static status_t
6781 attr_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
6782 	size_t* length)
6783 {
6784 	struct vnode* vnode = descriptor->u.vnode;
6785 
6786 	FUNCTION(("attr_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
6787 		pos, length, *length));
6788 
6789 	if (!HAS_FS_CALL(vnode, read_attr))
6790 		return B_UNSUPPORTED;
6791 
6792 	return FS_CALL(vnode, read_attr, descriptor->cookie, pos, buffer, length);
6793 }
6794 
6795 
6796 static status_t
6797 attr_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
6798 	size_t* length)
6799 {
6800 	struct vnode* vnode = descriptor->u.vnode;
6801 
6802 	FUNCTION(("attr_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
6803 		length));
6804 
6805 	if (!HAS_FS_CALL(vnode, write_attr))
6806 		return B_UNSUPPORTED;
6807 
6808 	return FS_CALL(vnode, write_attr, descriptor->cookie, pos, buffer, length);
6809 }
6810 
6811 
6812 static off_t
6813 attr_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
6814 {
6815 	off_t offset;
6816 
6817 	switch (seekType) {
6818 		case SEEK_SET:
6819 			offset = 0;
6820 			break;
6821 		case SEEK_CUR:
6822 			offset = descriptor->pos;
6823 			break;
6824 		case SEEK_END:
6825 		{
6826 			struct vnode* vnode = descriptor->u.vnode;
6827 			if (!HAS_FS_CALL(vnode, read_attr_stat))
6828 				return B_UNSUPPORTED;
6829 
6830 			struct stat stat;
6831 			status_t status = FS_CALL(vnode, read_attr_stat, descriptor->cookie,
6832 				&stat);
6833 			if (status != B_OK)
6834 				return status;
6835 
6836 			offset = stat.st_size;
6837 			break;
6838 		}
6839 		default:
6840 			return B_BAD_VALUE;
6841 	}
6842 
6843 	// assumes off_t is 64 bits wide
6844 	if (offset > 0 && LONGLONG_MAX - offset < pos)
6845 		return B_BUFFER_OVERFLOW;
6846 
6847 	pos += offset;
6848 	if (pos < 0)
6849 		return B_BAD_VALUE;
6850 
6851 	return descriptor->pos = pos;
6852 }
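
/*!	On the overflow guard shared with file_seek(): for a positive "offset",
	the sum "pos + offset" would overflow the signed 64-bit off_t exactly
	when pos > LONGLONG_MAX - offset, which the check above rejects with
	B_BUFFER_OVERFLOW before performing the addition.
*/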
6853 
6854 
6855 static status_t
6856 attr_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6857 {
6858 	struct vnode* vnode = descriptor->u.vnode;
6859 
6860 	FUNCTION(("attr_read_stat: stat %p\n", stat));
6861 
6862 	if (!HAS_FS_CALL(vnode, read_attr_stat))
6863 		return B_UNSUPPORTED;
6864 
6865 	return FS_CALL(vnode, read_attr_stat, descriptor->cookie, stat);
6866 }
6867 
6868 
6869 static status_t
6870 attr_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6871 	int statMask)
6872 {
6873 	struct vnode* vnode = descriptor->u.vnode;
6874 
6875 	FUNCTION(("attr_write_stat: stat = %p, statMask %d\n", stat, statMask));
6876 
6877 	if (!HAS_FS_CALL(vnode, write_attr_stat))
6878 		return B_READ_ONLY_DEVICE;
6879 
6880 	return FS_CALL(vnode, write_attr_stat, descriptor->cookie, stat, statMask);
6881 }
6882 
6883 
6884 static status_t
6885 attr_remove(int fd, const char* name, bool kernel)
6886 {
6887 	struct file_descriptor* descriptor;
6888 	struct vnode* vnode;
6889 	status_t status;
6890 
6891 	if (name == NULL || *name == '\0')
6892 		return B_BAD_VALUE;
6893 
6894 	FUNCTION(("attr_remove: fd = %d, name = \"%s\", kernel %d\n", fd, name,
6895 		kernel));
6896 
6897 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6898 	if (descriptor == NULL)
6899 		return B_FILE_ERROR;
6900 
6901 	if (HAS_FS_CALL(vnode, remove_attr))
6902 		status = FS_CALL(vnode, remove_attr, name);
6903 	else
6904 		status = B_READ_ONLY_DEVICE;
6905 
6906 	put_fd(descriptor);
6907 
6908 	return status;
6909 }
6910 
6911 
6912 static status_t
6913 attr_rename(int fromFD, const char* fromName, int toFD, const char* toName,
6914 	bool kernel)
6915 {
6916 	struct file_descriptor* fromDescriptor;
6917 	struct file_descriptor* toDescriptor;
6918 	struct vnode* fromVnode;
6919 	struct vnode* toVnode;
6920 	status_t status;
6921 
6922 	if (fromName == NULL || *fromName == '\0' || toName == NULL
6923 		|| *toName == '\0')
6924 		return B_BAD_VALUE;
6925 
6926 	FUNCTION(("attr_rename: from fd = %d, from name = \"%s\", to fd = %d, to "
6927 		"name = \"%s\", kernel %d\n", fromFD, fromName, toFD, toName, kernel));
6928 
6929 	fromDescriptor = get_fd_and_vnode(fromFD, &fromVnode, kernel);
6930 	if (fromDescriptor == NULL)
6931 		return B_FILE_ERROR;
6932 
6933 	toDescriptor = get_fd_and_vnode(toFD, &toVnode, kernel);
6934 	if (toDescriptor == NULL) {
6935 		status = B_FILE_ERROR;
6936 		goto err;
6937 	}
6938 
6939 	// are the files on the same volume?
6940 	if (fromVnode->device != toVnode->device) {
6941 		status = B_CROSS_DEVICE_LINK;
6942 		goto err1;
6943 	}
6944 
6945 	if (HAS_FS_CALL(fromVnode, rename_attr)) {
6946 		status = FS_CALL(fromVnode, rename_attr, fromName, toVnode, toName);
6947 	} else
6948 		status = B_READ_ONLY_DEVICE;
6949 
6950 err1:
6951 	put_fd(toDescriptor);
6952 err:
6953 	put_fd(fromDescriptor);
6954 
6955 	return status;
6956 }
6957 
6958 
6959 static int
6960 index_dir_open(dev_t mountID, bool kernel)
6961 {
6962 	struct fs_mount* mount;
6963 	void* cookie;
6964 
6965 	FUNCTION(("index_dir_open(mountID = %" B_PRId32 ", kernel = %d)\n", mountID,
6966 		kernel));
6967 
6968 	status_t status = get_mount(mountID, &mount);
6969 	if (status != B_OK)
6970 		return status;
6971 
6972 	if (!HAS_FS_MOUNT_CALL(mount, open_index_dir)) {
6973 		status = B_UNSUPPORTED;
6974 		goto error;
6975 	}
6976 
6977 	status = FS_MOUNT_CALL(mount, open_index_dir, &cookie);
6978 	if (status != B_OK)
6979 		goto error;
6980 
6981 	// get fd for the index directory
6982 	int fd;
6983 	fd = get_new_fd(FDTYPE_INDEX_DIR, mount, NULL, cookie, O_CLOEXEC, kernel);
6984 	if (fd >= 0)
6985 		return fd;
6986 
6987 	// something went wrong
6988 	FS_MOUNT_CALL(mount, close_index_dir, cookie);
6989 	FS_MOUNT_CALL(mount, free_index_dir_cookie, cookie);
6990 
6991 	status = fd;
6992 
6993 error:
6994 	put_mount(mount);
6995 	return status;
6996 }
6997 
6998 
6999 static status_t
7000 index_dir_close(struct file_descriptor* descriptor)
7001 {
7002 	struct fs_mount* mount = descriptor->u.mount;
7003 
7004 	FUNCTION(("index_dir_close(descriptor = %p)\n", descriptor));
7005 
7006 	if (HAS_FS_MOUNT_CALL(mount, close_index_dir))
7007 		return FS_MOUNT_CALL(mount, close_index_dir, descriptor->cookie);
7008 
7009 	return B_OK;
7010 }
7011 
7012 
7013 static void
7014 index_dir_free_fd(struct file_descriptor* descriptor)
7015 {
7016 	struct fs_mount* mount = descriptor->u.mount;
7017 
7018 	if (mount != NULL) {
7019 		FS_MOUNT_CALL(mount, free_index_dir_cookie, descriptor->cookie);
7020 		put_mount(mount);
7021 	}
7022 }
7023 
7024 
7025 static status_t
7026 index_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7027 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7028 {
7029 	struct fs_mount* mount = descriptor->u.mount;
7030 
7031 	if (HAS_FS_MOUNT_CALL(mount, read_index_dir)) {
7032 		return FS_MOUNT_CALL(mount, read_index_dir, descriptor->cookie, buffer,
7033 			bufferSize, _count);
7034 	}
7035 
7036 	return B_UNSUPPORTED;
7037 }
7038 
7039 
7040 static status_t
7041 index_dir_rewind(struct file_descriptor* descriptor)
7042 {
7043 	struct fs_mount* mount = descriptor->u.mount;
7044 
7045 	if (HAS_FS_MOUNT_CALL(mount, rewind_index_dir))
7046 		return FS_MOUNT_CALL(mount, rewind_index_dir, descriptor->cookie);
7047 
7048 	return B_UNSUPPORTED;
7049 }
7050 
7051 
7052 static status_t
7053 index_create(dev_t mountID, const char* name, uint32 type, uint32 flags,
7054 	bool kernel)
7055 {
7056 	FUNCTION(("index_create(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7057 		mountID, name, kernel));
7058 
7059 	struct fs_mount* mount;
7060 	status_t status = get_mount(mountID, &mount);
7061 	if (status != B_OK)
7062 		return status;
7063 
7064 	if (!HAS_FS_MOUNT_CALL(mount, create_index)) {
7065 		status = B_READ_ONLY_DEVICE;
7066 		goto out;
7067 	}
7068 
7069 	status = FS_MOUNT_CALL(mount, create_index, name, type, flags);
7070 
7071 out:
7072 	put_mount(mount);
7073 	return status;
7074 }
7075 
7076 
7077 #if 0
7078 static status_t
7079 index_read_stat(struct file_descriptor* descriptor, struct stat* stat)
7080 {
7081 	struct vnode* vnode = descriptor->u.vnode;
7082 
7083 	// ToDo: currently unused!
7084 	FUNCTION(("index_read_stat: stat 0x%p\n", stat));
7085 	if (!HAS_FS_CALL(vnode, read_index_stat))
7086 		return B_UNSUPPORTED;
7087 
7088 	return B_UNSUPPORTED;
7089 	//return FS_CALL(vnode, read_index_stat, descriptor->cookie, stat);
7090 }
7091 
7092 
7093 static void
7094 index_free_fd(struct file_descriptor* descriptor)
7095 {
7096 	struct vnode* vnode = descriptor->u.vnode;
7097 
7098 	if (vnode != NULL) {
7099 		FS_CALL(vnode, free_index_cookie, descriptor->cookie);
7100 		put_vnode(vnode);
7101 	}
7102 }
7103 #endif
7104 
7105 
7106 static status_t
7107 index_name_read_stat(dev_t mountID, const char* name, struct stat* stat,
7108 	bool kernel)
7109 {
7110 	FUNCTION(("index_name_read_stat(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7111 		mountID, name, kernel));
7112 
7113 	struct fs_mount* mount;
7114 	status_t status = get_mount(mountID, &mount);
7115 	if (status != B_OK)
7116 		return status;
7117 
7118 	if (!HAS_FS_MOUNT_CALL(mount, read_index_stat)) {
7119 		status = B_UNSUPPORTED;
7120 		goto out;
7121 	}
7122 
7123 	status = FS_MOUNT_CALL(mount, read_index_stat, name, stat);
7124 
7125 out:
7126 	put_mount(mount);
7127 	return status;
7128 }
7129 
7130 
7131 static status_t
7132 index_remove(dev_t mountID, const char* name, bool kernel)
7133 {
7134 	FUNCTION(("index_remove(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7135 		mountID, name, kernel));
7136 
7137 	struct fs_mount* mount;
7138 	status_t status = get_mount(mountID, &mount);
7139 	if (status != B_OK)
7140 		return status;
7141 
7142 	if (!HAS_FS_MOUNT_CALL(mount, remove_index)) {
7143 		status = B_READ_ONLY_DEVICE;
7144 		goto out;
7145 	}
7146 
7147 	status = FS_MOUNT_CALL(mount, remove_index, name);
7148 
7149 out:
7150 	put_mount(mount);
7151 	return status;
7152 }
7153 
7154 
7155 /*!	TODO: the query FS API is still pretty much the same as in R5.
7156 		It would be nice if file systems would get some more kernel support
7157 		for queries.
7158 		For example, query parsing should be moved into the kernel.
7159 */
7160 static int
7161 query_open(dev_t device, const char* query, uint32 flags, port_id port,
7162 	int32 token, bool kernel)
7163 {
7164 	struct fs_mount* mount;
7165 	void* cookie;
7166 
7167 	FUNCTION(("query_open(device = %" B_PRId32 ", query = \"%s\", kernel = %d)\n",
7168 		device, query, kernel));
7169 
7170 	status_t status = get_mount(device, &mount);
7171 	if (status != B_OK)
7172 		return status;
7173 
7174 	if (!HAS_FS_MOUNT_CALL(mount, open_query)) {
7175 		status = B_UNSUPPORTED;
7176 		goto error;
7177 	}
7178 
7179 	status = FS_MOUNT_CALL(mount, open_query, query, flags, port, token,
7180 		&cookie);
7181 	if (status != B_OK)
7182 		goto error;
7183 
7184 	// get fd for the query
7185 	int fd;
7186 	fd = get_new_fd(FDTYPE_QUERY, mount, NULL, cookie, O_CLOEXEC, kernel);
7187 	if (fd >= 0)
7188 		return fd;
7189 
7190 	status = fd;
7191 
7192 	// something went wrong
7193 	FS_MOUNT_CALL(mount, close_query, cookie);
7194 	FS_MOUNT_CALL(mount, free_query_cookie, cookie);
7195 
7196 error:
7197 	put_mount(mount);
7198 	return status;
7199 }
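
/*!	A minimal userland sketch of the query machinery above, using the
	BeOS-style API from <fs_query.h> (the predicate is an assumption for
	illustration):

		DIR* query = fs_open_query(device, "name==\"*.cpp\"", 0);
		if (query != NULL) {
			struct dirent* entry;
			while ((entry = fs_read_query(query)) != NULL) {
				// entry->d_name names a matching entry
			}
			fs_close_query(query);
		}
*/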
7200 
7201 
7202 static status_t
7203 query_close(struct file_descriptor* descriptor)
7204 {
7205 	struct fs_mount* mount = descriptor->u.mount;
7206 
7207 	FUNCTION(("query_close(descriptor = %p)\n", descriptor));
7208 
7209 	if (HAS_FS_MOUNT_CALL(mount, close_query))
7210 		return FS_MOUNT_CALL(mount, close_query, descriptor->cookie);
7211 
7212 	return B_OK;
7213 }
7214 
7215 
7216 static void
7217 query_free_fd(struct file_descriptor* descriptor)
7218 {
7219 	struct fs_mount* mount = descriptor->u.mount;
7220 
7221 	if (mount != NULL) {
7222 		FS_MOUNT_CALL(mount, free_query_cookie, descriptor->cookie);
7223 		put_mount(mount);
7224 	}
7225 }
7226 
7227 
7228 static status_t
7229 query_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7230 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7231 {
7232 	struct fs_mount* mount = descriptor->u.mount;
7233 
7234 	if (HAS_FS_MOUNT_CALL(mount, read_query)) {
7235 		return FS_MOUNT_CALL(mount, read_query, descriptor->cookie, buffer,
7236 			bufferSize, _count);
7237 	}
7238 
7239 	return B_UNSUPPORTED;
7240 }
7241 
7242 
7243 static status_t
7244 query_rewind(struct file_descriptor* descriptor)
7245 {
7246 	struct fs_mount* mount = descriptor->u.mount;
7247 
7248 	if (HAS_FS_MOUNT_CALL(mount, rewind_query))
7249 		return FS_MOUNT_CALL(mount, rewind_query, descriptor->cookie);
7250 
7251 	return B_UNSUPPORTED;
7252 }
7253 
7254 
7255 //	#pragma mark - General File System functions
7256 
7257 
7258 static dev_t
7259 fs_mount(char* path, const char* device, const char* fsName, uint32 flags,
7260 	const char* args, bool kernel)
7261 {
7262 	struct ::fs_mount* mount;
7263 	status_t status = B_OK;
7264 	fs_volume* volume = NULL;
7265 	int32 layer = 0;
7266 	Vnode* coveredNode = NULL;
7267 
7268 	FUNCTION(("fs_mount: path = '%s', device = '%s', fs_name = '%s', flags = %#"
7269 		B_PRIx32 ", args = '%s'\n", path, device, fsName, flags, args));
7270 
7271 	// The path is always safe, we just have to make sure that fsName is
7272 	// at least minimally valid - we can't make any assumptions about args,
7273 	// though. A NULL fsName is OK if a device was given and the FS is not
7274 	// virtual; we'll get the name from the DDM later.
7275 	if (fsName == NULL) {
7276 		if (!device || flags & B_MOUNT_VIRTUAL_DEVICE)
7277 			return B_BAD_VALUE;
7278 	} else if (fsName[0] == '\0')
7279 		return B_BAD_VALUE;
7280 
7281 	RecursiveLocker mountOpLocker(sMountOpLock);
7282 
7283 	// Helper to delete a newly created file device on failure.
7284 	// Not exactly beautiful, but helps to keep the code below cleaner.
7285 	struct FileDeviceDeleter {
7286 		FileDeviceDeleter() : id(-1) {}
7287 		~FileDeviceDeleter()
7288 		{
7289 			KDiskDeviceManager::Default()->DeleteFileDevice(id);
7290 		}
7291 
7292 		partition_id id;
7293 	} fileDeviceDeleter;
7294 
7295 	// If the file system is not a "virtual" one, the device argument should
7296 	// point to a real file/device (if given at all).
7297 	// get the partition
7298 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7299 	KPartition* partition = NULL;
7300 	KPath normalizedDevice;
7301 	bool newlyCreatedFileDevice = false;
7302 
7303 	if (!(flags & B_MOUNT_VIRTUAL_DEVICE) && device != NULL) {
7304 		// normalize the device path
7305 		status = normalizedDevice.SetTo(device, true);
7306 		if (status != B_OK)
7307 			return status;
7308 
7309 		// get a corresponding partition from the DDM
7310 		partition = ddm->RegisterPartition(normalizedDevice.Path());
7311 		if (partition == NULL) {
7312 			// Partition not found: This either means the user supplied
7313 			// an invalid path, or the path refers to an image file. We try
7314 			// to let the DDM create a file device for the path.
7315 			partition_id deviceID = ddm->CreateFileDevice(
7316 				normalizedDevice.Path(), &newlyCreatedFileDevice);
7317 			if (deviceID >= 0) {
7318 				partition = ddm->RegisterPartition(deviceID);
7319 				if (newlyCreatedFileDevice)
7320 					fileDeviceDeleter.id = deviceID;
7321 			}
7322 		}
7323 
7324 		if (!partition) {
7325 			TRACE(("fs_mount(): Partition `%s' not found.\n",
7326 				normalizedDevice.Path()));
7327 			return B_ENTRY_NOT_FOUND;
7328 		}
7329 
7330 		device = normalizedDevice.Path();
7331 			// correct path to file device
7332 	}
7333 	PartitionRegistrar partitionRegistrar(partition, true);
7334 
7335 	// Write lock the partition's device. For the time being, we keep the lock
7336 	// until we're done mounting -- not nice, but it ensures that no one is
7337 	// interfering.
7338 	// TODO: Just mark the partition busy while mounting!
7339 	KDiskDevice* diskDevice = NULL;
7340 	if (partition) {
7341 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7342 		if (!diskDevice) {
7343 			TRACE(("fs_mount(): Failed to lock disk device!\n"));
7344 			return B_ERROR;
7345 		}
7346 	}
7347 
7348 	DeviceWriteLocker writeLocker(diskDevice, true);
7349 		// this takes over the write lock acquired before
7350 
7351 	if (partition != NULL) {
7352 		// make sure that the partition is not busy
7353 		if (partition->IsBusy()) {
7354 			TRACE(("fs_mount(): Partition is busy.\n"));
7355 			return B_BUSY;
7356 		}
7357 
7358 		// if no FS name has been supplied, we get it from the partition
7359 		if (fsName == NULL) {
7360 			KDiskSystem* diskSystem = partition->DiskSystem();
7361 			if (!diskSystem) {
7362 				TRACE(("fs_mount(): No FS name was given, and the DDM didn't "
7363 					"recognize it.\n"));
7364 				return B_BAD_VALUE;
7365 			}
7366 
7367 			if (!diskSystem->IsFileSystem()) {
7368 				TRACE(("fs_mount(): No FS name was given, and the DDM found a "
7369 					"partitioning system.\n"));
7370 				return B_BAD_VALUE;
7371 			}
7372 
7373 			// The disk system name will not change, and the KDiskSystem
7374 			// object will not go away while the disk device is locked (and
7375 			// the partition has a reference to it), so this is safe.
7376 			fsName = diskSystem->Name();
7377 		}
7378 	}
7379 
7380 	mount = new(std::nothrow) (struct ::fs_mount);
7381 	if (mount == NULL)
7382 		return B_NO_MEMORY;
7383 
7384 	mount->device_name = strdup(device);
7385 		// "device" can be NULL
7386 
7387 	status = mount->entry_cache.Init();
7388 	if (status != B_OK)
7389 		goto err1;
7390 
7391 	// initialize structure
7392 	mount->id = sNextMountID++;
7393 	mount->partition = NULL;
7394 	mount->root_vnode = NULL;
7395 	mount->covers_vnode = NULL;
7396 	mount->unmounting = false;
7397 	mount->owns_file_device = false;
7398 	mount->volume = NULL;
7399 
7400 	// build up the volume(s)
7401 	while (true) {
7402 		char* layerFSName = get_file_system_name_for_layer(fsName, layer);
7403 		if (layerFSName == NULL) {
7404 			if (layer == 0) {
7405 				status = B_NO_MEMORY;
7406 				goto err1;
7407 			}
7408 
7409 			break;
7410 		}
7411 		MemoryDeleter layerFSNameDeleter(layerFSName);
7412 
7413 		volume = (fs_volume*)malloc(sizeof(fs_volume));
7414 		if (volume == NULL) {
7415 			status = B_NO_MEMORY;
7416 			goto err1;
7417 		}
7418 
7419 		volume->id = mount->id;
7420 		volume->partition = partition != NULL ? partition->ID() : -1;
7421 		volume->layer = layer++;
7422 		volume->private_volume = NULL;
7423 		volume->ops = NULL;
7424 		volume->sub_volume = NULL;
7425 		volume->super_volume = NULL;
7426 		volume->file_system = NULL;
7427 		volume->file_system_name = NULL;
7428 
7429 		volume->file_system_name = get_file_system_name(layerFSName);
7430 		if (volume->file_system_name == NULL) {
7431 			status = B_NO_MEMORY;
7432 			free(volume);
7433 			goto err1;
7434 		}
7435 
7436 		volume->file_system = get_file_system(layerFSName);
7437 		if (volume->file_system == NULL) {
7438 			status = B_DEVICE_NOT_FOUND;
7439 			free(volume->file_system_name);
7440 			free(volume);
7441 			goto err1;
7442 		}
7443 
7444 		if (mount->volume == NULL)
7445 			mount->volume = volume;
7446 		else {
7447 			volume->super_volume = mount->volume;
7448 			mount->volume->sub_volume = volume;
7449 			mount->volume = volume;
7450 		}
7451 	}
7452 
7453 	// insert mount struct into list before we call FS's mount() function
7454 	// so that vnodes can be created for this mount
7455 	mutex_lock(&sMountMutex);
7456 	sMountsTable->Insert(mount);
7457 	mutex_unlock(&sMountMutex);
7458 
7459 	ino_t rootID;
7460 
7461 	if (!sRoot) {
7462 		// we haven't mounted anything yet
7463 		if (strcmp(path, "/") != 0) {
7464 			status = B_ERROR;
7465 			goto err2;
7466 		}
7467 
7468 		status = mount->volume->file_system->mount(mount->volume, device, flags,
7469 			args, &rootID);
7470 		if (status != 0)
7471 			goto err2;
7472 	} else {
7473 		status = path_to_vnode(path, true, &coveredNode, NULL, kernel);
7474 		if (status != B_OK)
7475 			goto err2;
7476 
7477 		mount->covers_vnode = coveredNode;
7478 
7479 		// make sure coveredNode is a directory
7480 		if (!S_ISDIR(coveredNode->Type())) {
7481 			status = B_NOT_A_DIRECTORY;
7482 			goto err3;
7483 		}
7484 
7485 		if (coveredNode->IsCovered()) {
7486 			// this is already a covered vnode
7487 			status = B_BUSY;
7488 			goto err3;
7489 		}
7490 
7491 		// mount it/them
7492 		fs_volume* volume = mount->volume;
7493 		while (volume) {
7494 			status = volume->file_system->mount(volume, device, flags, args,
7495 				&rootID);
7496 			if (status != B_OK) {
7497 				if (volume->sub_volume)
7498 					goto err4;
7499 				goto err3;
7500 			}
7501 
7502 			volume = volume->super_volume;
7503 		}
7504 
7505 		volume = mount->volume;
7506 		while (volume) {
7507 			if (volume->ops->all_layers_mounted != NULL)
7508 				volume->ops->all_layers_mounted(volume);
7509 			volume = volume->super_volume;
7510 		}
7511 	}
7512 
7513 	// the root node is supposed to be owned by the file system - it must
7514 	// exist at this point
7515 	mount->root_vnode = lookup_vnode(mount->id, rootID);
7516 	if (mount->root_vnode == NULL || mount->root_vnode->ref_count != 1) {
7517 		panic("fs_mount: file system does not own its root node!\n");
7518 		status = B_ERROR;
7519 		goto err4;
7520 	}
7521 
7522 	// set up the links between the root vnode and the vnode it covers
7523 	rw_lock_write_lock(&sVnodeLock);
7524 	if (coveredNode != NULL) {
7525 		if (coveredNode->IsCovered()) {
7526 			// the vnode is covered now
7527 			status = B_BUSY;
7528 			rw_lock_write_unlock(&sVnodeLock);
7529 			goto err4;
7530 		}
7531 
7532 		mount->root_vnode->covers = coveredNode;
7533 		mount->root_vnode->SetCovering(true);
7534 
7535 		coveredNode->covered_by = mount->root_vnode;
7536 		coveredNode->SetCovered(true);
7537 	}
7538 	rw_lock_write_unlock(&sVnodeLock);
7539 
7540 	if (!sRoot) {
7541 		sRoot = mount->root_vnode;
7542 		mutex_lock(&sIOContextRootLock);
7543 		get_current_io_context(true)->root = sRoot;
7544 		mutex_unlock(&sIOContextRootLock);
7545 		inc_vnode_ref_count(sRoot);
7546 	}
7547 
7548 	// supply the partition (if any) with the mount cookie and mark it mounted
7549 	if (partition) {
7550 		partition->SetMountCookie(mount->volume->private_volume);
7551 		partition->SetVolumeID(mount->id);
7552 
7553 		// keep a partition reference as long as the partition is mounted
7554 		partitionRegistrar.Detach();
7555 		mount->partition = partition;
7556 		mount->owns_file_device = newlyCreatedFileDevice;
7557 		fileDeviceDeleter.id = -1;
7558 	}
7559 
7560 	notify_mount(mount->id,
7561 		coveredNode != NULL ? coveredNode->device : -1,
7562 		coveredNode ? coveredNode->id : -1);
7563 
7564 	return mount->id;
7565 
7566 err4:
7567 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7568 err3:
7569 	if (coveredNode != NULL)
7570 		put_vnode(coveredNode);
7571 err2:
7572 	mutex_lock(&sMountMutex);
7573 	sMountsTable->Remove(mount);
7574 	mutex_unlock(&sMountMutex);
7575 err1:
7576 	delete mount;
7577 
7578 	return status;
7579 }
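
/*!	The volume chain built by fs_mount(): mount->volume always points to
	the topmost layer, and super_volume leads towards layer 0 (sketch):

		mount->volume == layer N
		layer N->super_volume == layer N - 1, ..., down to layer 0
		(sub_volume links point the opposite way)

	mount() and all_layers_mounted() are invoked along the super_volume
	chain, i.e. starting with the topmost layer.
*/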
7580 
7581 
7582 static status_t
7583 fs_unmount(char* path, dev_t mountID, uint32 flags, bool kernel)
7584 {
7585 	struct fs_mount* mount;
7586 	status_t err;
7587 
7588 	FUNCTION(("fs_unmount(path '%s', dev %" B_PRId32 ", kernel %d\n", path,
7589 		mountID, kernel));
7590 
7591 	struct vnode* pathVnode = NULL;
7592 	if (path != NULL) {
7593 		err = path_to_vnode(path, true, &pathVnode, NULL, kernel);
7594 		if (err != B_OK)
7595 			return B_ENTRY_NOT_FOUND;
7596 	}
7597 
7598 	RecursiveLocker mountOpLocker(sMountOpLock);
7599 
7600 	// this lock is not strictly necessary, but is taken in the KDEBUG case
7601 	// to keep the ASSERT in find_mount() working.
7602 	KDEBUG_ONLY(mutex_lock(&sMountMutex));
7603 	mount = find_mount(path != NULL ? pathVnode->device : mountID);
7604 	KDEBUG_ONLY(mutex_unlock(&sMountMutex));
7605 	if (mount == NULL) {
7606 		panic("fs_unmount: find_mount() failed on root vnode @%p of mount\n",
7607 			pathVnode);
7608 	}
7609 
7610 	if (path != NULL) {
7611 		put_vnode(pathVnode);
7612 
7613 		if (mount->root_vnode != pathVnode) {
7614 			// not a mount point
7615 			return B_BAD_VALUE;
7616 		}
7617 	}
7618 
7619 	// if the volume is associated with a partition, lock the device of the
7620 	// partition as long as we are unmounting
7621 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7622 	KPartition* partition = mount->partition;
7623 	KDiskDevice* diskDevice = NULL;
7624 	if (partition != NULL) {
7625 		if (partition->Device() == NULL) {
7626 			dprintf("fs_unmount(): There is no device!\n");
7627 			return B_ERROR;
7628 		}
7629 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7630 		if (!diskDevice) {
7631 			TRACE(("fs_unmount(): Failed to lock disk device!\n"));
7632 			return B_ERROR;
7633 		}
7634 	}
7635 	DeviceWriteLocker writeLocker(diskDevice, true);
7636 
7637 	// make sure that the partition is not busy
7638 	if (partition != NULL) {
7639 		if ((flags & B_UNMOUNT_BUSY_PARTITION) == 0 && partition->IsBusy()) {
7640 			TRACE(("fs_unmount(): Partition is busy.\n"));
7641 			return B_BUSY;
7642 		}
7643 	}
7644 
7645 	// grab the vnode master mutex to keep someone from creating
7646 	// a vnode while we're figuring out if we can continue
7647 	WriteLocker vnodesWriteLocker(&sVnodeLock);
7648 
7649 	bool disconnectedDescriptors = false;
7650 
7651 	while (true) {
7652 		bool busy = false;
7653 
7654 		// cycle through the list of vnodes associated with this mount and
7655 		// make sure none of them is busy or still referenced
7656 		VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7657 		while (struct vnode* vnode = iterator.Next()) {
7658 			if (vnode->IsBusy()) {
7659 				busy = true;
7660 				break;
7661 			}
7662 
7663 			// check the vnode's ref count -- subtract additional references for
7664 			// covering
7665 			int32 refCount = vnode->ref_count;
7666 			if (vnode->covers != NULL)
7667 				refCount--;
7668 			if (vnode->covered_by != NULL)
7669 				refCount--;
7670 
7671 			if (refCount != 0) {
7672 				// there are still vnodes in use on this mount, so we cannot
7673 				// unmount yet
7674 				busy = true;
7675 				break;
7676 			}
7677 		}
7678 
7679 		if (!busy)
7680 			break;
7681 
7682 		if ((flags & B_FORCE_UNMOUNT) == 0)
7683 			return B_BUSY;
7684 
7685 		if (disconnectedDescriptors) {
7686 			// wait a bit until the last access is finished, and then try again
7687 			vnodesWriteLocker.Unlock();
7688 			snooze(100000);
7689 			// TODO: if there is some kind of bug that prevents the ref counts
7690 			// from getting back to zero, this will fall into an endless loop...
7691 			vnodesWriteLocker.Lock();
7692 			continue;
7693 		}
7694 
7695 		// the file system is still busy - but we're forced to unmount it,
7696 		// so let's disconnect all open file descriptors
7697 
7698 		mount->unmounting = true;
7699 			// prevent new vnodes from being created
7700 
7701 		vnodesWriteLocker.Unlock();
7702 
7703 		disconnect_mount_or_vnode_fds(mount, NULL);
7704 		disconnectedDescriptors = true;
7705 
7706 		vnodesWriteLocker.Lock();
7707 	}
7708 
7709 	// We can safely continue. Mark all of the vnodes busy and this mount
7710 	// structure in unmounting state. Also undo the vnode covers/covered_by
7711 	// links.
7712 	mount->unmounting = true;
7713 
7714 	VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7715 	while (struct vnode* vnode = iterator.Next()) {
7716 		// Remove all covers/covered_by links from other mounts' nodes to this
7717 		// vnode and adjust the node ref count accordingly. We will release the
7718 		// references to the external vnodes below.
7719 		if (Vnode* coveredNode = vnode->covers) {
7720 			if (Vnode* coveringNode = vnode->covered_by) {
7721 				// We have both covered and covering vnodes, so just remove us
7722 				// from the chain.
7723 				coveredNode->covered_by = coveringNode;
7724 				coveringNode->covers = coveredNode;
7725 				vnode->ref_count -= 2;
7726 
7727 				vnode->covered_by = NULL;
7728 				vnode->covers = NULL;
7729 				vnode->SetCovering(false);
7730 				vnode->SetCovered(false);
7731 			} else {
7732 				// We only have a covered vnode. Remove its link to us.
7733 				coveredNode->covered_by = NULL;
7734 				coveredNode->SetCovered(false);
7735 				vnode->ref_count--;
7736 
7737 				// If the other node is an external vnode, we keep its link
7738 				// around so we can put the reference later on. Otherwise
7739 				// we get rid of it right now.
7740 				if (coveredNode->mount == mount) {
7741 					vnode->covers = NULL;
7742 					coveredNode->ref_count--;
7743 				}
7744 			}
7745 		} else if (Vnode* coveringNode = vnode->covered_by) {
7746 			// We only have a covering vnode. Remove its link to us.
7747 			coveringNode->covers = NULL;
7748 			coveringNode->SetCovering(false);
7749 			vnode->ref_count--;
7750 
7751 			// If the other node is an external vnode, we keep its link
7752 			// around so we can put the reference later on. Otherwise
7753 			// we get rid of it right now.
7754 			if (coveringNode->mount == mount) {
7755 				vnode->covered_by = NULL;
7756 				coveringNode->ref_count--;
7757 			}
7758 		}
7759 
7760 		vnode->SetBusy(true);
7761 		vnode_to_be_freed(vnode);
7762 	}
7763 
7764 	vnodesWriteLocker.Unlock();
7765 
7766 	// Free all vnodes associated with this mount.
7767 	// They will be removed from the mount list by free_vnode(), so
7768 	// we don't have to do that here.
7769 	while (struct vnode* vnode = mount->vnodes.Head()) {
7770 		// Put the references to external covered/covering vnodes we kept above.
7771 		if (Vnode* coveredNode = vnode->covers)
7772 			put_vnode(coveredNode);
7773 		if (Vnode* coveringNode = vnode->covered_by)
7774 			put_vnode(coveringNode);
7775 
7776 		free_vnode(vnode, false);
7777 	}
7778 
7779 	// remove the mount structure from the hash table
7780 	mutex_lock(&sMountMutex);
7781 	sMountsTable->Remove(mount);
7782 	mutex_unlock(&sMountMutex);
7783 
7784 	mountOpLocker.Unlock();
7785 
7786 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7787 	notify_unmount(mount->id);
7788 
7789 	// dereference the partition and mark it unmounted
7790 	if (partition) {
7791 		partition->SetVolumeID(-1);
7792 		partition->SetMountCookie(NULL);
7793 
7794 		if (mount->owns_file_device)
7795 			KDiskDeviceManager::Default()->DeleteFileDevice(partition->ID());
7796 		partition->Unregister();
7797 	}
7798 
7799 	delete mount;
7800 	return B_OK;
7801 }
7802 
7803 
7804 static status_t
7805 fs_sync(dev_t device)
7806 {
7807 	struct fs_mount* mount;
7808 	status_t status = get_mount(device, &mount);
7809 	if (status != B_OK)
7810 		return status;
7811 
7812 	struct vnode marker;
7813 	memset(&marker, 0, sizeof(marker));
7814 	marker.SetBusy(true);
7815 	marker.SetRemoved(true);
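	// The marker is a stack-allocated dummy vnode that gets temporarily
	// linked into the mount's vnode list to remember the iteration position
	// while the locks are dropped. Its "removed" flag tracks whether it is
	// currently in the list; it is marked busy so it is never mistaken for
	// a real, usable vnode.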
7816 
7817 	// First, synchronize all file caches
7818 
7819 	while (true) {
7820 		WriteLocker locker(sVnodeLock);
7821 			// Note: That's the easy way. Which is probably OK for sync(),
7822 			// Note: That's the easy way, which is probably OK for sync(),
7823 			// since it's a relatively rare call and doesn't need to allow for
7824 			// a lot of concurrency. Using a read lock would be possible, but
7825 			// also more involved, since we would have to lock the individual
7826 			// nodes and take care of the locking order, which we might not
7827 			// want to do while holding fs_mount::rlock.
7828 		// synchronize access to vnode list
7829 		recursive_lock_lock(&mount->rlock);
7830 
7831 		struct vnode* vnode;
7832 		if (!marker.IsRemoved()) {
7833 			vnode = mount->vnodes.GetNext(&marker);
7834 			mount->vnodes.Remove(&marker);
7835 			marker.SetRemoved(true);
7836 		} else
7837 			vnode = mount->vnodes.First();
7838 
7839 		while (vnode != NULL && (vnode->cache == NULL
7840 			|| vnode->IsRemoved() || vnode->IsBusy())) {
7841 			// TODO: we could track writes (and writable mapped vnodes)
7842 			//	and have a simple flag that we could test for here
7843 			vnode = mount->vnodes.GetNext(vnode);
7844 		}
7845 
7846 		if (vnode != NULL) {
7847 			// insert marker vnode again
7848 			mount->vnodes.Insert(mount->vnodes.GetNext(vnode), &marker);
7849 			marker.SetRemoved(false);
7850 		}
7851 
7852 		recursive_lock_unlock(&mount->rlock);
7853 
7854 		if (vnode == NULL)
7855 			break;
7856 
7857 		vnode = lookup_vnode(mount->id, vnode->id);
7858 		if (vnode == NULL || vnode->IsBusy())
7859 			continue;
7860 
7861 		if (vnode->ref_count == 0) {
7862 			// the vnode was unused before -- mark it used again
7863 			vnode_used(vnode);
7864 		}
7865 		inc_vnode_ref_count(vnode);
7866 
7867 		locker.Unlock();
7868 
7869 		if (vnode->cache != NULL && !vnode->IsRemoved())
7870 			vnode->cache->WriteModified();
7871 
7872 		put_vnode(vnode);
7873 	}
7874 
7875 	// And then, let the file systems do their synchronizing work
7876 
7877 	if (HAS_FS_MOUNT_CALL(mount, sync))
7878 		status = FS_MOUNT_CALL_NO_PARAMS(mount, sync);
7879 
7880 	put_mount(mount);
7881 	return status;
7882 }
7883 
7884 
7885 static status_t
7886 fs_read_info(dev_t device, struct fs_info* info)
7887 {
7888 	struct fs_mount* mount;
7889 	status_t status = get_mount(device, &mount);
7890 	if (status != B_OK)
7891 		return status;
7892 
7893 	memset(info, 0, sizeof(struct fs_info));
7894 
7895 	if (HAS_FS_MOUNT_CALL(mount, read_fs_info))
7896 		status = FS_MOUNT_CALL(mount, read_fs_info, info);
7897 
7898 	// fill in info the file system doesn't (have to) know about
7899 	if (status == B_OK) {
7900 		info->dev = mount->id;
7901 		info->root = mount->root_vnode->id;
7902 
7903 		fs_volume* volume = mount->volume;
7904 		while (volume->super_volume != NULL)
7905 			volume = volume->super_volume;
7906 
7907 		strlcpy(info->fsh_name, volume->file_system_name,
7908 			sizeof(info->fsh_name));
7909 		if (mount->device_name != NULL) {
7910 			strlcpy(info->device_name, mount->device_name,
7911 				sizeof(info->device_name));
7912 		}
7913 	}
7914 
7915 	// even if the call is not supported by the file system, there are
7916 	// still the parts that we filled in ourselves
7917 
7918 	put_mount(mount);
7919 	return status;
7920 }
7921 
7922 
7923 static status_t
7924 fs_write_info(dev_t device, const struct fs_info* info, int mask)
7925 {
7926 	struct fs_mount* mount;
7927 	status_t status = get_mount(device, &mount);
7928 	if (status != B_OK)
7929 		return status;
7930 
7931 	if (HAS_FS_MOUNT_CALL(mount, write_fs_info))
7932 		status = FS_MOUNT_CALL(mount, write_fs_info, info, mask);
7933 	else
7934 		status = B_READ_ONLY_DEVICE;
7935 
7936 	put_mount(mount);
7937 	return status;
7938 }
7939 
7940 
7941 static dev_t
7942 fs_next_device(int32* _cookie)
7943 {
7944 	struct fs_mount* mount = NULL;
7945 	dev_t device = *_cookie;
7946 
7947 	mutex_lock(&sMountMutex);
7948 
7949 	// Since device IDs are assigned sequentially, this algorithm
7950 	// works well enough. It ensures that the returned device list
7951 	// is sorted, and that no device is skipped when an already
7952 	// visited device gets unmounted.
7953 
7954 	while (device < sNextMountID) {
7955 		mount = find_mount(device++);
7956 		if (mount != NULL && mount->volume->private_volume != NULL)
7957 			break;
7958 	}
7959 
7960 	*_cookie = device;
7961 
7962 	if (mount != NULL)
7963 		device = mount->id;
7964 	else
7965 		device = B_BAD_VALUE;
7966 
7967 	mutex_unlock(&sMountMutex);
7968 
7969 	return device;
7970 }
7971 
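// A minimal usage sketch (illustrative only; fs_next_device() is exposed to
// callers via _kern_next_device() below):
//
//	int32 cookie = 0;
//	dev_t device;
//	while ((device = fs_next_device(&cookie)) >= 0) {
//		// ... use the volume identified by "device" ...
//	}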
7972 
7973 ssize_t
7974 fs_read_attr(int fd, const char *attribute, uint32 type, off_t pos,
7975 	void *buffer, size_t readBytes)
7976 {
7977 	int attrFD = attr_open(fd, NULL, attribute, O_RDONLY, true);
7978 	if (attrFD < 0)
7979 		return attrFD;
7980 
7981 	ssize_t bytesRead = _kern_read(attrFD, pos, buffer, readBytes);
7982 
7983 	_kern_close(attrFD);
7984 
7985 	return bytesRead;
7986 }
7987 
7988 
7989 static status_t
7990 get_cwd(char* buffer, size_t size, bool kernel)
7991 {
7992 	// Get current working directory from io context
7993 	struct io_context* context = get_current_io_context(kernel);
7994 	status_t status;
7995 
7996 	FUNCTION(("vfs_get_cwd: buf %p, size %ld\n", buffer, size));
7997 
7998 	mutex_lock(&context->io_mutex);
7999 
8000 	struct vnode* vnode = context->cwd;
8001 	if (vnode)
8002 		inc_vnode_ref_count(vnode);
8003 
8004 	mutex_unlock(&context->io_mutex);
8005 
8006 	if (vnode) {
8007 		status = dir_vnode_to_path(vnode, buffer, size, kernel);
8008 		put_vnode(vnode);
8009 	} else
8010 		status = B_ERROR;
8011 
8012 	return status;
8013 }
8014 
8015 
8016 static status_t
8017 set_cwd(int fd, char* path, bool kernel)
8018 {
8019 	struct io_context* context;
8020 	struct vnode* vnode = NULL;
8021 	struct vnode* oldDirectory;
8022 	status_t status;
8023 
8024 	FUNCTION(("set_cwd: path = \'%s\'\n", path));
8025 
8026 	// Get vnode for passed path, and bail if it failed
8027 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
8028 	if (status < 0)
8029 		return status;
8030 
8031 	if (!S_ISDIR(vnode->Type())) {
8032 		// nope, can't cwd to here
8033 		status = B_NOT_A_DIRECTORY;
8034 		goto err;
8035 	}
8036 
8037 	// We need to have the permission to enter the directory, too
8038 	if (HAS_FS_CALL(vnode, access)) {
8039 		status = FS_CALL(vnode, access, X_OK);
8040 		if (status != B_OK)
8041 			goto err;
8042 	}
8043 
8044 	// Get current io context and lock
8045 	context = get_current_io_context(kernel);
8046 	mutex_lock(&context->io_mutex);
8047 
8048 	// save the old current working directory first
8049 	oldDirectory = context->cwd;
8050 	context->cwd = vnode;
8051 
8052 	mutex_unlock(&context->io_mutex);
8053 
8054 	if (oldDirectory)
8055 		put_vnode(oldDirectory);
8056 
8057 	return B_NO_ERROR;
8058 
8059 err:
8060 	put_vnode(vnode);
8061 	return status;
8062 }
8063 
8064 
8065 //	#pragma mark - kernel mirrored syscalls
8066 
8067 
8068 dev_t
8069 _kern_mount(const char* path, const char* device, const char* fsName,
8070 	uint32 flags, const char* args, size_t argsLength)
8071 {
8072 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8073 	if (pathBuffer.InitCheck() != B_OK)
8074 		return B_NO_MEMORY;
8075 
8076 	return fs_mount(pathBuffer.LockBuffer(), device, fsName, flags, args, true);
8077 }
8078 
8079 
8080 status_t
8081 _kern_unmount(const char* path, uint32 flags)
8082 {
8083 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8084 	if (pathBuffer.InitCheck() != B_OK)
8085 		return B_NO_MEMORY;
8086 
8087 	return fs_unmount(pathBuffer.LockBuffer(), -1, flags, true);
8088 }
8089 
8090 
8091 status_t
8092 _kern_read_fs_info(dev_t device, struct fs_info* info)
8093 {
8094 	if (info == NULL)
8095 		return B_BAD_VALUE;
8096 
8097 	return fs_read_info(device, info);
8098 }
8099 
8100 
8101 status_t
8102 _kern_write_fs_info(dev_t device, const struct fs_info* info, int mask)
8103 {
8104 	if (info == NULL)
8105 		return B_BAD_VALUE;
8106 
8107 	return fs_write_info(device, info, mask);
8108 }
8109 
8110 
8111 status_t
8112 _kern_sync(void)
8113 {
8114 	// Note: _kern_sync() is also called from _user_sync()
8115 	int32 cookie = 0;
8116 	dev_t device;
8117 	while ((device = next_dev(&cookie)) >= 0) {
8118 		status_t status = fs_sync(device);
8119 		if (status != B_OK && status != B_BAD_VALUE) {
8120 			dprintf("sync: device %" B_PRIdDEV " couldn't sync: %s\n", device,
8121 				strerror(status));
8122 		}
8123 	}
8124 
8125 	return B_OK;
8126 }
8127 
8128 
8129 dev_t
8130 _kern_next_device(int32* _cookie)
8131 {
8132 	return fs_next_device(_cookie);
8133 }
8134 
8135 
8136 status_t
8137 _kern_get_next_fd_info(team_id teamID, uint32* _cookie, fd_info* info,
8138 	size_t infoSize)
8139 {
8140 	if (infoSize != sizeof(fd_info))
8141 		return B_BAD_VALUE;
8142 
8143 	// get the team
8144 	Team* team = Team::Get(teamID);
8145 	if (team == NULL)
8146 		return B_BAD_TEAM_ID;
8147 	BReference<Team> teamReference(team, true);
8148 
8149 	// now that we have a team reference, its I/O context won't go away
8150 	io_context* context = team->io_context;
8151 	MutexLocker contextLocker(context->io_mutex);
8152 
8153 	uint32 slot = *_cookie;
8154 
8155 	struct file_descriptor* descriptor;
8156 	while (slot < context->table_size
8157 		&& (descriptor = context->fds[slot]) == NULL) {
8158 		slot++;
8159 	}
8160 
8161 	if (slot >= context->table_size)
8162 		return B_ENTRY_NOT_FOUND;
8163 
8164 	info->number = slot;
8165 	info->open_mode = descriptor->open_mode;
8166 
8167 	struct vnode* vnode = fd_vnode(descriptor);
8168 	if (vnode != NULL) {
8169 		info->device = vnode->device;
8170 		info->node = vnode->id;
8171 	} else if (descriptor->u.mount != NULL) {
8172 		info->device = descriptor->u.mount->id;
8173 		info->node = -1;
8174 	}
8175 
8176 	*_cookie = slot + 1;
8177 	return B_OK;
8178 }
8179 
8180 
8181 int
8182 _kern_open_entry_ref(dev_t device, ino_t inode, const char* name, int openMode,
8183 	int perms)
8184 {
8185 	if ((openMode & O_CREAT) != 0) {
8186 		return file_create_entry_ref(device, inode, name, openMode, perms,
8187 			true);
8188 	}
8189 
8190 	return file_open_entry_ref(device, inode, name, openMode, true);
8191 }
8192 
8193 
8194 /*!	\brief Opens a node specified by a FD + path pair.
8195 
8196 	At least one of \a fd and \a path must be specified.
8197 	If only \a fd is given, the function opens the node identified by this
8198 	FD. If only a path is given, this path is opened. If both are given and
8199 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8200 	of the directory (!) identified by \a fd.
8201 
8202 	\param fd The FD. May be < 0.
8203 	\param path The absolute or relative path. May be \c NULL.
8204 	\param openMode The open mode.
8205 	\return A FD referring to the newly opened node, or an error code,
8206 			if an error occurs.
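
	A usage sketch (illustrative; the paths are made up and \c dirFD stands
	for an already opened directory FD):
	\code
	// an absolute path -- the FD argument is ignored
	int fd = _kern_open(-1, "/boot/home/config/settings", O_RDONLY, 0);
	// a relative path is resolved against the directory FD
	int fd2 = _kern_open(dirFD, "config/settings", O_RDONLY, 0);
	\endcode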
8207 */
8208 int
8209 _kern_open(int fd, const char* path, int openMode, int perms)
8210 {
8211 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8212 	if (pathBuffer.InitCheck() != B_OK)
8213 		return B_NO_MEMORY;
8214 
8215 	if ((openMode & O_CREAT) != 0)
8216 		return file_create(fd, pathBuffer.LockBuffer(), openMode, perms, true);
8217 
8218 	return file_open(fd, pathBuffer.LockBuffer(), openMode, true);
8219 }
8220 
8221 
8222 /*!	\brief Opens a directory specified by entry_ref or node_ref.
8223 
8224 	The supplied name may be \c NULL, in which case directory identified
8225 	by \a device and \a inode will be opened. Otherwise \a device and
8226 	\a inode identify the parent directory of the directory to be opened
8227 	and \a name its entry name.
8228 
8229 	\param device If \a name is specified the ID of the device the parent
8230 		   directory of the directory to be opened resides on, otherwise
8231 		   the device of the directory itself.
8232 	\param inode If \a name is specified the node ID of the parent
8233 		   directory of the directory to be opened, otherwise node ID of the
8234 		   directory itself.
8235 	\param name The entry name of the directory to be opened. If \c NULL,
8236 		   the \a device + \a inode pair identify the node to be opened.
8237 	\return The FD of the newly opened directory or an error code, if
8238 			something went wrong.
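
	A usage sketch (illustrative; \c dev, \c parentNode and \c dirNode are
	hypothetical IDs):
	\code
	// open the directory named "subdir" in the directory (dev, parentNode)
	int fd = _kern_open_dir_entry_ref(dev, parentNode, "subdir");
	// open the directory (dev, dirNode) itself
	int fd2 = _kern_open_dir_entry_ref(dev, dirNode, NULL);
	\endcode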
8239 */
8240 int
8241 _kern_open_dir_entry_ref(dev_t device, ino_t inode, const char* name)
8242 {
8243 	return dir_open_entry_ref(device, inode, name, true);
8244 }
8245 
8246 
8247 /*!	\brief Opens a directory specified by a FD + path pair.
8248 
8249 	At least one of \a fd and \a path must be specified.
8250 	If only \a fd is given, the function opens the directory identified by this
8251 	FD. If only a path is given, this path is opened. If both are given and
8252 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8253 	of the directory (!) identified by \a fd.
8254 
8255 	\param fd The FD. May be < 0.
8256 	\param path The absolute or relative path. May be \c NULL.
8257 	\return A FD referring to the newly opened directory, or an error code,
8258 			if an error occurs.
8259 */
8260 int
8261 _kern_open_dir(int fd, const char* path)
8262 {
8263 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8264 	if (pathBuffer.InitCheck() != B_OK)
8265 		return B_NO_MEMORY;
8266 
8267 	return dir_open(fd, pathBuffer.LockBuffer(), true);
8268 }
8269 
8270 
8271 status_t
8272 _kern_fcntl(int fd, int op, size_t argument)
8273 {
8274 	return common_fcntl(fd, op, argument, true);
8275 }
8276 
8277 
8278 status_t
8279 _kern_fsync(int fd)
8280 {
8281 	return common_sync(fd, true);
8282 }
8283 
8284 
8285 status_t
8286 _kern_lock_node(int fd)
8287 {
8288 	return common_lock_node(fd, true);
8289 }
8290 
8291 
8292 status_t
8293 _kern_unlock_node(int fd)
8294 {
8295 	return common_unlock_node(fd, true);
8296 }
8297 
8298 
8299 status_t
8300 _kern_create_dir_entry_ref(dev_t device, ino_t inode, const char* name,
8301 	int perms)
8302 {
8303 	return dir_create_entry_ref(device, inode, name, perms, true);
8304 }
8305 
8306 
8307 /*!	\brief Creates a directory specified by a FD + path pair.
8308 
8309 	\a path must always be specified (it contains the name of the new directory
8310 	at least). If only a path is given, this path identifies the location at
8311 	which the directory shall be created. If both \a fd and \a path are given
8312 	and the path is absolute, \a fd is ignored; a relative path is reckoned off
8313 	of the directory (!) identified by \a fd.
8314 
8315 	\param fd The FD. May be < 0.
8316 	\param path The absolute or relative path. Must not be \c NULL.
8317 	\param perms The access permissions the new directory shall have.
8318 	\return \c B_OK, if the directory has been created successfully, another
8319 			error code otherwise.
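
	For example (illustrative path):
	\code
	// create /boot/home/cache with rwxr-xr-x permissions
	status_t error = _kern_create_dir(-1, "/boot/home/cache", 0755);
	\endcode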
8320 */
8321 status_t
8322 _kern_create_dir(int fd, const char* path, int perms)
8323 {
8324 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8325 	if (pathBuffer.InitCheck() != B_OK)
8326 		return B_NO_MEMORY;
8327 
8328 	return dir_create(fd, pathBuffer.LockBuffer(), perms, true);
8329 }
8330 
8331 
8332 status_t
8333 _kern_remove_dir(int fd, const char* path)
8334 {
8335 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8336 	if (pathBuffer.InitCheck() != B_OK)
8337 		return B_NO_MEMORY;
8338 
8339 	return dir_remove(fd, pathBuffer.LockBuffer(), true);
8340 }
8341 
8342 
8343 /*!	\brief Reads the contents of a symlink referred to by a FD + path pair.
8344 
8345 	At least one of \a fd and \a path must be specified.
8346 	If only \a fd is given, the symlink to be read is the node identified by
8347 	this FD. If only a path is given, this path identifies the
8348 	symlink to be read. If both are given and the path is absolute, \a fd is
8349 	ignored; a relative path is reckoned off of the directory (!) identified
8350 	by \a fd.
8351 	If this function fails with B_BUFFER_OVERFLOW, the \a _bufferSize pointer
8352 	will still be updated to reflect the required buffer size.
8353 
8354 	\param fd The FD. May be < 0.
8355 	\param path The absolute or relative path. May be \c NULL.
8356 	\param buffer The buffer into which the contents of the symlink shall be
8357 		   written.
8358 	\param _bufferSize A pointer to the size of the supplied buffer.
8359 	\return The length of the link on success or an appropriate error code
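
	A usage sketch of the size handling described above (illustrative path):
	\code
	char buffer[B_PATH_NAME_LENGTH];
	size_t size = sizeof(buffer);
	status_t error = _kern_read_link(-1, "/boot/home/link", buffer, &size);
	if (error == B_BUFFER_OVERFLOW) {
		// "size" now holds the buffer size that would have been needed
	}
	\endcode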
8360 */
8361 status_t
8362 _kern_read_link(int fd, const char* path, char* buffer, size_t* _bufferSize)
8363 {
8364 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8365 	if (pathBuffer.InitCheck() != B_OK)
8366 		return B_NO_MEMORY;
8367 
8368 	return common_read_link(fd, pathBuffer.LockBuffer(),
8369 		buffer, _bufferSize, true);
8370 }
8371 
8372 
8373 /*!	\brief Creates a symlink specified by a FD + path pair.
8374 
8375 	\a path must always be specified (it contains the name of the new symlink
8376 	at least). If only a path is given, this path identifies the location at
8377 	which the symlink shall be created. If both \a fd and \a path are given and
8378 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8379 	of the directory (!) identified by \a fd.
8380 
8381 	\param fd The FD. May be < 0.
8382 	\param path The path of the symlink to be created. Must not be \c NULL.
	\param toPath The path the symlink shall point to.
8383 	\param mode The access permissions the new symlink shall have.
8384 	\return \c B_OK, if the symlink has been created successfully, another
8385 			error code otherwise.
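
	For example (illustrative paths):
	\code
	status_t error = _kern_create_symlink(-1, "/boot/home/link",
		"/boot/home/target", 0777);
	\endcode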
8386 */
8387 status_t
8388 _kern_create_symlink(int fd, const char* path, const char* toPath, int mode)
8389 {
8390 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8391 	if (pathBuffer.InitCheck() != B_OK)
8392 		return B_NO_MEMORY;
8393 
8394 	return common_create_symlink(fd, pathBuffer.LockBuffer(),
8395 		toPath, mode, true);
8396 }
8397 
8398 
8399 status_t
8400 _kern_create_link(int pathFD, const char* path, int toFD, const char* toPath,
8401 	bool traverseLeafLink)
8402 {
8403 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8404 	KPath toPathBuffer(toPath, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8405 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
8406 		return B_NO_MEMORY;
8407 
8408 	return common_create_link(pathFD, pathBuffer.LockBuffer(), toFD,
8409 		toPathBuffer.LockBuffer(), traverseLeafLink, true);
8410 }
8411 
8412 
8413 /*!	\brief Removes an entry specified by a FD + path pair from its directory.
8414 
8415 	\a path must always be specified (it contains at least the name of the entry
8416 	to be deleted). If only a path is given, this path identifies the entry
8417 	directly. If both \a fd and \a path are given and the path is absolute,
8418 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8419 	identified by \a fd.
8420 
8421 	\param fd The FD. May be < 0.
8422 	\param path The absolute or relative path. Must not be \c NULL.
8423 	\return \c B_OK, if the entry has been removed successfully, another
8424 			error code otherwise.
8425 */
8426 status_t
8427 _kern_unlink(int fd, const char* path)
8428 {
8429 	KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8430 	if (pathBuffer.InitCheck() != B_OK)
8431 		return B_NO_MEMORY;
8432 
8433 	return common_unlink(fd, pathBuffer.LockBuffer(), true);
8434 }
8435 
8436 
8437 /*!	\brief Moves an entry specified by a FD + path pair to an entry specified
8438 		   by another FD + path pair.
8439 
8440 	\a oldPath and \a newPath must always be specified (they contain at least
8441 	the name of the entry). If only a path is given, this path identifies the
8442 	entry directly. If both a FD and a path are given and the path is absolute,
8443 	the FD is ignored; a relative path is reckoned off of the directory (!)
8444 	identified by the respective FD.
8445 
8446 	\param oldFD The FD of the old location. May be < 0.
8447 	\param oldPath The absolute or relative path of the old location. Must not
8448 		   be \c NULL.
8449 	\param newFD The FD of the new location. May be < 0.
8450 	\param newPath The absolute or relative path of the new location. Must not
8451 		   be \c NULL.
8452 	\return \c B_OK, if the entry has been moved successfully, another
8453 			error code otherwise.
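
	For example (illustrative paths; with absolute paths both FDs are
	ignored):
	\code
	status_t error = _kern_rename(-1, "/boot/home/old-name",
		-1, "/boot/home/new-name");
	\endcode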
8454 */
8455 status_t
8456 _kern_rename(int oldFD, const char* oldPath, int newFD, const char* newPath)
8457 {
8458 	KPath oldPathBuffer(oldPath, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8459 	KPath newPathBuffer(newPath, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8460 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
8461 		return B_NO_MEMORY;
8462 
8463 	return common_rename(oldFD, oldPathBuffer.LockBuffer(),
8464 		newFD, newPathBuffer.LockBuffer(), true);
8465 }
8466 
8467 
8468 status_t
8469 _kern_access(int fd, const char* path, int mode, bool effectiveUserGroup)
8470 {
8471 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8472 	if (pathBuffer.InitCheck() != B_OK)
8473 		return B_NO_MEMORY;
8474 
8475 	return common_access(fd, pathBuffer.LockBuffer(), mode, effectiveUserGroup,
8476 		true);
8477 }
8478 
8479 
8480 /*!	\brief Reads stat data of an entity specified by a FD + path pair.
8481 
8482 	If only \a fd is given, the stat operation associated with the type
8483 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8484 	given, this path identifies the entry for whose node to retrieve the
8485 	stat data. If both \a fd and \a path are given and the path is absolute,
8486 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8487 	identified by \a fd and specifies the entry whose stat data shall be
8488 	retrieved.
8489 
8490 	\param fd The FD. May be < 0.
8491 	\param path The absolute or relative path. May be \c NULL.
8492 	\param traverseLeafLink If \a path is given, \c true specifies that the
8493 		   function shall not stick to symlinks, but traverse them.
8494 	\param stat The buffer the stat data shall be written into.
8495 	\param statSize The size of the supplied stat buffer.
8496 	\return \c B_OK, if the stat data has been read successfully, another
8497 			error code otherwise.
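
	A sketch of the \a statSize mechanism (illustrative; a legacy caller
	would pass the smaller size of its own stat structure):
	\code
	struct stat st;
	status_t error = _kern_read_stat(-1, "/boot/home", true, &st, sizeof(st));
	// with statSize < sizeof(struct stat), only the first statSize bytes
	// are copied back into the caller's buffer
	\endcode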
8498 */
8499 status_t
8500 _kern_read_stat(int fd, const char* path, bool traverseLeafLink,
8501 	struct stat* stat, size_t statSize)
8502 {
8503 	struct stat completeStat;
8504 	struct stat* originalStat = NULL;
8505 	status_t status;
8506 
8507 	if (statSize > sizeof(struct stat))
8508 		return B_BAD_VALUE;
8509 
8510 	// this supports different stat extensions
8511 	if (statSize < sizeof(struct stat)) {
8512 		originalStat = stat;
8513 		stat = &completeStat;
8514 	}
8515 
8516 	status = vfs_read_stat(fd, path, traverseLeafLink, stat, true);
8517 
8518 	if (status == B_OK && originalStat != NULL)
8519 		memcpy(originalStat, stat, statSize);
8520 
8521 	return status;
8522 }
8523 
8524 
8525 /*!	\brief Writes stat data of an entity specified by a FD + path pair.
8526 
8527 	If only \a fd is given, the stat operation associated with the type
8528 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8529 	given, this path identifies the entry for whose node to write the
8530 	stat data. If both \a fd and \a path are given and the path is absolute,
8531 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8532 	identified by \a fd and specifies the entry whose stat data shall be
8533 	written.
8534 
8535 	\param fd The FD. May be < 0.
8536 	\param path The absolute or relative path. May be \c NULL.
8537 	\param traverseLeafLink If \a path is given, \c true specifies that the
8538 		   function shall not stick to symlinks, but traverse them.
8539 	\param stat The buffer containing the stat data to be written.
8540 	\param statSize The size of the supplied stat buffer.
8541 	\param statMask A mask specifying which parts of the stat data shall be
8542 		   written.
8543 	\return \c B_OK, if the stat data has been written successfully,
8544 			another error code otherwise.
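
	For example, to change only a node's permissions (a sketch; \c B_STAT_MODE
	is the mask flag declared in <NodeMonitor.h>):
	\code
	struct stat st;
	st.st_mode = 0644;
	status_t error = _kern_write_stat(-1, "/boot/home/file", true, &st,
		sizeof(st), B_STAT_MODE);
	\endcode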
8545 */
8546 status_t
8547 _kern_write_stat(int fd, const char* path, bool traverseLeafLink,
8548 	const struct stat* stat, size_t statSize, int statMask)
8549 {
8550 	struct stat completeStat;
8551 
8552 	if (statSize > sizeof(struct stat))
8553 		return B_BAD_VALUE;
8554 
8555 	// this supports different stat extensions
8556 	if (statSize < sizeof(struct stat)) {
8557 		memset((uint8*)&completeStat + statSize, 0,
8558 			sizeof(struct stat) - statSize);
8559 		memcpy(&completeStat, stat, statSize);
8560 		stat = &completeStat;
8561 	}
8562 
8563 	status_t status;
8564 
8565 	if (path != NULL) {
8566 		// path given: write the stat of the node referred to by (fd, path)
8567 		KPath pathBuffer(path, KPath::DEFAULT, B_PATH_NAME_LENGTH + 1);
8568 		if (pathBuffer.InitCheck() != B_OK)
8569 			return B_NO_MEMORY;
8570 
8571 		status = common_path_write_stat(fd, pathBuffer.LockBuffer(),
8572 			traverseLeafLink, stat, statMask, true);
8573 	} else {
8574 		// no path given: get the FD and use the FD operation
8575 		struct file_descriptor* descriptor
8576 			= get_fd(get_current_io_context(true), fd);
8577 		if (descriptor == NULL)
8578 			return B_FILE_ERROR;
8579 
8580 		if (descriptor->ops->fd_write_stat)
8581 			status = descriptor->ops->fd_write_stat(descriptor, stat, statMask);
8582 		else
8583 			status = B_UNSUPPORTED;
8584 
8585 		put_fd(descriptor);
8586 	}
8587 
8588 	return status;
8589 }
8590 
8591 
8592 int
8593 _kern_open_attr_dir(int fd, const char* path, bool traverseLeafLink)
8594 {
8595 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8596 	if (pathBuffer.InitCheck() != B_OK)
8597 		return B_NO_MEMORY;
8598 
8599 	return attr_dir_open(fd, pathBuffer.LockBuffer(), traverseLeafLink, true);
8600 }
8601 
8602 
8603 int
8604 _kern_open_attr(int fd, const char* path, const char* name, uint32 type,
8605 	int openMode)
8606 {
8607 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8608 	if (pathBuffer.InitCheck() != B_OK)
8609 		return B_NO_MEMORY;
8610 
8611 	if ((openMode & O_CREAT) != 0) {
8612 		return attr_create(fd, pathBuffer.LockBuffer(), name, type, openMode,
8613 			true);
8614 	}
8615 
8616 	return attr_open(fd, pathBuffer.LockBuffer(), name, openMode, true);
8617 }
8618 
8619 
8620 status_t
8621 _kern_remove_attr(int fd, const char* name)
8622 {
8623 	return attr_remove(fd, name, true);
8624 }
8625 
8626 
8627 status_t
8628 _kern_rename_attr(int fromFile, const char* fromName, int toFile,
8629 	const char* toName)
8630 {
8631 	return attr_rename(fromFile, fromName, toFile, toName, true);
8632 }
8633 
8634 
8635 int
8636 _kern_open_index_dir(dev_t device)
8637 {
8638 	return index_dir_open(device, true);
8639 }
8640 
8641 
8642 status_t
8643 _kern_create_index(dev_t device, const char* name, uint32 type, uint32 flags)
8644 {
8645 	return index_create(device, name, type, flags, true);
8646 }
8647 
8648 
8649 status_t
8650 _kern_read_index_stat(dev_t device, const char* name, struct stat* stat)
8651 {
8652 	return index_name_read_stat(device, name, stat, true);
8653 }
8654 
8655 
8656 status_t
8657 _kern_remove_index(dev_t device, const char* name)
8658 {
8659 	return index_remove(device, name, true);
8660 }
8661 
8662 
8663 status_t
8664 _kern_getcwd(char* buffer, size_t size)
8665 {
8666 	TRACE(("_kern_getcwd: buf %p, %ld\n", buffer, size));
8667 
8668 	// Call vfs to get current working directory
8669 	return get_cwd(buffer, size, true);
8670 }
8671 
8672 
8673 status_t
8674 _kern_setcwd(int fd, const char* path)
8675 {
8676 	KPath pathBuffer(path, KPath::LAZY_ALLOC, B_PATH_NAME_LENGTH + 1);
8677 	if (pathBuffer.InitCheck() != B_OK)
8678 		return B_NO_MEMORY;
8679 
8680 	return set_cwd(fd, pathBuffer.LockBuffer(), true);
8681 }
8682 
8683 
8684 //	#pragma mark - userland syscalls
8685 
8686 
8687 dev_t
8688 _user_mount(const char* userPath, const char* userDevice,
8689 	const char* userFileSystem, uint32 flags, const char* userArgs,
8690 	size_t argsLength)
8691 {
8692 	char fileSystem[B_FILE_NAME_LENGTH];
8693 	KPath path, device;
8694 	char* args = NULL;
8695 	status_t status;
8696 
8697 	if (!IS_USER_ADDRESS(userPath)
8698 		|| !IS_USER_ADDRESS(userFileSystem)
8699 		|| !IS_USER_ADDRESS(userDevice))
8700 		return B_BAD_ADDRESS;
8701 
8702 	if (path.InitCheck() != B_OK || device.InitCheck() != B_OK)
8703 		return B_NO_MEMORY;
8704 
8705 	if (user_strlcpy(path.LockBuffer(), userPath, B_PATH_NAME_LENGTH) < B_OK)
8706 		return B_BAD_ADDRESS;
8707 
8708 	if (userFileSystem != NULL
8709 		&& user_strlcpy(fileSystem, userFileSystem, sizeof(fileSystem)) < B_OK)
8710 		return B_BAD_ADDRESS;
8711 
8712 	if (userDevice != NULL
8713 		&& user_strlcpy(device.LockBuffer(), userDevice, B_PATH_NAME_LENGTH)
8714 			< B_OK)
8715 		return B_BAD_ADDRESS;
8716 
8717 	if (userArgs != NULL && argsLength > 0) {
8718 		if (!IS_USER_ADDRESS(userArgs))
8719 			return B_BAD_ADDRESS;
8720 
8721 		// this is a safety restriction
8722 		if (argsLength >= 65536)
8723 			return B_NAME_TOO_LONG;
8724 
8725 		args = (char*)malloc(argsLength + 1);
8726 		if (args == NULL)
8727 			return B_NO_MEMORY;
8728 
8729 		if (user_strlcpy(args, userArgs, argsLength + 1) < B_OK) {
8730 			free(args);
8731 			return B_BAD_ADDRESS;
8732 		}
8733 	}
8734 	path.UnlockBuffer();
8735 	device.UnlockBuffer();
8736 
8737 	status = fs_mount(path.LockBuffer(),
8738 		userDevice != NULL ? device.Path() : NULL,
8739 		userFileSystem ? fileSystem : NULL, flags, args, false);
8740 
8741 	free(args);
8742 	return status;
8743 }
8744 
8745 
8746 status_t
8747 _user_unmount(const char* userPath, uint32 flags)
8748 {
8749 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
8750 
8751 	if (!IS_USER_ADDRESS(userPath))
8752 		return B_BAD_ADDRESS;
8753 
8754 	if (pathBuffer.InitCheck() != B_OK)
8755 		return B_NO_MEMORY;
8756 
8757 	char* path = pathBuffer.LockBuffer();
8758 
8759 	if (user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
8760 		return B_BAD_ADDRESS;
8761 
8762 	return fs_unmount(path, -1, flags & ~B_UNMOUNT_BUSY_PARTITION, false);
8763 }
8764 
8765 
8766 status_t
8767 _user_read_fs_info(dev_t device, struct fs_info* userInfo)
8768 {
8769 	struct fs_info info;
8770 	status_t status;
8771 
8772 	if (userInfo == NULL)
8773 		return B_BAD_VALUE;
8774 
8775 	if (!IS_USER_ADDRESS(userInfo))
8776 		return B_BAD_ADDRESS;
8777 
8778 	status = fs_read_info(device, &info);
8779 	if (status != B_OK)
8780 		return status;
8781 
8782 	if (user_memcpy(userInfo, &info, sizeof(struct fs_info)) != B_OK)
8783 		return B_BAD_ADDRESS;
8784 
8785 	return B_OK;
8786 }
8787 
8788 
8789 status_t
8790 _user_write_fs_info(dev_t device, const struct fs_info* userInfo, int mask)
8791 {
8792 	struct fs_info info;
8793 
8794 	if (userInfo == NULL)
8795 		return B_BAD_VALUE;
8796 
8797 	if (!IS_USER_ADDRESS(userInfo)
8798 		|| user_memcpy(&info, userInfo, sizeof(struct fs_info)) != B_OK)
8799 		return B_BAD_ADDRESS;
8800 
8801 	return fs_write_info(device, &info, mask);
8802 }
8803 
8804 
8805 dev_t
8806 _user_next_device(int32* _userCookie)
8807 {
8808 	int32 cookie;
8809 	dev_t device;
8810 
8811 	if (!IS_USER_ADDRESS(_userCookie)
8812 		|| user_memcpy(&cookie, _userCookie, sizeof(int32)) != B_OK)
8813 		return B_BAD_ADDRESS;
8814 
8815 	device = fs_next_device(&cookie);
8816 
8817 	if (device >= B_OK) {
8818 		// update user cookie
8819 		if (user_memcpy(_userCookie, &cookie, sizeof(int32)) != B_OK)
8820 			return B_BAD_ADDRESS;
8821 	}
8822 
8823 	return device;
8824 }
8825 
8826 
8827 status_t
8828 _user_sync(void)
8829 {
8830 	return _kern_sync();
8831 }
8832 
8833 
8834 status_t
8835 _user_get_next_fd_info(team_id team, uint32* userCookie, fd_info* userInfo,
8836 	size_t infoSize)
8837 {
8838 	struct fd_info info;
8839 	uint32 cookie;
8840 
8841 	// only root can do this (or should root's group be enough?)
8842 	if (geteuid() != 0)
8843 		return B_NOT_ALLOWED;
8844 
8845 	if (infoSize != sizeof(fd_info))
8846 		return B_BAD_VALUE;
8847 
8848 	if (!IS_USER_ADDRESS(userCookie) || !IS_USER_ADDRESS(userInfo)
8849 		|| user_memcpy(&cookie, userCookie, sizeof(uint32)) != B_OK)
8850 		return B_BAD_ADDRESS;
8851 
8852 	status_t status = _kern_get_next_fd_info(team, &cookie, &info, infoSize);
8853 	if (status != B_OK)
8854 		return status;
8855 
8856 	if (user_memcpy(userCookie, &cookie, sizeof(uint32)) != B_OK
8857 		|| user_memcpy(userInfo, &info, infoSize) != B_OK)
8858 		return B_BAD_ADDRESS;
8859 
8860 	return status;
8861 }
8862 
8863 
8864 status_t
8865 _user_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
8866 	char* userPath, size_t pathLength)
8867 {
8868 	if (!IS_USER_ADDRESS(userPath))
8869 		return B_BAD_ADDRESS;
8870 
8871 	KPath path(B_PATH_NAME_LENGTH + 1);
8872 	if (path.InitCheck() != B_OK)
8873 		return B_NO_MEMORY;
8874 
8875 	// copy the leaf name onto the stack
8876 	char stackLeaf[B_FILE_NAME_LENGTH];
8877 	if (leaf != NULL) {
8878 		if (!IS_USER_ADDRESS(leaf))
8879 			return B_BAD_ADDRESS;
8880 
8881 		int length = user_strlcpy(stackLeaf, leaf, B_FILE_NAME_LENGTH);
8882 		if (length < 0)
8883 			return length;
8884 		if (length >= B_FILE_NAME_LENGTH)
8885 			return B_NAME_TOO_LONG;
8886 
8887 		leaf = stackLeaf;
8888 	}
8889 
8890 	status_t status = vfs_entry_ref_to_path(device, inode, leaf,
8891 		false, path.LockBuffer(), path.BufferSize());
8892 	if (status != B_OK)
8893 		return status;
8894 
8895 	path.UnlockBuffer();
8896 
8897 	int length = user_strlcpy(userPath, path.Path(), pathLength);
8898 	if (length < 0)
8899 		return length;
8900 	if (length >= (int)pathLength)
8901 		return B_BUFFER_OVERFLOW;
8902 
8903 	return B_OK;
8904 }
8905 
8906 
8907 status_t
8908 _user_normalize_path(const char* userPath, bool traverseLink, char* buffer)
8909 {
8910 	if (userPath == NULL || buffer == NULL)
8911 		return B_BAD_VALUE;
8912 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(buffer))
8913 		return B_BAD_ADDRESS;
8914 
8915 	// copy path from userland
8916 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
8917 	if (pathBuffer.InitCheck() != B_OK)
8918 		return B_NO_MEMORY;
8919 	char* path = pathBuffer.LockBuffer();
8920 
8921 	if (user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
8922 		return B_BAD_ADDRESS;
8923 
8924 	status_t error = normalize_path(path, pathBuffer.BufferSize(), traverseLink,
8925 		false);
8926 	if (error != B_OK)
8927 		return error;
8928 
8929 	// copy back to userland
8930 	int len = user_strlcpy(buffer, path, B_PATH_NAME_LENGTH);
8931 	if (len < 0)
8932 		return len;
8933 	if (len >= B_PATH_NAME_LENGTH)
8934 		return B_BUFFER_OVERFLOW;
8935 
8936 	return B_OK;
8937 }
8938 
8939 
8940 int
8941 _user_open_entry_ref(dev_t device, ino_t inode, const char* userName,
8942 	int openMode, int perms)
8943 {
8944 	char name[B_FILE_NAME_LENGTH];
8945 
8946 	if (userName == NULL || device < 0 || inode < 0)
8947 		return B_BAD_VALUE;
8948 	if (!IS_USER_ADDRESS(userName)
8949 		|| user_strlcpy(name, userName, sizeof(name)) < B_OK)
8950 		return B_BAD_ADDRESS;
8951 
8952 	if ((openMode & O_CREAT) != 0) {
8953 		return file_create_entry_ref(device, inode, name, openMode, perms,
8954 			false);
8955 	}
8956 
8957 	return file_open_entry_ref(device, inode, name, openMode, false);
8958 }
8959 
8960 
8961 int
8962 _user_open(int fd, const char* userPath, int openMode, int perms)
8963 {
8964 	KPath path(B_PATH_NAME_LENGTH + 1);
8965 	if (path.InitCheck() != B_OK)
8966 		return B_NO_MEMORY;
8967 
8968 	char* buffer = path.LockBuffer();
8969 
8970 	if (!IS_USER_ADDRESS(userPath)
8971 		|| user_strlcpy(buffer, userPath, B_PATH_NAME_LENGTH) < B_OK)
8972 		return B_BAD_ADDRESS;
8973 
8974 	if ((openMode & O_CREAT) != 0)
8975 		return file_create(fd, buffer, openMode, perms, false);
8976 
8977 	return file_open(fd, buffer, openMode, false);
8978 }
8979 
8980 
8981 int
8982 _user_open_dir_entry_ref(dev_t device, ino_t inode, const char* userName)
8983 {
8984 	if (userName != NULL) {
8985 		char name[B_FILE_NAME_LENGTH];
8986 
8987 		if (!IS_USER_ADDRESS(userName)
8988 			|| user_strlcpy(name, userName, sizeof(name)) < B_OK)
8989 			return B_BAD_ADDRESS;
8990 
8991 		return dir_open_entry_ref(device, inode, name, false);
8992 	}
8993 	return dir_open_entry_ref(device, inode, NULL, false);
8994 }
8995 
8996 
8997 int
8998 _user_open_dir(int fd, const char* userPath)
8999 {
9000 	if (userPath == NULL)
9001 		return dir_open(fd, NULL, false);
9002 
9003 	KPath path(B_PATH_NAME_LENGTH + 1);
9004 	if (path.InitCheck() != B_OK)
9005 		return B_NO_MEMORY;
9006 
9007 	char* buffer = path.LockBuffer();
9008 
9009 	if (!IS_USER_ADDRESS(userPath)
9010 		|| user_strlcpy(buffer, userPath, B_PATH_NAME_LENGTH) < B_OK)
9011 		return B_BAD_ADDRESS;
9012 
9013 	return dir_open(fd, buffer, false);
9014 }
9015 
9016 
9017 /*!	\brief Opens a directory's parent directory and returns the entry name
9018 		   of the former.
9019 
9020 	Aside from the fact that it returns the directory's entry name, this
9021 	method is equivalent to \code _user_open_dir(fd, "..") \endcode. If
9022 	\a userName is \c NULL, it really is equivalent.
9023 
9024 	If a name buffer is supplied and the name does not fit the buffer, the
9025 	function fails. A buffer of size \c B_FILE_NAME_LENGTH should be safe.
9026 
9027 	\param fd A FD referring to a directory.
9028 	\param userName Buffer the directory's entry name shall be written into.
9029 		   May be \c NULL.
9030 	\param nameLength Size of the name buffer.
9031 	\return The file descriptor of the opened parent directory, if everything
9032 			went fine, an error code otherwise.
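
	A usage sketch (illustrative; \c dirFD stands for an open directory FD,
	as passed in via the corresponding syscall):
	\code
	char name[B_FILE_NAME_LENGTH];
	int parentFD = _user_open_parent_dir(dirFD, name, sizeof(name));
	// on success, "name" contains the entry name of the directory dirFD
	\endcode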
9033 */
9034 int
9035 _user_open_parent_dir(int fd, char* userName, size_t nameLength)
9036 {
9037 	bool kernel = false;
9038 
9039 	if (userName && !IS_USER_ADDRESS(userName))
9040 		return B_BAD_ADDRESS;
9041 
9042 	// open the parent dir
9043 	int parentFD = dir_open(fd, (char*)"..", kernel);
9044 	if (parentFD < 0)
9045 		return parentFD;
9046 	FDCloser fdCloser(parentFD, kernel);
9047 
9048 	if (userName) {
9049 		// get the vnodes
9050 		struct vnode* parentVNode = get_vnode_from_fd(parentFD, kernel);
9051 		struct vnode* dirVNode = get_vnode_from_fd(fd, kernel);
9052 		VNodePutter parentVNodePutter(parentVNode);
9053 		VNodePutter dirVNodePutter(dirVNode);
9054 		if (!parentVNode || !dirVNode)
9055 			return B_FILE_ERROR;
9056 
9057 		// get the vnode name
9058 		char _buffer[sizeof(struct dirent) + B_FILE_NAME_LENGTH];
9059 		struct dirent* buffer = (struct dirent*)_buffer;
9060 		status_t status = get_vnode_name(dirVNode, parentVNode, buffer,
9061 			sizeof(_buffer), get_current_io_context(false));
9062 		if (status != B_OK)
9063 			return status;
9064 
9065 		// copy the name to the userland buffer
9066 		int len = user_strlcpy(userName, buffer->d_name, nameLength);
9067 		if (len < 0)
9068 			return len;
9069 		if (len >= (int)nameLength)
9070 			return B_BUFFER_OVERFLOW;
9071 	}
9072 
9073 	return fdCloser.Detach();
9074 }
9075 
9076 
9077 status_t
9078 _user_fcntl(int fd, int op, size_t argument)
9079 {
9080 	status_t status = common_fcntl(fd, op, argument, false);
9081 	if (op == F_SETLKW)
9082 		syscall_restart_handle_post(status);
9083 
9084 	return status;
9085 }
9086 
9087 
9088 status_t
9089 _user_fsync(int fd)
9090 {
9091 	return common_sync(fd, false);
9092 }
9093 
9094 
9095 status_t
9096 _user_flock(int fd, int operation)
9097 {
9098 	FUNCTION(("_user_flock(fd = %d, op = %d)\n", fd, operation));
9099 
9100 	// Check if the operation is valid
9101 	switch (operation & ~LOCK_NB) {
9102 		case LOCK_UN:
9103 		case LOCK_SH:
9104 		case LOCK_EX:
9105 			break;
9106 
9107 		default:
9108 			return B_BAD_VALUE;
9109 	}
9110 
9111 	struct file_descriptor* descriptor;
9112 	struct vnode* vnode;
9113 	descriptor = get_fd_and_vnode(fd, &vnode, false);
9114 	if (descriptor == NULL)
9115 		return B_FILE_ERROR;
9116 
9117 	if (descriptor->type != FDTYPE_FILE) {
9118 		put_fd(descriptor);
9119 		return B_BAD_VALUE;
9120 	}
9121 
9122 	struct flock flock;
9123 	flock.l_start = 0;
9124 	flock.l_len = OFF_MAX;
9125 	flock.l_whence = 0;
9126 	flock.l_type = (operation & LOCK_SH) != 0 ? F_RDLCK : F_WRLCK;
9127 
9128 	status_t status;
9129 	if ((operation & LOCK_UN) != 0)
9130 		status = release_advisory_lock(vnode, &flock);
9131 	else {
9132 		status = acquire_advisory_lock(vnode,
9133 			thread_get_current_thread()->team->session_id, &flock,
9134 			(operation & LOCK_NB) == 0);
9135 	}
9136 
9137 	syscall_restart_handle_post(status);
9138 
9139 	put_fd(descriptor);
9140 	return status;
9141 }
9142 
9143 
9144 status_t
9145 _user_lock_node(int fd)
9146 {
9147 	return common_lock_node(fd, false);
9148 }
9149 
9150 
9151 status_t
9152 _user_unlock_node(int fd)
9153 {
9154 	return common_unlock_node(fd, false);
9155 }
9156 
9157 
9158 status_t
9159 _user_create_dir_entry_ref(dev_t device, ino_t inode, const char* userName,
9160 	int perms)
9161 {
9162 	char name[B_FILE_NAME_LENGTH];
9163 	status_t status;
9164 
9165 	if (!IS_USER_ADDRESS(userName))
9166 		return B_BAD_ADDRESS;
9167 
9168 	status = user_strlcpy(name, userName, sizeof(name));
9169 	if (status < 0)
9170 		return status;
9171 
9172 	return dir_create_entry_ref(device, inode, name, perms, false);
9173 }
9174 
9175 
9176 status_t
9177 _user_create_dir(int fd, const char* userPath, int perms)
9178 {
9179 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9180 	if (pathBuffer.InitCheck() != B_OK)
9181 		return B_NO_MEMORY;
9182 
9183 	char* path = pathBuffer.LockBuffer();
9184 
9185 	if (!IS_USER_ADDRESS(userPath)
9186 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9187 		return B_BAD_ADDRESS;
9188 
9189 	return dir_create(fd, path, perms, false);
9190 }
9191 
9192 
9193 status_t
9194 _user_remove_dir(int fd, const char* userPath)
9195 {
9196 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9197 	if (pathBuffer.InitCheck() != B_OK)
9198 		return B_NO_MEMORY;
9199 
9200 	char* path = pathBuffer.LockBuffer();
9201 
9202 	if (userPath != NULL) {
9203 		if (!IS_USER_ADDRESS(userPath)
9204 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9205 			return B_BAD_ADDRESS;
9206 	}
9207 
9208 	return dir_remove(fd, userPath ? path : NULL, false);
9209 }
9210 
9211 
9212 status_t
9213 _user_read_link(int fd, const char* userPath, char* userBuffer,
9214 	size_t* userBufferSize)
9215 {
9216 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1), linkBuffer;
9217 	if (pathBuffer.InitCheck() != B_OK || linkBuffer.InitCheck() != B_OK)
9218 		return B_NO_MEMORY;
9219 
9220 	size_t bufferSize;
9221 
9222 	if (!IS_USER_ADDRESS(userBuffer) || !IS_USER_ADDRESS(userBufferSize)
9223 		|| user_memcpy(&bufferSize, userBufferSize, sizeof(size_t)) != B_OK)
9224 		return B_BAD_ADDRESS;
9225 
9226 	char* path = pathBuffer.LockBuffer();
9227 	char* buffer = linkBuffer.LockBuffer();
9228 
9229 	if (userPath) {
9230 		if (!IS_USER_ADDRESS(userPath)
9231 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9232 			return B_BAD_ADDRESS;
9233 
9234 		if (bufferSize > B_PATH_NAME_LENGTH)
9235 			bufferSize = B_PATH_NAME_LENGTH;
9236 	}
9237 
9238 	status_t status = common_read_link(fd, userPath ? path : NULL, buffer,
9239 		&bufferSize, false);
9240 
9241 	// we also update the bufferSize in case of errors
9242 	// (the real length will be returned in case of B_BUFFER_OVERFLOW)
9243 	if (user_memcpy(userBufferSize, &bufferSize, sizeof(size_t)) != B_OK)
9244 		return B_BAD_ADDRESS;
9245 
9246 	if (status != B_OK)
9247 		return status;
9248 
9249 	if (user_memcpy(userBuffer, buffer, bufferSize) != B_OK)
9250 		return B_BAD_ADDRESS;
9251 
9252 	return B_OK;
9253 }
9254 
9255 
9256 status_t
9257 _user_create_symlink(int fd, const char* userPath, const char* userToPath,
9258 	int mode)
9259 {
9260 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9261 	KPath toPathBuffer(B_PATH_NAME_LENGTH + 1);
9262 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9263 		return B_NO_MEMORY;
9264 
9265 	char* path = pathBuffer.LockBuffer();
9266 	char* toPath = toPathBuffer.LockBuffer();
9267 
9268 	if (!IS_USER_ADDRESS(userPath)
9269 		|| !IS_USER_ADDRESS(userToPath)
9270 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK
9271 		|| user_strlcpy(toPath, userToPath, B_PATH_NAME_LENGTH) < B_OK)
9272 		return B_BAD_ADDRESS;
9273 
9274 	return common_create_symlink(fd, path, toPath, mode, false);
9275 }
9276 
9277 
9278 status_t
9279 _user_create_link(int pathFD, const char* userPath, int toFD,
9280 	const char* userToPath, bool traverseLeafLink)
9281 {
9282 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9283 	KPath toPathBuffer(B_PATH_NAME_LENGTH + 1);
9284 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9285 		return B_NO_MEMORY;
9286 
9287 	char* path = pathBuffer.LockBuffer();
9288 	char* toPath = toPathBuffer.LockBuffer();
9289 
9290 	if (!IS_USER_ADDRESS(userPath)
9291 		|| !IS_USER_ADDRESS(userToPath)
9292 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK
9293 		|| user_strlcpy(toPath, userToPath, B_PATH_NAME_LENGTH) < B_OK)
9294 		return B_BAD_ADDRESS;
9295 
9296 	status_t status = check_path(toPath);
9297 	if (status != B_OK)
9298 		return status;
9299 
9300 	return common_create_link(pathFD, path, toFD, toPath, traverseLeafLink,
9301 		false);
9302 }
9303 
9304 
9305 status_t
9306 _user_unlink(int fd, const char* userPath)
9307 {
9308 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9309 	if (pathBuffer.InitCheck() != B_OK)
9310 		return B_NO_MEMORY;
9311 
9312 	char* path = pathBuffer.LockBuffer();
9313 
9314 	if (!IS_USER_ADDRESS(userPath)
9315 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9316 		return B_BAD_ADDRESS;
9317 
9318 	return common_unlink(fd, path, false);
9319 }
9320 
9321 
9322 status_t
9323 _user_rename(int oldFD, const char* userOldPath, int newFD,
9324 	const char* userNewPath)
9325 {
9326 	KPath oldPathBuffer(B_PATH_NAME_LENGTH + 1);
9327 	KPath newPathBuffer(B_PATH_NAME_LENGTH + 1);
9328 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
9329 		return B_NO_MEMORY;
9330 
9331 	char* oldPath = oldPathBuffer.LockBuffer();
9332 	char* newPath = newPathBuffer.LockBuffer();
9333 
9334 	if (!IS_USER_ADDRESS(userOldPath) || !IS_USER_ADDRESS(userNewPath)
9335 		|| user_strlcpy(oldPath, userOldPath, B_PATH_NAME_LENGTH) < B_OK
9336 		|| user_strlcpy(newPath, userNewPath, B_PATH_NAME_LENGTH) < B_OK)
9337 		return B_BAD_ADDRESS;
9338 
9339 	return common_rename(oldFD, oldPath, newFD, newPath, false);
9340 }
9341 
9342 
9343 status_t
9344 _user_create_fifo(int fd, const char* userPath, mode_t perms)
9345 {
9346 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9347 	if (pathBuffer.InitCheck() != B_OK)
9348 		return B_NO_MEMORY;
9349 
9350 	char* path = pathBuffer.LockBuffer();
9351 
9352 	if (!IS_USER_ADDRESS(userPath)
9353 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK) {
9354 		return B_BAD_ADDRESS;
9355 	}
9356 
9357 	// split into directory vnode and filename path
9358 	char filename[B_FILE_NAME_LENGTH];
9359 	struct vnode* dir;
9360 	status_t status = fd_and_path_to_dir_vnode(fd, path, &dir, filename, false);
9361 	if (status != B_OK)
9362 		return status;
9363 
9364 	VNodePutter _(dir);
9365 
9366 	// the underlying FS needs to support creating FIFOs
9367 	if (!HAS_FS_CALL(dir, create_special_node))
9368 		return B_UNSUPPORTED;
9369 
9370 	// create the entry	-- the FIFO sub node is set up automatically
9371 	fs_vnode superVnode;
9372 	ino_t nodeID;
9373 	status = FS_CALL(dir, create_special_node, filename, NULL,
9374 		S_IFIFO | (perms & S_IUMSK), 0, &superVnode, &nodeID);
9375 
9376 	// create_special_node() acquired a reference for us that we don't need.
9377 	if (status == B_OK)
9378 		put_vnode(dir->mount->volume, nodeID);
9379 
9380 	return status;
9381 }
9382 
9383 
9384 status_t
9385 _user_create_pipe(int* userFDs)
9386 {
9387 	// rootfs should support creating FIFOs, but let's be sure
9388 	if (!HAS_FS_CALL(sRoot, create_special_node))
9389 		return B_UNSUPPORTED;
9390 
9391 	// create the node	-- the FIFO sub node is set up automatically
9392 	fs_vnode superVnode;
9393 	ino_t nodeID;
9394 	status_t status = FS_CALL(sRoot, create_special_node, NULL, NULL,
9395 		S_IFIFO | S_IRUSR | S_IWUSR, 0, &superVnode, &nodeID);
9396 	if (status != B_OK)
9397 		return status;
9398 
9399 	// We've got one reference to the node and need another one.
9400 	struct vnode* vnode;
9401 	status = get_vnode(sRoot->mount->id, nodeID, &vnode, true, false);
9402 	if (status != B_OK) {
9403 		// that should not happen
9404 		dprintf("_user_create_pipe(): Failed to lookup vnode (%" B_PRIdDEV ", "
9405 			"%" B_PRIdINO ")\n", sRoot->mount->id, nodeID);
9406 		return status;
9407 	}
9408 
9409 	// Everything looks good so far. Open two FDs, one for reading and one
9410 	// for writing.
9411 	int fds[2];
9412 	fds[0] = open_vnode(vnode, O_RDONLY, false);
9413 	fds[1] = open_vnode(vnode, O_WRONLY, false);
9414 
9415 	FDCloser closer0(fds[0], false);
9416 	FDCloser closer1(fds[1], false);
9417 
9418 	status = (fds[0] >= 0 ? (fds[1] >= 0 ? B_OK : fds[1]) : fds[0]);
9419 
9420 	// copy FDs to userland
9421 	if (status == B_OK) {
9422 		if (!IS_USER_ADDRESS(userFDs)
9423 			|| user_memcpy(userFDs, fds, sizeof(fds)) != B_OK) {
9424 			status = B_BAD_ADDRESS;
9425 		}
9426 	}
9427 
9428 	// keep FDs, if everything went fine
9429 	if (status == B_OK) {
9430 		closer0.Detach();
9431 		closer1.Detach();
9432 	}
9433 
9434 	return status;
9435 }
9436 
9437 
9438 status_t
9439 _user_access(int fd, const char* userPath, int mode, bool effectiveUserGroup)
9440 {
9441 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9442 	if (pathBuffer.InitCheck() != B_OK)
9443 		return B_NO_MEMORY;
9444 
9445 	char* path = pathBuffer.LockBuffer();
9446 
9447 	if (!IS_USER_ADDRESS(userPath)
9448 		|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9449 		return B_BAD_ADDRESS;
9450 
9451 	return common_access(fd, path, mode, effectiveUserGroup, false);
9452 }
9453 
9454 
9455 status_t
9456 _user_read_stat(int fd, const char* userPath, bool traverseLink,
9457 	struct stat* userStat, size_t statSize)
9458 {
9459 	struct stat stat;
9460 	status_t status;
9461 
9462 	if (statSize > sizeof(struct stat))
9463 		return B_BAD_VALUE;
9464 
9465 	if (!IS_USER_ADDRESS(userStat))
9466 		return B_BAD_ADDRESS;
9467 
9468 	if (userPath != NULL) {
9469 		// path given: get the stat of the node referred to by (fd, path)
9470 		if (!IS_USER_ADDRESS(userPath))
9471 			return B_BAD_ADDRESS;
9472 
9473 		KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9474 		if (pathBuffer.InitCheck() != B_OK)
9475 			return B_NO_MEMORY;
9476 
9477 		char* path = pathBuffer.LockBuffer();
9478 
9479 		ssize_t length = user_strlcpy(path, userPath, B_PATH_NAME_LENGTH);
9480 		if (length < B_OK)
9481 			return length;
9482 		if (length >= B_PATH_NAME_LENGTH)
9483 			return B_NAME_TOO_LONG;
9484 
9485 		status = common_path_read_stat(fd, path, traverseLink, &stat, false);
9486 	} else {
9487 		// no path given: get the FD and use the FD operation
9488 		struct file_descriptor* descriptor
9489 			= get_fd(get_current_io_context(false), fd);
9490 		if (descriptor == NULL)
9491 			return B_FILE_ERROR;
9492 
9493 		if (descriptor->ops->fd_read_stat)
9494 			status = descriptor->ops->fd_read_stat(descriptor, &stat);
9495 		else
9496 			status = B_UNSUPPORTED;
9497 
9498 		put_fd(descriptor);
9499 	}
9500 
9501 	if (status != B_OK)
9502 		return status;
9503 
9504 	return user_memcpy(userStat, &stat, statSize);
9505 }
9506 
9507 
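/*!	The counterpart of _user_read_stat(): updates the stat fields selected
	by \a statMask, either of the node referred to by (\a fd, \a userPath)
	or, if \a userPath is NULL, of the FD itself via its fd_write_stat()
	hook. If the caller passes a short \a statSize, the remainder of the
	kernel copy is zeroed before the FS is invoked.
*/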
9508 status_t
9509 _user_write_stat(int fd, const char* userPath, bool traverseLeafLink,
9510 	const struct stat* userStat, size_t statSize, int statMask)
9511 {
9512 	if (statSize > sizeof(struct stat))
9513 		return B_BAD_VALUE;
9514 
9515 	struct stat stat;
9516 
9517 	if (!IS_USER_ADDRESS(userStat)
9518 		|| user_memcpy(&stat, userStat, statSize) < B_OK)
9519 		return B_BAD_ADDRESS;
9520 
9521 	// clear additional stat fields
9522 	if (statSize < sizeof(struct stat))
9523 		memset((uint8*)&stat + statSize, 0, sizeof(struct stat) - statSize);
9524 
9525 	status_t status;
9526 
9527 	if (userPath != NULL) {
9528 		// path given: write the stat of the node referred to by (fd, path)
9529 		if (!IS_USER_ADDRESS(userPath))
9530 			return B_BAD_ADDRESS;
9531 
9532 		KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9533 		if (pathBuffer.InitCheck() != B_OK)
9534 			return B_NO_MEMORY;
9535 
9536 		char* path = pathBuffer.LockBuffer();
9537 
9538 		ssize_t length = user_strlcpy(path, userPath, B_PATH_NAME_LENGTH);
9539 		if (length < B_OK)
9540 			return length;
9541 		if (length >= B_PATH_NAME_LENGTH)
9542 			return B_NAME_TOO_LONG;
9543 
9544 		status = common_path_write_stat(fd, path, traverseLeafLink, &stat,
9545 			statMask, false);
9546 	} else {
9547 		// no path given: get the FD and use the FD operation
9548 		struct file_descriptor* descriptor
9549 			= get_fd(get_current_io_context(false), fd);
9550 		if (descriptor == NULL)
9551 			return B_FILE_ERROR;
9552 
9553 		if (descriptor->ops->fd_write_stat) {
9554 			status = descriptor->ops->fd_write_stat(descriptor, &stat,
9555 				statMask);
9556 		} else
9557 			status = B_UNSUPPORTED;
9558 
9559 		put_fd(descriptor);
9560 	}
9561 
9562 	return status;
9563 }
9564 
9565 
9566 int
9567 _user_open_attr_dir(int fd, const char* userPath, bool traverseLeafLink)
9568 {
9569 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9570 	if (pathBuffer.InitCheck() != B_OK)
9571 		return B_NO_MEMORY;
9572 
9573 	char* path = pathBuffer.LockBuffer();
9574 
9575 	if (userPath != NULL) {
9576 		if (!IS_USER_ADDRESS(userPath)
9577 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9578 			return B_BAD_ADDRESS;
9579 	}
9580 
9581 	return attr_dir_open(fd, userPath ? path : NULL, traverseLeafLink, false);
9582 }
9583 
9584 
9585 ssize_t
9586 _user_read_attr(int fd, const char* userAttribute, off_t pos, void* userBuffer,
9587 	size_t readBytes)
9588 {
9589 	char attribute[B_FILE_NAME_LENGTH];
9590 
9591 	if (userAttribute == NULL)
9592 		return B_BAD_VALUE;
9593 	if (!IS_USER_ADDRESS(userAttribute)
9594 		|| user_strlcpy(attribute, userAttribute, sizeof(attribute)) < B_OK) {
9595 		return B_BAD_ADDRESS;
9596 	}
9597 
9598 	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9599 	if (attr < 0)
9600 		return attr;
9601 
9602 	ssize_t bytes = _user_read(attr, pos, userBuffer, readBytes);
9603 	_user_close(attr);
9604 
9605 	return bytes;
9606 }
9607 
9608 
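/*!	Writes \a writeBytes bytes from \a buffer to the attribute
	\a userAttribute of the node referred to by \a fd, creating the
	attribute if necessary. To mimic the BeOS behavior noted below, the
	attribute is only truncated when writing starts at position 0.
*/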
9609 ssize_t
9610 _user_write_attr(int fd, const char* userAttribute, uint32 type, off_t pos,
9611 	const void* buffer, size_t writeBytes)
9612 {
9613 	char attribute[B_FILE_NAME_LENGTH];
9614 
9615 	if (userAttribute == NULL)
9616 		return B_BAD_VALUE;
9617 	if (!IS_USER_ADDRESS(userAttribute)
9618 		|| user_strlcpy(attribute, userAttribute, sizeof(attribute)) < B_OK) {
9619 		return B_BAD_ADDRESS;
9620 	}
9621 
9622 	// Try to support the BeOS-typical truncation as well as the position
9623 	// argument
9624 	int attr = attr_create(fd, NULL, attribute, type,
9625 		O_CREAT | O_WRONLY | (pos != 0 ? 0 : O_TRUNC), false);
9626 	if (attr < 0)
9627 		return attr;
9628 
9629 	ssize_t bytes = _user_write(attr, pos, buffer, writeBytes);
9630 	_user_close(attr);
9631 
9632 	return bytes;
9633 }
9634 
9635 
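/*!	Returns the type and size of the attribute \a userAttribute of the node
	referred to by \a fd in \a userAttrInfo. Implemented by opening the
	attribute read-only and querying its descriptor's fd_read_stat() hook.
*/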
9636 status_t
9637 _user_stat_attr(int fd, const char* userAttribute,
9638 	struct attr_info* userAttrInfo)
9639 {
9640 	char attribute[B_FILE_NAME_LENGTH];
9641 
9642 	if (userAttribute == NULL || userAttrInfo == NULL)
9643 		return B_BAD_VALUE;
9644 	if (!IS_USER_ADDRESS(userAttribute) || !IS_USER_ADDRESS(userAttrInfo)
9645 		|| user_strlcpy(attribute, userAttribute, sizeof(attribute)) < B_OK) {
9646 		return B_BAD_ADDRESS;
9647 	}
9648 
9649 	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9650 	if (attr < 0)
9651 		return attr;
9652 
9653 	struct file_descriptor* descriptor
9654 		= get_fd(get_current_io_context(false), attr);
9655 	if (descriptor == NULL) {
9656 		_user_close(attr);
9657 		return B_FILE_ERROR;
9658 	}
9659 
9660 	struct stat stat;
9661 	status_t status;
9662 	if (descriptor->ops->fd_read_stat)
9663 		status = descriptor->ops->fd_read_stat(descriptor, &stat);
9664 	else
9665 		status = B_UNSUPPORTED;
9666 
9667 	put_fd(descriptor);
9668 	_user_close(attr);
9669 
9670 	if (status == B_OK) {
9671 		attr_info info;
9672 		info.type = stat.st_type;
9673 		info.size = stat.st_size;
9674 
9675 		if (user_memcpy(userAttrInfo, &info, sizeof(struct attr_info)) != B_OK)
9676 			return B_BAD_ADDRESS;
9677 	}
9678 
9679 	return status;
9680 }
9681 
9682 
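/*!	Opens the attribute \a userName of the node referred to by (\a fd,
	\a userPath) and returns an FD for it. If \a openMode contains O_CREAT,
	attr_create() is used instead of attr_open(), so a missing attribute is
	created with the given \a type.
*/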
9683 int
9684 _user_open_attr(int fd, const char* userPath, const char* userName,
9685 	uint32 type, int openMode)
9686 {
9687 	char name[B_FILE_NAME_LENGTH];
9688 
9689 	if (!IS_USER_ADDRESS(userName)
9690 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9691 		return B_BAD_ADDRESS;
9692 
9693 	KPath pathBuffer(B_PATH_NAME_LENGTH + 1);
9694 	if (pathBuffer.InitCheck() != B_OK)
9695 		return B_NO_MEMORY;
9696 
9697 	char* path = pathBuffer.LockBuffer();
9698 
9699 	if (userPath != NULL) {
9700 		if (!IS_USER_ADDRESS(userPath)
9701 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9702 			return B_BAD_ADDRESS;
9703 	}
9704 
9705 	if ((openMode & O_CREAT) != 0) {
9706 		return attr_create(fd, userPath ? path : NULL, name, type, openMode,
9707 			false);
9708 	}
9709 
9710 	return attr_open(fd, userPath ? path : NULL, name, openMode, false);
9711 }
9712 
9713 
9714 status_t
9715 _user_remove_attr(int fd, const char* userName)
9716 {
9717 	char name[B_FILE_NAME_LENGTH];
9718 
9719 	if (!IS_USER_ADDRESS(userName)
9720 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9721 		return B_BAD_ADDRESS;
9722 
9723 	return attr_remove(fd, name, false);
9724 }
9725 
9726 
9727 status_t
9728 _user_rename_attr(int fromFile, const char* userFromName, int toFile,
9729 	const char* userToName)
9730 {
9731 	if (!IS_USER_ADDRESS(userFromName)
9732 		|| !IS_USER_ADDRESS(userToName))
9733 		return B_BAD_ADDRESS;
9734 
9735 	KPath fromNameBuffer(B_FILE_NAME_LENGTH);
9736 	KPath toNameBuffer(B_FILE_NAME_LENGTH);
9737 	if (fromNameBuffer.InitCheck() != B_OK || toNameBuffer.InitCheck() != B_OK)
9738 		return B_NO_MEMORY;
9739 
9740 	char* fromName = fromNameBuffer.LockBuffer();
9741 	char* toName = toNameBuffer.LockBuffer();
9742 
9743 	if (user_strlcpy(fromName, userFromName, B_FILE_NAME_LENGTH) < B_OK
9744 		|| user_strlcpy(toName, userToName, B_FILE_NAME_LENGTH) < B_OK)
9745 		return B_BAD_ADDRESS;
9746 
9747 	return attr_rename(fromFile, fromName, toFile, toName, false);
9748 }
9749 
9750 
9751 int
9752 _user_open_index_dir(dev_t device)
9753 {
9754 	return index_dir_open(device, false);
9755 }
9756 
9757 
9758 status_t
9759 _user_create_index(dev_t device, const char* userName, uint32 type,
9760 	uint32 flags)
9761 {
9762 	char name[B_FILE_NAME_LENGTH];
9763 
9764 	if (!IS_USER_ADDRESS(userName)
9765 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9766 		return B_BAD_ADDRESS;
9767 
9768 	return index_create(device, name, type, flags, false);
9769 }
9770 
9771 
9772 status_t
9773 _user_read_index_stat(dev_t device, const char* userName, struct stat* userStat)
9774 {
9775 	char name[B_FILE_NAME_LENGTH];
9776 	struct stat stat;
9777 	status_t status;
9778 
9779 	if (!IS_USER_ADDRESS(userName)
9780 		|| !IS_USER_ADDRESS(userStat)
9781 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9782 		return B_BAD_ADDRESS;
9783 
9784 	status = index_name_read_stat(device, name, &stat, false);
9785 	if (status == B_OK) {
9786 		if (user_memcpy(userStat, &stat, sizeof(stat)) != B_OK)
9787 			return B_BAD_ADDRESS;
9788 	}
9789 
9790 	return status;
9791 }
9792 
9793 
9794 status_t
9795 _user_remove_index(dev_t device, const char* userName)
9796 {
9797 	char name[B_FILE_NAME_LENGTH];
9798 
9799 	if (!IS_USER_ADDRESS(userName)
9800 		|| user_strlcpy(name, userName, B_FILE_NAME_LENGTH) < B_OK)
9801 		return B_BAD_ADDRESS;
9802 
9803 	return index_remove(device, name, false);
9804 }
9805 
9806 
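/*!	Copies the current working directory's path into \a userBuffer. Requests
	larger than kMaxPathLength are silently clamped to that limit rather
	than rejected.
*/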
9807 status_t
9808 _user_getcwd(char* userBuffer, size_t size)
9809 {
9810 	if (size == 0)
9811 		return B_BAD_VALUE;
9812 	if (!IS_USER_ADDRESS(userBuffer))
9813 		return B_BAD_ADDRESS;
9814 
9815 	if (size > kMaxPathLength)
9816 		size = kMaxPathLength;
9817 
9818 	KPath pathBuffer(size);
9819 	if (pathBuffer.InitCheck() != B_OK)
9820 		return B_NO_MEMORY;
9821 
9822 	TRACE(("user_getcwd: buf %p, %" B_PRIuSIZE "\n", userBuffer, size));
9823 
9824 	char* path = pathBuffer.LockBuffer();
9825 
9826 	status_t status = get_cwd(path, size, false);
9827 	if (status != B_OK)
9828 		return status;
9829 
9830 	// Copy back the result
9831 	if (user_strlcpy(userBuffer, path, size) < B_OK)
9832 		return B_BAD_ADDRESS;
9833 
9834 	return status;
9835 }
9836 
9837 
9838 status_t
9839 _user_setcwd(int fd, const char* userPath)
9840 {
9841 	TRACE(("user_setcwd: path = %p\n", userPath));
9842 
9843 	KPath pathBuffer(B_PATH_NAME_LENGTH);
9844 	if (pathBuffer.InitCheck() != B_OK)
9845 		return B_NO_MEMORY;
9846 
9847 	char* path = pathBuffer.LockBuffer();
9848 
9849 	if (userPath != NULL) {
9850 		if (!IS_USER_ADDRESS(userPath)
9851 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9852 			return B_BAD_ADDRESS;
9853 	}
9854 
9855 	return set_cwd(fd, userPath != NULL ? path : NULL, false);
9856 }
9857 
9858 
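/*!	The chroot() backend: resolves \a userPath to a vnode and installs it as
	the root of the calling team's I/O context. Only a caller with an
	effective user ID of 0 may do this; the reference to the previous root
	vnode is released afterwards.
*/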
9859 status_t
9860 _user_change_root(const char* userPath)
9861 {
9862 	// only root is allowed to chroot()
9863 	if (geteuid() != 0)
9864 		return B_NOT_ALLOWED;
9865 
9866 	// alloc path buffer
9867 	KPath pathBuffer(B_PATH_NAME_LENGTH);
9868 	if (pathBuffer.InitCheck() != B_OK)
9869 		return B_NO_MEMORY;
9870 
9871 	// copy userland path to kernel
9872 	char* path = pathBuffer.LockBuffer();
9873 	if (userPath != NULL) {
9874 		if (!IS_USER_ADDRESS(userPath)
9875 			|| user_strlcpy(path, userPath, B_PATH_NAME_LENGTH) < B_OK)
9876 			return B_BAD_ADDRESS;
9877 	}
9878 
9879 	// get the vnode
9880 	struct vnode* vnode;
9881 	status_t status = path_to_vnode(path, true, &vnode, NULL, false);
9882 	if (status != B_OK)
9883 		return status;
9884 
9885 	// set the new root
9886 	struct io_context* context = get_current_io_context(false);
9887 	mutex_lock(&sIOContextRootLock);
9888 	struct vnode* oldRoot = context->root;
9889 	context->root = vnode;
9890 	mutex_unlock(&sIOContextRootLock);
9891 
9892 	put_vnode(oldRoot);
9893 
9894 	return B_OK;
9895 }
9896 
9897 
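/*!	Compiles the query string \a userQuery into a query FD on \a device.
	\a flags, \a port, and \a token are passed through to query_open(); for
	a live query they identify where update notifications are sent. A
	minimal userland sketch, assuming a volume that supports queries (the
	path and query string here are only illustrative):

		DIR* query = fs_open_query(dev_for_path("/boot"),
			"name==\"*.cpp\"", 0);
		if (query != NULL) {
			struct dirent* entry;
			while ((entry = fs_read_query(query)) != NULL)
				puts(entry->d_name);
			fs_close_query(query);
		}
*/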
9898 int
9899 _user_open_query(dev_t device, const char* userQuery, size_t queryLength,
9900 	uint32 flags, port_id port, int32 token)
9901 {
9902 	char* query;
9903 
9904 	if (device < 0 || userQuery == NULL || queryLength == 0)
9905 		return B_BAD_VALUE;
9906 
9907 	if (!IS_USER_ADDRESS(userQuery))
9908 		return B_BAD_ADDRESS;
9909 
9910 	// this is a safety restriction
9911 	if (queryLength >= 65536)
9912 		return B_NAME_TOO_LONG;
9913 
9914 	query = (char*)malloc(queryLength + 1);
9915 	if (query == NULL)
9916 		return B_NO_MEMORY;
9917 	if (user_strlcpy(query, userQuery, queryLength + 1) < B_OK) {
9918 		free(query);
9919 		return B_BAD_ADDRESS;
9920 	}
9921 
9922 	int fd = query_open(device, query, flags, port, token, false);
9923 
9924 	free(query);
9925 	return fd;
9926 }
9927 
9928 
9929 #include "vfs_request_io.cpp"
9930