xref: /haiku/src/system/kernel/fs/vfs.cpp (revision 1322e37a03386ffe6321ca3fbb03bd3c4e443074)
/*
 * Copyright 2005-2013, Ingo Weinhold, ingo_weinhold@gmx.de.
 * Copyright 2002-2018, Axel Dörfler, axeld@pinc-software.de.
 * Distributed under the terms of the MIT License.
 *
 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
 * Distributed under the terms of the NewOS License.
 */


/*! Virtual File System and File System Interface Layer */


#include <ctype.h>
#include <fcntl.h>
#include <limits.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/file.h>
#include <sys/ioctl.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <unistd.h>

#include <fs_attr.h>
#include <fs_info.h>
#include <fs_interface.h>
#include <fs_volume.h>
#include <NodeMonitor.h>
#include <OS.h>
#include <StorageDefs.h>

#include <AutoDeleter.h>
#include <block_cache.h>
#include <boot/kernel_args.h>
#include <debug_heap.h>
#include <disk_device_manager/KDiskDevice.h>
#include <disk_device_manager/KDiskDeviceManager.h>
#include <disk_device_manager/KDiskDeviceUtils.h>
#include <disk_device_manager/KDiskSystem.h>
#include <fd.h>
#include <file_cache.h>
#include <fs/node_monitor.h>
#include <KPath.h>
#include <lock.h>
#include <low_resource_manager.h>
#include <slab/Slab.h>
#include <StackOrHeapArray.h>
#include <syscalls.h>
#include <syscall_restart.h>
#include <tracing.h>
#include <util/atomic.h>
#include <util/AutoLock.h>
#include <util/ThreadAutoLock.h>
#include <util/DoublyLinkedList.h>
#include <vfs.h>
#include <vm/vm.h>
#include <vm/VMCache.h>
#include <wait_for_objects.h>

#include "EntryCache.h"
#include "fifo.h"
#include "IORequest.h"
#include "unused_vnodes.h"
#include "vfs_tracing.h"
#include "Vnode.h"
#include "../cache/vnode_store.h"


//#define TRACE_VFS
#ifdef TRACE_VFS
#	define TRACE(x) dprintf x
#	define FUNCTION(x) dprintf x
#else
#	define TRACE(x) ;
#	define FUNCTION(x) ;
#endif

#define ADD_DEBUGGER_COMMANDS


#define HAS_FS_CALL(vnode, op)			(vnode->ops->op != NULL)
#define HAS_FS_MOUNT_CALL(mount, op)	(mount->volume->ops->op != NULL)

#if KDEBUG
#	define FS_CALL(vnode, op, params...) \
		( HAS_FS_CALL(vnode, op) ? \
			vnode->ops->op(vnode->mount->volume, vnode, params) \
			: (panic("FS_CALL: vnode %p op " #op " is NULL", vnode), 0))
#	define FS_CALL_NO_PARAMS(vnode, op) \
		( HAS_FS_CALL(vnode, op) ? \
			vnode->ops->op(vnode->mount->volume, vnode) \
			: (panic("FS_CALL_NO_PARAMS: vnode %p op " #op " is NULL", vnode), 0))
#	define FS_MOUNT_CALL(mount, op, params...) \
		( HAS_FS_MOUNT_CALL(mount, op) ? \
			mount->volume->ops->op(mount->volume, params) \
			: (panic("FS_MOUNT_CALL: mount %p op " #op " is NULL", mount), 0))
#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
		( HAS_FS_MOUNT_CALL(mount, op) ? \
			mount->volume->ops->op(mount->volume) \
			: (panic("FS_MOUNT_CALL_NO_PARAMS: mount %p op " #op " is NULL", mount), 0))
#else
#	define FS_CALL(vnode, op, params...) \
			vnode->ops->op(vnode->mount->volume, vnode, params)
#	define FS_CALL_NO_PARAMS(vnode, op) \
			vnode->ops->op(vnode->mount->volume, vnode)
#	define FS_MOUNT_CALL(mount, op, params...) \
			mount->volume->ops->op(mount->volume, params)
#	define FS_MOUNT_CALL_NO_PARAMS(mount, op) \
			mount->volume->ops->op(mount->volume)
#endif
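
// The following is an illustrative sketch (not part of the build) of how the
// FS_CALL macros dispatch into a file system: the vnode's fs_vnode_ops table
// is probed with HAS_FS_CALL() and, if the hook exists, called with the
// volume and vnode prepended to the arguments. The function and its locals
// (cookie, pos, buffer, _length) are hypothetical.
#if 0
static status_t
example_dispatch_read(struct vnode* vnode, void* cookie, off_t pos,
	void* buffer, size_t* _length)
{
	if (!HAS_FS_CALL(vnode, read))
		return B_UNSUPPORTED;

	// expands to: vnode->ops->read(vnode->mount->volume, vnode, cookie, pos,
	// buffer, _length) -- plus a panic() fallback under KDEBUG
	return FS_CALL(vnode, read, cookie, pos, buffer, _length);
}
#endif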


const static size_t kMaxPathLength = 65536;
	// The absolute maximum path length (for getcwd() -- this does not depend
	// on PATH_MAX)


typedef DoublyLinkedList<vnode> VnodeList;

/*!	\brief Structure to manage a mounted file system

	Note: The root_vnode and root_vnode->covers fields (what others?) are
	initialized in fs_mount() and not changed afterwards. That is, as soon as
	the mount is mounted and it is made sure it won't be unmounted (e.g. by
	holding a reference to a vnode of that mount), (read) access to those
	fields is always safe, even without additional locking. Moreover, while
	mounted the mount holds a reference to the root_vnode->covers vnode, thus
	making the access path vnode->mount->root_vnode->covers->mount->... safe
	if a reference to vnode is held (note that for the root mount
	root_vnode->covers is NULL, though).
*/
struct fs_mount {
	fs_mount()
		:
		volume(NULL),
		device_name(NULL)
	{
		mutex_init(&lock, "mount lock");
	}

	~fs_mount()
	{
		mutex_destroy(&lock);
		free(device_name);

		while (volume) {
			fs_volume* superVolume = volume->super_volume;

			if (volume->file_system != NULL)
				put_module(volume->file_system->info.name);

			free(volume->file_system_name);
			free(volume);
			volume = superVolume;
		}
	}

	struct fs_mount* next;
	dev_t			id;
	fs_volume*		volume;
	char*			device_name;
	mutex			lock;	// guards the vnodes list
	struct vnode*	root_vnode;
	struct vnode*	covers_vnode;	// immutable
	KPartition*		partition;
	VnodeList		vnodes;
	EntryCache		entry_cache;
	bool			unmounting;
	bool			owns_file_device;
};


namespace {

struct advisory_lock : public DoublyLinkedListLinkImpl<advisory_lock> {
	list_link		link;
	void*			bound_to;
	team_id			team;
	pid_t			session;
	off_t			start;
	off_t			end;
	bool			shared;
};

typedef DoublyLinkedList<advisory_lock> LockList;

} // namespace


struct advisory_locking {
	sem_id			lock;
	sem_id			wait_sem;
	LockList		locks;

	advisory_locking()
		:
		lock(-1),
		wait_sem(-1)
	{
	}

	~advisory_locking()
	{
		if (lock >= 0)
			delete_sem(lock);
		if (wait_sem >= 0)
			delete_sem(wait_sem);
	}
};

/*!	\brief Guards sMountsTable.

	The holder is allowed read/write access to sMountsTable.
	Manipulation of the fs_mount structures themselves
	(and their destruction) requires different locks though.
*/
static rw_lock sMountLock = RW_LOCK_INITIALIZER("vfs_mount_lock");

/*!	\brief Guards mount/unmount operations.

	fs_mount() and fs_unmount() hold the lock during their whole operation.
	That is, locking the lock ensures that no FS is mounted/unmounted. In
	particular this means that
	- sMountsTable will not be modified,
	- the fields immutable after initialization of the fs_mount structures in
	  sMountsTable will not be modified.

	The thread trying to lock the lock must not hold sVnodeLock or
	sMountLock.
*/
static recursive_lock sMountOpLock;

/*!	\brief Guards sVnodeTable.

	The holder is allowed read/write access to sVnodeTable and to
	any unbusy vnode in that table, save for the immutable fields (device, id,
	private_node, mount) to which only read-only access is allowed.
	The mutable fields advisory_locking, mandatory_locked_by, and ref_count, as
	well as the busy, removed, unused flags, and the vnode's type can also be
	write accessed when holding a read lock to sVnodeLock *and* having the vnode
	locked. Write access to covered_by and covers requires write locking
	sVnodeLock.

	The thread trying to acquire the lock must not hold sMountLock.
	You must not hold this lock when calling create_sem(), as this might call
	vfs_free_unused_vnodes() and thus cause a deadlock.
*/
static rw_lock sVnodeLock = RW_LOCK_INITIALIZER("vfs_vnode_lock");

/*!	\brief Guards io_context::root.

	Must be held when setting or getting the io_context::root field.
	The only operation allowed while holding this lock besides getting or
	setting the field is inc_vnode_ref_count() on io_context::root.
*/
static mutex sIOContextRootLock = MUTEX_INITIALIZER("io_context::root lock");


namespace {

struct vnode_hash_key {
	dev_t	device;
	ino_t	vnode;
};

struct VnodeHash {
	typedef vnode_hash_key	KeyType;
	typedef	struct vnode	ValueType;

#define VHASH(mountid, vnodeid) \
	(((uint32)((vnodeid) >> 32) + (uint32)(vnodeid)) ^ (uint32)(mountid))

	size_t HashKey(KeyType key) const
	{
		return VHASH(key.device, key.vnode);
	}

	size_t Hash(ValueType* vnode) const
	{
		return VHASH(vnode->device, vnode->id);
	}

#undef VHASH

	bool Compare(KeyType key, ValueType* vnode) const
	{
		return vnode->device == key.device && vnode->id == key.vnode;
	}

	ValueType*& GetLink(ValueType* value) const
	{
		return value->next;
	}
};

typedef BOpenHashTable<VnodeHash> VnodeTable;


struct MountHash {
	typedef dev_t			KeyType;
	typedef	struct fs_mount	ValueType;

	size_t HashKey(KeyType key) const
	{
		return key;
	}

	size_t Hash(ValueType* mount) const
	{
		return mount->id;
	}

	bool Compare(KeyType key, ValueType* mount) const
	{
		return mount->id == key;
	}

	ValueType*& GetLink(ValueType* value) const
	{
		return value->next;
	}
};

typedef BOpenHashTable<MountHash> MountTable;

} // namespace


object_cache* sPathNameCache;
object_cache* sVnodeCache;
object_cache* sFileDescriptorCache;

#define VNODE_HASH_TABLE_SIZE 1024
static VnodeTable* sVnodeTable;
static struct vnode* sRoot;

#define MOUNTS_HASH_TABLE_SIZE 16
static MountTable* sMountsTable;
static dev_t sNextMountID = 1;

#define MAX_TEMP_IO_VECS 8

// How long to wait for busy vnodes (10s total: BUSY_VNODE_RETRIES retries
// with a BUSY_VNODE_DELAY microseconds snooze between them)
#define BUSY_VNODE_RETRIES 2000
#define BUSY_VNODE_DELAY 5000

mode_t __gUmask = 022;

/* function declarations */

static void free_unused_vnodes();

// file descriptor operation prototypes
static status_t file_read(struct file_descriptor* descriptor, off_t pos,
	void* buffer, size_t* _bytes);
static status_t file_write(struct file_descriptor* descriptor, off_t pos,
	const void* buffer, size_t* _bytes);
static off_t file_seek(struct file_descriptor* descriptor, off_t pos,
	int seekType);
static void file_free_fd(struct file_descriptor* descriptor);
static status_t file_close(struct file_descriptor* descriptor);
static status_t file_select(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync);
static status_t file_deselect(struct file_descriptor* descriptor, uint8 event,
	struct selectsync* sync);
static status_t dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t dir_read(struct io_context* ioContext, struct vnode* vnode,
	void* cookie, struct dirent* buffer, size_t bufferSize, uint32* _count);
static status_t dir_rewind(struct file_descriptor* descriptor);
static void dir_free_fd(struct file_descriptor* descriptor);
static status_t dir_close(struct file_descriptor* descriptor);
static status_t attr_dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t attr_dir_rewind(struct file_descriptor* descriptor);
static void attr_dir_free_fd(struct file_descriptor* descriptor);
static status_t attr_dir_close(struct file_descriptor* descriptor);
static status_t attr_read(struct file_descriptor* descriptor, off_t pos,
	void* buffer, size_t* _bytes);
static status_t attr_write(struct file_descriptor* descriptor, off_t pos,
	const void* buffer, size_t* _bytes);
static off_t attr_seek(struct file_descriptor* descriptor, off_t pos,
	int seekType);
static void attr_free_fd(struct file_descriptor* descriptor);
static status_t attr_close(struct file_descriptor* descriptor);
static status_t attr_read_stat(struct file_descriptor* descriptor,
	struct stat* statData);
static status_t attr_write_stat(struct file_descriptor* descriptor,
	const struct stat* stat, int statMask);
static status_t index_dir_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t index_dir_rewind(struct file_descriptor* descriptor);
static void index_dir_free_fd(struct file_descriptor* descriptor);
static status_t index_dir_close(struct file_descriptor* descriptor);
static status_t query_read(struct io_context* context,
	struct file_descriptor* descriptor, struct dirent* buffer,
	size_t bufferSize, uint32* _count);
static status_t query_rewind(struct file_descriptor* descriptor);
static void query_free_fd(struct file_descriptor* descriptor);
static status_t query_close(struct file_descriptor* descriptor);

static status_t common_ioctl(struct file_descriptor* descriptor, ulong op,
	void* buffer, size_t length);
static status_t common_read_stat(struct file_descriptor* descriptor,
	struct stat* statData);
static status_t common_write_stat(struct file_descriptor* descriptor,
	const struct stat* statData, int statMask);
static status_t common_path_read_stat(int fd, char* path, bool traverseLeafLink,
	struct stat* stat, bool kernel);

static status_t vnode_path_to_vnode(struct vnode* vnode, char* path,
	bool traverseLeafLink, int count, bool kernel,
	struct vnode** _vnode, ino_t* _parentID);
static status_t dir_vnode_to_path(struct vnode* vnode, char* buffer,
	size_t bufferSize, bool kernel);
static status_t fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
	struct vnode** _vnode, ino_t* _parentID, bool kernel);
static void inc_vnode_ref_count(struct vnode* vnode);
static status_t dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree,
	bool reenter);
static inline void put_vnode(struct vnode* vnode);
static status_t fs_unmount(char* path, dev_t mountID, uint32 flags,
	bool kernel);
static int open_vnode(struct vnode* vnode, int openMode, bool kernel);


static struct fd_ops sFileOps = {
	file_read,
	file_write,
	file_seek,
	common_ioctl,
	NULL,		// set_flags
	file_select,
	file_deselect,
	NULL,		// read_dir()
	NULL,		// rewind_dir()
	common_read_stat,
	common_write_stat,
	file_close,
	file_free_fd
};

static struct fd_ops sDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	dir_read,
	dir_rewind,
	common_read_stat,
	common_write_stat,
	dir_close,
	dir_free_fd
};

static struct fd_ops sAttributeDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	attr_dir_read,
	attr_dir_rewind,
	common_read_stat,
	common_write_stat,
	attr_dir_close,
	attr_dir_free_fd
};

static struct fd_ops sAttributeOps = {
	attr_read,
	attr_write,
	attr_seek,
	common_ioctl,
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	NULL,		// read_dir()
	NULL,		// rewind_dir()
	attr_read_stat,
	attr_write_stat,
	attr_close,
	attr_free_fd
};

static struct fd_ops sIndexDirectoryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	index_dir_read,
	index_dir_rewind,
	NULL,		// read_stat()
	NULL,		// write_stat()
	index_dir_close,
	index_dir_free_fd
};

#if 0
static struct fd_ops sIndexOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	NULL,		// dir_read()
	NULL,		// dir_rewind()
	index_read_stat,	// read_stat()
	NULL,		// write_stat()
	NULL,		// dir_close()
	NULL		// free_fd()
};
#endif

static struct fd_ops sQueryOps = {
	NULL,		// read()
	NULL,		// write()
	NULL,		// seek()
	NULL,		// ioctl()
	NULL,		// set_flags
	NULL,		// select()
	NULL,		// deselect()
	query_read,
	query_rewind,
	NULL,		// read_stat()
	NULL,		// write_stat()
	query_close,
	query_free_fd
};


namespace {

class VNodePutter {
public:
	VNodePutter(struct vnode* vnode = NULL) : fVNode(vnode) {}

	~VNodePutter()
	{
		Put();
	}

	void SetTo(struct vnode* vnode)
	{
		Put();
		fVNode = vnode;
	}

	void Put()
	{
		if (fVNode) {
			put_vnode(fVNode);
			fVNode = NULL;
		}
	}

	struct vnode* Detach()
	{
		struct vnode* vnode = fVNode;
		fVNode = NULL;
		return vnode;
	}

private:
	struct vnode* fVNode;
};
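
// Illustrative sketch (not part of the build): VNodePutter acts as an RAII
// guard, releasing a vnode reference obtained e.g. via get_vnode() on every
// exit path. The function and its mountID/vnodeID values are hypothetical.
#if 0
static status_t
example_with_vnode(dev_t mountID, ino_t vnodeID)
{
	struct vnode* vnode;
	status_t status = get_vnode(mountID, vnodeID, &vnode, true, 0);
	if (status != B_OK)
		return status;

	VNodePutter vnodePutter(vnode);
		// from here on, put_vnode() is called automatically when this scope
		// is left, no matter through which return path

	if (!HAS_FS_CALL(vnode, read_stat))
		return B_UNSUPPORTED;

	struct stat stat;
	return FS_CALL(vnode, read_stat, &stat);
}
#endif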


class FDCloser {
public:
	FDCloser() : fFD(-1), fKernel(true) {}

	FDCloser(int fd, bool kernel) : fFD(fd), fKernel(kernel) {}

	~FDCloser()
	{
		Close();
	}

	void SetTo(int fd, bool kernel)
	{
		Close();
		fFD = fd;
		fKernel = kernel;
	}

	void Close()
	{
		if (fFD >= 0) {
			if (fKernel)
				_kern_close(fFD);
			else
				_user_close(fFD);
			fFD = -1;
		}
	}

	int Detach()
	{
		int fd = fFD;
		fFD = -1;
		return fd;
	}

private:
	int		fFD;
	bool	fKernel;
};
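
// Illustrative sketch (not part of the build): FDCloser guards a freshly
// allocated file descriptor so that error paths close it automatically;
// Detach() transfers ownership to the caller on success. The helper and its
// setup steps are hypothetical.
#if 0
static int
example_create_fd(int fd, bool kernel)
{
	FDCloser fdCloser(fd, kernel);

	// ... further setup that may fail and return early; in that case the
	// descriptor is closed by fdCloser's destructor ...

	return fdCloser.Detach();
		// success -- keep the descriptor open and hand it out
}
#endif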

} // namespace


#if VFS_PAGES_IO_TRACING

namespace VFSPagesIOTracing {

class PagesIOTraceEntry : public AbstractTraceEntry {
protected:
	PagesIOTraceEntry(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		fVnode(vnode),
		fMountID(vnode->mount->id),
		fNodeID(vnode->id),
		fCookie(cookie),
		fPos(pos),
		fCount(count),
		fFlags(flags),
		fBytesRequested(bytesRequested),
		fStatus(status),
		fBytesTransferred(bytesTransferred)
	{
		fVecs = (generic_io_vec*)alloc_tracing_buffer_memcpy(vecs,
			sizeof(generic_io_vec) * count, false);
	}

	void AddDump(TraceOutput& out, const char* mode)
	{
		out.Print("vfs pages io %5s: vnode: %p (%" B_PRId32 ", %" B_PRId64 "), "
			"cookie: %p, pos: %" B_PRIdOFF ", size: %" B_PRIu64 ", vecs: {",
			mode, fVnode, fMountID, fNodeID, fCookie, fPos,
			(uint64)fBytesRequested);

		if (fVecs != NULL) {
			for (uint32 i = 0; i < fCount; i++) {
				if (i > 0)
					out.Print(", ");
				out.Print("(%" B_PRIx64 ", %" B_PRIu64 ")", (uint64)fVecs[i].base,
					(uint64)fVecs[i].length);
			}
		}

		out.Print("}, flags: %#" B_PRIx32 " -> status: %#" B_PRIx32 ", "
			"transferred: %" B_PRIu64, fFlags, fStatus,
			(uint64)fBytesTransferred);
	}

protected:
	struct vnode*	fVnode;
	dev_t			fMountID;
	ino_t			fNodeID;
	void*			fCookie;
	off_t			fPos;
	generic_io_vec*	fVecs;
	uint32			fCount;
	uint32			fFlags;
	generic_size_t	fBytesRequested;
	status_t		fStatus;
	generic_size_t	fBytesTransferred;
};


class ReadPages : public PagesIOTraceEntry {
public:
	ReadPages(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
			bytesRequested, status, bytesTransferred)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		PagesIOTraceEntry::AddDump(out, "read");
	}
};


class WritePages : public PagesIOTraceEntry {
public:
	WritePages(struct vnode* vnode, void* cookie, off_t pos,
		const generic_io_vec* vecs, uint32 count, uint32 flags,
		generic_size_t bytesRequested, status_t status,
		generic_size_t bytesTransferred)
		:
		PagesIOTraceEntry(vnode, cookie, pos, vecs, count, flags,
			bytesRequested, status, bytesTransferred)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		PagesIOTraceEntry::AddDump(out, "write");
	}
};

}	// namespace VFSPagesIOTracing

#	define TPIO(x) new(std::nothrow) VFSPagesIOTracing::x;
#else
#	define TPIO(x) ;
#endif	// VFS_PAGES_IO_TRACING



/*! Finds the mounted device (the fs_mount structure) with the given ID.
	Note: you must hold sMountLock when you call this function.
*/
static struct fs_mount*
find_mount(dev_t id)
{
	ASSERT_READ_LOCKED_RW_LOCK(&sMountLock);

	return sMountsTable->Lookup(id);
}


static status_t
get_mount(dev_t id, struct fs_mount** _mount)
{
	struct fs_mount* mount;

	ReadLocker nodeLocker(sVnodeLock);
	ReadLocker mountLocker(sMountLock);

	mount = find_mount(id);
	if (mount == NULL)
		return B_BAD_VALUE;

	struct vnode* rootNode = mount->root_vnode;
	if (mount->unmounting || rootNode == NULL || rootNode->IsBusy()
		|| rootNode->ref_count == 0) {
		// might have been called during a mount/unmount operation
		return B_BUSY;
	}

	inc_vnode_ref_count(rootNode);
	*_mount = mount;
	return B_OK;
}


static void
put_mount(struct fs_mount* mount)
{
	if (mount)
		put_vnode(mount->root_vnode);
}


/*!	Tries to open the specified file system module.
	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1".
	Returns a pointer to the file system module interface, or NULL if it
	could not open the module.
*/
static file_system_module_info*
get_file_system(const char* fsName)
{
	char name[B_FILE_NAME_LENGTH];
	if (strncmp(fsName, "file_systems/", strlen("file_systems/"))) {
		// construct the module name if we didn't get one
		// (we currently support only one API)
		snprintf(name, sizeof(name), "file_systems/%s/v1", fsName);
		fsName = NULL;
	}

	file_system_module_info* info;
	if (get_module(fsName ? fsName : name, (module_info**)&info) != B_OK)
		return NULL;

	return info;
}


/*!	Accepts a file system name of the form "bfs" or "file_systems/bfs/v1"
	and returns a compatible fs_info.fsh_name name ("bfs" in both cases).
	The name is allocated for you, and you have to free() it when you're
	done with it.
	Returns NULL if the required memory is not available.
*/
static char*
get_file_system_name(const char* fsName)
{
	const size_t length = strlen("file_systems/");

	if (strncmp(fsName, "file_systems/", length)) {
		// the name already seems to be the module's file name
		return strdup(fsName);
	}

	fsName += length;
	const char* end = strchr(fsName, '/');
	if (end == NULL) {
		// this doesn't seem to be a valid name, but well...
		return strdup(fsName);
	}

	// cut off the trailing /v1

	char* name = (char*)malloc(end + 1 - fsName);
	if (name == NULL)
		return NULL;

	strlcpy(name, fsName, end + 1 - fsName);
	return name;
}
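
// For example, both get_file_system_name("file_systems/bfs/v1") and
// get_file_system_name("bfs") return a newly allocated "bfs": the
// "file_systems/" prefix and the trailing "/v1" version suffix are stripped
// when present.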


/*!	Accepts a list of file system names separated by a colon, one for each
	layer, and returns the file system name for the specified layer.
	The name is allocated for you, and you have to free() it when you're
	done with it.
	Returns NULL if the required memory is not available or if there is no
	name for the specified layer.
*/
static char*
get_file_system_name_for_layer(const char* fsNames, int32 layer)
{
	while (layer >= 0) {
		const char* end = strchr(fsNames, ':');
		if (end == NULL) {
			if (layer == 0)
				return strdup(fsNames);
			return NULL;
		}

		if (layer == 0) {
			size_t length = end - fsNames + 1;
			char* result = (char*)malloc(length);
			if (result == NULL)
				return NULL;
			strlcpy(result, fsNames, length);
			return result;
		}

		fsNames = end + 1;
		layer--;
	}

	return NULL;
}
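
// For example, with fsNames = "overlay:bfs" (a hypothetical two-layer
// stack), layer 0 yields "overlay" and layer 1 yields "bfs"; any higher
// layer yields NULL, as does a failed allocation.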


static void
add_vnode_to_mount_list(struct vnode* vnode, struct fs_mount* mount)
{
	MutexLocker _(mount->lock);
	mount->vnodes.Add(vnode);
}


static void
remove_vnode_from_mount_list(struct vnode* vnode, struct fs_mount* mount)
{
	MutexLocker _(mount->lock);
	mount->vnodes.Remove(vnode);
}


/*!	\brief Looks up a vnode by mount and node ID in the sVnodeTable.

	The caller must hold the sVnodeLock (read lock at least).

	\param mountID the mount ID.
	\param vnodeID the node ID.

	\return The vnode structure, if it was found in the hash table, \c NULL
			otherwise.
*/
static struct vnode*
lookup_vnode(dev_t mountID, ino_t vnodeID)
{
	ASSERT_READ_LOCKED_RW_LOCK(&sVnodeLock);

	struct vnode_hash_key key;

	key.device = mountID;
	key.vnode = vnodeID;

	return sVnodeTable->Lookup(key);
}


/*!	\brief Checks whether or not a busy vnode should be waited for (again).

	This will also wait for BUSY_VNODE_DELAY before returning, if the caller
	should keep waiting for the vnode to become unbusy.

	\return \c true if one should retry, \c false if not.
*/
static bool
retry_busy_vnode(int32& tries, dev_t mountID, ino_t vnodeID)
{
	if (--tries < 0) {
		// vnode doesn't seem to become unbusy
		dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO
			" is not becoming unbusy!\n", mountID, vnodeID);
		return false;
	}
	snooze(BUSY_VNODE_DELAY);
	return true;
}
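
// Typical usage (illustrative fragment, not part of the build): loop until
// the vnode becomes unbusy or the retry budget runs out. Real callers such
// as get_vnode() additionally drop sVnodeLock before snoozing and re-acquire
// it afterwards.
#if 0
	int32 tries = BUSY_VNODE_RETRIES;
	while (vnode->IsBusy()) {
		if (!retry_busy_vnode(tries, vnode->device, vnode->id))
			return B_BUSY;
	}
#endif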


/*!	Creates a new vnode with the given mount and node ID.
	If the node already exists, it is returned instead and no new node is
	created. In either case -- but not if an error occurs -- the function
	write-locks \c sVnodeLock and keeps it locked for the caller when
	returning. On error the lock is not held on return.

	\param mountID The mount ID.
	\param vnodeID The vnode ID.
	\param _vnode Will be set to the new vnode on success.
	\param _nodeCreated Will be set to \c true when the returned vnode has
		been newly created, \c false when it already existed. Will not be
		changed on error.
	\return \c B_OK, when the vnode was successfully created and inserted or
		a node with the given ID was found, \c B_NO_MEMORY or
		\c B_ENTRY_NOT_FOUND on error.
*/
static status_t
create_new_vnode_and_lock(dev_t mountID, ino_t vnodeID, struct vnode*& _vnode,
	bool& _nodeCreated)
{
	FUNCTION(("create_new_vnode_and_lock()\n"));

	struct vnode* vnode = (struct vnode*)object_cache_alloc(sVnodeCache, 0);
	if (vnode == NULL)
		return B_NO_MEMORY;

	// initialize basic values
	memset(vnode, 0, sizeof(struct vnode));
	vnode->device = mountID;
	vnode->id = vnodeID;
	vnode->ref_count = 1;
	vnode->SetBusy(true);

	// look up the node -- it might have been added by someone else in the
	// meantime
	rw_lock_write_lock(&sVnodeLock);
	struct vnode* existingVnode = lookup_vnode(mountID, vnodeID);
	if (existingVnode != NULL) {
		object_cache_free(sVnodeCache, vnode, 0);
		_vnode = existingVnode;
		_nodeCreated = false;
		return B_OK;
	}

	// get the mount structure
	rw_lock_read_lock(&sMountLock);
	vnode->mount = find_mount(mountID);
	if (!vnode->mount || vnode->mount->unmounting) {
		rw_lock_read_unlock(&sMountLock);
		rw_lock_write_unlock(&sVnodeLock);
		object_cache_free(sVnodeCache, vnode, 0);
		return B_ENTRY_NOT_FOUND;
	}

	// add the vnode to the mount's node list and the hash table
	sVnodeTable->Insert(vnode);
	add_vnode_to_mount_list(vnode, vnode->mount);

	rw_lock_read_unlock(&sMountLock);

	_vnode = vnode;
	_nodeCreated = true;

	// keep the vnode lock locked
	return B_OK;
}
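
// Illustrative fragment (not part of the build): on B_OK the caller holds
// sVnodeLock write-locked -- whether the node was newly created or already
// existed -- and must unlock it itself:
#if 0
	bool nodeCreated;
	struct vnode* vnode;
	if (create_new_vnode_and_lock(mountID, vnodeID, vnode, nodeCreated)
			== B_OK) {
		// ... inspect or initialize the (busy) vnode ...
		rw_lock_write_unlock(&sVnodeLock);
	}
#endif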


/*!	Frees the vnode and all resources it has acquired, and removes
	it from the vnode hash as well as from its mount structure.
	Will also make sure that any cache modifications are written back.
*/
static void
free_vnode(struct vnode* vnode, bool reenter)
{
	ASSERT_PRINT(vnode->ref_count == 0 && vnode->IsBusy(), "vnode: %p\n",
		vnode);
	ASSERT_PRINT(vnode->advisory_locking == NULL, "vnode: %p\n", vnode);

	// write back any changes in this vnode's cache -- but only
	// if the vnode won't be deleted, in which case the changes
	// will be discarded

	if (!vnode->IsRemoved() && HAS_FS_CALL(vnode, fsync))
		FS_CALL_NO_PARAMS(vnode, fsync);

	// Note: If this vnode has a cache attached, there will still be two
	// references to that cache at this point. The last one belongs to the vnode
	// itself (cf. vfs_get_vnode_cache()) and one belongs to the node's file
	// cache. Each but the last reference to a cache also includes a reference
	// to the vnode. The file cache, however, released its reference (cf.
	// file_cache_create()), so that this vnode's ref count ever has the chance
	// to drop to 0. Deleting the file cache now will cause the next to last
	// cache reference to be released, which will also release a (no longer
	// existing) vnode reference. To avoid problems, we set the vnode's ref
	// count, so that it will neither become negative nor 0.
	vnode->ref_count = 2;

	if (!vnode->IsUnpublished()) {
		if (vnode->IsRemoved())
			FS_CALL(vnode, remove_vnode, reenter);
		else
			FS_CALL(vnode, put_vnode, reenter);
	}

	// If the vnode has a VMCache attached, make sure that it won't try to get
	// another reference via VMVnodeCache::AcquireUnreferencedStoreRef(). As
	// long as the vnode is busy and in the hash, that won't happen, but as
	// soon as we've removed it from the hash, it could reload the vnode -- with
	// a new cache attached!
	if (vnode->cache != NULL && vnode->cache->type == CACHE_TYPE_VNODE)
		((VMVnodeCache*)vnode->cache)->VnodeDeleted();

	// The file system has removed the resources of the vnode now, so we can
	// make it available again (by removing the busy vnode from the hash).
	rw_lock_write_lock(&sVnodeLock);
	sVnodeTable->Remove(vnode);
	rw_lock_write_unlock(&sVnodeLock);

	// if we have a VMCache attached, remove it
	if (vnode->cache)
		vnode->cache->ReleaseRef();

	vnode->cache = NULL;

	remove_vnode_from_mount_list(vnode, vnode->mount);

	object_cache_free(sVnodeCache, vnode, 0);
}


/*!	\brief Decrements the reference counter of the given vnode and deletes it,
	if the counter dropped to 0.

	The caller must, of course, own a reference to the vnode to call this
	function.
	The caller must not hold the sVnodeLock or the sMountLock.

	\param vnode the vnode.
	\param alwaysFree don't move this vnode into the unused list, but really
		   delete it if possible.
	\param reenter \c true, if this function is called (indirectly) from within
		   a file system. This will be passed to file system hooks only.
	\return \c B_OK, if everything went fine, an error code otherwise.
*/
static status_t
dec_vnode_ref_count(struct vnode* vnode, bool alwaysFree, bool reenter)
{
	ReadLocker locker(sVnodeLock);
	AutoLocker<Vnode> nodeLocker(vnode);

	int32 oldRefCount = atomic_add(&vnode->ref_count, -1);

	ASSERT_PRINT(oldRefCount > 0, "vnode %p\n", vnode);

	TRACE(("dec_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
		vnode->ref_count));

	if (oldRefCount != 1)
		return B_OK;

	if (vnode->IsBusy())
		panic("dec_vnode_ref_count: called on busy vnode %p\n", vnode);

	bool freeNode = false;
	bool freeUnusedNodes = false;

	// Just insert the vnode into an unused list if we don't need
	// to delete it
	if (vnode->IsRemoved() || alwaysFree) {
		vnode_to_be_freed(vnode);
		vnode->SetBusy(true);
		freeNode = true;
	} else
		freeUnusedNodes = vnode_unused(vnode);

	nodeLocker.Unlock();
	locker.Unlock();

	if (freeNode)
		free_vnode(vnode, reenter);
	else if (freeUnusedNodes)
		free_unused_vnodes();

	return B_OK;
}


/*!	\brief Increments the reference counter of the given vnode.

	The caller must make sure that the node isn't deleted while this function
	is called. This can be done either:
	- by ensuring that a reference to the node exists and remains in existence,
	  or
	- by holding the vnode's lock (which also requires read locking sVnodeLock)
	  or by holding sVnodeLock write locked.

	In the second case the caller is responsible for dealing with the ref count
	0 -> 1 transition. That is, 1. this function must not be invoked when the
	node is busy in the first place and 2. vnode_used() must be called for the
	node.

	\param vnode the vnode.
*/
static void
inc_vnode_ref_count(struct vnode* vnode)
{
	atomic_add(&vnode->ref_count, 1);
	TRACE(("inc_vnode_ref_count: vnode %p, ref now %" B_PRId32 "\n", vnode,
		vnode->ref_count));
}


static bool
is_special_node_type(int type)
{
	// at the moment only FIFOs are supported
	return S_ISFIFO(type);
}


static status_t
create_special_sub_node(struct vnode* vnode, uint32 flags)
{
	if (S_ISFIFO(vnode->Type()))
		return create_fifo_vnode(vnode->mount->volume, vnode);

	return B_BAD_VALUE;
}

/*!	\brief Retrieves a vnode for a given mount ID, node ID pair.

	If the node is not yet in memory, it will be loaded.

	The caller must not hold the sVnodeLock or the sMountLock.

	\param mountID the mount ID.
	\param vnodeID the node ID.
	\param _vnode Pointer to a vnode* variable into which the pointer to the
		   retrieved vnode structure shall be written.
	\param canWait If \c false, the function returns \c B_BUSY immediately
		   when the vnode is busy instead of waiting for it.
	\param reenter \c true, if this function is called (indirectly) from within
		   a file system.
	\return \c B_OK, if everything went fine, an error code otherwise.
*/
static status_t
get_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode, bool canWait,
	int reenter)
{
	FUNCTION(("get_vnode: mountid %" B_PRId32 " vnid 0x%" B_PRIx64 " %p\n",
		mountID, vnodeID, _vnode));

	rw_lock_read_lock(&sVnodeLock);

	int32 tries = BUSY_VNODE_RETRIES;
restart:
	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
	AutoLocker<Vnode> nodeLocker(vnode);

	if (vnode && vnode->IsBusy()) {
		// vnodes in the Removed state (except ones still Unpublished)
		// which are also Busy will disappear soon, so we do not wait for them.
		const bool doNotWait = vnode->IsRemoved() && !vnode->IsUnpublished();

		nodeLocker.Unlock();
		rw_lock_read_unlock(&sVnodeLock);
		if (!canWait) {
			dprintf("vnode %" B_PRIdDEV ":%" B_PRIdINO " is busy!\n",
				mountID, vnodeID);
			return B_BUSY;
		}
		if (doNotWait || !retry_busy_vnode(tries, mountID, vnodeID))
			return B_BUSY;

		rw_lock_read_lock(&sVnodeLock);
		goto restart;
	}

	TRACE(("get_vnode: tried to lookup vnode, got %p\n", vnode));

	status_t status;

	if (vnode) {
		if (vnode->ref_count == 0) {
			// this vnode has been unused before
			vnode_used(vnode);
		}
		inc_vnode_ref_count(vnode);

		nodeLocker.Unlock();
		rw_lock_read_unlock(&sVnodeLock);
	} else {
		// we need to create a new vnode and read it in
		rw_lock_read_unlock(&sVnodeLock);
			// unlock -- create_new_vnode_and_lock() write-locks on success
		bool nodeCreated;
		status = create_new_vnode_and_lock(mountID, vnodeID, vnode,
			nodeCreated);
		if (status != B_OK)
			return status;

		if (!nodeCreated) {
			rw_lock_read_lock(&sVnodeLock);
			rw_lock_write_unlock(&sVnodeLock);
			goto restart;
		}

		rw_lock_write_unlock(&sVnodeLock);

		int type;
		uint32 flags;
		status = FS_MOUNT_CALL(vnode->mount, get_vnode, vnodeID, vnode, &type,
			&flags, reenter);
		if (status == B_OK && vnode->private_node == NULL)
			status = B_BAD_VALUE;

		bool gotNode = status == B_OK;
		bool publishSpecialSubNode = false;
		if (gotNode) {
			vnode->SetType(type);
			publishSpecialSubNode = is_special_node_type(type)
				&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
		}

		if (gotNode && publishSpecialSubNode)
			status = create_special_sub_node(vnode, flags);

		if (status != B_OK) {
			if (gotNode)
				FS_CALL(vnode, put_vnode, reenter);

			rw_lock_write_lock(&sVnodeLock);
			sVnodeTable->Remove(vnode);
			remove_vnode_from_mount_list(vnode, vnode->mount);
			rw_lock_write_unlock(&sVnodeLock);

			object_cache_free(sVnodeCache, vnode, 0);
			return status;
		}

		rw_lock_read_lock(&sVnodeLock);
		vnode->Lock();

		vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
		vnode->SetBusy(false);

		vnode->Unlock();
		rw_lock_read_unlock(&sVnodeLock);
	}

	TRACE(("get_vnode: returning %p\n", vnode));

	*_vnode = vnode;
	return B_OK;
}


/*!	\brief Decrements the reference counter of the given vnode and deletes it,
	if the counter dropped to 0.

	The caller must, of course, own a reference to the vnode to call this
	function.
	The caller must not hold the sVnodeLock or the sMountLock.

	\param vnode the vnode.
*/
static inline void
put_vnode(struct vnode* vnode)
{
	dec_vnode_ref_count(vnode, false, false);
}


static void
free_unused_vnodes(int32 level)
{
	unused_vnodes_check_started();

	if (level == B_NO_LOW_RESOURCE) {
		unused_vnodes_check_done();
		return;
	}

	flush_hot_vnodes();

	// determine how many nodes to free
	uint32 count = 1;
	{
		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);

		switch (level) {
			case B_LOW_RESOURCE_NOTE:
				count = sUnusedVnodes / 100;
				break;
			case B_LOW_RESOURCE_WARNING:
				count = sUnusedVnodes / 10;
				break;
			case B_LOW_RESOURCE_CRITICAL:
				count = sUnusedVnodes;
				break;
		}

		if (count > sUnusedVnodes)
			count = sUnusedVnodes;
	}

	// Write back the modified pages of some unused vnodes and free them.

	for (uint32 i = 0; i < count; i++) {
		ReadLocker vnodesReadLocker(sVnodeLock);

		// get the first node
		MutexLocker unusedVnodesLocker(sUnusedVnodesLock);
		struct vnode* vnode = (struct vnode*)list_get_first_item(
			&sUnusedVnodeList);
		unusedVnodesLocker.Unlock();

		if (vnode == NULL)
			break;

		// lock the node
		AutoLocker<Vnode> nodeLocker(vnode);

		// Check whether the node is still unused -- since we only append to the
		// tail of the unused queue, the vnode should still be at its head.
		// Alternatively we could check its ref count for 0 and its busy flag,
		// but if the node is no longer at the head of the queue, it means it
		// has been touched in the meantime, i.e. it is no longer the least
		// recently used unused vnode and we'd rather not free it.
		unusedVnodesLocker.Lock();
		if (vnode != list_get_first_item(&sUnusedVnodeList))
			continue;
		unusedVnodesLocker.Unlock();

		ASSERT(!vnode->IsBusy());

		// grab a reference
		inc_vnode_ref_count(vnode);
		vnode_used(vnode);

		// write back changes and free the node
		nodeLocker.Unlock();
		vnodesReadLocker.Unlock();

		if (vnode->cache != NULL)
			vnode->cache->WriteModified();

		dec_vnode_ref_count(vnode, true, false);
			// this should free the vnode when it's still unused
	}

	unused_vnodes_check_done();
}


/*!	Gets the vnode the given vnode is covering.

	The caller must have \c sVnodeLock read-locked at least.

	The function returns a reference to the retrieved vnode (if any), which
	the caller is responsible for releasing.

	\param vnode The vnode whose covered node shall be returned.
	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
		vnode.
*/
static inline Vnode*
get_covered_vnode_locked(Vnode* vnode)
{
	if (Vnode* coveredNode = vnode->covers) {
		while (coveredNode->covers != NULL)
			coveredNode = coveredNode->covers;

		inc_vnode_ref_count(coveredNode);
		return coveredNode;
	}

	return NULL;
}


/*!	Gets the vnode the given vnode is covering.

	The caller must not hold \c sVnodeLock. Note that this implies a race
	condition, since the situation can change at any time.

	The function returns a reference to the retrieved vnode (if any), which
	the caller is responsible for releasing.

	\param vnode The vnode whose covered node shall be returned.
	\return The covered vnode, or \c NULL if the given vnode doesn't cover any
		vnode.
*/
static inline Vnode*
get_covered_vnode(Vnode* vnode)
{
	if (!vnode->IsCovering())
		return NULL;

	ReadLocker vnodeReadLocker(sVnodeLock);
	return get_covered_vnode_locked(vnode);
}


/*!	Gets the vnode the given vnode is covered by.

	The caller must have \c sVnodeLock read-locked at least.

	The function returns a reference to the retrieved vnode (if any), which
	the caller is responsible for releasing.

	\param vnode The vnode whose covering node shall be returned.
	\return The covering vnode, or \c NULL if the given vnode isn't covered by
		any vnode.
*/
static Vnode*
get_covering_vnode_locked(Vnode* vnode)
{
	if (Vnode* coveringNode = vnode->covered_by) {
		while (coveringNode->covered_by != NULL)
			coveringNode = coveringNode->covered_by;

		inc_vnode_ref_count(coveringNode);
		return coveringNode;
	}

	return NULL;
}


/*!	Gets the vnode the given vnode is covered by.

	The caller must not hold \c sVnodeLock. Note that this implies a race
	condition, since the situation can change at any time.

	The function returns a reference to the retrieved vnode (if any), which
	the caller is responsible for releasing.

	\param vnode The vnode whose covering node shall be returned.
	\return The covering vnode, or \c NULL if the given vnode isn't covered by
		any vnode.
*/
static inline Vnode*
get_covering_vnode(Vnode* vnode)
{
	if (!vnode->IsCovered())
		return NULL;

	ReadLocker vnodeReadLocker(sVnodeLock);
	return get_covering_vnode_locked(vnode);
}
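
// Illustrative fragment (not part of the build): resolving a mount point to
// the root of the volume mounted on top of it. get_covering_vnode() returns
// a new reference, so the old one has to be released:
#if 0
	if (Vnode* coveringNode = get_covering_vnode(vnode)) {
		put_vnode(vnode);
		vnode = coveringNode;
	}
#endif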


static void
free_unused_vnodes()
{
	free_unused_vnodes(
		low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
			| B_KERNEL_RESOURCE_ADDRESS_SPACE));
}


static void
vnode_low_resource_handler(void* /*data*/, uint32 resources, int32 level)
{
	TRACE(("vnode_low_resource_handler(level = %" B_PRId32 ")\n", level));

	free_unused_vnodes(level);
}


static inline void
put_advisory_locking(struct advisory_locking* locking)
{
	release_sem(locking->lock);
}


/*!	Returns the advisory_locking object of the \a vnode in case it
	has one, and locks it.
	You have to call put_advisory_locking() when you're done with
	it.
	Note: you must not have the vnode mutex locked when calling
	this function.
*/
static struct advisory_locking*
get_advisory_locking(struct vnode* vnode)
{
	rw_lock_read_lock(&sVnodeLock);
	vnode->Lock();

	struct advisory_locking* locking = vnode->advisory_locking;
	sem_id lock = locking != NULL ? locking->lock : B_ERROR;

	vnode->Unlock();
	rw_lock_read_unlock(&sVnodeLock);

	if (lock >= 0)
		lock = acquire_sem(lock);
	if (lock < 0) {
		// This means the locking has been deleted in the mean time
		// or had never existed in the first place - otherwise, we
		// would get the lock at some point.
		return NULL;
	}

	return locking;
}


/*!	Creates a locked advisory_locking object, and attaches it to the
	given \a vnode.
	Returns B_OK in case of success - also if the vnode got such an
	object from someone else in the mean time, you'll still get this
	one locked then.
*/
static status_t
create_advisory_locking(struct vnode* vnode)
{
	if (vnode == NULL)
		return B_FILE_ERROR;

	ObjectDeleter<advisory_locking> lockingDeleter;
	struct advisory_locking* locking = NULL;

	while (get_advisory_locking(vnode) == NULL) {
		// no locking object set on the vnode yet, create one
		if (locking == NULL) {
			locking = new(std::nothrow) advisory_locking;
			if (locking == NULL)
				return B_NO_MEMORY;
			lockingDeleter.SetTo(locking);

			locking->wait_sem = create_sem(0, "advisory lock");
			if (locking->wait_sem < 0)
				return locking->wait_sem;

			locking->lock = create_sem(0, "advisory locking");
			if (locking->lock < 0)
				return locking->lock;
		}

		// set our newly created locking object
		ReadLocker _(sVnodeLock);
		AutoLocker<Vnode> nodeLocker(vnode);
		if (vnode->advisory_locking == NULL) {
			vnode->advisory_locking = locking;
			lockingDeleter.Detach();
			return B_OK;
		}
	}

	// The vnode already had a locking object. That's just as well.

	return B_OK;
}


/*! Returns \c true when either \a flock is \c NULL or the \a flock intersects
	with the advisory_lock \a lock.
*/
static bool
advisory_lock_intersects(struct advisory_lock* lock, struct flock* flock)
{
	if (flock == NULL)
		return true;

	return lock->start <= flock->l_start - 1 + flock->l_len
		&& lock->end >= flock->l_start;
}
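
// Worked example: a flock with l_start = 100 and l_len = 50 describes the
// byte range [100, 149]. An advisory_lock with start = 140 and end = 200
// intersects it (140 <= 149 and 200 >= 100), while one with start = 150
// does not (150 <= 149 fails).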


/*!	Tests whether acquiring a lock would block.
*/
static status_t
test_advisory_lock(struct vnode* vnode, struct flock* flock)
{
	const int32 type = flock->l_type;
		// remember the requested lock type; l_type is overwritten below and
		// only set again if a colliding lock is found
	flock->l_type = F_UNLCK;

	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking == NULL)
		return B_OK;

	team_id team = team_get_current_team_id();

	LockList::Iterator iterator = locking->locks.GetIterator();
	while (iterator.HasNext()) {
		struct advisory_lock* lock = iterator.Next();

		if (lock->team != team && advisory_lock_intersects(lock, flock)) {
			// locks do overlap
			if (type != F_RDLCK || !lock->shared) {
				// collision
				flock->l_type = lock->shared ? F_RDLCK : F_WRLCK;
				flock->l_whence = SEEK_SET;
				flock->l_start = lock->start;
				flock->l_len = lock->end - lock->start + 1;
				flock->l_pid = lock->team;
				break;
			}
		}
	}

	put_advisory_locking(locking);
	return B_OK;
}


/*!	Removes the specified lock, or all locks of the calling team
	if \a flock is NULL.
*/
static status_t
release_advisory_lock(struct vnode* vnode, struct io_context* context,
	struct file_descriptor* descriptor, struct flock* flock)
{
	FUNCTION(("release_advisory_lock(vnode = %p, flock = %p)\n", vnode, flock));

	struct advisory_locking* locking = get_advisory_locking(vnode);
	if (locking == NULL)
		return B_OK;

	// find matching lock entries

	LockList::Iterator iterator = locking->locks.GetIterator();
	while (iterator.HasNext()) {
		struct advisory_lock* lock = iterator.Next();
		bool removeLock = false;

		if (descriptor != NULL && lock->bound_to == descriptor) {
			// Remove flock() locks
			removeLock = true;
		} else if (lock->bound_to == context
				&& advisory_lock_intersects(lock, flock)) {
			// Remove POSIX locks
			bool endsBeyond = false;
			bool startsBefore = false;
			if (flock != NULL) {
				startsBefore = lock->start < flock->l_start;
				endsBeyond = lock->end > flock->l_start - 1 + flock->l_len;
			}

			if (!startsBefore && !endsBeyond) {
				// lock is completely contained in flock
				removeLock = true;
			} else if (startsBefore && !endsBeyond) {
				// cut the end of the lock
				lock->end = flock->l_start - 1;
			} else if (!startsBefore && endsBeyond) {
				// cut the start of the lock
				lock->start = flock->l_start + flock->l_len;
			} else {
				// divide the lock into two locks
				struct advisory_lock* secondLock
					= new(std::nothrow) advisory_lock;
				if (secondLock == NULL) {
					// TODO: we should probably revert the locks we already
					// changed... (ie. allocate upfront)
					put_advisory_locking(locking);
					return B_NO_MEMORY;
				}

				secondLock->bound_to = context;
				secondLock->team = lock->team;
				secondLock->session = lock->session;
				// values must already be normalized when getting here
				secondLock->start = flock->l_start + flock->l_len;
				secondLock->end = lock->end;
					// take over the original end, before it is cut below
				secondLock->shared = lock->shared;

				lock->end = flock->l_start - 1;

				locking->locks.Add(secondLock);
			}
		}

		if (removeLock) {
			// this lock is no longer used
			iterator.Remove();
			delete lock;
		}
	}

	bool removeLocking = locking->locks.IsEmpty();
	release_sem_etc(locking->wait_sem, 1, B_RELEASE_ALL);

	put_advisory_locking(locking);

	if (removeLocking) {
		// We can remove the whole advisory locking structure; it's no
		// longer used
		locking = get_advisory_locking(vnode);
		if (locking != NULL) {
			ReadLocker locker(sVnodeLock);
			AutoLocker<Vnode> nodeLocker(vnode);

			// the locking could have been changed in the mean time
			if (locking->locks.IsEmpty()) {
				vnode->advisory_locking = NULL;
				nodeLocker.Unlock();
				locker.Unlock();

				// we've detached the locking from the vnode, so we can
				// safely delete it
				delete locking;
			} else {
				// the locking is in use again
				nodeLocker.Unlock();
				locker.Unlock();
				release_sem_etc(locking->lock, 1, B_DO_NOT_RESCHEDULE);
			}
		}
	}

	return B_OK;
}


/*!	Acquires an advisory lock for the \a vnode. If \a wait is \c true, it
	will wait for the lock to become available, if there are any collisions
	(it will return B_PERMISSION_DENIED in this case if \a wait is \c false).

	If \a descriptor is NULL, POSIX semantics are used for this lock. Otherwise,
	BSD flock() semantics are used, that is, all children can unlock the file
	in question (we even allow parents to remove the lock, though, but that
	seems to be in line with what the BSDs do).
*/
static status_t
acquire_advisory_lock(struct vnode* vnode, io_context* context,
	struct file_descriptor* descriptor, struct flock* flock, bool wait)
{
	FUNCTION(("acquire_advisory_lock(vnode = %p, flock = %p, wait = %s)\n",
		vnode, flock, wait ? "yes" : "no"));

	bool shared = flock->l_type == F_RDLCK;
	void* boundTo = descriptor != NULL ? (void*)descriptor : (void*)context;
	status_t status = B_OK;

	// TODO: do deadlock detection!

	struct advisory_locking* locking;

	while (true) {
		// if this vnode has an advisory_locking structure attached,
		// lock that one and search for any colliding file lock
		status = create_advisory_locking(vnode);
		if (status != B_OK)
			return status;

		locking = vnode->advisory_locking;
		team_id team = team_get_current_team_id();
		sem_id waitForLock = -1;

		// test for collisions
		LockList::Iterator iterator = locking->locks.GetIterator();
		while (iterator.HasNext()) {
			struct advisory_lock* lock = iterator.Next();

			// TODO: locks from the same team might be joinable!
			if ((lock->team != team || lock->bound_to != boundTo)
					&& advisory_lock_intersects(lock, flock)) {
				// locks do overlap
				if (!shared || !lock->shared) {
					// we need to wait
					waitForLock = locking->wait_sem;
					break;
				}
			}
		}

		if (waitForLock < 0)
			break;

		// We need to wait. Do that or fail now, if we've been asked not to.

		if (!wait) {
			put_advisory_locking(locking);
			return descriptor != NULL ? B_WOULD_BLOCK : B_PERMISSION_DENIED;
		}

		status = switch_sem_etc(locking->lock, waitForLock, 1,
			B_CAN_INTERRUPT, 0);
		if (status != B_OK && status != B_BAD_SEM_ID)
			return status;

		// We have been notified, but we need to re-lock the locking object. So
		// go another round...
	}

	// install new lock

	struct advisory_lock* lock = new(std::nothrow) advisory_lock;
	if (lock == NULL) {
		put_advisory_locking(locking);
		return B_NO_MEMORY;
	}

	lock->bound_to = boundTo;
	lock->team = team_get_current_team_id();
	lock->session = thread_get_current_thread()->team->session_id;
	// values must already be normalized when getting here
	lock->start = flock->l_start;
	lock->end = flock->l_start - 1 + flock->l_len;
	lock->shared = shared;

	locking->locks.Add(lock);
	put_advisory_locking(locking);

	return status;
}
1842 
1843 
1844 /*!	Normalizes the \a flock structure to make it easier to compare the
1845 	structure with others. The l_start and l_len fields are set to absolute
1846 	values according to the l_whence field.
1847 */
1848 static status_t
1849 normalize_flock(struct file_descriptor* descriptor, struct flock* flock)
1850 {
1851 	switch (flock->l_whence) {
1852 		case SEEK_SET:
1853 			break;
1854 		case SEEK_CUR:
1855 			flock->l_start += descriptor->pos;
1856 			break;
1857 		case SEEK_END:
1858 		{
1859 			struct vnode* vnode = descriptor->u.vnode;
1860 			struct stat stat;
1861 			status_t status;
1862 
1863 			if (!HAS_FS_CALL(vnode, read_stat))
1864 				return B_UNSUPPORTED;
1865 
1866 			status = FS_CALL(vnode, read_stat, &stat);
1867 			if (status != B_OK)
1868 				return status;
1869 
1870 			flock->l_start += stat.st_size;
1871 			break;
1872 		}
1873 		default:
1874 			return B_BAD_VALUE;
1875 	}
1876 
1877 	if (flock->l_start < 0)
1878 		flock->l_start = 0;
1879 	if (flock->l_len == 0)
1880 		flock->l_len = OFF_MAX;
1881 
1882 	// don't let the offset and length overflow
1883 	if (flock->l_start > 0 && OFF_MAX - flock->l_start < flock->l_len)
1884 		flock->l_len = OFF_MAX - flock->l_start;
1885 
1886 	if (flock->l_len < 0) {
1887 		// a negative length reverses the region
1888 		flock->l_start += flock->l_len;
1889 		flock->l_len = -flock->l_len;
1890 	}
1891 
1892 	return B_OK;
1893 }
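
/*!	Worked example (illustrative only): with a file position of 100, a
	request of l_whence == SEEK_CUR, l_start == -20 and l_len == -30 is
	normalized to the absolute range l_start == 50, l_len == 30: SEEK_CUR
	adds the descriptor's position (yielding l_start == 80), and the
	negative length then reverses the region.
*/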
1894 
1895 
1896 static void
1897 replace_vnode_if_disconnected(struct fs_mount* mount,
1898 	struct vnode* vnodeToDisconnect, struct vnode*& vnode,
1899 	struct vnode* fallBack, bool lockRootLock)
1900 {
1901 	struct vnode* givenVnode = vnode;
1902 	bool vnodeReplaced = false;
1903 
1904 	ReadLocker vnodeReadLocker(sVnodeLock);
1905 
1906 	if (lockRootLock)
1907 		mutex_lock(&sIOContextRootLock);
1908 
1909 	while (vnode != NULL && vnode->mount == mount
1910 		&& (vnodeToDisconnect == NULL || vnodeToDisconnect == vnode)) {
1911 		if (vnode->covers != NULL) {
1912 			// redirect the vnode to the covered vnode
1913 			vnode = vnode->covers;
1914 		} else
1915 			vnode = fallBack;
1916 
1917 		vnodeReplaced = true;
1918 	}
1919 
1920 	// If we've replaced the node, grab a reference for the new one.
1921 	if (vnodeReplaced && vnode != NULL)
1922 		inc_vnode_ref_count(vnode);
1923 
1924 	if (lockRootLock)
1925 		mutex_unlock(&sIOContextRootLock);
1926 
1927 	vnodeReadLocker.Unlock();
1928 
1929 	if (vnodeReplaced)
1930 		put_vnode(givenVnode);
1931 }
1932 
1933 
1934 /*!	Disconnects all file descriptors that are associated with the
1935 	\a vnodeToDisconnect, or if this is NULL, all vnodes of the specified
1936 	\a mount object.
1937 
1938 	Note, after you've called this function, there might still be ongoing
1939 	accesses - they won't be interrupted if they were already in progress.
1940 	However, any subsequent access will fail.
1941 
1942 	This is not a cheap function and should be used with care and rarely.
1943 	TODO: there is currently no means to stop a blocking read/write!
1944 */
1945 static void
1946 disconnect_mount_or_vnode_fds(struct fs_mount* mount,
1947 	struct vnode* vnodeToDisconnect)
1948 {
1949 	// iterate over all teams and peek into their file descriptors
1950 	TeamListIterator teamIterator;
1951 	while (Team* team = teamIterator.Next()) {
1952 		BReference<Team> teamReference(team, true);
1953 		TeamLocker teamLocker(team);
1954 
1955 		// lock the I/O context
1956 		io_context* context = team->io_context;
1957 		if (context == NULL)
1958 			continue;
1959 		MutexLocker contextLocker(context->io_mutex);
1960 
1961 		teamLocker.Unlock();
1962 
1963 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->root,
1964 			sRoot, true);
1965 		replace_vnode_if_disconnected(mount, vnodeToDisconnect, context->cwd,
1966 			sRoot, false);
1967 
1968 		for (uint32 i = 0; i < context->table_size; i++) {
1969 			struct file_descriptor* descriptor = context->fds[i];
1970 			if (descriptor == NULL || (descriptor->open_mode & O_DISCONNECTED) != 0)
1971 				continue;
1972 
1973 			inc_fd_ref_count(descriptor);
1974 
1975 			// if this descriptor points at this mount, we
1976 			// need to disconnect it to be able to unmount
1977 			struct vnode* vnode = fd_vnode(descriptor);
1978 			if (vnodeToDisconnect != NULL) {
1979 				if (vnode == vnodeToDisconnect)
1980 					disconnect_fd(descriptor);
1981 			} else if ((vnode != NULL && vnode->mount == mount)
1982 				|| (vnode == NULL && descriptor->u.mount == mount))
1983 				disconnect_fd(descriptor);
1984 
1985 			put_fd(descriptor);
1986 		}
1987 	}
1988 }
1989 
1990 
1991 /*!	\brief Gets the root node of the current IO context.
1992 	If \a kernel is \c true, the kernel IO context will be used.
1993 	The caller obtains a reference to the returned node.
1994 */
1995 struct vnode*
1996 get_root_vnode(bool kernel)
1997 {
1998 	if (!kernel) {
1999 		// Get the root of the current IO context
2000 		struct io_context* context = get_current_io_context(kernel);
2001 
2002 		mutex_lock(&sIOContextRootLock);
2003 
2004 		struct vnode* root = context->root;
2005 		if (root != NULL)
2006 			inc_vnode_ref_count(root);
2007 
2008 		mutex_unlock(&sIOContextRootLock);
2009 
2010 		if (root != NULL)
2011 			return root;
2012 
2013 		// That should never happen.
2014 		dprintf("get_root_vnode(): IO context for team %" B_PRId32 " doesn't "
2015 			"have a root\n", team_get_current_team_id());
2016 	}
2017 
2018 	inc_vnode_ref_count(sRoot);
2019 	return sRoot;
2020 }
2021 
2022 
2023 /*!	\brief Gets the directory path and leaf name for a given path.
2024 
2025 	The supplied \a path is transformed to refer to the directory part of
2026 	the entry identified by the original path, and the leaf name of the
2027 	original entry is written into the buffer \a filename.
2028 	Neither the returned path nor the leaf name can be expected to be
2029 	canonical.
2030 
2031 	\param path The path to be analyzed. Must be able to store at least one
2032 		   additional character.
2033 	\param filename The buffer into which the leaf name will be written.
2034 		   Must be of size B_FILE_NAME_LENGTH at least.
2035 	\return \c B_OK, if everything went fine, \c B_NAME_TOO_LONG, if the leaf
2036 		   name is longer than \c B_FILE_NAME_LENGTH, or \c B_ENTRY_NOT_FOUND,
2037 		   if the given path name is empty.
2038 */
2039 static status_t
2040 get_dir_path_and_leaf(char* path, char* filename)
2041 {
2042 	if (*path == '\0')
2043 		return B_ENTRY_NOT_FOUND;
2044 
2045 	char* last = strrchr(path, '/');
2046 		// '/' are not allowed in file names!
2047 
2048 	FUNCTION(("get_dir_path_and_leaf(path = %s)\n", path));
2049 
2050 	if (last == NULL) {
2051 		// this path is a single segment with no '/' in it
2052 		// ex. "foo"
2053 		if (strlcpy(filename, path, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2054 			return B_NAME_TOO_LONG;
2055 
2056 		strcpy(path, ".");
2057 	} else {
2058 		last++;
2059 		if (last[0] == '\0') {
2060 			// special case: the path ends in one or more '/' - remove them
2061 			while (*--last == '/' && last != path);
2062 			last[1] = '\0';
2063 
2064 			if (last == path && last[0] == '/') {
2065 				// This path points to the root of the file system
2066 				strcpy(filename, ".");
2067 				return B_OK;
2068 			}
2069 			for (; last != path && *(last - 1) != '/'; last--);
2070 				// rewind to the start of the leaf before the '/'
2071 		}
2072 
2073 		// normal leaf: replace the leaf portion of the path with a '.'
2074 		if (strlcpy(filename, last, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2075 			return B_NAME_TOO_LONG;
2076 
2077 		last[0] = '.';
2078 		last[1] = '\0';
2079 	}
2080 	return B_OK;
2081 }
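
/*!	Examples (illustrative only) for get_dir_path_and_leaf():
	- "foo/bar/baz" -> path becomes "foo/bar/.", filename "baz"
	- "foo"         -> path becomes ".", filename "foo"
	- "foo/bar///"  -> path becomes "foo/.", filename "bar"
	- "/"           -> path stays "/", filename "."
*/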
2082 
2083 
2084 static status_t
2085 entry_ref_to_vnode(dev_t mountID, ino_t directoryID, const char* name,
2086 	bool traverse, bool kernel, struct vnode** _vnode)
2087 {
2088 	char clonedName[B_FILE_NAME_LENGTH + 1];
2089 	if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH) >= B_FILE_NAME_LENGTH)
2090 		return B_NAME_TOO_LONG;
2091 
2092 	// get the directory vnode and let vnode_path_to_vnode() do the rest
2093 	struct vnode* directory;
2094 
2095 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
2096 	if (status < 0)
2097 		return status;
2098 
2099 	return vnode_path_to_vnode(directory, clonedName, traverse, 0, kernel,
2100 		_vnode, NULL);
2101 }
2102 
2103 
2104 /*!	Looks up the entry with name \a name in the directory represented by \a dir
2105 	and returns the respective vnode.
2106 	On success a reference to the vnode is acquired for the caller.
2107 */
2108 static status_t
2109 lookup_dir_entry(struct vnode* dir, const char* name, struct vnode** _vnode)
2110 {
2111 	ino_t id;
2112 	bool missing;
2113 
2114 	if (dir->mount->entry_cache.Lookup(dir->id, name, id, missing)) {
2115 		return missing ? B_ENTRY_NOT_FOUND
2116 			: get_vnode(dir->device, id, _vnode, true, false);
2117 	}
2118 
2119 	status_t status = FS_CALL(dir, lookup, name, &id);
2120 	if (status != B_OK)
2121 		return status;
2122 
2123 	// The lookup() hook calls get_vnode() or publish_vnode(), so we do already
2124 	// have a reference and just need to look the node up.
2125 	rw_lock_read_lock(&sVnodeLock);
2126 	*_vnode = lookup_vnode(dir->device, id);
2127 	rw_lock_read_unlock(&sVnodeLock);
2128 
2129 	if (*_vnode == NULL) {
2130 		panic("lookup_dir_entry(): could not lookup vnode (mountid 0x%" B_PRIx32
2131 			" vnid 0x%" B_PRIx64 ")\n", dir->device, id);
2132 		return B_ENTRY_NOT_FOUND;
2133 	}
2134 
2135 //	ktrace_printf("lookup_dir_entry(): dir: %p (%ld, %lld), name: \"%s\" -> "
2136 //		"%p (%ld, %lld)", dir, dir->mount->id, dir->id, name, *_vnode,
2137 //		(*_vnode)->mount->id, (*_vnode)->id);
2138 
2139 	return B_OK;
2140 }
2141 
2142 
2143 /*!	Returns the vnode for the relative path starting at the specified \a vnode.
2144 	\a path must not be NULL.
2145 	If it returns successfully, \a path contains the name of the last path
2146 	component. This function clobbers the buffer pointed to by \a path only
2147 	if it contains more than one component.
2148 	Note, this reduces the ref_count of the starting \a vnode, whether it
2149 	succeeds or not!
2150 */
2151 static status_t
2152 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2153 	int count, struct io_context* ioContext, struct vnode** _vnode,
2154 	ino_t* _parentID)
2155 {
2156 	status_t status = B_OK;
2157 	ino_t lastParentID = vnode->id;
2158 
2159 	FUNCTION(("vnode_path_to_vnode(vnode = %p, path = %s)\n", vnode, path));
2160 
2161 	if (path == NULL) {
2162 		put_vnode(vnode);
2163 		return B_BAD_VALUE;
2164 	}
2165 
2166 	if (*path == '\0') {
2167 		put_vnode(vnode);
2168 		return B_ENTRY_NOT_FOUND;
2169 	}
2170 
2171 	while (true) {
2172 		struct vnode* nextVnode;
2173 		char* nextPath;
2174 
2175 		TRACE(("vnode_path_to_vnode: top of loop. path = %p, path = '%s'\n",
2176 			path, path));
2177 
2178 		// done?
2179 		if (path[0] == '\0')
2180 			break;
2181 
2182 		// walk to find the next path component ("path" will point to a single
2183 		// path component), and filter out multiple slashes
2184 		for (nextPath = path + 1; *nextPath != '\0' && *nextPath != '/';
2185 				nextPath++);
2186 
2187 		bool directoryFound = false;
2188 		if (*nextPath == '/') {
2189 			directoryFound = true;
2190 			*nextPath = '\0';
2191 			do
2192 				nextPath++;
2193 			while (*nextPath == '/');
2194 		}
2195 
2196 		// If the '..' is at a covering vnode, move to the covered
2197 		// vnode, so we pass the '..' path to the underlying file system.
2198 		// Also prevent breaking out of the root of the IO context.
2199 		if (strcmp("..", path) == 0) {
2200 			if (vnode == ioContext->root) {
2201 				// Attempted prison break! Keep it contained.
2202 				path = nextPath;
2203 				continue;
2204 			}
2205 
2206 			if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2207 				nextVnode = coveredVnode;
2208 				put_vnode(vnode);
2209 				vnode = nextVnode;
2210 			}
2211 		}
2212 
2213 		// check if vnode is really a directory
2214 		if (status == B_OK && !S_ISDIR(vnode->Type()))
2215 			status = B_NOT_A_DIRECTORY;
2216 
2217 		// Check if we have the right to search the current directory vnode.
2218 		// If a file system doesn't have the access() function, we assume that
2219 		// searching a directory is always allowed
2220 		if (status == B_OK && HAS_FS_CALL(vnode, access))
2221 			status = FS_CALL(vnode, access, X_OK);
2222 
2223 		// Tell the filesystem to get the vnode of this path component (if we
2224 		// got the permission from the call above)
2225 		if (status == B_OK)
2226 			status = lookup_dir_entry(vnode, path, &nextVnode);
2227 
2228 		if (status != B_OK) {
2229 			put_vnode(vnode);
2230 			return status;
2231 		}
2232 
2233 		// If the new node is a symbolic link, resolve it (if we've been told
2234 		// to do it)
2235 		if (S_ISLNK(nextVnode->Type())
2236 			&& (traverseLeafLink || directoryFound)) {
2237 			size_t bufferSize;
2238 			char* buffer;
2239 
2240 			TRACE(("traverse link\n"));
2241 
2242 			// it's not exactly nice style using goto in this way, but hey,
2243 			// it works :-/
2244 			if (count + 1 > B_MAX_SYMLINKS) {
2245 				status = B_LINK_LIMIT;
2246 				goto resolve_link_error;
2247 			}
2248 
2249 			bufferSize = B_PATH_NAME_LENGTH;
2250 			buffer = (char*)object_cache_alloc(sPathNameCache, 0);
2251 			if (buffer == NULL) {
2252 				status = B_NO_MEMORY;
2253 				goto resolve_link_error;
2254 			}
2255 
2256 			if (HAS_FS_CALL(nextVnode, read_symlink)) {
2257 				bufferSize--;
2258 				status = FS_CALL(nextVnode, read_symlink, buffer, &bufferSize);
2259 				// null-terminate
2260 				if (status >= 0 && bufferSize < B_PATH_NAME_LENGTH)
2261 					buffer[bufferSize] = '\0';
2262 			} else
2263 				status = B_BAD_VALUE;
2264 
2265 			if (status != B_OK) {
2266 				object_cache_free(sPathNameCache, buffer, 0);
2267 
2268 		resolve_link_error:
2269 				put_vnode(vnode);
2270 				put_vnode(nextVnode);
2271 
2272 				return status;
2273 			}
2274 			put_vnode(nextVnode);
2275 
2276 			// Check if we start from the root directory or the current
2277 			// directory ("vnode" still points to that one).
2278 			// Cut off all leading slashes if it's the root directory
2279 			path = buffer;
2280 			bool absoluteSymlink = false;
2281 			if (path[0] == '/') {
2282 				// we don't need the old directory anymore
2283 				put_vnode(vnode);
2284 
2285 				while (*++path == '/')
2286 					;
2287 
2288 				mutex_lock(&sIOContextRootLock);
2289 				vnode = ioContext->root;
2290 				inc_vnode_ref_count(vnode);
2291 				mutex_unlock(&sIOContextRootLock);
2292 
2293 				absoluteSymlink = true;
2294 			}
2295 
2296 			inc_vnode_ref_count(vnode);
2297 				// balance the next recursion - we will decrement the
2298 				// ref_count of the vnode, no matter if we succeeded or not
2299 
2300 			if (absoluteSymlink && *path == '\0') {
2301 				// symlink was just "/"
2302 				nextVnode = vnode;
2303 			} else {
2304 				status = vnode_path_to_vnode(vnode, path, true, count + 1,
2305 					ioContext, &nextVnode, &lastParentID);
2306 			}
2307 
2308 			object_cache_free(sPathNameCache, buffer, 0);
2309 
2310 			if (status != B_OK) {
2311 				put_vnode(vnode);
2312 				return status;
2313 			}
2314 		} else
2315 			lastParentID = vnode->id;
2316 
2317 		// decrease the ref count on the old dir we just looked up into
2318 		put_vnode(vnode);
2319 
2320 		path = nextPath;
2321 		vnode = nextVnode;
2322 
2323 		// see if we hit a covered node
2324 		if (Vnode* coveringNode = get_covering_vnode(vnode)) {
2325 			put_vnode(vnode);
2326 			vnode = coveringNode;
2327 		}
2328 	}
2329 
2330 	*_vnode = vnode;
2331 	if (_parentID)
2332 		*_parentID = lastParentID;
2333 
2334 	return B_OK;
2335 }
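
/*!	Usage sketch (illustrative only): resolving a relative path from a known
	directory vnode. Note the extra reference, since vnode_path_to_vnode()
	always consumes one reference on the starting vnode:
	\code
	char path[] = "some/dir/../entry";	// will be clobbered
	struct vnode* result;
	inc_vnode_ref_count(dir);
	status_t status = vnode_path_to_vnode(dir, path, true, 0, kernel,
		&result, NULL);
	if (status == B_OK)
		put_vnode(result);
	\endcode
*/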
2336 
2337 
2338 static status_t
2339 vnode_path_to_vnode(struct vnode* vnode, char* path, bool traverseLeafLink,
2340 	int count, bool kernel, struct vnode** _vnode, ino_t* _parentID)
2341 {
2342 	return vnode_path_to_vnode(vnode, path, traverseLeafLink, count,
2343 		get_current_io_context(kernel), _vnode, _parentID);
2344 }
2345 
2346 
2347 static status_t
2348 path_to_vnode(char* path, bool traverseLink, struct vnode** _vnode,
2349 	ino_t* _parentID, bool kernel)
2350 {
2351 	struct vnode* start = NULL;
2352 
2353 	FUNCTION(("path_to_vnode(path = \"%s\")\n", path));
2354 
2355 	if (!path)
2356 		return B_BAD_VALUE;
2357 
2358 	if (*path == '\0')
2359 		return B_ENTRY_NOT_FOUND;
2360 
2361 	// figure out if we need to start at root or at cwd
2362 	if (*path == '/') {
2363 		if (sRoot == NULL) {
2364 			// we're a bit early, aren't we?
2365 			return B_ERROR;
2366 		}
2367 
2368 		while (*++path == '/')
2369 			;
2370 		start = get_root_vnode(kernel);
2371 
2372 		if (*path == '\0') {
2373 			*_vnode = start;
2374 			return B_OK;
2375 		}
2376 
2377 	} else {
2378 		struct io_context* context = get_current_io_context(kernel);
2379 
2380 		mutex_lock(&context->io_mutex);
2381 		start = context->cwd;
2382 		if (start != NULL)
2383 			inc_vnode_ref_count(start);
2384 		mutex_unlock(&context->io_mutex);
2385 
2386 		if (start == NULL)
2387 			return B_ERROR;
2388 	}
2389 
2390 	return vnode_path_to_vnode(start, path, traverseLink, 0, kernel, _vnode,
2391 		_parentID);
2392 }
2393 
2394 
2395 /*! Returns the vnode for the next to last segment of the path, and returns
2396 	the last portion in \a filename.
2397 	The path buffer must be able to store at least one additional character.
2398 */
2399 static status_t
2400 path_to_dir_vnode(char* path, struct vnode** _vnode, char* filename,
2401 	bool kernel)
2402 {
2403 	status_t status = get_dir_path_and_leaf(path, filename);
2404 	if (status != B_OK)
2405 		return status;
2406 
2407 	return path_to_vnode(path, true, _vnode, NULL, kernel);
2408 }
2409 
2410 
2411 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2412 		   to by a FD + path pair.
2413 
2414 	\a path must be given in either case. \a fd might be omitted, in which
2415 	case \a path is either an absolute path or one relative to the current
2416 	directory. If both a supplied and \a path is relative it is reckoned off
2417 	of the directory referred to by \a fd. If \a path is absolute \a fd is
2418 	ignored.
2419 
2420 	The caller has the responsibility to call put_vnode() on the returned
2421 	directory vnode.
2422 
2423 	\param fd The FD. May be < 0.
2424 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2425 	       is modified by this function. It must have at least room for a
2426 	       string one character longer than the path it contains.
2427 	\param _vnode A pointer to a variable the directory vnode shall be written
2428 		   into.
2429 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2430 		   the leaf name of the specified entry will be written.
2431 	\param kernel \c true, if invoked from inside the kernel, \c false if
2432 		   invoked from userland.
2433 	\return \c B_OK, if everything went fine, another error code otherwise.
2434 */
2435 static status_t
2436 fd_and_path_to_dir_vnode(int fd, char* path, struct vnode** _vnode,
2437 	char* filename, bool kernel)
2438 {
2439 	if (!path)
2440 		return B_BAD_VALUE;
2441 	if (*path == '\0')
2442 		return B_ENTRY_NOT_FOUND;
2443 	if (fd < 0)
2444 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2445 
2446 	status_t status = get_dir_path_and_leaf(path, filename);
2447 	if (status != B_OK)
2448 		return status;
2449 
2450 	return fd_and_path_to_vnode(fd, path, true, _vnode, NULL, kernel);
2451 }
2452 
2453 
2454 /*!	\brief Retrieves the directory vnode and the leaf name of an entry referred
2455 		   to by a vnode + path pair.
2456 
2457 	\a path must be given in either case. \a vnode might be omitted, in which
2458 	case \a path is either an absolute path or one relative to the current
2459 	directory. If both are supplied and \a path is relative, it is reckoned off
2460 	of the directory referred to by \a vnode. If \a path is absolute \a vnode is
2461 	ignored.
2462 
2463 	The caller has the responsibility to call put_vnode() on the returned
2464 	directory vnode.
2465 
2466 	\param vnode The vnode. May be \c NULL.
2467 	\param path The absolute or relative path. Must not be \c NULL. The buffer
2468 	       is modified by this function. It must have at least room for a
2469 	       string one character longer than the path it contains.
2470 	\param _vnode A pointer to a variable the directory vnode shall be written
2471 		   into.
2472 	\param filename A buffer of size B_FILE_NAME_LENGTH or larger into which
2473 		   the leaf name of the specified entry will be written.
2474 	\param kernel \c true, if invoked from inside the kernel, \c false if
2475 		   invoked from userland.
2476 	\return \c B_OK, if everything went fine, another error code otherwise.
2477 */
2478 static status_t
2479 vnode_and_path_to_dir_vnode(struct vnode* vnode, char* path,
2480 	struct vnode** _vnode, char* filename, bool kernel)
2481 {
2482 	if (!path)
2483 		return B_BAD_VALUE;
2484 	if (*path == '\0')
2485 		return B_ENTRY_NOT_FOUND;
2486 	if (vnode == NULL || path[0] == '/')
2487 		return path_to_dir_vnode(path, _vnode, filename, kernel);
2488 
2489 	status_t status = get_dir_path_and_leaf(path, filename);
2490 	if (status != B_OK)
2491 		return status;
2492 
2493 	inc_vnode_ref_count(vnode);
2494 		// vnode_path_to_vnode() always decrements the ref count
2495 
2496 	return vnode_path_to_vnode(vnode, path, true, 0, kernel, _vnode, NULL);
2497 }
2498 
2499 
2500 /*! Returns a vnode's name in the d_name field of a supplied dirent buffer.
2501 */
2502 static status_t
2503 get_vnode_name(struct vnode* vnode, struct vnode* parent, struct dirent* buffer,
2504 	size_t bufferSize, struct io_context* ioContext)
2505 {
2506 	if (bufferSize < sizeof(struct dirent))
2507 		return B_BAD_VALUE;
2508 
2509 	// See if the vnode is covering another vnode and move to the covered
2510 	// vnode so we get the underlying file system
2511 	VNodePutter vnodePutter;
2512 	if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2513 		vnode = coveredVnode;
2514 		vnodePutter.SetTo(vnode);
2515 	}
2516 
2517 	if (HAS_FS_CALL(vnode, get_vnode_name)) {
2518 		// The FS supports getting the name of a vnode.
2519 		if (FS_CALL(vnode, get_vnode_name, buffer->d_name,
2520 			(char*)buffer + bufferSize - buffer->d_name) == B_OK)
2521 			return B_OK;
2522 	}
2523 
2524 	// The FS doesn't support getting the name of a vnode. So we search the
2525 	// parent directory for the vnode, if the caller let us.
2526 
2527 	if (parent == NULL || !HAS_FS_CALL(parent, read_dir))
2528 		return B_UNSUPPORTED;
2529 
2530 	void* cookie;
2531 
2532 	status_t status = FS_CALL(parent, open_dir, &cookie);
2533 	if (status >= B_OK) {
2534 		while (true) {
2535 			uint32 num = 1;
2536 			// We use the FS hook directly instead of dir_read(), since we
2537 			// don't want the entries to be fixed up. We have already resolved
2538 			// vnode to the covered node.
2539 			status = FS_CALL(parent, read_dir, cookie, buffer, bufferSize,
2540 				&num);
2541 			if (status != B_OK)
2542 				break;
2543 			if (num == 0) {
2544 				status = B_ENTRY_NOT_FOUND;
2545 				break;
2546 			}
2547 
2548 			if (vnode->id == buffer->d_ino) {
2549 				// found correct entry!
2550 				break;
2551 			}
2552 		}
2553 
2554 		FS_CALL(parent, close_dir, cookie);
2555 		FS_CALL(parent, free_dir_cookie, cookie);
2556 	}
2557 	return status;
2558 }
2559 
2560 
2561 static status_t
2562 get_vnode_name(struct vnode* vnode, struct vnode* parent, char* name,
2563 	size_t nameSize, bool kernel)
2564 {
2565 	char buffer[offsetof(struct dirent, d_name) + B_FILE_NAME_LENGTH + 1];
2566 	struct dirent* dirent = (struct dirent*)buffer;
2567 
2568 	status_t status = get_vnode_name(vnode, parent, dirent, sizeof(buffer),
2569 		get_current_io_context(kernel));
2570 	if (status != B_OK)
2571 		return status;
2572 
2573 	if (strlcpy(name, dirent->d_name, nameSize) >= nameSize)
2574 		return B_BUFFER_OVERFLOW;
2575 
2576 	return B_OK;
2577 }
2578 
2579 
2580 /*!	Gets the full path to a given directory vnode.
2581 	It uses the fs_get_vnode_name() call to get the name of a vnode; if a
2582 	file system doesn't support this call, it will fall back to iterating
2583 	through the parent directory to get the name of the child.
2584 
2585 	To protect against circular loops, it supports a maximum tree depth
2586 	of 256 levels.
2587 
2588 	Note that the path may no longer be correct by the time this function
2589 	returns! It doesn't use any locking to ensure the returned path is correct,
2590 	as paths aren't stable anyway: the path to a file can change at any time.
2591 
2592 	It might be a good idea, though, to check in the calling function whether
2593 	the returned path exists (it's not done here for efficiency reasons).
2594 */
2595 static status_t
2596 dir_vnode_to_path(struct vnode* vnode, char* buffer, size_t bufferSize,
2597 	bool kernel)
2598 {
2599 	FUNCTION(("dir_vnode_to_path(%p, %p, %lu)\n", vnode, buffer, bufferSize));
2600 
2601 	if (vnode == NULL || buffer == NULL || bufferSize == 0)
2602 		return B_BAD_VALUE;
2603 
2604 	if (!S_ISDIR(vnode->Type()))
2605 		return B_NOT_A_DIRECTORY;
2606 
2607 	char* path = buffer;
2608 	int32 insert = bufferSize;
2609 	int32 maxLevel = 256;
2610 	int32 length;
2611 	status_t status = B_OK;
2612 	struct io_context* ioContext = get_current_io_context(kernel);
2613 
2614 	// we don't use get_vnode() here because this call is more
2615 	// efficient and does all we need from get_vnode()
2616 	inc_vnode_ref_count(vnode);
2617 
2618 	path[--insert] = '\0';
2619 		// the path is filled right to left
2620 
2621 	while (true) {
2622 		// If the node is the context's root, bail out. Otherwise resolve mount
2623 		// points.
2624 		if (vnode == ioContext->root)
2625 			break;
2626 
2627 		if (Vnode* coveredVnode = get_covered_vnode(vnode)) {
2628 			put_vnode(vnode);
2629 			vnode = coveredVnode;
2630 		}
2631 
2632 		// lookup the parent vnode
2633 		struct vnode* parentVnode;
2634 		status = lookup_dir_entry(vnode, "..", &parentVnode);
2635 		if (status != B_OK)
2636 			goto out;
2637 
2638 		if (parentVnode == vnode) {
2639 			// The caller apparently got their hands on a node outside of their
2640 			// context's root. Now we've hit the global root.
2641 			put_vnode(parentVnode);
2642 			break;
2643 		}
2644 
2645 		// get the node's name
2646 		char nameBuffer[offsetof(struct dirent, d_name) + B_FILE_NAME_LENGTH + 1];
2647 			// also used for fs_read_dir()
2648 		char* name = &((struct dirent*)nameBuffer)->d_name[0];
2649 		status = get_vnode_name(vnode, parentVnode, (struct dirent*)nameBuffer,
2650 			sizeof(nameBuffer), ioContext);
2651 
2652 		// release the current vnode, we only need its parent from now on
2653 		put_vnode(vnode);
2654 		vnode = parentVnode;
2655 
2656 		if (status != B_OK)
2657 			goto out;
2658 
2659 		// TODO: add an explicit check for loops in about 10 levels to do
2660 		// real loop detection
2661 
2662 		// don't go deeper than 'maxLevel' to prevent circular loops
2663 		if (maxLevel-- < 0) {
2664 			status = B_LINK_LIMIT;
2665 			goto out;
2666 		}
2667 
2668 		// add the name in front of the current path
2669 		name[B_FILE_NAME_LENGTH - 1] = '\0';
2670 		length = strlen(name);
2671 		insert -= length;
2672 		if (insert <= 0) {
2673 			status = B_RESULT_NOT_REPRESENTABLE;
2674 			goto out;
2675 		}
2676 		memcpy(path + insert, name, length);
2677 		path[--insert] = '/';
2678 	}
2679 
2680 	// the root dir will result in an empty path: fix it
2681 	if (path[insert] == '\0')
2682 		path[--insert] = '/';
2683 
2684 	TRACE(("  path is: %s\n", path + insert));
2685 
2686 	// move the path to the start of the buffer
2687 	length = bufferSize - insert;
2688 	memmove(buffer, path + insert, length);
2689 
2690 out:
2691 	put_vnode(vnode);
2692 	return status;
2693 }
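
/*!	Illustrative example: for a vnode at "/boot/home", the buffer is filled
	right to left as "home" -> "/home" -> "boot/home" -> "/boot/home", and
	the result is finally memmove()d to the start of the buffer.
*/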
2694 
2695 
2696 /*!	Checks the length of every path component, and adds a '.'
2697 	if the path ends in a slash.
2698 	The given path buffer must be able to store at least one
2699 	additional character.
2700 */
2701 static status_t
2702 check_path(char* to)
2703 {
2704 	int32 length = 0;
2705 
2706 	// check length of every path component
2707 
2708 	while (*to) {
2709 		char* begin;
2710 		if (*to == '/')
2711 			to++, length++;
2712 
2713 		begin = to;
2714 		while (*to != '/' && *to)
2715 			to++, length++;
2716 
2717 		if (to - begin > B_FILE_NAME_LENGTH)
2718 			return B_NAME_TOO_LONG;
2719 	}
2720 
2721 	if (length == 0)
2722 		return B_ENTRY_NOT_FOUND;
2723 
2724 	// complete path if there is a slash at the end
2725 
2726 	if (*(to - 1) == '/') {
2727 		if (length > B_PATH_NAME_LENGTH - 2)
2728 			return B_NAME_TOO_LONG;
2729 
2730 		to[0] = '.';
2731 		to[1] = '\0';
2732 	}
2733 
2734 	return B_OK;
2735 }
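
/*!	Examples (illustrative only) for check_path():
	- "/boot/home/" is completed to "/boot/home/." (trailing slash)
	- ""            yields B_ENTRY_NOT_FOUND
	- any single component longer than B_FILE_NAME_LENGTH yields
	  B_NAME_TOO_LONG
*/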
2736 
2737 
2738 static struct file_descriptor*
2739 get_fd_and_vnode(int fd, struct vnode** _vnode, bool kernel)
2740 {
2741 	struct file_descriptor* descriptor
2742 		= get_fd(get_current_io_context(kernel), fd);
2743 	if (descriptor == NULL)
2744 		return NULL;
2745 
2746 	struct vnode* vnode = fd_vnode(descriptor);
2747 	if (vnode == NULL) {
2748 		put_fd(descriptor);
2749 		return NULL;
2750 	}
2751 
2752 	// ToDo: when we can close a file descriptor at any point, investigate
2753 	//	if this is still valid to do (accessing the vnode without ref_count
2754 	//	or locking)
2755 	*_vnode = vnode;
2756 	return descriptor;
2757 }
2758 
2759 
2760 static struct vnode*
2761 get_vnode_from_fd(int fd, bool kernel)
2762 {
2763 	struct file_descriptor* descriptor;
2764 	struct vnode* vnode;
2765 
2766 	descriptor = get_fd(get_current_io_context(kernel), fd);
2767 	if (descriptor == NULL)
2768 		return NULL;
2769 
2770 	vnode = fd_vnode(descriptor);
2771 	if (vnode != NULL)
2772 		inc_vnode_ref_count(vnode);
2773 
2774 	put_fd(descriptor);
2775 	return vnode;
2776 }
2777 
2778 
2779 /*!	Gets the vnode from an FD + path combination. If \a fd is lower than zero,
2780 	only the path will be considered. In this case, the \a path must not be
2781 	NULL.
2782 	If \a fd is a valid file descriptor, \a path may be NULL for directories,
2783 	and should be NULL for files.
2784 */
2785 static status_t
2786 fd_and_path_to_vnode(int fd, char* path, bool traverseLeafLink,
2787 	struct vnode** _vnode, ino_t* _parentID, bool kernel)
2788 {
2789 	if (fd < 0 && !path)
2790 		return B_BAD_VALUE;
2791 
2792 	if (path != NULL && *path == '\0')
2793 		return B_ENTRY_NOT_FOUND;
2794 
2795 	if (fd < 0 || (path != NULL && path[0] == '/')) {
2796 		// no FD or absolute path
2797 		return path_to_vnode(path, traverseLeafLink, _vnode, _parentID, kernel);
2798 	}
2799 
2800 	// FD only, or FD + relative path
2801 	struct vnode* vnode = get_vnode_from_fd(fd, kernel);
2802 	if (vnode == NULL)
2803 		return B_FILE_ERROR;
2804 
2805 	if (path != NULL) {
2806 		return vnode_path_to_vnode(vnode, path, traverseLeafLink, 0, kernel,
2807 			_vnode, _parentID);
2808 	}
2809 
2810 	// there is no relative path to take into account
2811 
2812 	*_vnode = vnode;
2813 	if (_parentID)
2814 		*_parentID = -1;
2815 
2816 	return B_OK;
2817 }
2818 
2819 
2820 static int
2821 get_new_fd(int type, struct fs_mount* mount, struct vnode* vnode,
2822 	void* cookie, int openMode, bool kernel)
2823 {
2824 	struct file_descriptor* descriptor;
2825 	int fd;
2826 
2827 	// If the vnode is mandatory-locked, we don't allow creating a new
2828 	// file or directory file_descriptor for it
2829 	if (vnode && vnode->mandatory_locked_by != NULL
2830 		&& (type == FDTYPE_FILE || type == FDTYPE_DIR))
2831 		return B_BUSY;
2832 
2833 	if ((openMode & O_RDWR) != 0 && (openMode & O_WRONLY) != 0)
2834 		return B_BAD_VALUE;
2835 
2836 	descriptor = alloc_fd();
2837 	if (!descriptor)
2838 		return B_NO_MEMORY;
2839 
2840 	if (vnode)
2841 		descriptor->u.vnode = vnode;
2842 	else
2843 		descriptor->u.mount = mount;
2844 	descriptor->cookie = cookie;
2845 
2846 	switch (type) {
2847 		// vnode types
2848 		case FDTYPE_FILE:
2849 			descriptor->ops = &sFileOps;
2850 			break;
2851 		case FDTYPE_DIR:
2852 			descriptor->ops = &sDirectoryOps;
2853 			break;
2854 		case FDTYPE_ATTR:
2855 			descriptor->ops = &sAttributeOps;
2856 			break;
2857 		case FDTYPE_ATTR_DIR:
2858 			descriptor->ops = &sAttributeDirectoryOps;
2859 			break;
2860 
2861 		// mount types
2862 		case FDTYPE_INDEX_DIR:
2863 			descriptor->ops = &sIndexDirectoryOps;
2864 			break;
2865 		case FDTYPE_QUERY:
2866 			descriptor->ops = &sQueryOps;
2867 			break;
2868 
2869 		default:
2870 			panic("get_new_fd() called with unknown type %d\n", type);
2871 			break;
2872 	}
2873 	descriptor->type = type;
2874 	descriptor->open_mode = openMode;
2875 
2876 	io_context* context = get_current_io_context(kernel);
2877 	fd = new_fd(context, descriptor);
2878 	if (fd < 0) {
2879 		descriptor->ops = NULL;
2880 		put_fd(descriptor);
2881 		return B_NO_MORE_FDS;
2882 	}
2883 
2884 	mutex_lock(&context->io_mutex);
2885 	fd_set_close_on_exec(context, fd, (openMode & O_CLOEXEC) != 0);
2886 	mutex_unlock(&context->io_mutex);
2887 
2888 	return fd;
2889 }
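
/*!	Usage sketch (illustrative only): how an FS open hook result is turned
	into an FD for the caller, mirroring what the open paths in this file do:
	\code
	void* cookie;
	status_t status = FS_CALL(vnode, open, openMode, &cookie);
	if (status != B_OK)
		return status;
	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
	// a negative return value is an error code; the caller then has to
	// close the cookie and release the vnode again
	\endcode
*/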
2890 
2891 
2892 /*!	In-place normalizes \a path. It's otherwise semantically equivalent to
2893 	vfs_normalize_path(). See there for more documentation.
2894 */
2895 static status_t
2896 normalize_path(char* path, size_t pathSize, bool traverseLink, bool kernel)
2897 {
2898 	VNodePutter dirPutter;
2899 	struct vnode* dir = NULL;
2900 	status_t error;
2901 
2902 	for (int i = 0; i < B_MAX_SYMLINKS; i++) {
2903 		// get dir vnode + leaf name
2904 		struct vnode* nextDir;
2905 		char leaf[B_FILE_NAME_LENGTH];
2906 		error = vnode_and_path_to_dir_vnode(dir, path, &nextDir, leaf, kernel);
2907 		if (error != B_OK)
2908 			return error;
2909 
2910 		dir = nextDir;
2911 		strcpy(path, leaf);
2912 		dirPutter.SetTo(dir);
2913 
2914 		// get file vnode, if we shall resolve links
2915 		bool fileExists = false;
2916 		struct vnode* fileVnode;
2917 		VNodePutter fileVnodePutter;
2918 		if (traverseLink) {
2919 			inc_vnode_ref_count(dir);
2920 			if (vnode_path_to_vnode(dir, path, false, 0, kernel, &fileVnode,
2921 					NULL) == B_OK) {
2922 				fileVnodePutter.SetTo(fileVnode);
2923 				fileExists = true;
2924 			}
2925 		}
2926 
2927 		if (!fileExists || !traverseLink || !S_ISLNK(fileVnode->Type())) {
2928 			// we're done -- construct the path
2929 			bool hasLeaf = true;
2930 			if (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0) {
2931 				// special cases "." and ".." -- get the dir, forget the leaf
2932 				inc_vnode_ref_count(dir);
2933 				error = vnode_path_to_vnode(dir, leaf, false, 0, kernel,
2934 					&nextDir, NULL);
2935 				if (error != B_OK)
2936 					return error;
2937 				dir = nextDir;
2938 				dirPutter.SetTo(dir);
2939 				hasLeaf = false;
2940 			}
2941 
2942 			// get the directory path
2943 			error = dir_vnode_to_path(dir, path, B_PATH_NAME_LENGTH, kernel);
2944 			if (error != B_OK)
2945 				return error;
2946 
2947 			// append the leaf name
2948 			if (hasLeaf) {
2949 				// insert a directory separator if this is not the file system
2950 				// root
2951 				if ((strcmp(path, "/") != 0
2952 					&& strlcat(path, "/", pathSize) >= pathSize)
2953 					|| strlcat(path, leaf, pathSize) >= pathSize) {
2954 					return B_NAME_TOO_LONG;
2955 				}
2956 			}
2957 
2958 			return B_OK;
2959 		}
2960 
2961 		// read link
2962 		if (HAS_FS_CALL(fileVnode, read_symlink)) {
2963 			size_t bufferSize = B_PATH_NAME_LENGTH - 1;
2964 			error = FS_CALL(fileVnode, read_symlink, path, &bufferSize);
2965 			if (error != B_OK)
2966 				return error;
2967 			if (bufferSize < B_PATH_NAME_LENGTH)
2968 				path[bufferSize] = '\0';
2969 		} else
2970 			return B_BAD_VALUE;
2971 	}
2972 
2973 	return B_LINK_LIMIT;
2974 }
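
/*!	Illustrative example: normalize_path() turns "/boot/home/../home/./file"
	into "/boot/home/file" (assuming no symlinks are involved). With
	\a traverseLink == \c true a symlink leaf is resolved and the
	normalization restarts with its target, up to B_MAX_SYMLINKS times.
*/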
2975 
2976 
2977 static status_t
2978 resolve_covered_parent(struct vnode* parent, dev_t* _device, ino_t* _node,
2979 	struct io_context* ioContext)
2980 {
2981 	// Make sure the IO context root is not bypassed.
2982 	if (parent == ioContext->root) {
2983 		*_device = parent->device;
2984 		*_node = parent->id;
2985 		return B_OK;
2986 	}
2987 
2988 	inc_vnode_ref_count(parent);
2989 		// vnode_path_to_vnode() puts the node
2990 
2991 	// ".." is guaranteed not to be clobbered by this call
2992 	struct vnode* vnode;
2993 	status_t status = vnode_path_to_vnode(parent, (char*)"..", false, 0,
2994 		ioContext, &vnode, NULL);
2995 	if (status == B_OK) {
2996 		*_device = vnode->device;
2997 		*_node = vnode->id;
2998 		put_vnode(vnode);
2999 	}
3000 
3001 	return status;
3002 }
3003 
3004 
3005 #ifdef ADD_DEBUGGER_COMMANDS
3006 
3007 
3008 static void
3009 _dump_advisory_locking(advisory_locking* locking)
3010 {
3011 	if (locking == NULL)
3012 		return;
3013 
3014 	kprintf("   lock:        %" B_PRId32, locking->lock);
3015 	kprintf("   wait_sem:    %" B_PRId32, locking->wait_sem);
3016 
3017 	int32 index = 0;
3018 	LockList::Iterator iterator = locking->locks.GetIterator();
3019 	while (iterator.HasNext()) {
3020 		struct advisory_lock* lock = iterator.Next();
3021 
3022 		kprintf("   [%2" B_PRId32 "] team:   %" B_PRId32 "\n", index++, lock->team);
3023 		kprintf("        start:  %" B_PRIdOFF "\n", lock->start);
3024 		kprintf("        end:    %" B_PRIdOFF "\n", lock->end);
3025 		kprintf("        shared? %s\n", lock->shared ? "yes" : "no");
3026 	}
3027 }
3028 
3029 
3030 static void
3031 _dump_mount(struct fs_mount* mount)
3032 {
3033 	kprintf("MOUNT: %p\n", mount);
3034 	kprintf(" id:            %" B_PRIdDEV "\n", mount->id);
3035 	kprintf(" device_name:   %s\n", mount->device_name);
3036 	kprintf(" root_vnode:    %p\n", mount->root_vnode);
3037 	kprintf(" covers:        %p\n", mount->root_vnode->covers);
3038 	kprintf(" partition:     %p\n", mount->partition);
3039 	kprintf(" lock:          %p\n", &mount->lock);
3040 	kprintf(" flags:        %s%s\n", mount->unmounting ? " unmounting" : "",
3041 		mount->owns_file_device ? " owns_file_device" : "");
3042 
3043 	fs_volume* volume = mount->volume;
3044 	while (volume != NULL) {
3045 		kprintf(" volume %p:\n", volume);
3046 		kprintf("  layer:            %" B_PRId32 "\n", volume->layer);
3047 		kprintf("  private_volume:   %p\n", volume->private_volume);
3048 		kprintf("  ops:              %p\n", volume->ops);
3049 		kprintf("  file_system:      %p\n", volume->file_system);
3050 		kprintf("  file_system_name: %s\n", volume->file_system_name);
3051 		volume = volume->super_volume;
3052 	}
3053 
3054 	set_debug_variable("_volume", (addr_t)mount->volume->private_volume);
3055 	set_debug_variable("_root", (addr_t)mount->root_vnode);
3056 	set_debug_variable("_covers", (addr_t)mount->root_vnode->covers);
3057 	set_debug_variable("_partition", (addr_t)mount->partition);
3058 }
3059 
3060 
3061 static bool
3062 debug_prepend_vnode_name_to_path(char* buffer, size_t& bufferSize,
3063 	const char* name)
3064 {
3065 	bool insertSlash = buffer[bufferSize] != '\0';
3066 	size_t nameLength = strlen(name);
3067 
3068 	if (bufferSize < nameLength + (insertSlash ? 1 : 0))
3069 		return false;
3070 
3071 	if (insertSlash)
3072 		buffer[--bufferSize] = '/';
3073 
3074 	bufferSize -= nameLength;
3075 	memcpy(buffer + bufferSize, name, nameLength);
3076 
3077 	return true;
3078 }
3079 
3080 
3081 static bool
3082 debug_prepend_vnode_id_to_path(char* buffer, size_t& bufferSize, dev_t devID,
3083 	ino_t nodeID)
3084 {
3085 	if (bufferSize == 0)
3086 		return false;
3087 
3088 	bool insertSlash = buffer[bufferSize] != '\0';
3089 	if (insertSlash)
3090 		buffer[--bufferSize] = '/';
3091 
3092 	size_t size = snprintf(buffer, bufferSize,
3093 		"<%" B_PRIdDEV ",%" B_PRIdINO ">", devID, nodeID);
3094 	if (size > bufferSize) {
3095 		if (insertSlash)
3096 			bufferSize++;
3097 		return false;
3098 	}
3099 
3100 	if (size < bufferSize)
3101 		memmove(buffer + bufferSize - size, buffer, size);
3102 
3103 	bufferSize -= size;
3104 	return true;
3105 }
3106 
3107 
3108 static char*
3109 debug_resolve_vnode_path(struct vnode* vnode, char* buffer, size_t bufferSize,
3110 	bool& _truncated)
3111 {
3112 	// null-terminate the path
3113 	buffer[--bufferSize] = '\0';
3114 
3115 	while (true) {
3116 		while (vnode->covers != NULL)
3117 			vnode = vnode->covers;
3118 
3119 		if (vnode == sRoot) {
3120 			_truncated = bufferSize == 0;
3121 			if (!_truncated)
3122 				buffer[--bufferSize] = '/';
3123 			return buffer + bufferSize;
3124 		}
3125 
3126 		// resolve the name
3127 		ino_t dirID;
3128 		const char* name = vnode->mount->entry_cache.DebugReverseLookup(
3129 			vnode->id, dirID);
3130 		if (name == NULL) {
3131 			// Failed to resolve the name -- prepend "<dev,node>/".
3132 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3133 				vnode->mount->id, vnode->id);
3134 			return buffer + bufferSize;
3135 		}
3136 
3137 		// prepend the name
3138 		if (!debug_prepend_vnode_name_to_path(buffer, bufferSize, name)) {
3139 			_truncated = true;
3140 			return buffer + bufferSize;
3141 		}
3142 
3143 		// resolve the directory node
3144 		struct vnode* nextVnode = lookup_vnode(vnode->mount->id, dirID);
3145 		if (nextVnode == NULL) {
3146 			_truncated = !debug_prepend_vnode_id_to_path(buffer, bufferSize,
3147 				vnode->mount->id, dirID);
3148 			return buffer + bufferSize;
3149 		}
3150 
3151 		vnode = nextVnode;
3152 	}
3153 }
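
/*!	Illustrative example: for a vnode at "/boot/dir/file" the loop above
	builds the path right to left, "file" -> "dir/file" -> "boot/dir/file"
	-> "/boot/dir/file". If a name cannot be resolved from the entry cache,
	a "<dev,node>" placeholder is prepended instead.
*/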
3154 
3155 
3156 static void
3157 _dump_vnode(struct vnode* vnode, bool printPath)
3158 {
3159 	kprintf("VNODE: %p\n", vnode);
3160 	kprintf(" device:        %" B_PRIdDEV "\n", vnode->device);
3161 	kprintf(" id:            %" B_PRIdINO "\n", vnode->id);
3162 	kprintf(" ref_count:     %" B_PRId32 "\n", vnode->ref_count);
3163 	kprintf(" private_node:  %p\n", vnode->private_node);
3164 	kprintf(" mount:         %p\n", vnode->mount);
3165 	kprintf(" covered_by:    %p\n", vnode->covered_by);
3166 	kprintf(" covers:        %p\n", vnode->covers);
3167 	kprintf(" cache:         %p\n", vnode->cache);
3168 	kprintf(" type:          %#" B_PRIx32 "\n", vnode->Type());
3169 	kprintf(" flags:         %s%s%s\n", vnode->IsRemoved() ? "r" : "-",
3170 		vnode->IsBusy() ? "b" : "-", vnode->IsUnpublished() ? "u" : "-");
3171 	kprintf(" advisory_lock: %p\n", vnode->advisory_locking);
3172 
3173 	_dump_advisory_locking(vnode->advisory_locking);
3174 
3175 	if (printPath) {
3176 		void* buffer = debug_malloc(B_PATH_NAME_LENGTH);
3177 		if (buffer != NULL) {
3178 			bool truncated;
3179 			char* path = debug_resolve_vnode_path(vnode, (char*)buffer,
3180 				B_PATH_NAME_LENGTH, truncated);
3181 			if (path != NULL) {
3182 				kprintf(" path:          ");
3183 				if (truncated)
3184 					kputs("<truncated>/");
3185 				kputs(path);
3186 				kputs("\n");
3187 			} else
3188 				kprintf("Failed to resolve vnode path.\n");
3189 
3190 			debug_free(buffer);
3191 		} else
3192 			kprintf("Failed to allocate memory for constructing the path.\n");
3193 	}
3194 
3195 	set_debug_variable("_node", (addr_t)vnode->private_node);
3196 	set_debug_variable("_mount", (addr_t)vnode->mount);
3197 	set_debug_variable("_covered_by", (addr_t)vnode->covered_by);
3198 	set_debug_variable("_covers", (addr_t)vnode->covers);
3199 	set_debug_variable("_adv_lock", (addr_t)vnode->advisory_locking);
3200 }
3201 
3202 
3203 static int
3204 dump_mount(int argc, char** argv)
3205 {
3206 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3207 		kprintf("usage: %s [id|address]\n", argv[0]);
3208 		return 0;
3209 	}
3210 
3211 	ulong val = parse_expression(argv[1]);
3212 	uint32 id = val;
3213 
3214 	struct fs_mount* mount = sMountsTable->Lookup(id);
3215 	if (mount == NULL) {
3216 		if (IS_USER_ADDRESS(val)) {
3217 			kprintf("fs_mount not found\n");
3218 			return 0;
3219 		}
3220 		mount = (fs_mount*)val;
3221 	}
3222 
3223 	_dump_mount(mount);
3224 	return 0;
3225 }
3226 
3227 
3228 static int
3229 dump_mounts(int argc, char** argv)
3230 {
3231 	if (argc != 1) {
3232 		kprintf("usage: %s\n", argv[0]);
3233 		return 0;
3234 	}
3235 
3236 	kprintf("%-*s    id %-*s   %-*s   %-*s   fs_name\n",
3237 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "root",
3238 		B_PRINTF_POINTER_WIDTH, "covers", B_PRINTF_POINTER_WIDTH, "cookie");
3239 
3240 	struct fs_mount* mount;
3241 
3242 	MountTable::Iterator iterator(sMountsTable);
3243 	while (iterator.HasNext()) {
3244 		mount = iterator.Next();
3245 		kprintf("%p%4" B_PRIdDEV " %p %p %p %s\n", mount, mount->id, mount->root_vnode,
3246 			mount->root_vnode->covers, mount->volume->private_volume,
3247 			mount->volume->file_system_name);
3248 
3249 		fs_volume* volume = mount->volume;
3250 		while (volume->super_volume != NULL) {
3251 			volume = volume->super_volume;
3252 			kprintf("                                     %p %s\n",
3253 				volume->private_volume, volume->file_system_name);
3254 		}
3255 	}
3256 
3257 	return 0;
3258 }
3259 
3260 
3261 static int
3262 dump_vnode(int argc, char** argv)
3263 {
3264 	bool printPath = false;
3265 	int argi = 1;
3266 	if (argc >= 2 && strcmp(argv[argi], "-p") == 0) {
3267 		printPath = true;
3268 		argi++;
3269 	}
3270 
3271 	if (argi >= argc || argi + 2 < argc) {
3272 		print_debugger_command_usage(argv[0]);
3273 		return 0;
3274 	}
3275 
3276 	struct vnode* vnode = NULL;
3277 
3278 	if (argi + 1 == argc) {
3279 		vnode = (struct vnode*)parse_expression(argv[argi]);
3280 		if (IS_USER_ADDRESS(vnode)) {
3281 			kprintf("invalid vnode address\n");
3282 			return 0;
3283 		}
3284 		_dump_vnode(vnode, printPath);
3285 		return 0;
3286 	}
3287 
3288 	dev_t device = parse_expression(argv[argi]);
3289 	ino_t id = parse_expression(argv[argi + 1]);
3290 
3291 	VnodeTable::Iterator iterator(sVnodeTable);
3292 	while (iterator.HasNext()) {
3293 		vnode = iterator.Next();
3294 		if (vnode->id != id || vnode->device != device)
3295 			continue;
3296 
3297 		_dump_vnode(vnode, printPath);
3298 	}
3299 
3300 	return 0;
3301 }
3302 
3303 
3304 static int
3305 dump_vnodes(int argc, char** argv)
3306 {
3307 	if (argc != 2 || !strcmp(argv[1], "--help")) {
3308 		kprintf("usage: %s [device]\n", argv[0]);
3309 		return 0;
3310 	}
3311 
3312 	// restrict dumped nodes to the given device
3313 	dev_t device = parse_expression(argv[1]);
3314 
3315 	struct vnode* vnode;
3316 
3317 	kprintf("%-*s   dev     inode  ref %-*s   %-*s   %-*s   flags\n",
3318 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache",
3319 		B_PRINTF_POINTER_WIDTH, "fs-node", B_PRINTF_POINTER_WIDTH, "locking");
3320 
3321 	VnodeTable::Iterator iterator(sVnodeTable);
3322 	while (iterator.HasNext()) {
3323 		vnode = iterator.Next();
3324 		if (vnode->device != device)
3325 			continue;
3326 
3327 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO "%5" B_PRId32 " %p %p %p %s%s%s\n",
3328 			vnode, vnode->device, vnode->id, vnode->ref_count, vnode->cache,
3329 			vnode->private_node, vnode->advisory_locking,
3330 			vnode->IsRemoved() ? "r" : "-", vnode->IsBusy() ? "b" : "-",
3331 			vnode->IsUnpublished() ? "u" : "-");
3332 	}
3333 
3334 	return 0;
3335 }
3336 
3337 
3338 static int
3339 dump_vnode_caches(int argc, char** argv)
3340 {
3341 	struct vnode* vnode;
3342 
3343 	if (argc > 2 || !strcmp(argv[1], "--help")) {
3344 		kprintf("usage: %s [device]\n", argv[0]);
3345 		return 0;
3346 	}
3347 
3348 	// restrict dumped nodes to a certain device if requested
3349 	dev_t device = -1;
3350 	if (argc > 1)
3351 		device = parse_expression(argv[1]);
3352 
3353 	kprintf("%-*s   dev     inode %-*s       size   pages\n",
3354 		B_PRINTF_POINTER_WIDTH, "address", B_PRINTF_POINTER_WIDTH, "cache");
3355 
3356 	VnodeTable::Iterator iterator(sVnodeTable);
3357 	while (iterator.HasNext()) {
3358 		vnode = iterator.Next();
3359 		if (vnode->cache == NULL)
3360 			continue;
3361 		if (device != -1 && vnode->device != device)
3362 			continue;
3363 
3364 		kprintf("%p%4" B_PRIdDEV "%10" B_PRIdINO " %p %8" B_PRIdOFF "%8" B_PRId32 "\n",
3365 			vnode, vnode->device, vnode->id, vnode->cache,
3366 			(vnode->cache->virtual_end + B_PAGE_SIZE - 1) / B_PAGE_SIZE,
3367 			vnode->cache->page_count);
3368 	}
3369 
3370 	return 0;
3371 }
3372 
3373 
3374 int
3375 dump_io_context(int argc, char** argv)
3376 {
3377 	if (argc > 2 || !strcmp(argv[1], "--help")) {
3378 		kprintf("usage: %s [team-id|address]\n", argv[0]);
3379 		return 0;
3380 	}
3381 
3382 	struct io_context* context = NULL;
3383 
3384 	if (argc > 1) {
3385 		ulong num = parse_expression(argv[1]);
3386 		if (IS_KERNEL_ADDRESS(num))
3387 			context = (struct io_context*)num;
3388 		else {
3389 			Team* team = team_get_team_struct_locked(num);
3390 			if (team == NULL) {
3391 				kprintf("could not find team with ID %lu\n", num);
3392 				return 0;
3393 			}
3394 			context = (struct io_context*)team->io_context;
3395 		}
3396 	} else
3397 		context = get_current_io_context(true);
3398 
3399 	kprintf("I/O CONTEXT: %p\n", context);
3400 	kprintf(" root vnode:\t%p\n", context->root);
3401 	kprintf(" cwd vnode:\t%p\n", context->cwd);
3402 	kprintf(" used fds:\t%" B_PRIu32 "\n", context->num_used_fds);
3403 	kprintf(" max fds:\t%" B_PRIu32 "\n", context->table_size);
3404 
3405 	if (context->num_used_fds) {
3406 		kprintf("   no.  type    %*s  ref  open  mode         pos    %*s\n",
3407 			B_PRINTF_POINTER_WIDTH, "ops", B_PRINTF_POINTER_WIDTH, "cookie");
3408 	}
3409 
3410 	for (uint32 i = 0; i < context->table_size; i++) {
3411 		struct file_descriptor* fd = context->fds[i];
3412 		if (fd == NULL)
3413 			continue;
3414 
3415 		kprintf("  %3" B_PRIu32 ":  %4" B_PRId32 "  %p  %3" B_PRId32 "  %4"
3416 			B_PRIu32 "  %4" B_PRIx32 "  %10" B_PRIdOFF "  %p  %s %p\n", i,
3417 			fd->type, fd->ops, fd->ref_count, fd->open_count, fd->open_mode,
3418 			fd->pos, fd->cookie,
3419 			fd->type >= FDTYPE_INDEX && fd->type <= FDTYPE_QUERY
3420 				? "mount" : "vnode",
3421 			fd->u.vnode);
3422 	}
3423 
3424 	kprintf(" used monitors:\t%" B_PRIu32 "\n", context->num_monitors);
3425 	kprintf(" max monitors:\t%" B_PRIu32 "\n", context->max_monitors);
3426 
3427 	set_debug_variable("_cwd", (addr_t)context->cwd);
3428 
3429 	return 0;
3430 }
3431 
3432 
3433 int
3434 dump_vnode_usage(int argc, char** argv)
3435 {
3436 	if (argc != 1) {
3437 		kprintf("usage: %s\n", argv[0]);
3438 		return 0;
3439 	}
3440 
3441 	kprintf("Unused vnodes: %" B_PRIu32 " (max unused %" B_PRIu32 ")\n",
3442 		sUnusedVnodes, kMaxUnusedVnodes);
3443 
3444 	uint32 count = sVnodeTable->CountElements();
3445 
3446 	kprintf("%" B_PRIu32 " vnodes total (%" B_PRIu32 " in use).\n", count,
3447 		count - sUnusedVnodes);
3448 	return 0;
3449 }
3450 
3451 #endif	// ADD_DEBUGGER_COMMANDS
3452 
3453 
3454 /*!	Clears memory specified by an iovec array.
3455 */
3456 static void
3457 zero_iovecs(const iovec* vecs, size_t vecCount, size_t bytes)
3458 {
3459 	for (size_t i = 0; i < vecCount && bytes > 0; i++) {
3460 		size_t length = std::min(vecs[i].iov_len, bytes);
3461 		memset(vecs[i].iov_base, 0, length);
3462 		bytes -= length;
3463 	}
3464 }
3465 
3466 
3467 /*!	Does the dirty work of combining the file_io_vecs with the iovecs
3468 	and calls the file system hooks to read/write the request to disk.
3469 */
3470 static status_t
3471 common_file_io_vec_pages(struct vnode* vnode, void* cookie,
3472 	const file_io_vec* fileVecs, size_t fileVecCount, const iovec* vecs,
3473 	size_t vecCount, uint32* _vecIndex, size_t* _vecOffset, size_t* _numBytes,
3474 	bool doWrite)
3475 {
3476 	if (fileVecCount == 0) {
3477 		// There are no file vecs at this offset, so we're obviously trying
3478 		// to access the file outside of its bounds
3479 		return B_BAD_VALUE;
3480 	}
3481 
3482 	size_t numBytes = *_numBytes;
3483 	uint32 fileVecIndex;
3484 	size_t vecOffset = *_vecOffset;
3485 	uint32 vecIndex = *_vecIndex;
3486 	status_t status;
3487 	size_t size;
3488 
3489 	if (!doWrite && vecOffset == 0) {
3490 		// now directly read the data from the device
3491 		// the first file_io_vec can be read directly
3492 		// TODO: we could also write directly
3493 
3494 		if (fileVecs[0].length < (off_t)numBytes)
3495 			size = fileVecs[0].length;
3496 		else
3497 			size = numBytes;
3498 
3499 		if (fileVecs[0].offset >= 0) {
3500 			status = FS_CALL(vnode, read_pages, cookie, fileVecs[0].offset,
3501 				&vecs[vecIndex], vecCount - vecIndex, &size);
3502 		} else {
3503 			// sparse read
3504 			zero_iovecs(&vecs[vecIndex], vecCount - vecIndex, size);
3505 			status = B_OK;
3506 		}
3507 		if (status != B_OK)
3508 			return status;
3509 
3510 		ASSERT((off_t)size <= fileVecs[0].length);
3511 
3512 		// If the file portion was contiguous, we're already done now
3513 		if (size == numBytes)
3514 			return B_OK;
3515 
3516 		// if we reached the end of the file, we can return as well
3517 		if ((off_t)size != fileVecs[0].length) {
3518 			*_numBytes = size;
3519 			return B_OK;
3520 		}
3521 
3522 		fileVecIndex = 1;
3523 
3524 		// first, find out where we have to continue in our iovecs
3525 		for (; vecIndex < vecCount; vecIndex++) {
3526 			if (size < vecs[vecIndex].iov_len)
3527 				break;
3528 
3529 			size -= vecs[vecIndex].iov_len;
3530 		}
3531 
3532 		vecOffset = size;
3533 	} else {
3534 		fileVecIndex = 0;
3535 		size = 0;
3536 	}
3537 
3538 	// Too bad, let's process the rest of the file_io_vecs
3539 
3540 	size_t totalSize = size;
3541 	size_t bytesLeft = numBytes - size;
3542 
3543 	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
3544 		const file_io_vec &fileVec = fileVecs[fileVecIndex];
3545 		off_t fileOffset = fileVec.offset;
3546 		off_t fileLeft = min_c(fileVec.length, (off_t)bytesLeft);
3547 
3548 		TRACE(("FILE VEC [%" B_PRIu32 "] length %" B_PRIdOFF "\n", fileVecIndex,
3549 			fileLeft));
3550 
3551 		// process the complete fileVec
3552 		while (fileLeft > 0) {
3553 			iovec tempVecs[MAX_TEMP_IO_VECS];
3554 			uint32 tempCount = 0;
3555 
3556 			// size tracks how much of what is left of the current fileVec
3557 			// (fileLeft) has been assigned to tempVecs
3558 			size = 0;
3559 
3560 			// assign what is left of the current fileVec to the tempVecs
3561 			for (size = 0; (off_t)size < fileLeft && vecIndex < vecCount
3562 					&& tempCount < MAX_TEMP_IO_VECS;) {
3563 				// try to satisfy one iovec per iteration (or as much as
3564 				// possible)
3565 
3566 				// bytes left of the current iovec
3567 				size_t vecLeft = vecs[vecIndex].iov_len - vecOffset;
3568 				if (vecLeft == 0) {
3569 					vecOffset = 0;
3570 					vecIndex++;
3571 					continue;
3572 				}
3573 
3574 				TRACE(("fill vec %" B_PRIu32 ", offset = %lu, size = %lu\n",
3575 					vecIndex, vecOffset, size));
3576 
3577 				// actually available bytes
3578 				size_t tempVecSize = min_c(vecLeft, fileLeft - size);
3579 
3580 				tempVecs[tempCount].iov_base
3581 					= (void*)((addr_t)vecs[vecIndex].iov_base + vecOffset);
3582 				tempVecs[tempCount].iov_len = tempVecSize;
3583 				tempCount++;
3584 
3585 				size += tempVecSize;
3586 				vecOffset += tempVecSize;
3587 			}
3588 
3589 			size_t bytes = size;
3590 
3591 			if (fileOffset == -1) {
3592 				if (doWrite) {
3593 					panic("sparse write attempt: vnode %p", vnode);
3594 					status = B_IO_ERROR;
3595 				} else {
3596 					// sparse read
3597 					zero_iovecs(tempVecs, tempCount, bytes);
3598 					status = B_OK;
3599 				}
3600 			} else if (doWrite) {
3601 				status = FS_CALL(vnode, write_pages, cookie, fileOffset,
3602 					tempVecs, tempCount, &bytes);
3603 			} else {
3604 				status = FS_CALL(vnode, read_pages, cookie, fileOffset,
3605 					tempVecs, tempCount, &bytes);
3606 			}
3607 			if (status != B_OK)
3608 				return status;
3609 
3610 			totalSize += bytes;
3611 			bytesLeft -= size;
3612 			if (fileOffset >= 0)
3613 				fileOffset += size;
3614 			fileLeft -= size;
3615 			//dprintf("-> file left = %Lu\n", fileLeft);
3616 
3617 			if (size != bytes || vecIndex >= vecCount) {
3618 				// there are no more bytes or iovecs, let's bail out
3619 				*_numBytes = totalSize;
3620 				return B_OK;
3621 			}
3622 		}
3623 	}
3624 
3625 	*_vecIndex = vecIndex;
3626 	*_vecOffset = vecOffset;
3627 	*_numBytes = totalSize;
3628 	return B_OK;
3629 }
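
/*!	Worked example (illustrative only): a 12 KiB read described by the
	file_io_vecs { offset 100 KiB, length 8 KiB } and { offset 200 KiB,
	length 4 KiB }, targeting three 4 KiB iovecs. The first file vec is
	read directly via read_pages() into the first two iovecs; the
	remaining 4 KiB are then mapped onto the third iovec via tempVecs and
	read from offset 200 KiB. A file_io_vec offset of -1 denotes a sparse
	region, which is zeroed instead of read.
*/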
3630 
3631 
3632 static bool
3633 is_user_in_group(gid_t gid)
3634 {
3635 	if (gid == getegid())
3636 		return true;
3637 
3638 	gid_t groups[NGROUPS_MAX];
3639 	int groupCount = getgroups(NGROUPS_MAX, groups);
3640 	for (int i = 0; i < groupCount; i++) {
3641 		if (gid == groups[i])
3642 			return true;
3643 	}
3644 
3645 	return false;
3646 }
3647 
3648 
3649 static status_t
3650 free_io_context(io_context* context)
3651 {
3652 	uint32 i;
3653 
3654 	TIOC(FreeIOContext(context));
3655 
3656 	if (context->root)
3657 		put_vnode(context->root);
3658 
3659 	if (context->cwd)
3660 		put_vnode(context->cwd);
3661 
3662 	mutex_lock(&context->io_mutex);
3663 
3664 	for (i = 0; i < context->table_size; i++) {
3665 		if (struct file_descriptor* descriptor = context->fds[i]) {
3666 			close_fd(context, descriptor);
3667 			put_fd(descriptor);
3668 		}
3669 	}
3670 
3671 	mutex_destroy(&context->io_mutex);
3672 
3673 	remove_node_monitors(context);
3674 	free(context->fds);
3675 	free(context);
3676 
3677 	return B_OK;
3678 }
3679 
3680 
3681 static status_t
3682 resize_monitor_table(struct io_context* context, const int newSize)
3683 {
3684 	status_t status = B_OK;
3685 
3686 	if (newSize <= 0 || newSize > MAX_NODE_MONITORS)
3687 		return B_BAD_VALUE;
3688 
3689 	mutex_lock(&context->io_mutex);
3690 
3691 	if ((size_t)newSize < context->num_monitors) {
3692 		status = B_BUSY;
3693 		goto out;
3694 	}
3695 	context->max_monitors = newSize;
3696 
3697 out:
3698 	mutex_unlock(&context->io_mutex);
3699 	return status;
3700 }
3701 
3702 
3703 //	#pragma mark - public API for file systems
3704 
3705 
3706 extern "C" status_t
3707 new_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3708 	fs_vnode_ops* ops)
3709 {
3710 	FUNCTION(("new_vnode(volume = %p (%" B_PRId32 "), vnodeID = %" B_PRId64
3711 		", node = %p)\n", volume, volume->id, vnodeID, privateNode));
3712 
3713 	if (privateNode == NULL)
3714 		return B_BAD_VALUE;
3715 
3716 	int32 tries = BUSY_VNODE_RETRIES;
3717 restart:
3718 	// create the node
3719 	bool nodeCreated;
3720 	struct vnode* vnode;
3721 	status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3722 		nodeCreated);
3723 	if (status != B_OK)
3724 		return status;
3725 
3726 	WriteLocker nodeLocker(sVnodeLock, true);
3727 		// create_new_vnode_and_lock() has locked for us
3728 
3729 	if (!nodeCreated && vnode->IsBusy()) {
3730 		nodeLocker.Unlock();
3731 		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3732 			return B_BUSY;
3733 		goto restart;
3734 	}
3735 
3736 	// file system integrity check:
3737 	// test if the vnode already exists and bail out if this is the case!
3738 	if (!nodeCreated) {
3739 		panic("vnode %" B_PRIdDEV ":%" B_PRIdINO " already exists (node = %p, "
3740 			"vnode->node = %p)!", volume->id, vnodeID, privateNode,
3741 			vnode->private_node);
3742 		return B_ERROR;
3743 	}
3744 
3745 	vnode->private_node = privateNode;
3746 	vnode->ops = ops;
3747 	vnode->SetUnpublished(true);
3748 
3749 	TRACE(("returns: %s\n", strerror(status)));
3750 
3751 	return status;
3752 }
3753 
3754 
3755 extern "C" status_t
3756 publish_vnode(fs_volume* volume, ino_t vnodeID, void* privateNode,
3757 	fs_vnode_ops* ops, int type, uint32 flags)
3758 {
3759 	FUNCTION(("publish_vnode()\n"));
3760 
3761 	int32 tries = BUSY_VNODE_RETRIES;
3762 restart:
3763 	WriteLocker locker(sVnodeLock);
3764 
3765 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3766 
3767 	bool nodeCreated = false;
3768 	if (vnode == NULL) {
3769 		if (privateNode == NULL)
3770 			return B_BAD_VALUE;
3771 
3772 		// create the node
3773 		locker.Unlock();
3774 			// create_new_vnode_and_lock() will re-lock for us on success
3775 		status_t status = create_new_vnode_and_lock(volume->id, vnodeID, vnode,
3776 			nodeCreated);
3777 		if (status != B_OK)
3778 			return status;
3779 
3780 		locker.SetTo(sVnodeLock, true);
3781 	}
3782 
3783 	if (nodeCreated) {
3784 		vnode->private_node = privateNode;
3785 		vnode->ops = ops;
3786 		vnode->SetUnpublished(true);
3787 	} else if (vnode->IsBusy() && vnode->IsUnpublished()
3788 		&& vnode->private_node == privateNode && vnode->ops == ops) {
3789 		// already known, but not published
3790 	} else if (vnode->IsBusy()) {
3791 		locker.Unlock();
3792 		if (!retry_busy_vnode(tries, volume->id, vnodeID))
3793 			return B_BUSY;
3794 		goto restart;
3795 	} else
3796 		return B_BAD_VALUE;
3797 
3798 	bool publishSpecialSubNode = false;
3799 
3800 	vnode->SetType(type);
3801 	vnode->SetRemoved((flags & B_VNODE_PUBLISH_REMOVED) != 0);
3802 	publishSpecialSubNode = is_special_node_type(type)
3803 		&& (flags & B_VNODE_DONT_CREATE_SPECIAL_SUB_NODE) == 0;
3804 
3805 	status_t status = B_OK;
3806 
3807 	// create sub vnodes, if necessary
3808 	if (volume->sub_volume != NULL || publishSpecialSubNode) {
3809 		locker.Unlock();
3810 
3811 		fs_volume* subVolume = volume;
3812 		if (volume->sub_volume != NULL) {
3813 			while (status == B_OK && subVolume->sub_volume != NULL) {
3814 				subVolume = subVolume->sub_volume;
3815 				status = subVolume->ops->create_sub_vnode(subVolume, vnodeID,
3816 					vnode);
3817 			}
3818 		}
3819 
3820 		if (status == B_OK && publishSpecialSubNode)
3821 			status = create_special_sub_node(vnode, flags);
3822 
3823 		if (status != B_OK) {
3824 			// error -- clean up the created sub vnodes
3825 			while (subVolume->super_volume != volume) {
3826 				subVolume = subVolume->super_volume;
3827 				subVolume->ops->delete_sub_vnode(subVolume, vnode);
3828 			}
3829 		}
3830 
3831 		if (status == B_OK) {
3832 			ReadLocker vnodesReadLocker(sVnodeLock);
3833 			AutoLocker<Vnode> nodeLocker(vnode);
3834 			vnode->SetBusy(false);
3835 			vnode->SetUnpublished(false);
3836 		} else {
3837 			locker.Lock();
3838 			sVnodeTable->Remove(vnode);
3839 			remove_vnode_from_mount_list(vnode, vnode->mount);
3840 			object_cache_free(sVnodeCache, vnode, 0);
3841 		}
3842 	} else {
3843 		// we still hold the write lock -- mark the node unbusy and published
3844 		vnode->SetBusy(false);
3845 		vnode->SetUnpublished(false);
3846 	}
3847 
3848 	TRACE(("returns: %s\n", strerror(status)));
3849 
3850 	return status;
3851 }
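
/*	A minimal sketch of the usual two-step protocol from a file system's
	point of view (the myfs_* names, MyFSNode, and gMyFSVnodeOps are
	hypothetical; error handling trimmed): new_vnode() makes the ID known
	-- busy and unpublished -- so that concurrent lookups of it block, and
	publish_vnode() makes the node visible once it is fully set up.

		status_t
		myfs_create_node(fs_volume* volume, MyFSNode* node)
		{
			status_t status = new_vnode(volume, node->id, node,
				&gMyFSVnodeOps);
			if (status != B_OK)
				return status;

			// ... finish setting up on-disk and in-memory state ...

			return publish_vnode(volume, node->id, node, &gMyFSVnodeOps,
				S_IFREG, 0);
		}

	Alternatively publish_vnode() can be called on its own; as seen above,
	it creates the vnode itself if it doesn't exist yet.
*/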
3852 
3853 
3854 extern "C" status_t
3855 get_vnode(fs_volume* volume, ino_t vnodeID, void** _privateNode)
3856 {
3857 	struct vnode* vnode;
3858 
3859 	if (volume == NULL)
3860 		return B_BAD_VALUE;
3861 
3862 	status_t status = get_vnode(volume->id, vnodeID, &vnode, true, true);
3863 	if (status != B_OK)
3864 		return status;
3865 
3866 	// If this is a layered FS, we need to get the node cookie for the requested
3867 	// layer.
3868 	if (HAS_FS_CALL(vnode, get_super_vnode)) {
3869 		fs_vnode resolvedNode;
3870 		status_t status = FS_CALL(vnode, get_super_vnode, volume,
3871 			&resolvedNode);
3872 		if (status != B_OK) {
3873 			panic("get_vnode(): Failed to get super node for vnode %p, "
3874 				"volume: %p", vnode, volume);
3875 			put_vnode(vnode);
3876 			return status;
3877 		}
3878 
3879 		if (_privateNode != NULL)
3880 			*_privateNode = resolvedNode.private_node;
3881 	} else if (_privateNode != NULL)
3882 		*_privateNode = vnode->private_node;
3883 
3884 	return B_OK;
3885 }
3886 
3887 
3888 extern "C" status_t
3889 acquire_vnode(fs_volume* volume, ino_t vnodeID)
3890 {
3891 	ReadLocker nodeLocker(sVnodeLock);
3892 
3893 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3894 	if (vnode == NULL)
3895 		return B_BAD_VALUE;
3896 
3897 	inc_vnode_ref_count(vnode);
3898 	return B_OK;
3899 }
3900 
3901 
3902 extern "C" status_t
3903 put_vnode(fs_volume* volume, ino_t vnodeID)
3904 {
3905 	struct vnode* vnode;
3906 
3907 	rw_lock_read_lock(&sVnodeLock);
3908 	vnode = lookup_vnode(volume->id, vnodeID);
3909 	rw_lock_read_unlock(&sVnodeLock);
3910 
3911 	if (vnode == NULL)
3912 		return B_BAD_VALUE;
3913 
3914 	dec_vnode_ref_count(vnode, false, true);
3915 	return B_OK;
3916 }
3917 
3918 
3919 extern "C" status_t
3920 remove_vnode(fs_volume* volume, ino_t vnodeID)
3921 {
3922 	ReadLocker locker(sVnodeLock);
3923 
3924 	struct vnode* vnode = lookup_vnode(volume->id, vnodeID);
3925 	if (vnode == NULL)
3926 		return B_ENTRY_NOT_FOUND;
3927 
3928 	if (vnode->covered_by != NULL || vnode->covers != NULL) {
3929 		// this vnode is in use
3930 		return B_BUSY;
3931 	}
3932 
3933 	vnode->Lock();
3934 
3935 	vnode->SetRemoved(true);
3936 	bool removeUnpublished = false;
3937 
3938 	if (vnode->IsUnpublished()) {
3939 		// prepare the vnode for deletion
3940 		removeUnpublished = true;
3941 		vnode->SetBusy(true);
3942 	}
3943 
3944 	vnode->Unlock();
3945 	locker.Unlock();
3946 
3947 	if (removeUnpublished) {
3948 		// If the vnode hasn't been published yet, we delete it here
3949 		atomic_add(&vnode->ref_count, -1);
3950 		free_vnode(vnode, true);
3951 	}
3952 
3953 	return B_OK;
3954 }
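
/*	Typical use from a file system's unlink/remove path (myfs_unlink and
	inodeID are hypothetical; only the remove_vnode() call is taken from
	the API above): the FS marks the node for deletion here, and the node
	is actually destroyed once the last reference to it has been released.

		status_t
		myfs_unlink(fs_volume* volume, fs_vnode* dir, const char* name)
		{
			// ... remove the directory entry, determine inodeID ...
			return remove_vnode(volume, inodeID);
		}
*/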
3955 
3956 
3957 extern "C" status_t
3958 unremove_vnode(fs_volume* volume, ino_t vnodeID)
3959 {
3960 	struct vnode* vnode;
3961 
3962 	rw_lock_read_lock(&sVnodeLock);
3963 
3964 	vnode = lookup_vnode(volume->id, vnodeID);
3965 	if (vnode) {
3966 		AutoLocker<Vnode> nodeLocker(vnode);
3967 		vnode->SetRemoved(false);
3968 	}
3969 
3970 	rw_lock_read_unlock(&sVnodeLock);
3971 	return B_OK;
3972 }
3973 
3974 
3975 extern "C" status_t
3976 get_vnode_removed(fs_volume* volume, ino_t vnodeID, bool* _removed)
3977 {
3978 	ReadLocker _(sVnodeLock);
3979 
3980 	if (struct vnode* vnode = lookup_vnode(volume->id, vnodeID)) {
3981 		if (_removed != NULL)
3982 			*_removed = vnode->IsRemoved();
3983 		return B_OK;
3984 	}
3985 
3986 	return B_BAD_VALUE;
3987 }
3988 
3989 
3990 extern "C" fs_volume*
3991 volume_for_vnode(fs_vnode* _vnode)
3992 {
3993 	if (_vnode == NULL)
3994 		return NULL;
3995 
3996 	struct vnode* vnode = static_cast<struct vnode*>(_vnode);
3997 	return vnode->mount->volume;
3998 }
3999 
4000 
4001 extern "C" status_t
4002 check_access_permissions(int accessMode, mode_t mode, gid_t nodeGroupID,
4003 	uid_t nodeUserID)
4004 {
4005 	// get node permissions
4006 	int userPermissions = (mode & S_IRWXU) >> 6;
4007 	int groupPermissions = (mode & S_IRWXG) >> 3;
4008 	int otherPermissions = mode & S_IRWXO;
4009 
4010 	// get the node permissions for this uid/gid
4011 	int permissions = 0;
4012 	uid_t uid = geteuid();
4013 
4014 	if (uid == 0) {
4015 		// user is root
4016 		// root always has read/write permission, but at least one of the
4017 		// X bits must be set for execute permission
4018 		permissions = userPermissions | groupPermissions | otherPermissions
4019 			| S_IROTH | S_IWOTH;
4020 		if (S_ISDIR(mode))
4021 			permissions |= S_IXOTH;
4022 	} else if (uid == nodeUserID) {
4023 		// user is node owner
4024 		permissions = userPermissions;
4025 	} else if (is_user_in_group(nodeGroupID)) {
4026 		// user is in owning group
4027 		permissions = groupPermissions;
4028 	} else {
4029 		// user is one of the others
4030 		permissions = otherPermissions;
4031 	}
4032 
4033 	return (accessMode & ~permissions) == 0 ? B_OK : B_PERMISSION_DENIED;
4034 }
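
/*	Worked example (hypothetical IDs): for a node with mode 0640, owner
	UID 1000 and group GID 100, a caller whose effective UID is 1000 takes
	the owner branch, i.e. permissions = (0640 & S_IRWXU) >> 6 = 6 (rw-):

		check_access_permissions(R_OK | W_OK, 0640, 100, 1000);
			// B_OK: the requested bits (6) are a subset of 6
		check_access_permissions(X_OK, 0640, 100, 1000);
			// B_PERMISSION_DENIED: the owner's x bit is not set

	This works because the R_OK/W_OK/X_OK access mode bits (4/2/1) line up
	with the per-class rwx bits after the shifts above.
*/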
4035 
4036 
4037 #if 0
4038 extern "C" status_t
4039 read_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4040 	size_t* _numBytes)
4041 {
4042 	struct file_descriptor* descriptor;
4043 	struct vnode* vnode;
4044 
4045 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4046 	if (descriptor == NULL)
4047 		return B_FILE_ERROR;
4048 
4049 	status_t status = vfs_read_pages(vnode, descriptor->cookie, pos, vecs,
4050 		count, 0, _numBytes);
4051 
4052 	put_fd(descriptor);
4053 	return status;
4054 }
4055 
4056 
4057 extern "C" status_t
4058 write_pages(int fd, off_t pos, const iovec* vecs, size_t count,
4059 	size_t* _numBytes)
4060 {
4061 	struct file_descriptor* descriptor;
4062 	struct vnode* vnode;
4063 
4064 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4065 	if (descriptor == NULL)
4066 		return B_FILE_ERROR;
4067 
4068 	status_t status = vfs_write_pages(vnode, descriptor->cookie, pos, vecs,
4069 		count, 0, _numBytes);
4070 
4071 	put_fd(descriptor);
4072 	return status;
4073 }
4074 #endif
4075 
4076 
4077 extern "C" status_t
4078 read_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4079 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4080 	size_t* _bytes)
4081 {
4082 	struct file_descriptor* descriptor;
4083 	struct vnode* vnode;
4084 
4085 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4086 	if (descriptor == NULL)
4087 		return B_FILE_ERROR;
4088 
4089 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4090 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4091 		false);
4092 
4093 	put_fd(descriptor);
4094 	return status;
4095 }
4096 
4097 
4098 extern "C" status_t
4099 write_file_io_vec_pages(int fd, const file_io_vec* fileVecs, size_t fileVecCount,
4100 	const iovec* vecs, size_t vecCount, uint32* _vecIndex, size_t* _vecOffset,
4101 	size_t* _bytes)
4102 {
4103 	struct file_descriptor* descriptor;
4104 	struct vnode* vnode;
4105 
4106 	descriptor = get_fd_and_vnode(fd, &vnode, true);
4107 	if (descriptor == NULL)
4108 		return B_FILE_ERROR;
4109 
4110 	status_t status = common_file_io_vec_pages(vnode, descriptor->cookie,
4111 		fileVecs, fileVecCount, vecs, vecCount, _vecIndex, _vecOffset, _bytes,
4112 		true);
4113 
4114 	put_fd(descriptor);
4115 	return status;
4116 }
4117 
4118 
4119 extern "C" status_t
4120 entry_cache_add(dev_t mountID, ino_t dirID, const char* name, ino_t nodeID)
4121 {
4122 	// lookup mount -- the caller is required to make sure that the mount
4123 	// won't go away
4124 	ReadLocker locker(sMountLock);
4125 	struct fs_mount* mount = find_mount(mountID);
4126 	if (mount == NULL)
4127 		return B_BAD_VALUE;
4128 	locker.Unlock();
4129 
4130 	return mount->entry_cache.Add(dirID, name, nodeID, false);
4131 }
4132 
4133 
4134 extern "C" status_t
4135 entry_cache_add_missing(dev_t mountID, ino_t dirID, const char* name)
4136 {
4137 	// lookup mount -- the caller is required to make sure that the mount
4138 	// won't go away
4139 	ReadLocker locker(sMountLock);
4140 	struct fs_mount* mount = find_mount(mountID);
4141 	if (mount == NULL)
4142 		return B_BAD_VALUE;
4143 	locker.Unlock();
4144 
4145 	return mount->entry_cache.Add(dirID, name, -1, true);
4146 }
4147 
4148 
4149 extern "C" status_t
4150 entry_cache_remove(dev_t mountID, ino_t dirID, const char* name)
4151 {
4152 	// lookup mount -- the caller is required to make sure that the mount
4153 	// won't go away
4154 	ReadLocker locker(sMountLock);
4155 	struct fs_mount* mount = find_mount(mountID);
4156 	if (mount == NULL)
4157 		return B_BAD_VALUE;
4158 	locker.Unlock();
4159 
4160 	return mount->entry_cache.Remove(dirID, name);
4161 }
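
/*	File systems use this trio to keep the VFS entry cache in sync with
	their directory contents. A minimal sketch (the surrounding hook is
	hypothetical; the cache calls are the functions above):

		// after successfully removing entry "name" from directory dirID:
		entry_cache_remove(volume->id, dirID, name);

		// after a lookup failed with B_ENTRY_NOT_FOUND, a negative entry
		// saves repeated calls into the FS:
		entry_cache_add_missing(volume->id, dirID, name);
*/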
4162 
4163 
4164 //	#pragma mark - private VFS API
4165 //	Functions the VFS exports for other parts of the kernel
4166 
4167 
4168 /*! Acquires another reference to the vnode that has to be released
4169 	by calling vfs_put_vnode().
4170 */
4171 void
4172 vfs_acquire_vnode(struct vnode* vnode)
4173 {
4174 	inc_vnode_ref_count(vnode);
4175 }
4176 
4177 
4178 /*! This is currently called from file_cache_create() only.
4179 	It's probably a temporary solution as long as devfs requires that
4180 	fs_read_pages()/fs_write_pages() are called with the standard
4181 	open cookie and not with a device cookie.
4182 	If that's done differently, remove this call; it has no other
4183 	purpose.
4184 */
4185 extern "C" status_t
4186 vfs_get_cookie_from_fd(int fd, void** _cookie)
4187 {
4188 	struct file_descriptor* descriptor;
4189 
4190 	descriptor = get_fd(get_current_io_context(true), fd);
4191 	if (descriptor == NULL)
4192 		return B_FILE_ERROR;
4193 
4194 	*_cookie = descriptor->cookie;
4195 	return B_OK;
4196 }
4197 
4198 
4199 extern "C" status_t
4200 vfs_get_vnode_from_fd(int fd, bool kernel, struct vnode** vnode)
4201 {
4202 	*vnode = get_vnode_from_fd(fd, kernel);
4203 
4204 	if (*vnode == NULL)
4205 		return B_FILE_ERROR;
4206 
4207 	return B_NO_ERROR;
4208 }
4209 
4210 
4211 extern "C" status_t
4212 vfs_get_vnode_from_path(const char* path, bool kernel, struct vnode** _vnode)
4213 {
4214 	TRACE(("vfs_get_vnode_from_path: entry. path = '%s', kernel %d\n",
4215 		path, kernel));
4216 
4217 	KPath pathBuffer;
4218 	if (pathBuffer.InitCheck() != B_OK)
4219 		return B_NO_MEMORY;
4220 
4221 	char* buffer = pathBuffer.LockBuffer();
4222 	strlcpy(buffer, path, pathBuffer.BufferSize());
4223 
4224 	struct vnode* vnode;
4225 	status_t status = path_to_vnode(buffer, true, &vnode, NULL, kernel);
4226 	if (status != B_OK)
4227 		return status;
4228 
4229 	*_vnode = vnode;
4230 	return B_OK;
4231 }
4232 
4233 
4234 extern "C" status_t
4235 vfs_get_vnode(dev_t mountID, ino_t vnodeID, bool canWait, struct vnode** _vnode)
4236 {
4237 	struct vnode* vnode = NULL;
4238 
4239 	status_t status = get_vnode(mountID, vnodeID, &vnode, canWait, false);
4240 	if (status != B_OK)
4241 		return status;
4242 
4243 	*_vnode = vnode;
4244 	return B_OK;
4245 }
4246 
4247 
4248 extern "C" status_t
4249 vfs_entry_ref_to_vnode(dev_t mountID, ino_t directoryID,
4250 	const char* name, struct vnode** _vnode)
4251 {
4252 	return entry_ref_to_vnode(mountID, directoryID, name, false, true, _vnode);
4253 }
4254 
4255 
4256 extern "C" void
4257 vfs_vnode_to_node_ref(struct vnode* vnode, dev_t* _mountID, ino_t* _vnodeID)
4258 {
4259 	*_mountID = vnode->device;
4260 	*_vnodeID = vnode->id;
4261 }
4262 
4263 
4264 /*!
4265 	Helper function abstracting the process of "converting" a given
4266 	vnode-pointer to a fs_vnode-pointer.
4267 	Currently only used in bindfs.
4268 */
4269 extern "C" fs_vnode*
4270 vfs_fsnode_for_vnode(struct vnode* vnode)
4271 {
4272 	return vnode;
4273 }
4274 
4275 
4276 /*!
4277 	Calls fs_open() on the given vnode and returns a new
4278 	file descriptor for it
4279 */
4280 int
4281 vfs_open_vnode(struct vnode* vnode, int openMode, bool kernel)
4282 {
4283 	return open_vnode(vnode, openMode, kernel);
4284 }
4285 
4286 
4287 /*!	Looks up a vnode with the given mount and vnode ID.
4288 	Must only be used with "in-use" vnodes as it doesn't grab a reference
4289 	to the node.
4290 	It's currently only used by file_cache_create().
4291 */
4292 extern "C" status_t
4293 vfs_lookup_vnode(dev_t mountID, ino_t vnodeID, struct vnode** _vnode)
4294 {
4295 	rw_lock_read_lock(&sVnodeLock);
4296 	struct vnode* vnode = lookup_vnode(mountID, vnodeID);
4297 	rw_lock_read_unlock(&sVnodeLock);
4298 
4299 	if (vnode == NULL)
4300 		return B_ERROR;
4301 
4302 	*_vnode = vnode;
4303 	return B_OK;
4304 }
4305 
4306 
4307 extern "C" status_t
4308 vfs_get_fs_node_from_path(fs_volume* volume, const char* path,
4309 	bool traverseLeafLink, bool kernel, void** _node)
4310 {
4311 	TRACE(("vfs_get_fs_node_from_path(volume = %p, path = \"%s\", kernel %d)\n",
4312 		volume, path, kernel));
4313 
4314 	KPath pathBuffer;
4315 	if (pathBuffer.InitCheck() != B_OK)
4316 		return B_NO_MEMORY;
4317 
4318 	fs_mount* mount;
4319 	status_t status = get_mount(volume->id, &mount);
4320 	if (status != B_OK)
4321 		return status;
4322 
4323 	char* buffer = pathBuffer.LockBuffer();
4324 	strlcpy(buffer, path, pathBuffer.BufferSize());
4325 
4326 	struct vnode* vnode = mount->root_vnode;
4327 
4328 	if (buffer[0] == '/')
4329 		status = path_to_vnode(buffer, traverseLeafLink, &vnode, NULL, kernel);
4330 	else {
4331 		inc_vnode_ref_count(vnode);
4332 			// vnode_path_to_vnode() releases a reference to the starting vnode
4333 		status = vnode_path_to_vnode(vnode, buffer, traverseLeafLink, 0,
4334 			kernel, &vnode, NULL);
4335 	}
4336 
4337 	put_mount(mount);
4338 
4339 	if (status != B_OK)
4340 		return status;
4341 
4342 	if (vnode->device != volume->id) {
4343 		// wrong mount ID - must not gain access on foreign file system nodes
4344 		put_vnode(vnode);
4345 		return B_BAD_VALUE;
4346 	}
4347 
4348 	// Use get_vnode() to resolve the cookie for the right layer.
4349 	status = get_vnode(volume, vnode->id, _node);
4350 	put_vnode(vnode);
4351 
4352 	return status;
4353 }
4354 
4355 
4356 status_t
4357 vfs_read_stat(int fd, const char* path, bool traverseLeafLink,
4358 	struct stat* stat, bool kernel)
4359 {
4360 	status_t status;
4361 
4362 	if (path != NULL) {
4363 		// path given: get the stat of the node referred to by (fd, path)
4364 		KPath pathBuffer(path);
4365 		if (pathBuffer.InitCheck() != B_OK)
4366 			return B_NO_MEMORY;
4367 
4368 		status = common_path_read_stat(fd, pathBuffer.LockBuffer(),
4369 			traverseLeafLink, stat, kernel);
4370 	} else {
4371 		// no path given: get the FD and use the FD operation
4372 		struct file_descriptor* descriptor
4373 			= get_fd(get_current_io_context(kernel), fd);
4374 		if (descriptor == NULL)
4375 			return B_FILE_ERROR;
4376 
4377 		if (descriptor->ops->fd_read_stat)
4378 			status = descriptor->ops->fd_read_stat(descriptor, stat);
4379 		else
4380 			status = B_UNSUPPORTED;
4381 
4382 		put_fd(descriptor);
4383 	}
4384 
4385 	return status;
4386 }
4387 
4388 
4389 /*!	Finds the full path to the file that contains the module \a moduleName,
4390 	puts it into \a pathBuffer, and returns B_OK for success.
4391 	If \a pathBuffer was too small, it returns \c B_BUFFER_OVERFLOW, or
4392 	\c B_ENTRY_NOT_FOUND if no file could be found.
4393 	\a pathBuffer is clobbered in any case and must not be relied on if this
4394 	function returns unsuccessfully.
4395 	\a basePath and \a pathBuffer must not point to the same space.
4396 */
4397 status_t
4398 vfs_get_module_path(const char* basePath, const char* moduleName,
4399 	char* pathBuffer, size_t bufferSize)
4400 {
4401 	struct vnode* dir;
4402 	struct vnode* file;
4403 	status_t status;
4404 	size_t length;
4405 	char* path;
4406 
4407 	if (bufferSize == 0
4408 		|| strlcpy(pathBuffer, basePath, bufferSize) >= bufferSize)
4409 		return B_BUFFER_OVERFLOW;
4410 
4411 	status = path_to_vnode(pathBuffer, true, &dir, NULL, true);
4412 	if (status != B_OK)
4413 		return status;
4414 
4415 	// the path buffer had been clobbered by the above call
4416 	length = strlcpy(pathBuffer, basePath, bufferSize);
4417 	if (pathBuffer[length - 1] != '/')
4418 		pathBuffer[length++] = '/';
4419 
4420 	path = pathBuffer + length;
4421 	bufferSize -= length;
4422 
4423 	while (moduleName) {
4424 		char* nextPath = strchr(moduleName, '/');
4425 		if (nextPath == NULL)
4426 			length = strlen(moduleName);
4427 		else {
4428 			length = nextPath - moduleName;
4429 			nextPath++;
4430 		}
4431 
4432 		if (length + 1 >= bufferSize) {
4433 			status = B_BUFFER_OVERFLOW;
4434 			goto err;
4435 		}
4436 
4437 		memcpy(path, moduleName, length);
4438 		path[length] = '\0';
4439 		moduleName = nextPath;
4440 
4441 		status = vnode_path_to_vnode(dir, path, true, 0, true, &file, NULL);
4442 		if (status != B_OK) {
4443 			// vnode_path_to_vnode() has already released the reference to dir
4444 			return status;
4445 		}
4446 
4447 		if (S_ISDIR(file->Type())) {
4448 			// go to the next directory
4449 			path[length] = '/';
4450 			path[length + 1] = '\0';
4451 			path += length + 1;
4452 			bufferSize -= length + 1;
4453 
4454 			dir = file;
4455 		} else if (S_ISREG(file->Type())) {
4456 			// it's a file so it should be what we've searched for
4457 			put_vnode(file);
4458 
4459 			return B_OK;
4460 		} else {
4461 			TRACE(("vfs_get_module_path(): something is strange here: "
4462 				"0x%08" B_PRIx32 "...\n", file->Type()));
4463 			status = B_ERROR;
4464 			dir = file;
4465 			goto err;
4466 		}
4467 	}
4468 
4469 	// if we got here, the moduleName just pointed to a directory, not to
4470 	// a real module - what should we do in this case?
4471 	status = B_ENTRY_NOT_FOUND;
4472 
4473 err:
4474 	put_vnode(dir);
4475 	return status;
4476 }
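
/*	Worked example (hypothetical paths): with basePath
	"/boot/system/add-ons/kernel" and moduleName "bus_managers/pci/v1",
	the loop above appends one component at a time and resolves it. As
	long as a component is a directory it descends further; as soon as one
	resolves to a regular file -- say ".../kernel/bus_managers/pci" --
	that file is taken to contain the module, and its full path is what
	ends up in \a pathBuffer.
*/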
4477 
4478 
4479 /*!	\brief Normalizes a given path.
4480 
4481 	The path must refer to an existing or non-existing entry in an existing
4482 	directory, that is chopping off the leaf component the remaining path must
4483 	refer to an existing directory.
4484 
4485 	The returned path will be canonical in that it will be absolute, will
4486 	not contain any "." or ".." components or duplicate occurrences of
4487 	'/'s, and none of the directory components will be symbolic links.
4488 
4489 	Any two paths referring to the same entry will result in the same
4490 	normalized path (well, that is pretty much the definition of
4491 	`normalized', isn't it :-).
4492 
4493 	\param path The path to be normalized.
4494 	\param buffer The buffer into which the normalized path will be written.
4495 		   May be the same one as \a path.
4496 	\param bufferSize The size of \a buffer.
4497 	\param traverseLink If \c true, the function also resolves leaf symlinks.
4498 	\param kernel \c true, if the IO context of the kernel shall be used,
4499 		   otherwise that of the team this thread belongs to. Only relevant,
4500 		   if the path is relative (to get the CWD).
4501 	\return \c B_OK if everything went fine, another error code otherwise.
4502 */
4503 status_t
4504 vfs_normalize_path(const char* path, char* buffer, size_t bufferSize,
4505 	bool traverseLink, bool kernel)
4506 {
4507 	if (!path || !buffer || bufferSize < 1)
4508 		return B_BAD_VALUE;
4509 
4510 	if (path != buffer) {
4511 		if (strlcpy(buffer, path, bufferSize) >= bufferSize)
4512 			return B_BUFFER_OVERFLOW;
4513 	}
4514 
4515 	return normalize_path(buffer, bufferSize, traverseLink, kernel);
4516 }
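
/*	A minimal usage sketch (the input path is hypothetical; in-place
	normalization is fine, since \a buffer may equal \a path):

		char path[B_PATH_NAME_LENGTH]
			= "/boot/./home//config/../Desktop";
		status_t status = vfs_normalize_path(path, path, sizeof(path),
			true, true);
		// on success, path reads "/boot/home/Desktop" (assuming none of
		// the components are symlinks)
*/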
4517 
4518 
4519 /*!	\brief Gets the parent of the passed in node.
4520 
4521 	Gets the parent of the passed in node, and correctly resolves covered
4522 	nodes.
4523 */
4524 extern "C" status_t
4525 vfs_resolve_parent(struct vnode* parent, dev_t* device, ino_t* node)
4526 {
4527 	return resolve_covered_parent(parent, device, node,
4528 		get_current_io_context(true));
4529 }
4530 
4531 
4532 /*!	\brief Creates a special node in the file system.
4533 
4534 	The caller gets a reference to the newly created node (which is passed
4535 	back through \a _createdVnode) and is responsible for releasing it.
4536 
4537 	\param path The path where to create the entry for the node. Can be \c NULL,
4538 		in which case the node is created without an entry in the root FS -- it
4539 		will automatically be deleted when the last reference has been released.
4540 	\param subVnode The definition of the subnode. Can be \c NULL, in which case
4541 		the target file system will just create the node with its standard
4542 		operations. Depending on the type of the node a subnode might be created
4543 		automatically, though.
4544 	\param mode The type and permissions for the node to be created.
4545 	\param flags Flags to be passed to the creating FS.
4546 	\param kernel \c true, if called in the kernel context (relevant only if
4547 		\a path is not \c NULL and not absolute).
4548 	\param _superVnode Pointer to a pre-allocated structure to be filled by the
4549 		file system creating the node, with the private data pointer and
4550 		operations for the super node. Can be \c NULL.
4551 	\param _createdVnode Pointer to pre-allocated storage where to store the
4552 		pointer to the newly created node.
4553 	\return \c B_OK, if everything went fine, another error code otherwise.
4554 */
4555 status_t
4556 vfs_create_special_node(const char* path, fs_vnode* subVnode, mode_t mode,
4557 	uint32 flags, bool kernel, fs_vnode* _superVnode,
4558 	struct vnode** _createdVnode)
4559 {
4560 	struct vnode* dirNode;
4561 	char _leaf[B_FILE_NAME_LENGTH];
4562 	char* leaf = NULL;
4563 
4564 	if (path) {
4565 		// We've got a path. Get the dir vnode and the leaf name.
4566 		KPath tmpPathBuffer;
4567 		if (tmpPathBuffer.InitCheck() != B_OK)
4568 			return B_NO_MEMORY;
4569 
4570 		char* tmpPath = tmpPathBuffer.LockBuffer();
4571 		if (strlcpy(tmpPath, path, B_PATH_NAME_LENGTH) >= B_PATH_NAME_LENGTH)
4572 			return B_NAME_TOO_LONG;
4573 
4574 		// get the dir vnode and the leaf name
4575 		leaf = _leaf;
4576 		status_t error = path_to_dir_vnode(tmpPath, &dirNode, leaf, kernel);
4577 		if (error != B_OK)
4578 			return error;
4579 	} else {
4580 		// No path. Create the node in the root FS.
4581 		dirNode = sRoot;
4582 		inc_vnode_ref_count(dirNode);
4583 	}
4584 
4585 	VNodePutter _(dirNode);
4586 
4587 	// check support for creating special nodes
4588 	if (!HAS_FS_CALL(dirNode, create_special_node))
4589 		return B_UNSUPPORTED;
4590 
4591 	// create the node
4592 	fs_vnode superVnode;
4593 	ino_t nodeID;
4594 	status_t status = FS_CALL(dirNode, create_special_node, leaf, subVnode,
4595 		mode, flags, _superVnode != NULL ? _superVnode : &superVnode, &nodeID);
4596 	if (status != B_OK)
4597 		return status;
4598 
4599 	// lookup the node
4600 	rw_lock_read_lock(&sVnodeLock);
4601 	*_createdVnode = lookup_vnode(dirNode->mount->id, nodeID);
4602 	rw_lock_read_unlock(&sVnodeLock);
4603 
4604 	if (*_createdVnode == NULL) {
4605 		panic("vfs_create_special_node(): lookup of node failed");
4606 		return B_ERROR;
4607 	}
4608 
4609 	return B_OK;
4610 }
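
/*	A minimal sketch of creating a FIFO with an entry in the root FS (the
	path and permissions are hypothetical; error handling trimmed):

		struct vnode* vnode;
		status_t status = vfs_create_special_node("/tmp/my_fifo", NULL,
			S_IFIFO | 0666, 0, true, NULL, &vnode);
		if (status == B_OK)
			put_vnode(vnode);
				// release the reference we were handed

	Passing a \c NULL path instead yields an anonymous node that goes away
	with its last reference, as described above.
*/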
4611 
4612 
4613 extern "C" void
4614 vfs_put_vnode(struct vnode* vnode)
4615 {
4616 	put_vnode(vnode);
4617 }
4618 
4619 
4620 extern "C" status_t
4621 vfs_get_cwd(dev_t* _mountID, ino_t* _vnodeID)
4622 {
4623 	// Get current working directory from io context
4624 	struct io_context* context = get_current_io_context(false);
4625 	status_t status = B_OK;
4626 
4627 	mutex_lock(&context->io_mutex);
4628 
4629 	if (context->cwd != NULL) {
4630 		*_mountID = context->cwd->device;
4631 		*_vnodeID = context->cwd->id;
4632 	} else
4633 		status = B_ERROR;
4634 
4635 	mutex_unlock(&context->io_mutex);
4636 	return status;
4637 }
4638 
4639 
4640 status_t
4641 vfs_unmount(dev_t mountID, uint32 flags)
4642 {
4643 	return fs_unmount(NULL, mountID, flags, true);
4644 }
4645 
4646 
4647 extern "C" status_t
4648 vfs_disconnect_vnode(dev_t mountID, ino_t vnodeID)
4649 {
4650 	struct vnode* vnode;
4651 
4652 	status_t status = get_vnode(mountID, vnodeID, &vnode, true, true);
4653 	if (status != B_OK)
4654 		return status;
4655 
4656 	disconnect_mount_or_vnode_fds(vnode->mount, vnode);
4657 	put_vnode(vnode);
4658 	return B_OK;
4659 }
4660 
4661 
4662 extern "C" void
4663 vfs_free_unused_vnodes(int32 level)
4664 {
4665 	vnode_low_resource_handler(NULL,
4666 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
4667 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
4668 		level);
4669 }
4670 
4671 
4672 extern "C" bool
4673 vfs_can_page(struct vnode* vnode, void* cookie)
4674 {
4675 	FUNCTION(("vfs_canpage: vnode %p\n", vnode));
4676 
4677 	if (HAS_FS_CALL(vnode, can_page))
4678 		return FS_CALL(vnode, can_page, cookie);
4679 	return false;
4680 }
4681 
4682 
4683 extern "C" status_t
4684 vfs_read_pages(struct vnode* vnode, void* cookie, off_t pos,
4685 	const generic_io_vec* vecs, size_t count, uint32 flags,
4686 	generic_size_t* _numBytes)
4687 {
4688 	FUNCTION(("vfs_read_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4689 		vecs, pos));
4690 
4691 #if VFS_PAGES_IO_TRACING
4692 	generic_size_t bytesRequested = *_numBytes;
4693 #endif
4694 
4695 	IORequest request;
4696 	status_t status = request.Init(pos, vecs, count, *_numBytes, false, flags);
4697 	if (status == B_OK) {
4698 		status = vfs_vnode_io(vnode, cookie, &request);
4699 		if (status == B_OK)
4700 			status = request.Wait();
4701 		*_numBytes = request.TransferredBytes();
4702 	}
4703 
4704 	TPIO(ReadPages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4705 		status, *_numBytes));
4706 
4707 	return status;
4708 }
4709 
4710 
4711 extern "C" status_t
4712 vfs_write_pages(struct vnode* vnode, void* cookie, off_t pos,
4713 	const generic_io_vec* vecs, size_t count, uint32 flags,
4714 	generic_size_t* _numBytes)
4715 {
4716 	FUNCTION(("vfs_write_pages: vnode %p, vecs %p, pos %" B_PRIdOFF "\n", vnode,
4717 		vecs, pos));
4718 
4719 #if VFS_PAGES_IO_TRACING
4720 	generic_size_t bytesRequested = *_numBytes;
4721 #endif
4722 
4723 	IORequest request;
4724 	status_t status = request.Init(pos, vecs, count, *_numBytes, true, flags);
4725 	if (status == B_OK) {
4726 		status = vfs_vnode_io(vnode, cookie, &request);
4727 		if (status == B_OK)
4728 			status = request.Wait();
4729 		*_numBytes = request.TransferredBytes();
4730 	}
4731 
4732 	TPIO(WritePages(vnode, cookie, pos, vecs, count, flags, bytesRequested,
4733 		status, *_numBytes));
4734 
4735 	return status;
4736 }
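
/*	A caller-side sketch for the two functions above (generic_io_vec is
	assumed to consist of a base address and a length; vnode and cookie
	would come from elsewhere, e.g. vfs_get_vnode_from_fd() and
	vfs_get_cookie_from_fd()):

		generic_io_vec vec;
		vec.base = bufferAddress;	// generic_addr_t
		vec.length = B_PAGE_SIZE;	// generic_size_t

		generic_size_t bytes = vec.length;
		status_t status = vfs_read_pages(vnode, cookie, 0, &vec, 1, 0,
			&bytes);
		// on return, bytes holds the number of bytes actually transferred
*/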
4737 
4738 
4739 /*!	Gets the vnode's VMCache object. If it doesn't have one yet, one will
4740 	be created if \a allocate is \c true.
4741 	In case it's successful, it will also grab a reference to the cache
4742 	it returns.
4743 */
4744 extern "C" status_t
4745 vfs_get_vnode_cache(struct vnode* vnode, VMCache** _cache, bool allocate)
4746 {
4747 	if (vnode->cache != NULL) {
4748 		vnode->cache->AcquireRef();
4749 		*_cache = vnode->cache;
4750 		return B_OK;
4751 	}
4752 
4753 	rw_lock_read_lock(&sVnodeLock);
4754 	vnode->Lock();
4755 
4756 	status_t status = B_OK;
4757 
4758 	// The cache could have been created in the meantime
4759 	if (vnode->cache == NULL) {
4760 		if (allocate) {
4761 			// TODO: actually the vnode needs to be busy already here, or
4762 			//	else this won't work...
4763 			bool wasBusy = vnode->IsBusy();
4764 			vnode->SetBusy(true);
4765 
4766 			vnode->Unlock();
4767 			rw_lock_read_unlock(&sVnodeLock);
4768 
4769 			status = vm_create_vnode_cache(vnode, &vnode->cache);
4770 
4771 			rw_lock_read_lock(&sVnodeLock);
4772 			vnode->Lock();
4773 			vnode->SetBusy(wasBusy);
4774 		} else
4775 			status = B_BAD_VALUE;
4776 	}
4777 
4778 	vnode->Unlock();
4779 	rw_lock_read_unlock(&sVnodeLock);
4780 
4781 	if (status == B_OK) {
4782 		vnode->cache->AcquireRef();
4783 		*_cache = vnode->cache;
4784 	}
4785 
4786 	return status;
4787 }
4788 
4789 
4790 /*!	Sets the vnode's VMCache object, for subsystems that want to manage
4791 	their own.
4792 	In case it's successful, it will also grab a reference to the passed
4793 	cache.
4794 */
4795 extern "C" status_t
4796 vfs_set_vnode_cache(struct vnode* vnode, VMCache* _cache)
4797 {
4798 	rw_lock_read_lock(&sVnodeLock);
4799 	vnode->Lock();
4800 
4801 	status_t status = B_OK;
4802 	if (vnode->cache != NULL) {
4803 		status = B_NOT_ALLOWED;
4804 	} else {
4805 		vnode->cache = _cache;
4806 		_cache->AcquireRef();
4807 	}
4808 
4809 	vnode->Unlock();
4810 	rw_lock_read_unlock(&sVnodeLock);
4811 	return status;
4812 }
4813 
4814 
4815 status_t
4816 vfs_get_file_map(struct vnode* vnode, off_t offset, size_t size,
4817 	file_io_vec* vecs, size_t* _count)
4818 {
4819 	FUNCTION(("vfs_get_file_map: vnode %p, vecs %p, offset %" B_PRIdOFF
4820 		", size = %" B_PRIuSIZE "\n", vnode, vecs, offset, size));
4821 
4822 	return FS_CALL(vnode, get_file_map, offset, size, vecs, _count);
4823 }
4824 
4825 
4826 status_t
4827 vfs_stat_vnode(struct vnode* vnode, struct stat* stat)
4828 {
4829 	status_t status = FS_CALL(vnode, read_stat, stat);
4830 
4831 	// fill in the st_dev and st_ino fields
4832 	if (status == B_OK) {
4833 		stat->st_dev = vnode->device;
4834 		stat->st_ino = vnode->id;
4835 		// the rdev field must stay unset for non-special files
4836 		if (!S_ISBLK(stat->st_mode) && !S_ISCHR(stat->st_mode))
4837 			stat->st_rdev = -1;
4838 	}
4839 
4840 	return status;
4841 }
4842 
4843 
4844 status_t
4845 vfs_stat_node_ref(dev_t device, ino_t inode, struct stat* stat)
4846 {
4847 	struct vnode* vnode;
4848 	status_t status = get_vnode(device, inode, &vnode, true, false);
4849 	if (status != B_OK)
4850 		return status;
4851 
4852 	status = vfs_stat_vnode(vnode, stat);
4853 
4854 	put_vnode(vnode);
4855 	return status;
4856 }
4857 
4858 
4859 status_t
4860 vfs_get_vnode_name(struct vnode* vnode, char* name, size_t nameSize)
4861 {
4862 	return get_vnode_name(vnode, NULL, name, nameSize, true);
4863 }
4864 
4865 
4866 status_t
4867 vfs_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
4868 	bool kernel, char* path, size_t pathLength)
4869 {
4870 	struct vnode* vnode;
4871 	status_t status;
4872 
4873 	// filter invalid leaf names
4874 	if (leaf != NULL && (leaf[0] == '\0' || strchr(leaf, '/')))
4875 		return B_BAD_VALUE;
4876 
4877 	// get the vnode matching the dir's node_ref
4878 	if (leaf && (strcmp(leaf, ".") == 0 || strcmp(leaf, "..") == 0)) {
4879 		// special cases "." and "..": we can directly get the vnode of the
4880 		// referenced directory
4881 		status = entry_ref_to_vnode(device, inode, leaf, false, kernel, &vnode);
4882 		leaf = NULL;
4883 	} else
4884 		status = get_vnode(device, inode, &vnode, true, false);
4885 	if (status != B_OK)
4886 		return status;
4887 
4888 	// get the directory path
4889 	status = dir_vnode_to_path(vnode, path, pathLength, kernel);
4890 	put_vnode(vnode);
4891 		// we don't need the vnode anymore
4892 	if (status != B_OK)
4893 		return status;
4894 
4895 	// append the leaf name
4896 	if (leaf) {
4897 		// insert a directory separator if this is not the file system root
4898 		if ((strcmp(path, "/") && strlcat(path, "/", pathLength)
4899 				>= pathLength)
4900 			|| strlcat(path, leaf, pathLength) >= pathLength) {
4901 			return B_NAME_TOO_LONG;
4902 		}
4903 	}
4904 
4905 	return B_OK;
4906 }
4907 
4908 
4909 /*!	If the given descriptor locked its vnode, that lock will be released. */
4910 void
4911 vfs_unlock_vnode_if_locked(struct file_descriptor* descriptor)
4912 {
4913 	struct vnode* vnode = fd_vnode(descriptor);
4914 
4915 	if (vnode != NULL && vnode->mandatory_locked_by == descriptor)
4916 		vnode->mandatory_locked_by = NULL;
4917 }
4918 
4919 
4920 /*!	Releases any POSIX locks on the file descriptor. */
4921 status_t
4922 vfs_release_posix_lock(io_context* context, struct file_descriptor* descriptor)
4923 {
4924 	struct vnode* vnode = descriptor->u.vnode;
4925 	if (vnode == NULL)
4926 		return B_OK;
4927 
4928 	if (HAS_FS_CALL(vnode, release_lock))
4929 		return FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
4930 
4931 	return release_advisory_lock(vnode, context, NULL, NULL);
4932 }
4933 
4934 
4935 /*!	Closes all file descriptors of the specified I/O context that
4936 	have the O_CLOEXEC flag set.
4937 */
4938 void
4939 vfs_exec_io_context(io_context* context)
4940 {
4941 	uint32 i;
4942 
4943 	for (i = 0; i < context->table_size; i++) {
4944 		mutex_lock(&context->io_mutex);
4945 
4946 		struct file_descriptor* descriptor = context->fds[i];
4947 		bool remove = false;
4948 
4949 		if (descriptor != NULL && fd_close_on_exec(context, i)) {
4950 			context->fds[i] = NULL;
4951 			context->num_used_fds--;
4952 
4953 			remove = true;
4954 		}
4955 
4956 		mutex_unlock(&context->io_mutex);
4957 
4958 		if (remove) {
4959 			close_fd(context, descriptor);
4960 			put_fd(descriptor);
4961 		}
4962 	}
4963 }
4964 
4965 
4966 /*! Sets up a new io_context structure, and inherits the properties
4967 	of the parent io_context if it is given.
4968 */
4969 io_context*
4970 vfs_new_io_context(io_context* parentContext, bool purgeCloseOnExec)
4971 {
4972 	io_context* context = (io_context*)malloc(sizeof(io_context));
4973 	if (context == NULL)
4974 		return NULL;
4975 
4976 	TIOC(NewIOContext(context, parentContext));
4977 
4978 	memset(context, 0, sizeof(io_context));
4979 	context->ref_count = 1;
4980 
4981 	MutexLocker parentLocker;
4982 
4983 	size_t tableSize;
4984 	if (parentContext != NULL) {
4985 		parentLocker.SetTo(parentContext->io_mutex, false);
4986 		tableSize = parentContext->table_size;
4987 	} else
4988 		tableSize = DEFAULT_FD_TABLE_SIZE;
4989 
4990 	// allocate space for FDs and their close-on-exec flag
4991 	context->fds = (file_descriptor**)malloc(
4992 		sizeof(struct file_descriptor*) * tableSize
4993 		+ sizeof(struct select_info**) * tableSize
4994 		+ (tableSize + 7) / 8);
4995 	if (context->fds == NULL) {
4996 		free(context);
4997 		return NULL;
4998 	}
4999 
5000 	context->select_infos = (select_info**)(context->fds + tableSize);
5001 	context->fds_close_on_exec = (uint8*)(context->select_infos + tableSize);
5002 
5003 	memset(context->fds, 0, sizeof(struct file_descriptor*) * tableSize
5004 		+ sizeof(struct select_info**) * tableSize
5005 		+ (tableSize + 7) / 8);
5006 
5007 	mutex_init(&context->io_mutex, "I/O context");
5008 
5009 	// Copy all parent file descriptors
5010 
5011 	if (parentContext != NULL) {
5012 		size_t i;
5013 
5014 		mutex_lock(&sIOContextRootLock);
5015 		context->root = parentContext->root;
5016 		if (context->root)
5017 			inc_vnode_ref_count(context->root);
5018 		mutex_unlock(&sIOContextRootLock);
5019 
5020 		context->cwd = parentContext->cwd;
5021 		if (context->cwd)
5022 			inc_vnode_ref_count(context->cwd);
5023 
5024 		if (parentContext->inherit_fds) {
5025 			for (i = 0; i < tableSize; i++) {
5026 				struct file_descriptor* descriptor = parentContext->fds[i];
5027 
5028 				if (descriptor != NULL
5029 					&& (descriptor->open_mode & O_DISCONNECTED) == 0) {
5030 					bool closeOnExec = fd_close_on_exec(parentContext, i);
5031 					if (closeOnExec && purgeCloseOnExec)
5032 						continue;
5033 
5034 					TFD(InheritFD(context, i, descriptor, parentContext));
5035 
5036 					context->fds[i] = descriptor;
5037 					context->num_used_fds++;
5038 					atomic_add(&descriptor->ref_count, 1);
5039 					atomic_add(&descriptor->open_count, 1);
5040 
5041 					if (closeOnExec)
5042 						fd_set_close_on_exec(context, i, true);
5043 				}
5044 			}
5045 		}
5046 
5047 		parentLocker.Unlock();
5048 	} else {
5049 		context->root = sRoot;
5050 		context->cwd = sRoot;
5051 
5052 		if (context->root)
5053 			inc_vnode_ref_count(context->root);
5054 
5055 		if (context->cwd)
5056 			inc_vnode_ref_count(context->cwd);
5057 	}
5058 
5059 	context->table_size = tableSize;
5060 	context->inherit_fds = parentContext != NULL;
5061 
5062 	list_init(&context->node_monitors);
5063 	context->max_monitors = DEFAULT_NODE_MONITORS;
5064 
5065 	return context;
5066 }
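
/*	Layout of the single allocation set up above (the same scheme is used
	in vfs_resize_fd_table() below):

		fds                  tableSize file_descriptor* entries
		select_infos         tableSize select_info* entries
		fds_close_on_exec    bitmap of (tableSize + 7) / 8 bytes

	Keeping all three tables in one block costs a single malloc()/free()
	per context and keeps the close-on-exec bits next to the FDs they
	describe.
*/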
5067 
5068 
5069 void
5070 vfs_get_io_context(io_context* context)
5071 {
5072 	atomic_add(&context->ref_count, 1);
5073 }
5074 
5075 
5076 void
5077 vfs_put_io_context(io_context* context)
5078 {
5079 	if (atomic_add(&context->ref_count, -1) == 1)
5080 		free_io_context(context);
5081 }
5082 
5083 
5084 status_t
5085 vfs_resize_fd_table(struct io_context* context, uint32 newSize)
5086 {
5087 	if (newSize == 0 || newSize > MAX_FD_TABLE_SIZE)
5088 		return B_BAD_VALUE;
5089 
5090 	TIOC(ResizeIOContext(context, newSize));
5091 
5092 	MutexLocker _(context->io_mutex);
5093 
5094 	uint32 oldSize = context->table_size;
5095 	int oldCloseOnExecBitmapSize = (oldSize + 7) / 8;
5096 	int newCloseOnExecBitmapSize = (newSize + 7) / 8;
5097 
5098 	// If the tables shrink, make sure none of the fds being dropped are in use.
5099 	if (newSize < oldSize) {
5100 		for (uint32 i = oldSize; i-- > newSize;) {
5101 			if (context->fds[i])
5102 				return B_BUSY;
5103 		}
5104 	}
5105 
5106 	// store pointers to the old tables
5107 	file_descriptor** oldFDs = context->fds;
5108 	select_info** oldSelectInfos = context->select_infos;
5109 	uint8* oldCloseOnExecTable = context->fds_close_on_exec;
5110 
5111 	// allocate new tables
5112 	file_descriptor** newFDs = (file_descriptor**)malloc(
5113 		sizeof(struct file_descriptor*) * newSize
5114 		+ sizeof(struct select_info**) * newSize
5115 		+ newCloseOnExecBitmapSize);
5116 	if (newFDs == NULL)
5117 		return B_NO_MEMORY;
5118 
5119 	context->fds = newFDs;
5120 	context->select_infos = (select_info**)(context->fds + newSize);
5121 	context->fds_close_on_exec = (uint8*)(context->select_infos + newSize);
5122 	context->table_size = newSize;
5123 
5124 	// copy entries from old tables
5125 	uint32 toCopy = min_c(oldSize, newSize);
5126 
5127 	memcpy(context->fds, oldFDs, sizeof(void*) * toCopy);
5128 	memcpy(context->select_infos, oldSelectInfos, sizeof(void*) * toCopy);
5129 	memcpy(context->fds_close_on_exec, oldCloseOnExecTable,
5130 		min_c(oldCloseOnExecBitmapSize, newCloseOnExecBitmapSize));
5131 
5132 	// clear additional entries, if the tables grow
5133 	if (newSize > oldSize) {
5134 		memset(context->fds + oldSize, 0, sizeof(void*) * (newSize - oldSize));
5135 		memset(context->select_infos + oldSize, 0,
5136 			sizeof(void*) * (newSize - oldSize));
5137 		memset(context->fds_close_on_exec + oldCloseOnExecBitmapSize, 0,
5138 			newCloseOnExecBitmapSize - oldCloseOnExecBitmapSize);
5139 	}
5140 
5141 	free(oldFDs);
5142 
5143 	return B_OK;
5144 }
5145 
5146 
5147 /*!	\brief Resolves a vnode to the vnode it is covered by, if any.
5148 
5149 	Given an arbitrary vnode (identified by mount and node ID), the function
5150 	checks whether the vnode is covered by another vnode. If it is, the
5151 	function returns the mount and node ID of the covering vnode. Otherwise
5152 	it simply returns the supplied mount and node ID.
5153 
5154 	In case of error (e.g. the supplied node could not be found) the variables
5155 	for storing the resolved mount and node ID remain untouched and an error
5156 	code is returned.
5157 
5158 	\param mountID The mount ID of the vnode in question.
5159 	\param nodeID The node ID of the vnode in question.
5160 	\param resolvedMountID Pointer to storage for the resolved mount ID.
5161 	\param resolvedNodeID Pointer to storage for the resolved node ID.
5162 	\return
5163 	- \c B_OK, if everything went fine,
5164 	- another error code, if something went wrong.
5165 */
5166 status_t
5167 vfs_resolve_vnode_to_covering_vnode(dev_t mountID, ino_t nodeID,
5168 	dev_t* resolvedMountID, ino_t* resolvedNodeID)
5169 {
5170 	// get the node
5171 	struct vnode* node;
5172 	status_t error = get_vnode(mountID, nodeID, &node, true, false);
5173 	if (error != B_OK)
5174 		return error;
5175 
5176 	// resolve the node
5177 	if (Vnode* coveringNode = get_covering_vnode(node)) {
5178 		put_vnode(node);
5179 		node = coveringNode;
5180 	}
5181 
5182 	// set the return values
5183 	*resolvedMountID = node->device;
5184 	*resolvedNodeID = node->id;
5185 
5186 	put_vnode(node);
5187 
5188 	return B_OK;
5189 }
5190 
5191 
5192 status_t
5193 vfs_get_mount_point(dev_t mountID, dev_t* _mountPointMountID,
5194 	ino_t* _mountPointNodeID)
5195 {
5196 	ReadLocker nodeLocker(sVnodeLock);
5197 	ReadLocker mountLocker(sMountLock);
5198 
5199 	struct fs_mount* mount = find_mount(mountID);
5200 	if (mount == NULL)
5201 		return B_BAD_VALUE;
5202 
5203 	Vnode* mountPoint = mount->covers_vnode;
5204 
5205 	*_mountPointMountID = mountPoint->device;
5206 	*_mountPointNodeID = mountPoint->id;
5207 
5208 	return B_OK;
5209 }
5210 
5211 
5212 status_t
5213 vfs_bind_mount_directory(dev_t mountID, ino_t nodeID, dev_t coveredMountID,
5214 	ino_t coveredNodeID)
5215 {
5216 	// get the vnodes
5217 	Vnode* vnode;
5218 	status_t error = get_vnode(mountID, nodeID, &vnode, true, false);
5219 	if (error != B_OK)
5220 		return B_BAD_VALUE;
5221 	VNodePutter vnodePutter(vnode);
5222 
5223 	Vnode* coveredVnode;
5224 	error = get_vnode(coveredMountID, coveredNodeID, &coveredVnode, true,
5225 		false);
5226 	if (error != B_OK)
5227 		return B_BAD_VALUE;
5228 	VNodePutter coveredVnodePutter(coveredVnode);
5229 
5230 	// establish the covered/covering links
5231 	WriteLocker locker(sVnodeLock);
5232 
5233 	if (vnode->covers != NULL || coveredVnode->covered_by != NULL
5234 		|| vnode->mount->unmounting || coveredVnode->mount->unmounting) {
5235 		return B_BUSY;
5236 	}
5237 
5238 	vnode->covers = coveredVnode;
5239 	vnode->SetCovering(true);
5240 
5241 	coveredVnode->covered_by = vnode;
5242 	coveredVnode->SetCovered(true);
5243 
5244 	// the vnodes now reference each other
5245 	inc_vnode_ref_count(vnode);
5246 	inc_vnode_ref_count(coveredVnode);
5247 
5248 	return B_OK;
5249 }
5250 
5251 
5252 int
5253 vfs_getrlimit(int resource, struct rlimit* rlp)
5254 {
5255 	if (!rlp)
5256 		return B_BAD_ADDRESS;
5257 
5258 	switch (resource) {
5259 		case RLIMIT_NOFILE:
5260 		{
5261 			struct io_context* context = get_current_io_context(false);
5262 			MutexLocker _(context->io_mutex);
5263 
5264 			rlp->rlim_cur = context->table_size;
5265 			rlp->rlim_max = MAX_FD_TABLE_SIZE;
5266 			return 0;
5267 		}
5268 
5269 		case RLIMIT_NOVMON:
5270 		{
5271 			struct io_context* context = get_current_io_context(false);
5272 			MutexLocker _(context->io_mutex);
5273 
5274 			rlp->rlim_cur = context->max_monitors;
5275 			rlp->rlim_max = MAX_NODE_MONITORS;
5276 			return 0;
5277 		}
5278 
5279 		default:
5280 			return B_BAD_VALUE;
5281 	}
5282 }
5283 
5284 
5285 int
5286 vfs_setrlimit(int resource, const struct rlimit* rlp)
5287 {
5288 	if (!rlp)
5289 		return B_BAD_ADDRESS;
5290 
5291 	switch (resource) {
5292 		case RLIMIT_NOFILE:
5293 			/* TODO: check getuid() */
5294 			if (rlp->rlim_max != RLIM_SAVED_MAX
5295 				&& rlp->rlim_max != MAX_FD_TABLE_SIZE)
5296 				return B_NOT_ALLOWED;
5297 
5298 			return vfs_resize_fd_table(get_current_io_context(false),
5299 				rlp->rlim_cur);
5300 
5301 		case RLIMIT_NOVMON:
5302 			/* TODO: check getuid() */
5303 			if (rlp->rlim_max != RLIM_SAVED_MAX
5304 				&& rlp->rlim_max != MAX_NODE_MONITORS)
5305 				return B_NOT_ALLOWED;
5306 
5307 			return resize_monitor_table(get_current_io_context(false),
5308 				rlp->rlim_cur);
5309 
5310 		default:
5311 			return B_BAD_VALUE;
5312 	}
5313 }
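
/*	A minimal sketch of growing the FD table via the rlimit interface (the
	new size is hypothetical; note that rlim_max must name the compile-time
	maximum, as checked above):

		struct rlimit rl;
		vfs_getrlimit(RLIMIT_NOFILE, &rl);
			// rl.rlim_cur now holds the current table size
		rl.rlim_cur = 1024;
		rl.rlim_max = MAX_FD_TABLE_SIZE;
		int result = vfs_setrlimit(RLIMIT_NOFILE, &rl);
			// forwards to vfs_resize_fd_table() on success
*/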
5314 
5315 
5316 status_t
5317 vfs_init(kernel_args* args)
5318 {
5319 	vnode::StaticInit();
5320 
5321 	sVnodeTable = new(std::nothrow) VnodeTable();
5322 	if (sVnodeTable == NULL || sVnodeTable->Init(VNODE_HASH_TABLE_SIZE) != B_OK)
5323 		panic("vfs_init: error creating vnode hash table\n");
5324 
5325 	struct vnode dummy_vnode;
5326 	list_init_etc(&sUnusedVnodeList, offset_of_member(dummy_vnode, unused_link));
5327 
5328 	struct fs_mount dummyMount;
5329 	sMountsTable = new(std::nothrow) MountTable();
5330 	if (sMountsTable == NULL
5331 			|| sMountsTable->Init(MOUNTS_HASH_TABLE_SIZE) != B_OK)
5332 		panic("vfs_init: error creating mounts hash table\n");
5333 
5334 	sPathNameCache = create_object_cache("vfs path names",
5335 		B_PATH_NAME_LENGTH + 1, 8, NULL, NULL, NULL);
5336 	if (sPathNameCache == NULL)
5337 		panic("vfs_init: error creating path name object_cache\n");
5338 
5339 	sVnodeCache = create_object_cache("vfs vnodes",
5340 		sizeof(struct vnode), 8, NULL, NULL, NULL);
5341 	if (sVnodeCache == NULL)
5342 		panic("vfs_init: error creating vnode object_cache\n");
5343 
5344 	sFileDescriptorCache = create_object_cache("vfs fds",
5345 		sizeof(file_descriptor), 8, NULL, NULL, NULL);
5346 	if (sFileDescriptorCache == NULL)
5347 		panic("vfs_init: error creating file descriptor object_cache\n");
5348 
5349 	node_monitor_init();
5350 
5351 	sRoot = NULL;
5352 
5353 	recursive_lock_init(&sMountOpLock, "vfs_mount_op_lock");
5354 
5355 	if (block_cache_init() != B_OK)
5356 		return B_ERROR;
5357 
5358 #ifdef ADD_DEBUGGER_COMMANDS
5359 	// add some debugger commands
5360 	add_debugger_command_etc("vnode", &dump_vnode,
5361 		"Print info about the specified vnode",
5362 		"[ \"-p\" ] ( <vnode> | <devID> <nodeID> )\n"
5363 		"Prints information about the vnode specified by address <vnode> or\n"
5364 		"<devID>, <vnodeID> pair. If \"-p\" is given, a path of the vnode is\n"
5365 		"constructed and printed. It might not be possible to construct a\n"
5366 		"complete path, though.\n",
5367 		0);
5368 	add_debugger_command("vnodes", &dump_vnodes,
5369 		"list all vnodes (from the specified device)");
5370 	add_debugger_command("vnode_caches", &dump_vnode_caches,
5371 		"list all vnode caches");
5372 	add_debugger_command("mount", &dump_mount,
5373 		"info about the specified fs_mount");
5374 	add_debugger_command("mounts", &dump_mounts, "list all fs_mounts");
5375 	add_debugger_command("io_context", &dump_io_context,
5376 		"info about the I/O context");
5377 	add_debugger_command("vnode_usage", &dump_vnode_usage,
5378 		"info about vnode usage");
5379 #endif
5380 
5381 	register_low_resource_handler(&vnode_low_resource_handler, NULL,
5382 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
5383 			| B_KERNEL_RESOURCE_ADDRESS_SPACE,
5384 		0);
5385 
5386 	fifo_init();
5387 	file_map_init();
5388 
5389 	return file_cache_init();
5390 }
5391 
5392 
5393 //	#pragma mark - fd_ops implementations
5394 
5395 
5396 /*!
5397 	Calls fs_open() on the given vnode and returns a new
5398 	file descriptor for it
5399 */
5400 static int
5401 open_vnode(struct vnode* vnode, int openMode, bool kernel)
5402 {
5403 	void* cookie;
5404 	status_t status = FS_CALL(vnode, open, openMode, &cookie);
5405 	if (status != B_OK)
5406 		return status;
5407 
5408 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5409 	if (fd < 0) {
5410 		FS_CALL(vnode, close, cookie);
5411 		FS_CALL(vnode, free_cookie, cookie);
5412 	}
5413 	return fd;
5414 }
5415 
5416 
5417 /*!	Creates the entry \a name in the given directory and opens the
5418 	resulting node, returning a new file descriptor for it. If the entry
5419 	already exists and \c O_EXCL is not given, the existing node is opened.
5420 */
5421 static int
5422 create_vnode(struct vnode* directory, const char* name, int openMode,
5423 	int perms, bool kernel)
5424 {
5425 	bool traverse = ((openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0);
5426 	status_t status = B_ERROR;
5427 	struct vnode* vnode;
5428 	void* cookie;
5429 	ino_t newID;
5430 
5431 	// This is somewhat tricky: If the entry already exists, the FS responsible
5432 	// for the directory might not necessarily also be the one responsible for
5433 	// the node the entry refers to (e.g. in case of mount points or FIFOs). So
5434 	// we can actually never call the create() hook without O_EXCL. Instead we
5435 	// try to look the entry up first. If it already exists, we just open the
5436 	// node (unless O_EXCL), otherwise we call create() with O_EXCL. This
5437 	// introduces a race condition, since someone else might have created the
5438 	// entry in the meantime. We hope the respective FS returns the correct
5439 	// error code, in which case we retry (up to 3 times).
5440 
5441 	for (int i = 0; i < 3 && status != B_OK; i++) {
5442 		// look the node up
5443 		status = lookup_dir_entry(directory, name, &vnode);
5444 		if (status == B_OK) {
5445 			VNodePutter putter(vnode);
5446 
5447 			if ((openMode & O_EXCL) != 0)
5448 				return B_FILE_EXISTS;
5449 
5450 			// If the node is a symlink, we have to follow it, unless
5451 			// O_NOTRAVERSE is set.
5452 			if (S_ISLNK(vnode->Type()) && traverse) {
5453 				putter.Put();
5454 				char clonedName[B_FILE_NAME_LENGTH + 1];
5455 				if (strlcpy(clonedName, name, B_FILE_NAME_LENGTH)
5456 						>= B_FILE_NAME_LENGTH) {
5457 					return B_NAME_TOO_LONG;
5458 				}
5459 
5460 				inc_vnode_ref_count(directory);
5461 				status = vnode_path_to_vnode(directory, clonedName, true, 0,
5462 					kernel, &vnode, NULL);
5463 				if (status != B_OK)
5464 					return status;
5465 
5466 				putter.SetTo(vnode);
5467 			}
5468 
5469 			if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type()))
5470 				return B_LINK_LIMIT;
5471 
5472 			int fd = open_vnode(vnode, openMode & ~O_CREAT, kernel);
5473 			// on success keep the vnode reference for the FD
5474 			if (fd >= 0)
5475 				putter.Detach();
5476 
5477 			return fd;
5478 		}
5479 
5480 		// it doesn't exist yet -- try to create it
5481 
5482 		if (!HAS_FS_CALL(directory, create))
5483 			return B_READ_ONLY_DEVICE;
5484 
5485 		status = FS_CALL(directory, create, name, openMode | O_EXCL, perms,
5486 			&cookie, &newID);
5487 		if (status != B_OK
5488 			&& ((openMode & O_EXCL) != 0 || status != B_FILE_EXISTS)) {
5489 			return status;
5490 		}
5491 	}
5492 
5493 	if (status != B_OK)
5494 		return status;
5495 
5496 	// the node has been created successfully
5497 
5498 	rw_lock_read_lock(&sVnodeLock);
5499 	vnode = lookup_vnode(directory->device, newID);
5500 	rw_lock_read_unlock(&sVnodeLock);
5501 
5502 	if (vnode == NULL) {
5503 		panic("vfs: fs_create() returned success but there is no vnode, "
5504 			"mount ID %" B_PRIdDEV "!\n", directory->device);
5505 		return B_BAD_VALUE;
5506 	}
5507 
5508 	int fd = get_new_fd(FDTYPE_FILE, NULL, vnode, cookie, openMode, kernel);
5509 	if (fd >= 0)
5510 		return fd;
5511 
5512 	status = fd;
5513 
5514 	// something went wrong, clean up
5515 
5516 	FS_CALL(vnode, close, cookie);
5517 	FS_CALL(vnode, free_cookie, cookie);
5518 	put_vnode(vnode);
5519 
5520 	FS_CALL(directory, unlink, name);
5521 
5522 	return status;
5523 }
5524 
5525 
5526 /*! Calls fs open_dir() on the given vnode and returns a new
5527 	file descriptor for it
5528 */
5529 static int
5530 open_dir_vnode(struct vnode* vnode, bool kernel)
5531 {
5532 	if (!HAS_FS_CALL(vnode, open_dir))
5533 		return B_UNSUPPORTED;
5534 
5535 	void* cookie;
5536 	status_t status = FS_CALL(vnode, open_dir, &cookie);
5537 	if (status != B_OK)
5538 		return status;
5539 
5540 	// directory is opened, create a fd
5541 	status = get_new_fd(FDTYPE_DIR, NULL, vnode, cookie, O_CLOEXEC, kernel);
5542 	if (status >= 0)
5543 		return status;
5544 
5545 	FS_CALL(vnode, close_dir, cookie);
5546 	FS_CALL(vnode, free_dir_cookie, cookie);
5547 
5548 	return status;
5549 }
5550 
5551 
5552 /*! Calls fs open_attr_dir() on the given vnode and returns a new
5553 	file descriptor for it.
5554 	Used by attr_dir_open(), and attr_dir_open_fd().
5555 */
5556 static int
5557 open_attr_dir_vnode(struct vnode* vnode, bool kernel)
5558 {
5559 	if (!HAS_FS_CALL(vnode, open_attr_dir))
5560 		return B_UNSUPPORTED;
5561 
5562 	void* cookie;
5563 	status_t status = FS_CALL(vnode, open_attr_dir, &cookie);
5564 	if (status != B_OK)
5565 		return status;
5566 
5567 	// directory is opened, create a fd
5568 	status = get_new_fd(FDTYPE_ATTR_DIR, NULL, vnode, cookie, O_CLOEXEC,
5569 		kernel);
5570 	if (status >= 0)
5571 		return status;
5572 
5573 	FS_CALL(vnode, close_attr_dir, cookie);
5574 	FS_CALL(vnode, free_attr_dir_cookie, cookie);
5575 
5576 	return status;
5577 }
5578 
5579 
5580 static int
5581 file_create_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5582 	int openMode, int perms, bool kernel)
5583 {
5584 	FUNCTION(("file_create_entry_ref: name = '%s', omode %x, perms %d, "
5585 		"kernel %d\n", name, openMode, perms, kernel));
5586 
5587 	// get directory to put the new file in
5588 	struct vnode* directory;
5589 	status_t status = get_vnode(mountID, directoryID, &directory, true, false);
5590 	if (status != B_OK)
5591 		return status;
5592 
5593 	status = create_vnode(directory, name, openMode, perms, kernel);
5594 	put_vnode(directory);
5595 
5596 	return status;
5597 }
5598 
5599 
5600 static int
5601 file_create(int fd, char* path, int openMode, int perms, bool kernel)
5602 {
5603 	FUNCTION(("file_create: path '%s', omode %x, perms %d, kernel %d\n", path,
5604 		openMode, perms, kernel));
5605 
5606 	// get directory to put the new file in
5607 	char name[B_FILE_NAME_LENGTH];
5608 	struct vnode* directory;
5609 	status_t status = fd_and_path_to_dir_vnode(fd, path, &directory, name,
5610 		kernel);
5611 	if (status < 0)
5612 		return status;
5613 
5614 	status = create_vnode(directory, name, openMode, perms, kernel);
5615 
5616 	put_vnode(directory);
5617 	return status;
5618 }
5619 
5620 
5621 static int
5622 file_open_entry_ref(dev_t mountID, ino_t directoryID, const char* name,
5623 	int openMode, bool kernel)
5624 {
5625 	if (name == NULL || *name == '\0')
5626 		return B_BAD_VALUE;
5627 
5628 	FUNCTION(("file_open_entry_ref(ref = (%" B_PRId32 ", %" B_PRId64 ", %s), "
5629 		"openMode = %d)\n", mountID, directoryID, name, openMode));
5630 
5631 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5632 
5633 	// get the vnode matching the entry_ref
5634 	struct vnode* vnode;
5635 	status_t status = entry_ref_to_vnode(mountID, directoryID, name, traverse,
5636 		kernel, &vnode);
5637 	if (status != B_OK)
5638 		return status;
5639 
5640 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5641 		put_vnode(vnode);
5642 		return B_LINK_LIMIT;
5643 	}
5644 
5645 	int newFD = open_vnode(vnode, openMode, kernel);
5646 	if (newFD >= 0) {
5647 		// The vnode reference has been transferred to the FD
5648 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache, mountID,
5649 			directoryID, vnode->id, name);
5650 	} else
5651 		put_vnode(vnode);
5652 
5653 	return newFD;
5654 }
5655 
5656 
5657 static int
5658 file_open(int fd, char* path, int openMode, bool kernel)
5659 {
5660 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
5661 
5662 	FUNCTION(("file_open: fd: %d, entry path = '%s', omode %d, kernel %d\n",
5663 		fd, path, openMode, kernel));
5664 
5665 	// get the vnode matching the vnode + path combination
5666 	struct vnode* vnode;
5667 	ino_t parentID;
5668 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode,
5669 		&parentID, kernel);
5670 	if (status != B_OK)
5671 		return status;
5672 
5673 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
5674 		put_vnode(vnode);
5675 		return B_LINK_LIMIT;
5676 	}
5677 
5678 	// open the vnode
5679 	int newFD = open_vnode(vnode, openMode, kernel);
5680 	if (newFD >= 0) {
5681 		// The vnode reference has been transferred to the FD
5682 		cache_node_opened(vnode, FDTYPE_FILE, vnode->cache,
5683 			vnode->device, parentID, vnode->id, NULL);
5684 	} else
5685 		put_vnode(vnode);
5686 
5687 	return newFD;
5688 }
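
#if 0
// Illustrative userland view (not compiled; the path is hypothetical):
// with O_NOFOLLOW, the S_ISLNK() checks above make the open fail instead
// of following a symlink leaf.
int fd = open("/tmp/a_symlink", O_RDWR | O_NOFOLLOW);
if (fd < 0) {
	// the leaf was a symbolic link (B_LINK_LIMIT in kernel terms)
}
#endif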
5689 
5690 
5691 static status_t
5692 file_close(struct file_descriptor* descriptor)
5693 {
5694 	struct vnode* vnode = descriptor->u.vnode;
5695 	status_t status = B_OK;
5696 
5697 	FUNCTION(("file_close(descriptor = %p)\n", descriptor));
5698 
5699 	cache_node_closed(vnode, FDTYPE_FILE, vnode->cache, vnode->device,
5700 		vnode->id);
5701 	if (HAS_FS_CALL(vnode, close)) {
5702 		status = FS_CALL(vnode, close, descriptor->cookie);
5703 	}
5704 
5705 	if (status == B_OK) {
5706 		// remove all outstanding locks for this team
5707 		if (HAS_FS_CALL(vnode, release_lock))
5708 			status = FS_CALL(vnode, release_lock, descriptor->cookie, NULL);
5709 		else
5710 			status = release_advisory_lock(vnode, NULL, descriptor, NULL);
5711 	}
5712 	return status;
5713 }
5714 
5715 
5716 static void
5717 file_free_fd(struct file_descriptor* descriptor)
5718 {
5719 	struct vnode* vnode = descriptor->u.vnode;
5720 
5721 	if (vnode != NULL) {
5722 		FS_CALL(vnode, free_cookie, descriptor->cookie);
5723 		put_vnode(vnode);
5724 	}
5725 }
5726 
5727 
5728 static status_t
5729 file_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
5730 	size_t* length)
5731 {
5732 	struct vnode* vnode = descriptor->u.vnode;
5733 	FUNCTION(("file_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
5734 		pos, length, *length));
5735 
5736 	if (S_ISDIR(vnode->Type()))
5737 		return B_IS_A_DIRECTORY;
5738 
5739 	return FS_CALL(vnode, read, descriptor->cookie, pos, buffer, length);
5740 }
5741 
5742 
5743 static status_t
5744 file_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
5745 	size_t* length)
5746 {
5747 	struct vnode* vnode = descriptor->u.vnode;
5748 	FUNCTION(("file_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
5749 		length));
5750 
5751 	if (S_ISDIR(vnode->Type()))
5752 		return B_IS_A_DIRECTORY;
5753 	if (!HAS_FS_CALL(vnode, write))
5754 		return B_READ_ONLY_DEVICE;
5755 
5756 	return FS_CALL(vnode, write, descriptor->cookie, pos, buffer, length);
5757 }
5758 
5759 
5760 static off_t
5761 file_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
5762 {
5763 	struct vnode* vnode = descriptor->u.vnode;
5764 	off_t offset;
5765 	bool isDevice = false;
5766 
5767 	FUNCTION(("file_seek(pos = %" B_PRIdOFF ", seekType = %d)\n", pos,
5768 		seekType));
5769 
5770 	// some kinds of files are not seekable
5771 	switch (vnode->Type() & S_IFMT) {
5772 		case S_IFIFO:
5773 		case S_IFSOCK:
5774 			return ESPIPE;
5775 
5776 		// drivers may publish block devices as character devices, so handle both
5777 		case S_IFBLK:
5778 		case S_IFCHR:
5779 			isDevice = true;
5780 			break;
5781 		// The Open Group Base Specs don't single out any file types besides
5782 		// pipes, FIFOs, and sockets, so we allow seeking all other types.
5783 		case S_IFREG:
5784 		case S_IFDIR:
5785 		case S_IFLNK:
5786 			break;
5787 	}
5788 
5789 	switch (seekType) {
5790 		case SEEK_SET:
5791 			offset = 0;
5792 			break;
5793 		case SEEK_CUR:
5794 			offset = descriptor->pos;
5795 			break;
5796 		case SEEK_END:
5797 		{
5798 			// stat() the node
5799 			if (!HAS_FS_CALL(vnode, read_stat))
5800 				return B_UNSUPPORTED;
5801 
5802 			struct stat stat;
5803 			status_t status = FS_CALL(vnode, read_stat, &stat);
5804 			if (status != B_OK)
5805 				return status;
5806 
5807 			offset = stat.st_size;
5808 
5809 			if (offset == 0 && isDevice) {
5810 				// stat() on drivers usually doesn't report a size
5811 				device_geometry geometry;
5812 
5813 				if (HAS_FS_CALL(vnode, ioctl)) {
5814 					status = FS_CALL(vnode, ioctl, descriptor->cookie,
5815 						B_GET_GEOMETRY, &geometry, sizeof(geometry));
5816 					if (status == B_OK)
5817 						offset = (off_t)geometry.bytes_per_sector
5818 							* geometry.sectors_per_track
5819 							* geometry.cylinder_count
5820 							* geometry.head_count;
5821 				}
5822 			}
5823 
5824 			break;
5825 		}
5826 		case SEEK_DATA:
5827 		case SEEK_HOLE:
5828 		{
5829 			status_t status = B_BAD_VALUE;
5830 			if (HAS_FS_CALL(vnode, ioctl)) {
5831 				offset = pos;
5832 				status = FS_CALL(vnode, ioctl, descriptor->cookie,
5833 					seekType == SEEK_DATA ? FIOSEEKDATA : FIOSEEKHOLE,
5834 					&offset, sizeof(offset));
5835 				if (status == B_OK) {
5836 					if (offset > pos)
5837 						offset -= pos;
5838 					break;
5839 				}
5840 			}
5841 			if (status != B_BAD_VALUE && status != B_DEV_INVALID_IOCTL)
5842 				return status;
5843 
5844 			// basic implementation that stat()s the node
5845 			if (!HAS_FS_CALL(vnode, read_stat) || isDevice)
5846 				return B_BAD_VALUE;
5847 
5848 			struct stat stat;
5849 			status = FS_CALL(vnode, read_stat, &stat);
5850 			if (status != B_OK)
5851 				return status;
5852 
5853 			off_t end = stat.st_size;
5854 			if (pos >= end)
5855 				return ENXIO;
5856 			offset = seekType == SEEK_HOLE ? end - pos : 0;
5857 			break;
5858 		}
5859 		default:
5860 			return B_BAD_VALUE;
5861 	}
5862 
5863 	// assumes off_t is 64 bits wide
5864 	if (offset > 0 && LONGLONG_MAX - offset < pos)
5865 		return B_BUFFER_OVERFLOW;
5866 
5867 	pos += offset;
5868 	if (pos < 0)
5869 		return B_BAD_VALUE;
5870 
5871 	return descriptor->pos = pos;
5872 }
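
// Worked example for the SEEK_END device branch in file_seek() (the numbers
// are illustrative): a device reporting 512 bytes/sector, 63 sectors/track,
// 16 heads, and 1024 cylinders yields
//   512 * 63 * 16 * 1024 = 528482304 bytes
// as the end offset to seek relative to.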
5873 
5874 
5875 static status_t
5876 file_select(struct file_descriptor* descriptor, uint8 event,
5877 	struct selectsync* sync)
5878 {
5879 	FUNCTION(("file_select(%p, %u, %p)\n", descriptor, event, sync));
5880 
5881 	struct vnode* vnode = descriptor->u.vnode;
5882 
5883 	// If the FS has no select() hook, notify select() now.
5884 	if (!HAS_FS_CALL(vnode, select)) {
5885 		if (!SELECT_TYPE_IS_OUTPUT_ONLY(event))
5886 			return notify_select_event(sync, event);
5887 		else
5888 			return B_OK;
5889 	}
5890 
5891 	return FS_CALL(vnode, select, descriptor->cookie, event, sync);
5892 }
5893 
5894 
5895 static status_t
5896 file_deselect(struct file_descriptor* descriptor, uint8 event,
5897 	struct selectsync* sync)
5898 {
5899 	struct vnode* vnode = descriptor->u.vnode;
5900 
5901 	if (!HAS_FS_CALL(vnode, deselect))
5902 		return B_OK;
5903 
5904 	return FS_CALL(vnode, deselect, descriptor->cookie, event, sync);
5905 }
5906 
5907 
5908 static status_t
5909 dir_create_entry_ref(dev_t mountID, ino_t parentID, const char* name, int perms,
5910 	bool kernel)
5911 {
5912 	struct vnode* vnode;
5913 	status_t status;
5914 
5915 	if (name == NULL || *name == '\0')
5916 		return B_BAD_VALUE;
5917 
5918 	FUNCTION(("dir_create_entry_ref(dev = %" B_PRId32 ", ino = %" B_PRId64 ", "
5919 		"name = '%s', perms = %d)\n", mountID, parentID, name, perms));
5920 
5921 	status = get_vnode(mountID, parentID, &vnode, true, false);
5922 	if (status != B_OK)
5923 		return status;
5924 
5925 	if (HAS_FS_CALL(vnode, create_dir))
5926 		status = FS_CALL(vnode, create_dir, name, perms);
5927 	else
5928 		status = B_READ_ONLY_DEVICE;
5929 
5930 	put_vnode(vnode);
5931 	return status;
5932 }
5933 
5934 
5935 static status_t
5936 dir_create(int fd, char* path, int perms, bool kernel)
5937 {
5938 	char filename[B_FILE_NAME_LENGTH];
5939 	struct vnode* vnode;
5940 	status_t status;
5941 
5942 	FUNCTION(("dir_create: path '%s', perms %d, kernel %d\n", path, perms,
5943 		kernel));
5944 
5945 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
5946 	if (status < 0)
5947 		return status;
5948 
5949 	if (HAS_FS_CALL(vnode, create_dir)) {
5950 		status = FS_CALL(vnode, create_dir, filename, perms);
5951 	} else
5952 		status = B_READ_ONLY_DEVICE;
5953 
5954 	put_vnode(vnode);
5955 	return status;
5956 }
5957 
5958 
5959 static int
5960 dir_open_entry_ref(dev_t mountID, ino_t parentID, const char* name, bool kernel)
5961 {
5962 	FUNCTION(("dir_open_entry_ref()\n"));
5963 
5964 	if (name && name[0] == '\0')
5965 		return B_BAD_VALUE;
5966 
5967 	// get the vnode matching the entry_ref/node_ref
5968 	struct vnode* vnode;
5969 	status_t status;
5970 	if (name) {
5971 		status = entry_ref_to_vnode(mountID, parentID, name, true, kernel,
5972 			&vnode);
5973 	} else
5974 		status = get_vnode(mountID, parentID, &vnode, true, false);
5975 	if (status != B_OK)
5976 		return status;
5977 
5978 	int newFD = open_dir_vnode(vnode, kernel);
5979 	if (newFD >= 0) {
5980 		// The vnode reference has been transferred to the FD
5981 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, mountID, parentID,
5982 			vnode->id, name);
5983 	} else
5984 		put_vnode(vnode);
5985 
5986 	return newFD;
5987 }
5988 
5989 
5990 static int
5991 dir_open(int fd, char* path, bool kernel)
5992 {
5993 	FUNCTION(("dir_open: fd: %d, entry path = '%s', kernel %d\n", fd, path,
5994 		kernel));
5995 
5996 	// get the vnode matching the vnode + path combination
5997 	struct vnode* vnode = NULL;
5998 	ino_t parentID;
5999 	status_t status = fd_and_path_to_vnode(fd, path, true, &vnode, &parentID,
6000 		kernel);
6001 	if (status != B_OK)
6002 		return status;
6003 
6004 	// open the dir
6005 	int newFD = open_dir_vnode(vnode, kernel);
6006 	if (newFD >= 0) {
6007 		// The vnode reference has been transferred to the FD
6008 		cache_node_opened(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
6009 			parentID, vnode->id, NULL);
6010 	} else
6011 		put_vnode(vnode);
6012 
6013 	return newFD;
6014 }
6015 
6016 
6017 static status_t
6018 dir_close(struct file_descriptor* descriptor)
6019 {
6020 	struct vnode* vnode = descriptor->u.vnode;
6021 
6022 	FUNCTION(("dir_close(descriptor = %p)\n", descriptor));
6023 
6024 	cache_node_closed(vnode, FDTYPE_DIR, vnode->cache, vnode->device,
6025 		vnode->id);
6026 	if (HAS_FS_CALL(vnode, close_dir))
6027 		return FS_CALL(vnode, close_dir, descriptor->cookie);
6028 
6029 	return B_OK;
6030 }
6031 
6032 
6033 static void
6034 dir_free_fd(struct file_descriptor* descriptor)
6035 {
6036 	struct vnode* vnode = descriptor->u.vnode;
6037 
6038 	if (vnode != NULL) {
6039 		FS_CALL(vnode, free_dir_cookie, descriptor->cookie);
6040 		put_vnode(vnode);
6041 	}
6042 }
6043 
6044 
6045 static status_t
6046 dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6047 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6048 {
6049 	return dir_read(ioContext, descriptor->u.vnode, descriptor->cookie, buffer,
6050 		bufferSize, _count);
6051 }
6052 
6053 
6054 static status_t
6055 fix_dirent(struct vnode* parent, struct dirent* entry,
6056 	struct io_context* ioContext)
6057 {
6058 	// set d_pdev and d_pino
6059 	entry->d_pdev = parent->device;
6060 	entry->d_pino = parent->id;
6061 
6062 	// If this is the ".." entry and the directory is covering another
6063 	// vnode, we need to replace d_dev and d_ino with the actual values.
6064 	if (strcmp(entry->d_name, "..") == 0 && parent->IsCovering()) {
6065 		return resolve_covered_parent(parent, &entry->d_dev, &entry->d_ino,
6066 			ioContext);
6067 	}
6068 
6069 	// resolve covered vnodes
6070 	ReadLocker _(&sVnodeLock);
6071 
6072 	struct vnode* vnode = lookup_vnode(entry->d_dev, entry->d_ino);
6073 	if (vnode != NULL && vnode->covered_by != NULL) {
6074 		do {
6075 			vnode = vnode->covered_by;
6076 		} while (vnode->covered_by != NULL);
6077 
6078 		entry->d_dev = vnode->device;
6079 		entry->d_ino = vnode->id;
6080 	}
6081 
6082 	return B_OK;
6083 }
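
// Example (path illustrative): with a volume mounted on /mnt, reading the
// parent directory initially yields a dirent carrying the d_dev/d_ino of
// the covered "mnt" directory. fix_dirent() follows the covered_by chain up
// to the root vnode of the topmost mounted volume, so userland sees the
// effective identity of the mount point instead.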
6084 
6085 
6086 static status_t
6087 dir_read(struct io_context* ioContext, struct vnode* vnode, void* cookie,
6088 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6089 {
6090 	if (!HAS_FS_CALL(vnode, read_dir))
6091 		return B_UNSUPPORTED;
6092 
6093 	status_t error = FS_CALL(vnode, read_dir, cookie, buffer, bufferSize,
6094 		_count);
6095 	if (error != B_OK)
6096 		return error;
6097 
6098 	// we need to adjust the dirents that were read
6099 	uint32 count = *_count;
6100 	for (uint32 i = 0; i < count; i++) {
6101 		error = fix_dirent(vnode, buffer, ioContext);
6102 		if (error != B_OK)
6103 			return error;
6104 
6105 		buffer = (struct dirent*)((uint8*)buffer + buffer->d_reclen);
6106 	}
6107 
6108 	return error;
6109 }
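
// Sketch of the packed buffer the adjustment loop above walks (widths are
// illustrative): each record starts d_reclen bytes after the previous one.
//
//   +---- d_reclen ----+---- d_reclen ----+-- ...
//   |   dirent "foo"   |   dirent "bar"   |
//   +------------------+------------------+
//
// fix_dirent() only rewrites d_pdev/d_pino and d_dev/d_ino in place; it
// never changes d_reclen, so the walk stays valid.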
6110 
6111 
6112 static status_t
6113 dir_rewind(struct file_descriptor* descriptor)
6114 {
6115 	struct vnode* vnode = descriptor->u.vnode;
6116 
6117 	if (HAS_FS_CALL(vnode, rewind_dir)) {
6118 		return FS_CALL(vnode, rewind_dir, descriptor->cookie);
6119 	}
6120 
6121 	return B_UNSUPPORTED;
6122 }
6123 
6124 
6125 static status_t
6126 dir_remove(int fd, char* path, bool kernel)
6127 {
6128 	char name[B_FILE_NAME_LENGTH];
6129 	struct vnode* directory;
6130 	status_t status;
6131 
6132 	if (path != NULL) {
6133 		// we need to make sure our path name doesn't end with "/", ".",
6134 		// or ".."
6135 		char* lastSlash;
6136 		while ((lastSlash = strrchr(path, '/')) != NULL) {
6137 			char* leaf = lastSlash + 1;
6138 			if (!strcmp(leaf, ".."))
6139 				return B_NOT_ALLOWED;
6140 
6141 			// omit multiple slashes
6142 			while (lastSlash > path && lastSlash[-1] == '/')
6143 				lastSlash--;
6144 
6145 			if (leaf[0] != '\0'
6146 				&& strcmp(leaf, ".") != 0) {
6147 				break;
6148 			}
6149 			// "name/" -> "name", or "name/." -> "name"
6150 			lastSlash[0] = '\0';
6151 		}
6152 
6153 		if (!strcmp(path, ".") || !strcmp(path, ".."))
6154 			return B_NOT_ALLOWED;
6155 	}
6156 
6157 	status = fd_and_path_to_dir_vnode(fd, path, &directory, name, kernel);
6158 	if (status != B_OK)
6159 		return status;
6160 
6161 	if (HAS_FS_CALL(directory, remove_dir))
6162 		status = FS_CALL(directory, remove_dir, name);
6163 	else
6164 		status = B_READ_ONLY_DEVICE;
6165 
6166 	put_vnode(directory);
6167 	return status;
6168 }
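
// Worked examples for the trimming loop in dir_remove() (paths
// illustrative):
//   "a/b/"   -> "a/b"           (trailing slash dropped)
//   "a/b//"  -> "a/b"           (slash run collapsed, then dropped)
//   "a/b/."  -> "a/b"           (trailing "." dropped)
//   "a/b/.." -> B_NOT_ALLOWED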
6169 
6170 
6171 static status_t
6172 common_ioctl(struct file_descriptor* descriptor, ulong op, void* buffer,
6173 	size_t length)
6174 {
6175 	struct vnode* vnode = descriptor->u.vnode;
6176 
6177 	if (HAS_FS_CALL(vnode, ioctl))
6178 		return FS_CALL(vnode, ioctl, descriptor->cookie, op, buffer, length);
6179 
6180 	return B_DEV_INVALID_IOCTL;
6181 }
6182 
6183 
6184 static status_t
6185 common_fcntl(int fd, int op, size_t argument, bool kernel)
6186 {
6187 	struct flock flock;
6188 
6189 	FUNCTION(("common_fcntl(fd = %d, op = %d, argument = %lx, %s)\n",
6190 		fd, op, argument, kernel ? "kernel" : "user"));
6191 
6192 	struct io_context* context = get_current_io_context(kernel);
6193 
6194 	struct file_descriptor* descriptor = get_fd(context, fd);
6195 	if (descriptor == NULL)
6196 		return B_FILE_ERROR;
6197 
6198 	struct vnode* vnode = fd_vnode(descriptor);
6199 
6200 	status_t status = B_OK;
6201 
6202 	if (op == F_SETLK || op == F_SETLKW || op == F_GETLK) {
6203 		if (descriptor->type != FDTYPE_FILE)
6204 			status = B_BAD_VALUE;
6205 		else if (kernel)
6206 			memcpy(&flock, (struct flock*)argument, sizeof(struct flock));
6207 		else if (user_memcpy(&flock, (struct flock*)argument,
6208 				sizeof(struct flock)) != B_OK)
6209 			status = B_BAD_ADDRESS;
6210 		if (status != B_OK) {
6211 			put_fd(descriptor);
6212 			return status;
6213 		}
6214 	}
6215 
6216 	switch (op) {
6217 		case F_SETFD:
6218 		{
6219 			// Set file descriptor flags
6220 
6221 			// FD_CLOEXEC is the only flag available at this time
6222 			mutex_lock(&context->io_mutex);
6223 			fd_set_close_on_exec(context, fd, (argument & FD_CLOEXEC) != 0);
6224 			mutex_unlock(&context->io_mutex);
6225 
6226 			status = B_OK;
6227 			break;
6228 		}
6229 
6230 		case F_GETFD:
6231 		{
6232 			// Get file descriptor flags
6233 			mutex_lock(&context->io_mutex);
6234 			status = fd_close_on_exec(context, fd) ? FD_CLOEXEC : 0;
6235 			mutex_unlock(&context->io_mutex);
6236 			break;
6237 		}
6238 
6239 		case F_SETFL:
6240 			// Set file descriptor open mode
6241 
6242 			// we only accept changes to O_APPEND and O_NONBLOCK
6243 			argument &= O_APPEND | O_NONBLOCK;
6244 			if (descriptor->ops->fd_set_flags != NULL) {
6245 				status = descriptor->ops->fd_set_flags(descriptor, argument);
6246 			} else if (vnode != NULL && HAS_FS_CALL(vnode, set_flags)) {
6247 				status = FS_CALL(vnode, set_flags, descriptor->cookie,
6248 					(int)argument);
6249 			} else
6250 				status = B_UNSUPPORTED;
6251 
6252 			if (status == B_OK) {
6253 				// update this descriptor's open_mode field
6254 				descriptor->open_mode = (descriptor->open_mode
6255 					& ~(O_APPEND | O_NONBLOCK)) | argument;
6256 			}
6257 
6258 			break;
6259 
6260 		case F_GETFL:
6261 			// Get file descriptor open mode
6262 			status = descriptor->open_mode;
6263 			break;
6264 
6265 		case F_DUPFD:
6266 		case F_DUPFD_CLOEXEC:
6267 		{
6268 			status = new_fd_etc(context, descriptor, (int)argument);
6269 			if (status >= 0) {
6270 				mutex_lock(&context->io_mutex);
6271 				fd_set_close_on_exec(context, status, op == F_DUPFD_CLOEXEC);
6272 				mutex_unlock(&context->io_mutex);
6273 
6274 				atomic_add(&descriptor->ref_count, 1);
6275 			}
6276 			break;
6277 		}
6278 
6279 		case F_GETLK:
6280 			if (vnode != NULL) {
6281 				struct flock normalizedLock;
6282 
6283 				memcpy(&normalizedLock, &flock, sizeof(struct flock));
6284 				status = normalize_flock(descriptor, &normalizedLock);
6285 				if (status != B_OK)
6286 					break;
6287 
6288 				if (HAS_FS_CALL(vnode, test_lock)) {
6289 					status = FS_CALL(vnode, test_lock, descriptor->cookie,
6290 						&normalizedLock);
6291 				} else
6292 					status = test_advisory_lock(vnode, &normalizedLock);
6293 				if (status == B_OK) {
6294 					if (normalizedLock.l_type == F_UNLCK) {
6295 						// no conflicting lock found, copy back the same struct
6296 						// we were given except change type to F_UNLCK
6297 						flock.l_type = F_UNLCK;
6298 						if (kernel) {
6299 							memcpy((struct flock*)argument, &flock,
6300 								sizeof(struct flock));
6301 						} else {
6302 							status = user_memcpy((struct flock*)argument,
6303 								&flock, sizeof(struct flock));
6304 						}
6305 					} else {
6306 						// a conflicting lock was found, copy back its range and
6307 						// type
6308 						if (normalizedLock.l_len == OFF_MAX)
6309 							normalizedLock.l_len = 0;
6310 
6311 						if (kernel) {
6312 							memcpy((struct flock*)argument,
6313 								&normalizedLock, sizeof(struct flock));
6314 						} else {
6315 							status = user_memcpy((struct flock*)argument,
6316 								&normalizedLock, sizeof(struct flock));
6317 						}
6318 					}
6319 				}
6320 			} else
6321 				status = B_BAD_VALUE;
6322 			break;
6323 
6324 		case F_SETLK:
6325 		case F_SETLKW:
6326 			status = normalize_flock(descriptor, &flock);
6327 			if (status != B_OK)
6328 				break;
6329 
6330 			if (vnode == NULL) {
6331 				status = B_BAD_VALUE;
6332 			} else if (flock.l_type == F_UNLCK) {
6333 				if (HAS_FS_CALL(vnode, release_lock)) {
6334 					status = FS_CALL(vnode, release_lock, descriptor->cookie,
6335 						&flock);
6336 				} else {
6337 					status = release_advisory_lock(vnode, context, NULL,
6338 						&flock);
6339 				}
6340 			} else {
6341 				// the open mode must match the lock type
6342 				if (((descriptor->open_mode & O_RWMASK) == O_RDONLY
6343 						&& flock.l_type == F_WRLCK)
6344 					|| ((descriptor->open_mode & O_RWMASK) == O_WRONLY
6345 						&& flock.l_type == F_RDLCK))
6346 					status = B_FILE_ERROR;
6347 				else {
6348 					if (HAS_FS_CALL(vnode, acquire_lock)) {
6349 						status = FS_CALL(vnode, acquire_lock,
6350 							descriptor->cookie, &flock, op == F_SETLKW);
6351 					} else {
6352 						status = acquire_advisory_lock(vnode, context, NULL,
6353 							&flock, op == F_SETLKW);
6354 					}
6355 				}
6356 			}
6357 			break;
6358 
6359 		// ToDo: add support for more ops?
6360 
6361 		default:
6362 			status = B_BAD_VALUE;
6363 	}
6364 
6365 	put_fd(descriptor);
6366 	return status;
6367 }
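
#if 0
// Illustrative userland counterpart (not compiled; the range is an example):
// acquiring an advisory write lock on the first 100 bytes of a file, which
// takes the F_SETLK path above. F_SETLKW would block on conflicts instead
// of failing.
struct flock lock;
lock.l_type = F_WRLCK;
lock.l_whence = SEEK_SET;
lock.l_start = 0;
lock.l_len = 100;
if (fcntl(fd, F_SETLK, &lock) < 0) {
	// a conflicting lock is held; F_GETLK would report its range and type
}
#endif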
6368 
6369 
6370 static status_t
6371 common_sync(int fd, bool kernel)
6372 {
6373 	struct file_descriptor* descriptor;
6374 	struct vnode* vnode;
6375 	status_t status;
6376 
6377 	FUNCTION(("common_sync: entry. fd %d kernel %d\n", fd, kernel));
6378 
6379 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6380 	if (descriptor == NULL)
6381 		return B_FILE_ERROR;
6382 
6383 	if (HAS_FS_CALL(vnode, fsync))
6384 		status = FS_CALL_NO_PARAMS(vnode, fsync);
6385 	else
6386 		status = B_UNSUPPORTED;
6387 
6388 	put_fd(descriptor);
6389 	return status;
6390 }
6391 
6392 
6393 static status_t
6394 common_lock_node(int fd, bool kernel)
6395 {
6396 	struct file_descriptor* descriptor;
6397 	struct vnode* vnode;
6398 
6399 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6400 	if (descriptor == NULL)
6401 		return B_FILE_ERROR;
6402 
6403 	status_t status = B_OK;
6404 
6405 	// We need to set the lock atomically - someone else might set one
6406 	// at the same time
6407 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by, descriptor,
6408 			(file_descriptor*)NULL) != NULL)
6409 		status = B_BUSY;
6410 
6411 	put_fd(descriptor);
6412 	return status;
6413 }
6414 
6415 
6416 static status_t
6417 common_unlock_node(int fd, bool kernel)
6418 {
6419 	struct file_descriptor* descriptor;
6420 	struct vnode* vnode;
6421 
6422 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
6423 	if (descriptor == NULL)
6424 		return B_FILE_ERROR;
6425 
6426 	status_t status = B_OK;
6427 
6428 	// We need to clear the lock atomically - someone else might change
6429 	// it at the same time
6430 	if (atomic_pointer_test_and_set(&vnode->mandatory_locked_by,
6431 			(file_descriptor*)NULL, descriptor) != descriptor)
6432 		status = B_BAD_VALUE;
6433 
6434 	put_fd(descriptor);
6435 	return status;
6436 }
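
// Both helpers above rely on a compare-and-swap on mandatory_locked_by:
// locking succeeds only if the field was still NULL, unlocking only if it
// still holds this very descriptor. Sketched as pseudocode:
//
//   lock:   old = CAS(&vnode->mandatory_locked_by, descriptor, NULL);
//           success iff old == NULL
//   unlock: old = CAS(&vnode->mandatory_locked_by, NULL, descriptor);
//           success iff old == descriptor
//
// (atomic_pointer_test_and_set() takes the new value first, then the value
// to test against, and returns the previous value.)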
6437 
6438 
6439 static status_t
6440 common_preallocate(int fd, off_t offset, off_t length, bool kernel)
6441 {
6442 	CObjectDeleter<struct file_descriptor, void, put_fd> descriptor;
6443 	struct vnode* vnode;
6444 
6445 	if (offset < 0 || length == 0)
6446 		return B_BAD_VALUE;
6447 	if (offset > OFF_MAX - length)
6448 		return B_FILE_TOO_LARGE;
6449 
6450 	descriptor.SetTo(get_fd_and_vnode(fd, &vnode, kernel));
6451 	if (!descriptor.IsSet() || (descriptor->open_mode & O_RWMASK) == O_RDONLY)
6452 		return B_FILE_ERROR;
6453 
6454 	switch (vnode->Type() & S_IFMT) {
6455 		case S_IFIFO:
6456 		case S_IFSOCK:
6457 			return ESPIPE;
6458 
6459 		case S_IFBLK:
6460 		case S_IFCHR:
6461 		case S_IFDIR:
6462 		case S_IFLNK:
6463 			return B_DEVICE_NOT_FOUND;
6464 
6465 		case S_IFREG:
6466 			break;
6467 	}
6468 
6469 	status_t status = B_OK;
6470 	if (HAS_FS_CALL(vnode, preallocate)) {
6471 		status = FS_CALL(vnode, preallocate, offset, length);
6472 	} else {
6473 		status = HAS_FS_CALL(vnode, write)
6474 			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6475 	}
6476 
6477 	return status;
6478 }
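
#if 0
// Illustrative userland counterpart (not compiled; the size is an example):
// POSIX posix_fallocate(), which should end up in common_preallocate().
// It fails e.g. when the FS is writable but provides no preallocate hook.
int result = posix_fallocate(fd, 0, 16 * 1024 * 1024);
if (result != 0) {
	// preallocation not supported, or no space left
}
#endif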
6479 
6480 
6481 static status_t
6482 common_read_link(int fd, char* path, char* buffer, size_t* _bufferSize,
6483 	bool kernel)
6484 {
6485 	struct vnode* vnode;
6486 	status_t status;
6487 
6488 	status = fd_and_path_to_vnode(fd, path, false, &vnode, NULL, kernel);
6489 	if (status != B_OK)
6490 		return status;
6491 
6492 	if (HAS_FS_CALL(vnode, read_symlink)) {
6493 		status = FS_CALL(vnode, read_symlink, buffer, _bufferSize);
6494 	} else
6495 		status = B_BAD_VALUE;
6496 
6497 	put_vnode(vnode);
6498 	return status;
6499 }
6500 
6501 
6502 static status_t
6503 common_create_symlink(int fd, char* path, const char* toPath, int mode,
6504 	bool kernel)
6505 {
6506 	// path validity checks have to be in the calling function!
6507 	char name[B_FILE_NAME_LENGTH];
6508 	struct vnode* vnode;
6509 	status_t status;
6510 
6511 	FUNCTION(("common_create_symlink(fd = %d, path = %s, toPath = %s, "
6512 		"mode = %d, kernel = %d)\n", fd, path, toPath, mode, kernel));
6513 
6514 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, name, kernel);
6515 	if (status != B_OK)
6516 		return status;
6517 
6518 	if (HAS_FS_CALL(vnode, create_symlink))
6519 		status = FS_CALL(vnode, create_symlink, name, toPath, mode);
6520 	else {
6521 		status = HAS_FS_CALL(vnode, write)
6522 			? B_UNSUPPORTED : B_READ_ONLY_DEVICE;
6523 	}
6524 
6525 	put_vnode(vnode);
6526 
6527 	return status;
6528 }
6529 
6530 
6531 static status_t
6532 common_create_link(int pathFD, char* path, int toFD, char* toPath,
6533 	bool traverseLeafLink, bool kernel)
6534 {
6535 	// path validity checks have to be in the calling function!
6536 
6537 	FUNCTION(("common_create_link(path = %s, toPath = %s, kernel = %d)\n", path,
6538 		toPath, kernel));
6539 
6540 	char name[B_FILE_NAME_LENGTH];
6541 	struct vnode* directory;
6542 	status_t status = fd_and_path_to_dir_vnode(pathFD, path, &directory, name,
6543 		kernel);
6544 	if (status != B_OK)
6545 		return status;
6546 
6547 	struct vnode* vnode;
6548 	status = fd_and_path_to_vnode(toFD, toPath, traverseLeafLink, &vnode, NULL,
6549 		kernel);
6550 	if (status != B_OK)
6551 		goto err;
6552 
6553 	if (directory->mount != vnode->mount) {
6554 		status = B_CROSS_DEVICE_LINK;
6555 		goto err1;
6556 	}
6557 
6558 	if (HAS_FS_CALL(directory, link))
6559 		status = FS_CALL(directory, link, name, vnode);
6560 	else
6561 		status = B_READ_ONLY_DEVICE;
6562 
6563 err1:
6564 	put_vnode(vnode);
6565 err:
6566 	put_vnode(directory);
6567 
6568 	return status;
6569 }
6570 
6571 
6572 static status_t
6573 common_unlink(int fd, char* path, bool kernel)
6574 {
6575 	char filename[B_FILE_NAME_LENGTH];
6576 	struct vnode* vnode;
6577 	status_t status;
6578 
6579 	FUNCTION(("common_unlink: fd: %d, path '%s', kernel %d\n", fd, path,
6580 		kernel));
6581 
6582 	status = fd_and_path_to_dir_vnode(fd, path, &vnode, filename, kernel);
6583 	if (status < 0)
6584 		return status;
6585 
6586 	if (HAS_FS_CALL(vnode, unlink))
6587 		status = FS_CALL(vnode, unlink, filename);
6588 	else
6589 		status = B_READ_ONLY_DEVICE;
6590 
6591 	put_vnode(vnode);
6592 
6593 	return status;
6594 }
6595 
6596 
6597 static status_t
6598 common_access(int fd, char* path, int mode, bool effectiveUserGroup, bool kernel)
6599 {
6600 	struct vnode* vnode;
6601 	status_t status;
6602 
6603 	// TODO: honor effectiveUserGroup argument
6604 
6605 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
6606 	if (status != B_OK)
6607 		return status;
6608 
6609 	if (HAS_FS_CALL(vnode, access))
6610 		status = FS_CALL(vnode, access, mode);
6611 	else
6612 		status = B_OK;
6613 
6614 	put_vnode(vnode);
6615 
6616 	return status;
6617 }
6618 
6619 
6620 static status_t
6621 common_rename(int fd, char* path, int newFD, char* newPath, bool kernel)
6622 {
6623 	struct vnode* fromVnode;
6624 	struct vnode* toVnode;
6625 	char fromName[B_FILE_NAME_LENGTH];
6626 	char toName[B_FILE_NAME_LENGTH];
6627 	status_t status;
6628 
6629 	FUNCTION(("common_rename(fd = %d, path = %s, newFD = %d, newPath = %s, "
6630 		"kernel = %d)\n", fd, path, newFD, newPath, kernel));
6631 
6632 	status = fd_and_path_to_dir_vnode(fd, path, &fromVnode, fromName, kernel);
6633 	if (status != B_OK)
6634 		return status;
6635 
6636 	status = fd_and_path_to_dir_vnode(newFD, newPath, &toVnode, toName, kernel);
6637 	if (status != B_OK)
6638 		goto err1;
6639 
6640 	if (fromVnode->device != toVnode->device) {
6641 		status = B_CROSS_DEVICE_LINK;
6642 		goto err2;
6643 	}
6644 
6645 	if (fromName[0] == '\0' || toName[0] == '\0'
6646 		|| !strcmp(fromName, ".") || !strcmp(fromName, "..")
6647 		|| !strcmp(toName, ".") || !strcmp(toName, "..")
6648 		|| (fromVnode == toVnode && !strcmp(fromName, toName))) {
6649 		status = B_BAD_VALUE;
6650 		goto err2;
6651 	}
6652 
6653 	if (HAS_FS_CALL(fromVnode, rename))
6654 		status = FS_CALL(fromVnode, rename, fromName, toVnode, toName);
6655 	else
6656 		status = B_READ_ONLY_DEVICE;
6657 
6658 err2:
6659 	put_vnode(toVnode);
6660 err1:
6661 	put_vnode(fromVnode);
6662 
6663 	return status;
6664 }
6665 
6666 
6667 static status_t
6668 common_read_stat(struct file_descriptor* descriptor, struct stat* stat)
6669 {
6670 	struct vnode* vnode = descriptor->u.vnode;
6671 
6672 	FUNCTION(("common_read_stat: stat %p\n", stat));
6673 
6674 	// TODO: remove this once all file systems properly set them!
6675 	stat->st_crtim.tv_nsec = 0;
6676 	stat->st_ctim.tv_nsec = 0;
6677 	stat->st_mtim.tv_nsec = 0;
6678 	stat->st_atim.tv_nsec = 0;
6679 
6680 	return vfs_stat_vnode(vnode, stat);
6681 }
6682 
6683 
6684 static status_t
6685 common_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
6686 	int statMask)
6687 {
6688 	struct vnode* vnode = descriptor->u.vnode;
6689 
6690 	FUNCTION(("common_write_stat(vnode = %p, stat = %p, statMask = %d)\n",
6691 		vnode, stat, statMask));
6692 
6693 	if ((descriptor->open_mode & O_RWMASK) == O_RDONLY
6694 		&& (statMask & B_STAT_SIZE) != 0) {
6695 		return B_BAD_VALUE;
6696 	}
6697 
6698 	if (!HAS_FS_CALL(vnode, write_stat))
6699 		return B_READ_ONLY_DEVICE;
6700 
6701 	return FS_CALL(vnode, write_stat, stat, statMask);
6702 }
6703 
6704 
6705 static status_t
6706 common_path_read_stat(int fd, char* path, bool traverseLeafLink,
6707 	struct stat* stat, bool kernel)
6708 {
6709 	FUNCTION(("common_path_read_stat: fd: %d, path '%s', stat %p\n", fd, path,
6710 		stat));
6711 
6712 	struct vnode* vnode;
6713 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6714 		NULL, kernel);
6715 	if (status != B_OK)
6716 		return status;
6717 
6718 	status = vfs_stat_vnode(vnode, stat);
6719 
6720 	put_vnode(vnode);
6721 	return status;
6722 }
6723 
6724 
6725 static status_t
6726 common_path_write_stat(int fd, char* path, bool traverseLeafLink,
6727 	const struct stat* stat, int statMask, bool kernel)
6728 {
6729 	FUNCTION(("common_path_write_stat: fd: %d, path '%s', stat %p, stat_mask %d, "
6730 		"kernel %d\n", fd, path, stat, statMask, kernel));
6731 
6732 	struct vnode* vnode;
6733 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6734 		NULL, kernel);
6735 	if (status != B_OK)
6736 		return status;
6737 
6738 	if (HAS_FS_CALL(vnode, write_stat))
6739 		status = FS_CALL(vnode, write_stat, stat, statMask);
6740 	else
6741 		status = B_READ_ONLY_DEVICE;
6742 
6743 	put_vnode(vnode);
6744 
6745 	return status;
6746 }
6747 
6748 
6749 static int
6750 attr_dir_open(int fd, char* path, bool traverseLeafLink, bool kernel)
6751 {
6752 	FUNCTION(("attr_dir_open(fd = %d, path = '%s', kernel = %d)\n", fd, path,
6753 		kernel));
6754 
6755 	struct vnode* vnode;
6756 	status_t status = fd_and_path_to_vnode(fd, path, traverseLeafLink, &vnode,
6757 		NULL, kernel);
6758 	if (status != B_OK)
6759 		return status;
6760 
6761 	status = open_attr_dir_vnode(vnode, kernel);
6762 	if (status < 0)
6763 		put_vnode(vnode);
6764 
6765 	return status;
6766 }
6767 
6768 
6769 static status_t
6770 attr_dir_close(struct file_descriptor* descriptor)
6771 {
6772 	struct vnode* vnode = descriptor->u.vnode;
6773 
6774 	FUNCTION(("attr_dir_close(descriptor = %p)\n", descriptor));
6775 
6776 	if (HAS_FS_CALL(vnode, close_attr_dir))
6777 		return FS_CALL(vnode, close_attr_dir, descriptor->cookie);
6778 
6779 	return B_OK;
6780 }
6781 
6782 
6783 static void
6784 attr_dir_free_fd(struct file_descriptor* descriptor)
6785 {
6786 	struct vnode* vnode = descriptor->u.vnode;
6787 
6788 	if (vnode != NULL) {
6789 		FS_CALL(vnode, free_attr_dir_cookie, descriptor->cookie);
6790 		put_vnode(vnode);
6791 	}
6792 }
6793 
6794 
6795 static status_t
6796 attr_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
6797 	struct dirent* buffer, size_t bufferSize, uint32* _count)
6798 {
6799 	struct vnode* vnode = descriptor->u.vnode;
6800 
6801 	FUNCTION(("attr_dir_read(descriptor = %p)\n", descriptor));
6802 
6803 	if (HAS_FS_CALL(vnode, read_attr_dir))
6804 		return FS_CALL(vnode, read_attr_dir, descriptor->cookie, buffer,
6805 			bufferSize, _count);
6806 
6807 	return B_UNSUPPORTED;
6808 }
6809 
6810 
6811 static status_t
6812 attr_dir_rewind(struct file_descriptor* descriptor)
6813 {
6814 	struct vnode* vnode = descriptor->u.vnode;
6815 
6816 	FUNCTION(("attr_dir_rewind(descriptor = %p)\n", descriptor));
6817 
6818 	if (HAS_FS_CALL(vnode, rewind_attr_dir))
6819 		return FS_CALL(vnode, rewind_attr_dir, descriptor->cookie);
6820 
6821 	return B_UNSUPPORTED;
6822 }
6823 
6824 
6825 static int
6826 attr_create(int fd, char* path, const char* name, uint32 type,
6827 	int openMode, bool kernel)
6828 {
6829 	if (name == NULL || *name == '\0')
6830 		return B_BAD_VALUE;
6831 
6832 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6833 	struct vnode* vnode;
6834 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6835 		kernel);
6836 	if (status != B_OK)
6837 		return status;
6838 
6839 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6840 		status = B_LINK_LIMIT;
6841 		goto err;
6842 	}
6843 
6844 	if (!HAS_FS_CALL(vnode, create_attr)) {
6845 		status = B_READ_ONLY_DEVICE;
6846 		goto err;
6847 	}
6848 
6849 	void* cookie;
6850 	status = FS_CALL(vnode, create_attr, name, type, openMode, &cookie);
6851 	if (status != B_OK)
6852 		goto err;
6853 
6854 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6855 	if (fd >= 0)
6856 		return fd;
6857 
6858 	status = fd;
6859 
6860 	FS_CALL(vnode, close_attr, cookie);
6861 	FS_CALL(vnode, free_attr_cookie, cookie);
6862 
6863 	FS_CALL(vnode, remove_attr, name);
6864 
6865 err:
6866 	put_vnode(vnode);
6867 
6868 	return status;
6869 }
6870 
6871 
6872 static int
6873 attr_open(int fd, char* path, const char* name, int openMode, bool kernel)
6874 {
6875 	if (name == NULL || *name == '\0')
6876 		return B_BAD_VALUE;
6877 
6878 	bool traverse = (openMode & (O_NOTRAVERSE | O_NOFOLLOW)) == 0;
6879 	struct vnode* vnode;
6880 	status_t status = fd_and_path_to_vnode(fd, path, traverse, &vnode, NULL,
6881 		kernel);
6882 	if (status != B_OK)
6883 		return status;
6884 
6885 	if ((openMode & O_NOFOLLOW) != 0 && S_ISLNK(vnode->Type())) {
6886 		status = B_LINK_LIMIT;
6887 		goto err;
6888 	}
6889 
6890 	if (!HAS_FS_CALL(vnode, open_attr)) {
6891 		status = B_UNSUPPORTED;
6892 		goto err;
6893 	}
6894 
6895 	void* cookie;
6896 	status = FS_CALL(vnode, open_attr, name, openMode, &cookie);
6897 	if (status != B_OK)
6898 		goto err;
6899 
6900 	// now we only need a file descriptor for this attribute and we're done
6901 	fd = get_new_fd(FDTYPE_ATTR, NULL, vnode, cookie, openMode, kernel);
6902 	if (fd >= 0)
6903 		return fd;
6904 
6905 	status = fd;
6906 
6907 	FS_CALL(vnode, close_attr, cookie);
6908 	FS_CALL(vnode, free_attr_cookie, cookie);
6909 
6910 err:
6911 	put_vnode(vnode);
6912 
6913 	return status;
6914 }
6915 
6916 
6917 static status_t
6918 attr_close(struct file_descriptor* descriptor)
6919 {
6920 	struct vnode* vnode = descriptor->u.vnode;
6921 
6922 	FUNCTION(("attr_close(descriptor = %p)\n", descriptor));
6923 
6924 	if (HAS_FS_CALL(vnode, close_attr))
6925 		return FS_CALL(vnode, close_attr, descriptor->cookie);
6926 
6927 	return B_OK;
6928 }
6929 
6930 
6931 static void
6932 attr_free_fd(struct file_descriptor* descriptor)
6933 {
6934 	struct vnode* vnode = descriptor->u.vnode;
6935 
6936 	if (vnode != NULL) {
6937 		FS_CALL(vnode, free_attr_cookie, descriptor->cookie);
6938 		put_vnode(vnode);
6939 	}
6940 }
6941 
6942 
6943 static status_t
6944 attr_read(struct file_descriptor* descriptor, off_t pos, void* buffer,
6945 	size_t* length)
6946 {
6947 	struct vnode* vnode = descriptor->u.vnode;
6948 
6949 	FUNCTION(("attr_read: buf %p, pos %" B_PRIdOFF ", len %p = %ld\n", buffer,
6950 		pos, length, *length));
6951 
6952 	if (!HAS_FS_CALL(vnode, read_attr))
6953 		return B_UNSUPPORTED;
6954 
6955 	return FS_CALL(vnode, read_attr, descriptor->cookie, pos, buffer, length);
6956 }
6957 
6958 
6959 static status_t
6960 attr_write(struct file_descriptor* descriptor, off_t pos, const void* buffer,
6961 	size_t* length)
6962 {
6963 	struct vnode* vnode = descriptor->u.vnode;
6964 
6965 	FUNCTION(("attr_write: buf %p, pos %" B_PRIdOFF ", len %p\n", buffer, pos,
6966 		length));
6967 
6968 	if (!HAS_FS_CALL(vnode, write_attr))
6969 		return B_UNSUPPORTED;
6970 
6971 	return FS_CALL(vnode, write_attr, descriptor->cookie, pos, buffer, length);
6972 }
6973 
6974 
6975 static off_t
6976 attr_seek(struct file_descriptor* descriptor, off_t pos, int seekType)
6977 {
6978 	off_t offset;
6979 
6980 	switch (seekType) {
6981 		case SEEK_SET:
6982 			offset = 0;
6983 			break;
6984 		case SEEK_CUR:
6985 			offset = descriptor->pos;
6986 			break;
6987 		case SEEK_END:
6988 		{
6989 			struct vnode* vnode = descriptor->u.vnode;
6990 			if (!HAS_FS_CALL(vnode, read_attr_stat))
6991 				return B_UNSUPPORTED;
6992 
6993 			struct stat stat;
6994 			status_t status = FS_CALL(vnode, read_attr_stat, descriptor->cookie,
6995 				&stat);
6996 			if (status != B_OK)
6997 				return status;
6998 
6999 			offset = stat.st_size;
7000 			break;
7001 		}
7002 		default:
7003 			return B_BAD_VALUE;
7004 	}
7005 
7006 	// assumes off_t is 64 bits wide
7007 	if (offset > 0 && LONGLONG_MAX - offset < pos)
7008 		return B_BUFFER_OVERFLOW;
7009 
7010 	pos += offset;
7011 	if (pos < 0)
7012 		return B_BAD_VALUE;
7013 
7014 	return descriptor->pos = pos;
7015 }
7016 
7017 
7018 static status_t
7019 attr_read_stat(struct file_descriptor* descriptor, struct stat* stat)
7020 {
7021 	struct vnode* vnode = descriptor->u.vnode;
7022 
7023 	FUNCTION(("attr_read_stat: stat 0x%p\n", stat));
7024 
7025 	if (!HAS_FS_CALL(vnode, read_attr_stat))
7026 		return B_UNSUPPORTED;
7027 
7028 	return FS_CALL(vnode, read_attr_stat, descriptor->cookie, stat);
7029 }
7030 
7031 
7032 static status_t
7033 attr_write_stat(struct file_descriptor* descriptor, const struct stat* stat,
7034 	int statMask)
7035 {
7036 	struct vnode* vnode = descriptor->u.vnode;
7037 
7038 	FUNCTION(("attr_write_stat: stat = %p, statMask %d\n", stat, statMask));
7039 
7040 	if (!HAS_FS_CALL(vnode, write_attr_stat))
7041 		return B_READ_ONLY_DEVICE;
7042 
7043 	return FS_CALL(vnode, write_attr_stat, descriptor->cookie, stat, statMask);
7044 }
7045 
7046 
7047 static status_t
7048 attr_remove(int fd, const char* name, bool kernel)
7049 {
7050 	struct file_descriptor* descriptor;
7051 	struct vnode* vnode;
7052 	status_t status;
7053 
7054 	if (name == NULL || *name == '\0')
7055 		return B_BAD_VALUE;
7056 
7057 	FUNCTION(("attr_remove: fd = %d, name = \"%s\", kernel %d\n", fd, name,
7058 		kernel));
7059 
7060 	descriptor = get_fd_and_vnode(fd, &vnode, kernel);
7061 	if (descriptor == NULL)
7062 		return B_FILE_ERROR;
7063 
7064 	if (HAS_FS_CALL(vnode, remove_attr))
7065 		status = FS_CALL(vnode, remove_attr, name);
7066 	else
7067 		status = B_READ_ONLY_DEVICE;
7068 
7069 	put_fd(descriptor);
7070 
7071 	return status;
7072 }
7073 
7074 
7075 static status_t
7076 attr_rename(int fromFD, const char* fromName, int toFD, const char* toName,
7077 	bool kernel)
7078 {
7079 	struct file_descriptor* fromDescriptor;
7080 	struct file_descriptor* toDescriptor;
7081 	struct vnode* fromVnode;
7082 	struct vnode* toVnode;
7083 	status_t status;
7084 
7085 	if (fromName == NULL || *fromName == '\0' || toName == NULL
7086 		|| *toName == '\0')
7087 		return B_BAD_VALUE;
7088 
7089 	FUNCTION(("attr_rename: from fd = %d, from name = \"%s\", to fd = %d, to "
7090 		"name = \"%s\", kernel %d\n", fromFD, fromName, toFD, toName, kernel));
7091 
7092 	fromDescriptor = get_fd_and_vnode(fromFD, &fromVnode, kernel);
7093 	if (fromDescriptor == NULL)
7094 		return B_FILE_ERROR;
7095 
7096 	toDescriptor = get_fd_and_vnode(toFD, &toVnode, kernel);
7097 	if (toDescriptor == NULL) {
7098 		status = B_FILE_ERROR;
7099 		goto err;
7100 	}
7101 
7102 	// are the files on the same volume?
7103 	if (fromVnode->device != toVnode->device) {
7104 		status = B_CROSS_DEVICE_LINK;
7105 		goto err1;
7106 	}
7107 
7108 	if (HAS_FS_CALL(fromVnode, rename_attr)) {
7109 		status = FS_CALL(fromVnode, rename_attr, fromName, toVnode, toName);
7110 	} else
7111 		status = B_READ_ONLY_DEVICE;
7112 
7113 err1:
7114 	put_fd(toDescriptor);
7115 err:
7116 	put_fd(fromDescriptor);
7117 
7118 	return status;
7119 }
7120 
7121 
7122 static int
7123 index_dir_open(dev_t mountID, bool kernel)
7124 {
7125 	struct fs_mount* mount;
7126 	void* cookie;
7127 
7128 	FUNCTION(("index_dir_open(mountID = %" B_PRId32 ", kernel = %d)\n", mountID,
7129 		kernel));
7130 
7131 	status_t status = get_mount(mountID, &mount);
7132 	if (status != B_OK)
7133 		return status;
7134 
7135 	if (!HAS_FS_MOUNT_CALL(mount, open_index_dir)) {
7136 		status = B_UNSUPPORTED;
7137 		goto error;
7138 	}
7139 
7140 	status = FS_MOUNT_CALL(mount, open_index_dir, &cookie);
7141 	if (status != B_OK)
7142 		goto error;
7143 
7144 	// get fd for the index directory
7145 	int fd;
7146 	fd = get_new_fd(FDTYPE_INDEX_DIR, mount, NULL, cookie, O_CLOEXEC, kernel);
7147 	if (fd >= 0)
7148 		return fd;
7149 
7150 	// something went wrong
7151 	FS_MOUNT_CALL(mount, close_index_dir, cookie);
7152 	FS_MOUNT_CALL(mount, free_index_dir_cookie, cookie);
7153 
7154 	status = fd;
7155 
7156 error:
7157 	put_mount(mount);
7158 	return status;
7159 }
7160 
7161 
7162 static status_t
7163 index_dir_close(struct file_descriptor* descriptor)
7164 {
7165 	struct fs_mount* mount = descriptor->u.mount;
7166 
7167 	FUNCTION(("index_dir_close(descriptor = %p)\n", descriptor));
7168 
7169 	if (HAS_FS_MOUNT_CALL(mount, close_index_dir))
7170 		return FS_MOUNT_CALL(mount, close_index_dir, descriptor->cookie);
7171 
7172 	return B_OK;
7173 }
7174 
7175 
7176 static void
7177 index_dir_free_fd(struct file_descriptor* descriptor)
7178 {
7179 	struct fs_mount* mount = descriptor->u.mount;
7180 
7181 	if (mount != NULL) {
7182 		FS_MOUNT_CALL(mount, free_index_dir_cookie, descriptor->cookie);
7183 		put_mount(mount);
7184 	}
7185 }
7186 
7187 
7188 static status_t
7189 index_dir_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7190 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7191 {
7192 	struct fs_mount* mount = descriptor->u.mount;
7193 
7194 	if (HAS_FS_MOUNT_CALL(mount, read_index_dir)) {
7195 		return FS_MOUNT_CALL(mount, read_index_dir, descriptor->cookie, buffer,
7196 			bufferSize, _count);
7197 	}
7198 
7199 	return B_UNSUPPORTED;
7200 }
7201 
7202 
7203 static status_t
7204 index_dir_rewind(struct file_descriptor* descriptor)
7205 {
7206 	struct fs_mount* mount = descriptor->u.mount;
7207 
7208 	if (HAS_FS_MOUNT_CALL(mount, rewind_index_dir))
7209 		return FS_MOUNT_CALL(mount, rewind_index_dir, descriptor->cookie);
7210 
7211 	return B_UNSUPPORTED;
7212 }
7213 
7214 
7215 static status_t
7216 index_create(dev_t mountID, const char* name, uint32 type, uint32 flags,
7217 	bool kernel)
7218 {
7219 	FUNCTION(("index_create(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7220 		mountID, name, kernel));
7221 
7222 	struct fs_mount* mount;
7223 	status_t status = get_mount(mountID, &mount);
7224 	if (status != B_OK)
7225 		return status;
7226 
7227 	if (!HAS_FS_MOUNT_CALL(mount, create_index)) {
7228 		status = B_READ_ONLY_DEVICE;
7229 		goto out;
7230 	}
7231 
7232 	status = FS_MOUNT_CALL(mount, create_index, name, type, flags);
7233 
7234 out:
7235 	put_mount(mount);
7236 	return status;
7237 }
7238 
7239 
7240 #if 0
7241 static status_t
7242 index_read_stat(struct file_descriptor* descriptor, struct stat* stat)
7243 {
7244 	struct vnode* vnode = descriptor->u.vnode;
7245 
7246 	// ToDo: currently unused!
7247 	FUNCTION(("index_read_stat: stat 0x%p\n", stat));
7248 	if (!HAS_FS_CALL(vnode, read_index_stat))
7249 		return B_UNSUPPORTED;
7250 
7251 	return B_UNSUPPORTED;
7252 	//return FS_CALL(vnode, read_index_stat, descriptor->cookie, stat);
7253 }
7254 
7255 
7256 static void
7257 index_free_fd(struct file_descriptor* descriptor)
7258 {
7259 	struct vnode* vnode = descriptor->u.vnode;
7260 
7261 	if (vnode != NULL) {
7262 		FS_CALL(vnode, free_index_cookie, descriptor->cookie);
7263 		put_vnode(vnode);
7264 	}
7265 }
7266 #endif
7267 
7268 
7269 static status_t
7270 index_name_read_stat(dev_t mountID, const char* name, struct stat* stat,
7271 	bool kernel)
7272 {
7273 	FUNCTION(("index_name_read_stat(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7274 		mountID, name, kernel));
7275 
7276 	struct fs_mount* mount;
7277 	status_t status = get_mount(mountID, &mount);
7278 	if (status != B_OK)
7279 		return status;
7280 
7281 	if (!HAS_FS_MOUNT_CALL(mount, read_index_stat)) {
7282 		status = B_UNSUPPORTED;
7283 		goto out;
7284 	}
7285 
7286 	status = FS_MOUNT_CALL(mount, read_index_stat, name, stat);
7287 
7288 out:
7289 	put_mount(mount);
7290 	return status;
7291 }
7292 
7293 
7294 static status_t
7295 index_remove(dev_t mountID, const char* name, bool kernel)
7296 {
7297 	FUNCTION(("index_remove(mountID = %" B_PRId32 ", name = %s, kernel = %d)\n",
7298 		mountID, name, kernel));
7299 
7300 	struct fs_mount* mount;
7301 	status_t status = get_mount(mountID, &mount);
7302 	if (status != B_OK)
7303 		return status;
7304 
7305 	if (!HAS_FS_MOUNT_CALL(mount, remove_index)) {
7306 		status = B_READ_ONLY_DEVICE;
7307 		goto out;
7308 	}
7309 
7310 	status = FS_MOUNT_CALL(mount, remove_index, name);
7311 
7312 out:
7313 	put_mount(mount);
7314 	return status;
7315 }
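
#if 0
// Illustrative userland counterparts (not compiled): the Storage Kit's C
// index API from <fs_index.h>; the attribute name is an example.
fs_create_index(device, "LastName", B_STRING_TYPE, 0);	// -> index_create()
fs_remove_index(device, "LastName");					// -> index_remove()
#endif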
7316 
7317 
7318 /*!	TODO: the query FS API is still pretty much the same as in R5.
7319 		It would be nice if file systems got some more kernel support
7320 		for queries.
7321 		For example, query parsing should be moved into the kernel.
7322 */
7323 static int
7324 query_open(dev_t device, const char* query, uint32 flags, port_id port,
7325 	int32 token, bool kernel)
7326 {
7327 	struct fs_mount* mount;
7328 	void* cookie;
7329 
7330 	FUNCTION(("query_open(device = %" B_PRId32 ", query = \"%s\", kernel = %d)\n",
7331 		device, query, kernel));
7332 
7333 	status_t status = get_mount(device, &mount);
7334 	if (status != B_OK)
7335 		return status;
7336 
7337 	if (!HAS_FS_MOUNT_CALL(mount, open_query)) {
7338 		status = B_UNSUPPORTED;
7339 		goto error;
7340 	}
7341 
7342 	status = FS_MOUNT_CALL(mount, open_query, query, flags, port, token,
7343 		&cookie);
7344 	if (status != B_OK)
7345 		goto error;
7346 
7347 	// get fd for the query
7348 	int fd;
7349 	fd = get_new_fd(FDTYPE_QUERY, mount, NULL, cookie, O_CLOEXEC, kernel);
7350 	if (fd >= 0)
7351 		return fd;
7352 
7353 	status = fd;
7354 
7355 	// something went wrong
7356 	FS_MOUNT_CALL(mount, close_query, cookie);
7357 	FS_MOUNT_CALL(mount, free_query_cookie, cookie);
7358 
7359 error:
7360 	put_mount(mount);
7361 	return status;
7362 }
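
#if 0
// Illustrative userland counterpart (not compiled; the query string is an
// example): the Storage Kit's C query API from <fs_query.h>, which ends up
// in query_open() above.
DIR* query = fs_open_query(device, "name==\"*.cpp\"", 0);
if (query != NULL) {
	while (struct dirent* entry = fs_read_query(query))
		printf("%s\n", entry->d_name);
	fs_close_query(query);
}
#endif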
7363 
7364 
7365 static status_t
7366 query_close(struct file_descriptor* descriptor)
7367 {
7368 	struct fs_mount* mount = descriptor->u.mount;
7369 
7370 	FUNCTION(("query_close(descriptor = %p)\n", descriptor));
7371 
7372 	if (HAS_FS_MOUNT_CALL(mount, close_query))
7373 		return FS_MOUNT_CALL(mount, close_query, descriptor->cookie);
7374 
7375 	return B_OK;
7376 }
7377 
7378 
7379 static void
7380 query_free_fd(struct file_descriptor* descriptor)
7381 {
7382 	struct fs_mount* mount = descriptor->u.mount;
7383 
7384 	if (mount != NULL) {
7385 		FS_MOUNT_CALL(mount, free_query_cookie, descriptor->cookie);
7386 		put_mount(mount);
7387 	}
7388 }
7389 
7390 
7391 static status_t
7392 query_read(struct io_context* ioContext, struct file_descriptor* descriptor,
7393 	struct dirent* buffer, size_t bufferSize, uint32* _count)
7394 {
7395 	struct fs_mount* mount = descriptor->u.mount;
7396 
7397 	if (HAS_FS_MOUNT_CALL(mount, read_query)) {
7398 		return FS_MOUNT_CALL(mount, read_query, descriptor->cookie, buffer,
7399 			bufferSize, _count);
7400 	}
7401 
7402 	return B_UNSUPPORTED;
7403 }
7404 
7405 
7406 static status_t
7407 query_rewind(struct file_descriptor* descriptor)
7408 {
7409 	struct fs_mount* mount = descriptor->u.mount;
7410 
7411 	if (HAS_FS_MOUNT_CALL(mount, rewind_query))
7412 		return FS_MOUNT_CALL(mount, rewind_query, descriptor->cookie);
7413 
7414 	return B_UNSUPPORTED;
7415 }
7416 
7417 
7418 //	#pragma mark - General File System functions
7419 
7420 
7421 static dev_t
7422 fs_mount(char* path, const char* device, const char* fsName, uint32 flags,
7423 	const char* args, bool kernel)
7424 {
7425 	struct ::fs_mount* mount;
7426 	status_t status = B_OK;
7427 	fs_volume* volume = NULL;
7428 	int32 layer = 0;
7429 	Vnode* coveredNode = NULL;
7430 
7431 	FUNCTION(("fs_mount: path = '%s', device = '%s', fs_name = '%s', flags = %#"
7432 		B_PRIx32 ", args = '%s'\n", path, device, fsName, flags, args));
7433 
7434 	// The path is always safe; we just have to make sure that fsName is
7435 	// at least superficially valid - we can't make any assumptions about
7436 	// args, though. A NULL fsName is OK if a device was given and the FS
7437 	// is not virtual; we'll get the name from the DDM later.
7438 	if (fsName == NULL) {
7439 		if (!device || flags & B_MOUNT_VIRTUAL_DEVICE)
7440 			return B_BAD_VALUE;
7441 	} else if (fsName[0] == '\0')
7442 		return B_BAD_VALUE;
7443 
7444 	RecursiveLocker mountOpLocker(sMountOpLock);
7445 
7446 	// Helper to delete a newly created file device on failure.
7447 	// Not exactly beautiful, but helps to keep the code below cleaner.
7448 	struct FileDeviceDeleter {
7449 		FileDeviceDeleter() : id(-1) {}
7450 		~FileDeviceDeleter()
7451 		{
7452 			KDiskDeviceManager::Default()->DeleteFileDevice(id);
7453 		}
7454 
7455 		partition_id id;
7456 	} fileDeviceDeleter;
7457 
7458 	// If the file system is not a "virtual" one, the device argument should
7459 	// point to a real file/device (if given at all).
7460 	// get the partition
7461 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7462 	KPartition* partition = NULL;
7463 	KPath normalizedDevice;
7464 	bool newlyCreatedFileDevice = false;
7465 
7466 	if (!(flags & B_MOUNT_VIRTUAL_DEVICE) && device != NULL) {
7467 		// normalize the device path
7468 		status = normalizedDevice.SetTo(device, true);
7469 		if (status != B_OK)
7470 			return status;
7471 
7472 		// get a corresponding partition from the DDM
7473 		partition = ddm->RegisterPartition(normalizedDevice.Path());
7474 		if (partition == NULL) {
7475 			// Partition not found: this either means that the user supplied
7476 			// an invalid path, or that the path refers to an image file. We
7477 			// try to let the DDM create a file device for the path.
7478 			partition_id deviceID = ddm->CreateFileDevice(
7479 				normalizedDevice.Path(), &newlyCreatedFileDevice);
7480 			if (deviceID >= 0) {
7481 				partition = ddm->RegisterPartition(deviceID);
7482 				if (newlyCreatedFileDevice)
7483 					fileDeviceDeleter.id = deviceID;
7484 			}
7485 		}
7486 
7487 		if (!partition) {
7488 			TRACE(("fs_mount(): Partition `%s' not found.\n",
7489 				normalizedDevice.Path()));
7490 			return B_ENTRY_NOT_FOUND;
7491 		}
7492 
7493 		device = normalizedDevice.Path();
7494 			// correct path to file device
7495 	}
7496 	PartitionRegistrar partitionRegistrar(partition, true);
7497 
7498 	// Write-lock the partition's device. For the time being, we keep the
7499 	// lock until we're done mounting -- not nice, but it ensures that no
7500 	// one interferes.
7501 	// TODO: Just mark the partition busy while mounting!
7502 	KDiskDevice* diskDevice = NULL;
7503 	if (partition) {
7504 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7505 		if (!diskDevice) {
7506 			TRACE(("fs_mount(): Failed to lock disk device!\n"));
7507 			return B_ERROR;
7508 		}
7509 	}
7510 
7511 	DeviceWriteLocker writeLocker(diskDevice, true);
7512 		// this takes over the write lock acquired before
7513 
7514 	if (partition != NULL) {
7515 		// make sure that the partition is not busy
7516 		if (partition->IsBusy()) {
7517 			TRACE(("fs_mount(): Partition is busy.\n"));
7518 			return B_BUSY;
7519 		}
7520 
7521 		// if no FS name had been supplied, we get it from the partition
7522 		if (fsName == NULL) {
7523 			KDiskSystem* diskSystem = partition->DiskSystem();
7524 			if (!diskSystem) {
7525 				TRACE(("fs_mount(): No FS name was given, and the DDM didn't "
7526 					"recognize it.\n"));
7527 				return B_BAD_VALUE;
7528 			}
7529 
7530 			if (!diskSystem->IsFileSystem()) {
7531 				TRACE(("fs_mount(): No FS name was given, and the DDM found a "
7532 					"partitioning system.\n"));
7533 				return B_BAD_VALUE;
7534 			}
7535 
7536 			// The disk system name will not change, and the KDiskSystem
7537 			// object will not go away while the disk device is locked (and
7538 			// the partition has a reference to it), so this is safe.
7539 			fsName = diskSystem->Name();
7540 		}
7541 	}
7542 
7543 	mount = new(std::nothrow) (struct ::fs_mount);
7544 	if (mount == NULL)
7545 		return B_NO_MEMORY;
7546 
7547 	mount->device_name = strdup(device);
7548 		// "device" can be NULL
7549 
7550 	status = mount->entry_cache.Init();
7551 	if (status != B_OK)
7552 		goto err1;
7553 
7554 	// initialize structure
7555 	mount->id = sNextMountID++;
7556 	mount->partition = NULL;
7557 	mount->root_vnode = NULL;
7558 	mount->covers_vnode = NULL;
7559 	mount->unmounting = false;
7560 	mount->owns_file_device = false;
7561 	mount->volume = NULL;
7562 
7563 	// build up the volume(s)
7564 	while (true) {
7565 		char* layerFSName = get_file_system_name_for_layer(fsName, layer);
7566 		if (layerFSName == NULL) {
7567 			if (layer == 0) {
7568 				status = B_NO_MEMORY;
7569 				goto err1;
7570 			}
7571 
7572 			break;
7573 		}
7574 		MemoryDeleter layerFSNameDeleter(layerFSName);
7575 
7576 		volume = (fs_volume*)malloc(sizeof(fs_volume));
7577 		if (volume == NULL) {
7578 			status = B_NO_MEMORY;
7579 			goto err1;
7580 		}
7581 
7582 		volume->id = mount->id;
7583 		volume->partition = partition != NULL ? partition->ID() : -1;
7584 		volume->layer = layer++;
7585 		volume->private_volume = NULL;
7586 		volume->ops = NULL;
7587 		volume->sub_volume = NULL;
7588 		volume->super_volume = NULL;
7589 		volume->file_system = NULL;
7590 		volume->file_system_name = NULL;
7591 
7592 		volume->file_system_name = get_file_system_name(layerFSName);
7593 		if (volume->file_system_name == NULL) {
7594 			status = B_NO_MEMORY;
7595 			free(volume);
7596 			goto err1;
7597 		}
7598 
7599 		volume->file_system = get_file_system(layerFSName);
7600 		if (volume->file_system == NULL) {
7601 			status = B_DEVICE_NOT_FOUND;
7602 			free(volume->file_system_name);
7603 			free(volume);
7604 			goto err1;
7605 		}
7606 
7607 		if (mount->volume == NULL)
7608 			mount->volume = volume;
7609 		else {
7610 			volume->super_volume = mount->volume;
7611 			mount->volume->sub_volume = volume;
7612 			mount->volume = volume;
7613 		}
7614 	}
7615 
7616 	// insert mount struct into list before we call FS's mount() function
7617 	// so that vnodes can be created for this mount
7618 	rw_lock_write_lock(&sMountLock);
7619 	sMountsTable->Insert(mount);
7620 	rw_lock_write_unlock(&sMountLock);
7621 
7622 	ino_t rootID;
7623 
7624 	if (!sRoot) {
7625 		// we haven't mounted anything yet
7626 		if (strcmp(path, "/") != 0) {
7627 			status = B_ERROR;
7628 			goto err2;
7629 		}
7630 
7631 		status = mount->volume->file_system->mount(mount->volume, device, flags,
7632 			args, &rootID);
7633 		if (status == B_OK && mount->volume->ops == NULL)
7634 			status = B_ERROR;
				// mount() reported success but did not set up the ops;
				// don't return B_OK from the error path below
		if (status != B_OK)
			goto err2;
7635 	} else {
7636 		status = path_to_vnode(path, true, &coveredNode, NULL, kernel);
7637 		if (status != B_OK)
7638 			goto err2;
7639 
7640 		mount->covers_vnode = coveredNode;
7641 
7642 		// make sure covered_vnode is a directory
7643 		if (!S_ISDIR(coveredNode->Type())) {
7644 			status = B_NOT_A_DIRECTORY;
7645 			goto err3;
7646 		}
7647 
7648 		if (coveredNode->IsCovered()) {
7649 			// this is already a covered vnode
7650 			status = B_BUSY;
7651 			goto err3;
7652 		}
7653 
7654 		// mount it/them
7655 		fs_volume* volume = mount->volume;
7656 		while (volume) {
7657 			status = volume->file_system->mount(volume, device, flags, args,
7658 				&rootID);
7659 			if (status != B_OK || volume->ops == NULL) {
7660 				if (status == B_OK && volume->ops == NULL)
7661 					panic("fs_mount: mount() succeeded but ops is NULL!");
7662 				if (volume->sub_volume)
7663 					goto err4;
7664 				goto err3;
7665 			}
7666 
7667 			volume = volume->super_volume;
7668 		}
7669 
7670 		volume = mount->volume;
7671 		while (volume) {
7672 			if (volume->ops->all_layers_mounted != NULL)
7673 				volume->ops->all_layers_mounted(volume);
7674 			volume = volume->super_volume;
7675 		}
7676 	}
7677 
7678 	// the root node is supposed to be owned by the file system - it must
7679 	// exist at this point
7680 	rw_lock_write_lock(&sVnodeLock);
7681 	mount->root_vnode = lookup_vnode(mount->id, rootID);
7682 	if (mount->root_vnode == NULL || mount->root_vnode->ref_count != 1) {
7683 		panic("fs_mount: file system does not own its root node!\n");
7684 		status = B_ERROR;
7685 		rw_lock_write_unlock(&sVnodeLock);
7686 		goto err4;
7687 	}
7688 
7689 	// set up the links between the root vnode and the vnode it covers
7690 	if (coveredNode != NULL) {
7691 		if (coveredNode->IsCovered()) {
7692 			// the vnode is covered now
7693 			status = B_BUSY;
7694 			rw_lock_write_unlock(&sVnodeLock);
7695 			goto err4;
7696 		}
7697 
7698 		mount->root_vnode->covers = coveredNode;
7699 		mount->root_vnode->SetCovering(true);
7700 
7701 		coveredNode->covered_by = mount->root_vnode;
7702 		coveredNode->SetCovered(true);
7703 	}
7704 	rw_lock_write_unlock(&sVnodeLock);
7705 
7706 	if (!sRoot) {
7707 		sRoot = mount->root_vnode;
7708 		mutex_lock(&sIOContextRootLock);
7709 		get_current_io_context(true)->root = sRoot;
7710 		mutex_unlock(&sIOContextRootLock);
7711 		inc_vnode_ref_count(sRoot);
7712 	}
7713 
7714 	// supply the partition (if any) with the mount cookie and mark it mounted
7715 	if (partition) {
7716 		partition->SetMountCookie(mount->volume->private_volume);
7717 		partition->SetVolumeID(mount->id);
7718 
7719 		// keep a partition reference as long as the partition is mounted
7720 		partitionRegistrar.Detach();
7721 		mount->partition = partition;
7722 		mount->owns_file_device = newlyCreatedFileDevice;
7723 		fileDeviceDeleter.id = -1;
7724 	}
7725 
7726 	notify_mount(mount->id,
7727 		coveredNode != NULL ? coveredNode->device : -1,
7728 		coveredNode ? coveredNode->id : -1);
7729 
7730 	return mount->id;
7731 
7732 err4:
7733 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7734 err3:
7735 	if (coveredNode != NULL)
7736 		put_vnode(coveredNode);
7737 err2:
7738 	rw_lock_write_lock(&sMountLock);
7739 	sMountsTable->Remove(mount);
7740 	rw_lock_write_unlock(&sMountLock);
7741 err1:
7742 	delete mount;
7743 
7744 	return status;
7745 }
7746 
7747 
7748 static status_t
7749 fs_unmount(char* path, dev_t mountID, uint32 flags, bool kernel)
7750 {
7751 	struct fs_mount* mount;
7752 	status_t err;
7753 
7754 	FUNCTION(("fs_unmount(path '%s', dev %" B_PRId32 ", kernel %d\n", path,
7755 		mountID, kernel));
7756 
7757 	struct vnode* pathVnode = NULL;
7758 	if (path != NULL) {
7759 		err = path_to_vnode(path, true, &pathVnode, NULL, kernel);
7760 		if (err != B_OK)
7761 			return B_ENTRY_NOT_FOUND;
7762 	}
7763 
7764 	RecursiveLocker mountOpLocker(sMountOpLock);
7765 	ReadLocker mountLocker(sMountLock);
7766 
7767 	mount = find_mount(path != NULL ? pathVnode->device : mountID);
7768 	if (mount == NULL) {
7769 		panic("fs_unmount: find_mount() failed on root vnode @%p of mount\n",
7770 			pathVnode);
7771 	}
7772 
7773 	mountLocker.Unlock();
7774 
7775 	if (path != NULL) {
7776 		put_vnode(pathVnode);
7777 
7778 		if (mount->root_vnode != pathVnode) {
7779 			// not a mount point
7780 			return B_BAD_VALUE;
7781 		}
7782 	}
7783 
7784 	// if the volume is associated with a partition, lock the device of the
7785 	// partition as long as we are unmounting
7786 	KDiskDeviceManager* ddm = KDiskDeviceManager::Default();
7787 	KPartition* partition = mount->partition;
7788 	KDiskDevice* diskDevice = NULL;
7789 	if (partition != NULL) {
7790 		if (partition->Device() == NULL) {
7791 			dprintf("fs_unmount(): There is no device!\n");
7792 			return B_ERROR;
7793 		}
7794 		diskDevice = ddm->WriteLockDevice(partition->Device()->ID());
7795 		if (!diskDevice) {
7796 			TRACE(("fs_unmount(): Failed to lock disk device!\n"));
7797 			return B_ERROR;
7798 		}
7799 	}
7800 	DeviceWriteLocker writeLocker(diskDevice, true);
7801 
7802 	// make sure that the partition is not busy
7803 	if (partition != NULL) {
7804 		if ((flags & B_UNMOUNT_BUSY_PARTITION) == 0 && partition->IsBusy()) {
7805 			TRACE(("fs_unmount(): Partition is busy.\n"));
7806 			return B_BUSY;
7807 		}
7808 	}
7809 
7810 	// grab the vnode master mutex to keep someone from creating
7811 	// a vnode while we're figuring out if we can continue
7812 	WriteLocker vnodesWriteLocker(&sVnodeLock);
7813 
7814 	bool disconnectedDescriptors = false;
7815 
7816 	while (true) {
7817 		bool busy = false;
7818 
7819 		// cycle through the list of vnodes associated with this mount and
7820 		// make sure none of them is busy or still referenced
7821 		VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7822 		while (struct vnode* vnode = iterator.Next()) {
7823 			if (vnode->IsBusy()) {
7824 				busy = true;
7825 				break;
7826 			}
7827 
7828 			// check the vnode's ref count -- subtract additional references for
7829 			// covering
7830 			int32 refCount = vnode->ref_count;
7831 			if (vnode->covers != NULL)
7832 				refCount--;
7833 			if (vnode->covered_by != NULL)
7834 				refCount--;
7835 
7836 			if (refCount != 0) {
7837 				// there are still vnodes in use on this mount, so we cannot
7838 				// unmount yet
7839 				busy = true;
7840 				break;
7841 			}
7842 		}
7843 
7844 		if (!busy)
7845 			break;
7846 
7847 		if ((flags & B_FORCE_UNMOUNT) == 0)
7848 			return B_BUSY;
7849 
7850 		if (disconnectedDescriptors) {
7851 			// wait a bit until the last access is finished, and then try again
7852 			vnodesWriteLocker.Unlock();
7853 			snooze(100000);
7854 			// TODO: if there is some kind of bug that prevents the ref counts
7855 			// from getting back to zero, this will fall into an endless loop...
7856 			vnodesWriteLocker.Lock();
7857 			continue;
7858 		}
7859 
7860 		// the file system is still busy - but we're forced to unmount it,
7861 		// so let's disconnect all open file descriptors
7862 
7863 		mount->unmounting = true;
7864 			// prevent new vnodes from being created
7865 
7866 		vnodesWriteLocker.Unlock();
7867 
7868 		disconnect_mount_or_vnode_fds(mount, NULL);
7869 		disconnectedDescriptors = true;
7870 
7871 		vnodesWriteLocker.Lock();
7872 	}
7873 
7874 	// We can safely continue. Mark all of the vnodes busy and this mount
7875 	// structure in unmounting state. Also undo the vnode covers/covered_by
7876 	// links.
7877 	mount->unmounting = true;
7878 
7879 	VnodeList::Iterator iterator = mount->vnodes.GetIterator();
7880 	while (struct vnode* vnode = iterator.Next()) {
7881 		// Remove all covers/covered_by links from other mounts' nodes to this
7882 		// vnode and adjust the node ref count accordingly. We will release the
7883 		// references to the external vnodes below.
7884 		if (Vnode* coveredNode = vnode->covers) {
7885 			if (Vnode* coveringNode = vnode->covered_by) {
7886 				// We have both covered and covering vnodes, so just remove us
7887 				// from the chain.
7888 				coveredNode->covered_by = coveringNode;
7889 				coveringNode->covers = coveredNode;
7890 				vnode->ref_count -= 2;
7891 
7892 				vnode->covered_by = NULL;
7893 				vnode->covers = NULL;
7894 				vnode->SetCovering(false);
7895 				vnode->SetCovered(false);
7896 			} else {
7897 				// We only have a covered vnode. Remove its link to us.
7898 				coveredNode->covered_by = NULL;
7899 				coveredNode->SetCovered(false);
7900 				vnode->ref_count--;
7901 
7902 				// If the other node is an external vnode, we keep its link
7903 				// around so we can put the reference later on. Otherwise
7904 				// we get rid of it right now.
7905 				if (coveredNode->mount == mount) {
7906 					vnode->covers = NULL;
7907 					coveredNode->ref_count--;
7908 				}
7909 			}
7910 		} else if (Vnode* coveringNode = vnode->covered_by) {
7911 			// We only have a covering vnode. Remove its link to us.
7912 			coveringNode->covers = NULL;
7913 			coveringNode->SetCovering(false);
7914 			vnode->ref_count--;
7915 
7916 			// If the other node is an external vnode, we keep its link
7917 			// around so we can put the reference later on. Otherwise
7918 			// we get rid of it right now.
7919 			if (coveringNode->mount == mount) {
7920 				vnode->covered_by = NULL;
7921 				coveringNode->ref_count--;
7922 			}
7923 		}
7924 
7925 		vnode->SetBusy(true);
7926 		vnode_to_be_freed(vnode);
7927 	}
7928 
7929 	vnodesWriteLocker.Unlock();
7930 
7931 	// Free all vnodes associated with this mount.
7932 	// They will be removed from the mount list by free_vnode(), so
7933 	// we don't have to do this.
7934 	while (struct vnode* vnode = mount->vnodes.Head()) {
7935 		// Put the references to external covered/covering vnodes we kept above.
7936 		if (Vnode* coveredNode = vnode->covers)
7937 			put_vnode(coveredNode);
7938 		if (Vnode* coveringNode = vnode->covered_by)
7939 			put_vnode(coveringNode);
7940 
7941 		free_vnode(vnode, false);
7942 	}
7943 
7944 	// remove the mount structure from the hash table
7945 	rw_lock_write_lock(&sMountLock);
7946 	sMountsTable->Remove(mount);
7947 	rw_lock_write_unlock(&sMountLock);
7948 
7949 	mountOpLocker.Unlock();
7950 
7951 	FS_MOUNT_CALL_NO_PARAMS(mount, unmount);
7952 	notify_unmount(mount->id);
7953 
7954 	// dereference the partition and mark it unmounted
7955 	if (partition) {
7956 		partition->SetVolumeID(-1);
7957 		partition->SetMountCookie(NULL);
7958 
7959 		if (mount->owns_file_device)
7960 			KDiskDeviceManager::Default()->DeleteFileDevice(partition->ID());
7961 		partition->Unregister();
7962 	}
7963 
7964 	delete mount;
7965 	return B_OK;
7966 }
7967 
7968 
7969 static status_t
7970 fs_sync(dev_t device)
7971 {
7972 	struct fs_mount* mount;
7973 	status_t status = get_mount(device, &mount);
7974 	if (status != B_OK)
7975 		return status;
7976 
7977 	struct vnode marker;
7978 	memset(&marker, 0, sizeof(marker));
7979 	marker.SetBusy(true);
7980 	marker.SetRemoved(true);
7981 
7982 	// First, synchronize all file caches
7983 
7984 	while (true) {
7985 		WriteLocker locker(sVnodeLock);
7986 			// Note: That's the easy way, which is probably OK for sync(),
7987 			// since it's a relatively rare call and doesn't need to allow for
7988 			// a lot of concurrency. Using a read lock would be possible, but
7989 			// also more involved, since we would have to lock the individual
7990 			// nodes and take care of the locking order, which we might not
7991 			// want to do while holding fs_mount::lock.
7992 
7993 		// synchronize access to vnode list
7994 		mutex_lock(&mount->lock);
7995 
7996 		struct vnode* vnode;
7997 		if (!marker.IsRemoved()) {
7998 			vnode = mount->vnodes.GetNext(&marker);
7999 			mount->vnodes.Remove(&marker);
8000 			marker.SetRemoved(true);
8001 		} else
8002 			vnode = mount->vnodes.First();
8003 
8004 		while (vnode != NULL && (vnode->cache == NULL
8005 			|| vnode->IsRemoved() || vnode->IsBusy())) {
8006 			// TODO: we could track writes (and writable mapped vnodes)
8007 			//	and have a simple flag that we could test for here
8008 			vnode = mount->vnodes.GetNext(vnode);
8009 		}
8010 
8011 		if (vnode != NULL) {
8012 			// insert marker vnode again
8013 			mount->vnodes.InsertBefore(mount->vnodes.GetNext(vnode), &marker);
8014 			marker.SetRemoved(false);
8015 		}
8016 
8017 		mutex_unlock(&mount->lock);
8018 
8019 		if (vnode == NULL)
8020 			break;
8021 
8022 		vnode = lookup_vnode(mount->id, vnode->id);
8023 		if (vnode == NULL || vnode->IsBusy())
8024 			continue;
8025 
8026 		if (vnode->ref_count == 0) {
8027 			// this vnode has been unused before
8028 			vnode_used(vnode);
8029 		}
8030 		inc_vnode_ref_count(vnode);
8031 
8032 		locker.Unlock();
8033 
8034 		if (vnode->cache != NULL && !vnode->IsRemoved())
8035 			vnode->cache->WriteModified();
8036 
8037 		put_vnode(vnode);
8038 	}
8039 
8040 	// Let the file systems do their synchronizing work
8041 	if (HAS_FS_MOUNT_CALL(mount, sync))
8042 		status = FS_MOUNT_CALL_NO_PARAMS(mount, sync);
8043 
8044 	// Finally, flush the underlying device's write cache (if possible).
8045 	if (mount->partition != NULL && mount->partition->Device() != NULL)
8046 		ioctl(mount->partition->Device()->FD(), B_FLUSH_DRIVE_CACHE);
8047 
8048 	put_mount(mount);
8049 	return status;
8050 }
8051 
8052 
8053 static status_t
8054 fs_read_info(dev_t device, struct fs_info* info)
8055 {
8056 	struct fs_mount* mount;
8057 	status_t status = get_mount(device, &mount);
8058 	if (status != B_OK)
8059 		return status;
8060 
8061 	memset(info, 0, sizeof(struct fs_info));
8062 
8063 	if (HAS_FS_MOUNT_CALL(mount, read_fs_info))
8064 		status = FS_MOUNT_CALL(mount, read_fs_info, info);
8065 
8066 	// fill in info the file system doesn't (have to) know about
8067 	if (status == B_OK) {
8068 		info->dev = mount->id;
8069 		info->root = mount->root_vnode->id;
8070 
8071 		fs_volume* volume = mount->volume;
8072 		while (volume->super_volume != NULL)
8073 			volume = volume->super_volume;
8074 
8075 		strlcpy(info->fsh_name, volume->file_system_name,
8076 			sizeof(info->fsh_name));
8077 		if (mount->device_name != NULL) {
8078 			strlcpy(info->device_name, mount->device_name,
8079 				sizeof(info->device_name));
8080 		}
8081 	}
8082 
8083 	// if the call is not supported by the file system, there are still
8084 	// the parts that we filled out ourselves
8085 
8086 	put_mount(mount);
8087 	return status;
8088 }
8089 
8090 
8091 static status_t
8092 fs_write_info(dev_t device, const struct fs_info* info, int mask)
8093 {
8094 	struct fs_mount* mount;
8095 	status_t status = get_mount(device, &mount);
8096 	if (status != B_OK)
8097 		return status;
8098 
8099 	if (HAS_FS_MOUNT_CALL(mount, write_fs_info))
8100 		status = FS_MOUNT_CALL(mount, write_fs_info, info, mask);
8101 	else
8102 		status = B_READ_ONLY_DEVICE;
8103 
8104 	put_mount(mount);
8105 	return status;
8106 }
8107 
8108 
8109 static dev_t
8110 fs_next_device(int32* _cookie)
8111 {
8112 	struct fs_mount* mount = NULL;
8113 	dev_t device = *_cookie;
8114 
8115 	rw_lock_read_lock(&sMountLock);
8116 
8117 	// Since device IDs are assigned sequentially, this algorithm
8118 	// works well enough. It makes sure that the device list
8119 	// returned is sorted, and that no device is skipped when an
8120 	// already visited device got unmounted.
8121 
8122 	while (device < sNextMountID) {
8123 		mount = find_mount(device++);
8124 		if (mount != NULL && mount->volume->private_volume != NULL)
8125 			break;
8126 	}
8127 
8128 	*_cookie = device;
8129 
8130 	if (mount != NULL)
8131 		device = mount->id;
8132 	else
8133 		device = B_BAD_VALUE;
8134 
8135 	rw_lock_read_unlock(&sMountLock);
8136 
8137 	return device;
8138 }
8139 
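/*	Illustrative sketch (not part of the original source): iterating all
	mounted volumes with the cookie-based API above, analogously to what
	_kern_sync() does with next_dev().
	\code
	int32 cookie = 0;
	dev_t device;
	while ((device = _kern_next_device(&cookie)) >= 0)
		dprintf("volume %" B_PRIdDEV "\n", device);
	\endcode
*/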
8140 
8141 ssize_t
8142 fs_read_attr(int fd, const char *attribute, uint32 type, off_t pos,
8143 	void *buffer, size_t readBytes)
8144 {
8145 	int attrFD = attr_open(fd, NULL, attribute, O_RDONLY, true);
8146 	if (attrFD < 0)
8147 		return attrFD;
8148 
8149 	ssize_t bytesRead = _kern_read(attrFD, pos, buffer, readBytes);
8150 
8151 	_kern_close(attrFD);
8152 
8153 	return bytesRead;
8154 }
8155 
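/*	Illustrative sketch (attribute name made up): reading an attribute via
	the helper above. Note that the implementation ignores the type
	argument and simply reads the raw attribute contents.
	\code
	char buffer[B_FILE_NAME_LENGTH];
	ssize_t bytesRead = fs_read_attr(fd, "some:attribute", 0, 0, buffer,
		sizeof(buffer));
		// negative values are error codes, anything else is the number
		// of bytes actually read
	\endcode
*/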
8156 
8157 static status_t
8158 get_cwd(char* buffer, size_t size, bool kernel)
8159 {
8160 	// Get current working directory from io context
8161 	struct io_context* context = get_current_io_context(kernel);
8162 	status_t status;
8163 
8164 	FUNCTION(("vfs_get_cwd: buf %p, size %ld\n", buffer, size));
8165 
8166 	mutex_lock(&context->io_mutex);
8167 
8168 	struct vnode* vnode = context->cwd;
8169 	if (vnode)
8170 		inc_vnode_ref_count(vnode);
8171 
8172 	mutex_unlock(&context->io_mutex);
8173 
8174 	if (vnode) {
8175 		status = dir_vnode_to_path(vnode, buffer, size, kernel);
8176 		put_vnode(vnode);
8177 	} else
8178 		status = B_ERROR;
8179 
8180 	return status;
8181 }
8182 
8183 
8184 static status_t
8185 set_cwd(int fd, char* path, bool kernel)
8186 {
8187 	struct io_context* context;
8188 	struct vnode* vnode = NULL;
8189 	struct vnode* oldDirectory;
8190 	status_t status;
8191 
8192 	FUNCTION(("set_cwd: path = \'%s\'\n", path));
8193 
8194 	// Get vnode for passed path, and bail if it failed
8195 	status = fd_and_path_to_vnode(fd, path, true, &vnode, NULL, kernel);
8196 	if (status < 0)
8197 		return status;
8198 
8199 	if (!S_ISDIR(vnode->Type())) {
8200 		// nope, can't cwd to here
8201 		status = B_NOT_A_DIRECTORY;
8202 		goto err;
8203 	}
8204 
8205 	// We need to have the permission to enter the directory, too
8206 	if (HAS_FS_CALL(vnode, access)) {
8207 		status = FS_CALL(vnode, access, X_OK);
8208 		if (status != B_OK)
8209 			goto err;
8210 	}
8211 
8212 	// Get current io context and lock
8213 	context = get_current_io_context(kernel);
8214 	mutex_lock(&context->io_mutex);
8215 
8216 	// save the old current working directory first
8217 	oldDirectory = context->cwd;
8218 	context->cwd = vnode;
8219 
8220 	mutex_unlock(&context->io_mutex);
8221 
8222 	if (oldDirectory)
8223 		put_vnode(oldDirectory);
8224 
8225 	return B_NO_ERROR;
8226 
8227 err:
8228 	put_vnode(vnode);
8229 	return status;
8230 }
8231 
8232 
8233 static status_t
8234 user_copy_name(char* to, const char* from, size_t length)
8235 {
8236 	ssize_t len = user_strlcpy(to, from, length);
8237 	if (len < 0)
8238 		return len;
8239 	if (len >= (ssize_t)length)
8240 		return B_NAME_TOO_LONG;
8241 	return B_OK;
8242 }
8243 
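/*	Illustrative sketch: unlike a plain user_strlcpy(), the helper above
	fails with B_NAME_TOO_LONG instead of silently truncating, giving
	callers all-or-nothing semantics. This mirrors its typical use in the
	syscalls below:
	\code
	char name[B_FILE_NAME_LENGTH];
	status_t status = user_copy_name(name, userName, sizeof(name));
	if (status != B_OK)
		return status;
	\endcode
*/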
8244 
8245 //	#pragma mark - kernel mirrored syscalls
8246 
8247 
8248 dev_t
8249 _kern_mount(const char* path, const char* device, const char* fsName,
8250 	uint32 flags, const char* args, size_t argsLength)
8251 {
8252 	KPath pathBuffer(path);
8253 	if (pathBuffer.InitCheck() != B_OK)
8254 		return B_NO_MEMORY;
8255 
8256 	return fs_mount(pathBuffer.LockBuffer(), device, fsName, flags, args, true);
8257 }
8258 
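/*	Illustrative sketch (path and device name made up): mounting a volume
	from kernel code via the wrapper above. On success the new dev_t is
	returned, on failure a negative error code.
	\code
	dev_t volume = _kern_mount("/mnt", "/dev/disk/scsi/0/0/0/raw", "bfs",
		0, NULL, 0);
	if (volume < 0)
		dprintf("mount failed: %s\n", strerror(volume));
	\endcode
*/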
8259 
8260 status_t
8261 _kern_unmount(const char* path, uint32 flags)
8262 {
8263 	KPath pathBuffer(path);
8264 	if (pathBuffer.InitCheck() != B_OK)
8265 		return B_NO_MEMORY;
8266 
8267 	return fs_unmount(pathBuffer.LockBuffer(), -1, flags, true);
8268 }
8269 
8270 
8271 status_t
8272 _kern_read_fs_info(dev_t device, struct fs_info* info)
8273 {
8274 	if (info == NULL)
8275 		return B_BAD_VALUE;
8276 
8277 	return fs_read_info(device, info);
8278 }
8279 
8280 
8281 status_t
8282 _kern_write_fs_info(dev_t device, const struct fs_info* info, int mask)
8283 {
8284 	if (info == NULL)
8285 		return B_BAD_VALUE;
8286 
8287 	return fs_write_info(device, info, mask);
8288 }
8289 
8290 
8291 status_t
8292 _kern_sync(void)
8293 {
8294 	// Note: _kern_sync() is also called from _user_sync()
8295 	int32 cookie = 0;
8296 	dev_t device;
8297 	while ((device = next_dev(&cookie)) >= 0) {
8298 		status_t status = fs_sync(device);
8299 		if (status != B_OK && status != B_BAD_VALUE) {
8300 			dprintf("sync: device %" B_PRIdDEV " couldn't sync: %s\n", device,
8301 				strerror(status));
8302 		}
8303 	}
8304 
8305 	return B_OK;
8306 }
8307 
8308 
8309 dev_t
8310 _kern_next_device(int32* _cookie)
8311 {
8312 	return fs_next_device(_cookie);
8313 }
8314 
8315 
8316 status_t
8317 _kern_get_next_fd_info(team_id teamID, uint32* _cookie, fd_info* info,
8318 	size_t infoSize)
8319 {
8320 	if (infoSize != sizeof(fd_info))
8321 		return B_BAD_VALUE;
8322 
8323 	// get the team
8324 	Team* team = Team::Get(teamID);
8325 	if (team == NULL)
8326 		return B_BAD_TEAM_ID;
8327 	BReference<Team> teamReference(team, true);
8328 
8329 	// now that we have a team reference, its I/O context won't go away
8330 	io_context* context = team->io_context;
8331 	MutexLocker contextLocker(context->io_mutex);
8332 
8333 	uint32 slot = *_cookie;
8334 
8335 	struct file_descriptor* descriptor;
8336 	while (slot < context->table_size
8337 		&& (descriptor = context->fds[slot]) == NULL) {
8338 		slot++;
8339 	}
8340 
8341 	if (slot >= context->table_size)
8342 		return B_ENTRY_NOT_FOUND;
8343 
8344 	info->number = slot;
8345 	info->open_mode = descriptor->open_mode;
8346 
8347 	struct vnode* vnode = fd_vnode(descriptor);
8348 	if (vnode != NULL) {
8349 		info->device = vnode->device;
8350 		info->node = vnode->id;
8351 	} else if (descriptor->u.mount != NULL) {
8352 		info->device = descriptor->u.mount->id;
8353 		info->node = -1;
8354 	}
8355 
8356 	*_cookie = slot + 1;
8357 	return B_OK;
8358 }
8359 
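/*	Illustrative sketch (error handling elided): enumerating a team's open
	descriptors with the cookie-based call above.
	\code
	uint32 cookie = 0;
	fd_info info;
	while (_kern_get_next_fd_info(teamID, &cookie, &info, sizeof(info))
			== B_OK) {
		dprintf("fd %" B_PRId32 ": device %" B_PRIdDEV "\n", info.number,
			info.device);
	}
	\endcode
*/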
8360 
8361 int
8362 _kern_open_entry_ref(dev_t device, ino_t inode, const char* name, int openMode,
8363 	int perms)
8364 {
8365 	if ((openMode & O_CREAT) != 0) {
8366 		return file_create_entry_ref(device, inode, name, openMode, perms,
8367 			true);
8368 	}
8369 
8370 	return file_open_entry_ref(device, inode, name, openMode, true);
8371 }
8372 
8373 
8374 /*!	\brief Opens a node specified by a FD + path pair.
8375 
8376 	At least one of \a fd and \a path must be specified.
8377 	If only \a fd is given, the function opens the node identified by this
8378 	FD. If only a path is given, this path is opened. If both are given and
8379 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8380 	of the directory (!) identified by \a fd.
8381 
8382 	\param fd The FD. May be < 0.
8383 	\param path The absolute or relative path. May be \c NULL.
8384 	\param openMode The open mode.
8385 	\return A FD referring to the newly opened node, or an error code,
8386 			if an error occurs.
8387 */
8388 int
8389 _kern_open(int fd, const char* path, int openMode, int perms)
8390 {
8391 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8392 	if (pathBuffer.InitCheck() != B_OK)
8393 		return B_NO_MEMORY;
8394 
8395 	if ((openMode & O_CREAT) != 0)
8396 		return file_create(fd, pathBuffer.LockBuffer(), openMode, perms, true);
8397 
8398 	return file_open(fd, pathBuffer.LockBuffer(), openMode, true);
8399 }
8400 
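/*	Illustrative sketch (FDs and names made up) of the FD + path rules
	documented above:
	\code
	int fd1 = _kern_open(dirFD, "/boot/home/file", O_RDONLY, 0);
		// absolute path: dirFD is ignored
	int fd2 = _kern_open(dirFD, "data/file", O_RDONLY, 0);
		// relative path: reckoned off the directory dirFD refers to
	int fd3 = _kern_open(dirFD, NULL, O_RDONLY, 0);
		// no path: opens the node identified by dirFD itself
	\endcode
*/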
8401 
8402 /*!	\brief Opens a directory specified by entry_ref or node_ref.
8403 
8404 	The supplied name may be \c NULL, in which case the directory identified
8405 	by \a device and \a inode will be opened. Otherwise \a device and
8406 	\a inode identify the parent directory of the directory to be opened
8407 	and \a name its entry name.
8408 
8409 	\param device If \a name is specified the ID of the device the parent
8410 		   directory of the directory to be opened resides on, otherwise
8411 		   the device of the directory itself.
8412 	\param inode If \a name is specified the node ID of the parent
8413 		   directory of the directory to be opened, otherwise node ID of the
8414 		   directory itself.
8415 	\param name The entry name of the directory to be opened. If \c NULL,
8416 		   the \a device + \a inode pair identify the node to be opened.
8417 	\return The FD of the newly opened directory or an error code, if
8418 			something went wrong.
8419 */
8420 int
8421 _kern_open_dir_entry_ref(dev_t device, ino_t inode, const char* name)
8422 {
8423 	return dir_open_entry_ref(device, inode, name, true);
8424 }
8425 
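/*	Illustrative sketch of the two modes documented above:
	\code
	int subFD = _kern_open_dir_entry_ref(device, inode, "sub");
		// opens the entry "sub" of the directory (device, inode)
	int dirFD = _kern_open_dir_entry_ref(device, inode, NULL);
		// opens the directory (device, inode) itself
	\endcode
*/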
8426 
8427 /*!	\brief Opens a directory specified by a FD + path pair.
8428 
8429 	At least one of \a fd and \a path must be specified.
8430 	If only \a fd is given, the function opens the directory identified by this
8431 	FD. If only a path is given, this path is opened. If both are given and
8432 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8433 	of the directory (!) identified by \a fd.
8434 
8435 	\param fd The FD. May be < 0.
8436 	\param path The absolute or relative path. May be \c NULL.
8437 	\return A FD referring to the newly opened directory, or an error code,
8438 			if an error occurs.
8439 */
8440 int
8441 _kern_open_dir(int fd, const char* path)
8442 {
8443 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8444 	if (pathBuffer.InitCheck() != B_OK)
8445 		return B_NO_MEMORY;
8446 
8447 	return dir_open(fd, pathBuffer.LockBuffer(), true);
8448 }
8449 
8450 
8451 status_t
8452 _kern_fcntl(int fd, int op, size_t argument)
8453 {
8454 	return common_fcntl(fd, op, argument, true);
8455 }
8456 
8457 
8458 status_t
8459 _kern_fsync(int fd)
8460 {
8461 	return common_sync(fd, true);
8462 }
8463 
8464 
8465 status_t
8466 _kern_lock_node(int fd)
8467 {
8468 	return common_lock_node(fd, true);
8469 }
8470 
8471 
8472 status_t
8473 _kern_unlock_node(int fd)
8474 {
8475 	return common_unlock_node(fd, true);
8476 }
8477 
8478 
8479 status_t
8480 _kern_preallocate(int fd, off_t offset, off_t length)
8481 {
8482 	return common_preallocate(fd, offset, length, true);
8483 }
8484 
8485 
8486 status_t
8487 _kern_create_dir_entry_ref(dev_t device, ino_t inode, const char* name,
8488 	int perms)
8489 {
8490 	return dir_create_entry_ref(device, inode, name, perms, true);
8491 }
8492 
8493 
8494 /*!	\brief Creates a directory specified by a FD + path pair.
8495 
8496 	\a path must always be specified (it contains the name of the new directory
8497 	at least). If only a path is given, this path identifies the location at
8498 	which the directory shall be created. If both \a fd and \a path are given
8499 	and the path is absolute, \a fd is ignored; a relative path is reckoned off
8500 	of the directory (!) identified by \a fd.
8501 
8502 	\param fd The FD. May be < 0.
8503 	\param path The absolute or relative path. Must not be \c NULL.
8504 	\param perms The access permissions the new directory shall have.
8505 	\return \c B_OK, if the directory has been created successfully, another
8506 			error code otherwise.
8507 */
8508 status_t
8509 _kern_create_dir(int fd, const char* path, int perms)
8510 {
8511 	KPath pathBuffer(path, KPath::DEFAULT);
8512 	if (pathBuffer.InitCheck() != B_OK)
8513 		return B_NO_MEMORY;
8514 
8515 	return dir_create(fd, pathBuffer.LockBuffer(), perms, true);
8516 }
8517 
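/*	Illustrative sketch (FD and name made up): creating a directory
	relative to an open directory FD, per the rules above.
	\code
	status_t status = _kern_create_dir(parentFD, "new_dir", 0755);
	\endcode
*/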
8518 
8519 status_t
8520 _kern_remove_dir(int fd, const char* path)
8521 {
8522 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8523 	if (pathBuffer.InitCheck() != B_OK)
8524 		return B_NO_MEMORY;
8525 
8526 	return dir_remove(fd, pathBuffer.LockBuffer(), true);
8527 }
8528 
8529 
8530 /*!	\brief Reads the contents of a symlink referred to by a FD + path pair.
8531 
8532 	At least one of \a fd and \a path must be specified.
8533 	If only \a fd is given, the symlink to be read is the node
8534 	identified by this FD. If only a path is given, this path identifies the
8535 	symlink to be read. If both are given and the path is absolute, \a fd is
8536 	ignored; a relative path is reckoned off of the directory (!) identified
8537 	by \a fd.
8538 	If this function fails with B_BUFFER_OVERFLOW, the \a _bufferSize pointer
8539 	will still be updated to reflect the required buffer size.
8540 
8541 	\param fd The FD. May be < 0.
8542 	\param path The absolute or relative path. May be \c NULL.
8543 	\param buffer The buffer into which the contents of the symlink shall be
8544 		   written.
8545 	\param _bufferSize A pointer to the size of the supplied buffer.
8546 	\return The length of the link on success or an appropriate error code
8547 */
8548 status_t
8549 _kern_read_link(int fd, const char* path, char* buffer, size_t* _bufferSize)
8550 {
8551 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8552 	if (pathBuffer.InitCheck() != B_OK)
8553 		return B_NO_MEMORY;
8554 
8555 	return common_read_link(fd, pathBuffer.LockBuffer(),
8556 		buffer, _bufferSize, true);
8557 }
8558 
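/*	Illustrative sketch (path made up): using the B_BUFFER_OVERFLOW
	contract documented above to learn the required buffer size.
	\code
	char buffer[B_PATH_NAME_LENGTH];
	size_t bufferSize = sizeof(buffer);
	status_t status = _kern_read_link(-1, "/boot/home/link", buffer,
		&bufferSize);
	if (status == B_BUFFER_OVERFLOW) {
		// bufferSize now holds the size the link contents would need
	}
	\endcode
*/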
8559 
8560 /*!	\brief Creates a symlink specified by a FD + path pair.
8561 
8562 	\a path must always be specified (it contains the name of the new symlink
8563 	at least). If only a path is given, this path identifies the location at
8564 	which the symlink shall be created. If both \a fd and \a path are given and
8565 	the path is absolute, \a fd is ignored; a relative path is reckoned off
8566 	of the directory (!) identified by \a fd.
8567 
8568 	\param fd The FD. May be < 0.
8569 	\param path The absolute or relative path. Must not be \c NULL.
	\param toPath The path the symlink shall point to.
8570 	\param mode The access permissions the new symlink shall have.
8571 	\return \c B_OK, if the symlink has been created successfully, another
8572 			error code otherwise.
8573 */
8574 status_t
8575 _kern_create_symlink(int fd, const char* path, const char* toPath, int mode)
8576 {
8577 	KPath pathBuffer(path);
8578 	if (pathBuffer.InitCheck() != B_OK)
8579 		return B_NO_MEMORY;
8580 
8581 	return common_create_symlink(fd, pathBuffer.LockBuffer(),
8582 		toPath, mode, true);
8583 }
8584 
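/*	Illustrative sketch (paths made up): with an absolute \a path the FD
	is not consulted, so -1 can be passed.
	\code
	status_t status = _kern_create_symlink(-1, "/boot/home/link",
		"/boot/home/target", 0777);
	\endcode
*/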
8585 
8586 status_t
8587 _kern_create_link(int pathFD, const char* path, int toFD, const char* toPath,
8588 	bool traverseLeafLink)
8589 {
8590 	KPath pathBuffer(path);
8591 	KPath toPathBuffer(toPath);
8592 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
8593 		return B_NO_MEMORY;
8594 
8595 	return common_create_link(pathFD, pathBuffer.LockBuffer(), toFD,
8596 		toPathBuffer.LockBuffer(), traverseLeafLink, true);
8597 }
8598 
8599 
8600 /*!	\brief Removes an entry specified by a FD + path pair from its directory.
8601 
8602 	\a path must always be specified (it contains at least the name of the entry
8603 	to be deleted). If only a path is given, this path identifies the entry
8604 	directly. If both \a fd and \a path are given and the path is absolute,
8605 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8606 	identified by \a fd.
8607 
8608 	\param fd The FD. May be < 0.
8609 	\param path The absolute or relative path. Must not be \c NULL.
8610 	\return \c B_OK, if the entry has been removed successfully, another
8611 			error code otherwise.
8612 */
8613 status_t
8614 _kern_unlink(int fd, const char* path)
8615 {
8616 	KPath pathBuffer(path);
8617 	if (pathBuffer.InitCheck() != B_OK)
8618 		return B_NO_MEMORY;
8619 
8620 	return common_unlink(fd, pathBuffer.LockBuffer(), true);
8621 }
8622 
8623 
8624 /*!	\brief Moves an entry specified by a FD + path pair to an entry specified
8625 		   by another FD + path pair.
8626 
8627 	\a oldPath and \a newPath must always be specified (they contain at least
8628 	the name of the entry). If only a path is given, this path identifies the
8629 	entry directly. If both a FD and a path are given and the path is absolute,
8630 	the FD is ignored; a relative path is reckoned off of the directory (!)
8631 	identified by the respective FD.
8632 
8633 	\param oldFD The FD of the old location. May be < 0.
8634 	\param oldPath The absolute or relative path of the old location. Must not
8635 		   be \c NULL.
8636 	\param newFD The FD of the new location. May be < 0.
8637 	\param newPath The absolute or relative path of the new location. Must not
8638 		   be \c NULL.
8639 	\return \c B_OK, if the entry has been moved successfully, another
8640 			error code otherwise.
8641 */
8642 status_t
8643 _kern_rename(int oldFD, const char* oldPath, int newFD, const char* newPath)
8644 {
8645 	KPath oldPathBuffer(oldPath);
8646 	KPath newPathBuffer(newPath);
8647 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
8648 		return B_NO_MEMORY;
8649 
8650 	return common_rename(oldFD, oldPathBuffer.LockBuffer(),
8651 		newFD, newPathBuffer.LockBuffer(), true);
8652 }
8653 
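/*	Illustrative sketch (FDs and names made up): the two FD + path pairs
	documented above allow renaming across directories without building
	absolute paths.
	\code
	status_t status = _kern_rename(oldDirFD, "name", newDirFD, "name");
	\endcode
*/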
8654 
8655 status_t
8656 _kern_access(int fd, const char* path, int mode, bool effectiveUserGroup)
8657 {
8658 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8659 	if (pathBuffer.InitCheck() != B_OK)
8660 		return B_NO_MEMORY;
8661 
8662 	return common_access(fd, pathBuffer.LockBuffer(), mode, effectiveUserGroup,
8663 		true);
8664 }
8665 
8666 
8667 /*!	\brief Reads stat data of an entity specified by a FD + path pair.
8668 
8669 	If only \a fd is given, the stat operation associated with the type
8670 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8671 	given, this path identifies the entry for whose node to retrieve the
8672 	stat data. If both \a fd and \a path are given and the path is absolute,
8673 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8674 	identified by \a fd and specifies the entry whose stat data shall be
8675 	retrieved.
8676 
8677 	\param fd The FD. May be < 0.
8678 	\param path The absolute or relative path. May be \c NULL.
8679 	\param traverseLeafLink If \a path is given, \c true specifies that the
8680 		   function shall not stick to symlinks, but traverse them.
8681 	\param stat The buffer the stat data shall be written into.
8682 	\param statSize The size of the supplied stat buffer.
8683 	\return \c B_OK, if the stat data have been read successfully, another
8684 			error code otherwise.
8685 */
8686 status_t
8687 _kern_read_stat(int fd, const char* path, bool traverseLeafLink,
8688 	struct stat* stat, size_t statSize)
8689 {
8690 	struct stat completeStat;
8691 	struct stat* originalStat = NULL;
8692 	status_t status;
8693 
8694 	if (statSize > sizeof(struct stat))
8695 		return B_BAD_VALUE;
8696 
8697 	// this supports different stat extensions
8698 	if (statSize < sizeof(struct stat)) {
8699 		originalStat = stat;
8700 		stat = &completeStat;
8701 	}
8702 
8703 	status = vfs_read_stat(fd, path, traverseLeafLink, stat, true);
8704 
8705 	if (status == B_OK && originalStat != NULL)
8706 		memcpy(originalStat, stat, statSize);
8707 
8708 	return status;
8709 }
8710 
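/*	Illustrative sketch: thanks to the statSize handling above, callers
	built against a smaller (older) struct stat still get consistent
	data; a current caller simply passes the full size.
	\code
	struct stat st;
	status_t status = _kern_read_stat(fd, NULL, false, &st, sizeof(st));
	\endcode
*/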
8711 
8712 /*!	\brief Writes stat data of an entity specified by a FD + path pair.
8713 
8714 	If only \a fd is given, the stat operation associated with the type
8715 	of the FD (node, attr, attr dir etc.) is performed. If only \a path is
8716 	given, this path identifies the entry for whose node to write the
8717 	stat data. If both \a fd and \a path are given and the path is absolute,
8718 	\a fd is ignored; a relative path is reckoned off of the directory (!)
8719 	identified by \a fd and specifies the entry whose stat data shall be
8720 	written.
8721 
8722 	\param fd The FD. May be < 0.
8723 	\param path The absolute or relative path. May be \c NULL.
8724 	\param traverseLeafLink If \a path is given, \c true specifies that the
8725 		   function shall not stick to symlinks, but traverse them.
8726 	\param stat The buffer containing the stat data to be written.
8727 	\param statSize The size of the supplied stat buffer.
8728 	\param statMask A mask specifying which parts of the stat data shall be
8729 		   written.
8730 	\return \c B_OK, if the stat data have been written successfully,
8731 			another error code otherwise.
8732 */
8733 status_t
8734 _kern_write_stat(int fd, const char* path, bool traverseLeafLink,
8735 	const struct stat* stat, size_t statSize, int statMask)
8736 {
8737 	struct stat completeStat;
8738 
8739 	if (statSize > sizeof(struct stat))
8740 		return B_BAD_VALUE;
8741 
8742 	// this supports different stat extensions
8743 	if (statSize < sizeof(struct stat)) {
8744 		memset((uint8*)&completeStat + statSize, 0,
8745 			sizeof(struct stat) - statSize);
8746 		memcpy(&completeStat, stat, statSize);
8747 		stat = &completeStat;
8748 	}
8749 
8750 	status_t status;
8751 
8752 	if (path != NULL) {
8753 		// path given: write the stat of the node referred to by (fd, path)
8754 		KPath pathBuffer(path);
8755 		if (pathBuffer.InitCheck() != B_OK)
8756 			return B_NO_MEMORY;
8757 
8758 		status = common_path_write_stat(fd, pathBuffer.LockBuffer(),
8759 			traverseLeafLink, stat, statMask, true);
8760 	} else {
8761 		// no path given: get the FD and use the FD operation
8762 		struct file_descriptor* descriptor
8763 			= get_fd(get_current_io_context(true), fd);
8764 		if (descriptor == NULL)
8765 			return B_FILE_ERROR;
8766 
8767 		if (descriptor->ops->fd_write_stat)
8768 			status = descriptor->ops->fd_write_stat(descriptor, stat, statMask);
8769 		else
8770 			status = B_UNSUPPORTED;
8771 
8772 		put_fd(descriptor);
8773 	}
8774 
8775 	return status;
8776 }
8777 
8778 
8779 int
8780 _kern_open_attr_dir(int fd, const char* path, bool traverseLeafLink)
8781 {
8782 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8783 	if (pathBuffer.InitCheck() != B_OK)
8784 		return B_NO_MEMORY;
8785 
8786 	return attr_dir_open(fd, pathBuffer.LockBuffer(), traverseLeafLink, true);
8787 }
8788 
8789 
8790 int
8791 _kern_open_attr(int fd, const char* path, const char* name, uint32 type,
8792 	int openMode)
8793 {
8794 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8795 	if (pathBuffer.InitCheck() != B_OK)
8796 		return B_NO_MEMORY;
8797 
8798 	if ((openMode & O_CREAT) != 0) {
8799 		return attr_create(fd, pathBuffer.LockBuffer(), name, type, openMode,
8800 			true);
8801 	}
8802 
8803 	return attr_open(fd, pathBuffer.LockBuffer(), name, openMode, true);
8804 }
8805 
8806 
8807 status_t
8808 _kern_remove_attr(int fd, const char* name)
8809 {
8810 	return attr_remove(fd, name, true);
8811 }
8812 
8813 
8814 status_t
8815 _kern_rename_attr(int fromFile, const char* fromName, int toFile,
8816 	const char* toName)
8817 {
8818 	return attr_rename(fromFile, fromName, toFile, toName, true);
8819 }
8820 
8821 
8822 int
8823 _kern_open_index_dir(dev_t device)
8824 {
8825 	return index_dir_open(device, true);
8826 }
8827 
8828 
8829 status_t
8830 _kern_create_index(dev_t device, const char* name, uint32 type, uint32 flags)
8831 {
8832 	return index_create(device, name, type, flags, true);
8833 }
8834 
8835 
8836 status_t
8837 _kern_read_index_stat(dev_t device, const char* name, struct stat* stat)
8838 {
8839 	return index_name_read_stat(device, name, stat, true);
8840 }
8841 
8842 
8843 status_t
8844 _kern_remove_index(dev_t device, const char* name)
8845 {
8846 	return index_remove(device, name, true);
8847 }
8848 
8849 
8850 status_t
8851 _kern_getcwd(char* buffer, size_t size)
8852 {
8853 	TRACE(("_kern_getcwd: buf %p, %ld\n", buffer, size));
8854 
8855 	// Call vfs to get current working directory
8856 	return get_cwd(buffer, size, true);
8857 }
8858 
8859 
8860 status_t
8861 _kern_setcwd(int fd, const char* path)
8862 {
8863 	KPath pathBuffer(path, KPath::LAZY_ALLOC);
8864 	if (pathBuffer.InitCheck() != B_OK)
8865 		return B_NO_MEMORY;
8866 
8867 	return set_cwd(fd, pathBuffer.LockBuffer(), true);
8868 }
8869 
8870 
8871 //	#pragma mark - userland syscalls
8872 
8873 
8874 dev_t
8875 _user_mount(const char* userPath, const char* userDevice,
8876 	const char* userFileSystem, uint32 flags, const char* userArgs,
8877 	size_t argsLength)
8878 {
8879 	char fileSystem[B_FILE_NAME_LENGTH];
8880 	KPath path, device;
8881 	char* args = NULL;
8882 	status_t status;
8883 
8884 	if (!IS_USER_ADDRESS(userPath))
8885 		return B_BAD_ADDRESS;
8886 
8887 	if (path.InitCheck() != B_OK || device.InitCheck() != B_OK)
8888 		return B_NO_MEMORY;
8889 
8890 	status = user_copy_name(path.LockBuffer(), userPath,
8891 		B_PATH_NAME_LENGTH);
8892 	if (status != B_OK)
8893 		return status;
8894 	path.UnlockBuffer();
8895 
8896 	if (userFileSystem != NULL) {
8897 		if (!IS_USER_ADDRESS(userFileSystem))
8898 			return B_BAD_ADDRESS;
8899 
8900 		status = user_copy_name(fileSystem, userFileSystem, sizeof(fileSystem));
8901 		if (status != B_OK)
8902 			return status;
8903 	}
8904 
8905 	if (userDevice != NULL) {
8906 		if (!IS_USER_ADDRESS(userDevice))
8907 			return B_BAD_ADDRESS;
8908 
8909 		status = user_copy_name(device.LockBuffer(), userDevice,
8910 			B_PATH_NAME_LENGTH);
8911 		if (status != B_OK)
8912 			return status;
8913 		device.UnlockBuffer();
8914 	}
8915 
8916 	if (userArgs != NULL && argsLength > 0) {
8917 		if (!IS_USER_ADDRESS(userArgs))
8918 			return B_BAD_ADDRESS;
8919 
8920 		// this is a safety restriction
8921 		if (argsLength >= 65536)
8922 			return B_NAME_TOO_LONG;
8923 
8924 		args = (char*)malloc(argsLength + 1);
8925 		if (args == NULL)
8926 			return B_NO_MEMORY;
8927 
8928 		status = user_copy_name(args, userArgs, argsLength + 1);
8929 		if (status != B_OK) {
8930 			free(args);
8931 			return status;
8932 		}
8933 	}
8934 
8935 	status = fs_mount(path.LockBuffer(),
8936 		userDevice != NULL ? device.Path() : NULL,
8937 		userFileSystem ? fileSystem : NULL, flags, args, false);
8938 
8939 	free(args);
8940 	return status;
8941 }
8942 
8943 
8944 status_t
8945 _user_unmount(const char* userPath, uint32 flags)
8946 {
8947 	if (!IS_USER_ADDRESS(userPath))
8948 		return B_BAD_ADDRESS;
8949 
8950 	KPath pathBuffer;
8951 	if (pathBuffer.InitCheck() != B_OK)
8952 		return B_NO_MEMORY;
8953 
8954 	char* path = pathBuffer.LockBuffer();
8955 
8956 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
8957 	if (status != B_OK)
8958 		return status;
8959 
8960 	return fs_unmount(path, -1, flags & ~B_UNMOUNT_BUSY_PARTITION, false);
8961 }
8962 
8963 
8964 status_t
8965 _user_read_fs_info(dev_t device, struct fs_info* userInfo)
8966 {
8967 	struct fs_info info;
8968 	status_t status;
8969 
8970 	if (userInfo == NULL)
8971 		return B_BAD_VALUE;
8972 
8973 	if (!IS_USER_ADDRESS(userInfo))
8974 		return B_BAD_ADDRESS;
8975 
8976 	status = fs_read_info(device, &info);
8977 	if (status != B_OK)
8978 		return status;
8979 
8980 	if (user_memcpy(userInfo, &info, sizeof(struct fs_info)) != B_OK)
8981 		return B_BAD_ADDRESS;
8982 
8983 	return B_OK;
8984 }
8985 
8986 
8987 status_t
8988 _user_write_fs_info(dev_t device, const struct fs_info* userInfo, int mask)
8989 {
8990 	struct fs_info info;
8991 
8992 	if (userInfo == NULL)
8993 		return B_BAD_VALUE;
8994 
8995 	if (!IS_USER_ADDRESS(userInfo)
8996 		|| user_memcpy(&info, userInfo, sizeof(struct fs_info)) != B_OK)
8997 		return B_BAD_ADDRESS;
8998 
8999 	return fs_write_info(device, &info, mask);
9000 }
9001 
9002 
9003 dev_t
9004 _user_next_device(int32* _userCookie)
9005 {
9006 	int32 cookie;
9007 	dev_t device;
9008 
9009 	if (!IS_USER_ADDRESS(_userCookie)
9010 		|| user_memcpy(&cookie, _userCookie, sizeof(int32)) != B_OK)
9011 		return B_BAD_ADDRESS;
9012 
9013 	device = fs_next_device(&cookie);
9014 
9015 	if (device >= B_OK) {
9016 		// update user cookie
9017 		if (user_memcpy(_userCookie, &cookie, sizeof(int32)) != B_OK)
9018 			return B_BAD_ADDRESS;
9019 	}
9020 
9021 	return device;
9022 }
9023 
9024 
9025 status_t
9026 _user_sync(void)
9027 {
9028 	return _kern_sync();
9029 }
9030 
9031 
9032 status_t
9033 _user_get_next_fd_info(team_id team, uint32* userCookie, fd_info* userInfo,
9034 	size_t infoSize)
9035 {
9036 	struct fd_info info;
9037 	uint32 cookie;
9038 
9039 	// only root can do this
9040 	if (geteuid() != 0)
9041 		return B_NOT_ALLOWED;
9042 
9043 	if (infoSize != sizeof(fd_info))
9044 		return B_BAD_VALUE;
9045 
9046 	if (!IS_USER_ADDRESS(userCookie) || !IS_USER_ADDRESS(userInfo)
9047 		|| user_memcpy(&cookie, userCookie, sizeof(uint32)) != B_OK)
9048 		return B_BAD_ADDRESS;
9049 
9050 	status_t status = _kern_get_next_fd_info(team, &cookie, &info, infoSize);
9051 	if (status != B_OK)
9052 		return status;
9053 
9054 	if (user_memcpy(userCookie, &cookie, sizeof(uint32)) != B_OK
9055 		|| user_memcpy(userInfo, &info, infoSize) != B_OK)
9056 		return B_BAD_ADDRESS;
9057 
9058 	return status;
9059 }
9060 
9061 
9062 status_t
9063 _user_entry_ref_to_path(dev_t device, ino_t inode, const char* leaf,
9064 	char* userPath, size_t pathLength)
9065 {
9066 	if (!IS_USER_ADDRESS(userPath))
9067 		return B_BAD_ADDRESS;
9068 
9069 	KPath path;
9070 	if (path.InitCheck() != B_OK)
9071 		return B_NO_MEMORY;
9072 
9073 	// copy the leaf name onto the stack
9074 	char stackLeaf[B_FILE_NAME_LENGTH];
9075 	if (leaf != NULL) {
9076 		if (!IS_USER_ADDRESS(leaf))
9077 			return B_BAD_ADDRESS;
9078 
9079 		int status = user_copy_name(stackLeaf, leaf, B_FILE_NAME_LENGTH);
9080 		if (status != B_OK)
9081 			return status;
9082 
9083 		leaf = stackLeaf;
9084 	}
9085 
9086 	status_t status = vfs_entry_ref_to_path(device, inode, leaf,
9087 		false, path.LockBuffer(), path.BufferSize());
9088 	if (status != B_OK)
9089 		return status;
9090 
9091 	path.UnlockBuffer();
9092 
9093 	int length = user_strlcpy(userPath, path.Path(), pathLength);
9094 	if (length < 0)
9095 		return length;
9096 	if (length >= (int)pathLength)
9097 		return B_BUFFER_OVERFLOW;
9098 
9099 	return B_OK;
9100 }
9101 
9102 
9103 status_t
9104 _user_normalize_path(const char* userPath, bool traverseLink, char* buffer)
9105 {
9106 	if (userPath == NULL || buffer == NULL)
9107 		return B_BAD_VALUE;
9108 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(buffer))
9109 		return B_BAD_ADDRESS;
9110 
9111 	// copy path from userland
9112 	KPath pathBuffer;
9113 	if (pathBuffer.InitCheck() != B_OK)
9114 		return B_NO_MEMORY;
9115 	char* path = pathBuffer.LockBuffer();
9116 
9117 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9118 	if (status != B_OK)
9119 		return status;
9120 
9121 	status_t error = normalize_path(path, pathBuffer.BufferSize(), traverseLink,
9122 		false);
9123 	if (error != B_OK)
9124 		return error;
9125 
9126 	// copy back to userland
9127 	int len = user_strlcpy(buffer, path, B_PATH_NAME_LENGTH);
9128 	if (len < 0)
9129 		return len;
9130 	if (len >= B_PATH_NAME_LENGTH)
9131 		return B_BUFFER_OVERFLOW;
9132 
9133 	return B_OK;
9134 }
9135 
9136 
9137 int
9138 _user_open_entry_ref(dev_t device, ino_t inode, const char* userName,
9139 	int openMode, int perms)
9140 {
9141 	char name[B_FILE_NAME_LENGTH];
9142 
9143 	if (userName == NULL || device < 0 || inode < 0)
9144 		return B_BAD_VALUE;
9145 	if (!IS_USER_ADDRESS(userName))
9146 		return B_BAD_ADDRESS;
9147 	status_t status = user_copy_name(name, userName, sizeof(name));
9148 	if (status != B_OK)
9149 		return status;
9150 
9151 	if ((openMode & O_CREAT) != 0) {
9152 		return file_create_entry_ref(device, inode, name, openMode, perms,
9153 			false);
9154 	}
9155 
9156 	return file_open_entry_ref(device, inode, name, openMode, false);
9157 }
9158 
9159 
9160 int
9161 _user_open(int fd, const char* userPath, int openMode, int perms)
9162 {
9163 	KPath path;
9164 	if (path.InitCheck() != B_OK)
9165 		return B_NO_MEMORY;
9166 
9167 	char* buffer = path.LockBuffer();
9168 
9169 	if (!IS_USER_ADDRESS(userPath))
9170 		return B_BAD_ADDRESS;
9171 	status_t status = user_copy_name(buffer, userPath, B_PATH_NAME_LENGTH);
9172 	if (status != B_OK)
9173 		return status;
9174 
9175 	if ((openMode & O_CREAT) != 0)
9176 		return file_create(fd, buffer, openMode, perms, false);
9177 
9178 	return file_open(fd, buffer, openMode, false);
9179 }
9180 
9181 
9182 int
9183 _user_open_dir_entry_ref(dev_t device, ino_t inode, const char* userName)
9184 {
9185 	if (userName != NULL) {
9186 		char name[B_FILE_NAME_LENGTH];
9187 
9188 		if (!IS_USER_ADDRESS(userName))
9189 			return B_BAD_ADDRESS;
9190 		status_t status = user_copy_name(name, userName, sizeof(name));
9191 		if (status != B_OK)
9192 			return status;
9193 
9194 		return dir_open_entry_ref(device, inode, name, false);
9195 	}
9196 	return dir_open_entry_ref(device, inode, NULL, false);
9197 }
9198 
9199 
9200 int
9201 _user_open_dir(int fd, const char* userPath)
9202 {
9203 	if (userPath == NULL)
9204 		return dir_open(fd, NULL, false);
9205 
9206 	KPath path;
9207 	if (path.InitCheck() != B_OK)
9208 		return B_NO_MEMORY;
9209 
9210 	char* buffer = path.LockBuffer();
9211 
9212 	if (!IS_USER_ADDRESS(userPath))
9213 		return B_BAD_ADDRESS;
9214 	status_t status = user_copy_name(buffer, userPath, B_PATH_NAME_LENGTH);
9215 	if (status != B_OK)
9216 		return status;
9217 
9218 	return dir_open(fd, buffer, false);
9219 }
9220 
9221 
9222 /*!	\brief Opens a directory's parent directory and returns the entry name
9223 		   of the former.
9224 
9225 	Aside from also returning the directory's entry name, this method is
9226 	equivalent to \code _user_open_dir(fd, "..") \endcode. It really is
9227 	equivalent, if \a userName is \c NULL.
9228 
9229 	If a name buffer is supplied and the name does not fit the buffer, the
9230 	function fails. A buffer of size \c B_FILE_NAME_LENGTH should be safe.
9231 
9232 	\param fd A FD referring to a directory.
9233 	\param userName Buffer the directory's entry name shall be written into.
9234 		   May be \c NULL.
9235 	\param nameLength Size of the name buffer.
9236 	\return The file descriptor of the opened parent directory, if everything
9237 			went fine, an error code otherwise.
9238 */
9239 int
9240 _user_open_parent_dir(int fd, char* userName, size_t nameLength)
9241 {
9242 	bool kernel = false;
9243 
9244 	if (userName && !IS_USER_ADDRESS(userName))
9245 		return B_BAD_ADDRESS;
9246 
9247 	// open the parent dir
9248 	int parentFD = dir_open(fd, (char*)"..", kernel);
9249 	if (parentFD < 0)
9250 		return parentFD;
9251 	FDCloser fdCloser(parentFD, kernel);
9252 
9253 	if (userName) {
9254 		// get the vnodes
9255 		struct vnode* parentVNode = get_vnode_from_fd(parentFD, kernel);
9256 		struct vnode* dirVNode = get_vnode_from_fd(fd, kernel);
9257 		VNodePutter parentVNodePutter(parentVNode);
9258 		VNodePutter dirVNodePutter(dirVNode);
9259 		if (!parentVNode || !dirVNode)
9260 			return B_FILE_ERROR;
9261 
9262 		// get the vnode name
9263 		char _buffer[offsetof(struct dirent, d_name) + B_FILE_NAME_LENGTH + 1];
9264 		struct dirent* buffer = (struct dirent*)_buffer;
9265 		status_t status = get_vnode_name(dirVNode, parentVNode, buffer,
9266 			sizeof(_buffer), get_current_io_context(false));
9267 		if (status != B_OK)
9268 			return status;
9269 
9270 		// copy the name to the userland buffer
9271 		int len = user_strlcpy(userName, buffer->d_name, nameLength);
9272 		if (len < 0)
9273 			return len;
9274 		if (len >= (int)nameLength)
9275 			return B_BUFFER_OVERFLOW;
9276 	}
9277 
9278 	return fdCloser.Detach();
9279 }
9280 
9281 
9282 status_t
9283 _user_fcntl(int fd, int op, size_t argument)
9284 {
9285 	status_t status = common_fcntl(fd, op, argument, false);
9286 	if (op == F_SETLKW)
9287 		syscall_restart_handle_post(status);
9288 
9289 	return status;
9290 }
9291 
9292 
9293 status_t
9294 _user_fsync(int fd)
9295 {
9296 	return common_sync(fd, false);
9297 }
9298 
9299 
9300 status_t
9301 _user_flock(int fd, int operation)
9302 {
9303 	FUNCTION(("_user_flock(fd = %d, op = %d)\n", fd, operation));
9304 
9305 	// Check if the operation is valid
9306 	switch (operation & ~LOCK_NB) {
9307 		case LOCK_UN:
9308 		case LOCK_SH:
9309 		case LOCK_EX:
9310 			break;
9311 
9312 		default:
9313 			return B_BAD_VALUE;
9314 	}
9315 
9316 	struct file_descriptor* descriptor;
9317 	struct vnode* vnode;
9318 	descriptor = get_fd_and_vnode(fd, &vnode, false);
9319 	if (descriptor == NULL)
9320 		return B_FILE_ERROR;
9321 
9322 	if (descriptor->type != FDTYPE_FILE) {
9323 		put_fd(descriptor);
9324 		return B_BAD_VALUE;
9325 	}
9326 
9327 	struct flock flock;
9328 	flock.l_start = 0;
9329 	flock.l_len = OFF_MAX;
9330 	flock.l_whence = 0;
9331 	flock.l_type = (operation & LOCK_SH) != 0 ? F_RDLCK : F_WRLCK;
9332 
9333 	status_t status;
9334 	if ((operation & LOCK_UN) != 0) {
9335 		if (HAS_FS_CALL(vnode, release_lock))
9336 			status = FS_CALL(vnode, release_lock, descriptor->cookie, &flock);
9337 		else
9338 			status = release_advisory_lock(vnode, NULL, descriptor, &flock);
9339 	} else {
9340 		if (HAS_FS_CALL(vnode, acquire_lock)) {
9341 			status = FS_CALL(vnode, acquire_lock, descriptor->cookie, &flock,
9342 				(operation & LOCK_NB) == 0);
9343 		} else {
9344 			status = acquire_advisory_lock(vnode, NULL, descriptor, &flock,
9345 				(operation & LOCK_NB) == 0);
9346 		}
9347 	}
9348 
9349 	syscall_restart_handle_post(status);
9350 
9351 	put_fd(descriptor);
9352 	return status;
9353 }
9354 
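/*	Illustrative sketch (userland view, error handling elided): the
	flock() emulation above maps whole-file BSD-style locks onto the
	advisory locking machinery.
	\code
	flock(fd, LOCK_EX);	// lock the whole file exclusively
	flock(fd, LOCK_UN);	// release the lock again
	\endcode
*/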
9355 
9356 status_t
9357 _user_lock_node(int fd)
9358 {
9359 	return common_lock_node(fd, false);
9360 }
9361 
9362 
9363 status_t
9364 _user_unlock_node(int fd)
9365 {
9366 	return common_unlock_node(fd, false);
9367 }
9368 
9369 
9370 status_t
9371 _user_preallocate(int fd, off_t offset, off_t length)
9372 {
9373 	return common_preallocate(fd, offset, length, false);
9374 }
9375 
9376 
9377 status_t
9378 _user_create_dir_entry_ref(dev_t device, ino_t inode, const char* userName,
9379 	int perms)
9380 {
9381 	char name[B_FILE_NAME_LENGTH];
9382 	status_t status;
9383 
9384 	if (!IS_USER_ADDRESS(userName))
9385 		return B_BAD_ADDRESS;
9386 
9387 	status = user_copy_name(name, userName, sizeof(name));
9388 	if (status != B_OK)
9389 		return status;
9390 
9391 	return dir_create_entry_ref(device, inode, name, perms, false);
9392 }
9393 
9394 
9395 status_t
9396 _user_create_dir(int fd, const char* userPath, int perms)
9397 {
9398 	KPath pathBuffer;
9399 	if (pathBuffer.InitCheck() != B_OK)
9400 		return B_NO_MEMORY;
9401 
9402 	char* path = pathBuffer.LockBuffer();
9403 
9404 	if (!IS_USER_ADDRESS(userPath))
9405 		return B_BAD_ADDRESS;
9406 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9407 	if (status != B_OK)
9408 		return status;
9409 
9410 	return dir_create(fd, path, perms, false);
9411 }
9412 
9413 
9414 status_t
9415 _user_remove_dir(int fd, const char* userPath)
9416 {
9417 	KPath pathBuffer;
9418 	if (pathBuffer.InitCheck() != B_OK)
9419 		return B_NO_MEMORY;
9420 
9421 	char* path = pathBuffer.LockBuffer();
9422 
9423 	if (userPath != NULL) {
9424 		if (!IS_USER_ADDRESS(userPath))
9425 			return B_BAD_ADDRESS;
9426 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9427 		if (status != B_OK)
9428 			return status;
9429 	}
9430 
9431 	return dir_remove(fd, userPath ? path : NULL, false);
9432 }
9433 
9434 
9435 status_t
9436 _user_read_link(int fd, const char* userPath, char* userBuffer,
9437 	size_t* userBufferSize)
9438 {
9439 	KPath pathBuffer, linkBuffer;
9440 	if (pathBuffer.InitCheck() != B_OK || linkBuffer.InitCheck() != B_OK)
9441 		return B_NO_MEMORY;
9442 
9443 	size_t bufferSize;
9444 
9445 	if (!IS_USER_ADDRESS(userBuffer) || !IS_USER_ADDRESS(userBufferSize)
9446 		|| user_memcpy(&bufferSize, userBufferSize, sizeof(size_t)) != B_OK)
9447 		return B_BAD_ADDRESS;
9448 
9449 	char* path = pathBuffer.LockBuffer();
9450 	char* buffer = linkBuffer.LockBuffer();
9451 
9452 	if (userPath) {
9453 		if (!IS_USER_ADDRESS(userPath))
9454 			return B_BAD_ADDRESS;
9455 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9456 		if (status != B_OK)
9457 			return status;
9458 
9459 		if (bufferSize > B_PATH_NAME_LENGTH)
9460 			bufferSize = B_PATH_NAME_LENGTH;
9461 	}
9462 
9463 	size_t newBufferSize = bufferSize;
9464 	status_t status = common_read_link(fd, userPath ? path : NULL, buffer,
9465 		&newBufferSize, false);
9466 
9467 	// we also update the bufferSize in case of errors
9468 	// (the real length will be returned in case of B_BUFFER_OVERFLOW)
9469 	if (user_memcpy(userBufferSize, &newBufferSize, sizeof(size_t)) != B_OK)
9470 		return B_BAD_ADDRESS;
9471 
9472 	if (status != B_OK)
9473 		return status;
9474 
9475 	bufferSize = min_c(newBufferSize, bufferSize);
9476 	if (user_memcpy(userBuffer, buffer, bufferSize) != B_OK)
9477 		return B_BAD_ADDRESS;
9478 
9479 	return B_OK;
9480 }
9481 
9482 
9483 status_t
9484 _user_create_symlink(int fd, const char* userPath, const char* userToPath,
9485 	int mode)
9486 {
9487 	KPath pathBuffer;
9488 	KPath toPathBuffer;
9489 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9490 		return B_NO_MEMORY;
9491 
9492 	char* path = pathBuffer.LockBuffer();
9493 	char* toPath = toPathBuffer.LockBuffer();
9494 
9495 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(userToPath))
9496 		return B_BAD_ADDRESS;
9497 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9498 	if (status != B_OK)
9499 		return status;
9500 	status = user_copy_name(toPath, userToPath, B_PATH_NAME_LENGTH);
9501 	if (status != B_OK)
9502 		return status;
9503 
9504 	return common_create_symlink(fd, path, toPath, mode, false);
9505 }
9506 
9507 
9508 status_t
9509 _user_create_link(int pathFD, const char* userPath, int toFD,
9510 	const char* userToPath, bool traverseLeafLink)
9511 {
9512 	KPath pathBuffer;
9513 	KPath toPathBuffer;
9514 	if (pathBuffer.InitCheck() != B_OK || toPathBuffer.InitCheck() != B_OK)
9515 		return B_NO_MEMORY;
9516 
9517 	char* path = pathBuffer.LockBuffer();
9518 	char* toPath = toPathBuffer.LockBuffer();
9519 
9520 	if (!IS_USER_ADDRESS(userPath) || !IS_USER_ADDRESS(userToPath))
9521 		return B_BAD_ADDRESS;
9522 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9523 	if (status != B_OK)
9524 		return status;
9525 	status = user_copy_name(toPath, userToPath, B_PATH_NAME_LENGTH);
9526 	if (status != B_OK)
9527 		return status;
9528 
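	// verify that no component of the target path exceeds the name length
	// limit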
9529 	status = check_path(toPath);
9530 	if (status != B_OK)
9531 		return status;
9532 
9533 	return common_create_link(pathFD, path, toFD, toPath, traverseLeafLink,
9534 		false);
9535 }
9536 
9537 
9538 status_t
9539 _user_unlink(int fd, const char* userPath)
9540 {
9541 	KPath pathBuffer;
9542 	if (pathBuffer.InitCheck() != B_OK)
9543 		return B_NO_MEMORY;
9544 
9545 	char* path = pathBuffer.LockBuffer();
9546 
9547 	if (!IS_USER_ADDRESS(userPath))
9548 		return B_BAD_ADDRESS;
9549 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9550 	if (status != B_OK)
9551 		return status;
9552 
9553 	return common_unlink(fd, path, false);
9554 }
9555 
9556 
9557 status_t
9558 _user_rename(int oldFD, const char* userOldPath, int newFD,
9559 	const char* userNewPath)
9560 {
9561 	KPath oldPathBuffer;
9562 	KPath newPathBuffer;
9563 	if (oldPathBuffer.InitCheck() != B_OK || newPathBuffer.InitCheck() != B_OK)
9564 		return B_NO_MEMORY;
9565 
9566 	char* oldPath = oldPathBuffer.LockBuffer();
9567 	char* newPath = newPathBuffer.LockBuffer();
9568 
9569 	if (!IS_USER_ADDRESS(userOldPath) || !IS_USER_ADDRESS(userNewPath))
9570 		return B_BAD_ADDRESS;
9571 	status_t status = user_copy_name(oldPath, userOldPath, B_PATH_NAME_LENGTH);
9572 	if (status != B_OK)
9573 		return status;
9574 	status = user_copy_name(newPath, userNewPath, B_PATH_NAME_LENGTH);
9575 	if (status != B_OK)
9576 		return status;
9577 
9578 	return common_rename(oldFD, oldPath, newFD, newPath, false);
9579 }
9580 
9581 
9582 status_t
9583 _user_create_fifo(int fd, const char* userPath, mode_t perms)
9584 {
9585 	KPath pathBuffer;
9586 	if (pathBuffer.InitCheck() != B_OK)
9587 		return B_NO_MEMORY;
9588 
9589 	char* path = pathBuffer.LockBuffer();
9590 
9591 	if (!IS_USER_ADDRESS(userPath))
9592 		return B_BAD_ADDRESS;
9593 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9594 	if (status != B_OK)
9595 		return status;
9596 
9597 	// split into directory vnode and filename path
9598 	char filename[B_FILE_NAME_LENGTH];
9599 	struct vnode* dir;
9600 	status = fd_and_path_to_dir_vnode(fd, path, &dir, filename, false);
9601 	if (status != B_OK)
9602 		return status;
9603 
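	// the VNodePutter releases our reference to the directory when we return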
9604 	VNodePutter _(dir);
9605 
9606 	// the underlying FS needs to support creating FIFOs
9607 	if (!HAS_FS_CALL(dir, create_special_node))
9608 		return B_UNSUPPORTED;
9609 
9610 	// create the entry	-- the FIFO sub node is set up automatically
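	// (S_IUMSK limits the requested permissions to the user-settable bits)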
9611 	fs_vnode superVnode;
9612 	ino_t nodeID;
9613 	status = FS_CALL(dir, create_special_node, filename, NULL,
9614 		S_IFIFO | (perms & S_IUMSK), 0, &superVnode, &nodeID);
9615 
9616 	// create_special_node() acquired a reference for us that we don't need.
9617 	if (status == B_OK)
9618 		put_vnode(dir->mount->volume, nodeID);
9619 
9620 	return status;
9621 }
9622 
9623 
9624 status_t
9625 _user_create_pipe(int* userFDs)
9626 {
9627 	// rootfs should support creating FIFOs, but let's be sure
9628 	if (!HAS_FS_CALL(sRoot, create_special_node))
9629 		return B_UNSUPPORTED;
9630 
9631 	// create the node	-- the FIFO sub node is set up automatically
9632 	fs_vnode superVnode;
9633 	ino_t nodeID;
9634 	status_t status = FS_CALL(sRoot, create_special_node, NULL, NULL,
9635 		S_IFIFO | S_IRUSR | S_IWUSR, 0, &superVnode, &nodeID);
9636 	if (status != B_OK)
9637 		return status;
9638 
9639 	// We've got one reference to the node and need another one.
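	// (one for each of the two FDs we are going to open below)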
9640 	struct vnode* vnode;
9641 	status = get_vnode(sRoot->mount->id, nodeID, &vnode, true, false);
9642 	if (status != B_OK) {
		// that should not happen
		dprintf("_user_create_pipe(): Failed to lookup vnode (%" B_PRIdDEV ", "
			"%" B_PRIdINO ")\n", sRoot->mount->id, nodeID);
9646 		return status;
9647 	}
9648 
	// Everything looks good so far. Open two FDs, one for reading and one
	// for writing.
9651 	int fds[2];
9652 	fds[0] = open_vnode(vnode, O_RDONLY, false);
9653 	fds[1] = open_vnode(vnode, O_WRONLY, false);
9654 
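	// The FDClosers close the FDs again on any error path; they are
	// detached below once everything has succeeded.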
9655 	FDCloser closer0(fds[0], false);
9656 	FDCloser closer1(fds[1], false);
9657 
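	// a negative value in fds[] is the error code from open_vnode()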
9658 	status = (fds[0] >= 0 ? (fds[1] >= 0 ? B_OK : fds[1]) : fds[0]);
9659 
9660 	// copy FDs to userland
9661 	if (status == B_OK) {
9662 		if (!IS_USER_ADDRESS(userFDs)
9663 			|| user_memcpy(userFDs, fds, sizeof(fds)) != B_OK) {
9664 			status = B_BAD_ADDRESS;
9665 		}
9666 	}
9667 
	// keep the FDs if everything went fine
9669 	if (status == B_OK) {
9670 		closer0.Detach();
9671 		closer1.Detach();
9672 	}
9673 
9674 	return status;
9675 }
9676 
9677 
9678 status_t
9679 _user_access(int fd, const char* userPath, int mode, bool effectiveUserGroup)
9680 {
9681 	KPath pathBuffer;
9682 	if (pathBuffer.InitCheck() != B_OK)
9683 		return B_NO_MEMORY;
9684 
9685 	char* path = pathBuffer.LockBuffer();
9686 
9687 	if (!IS_USER_ADDRESS(userPath))
9688 		return B_BAD_ADDRESS;
9689 	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9690 	if (status != B_OK)
9691 		return status;
9692 
9693 	return common_access(fd, path, mode, effectiveUserGroup, false);
9694 }
9695 
9696 
9697 status_t
9698 _user_read_stat(int fd, const char* userPath, bool traverseLink,
9699 	struct stat* userStat, size_t statSize)
9700 {
9701 	struct stat stat = {0};
9702 	status_t status;
9703 
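	// userland's struct stat may be smaller than the kernel's, but not larger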
9704 	if (statSize > sizeof(struct stat))
9705 		return B_BAD_VALUE;
9706 
9707 	if (!IS_USER_ADDRESS(userStat))
9708 		return B_BAD_ADDRESS;
9709 
9710 	if (userPath != NULL) {
9711 		// path given: get the stat of the node referred to by (fd, path)
9712 		if (!IS_USER_ADDRESS(userPath))
9713 			return B_BAD_ADDRESS;
9714 
9715 		KPath pathBuffer;
9716 		if (pathBuffer.InitCheck() != B_OK)
9717 			return B_NO_MEMORY;
9718 
9719 		char* path = pathBuffer.LockBuffer();
9720 
9721 		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9722 		if (status != B_OK)
9723 			return status;
9724 
9725 		status = common_path_read_stat(fd, path, traverseLink, &stat, false);
9726 	} else {
9727 		// no path given: get the FD and use the FD operation
9728 		struct file_descriptor* descriptor
9729 			= get_fd(get_current_io_context(false), fd);
9730 		if (descriptor == NULL)
9731 			return B_FILE_ERROR;
9732 
9733 		if (descriptor->ops->fd_read_stat)
9734 			status = descriptor->ops->fd_read_stat(descriptor, &stat);
9735 		else
9736 			status = B_UNSUPPORTED;
9737 
9738 		put_fd(descriptor);
9739 	}
9740 
9741 	if (status != B_OK)
9742 		return status;
9743 
9744 	return user_memcpy(userStat, &stat, statSize);
9745 }
9746 
9747 
9748 status_t
9749 _user_write_stat(int fd, const char* userPath, bool traverseLeafLink,
9750 	const struct stat* userStat, size_t statSize, int statMask)
9751 {
9752 	if (statSize > sizeof(struct stat))
9753 		return B_BAD_VALUE;
9754 
9755 	struct stat stat;
9756 
9757 	if (!IS_USER_ADDRESS(userStat)
9758 		|| user_memcpy(&stat, userStat, statSize) < B_OK)
9759 		return B_BAD_ADDRESS;
9760 
9761 	// clear additional stat fields
9762 	if (statSize < sizeof(struct stat))
9763 		memset((uint8*)&stat + statSize, 0, sizeof(struct stat) - statSize);
9764 
9765 	status_t status;
9766 
9767 	if (userPath != NULL) {
9768 		// path given: write the stat of the node referred to by (fd, path)
9769 		if (!IS_USER_ADDRESS(userPath))
9770 			return B_BAD_ADDRESS;
9771 
9772 		KPath pathBuffer;
9773 		if (pathBuffer.InitCheck() != B_OK)
9774 			return B_NO_MEMORY;
9775 
9776 		char* path = pathBuffer.LockBuffer();
9777 
9778 		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9779 		if (status != B_OK)
9780 			return status;
9781 
9782 		status = common_path_write_stat(fd, path, traverseLeafLink, &stat,
9783 			statMask, false);
9784 	} else {
9785 		// no path given: get the FD and use the FD operation
9786 		struct file_descriptor* descriptor
9787 			= get_fd(get_current_io_context(false), fd);
9788 		if (descriptor == NULL)
9789 			return B_FILE_ERROR;
9790 
9791 		if (descriptor->ops->fd_write_stat) {
9792 			status = descriptor->ops->fd_write_stat(descriptor, &stat,
9793 				statMask);
9794 		} else
9795 			status = B_UNSUPPORTED;
9796 
9797 		put_fd(descriptor);
9798 	}
9799 
9800 	return status;
9801 }
9802 
9803 
9804 int
9805 _user_open_attr_dir(int fd, const char* userPath, bool traverseLeafLink)
9806 {
9807 	KPath pathBuffer;
9808 	if (pathBuffer.InitCheck() != B_OK)
9809 		return B_NO_MEMORY;
9810 
9811 	char* path = pathBuffer.LockBuffer();
9812 
9813 	if (userPath != NULL) {
9814 		if (!IS_USER_ADDRESS(userPath))
9815 			return B_BAD_ADDRESS;
9816 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9817 		if (status != B_OK)
9818 			return status;
9819 	}
9820 
9821 	return attr_dir_open(fd, userPath ? path : NULL, traverseLeafLink, false);
9822 }
9823 
9824 
9825 ssize_t
9826 _user_read_attr(int fd, const char* userAttribute, off_t pos, void* userBuffer,
9827 	size_t readBytes)
9828 {
9829 	char attribute[B_FILE_NAME_LENGTH];
9830 
9831 	if (userAttribute == NULL)
9832 		return B_BAD_VALUE;
9833 	if (!IS_USER_ADDRESS(userAttribute))
9834 		return B_BAD_ADDRESS;
	status_t status = user_copy_name(attribute, userAttribute,
		sizeof(attribute));
9836 	if (status != B_OK)
9837 		return status;
9838 
9839 	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9840 	if (attr < 0)
9841 		return attr;
9842 
9843 	ssize_t bytes = _user_read(attr, pos, userBuffer, readBytes);
9844 	_user_close(attr);
9845 
9846 	return bytes;
9847 }
9848 
9849 
9850 ssize_t
9851 _user_write_attr(int fd, const char* userAttribute, uint32 type, off_t pos,
9852 	const void* buffer, size_t writeBytes)
9853 {
9854 	char attribute[B_FILE_NAME_LENGTH];
9855 
9856 	if (userAttribute == NULL)
9857 		return B_BAD_VALUE;
9858 	if (!IS_USER_ADDRESS(userAttribute))
9859 		return B_BAD_ADDRESS;
	status_t status = user_copy_name(attribute, userAttribute,
		sizeof(attribute));
9861 	if (status != B_OK)
9862 		return status;
9863 
9864 	// Try to support the BeOS typical truncation as well as the position
9865 	// argument
9866 	int attr = attr_create(fd, NULL, attribute, type,
9867 		O_CREAT | O_WRONLY | (pos != 0 ? 0 : O_TRUNC), false);
9868 	if (attr < 0)
9869 		return attr;
9870 
9871 	ssize_t bytes = _user_write(attr, pos, buffer, writeBytes);
9872 	_user_close(attr);
9873 
9874 	return bytes;
9875 }
9876 
9877 
9878 status_t
9879 _user_stat_attr(int fd, const char* userAttribute,
9880 	struct attr_info* userAttrInfo)
9881 {
9882 	char attribute[B_FILE_NAME_LENGTH];
9883 
9884 	if (userAttribute == NULL || userAttrInfo == NULL)
9885 		return B_BAD_VALUE;
9886 	if (!IS_USER_ADDRESS(userAttribute) || !IS_USER_ADDRESS(userAttrInfo))
9887 		return B_BAD_ADDRESS;
9888 	status_t status = user_copy_name(attribute, userAttribute,
9889 		sizeof(attribute));
9890 	if (status != B_OK)
9891 		return status;
9892 
9893 	int attr = attr_open(fd, NULL, attribute, O_RDONLY, false);
9894 	if (attr < 0)
9895 		return attr;
9896 
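	// fetch the descriptor so we can invoke its stat hook directly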
9897 	struct file_descriptor* descriptor
9898 		= get_fd(get_current_io_context(false), attr);
9899 	if (descriptor == NULL) {
9900 		_user_close(attr);
9901 		return B_FILE_ERROR;
9902 	}
9903 
9904 	struct stat stat;
9905 	if (descriptor->ops->fd_read_stat)
9906 		status = descriptor->ops->fd_read_stat(descriptor, &stat);
9907 	else
9908 		status = B_UNSUPPORTED;
9909 
9910 	put_fd(descriptor);
9911 	_user_close(attr);
9912 
9913 	if (status == B_OK) {
9914 		attr_info info;
9915 		info.type = stat.st_type;
9916 		info.size = stat.st_size;
9917 
9918 		if (user_memcpy(userAttrInfo, &info, sizeof(struct attr_info)) != B_OK)
9919 			return B_BAD_ADDRESS;
9920 	}
9921 
9922 	return status;
9923 }
9924 
9925 
9926 int
9927 _user_open_attr(int fd, const char* userPath, const char* userName,
9928 	uint32 type, int openMode)
9929 {
9930 	char name[B_FILE_NAME_LENGTH];
9931 
9932 	if (!IS_USER_ADDRESS(userName))
9933 		return B_BAD_ADDRESS;
9934 	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9935 	if (status != B_OK)
9936 		return status;
9937 
9938 	KPath pathBuffer;
9939 	if (pathBuffer.InitCheck() != B_OK)
9940 		return B_NO_MEMORY;
9941 
9942 	char* path = pathBuffer.LockBuffer();
9943 
9944 	if (userPath != NULL) {
9945 		if (!IS_USER_ADDRESS(userPath))
9946 			return B_BAD_ADDRESS;
9947 		status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
9948 		if (status != B_OK)
9949 			return status;
9950 	}
9951 
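	// with O_CREAT set, create the attribute (of the given type) as needed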
9952 	if ((openMode & O_CREAT) != 0) {
9953 		return attr_create(fd, userPath ? path : NULL, name, type, openMode,
9954 			false);
9955 	}
9956 
9957 	return attr_open(fd, userPath ? path : NULL, name, openMode, false);
9958 }
9959 
9960 
9961 status_t
9962 _user_remove_attr(int fd, const char* userName)
9963 {
9964 	char name[B_FILE_NAME_LENGTH];
9965 
9966 	if (!IS_USER_ADDRESS(userName))
9967 		return B_BAD_ADDRESS;
9968 	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
9969 	if (status != B_OK)
9970 		return status;
9971 
9972 	return attr_remove(fd, name, false);
9973 }
9974 
9975 
9976 status_t
9977 _user_rename_attr(int fromFile, const char* userFromName, int toFile,
9978 	const char* userToName)
9979 {
9980 	if (!IS_USER_ADDRESS(userFromName)
9981 		|| !IS_USER_ADDRESS(userToName))
9982 		return B_BAD_ADDRESS;
9983 
9984 	KPath fromNameBuffer(B_FILE_NAME_LENGTH);
9985 	KPath toNameBuffer(B_FILE_NAME_LENGTH);
9986 	if (fromNameBuffer.InitCheck() != B_OK || toNameBuffer.InitCheck() != B_OK)
9987 		return B_NO_MEMORY;
9988 
9989 	char* fromName = fromNameBuffer.LockBuffer();
9990 	char* toName = toNameBuffer.LockBuffer();
9991 
	status_t status = user_copy_name(fromName, userFromName,
		B_FILE_NAME_LENGTH);
9993 	if (status != B_OK)
9994 		return status;
9995 	status = user_copy_name(toName, userToName, B_FILE_NAME_LENGTH);
9996 	if (status != B_OK)
9997 		return status;
9998 
9999 	return attr_rename(fromFile, fromName, toFile, toName, false);
10000 }
10001 
10002 
10003 int
10004 _user_open_index_dir(dev_t device)
10005 {
10006 	return index_dir_open(device, false);
10007 }
10008 
10009 
10010 status_t
10011 _user_create_index(dev_t device, const char* userName, uint32 type,
10012 	uint32 flags)
10013 {
10014 	char name[B_FILE_NAME_LENGTH];
10015 
10016 	if (!IS_USER_ADDRESS(userName))
10017 		return B_BAD_ADDRESS;
10018 	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
10019 	if (status != B_OK)
10020 		return status;
10021 
10022 	return index_create(device, name, type, flags, false);
10023 }
10024 
10025 
10026 status_t
10027 _user_read_index_stat(dev_t device, const char* userName, struct stat* userStat)
10028 {
10029 	char name[B_FILE_NAME_LENGTH];
10030 	struct stat stat = {0};
10031 	status_t status;
10032 
10033 	if (!IS_USER_ADDRESS(userName) || !IS_USER_ADDRESS(userStat))
10034 		return B_BAD_ADDRESS;
10035 	status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
10036 	if (status != B_OK)
10037 		return status;
10038 
10039 	status = index_name_read_stat(device, name, &stat, false);
10040 	if (status == B_OK) {
10041 		if (user_memcpy(userStat, &stat, sizeof(stat)) != B_OK)
10042 			return B_BAD_ADDRESS;
10043 	}
10044 
10045 	return status;
10046 }
10047 
10048 
10049 status_t
10050 _user_remove_index(dev_t device, const char* userName)
10051 {
10052 	char name[B_FILE_NAME_LENGTH];
10053 
10054 	if (!IS_USER_ADDRESS(userName))
10055 		return B_BAD_ADDRESS;
10056 	status_t status = user_copy_name(name, userName, B_FILE_NAME_LENGTH);
10057 	if (status != B_OK)
10058 		return status;
10059 
10060 	return index_remove(device, name, false);
10061 }
10062 
10063 
10064 status_t
10065 _user_getcwd(char* userBuffer, size_t size)
10066 {
10067 	if (size == 0)
10068 		return B_BAD_VALUE;
10069 	if (!IS_USER_ADDRESS(userBuffer))
10070 		return B_BAD_ADDRESS;
10071 
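	// no need to allocate more than the maximum path length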
10072 	if (size > kMaxPathLength)
10073 		size = kMaxPathLength;
10074 
10075 	KPath pathBuffer(size);
10076 	if (pathBuffer.InitCheck() != B_OK)
10077 		return B_NO_MEMORY;
10078 
	TRACE(("user_getcwd: buf %p, %" B_PRIuSIZE "\n", userBuffer, size));
10080 
10081 	char* path = pathBuffer.LockBuffer();
10082 
10083 	status_t status = get_cwd(path, size, false);
10084 	if (status != B_OK)
10085 		return status;
10086 
10087 	// Copy back the result
10088 	if (user_strlcpy(userBuffer, path, size) < B_OK)
10089 		return B_BAD_ADDRESS;
10090 
10091 	return status;
10092 }
10093 
10094 
10095 status_t
10096 _user_setcwd(int fd, const char* userPath)
10097 {
10098 	TRACE(("user_setcwd: path = %p\n", userPath));
10099 
10100 	KPath pathBuffer;
10101 	if (pathBuffer.InitCheck() != B_OK)
10102 		return B_NO_MEMORY;
10103 
10104 	char* path = pathBuffer.LockBuffer();
10105 
10106 	if (userPath != NULL) {
10107 		if (!IS_USER_ADDRESS(userPath))
10108 			return B_BAD_ADDRESS;
10109 		status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
10110 		if (status != B_OK)
10111 			return status;
10112 	}
10113 
10114 	return set_cwd(fd, userPath != NULL ? path : NULL, false);
10115 }
10116 
10117 
10118 status_t
10119 _user_change_root(const char* userPath)
10120 {
10121 	// only root is allowed to chroot()
10122 	if (geteuid() != 0)
10123 		return B_NOT_ALLOWED;
10124 
10125 	// alloc path buffer
10126 	KPath pathBuffer;
10127 	if (pathBuffer.InitCheck() != B_OK)
10128 		return B_NO_MEMORY;
10129 
	// copy userland path to kernel; a NULL path would leave the buffer
	// uninitialized, so reject it outright
	char* path = pathBuffer.LockBuffer();
	if (userPath == NULL)
		return B_BAD_VALUE;
	if (!IS_USER_ADDRESS(userPath))
		return B_BAD_ADDRESS;
	status_t status = user_copy_name(path, userPath, B_PATH_NAME_LENGTH);
	if (status != B_OK)
		return status;

	// get the vnode
	struct vnode* vnode;
	status = path_to_vnode(path, true, &vnode, NULL, false);
10143 	if (status != B_OK)
10144 		return status;
10145 
10146 	// set the new root
10147 	struct io_context* context = get_current_io_context(false);
10148 	mutex_lock(&sIOContextRootLock);
10149 	struct vnode* oldRoot = context->root;
10150 	context->root = vnode;
10151 	mutex_unlock(&sIOContextRootLock);
10152 
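	// drop the reference the io_context held on the old root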
10153 	put_vnode(oldRoot);
10154 
10155 	return B_OK;
10156 }
10157 
10158 
10159 int
10160 _user_open_query(dev_t device, const char* userQuery, size_t queryLength,
10161 	uint32 flags, port_id port, int32 token)
10162 {
10163 	if (device < 0 || userQuery == NULL || queryLength == 0)
10164 		return B_BAD_VALUE;
10165 
10166 	if (!IS_USER_ADDRESS(userQuery))
10167 		return B_BAD_ADDRESS;
10168 
10169 	// this is a safety restriction
10170 	if (queryLength >= 65536)
10171 		return B_NAME_TOO_LONG;
10172 
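	// small queries are copied onto the stack, larger ones onto the heap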
10173 	BStackOrHeapArray<char, 128> query(queryLength + 1);
10174 	if (!query.IsValid())
10175 		return B_NO_MEMORY;
10176 
10177 	if (user_strlcpy(query, userQuery, queryLength + 1) < B_OK)
10178 		return B_BAD_ADDRESS;
10179 
10180 	return query_open(device, query, flags, port, token, false);
10181 }
10182 
10183 
10184 #include "vfs_request_io.cpp"
10185