xref: /haiku/src/add-ons/kernel/file_systems/bfs/Inode.cpp (revision a085e81e62d7a860f809b4fb7c7bf5654c396985)
1 /*
2  * Copyright 2001-2012, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Inode access functions
8 
9 
10 #include "Debug.h"
11 #include "Inode.h"
12 #include "BPlusTree.h"
13 #include "Index.h"
14 
15 
16 #if BFS_TRACING && !defined(BFS_SHELL) && !defined(_BOOT_MODE)
17 namespace BFSInodeTracing {
18 
19 class Create : public AbstractTraceEntry {
20 public:
21 	Create(Inode* inode, Inode* parent, const char* name, int32 mode,
22 			int openMode, uint32 type)
23 		:
24 		fInode(inode),
25 		fID(inode->ID()),
26 		fParent(parent),
27 		fParentID(parent != NULL ? parent->ID() : 0),
28 		fMode(mode),
29 		fOpenMode(openMode),
30 		fType(type)
31 	{
32 		if (name != NULL)
33 			strlcpy(fName, name, sizeof(fName));
34 		else
35 			fName[0] = '\0';
36 
37 		Initialized();
38 	}
39 
40 	virtual void AddDump(TraceOutput& out)
41 	{
42 		out.Print("bfs:Create %Ld (%p), parent %Ld (%p), \"%s\", "
43 			"mode %lx, omode %x, type %lx", fID, fInode, fParentID,
44 			fParent, fName, fMode, fOpenMode, fType);
45 	}
46 
47 private:
48 	Inode*	fInode;
49 	ino_t	fID;
50 	Inode*	fParent;
51 	ino_t	fParentID;
52 	char	fName[32];
53 	int32	fMode;
54 	int		fOpenMode;
55 	uint32	fType;
56 };
57 
58 class Remove : public AbstractTraceEntry {
59 public:
60 	Remove(Inode* inode, const char* name)
61 		:
62 		fInode(inode),
63 		fID(inode->ID())
64 	{
65 		strlcpy(fName, name, sizeof(fName));
66 		Initialized();
67 	}
68 
69 	virtual void AddDump(TraceOutput& out)
70 	{
71 		out.Print("bfs:Remove %Ld (%p), \"%s\"", fID, fInode, fName);
72 	}
73 
74 private:
75 	Inode*	fInode;
76 	ino_t	fID;
77 	char	fName[32];
78 };
79 
80 class Action : public AbstractTraceEntry {
81 public:
82 	Action(const char* action, Inode* inode)
83 		:
84 		fInode(inode),
85 		fID(inode->ID())
86 	{
87 		strlcpy(fAction, action, sizeof(fAction));
88 		Initialized();
89 	}
90 
91 	virtual void AddDump(TraceOutput& out)
92 	{
93 		out.Print("bfs:%s %Ld (%p)\n", fAction, fID, fInode);
94 	}
95 
96 private:
97 	Inode*	fInode;
98 	ino_t	fID;
99 	char	fAction[16];
100 };
101 
102 class Resize : public AbstractTraceEntry {
103 public:
104 	Resize(Inode* inode, off_t oldSize, off_t newSize, bool trim)
105 		:
106 		fInode(inode),
107 		fID(inode->ID()),
108 		fOldSize(oldSize),
109 		fNewSize(newSize),
110 		fTrim(trim)
111 	{
112 		Initialized();
113 	}
114 
115 	virtual void AddDump(TraceOutput& out)
116 	{
117 		out.Print("bfs:%s %Ld (%p), %Ld -> %Ld", fTrim ? "Trim" : "Resize",
118 			fID, fInode, fOldSize, fNewSize);
119 	}
120 
121 private:
122 	Inode*	fInode;
123 	ino_t	fID;
124 	off_t	fOldSize;
125 	off_t	fNewSize;
126 	bool	fTrim;
127 };
128 
129 }	// namespace BFSInodeTracing
130 
131 #	define T(x) new(std::nothrow) BFSInodeTracing::x;
132 #else
133 #	define T(x) ;
134 #endif
135 
136 
137 /*!	A helper class used by Inode::Create() to keep track of the belongings
138 	of an inode creation in progress.
139 	This class will make sure everything is cleaned up properly.
140 */
141 class InodeAllocator {
142 public:
143 							InodeAllocator(Transaction& transaction);
144 							~InodeAllocator();
145 
146 			status_t		New(block_run* parentRun, mode_t mode,
147 								block_run& run, fs_vnode_ops* vnodeOps,
148 								Inode** _inode);
149 			status_t		CreateTree();
150 			status_t		Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags);
151 
152 private:
153 	static	void			_TransactionListener(int32 id, int32 event,
154 								void* _inode);
155 
156 			Transaction*	fTransaction;
157 			block_run		fRun;
158 			Inode*			fInode;
159 };
160 
161 
162 InodeAllocator::InodeAllocator(Transaction& transaction)
163 	:
164 	fTransaction(&transaction),
165 	fInode(NULL)
166 {
167 }
168 
169 
170 InodeAllocator::~InodeAllocator()
171 {
172 	if (fTransaction != NULL) {
173 		Volume* volume = fTransaction->GetVolume();
174 
175 		if (fInode != NULL) {
176 			fInode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
177 				// this unblocks any pending bfs_read_vnode() calls
178 			fInode->Free(*fTransaction);
179 
180 			if (fInode->fTree != NULL)
181 				fTransaction->RemoveListener(fInode->fTree);
182 			fTransaction->RemoveListener(fInode);
183 
184 			remove_vnode(volume->FSVolume(), fInode->ID());
185 		} else
186 			volume->Free(*fTransaction, fRun);
187 	}
188 
189 	delete fInode;
190 }
191 
192 
193 status_t
194 InodeAllocator::New(block_run* parentRun, mode_t mode, block_run& run,
195 	fs_vnode_ops* vnodeOps, Inode** _inode)
196 {
197 	Volume* volume = fTransaction->GetVolume();
198 
199 	status_t status = volume->AllocateForInode(*fTransaction, parentRun, mode,
200 		fRun);
201 	if (status < B_OK) {
202 		// don't free the space in the destructor, because
203 		// the allocation failed
204 		fTransaction = NULL;
205 		RETURN_ERROR(status);
206 	}
207 
208 	run = fRun;
209 	fInode = new Inode(volume, *fTransaction, volume->ToVnode(run), mode, run);
210 	if (fInode == NULL)
211 		RETURN_ERROR(B_NO_MEMORY);
212 
213 	if (!volume->IsInitializing()) {
214 		status = new_vnode(volume->FSVolume(), fInode->ID(), fInode,
215 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps);
216 		if (status < B_OK) {
217 			delete fInode;
218 			fInode = NULL;
219 			RETURN_ERROR(status);
220 		}
221 	}
222 
223 	fInode->WriteLockInTransaction(*fTransaction);
224 	*_inode = fInode;
225 	return B_OK;
226 }
227 
228 
229 status_t
230 InodeAllocator::CreateTree()
231 {
232 	Volume* volume = fTransaction->GetVolume();
233 
234 	// force S_STR_INDEX to be set, if no type is set
235 	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
236 		fInode->Node().mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
237 
238 	BPlusTree* tree = fInode->fTree = new BPlusTree(*fTransaction, fInode);
239 	if (tree == NULL || tree->InitCheck() < B_OK)
240 		return B_ERROR;
241 
242 	if (fInode->IsRegularNode()) {
243 		if (tree->Insert(*fTransaction, ".", fInode->ID()) < B_OK
244 			|| tree->Insert(*fTransaction, "..",
245 					volume->ToVnode(fInode->Parent())) < B_OK)
246 			return B_ERROR;
247 	}
248 	return B_OK;
249 }
250 
251 
252 status_t
253 InodeAllocator::Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags)
254 {
255 	ASSERT(fInode != NULL && fTransaction != NULL);
256 	Volume* volume = fTransaction->GetVolume();
257 
258 	status_t status = fInode->WriteBack(*fTransaction);
259 	if (status < B_OK) {
260 		FATAL(("writing new inode %" B_PRIdINO " failed!\n", fInode->ID()));
261 		return status;
262 	}
263 
264 	// Symbolic links are not published -- the caller needs to do this once
265 	// the contents have been written.
266 	if (!fInode->IsSymLink() && volume->ID() >= 0) {
267 		status = publish_vnode(volume->FSVolume(), fInode->ID(), fInode,
268 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps, fInode->Mode(),
269 			publishFlags);
270 	}
271 
272 	if (status == B_OK) {
273 		cache_add_transaction_listener(volume->BlockCache(), fTransaction->ID(),
274 			TRANSACTION_ABORTED, &_TransactionListener, fInode);
275 	}
276 
277 	fTransaction = NULL;
278 	fInode = NULL;
279 
280 	return status;
281 }
282 
283 
284 /*static*/ void
285 InodeAllocator::_TransactionListener(int32 id, int32 event, void* _inode)
286 {
287 	Inode* inode = (Inode*)_inode;
288 
289 	if (event == TRANSACTION_ABORTED)
290 		panic("transaction %d aborted, inode %p still around!\n", (int)id, inode);
291 }
292 
293 
294 //	#pragma mark -
295 
296 
297 status_t
298 bfs_inode::InitCheck(Volume* volume) const
299 {
300 	if (Magic1() != INODE_MAGIC1
301 		|| !(Flags() & INODE_IN_USE)
302 		|| inode_num.Length() != 1
303 		// matches inode size?
304 		|| (uint32)InodeSize() != volume->InodeSize()
305 		// parent resides on disk?
306 		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
307 		|| parent.AllocationGroup() < 0
308 		|| parent.Start() > (1L << volume->AllocationGroupShift())
309 		|| parent.Length() != 1
310 		// attributes, too?
311 		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
312 		|| attributes.AllocationGroup() < 0
313 		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
314 		RETURN_ERROR(B_BAD_DATA);
315 
316 	if (Flags() & INODE_DELETED)
317 		return B_NOT_ALLOWED;
318 
319 	// TODO: Add some tests to check the integrity of the other stuff here,
320 	// especially for the data_stream!
321 
322 	return B_OK;
323 }
324 
325 
326 //	#pragma mark - Inode
327 
328 
329 Inode::Inode(Volume* volume, ino_t id)
330 	:
331 	fVolume(volume),
332 	fID(id),
333 	fTree(NULL),
334 	fAttributes(NULL),
335 	fCache(NULL),
336 	fMap(NULL)
337 {
338 	PRINT(("Inode::Inode(volume = %p, id = %Ld) @ %p\n", volume, id, this));
339 
340 	UpdateNodeFromDisk();
341 
342 	rw_lock_init(&fLock, "bfs inode");
343 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
344 
345 	// these two will help to maintain the indices
346 	fOldSize = Size();
347 	fOldLastModified = LastModified();
348 
349 	if (IsContainer())
350 		fTree = new BPlusTree(this);
351 	if (NeedsFileCache()) {
352 		SetFileCache(file_cache_create(fVolume->ID(), ID(), Size()));
353 		SetMap(file_map_create(volume->ID(), ID(), Size()));
354 	}
355 }
356 
357 
358 Inode::Inode(Volume* volume, Transaction& transaction, ino_t id, mode_t mode,
359 		block_run& run)
360 	:
361 	fVolume(volume),
362 	fID(id),
363 	fTree(NULL),
364 	fAttributes(NULL),
365 	fCache(NULL),
366 	fMap(NULL)
367 {
368 	PRINT(("Inode::Inode(volume = %p, transaction = %p, id = %Ld) @ %p\n",
369 		volume, &transaction, id, this));
370 
371 	rw_lock_init(&fLock, "bfs inode");
372 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
373 
374 	NodeGetter node(volume, transaction, this, true);
375 	memset(&fNode, 0, sizeof(bfs_inode));
376 
377 	// Initialize the bfs_inode structure -- it's not written back to disk
378 	// here, because it will be done so already when the inode could be
379 	// created completely.
380 
381 	Node().magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
382 	Node().inode_num = run;
383 	Node().mode = HOST_ENDIAN_TO_BFS_INT32(mode);
384 	Node().flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
385 
386 	Node().create_time = Node().last_modified_time = Node().status_change_time
387 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
388 
389 	Node().inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
390 
391 	// these two will help to maintain the indices
392 	fOldSize = Size();
393 	fOldLastModified = LastModified();
394 }
395 
396 
397 Inode::~Inode()
398 {
399 	PRINT(("Inode::~Inode() @ %p\n", this));
400 
401 	file_cache_delete(FileCache());
402 	file_map_delete(Map());
403 	delete fTree;
404 
405 	rw_lock_destroy(&fLock);
406 	recursive_lock_destroy(&fSmallDataLock);
407 }
408 
409 
410 status_t
411 Inode::InitCheck(bool checkNode) const
412 {
413 	// test inode magic and flags
414 	if (checkNode) {
415 		status_t status = Node().InitCheck(fVolume);
416 		if (status == B_BUSY)
417 			return B_BUSY;
418 
419 		if (status != B_OK) {
420 			FATAL(("inode at block %" B_PRIdOFF " corrupt!\n", BlockNumber()));
421 			RETURN_ERROR(B_BAD_DATA);
422 		}
423 	}
424 
425 	if (IsContainer()) {
426 		// inodes that have a B+tree
427 		if (fTree == NULL)
428 			RETURN_ERROR(B_NO_MEMORY);
429 
430 		status_t status = fTree->InitCheck();
431 		if (status != B_OK) {
432 			FATAL(("inode tree at block %" B_PRIdOFF " corrupt!\n",
433 				BlockNumber()));
434 			RETURN_ERROR(B_BAD_DATA);
435 		}
436 	}
437 
438 	if (NeedsFileCache() && (fCache == NULL || fMap == NULL))
439 		return B_NO_MEMORY;
440 
441 	return B_OK;
442 }
443 
444 
445 /*!	Adds this inode to the specified transaction. This means that the inode will
446 	be write locked until the transaction ended.
447 	To ensure that the inode will stay valid until that point, an extra reference
448 	is acquired to it as long as this transaction stays active.
449 */
450 void
451 Inode::WriteLockInTransaction(Transaction& transaction)
452 {
453 	// These flags can only change while holding the transaction lock
454 	if ((Flags() & INODE_IN_TRANSACTION) != 0)
455 		return;
456 
457 	// We share the same list link with the removed list, so we have to remove
458 	// the inode from that list here (and add it back when we no longer need it)
459 	if ((Flags() & INODE_DELETED) != 0)
460 		fVolume->RemovedInodes().Remove(this);
461 
462 	if (!fVolume->IsInitializing())
463 		acquire_vnode(fVolume->FSVolume(), ID());
464 
465 	rw_lock_write_lock(&Lock());
466 	Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
467 
468 	transaction.AddListener(this);
469 }
470 
471 
472 status_t
473 Inode::WriteBack(Transaction& transaction)
474 {
475 	NodeGetter node(fVolume, transaction, this);
476 	if (node.WritableNode() == NULL)
477 		return B_IO_ERROR;
478 
479 	memcpy(node.WritableNode(), &Node(), sizeof(bfs_inode));
480 	return B_OK;
481 }
482 
483 
484 void
485 Inode::UpdateNodeFromDisk()
486 {
487 	NodeGetter node(fVolume, this);
488 	memcpy(&fNode, node.Node(), sizeof(bfs_inode));
489 	fNode.flags &= HOST_ENDIAN_TO_BFS_INT32(INODE_PERMANENT_FLAGS);
490 }
491 
492 
493 status_t
494 Inode::CheckPermissions(int accessMode) const
495 {
496 	// you never have write access to a read-only volume
497 	if ((accessMode & W_OK) != 0 && fVolume->IsReadOnly())
498 		return B_READ_ONLY_DEVICE;
499 
500 	// get node permissions
501 	mode_t mode = Mode();
502 	int userPermissions = (mode & S_IRWXU) >> 6;
503 	int groupPermissions = (mode & S_IRWXG) >> 3;
504 	int otherPermissions = mode & S_IRWXO;
505 
506 	// get the node permissions for this uid/gid
507 	int permissions = 0;
508 	uid_t uid = geteuid();
509 	gid_t gid = getegid();
510 
511 	if (uid == 0) {
512 		// user is root
513 		// root has always read/write permission, but at least one of the
514 		// X bits must be set for execute permission
515 		permissions = userPermissions | groupPermissions | otherPermissions
516 			| R_OK | W_OK;
517 	} else if (uid == (uid_t)fNode.UserID()) {
518 		// user is node owner
519 		permissions = userPermissions;
520 	} else if (gid == (gid_t)fNode.GroupID()) {
521 		// user is in owning group
522 		permissions = groupPermissions;
523 	} else {
524 		// user is one of the others
525 		permissions = otherPermissions;
526 	}
527 
528 	return (accessMode & ~permissions) == 0 ? B_OK : B_NOT_ALLOWED;
529 }
530 
531 
532 //	#pragma mark - attributes
533 
534 
535 void
536 Inode::_AddIterator(AttributeIterator* iterator)
537 {
538 	RecursiveLocker _(fSmallDataLock);
539 	fIterators.Add(iterator);
540 }
541 
542 
543 void
544 Inode::_RemoveIterator(AttributeIterator* iterator)
545 {
546 	RecursiveLocker _(fSmallDataLock);
547 	fIterators.Remove(iterator);
548 }
549 
550 
551 /*!	Tries to free up "bytes" space in the small_data section by moving
552 	attributes to real files. Used for system attributes like the name.
553 	You need to hold the fSmallDataLock when you call this method
554 */
555 status_t
556 Inode::_MakeSpaceForSmallData(Transaction& transaction, bfs_inode* node,
557 	const char* name, int32 bytes)
558 {
559 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
560 
561 	while (bytes > 0) {
562 		small_data* item = node->SmallDataStart();
563 		small_data* max = NULL;
564 		int32 index = 0, maxIndex = 0;
565 		for (; !item->IsLast(node); item = item->Next(), index++) {
566 			// should not remove those
567 			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
568 				continue;
569 
570 			if (max == NULL || max->Size() < item->Size()) {
571 				maxIndex = index;
572 				max = item;
573 			}
574 
575 			// Remove the first one large enough to free the needed amount of
576 			// bytes
577 			if (bytes < (int32)item->Size())
578 				break;
579 		}
580 
581 		if (item->IsLast(node) || (int32)item->Size() < bytes)
582 			return B_ERROR;
583 
584 		bytes -= max->Size();
585 
586 		// Move the attribute to a real attribute file
587 		// Luckily, this doesn't cause any index updates
588 
589 		Inode* attribute;
590 		status_t status = CreateAttribute(transaction, item->Name(),
591 			item->Type(), &attribute);
592 		if (status != B_OK)
593 			RETURN_ERROR(status);
594 
595 		size_t length = item->DataSize();
596 		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
597 
598 		ReleaseAttribute(attribute);
599 
600 		if (status != B_OK) {
601 			Vnode vnode(fVolume, Attributes());
602 			Inode* attributes;
603 			if (vnode.Get(&attributes) < B_OK
604 				|| attributes->Remove(transaction, name) < B_OK) {
605 				FATAL(("Could not remove newly created attribute!\n"));
606 			}
607 
608 			RETURN_ERROR(status);
609 		}
610 
611 		_RemoveSmallData(node, max, maxIndex);
612 	}
613 	return B_OK;
614 }
615 
616 
617 /*!	Private function which removes the given attribute from the small_data
618 	section.
619 	You need to hold the fSmallDataLock when you call this method
620 */
621 status_t
622 Inode::_RemoveSmallData(bfs_inode* node, small_data* item, int32 index)
623 {
624 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
625 
626 	small_data* next = item->Next();
627 	if (!next->IsLast(node)) {
628 		// find the last attribute
629 		small_data* last = next;
630 		while (!last->IsLast(node))
631 			last = last->Next();
632 
633 		int32 size = (uint8*)last - (uint8*)next;
634 		if (size < 0
635 			|| size > (uint8*)node + fVolume->BlockSize() - (uint8*)next)
636 			return B_BAD_DATA;
637 
638 		memmove(item, next, size);
639 
640 		// Move the "last" one to its new location and
641 		// correctly terminate the small_data section
642 		last = (small_data*)((uint8*)last - ((uint8*)next - (uint8*)item));
643 		memset(last, 0, (uint8*)node + fVolume->BlockSize() - (uint8*)last);
644 	} else
645 		memset(item, 0, item->Size());
646 
647 	// update all current iterators
648 	SinglyLinkedList<AttributeIterator>::Iterator iterator
649 		= fIterators.GetIterator();
650 	while (iterator.HasNext()) {
651 		iterator.Next()->Update(index, -1);
652 	}
653 
654 	return B_OK;
655 }
656 
657 
658 //!	Removes the given attribute from the small_data section.
659 status_t
660 Inode::_RemoveSmallData(Transaction& transaction, NodeGetter& nodeGetter,
661 	const char* name)
662 {
663 	if (name == NULL)
664 		return B_BAD_VALUE;
665 
666 	bfs_inode* node = nodeGetter.WritableNode();
667 	RecursiveLocker locker(fSmallDataLock);
668 
669 	// search for the small_data item
670 
671 	small_data* item = node->SmallDataStart();
672 	int32 index = 0;
673 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
674 		item = item->Next();
675 		index++;
676 	}
677 
678 	if (item->IsLast(node))
679 		return B_ENTRY_NOT_FOUND;
680 
681 	nodeGetter.MakeWritable(transaction);
682 
683 	status_t status = _RemoveSmallData(node, item, index);
684 	if (status == B_OK) {
685 		Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
686 			bfs_inode::ToInode(real_time_clock_usecs()));
687 
688 		status = WriteBack(transaction);
689 	}
690 
691 	return status;
692 }
693 
694 
695 /*!	Try to place the given attribute in the small_data section - if the
696 	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
697 	In that case, the attribute should be written to a real attribute file;
698 	it's the caller's responsibility to remove any existing attributes in the
699 	small data section if that's the case.
700 
701 	Note that you need to write back the inode yourself after having called that
702 	method - it's a bad API decision that it needs a transaction but enforces
703 	you to write back the inode all by yourself, but it's just more efficient
704 	in most cases...
705 */
706 status_t
707 Inode::_AddSmallData(Transaction& transaction, NodeGetter& nodeGetter,
708 	const char* name, uint32 type, off_t pos, const uint8* data, size_t length,
709 	bool force)
710 {
711 	bfs_inode* node = nodeGetter.WritableNode();
712 
713 	if (node == NULL || name == NULL || data == NULL)
714 		return B_BAD_VALUE;
715 
716 	// reject any requests that can't fit into the small_data section
717 	uint32 nameLength = strlen(name);
718 	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + pos + length + 1;
719 	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
720 		return B_DEVICE_FULL;
721 
722 	nodeGetter.MakeWritable(transaction);
723 	RecursiveLocker locker(fSmallDataLock);
724 
725 	// Find the last item or one with the same name we have to add
726 	small_data* item = node->SmallDataStart();
727 	int32 index = 0;
728 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
729 		item = item->Next();
730 		index++;
731 	}
732 
733 	// is the attribute already in the small_data section?
734 	// then just replace the data part of that one
735 	if (!item->IsLast(node)) {
736 		// find last attribute
737 		small_data* last = item;
738 		while (!last->IsLast(node))
739 			last = last->Next();
740 
741 		// try to change the attributes value
742 		if (item->data_size > pos + length
743 			|| force
744 			|| ((uint8*)last + pos + length - item->DataSize())
745 					<= ((uint8*)node + fVolume->InodeSize())) {
746 			// Make room for the new attribute if needed (and we are forced
747 			// to do so)
748 			if (force && ((uint8*)last + pos + length - item->DataSize())
749 					> ((uint8*)node + fVolume->InodeSize())) {
750 				// We also take the free space at the end of the small_data
751 				// section into account, and request only what's really needed
752 				uint32 needed = pos + length - item->DataSize() -
753 					(uint32)((uint8*)node + fVolume->InodeSize()
754 						- (uint8*)last);
755 
756 				if (_MakeSpaceForSmallData(transaction, node, name, needed)
757 						!= B_OK)
758 					return B_ERROR;
759 
760 				// reset our pointers
761 				item = node->SmallDataStart();
762 				index = 0;
763 				while (!item->IsLast(node) && strcmp(item->Name(), name)) {
764 					item = item->Next();
765 					index++;
766 				}
767 
768 				last = item;
769 				while (!last->IsLast(node))
770 					last = last->Next();
771 			}
772 
773 			size_t oldDataSize = item->DataSize();
774 
775 			// Normally, we can just overwrite the attribute data as the size
776 			// is specified by the type and does not change that often
777 			if (pos + length != item->DataSize()) {
778 				// move the attributes after the current one
779 				small_data* next = item->Next();
780 				if (!next->IsLast(node)) {
781 					memmove((uint8*)item + spaceNeeded, next,
782 						(uint8*)last - (uint8*)next);
783 				}
784 
785 				// Move the "last" one to its new location and
786 				// correctly terminate the small_data section
787 				last = (small_data*)((uint8*)last
788 					- ((uint8*)next - ((uint8*)item + spaceNeeded)));
789 				if ((uint8*)last < (uint8*)node + fVolume->BlockSize()) {
790 					memset(last, 0, (uint8*)node + fVolume->BlockSize()
791 						- (uint8*)last);
792 				}
793 
794 				item->data_size = HOST_ENDIAN_TO_BFS_INT16(pos + length);
795 			}
796 
797 			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
798 
799 			if ((uint64)oldDataSize < (uint64)pos) {
800 				// Fill gap with zeros
801 				memset(item->Data() + oldDataSize, 0, pos - oldDataSize);
802 			}
803 			memcpy(item->Data() + pos, data, length);
804 			item->Data()[pos + length] = '\0';
805 
806 			return B_OK;
807 		}
808 
809 		return B_DEVICE_FULL;
810 	}
811 
812 	// try to add the new attribute!
813 
814 	if ((uint8*)item + spaceNeeded > (uint8*)node + fVolume->InodeSize()) {
815 		// there is not enough space for it!
816 		if (!force)
817 			return B_DEVICE_FULL;
818 
819 		// make room for the new attribute
820 		if (_MakeSpaceForSmallData(transaction, node, name, spaceNeeded) < B_OK)
821 			return B_ERROR;
822 
823 		// get new last item!
824 		item = node->SmallDataStart();
825 		index = 0;
826 		while (!item->IsLast(node)) {
827 			item = item->Next();
828 			index++;
829 		}
830 	}
831 
832 	memset(item, 0, spaceNeeded);
833 	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
834 	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
835 	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
836 	strcpy(item->Name(), name);
837 	memcpy(item->Data() + pos, data, length);
838 
839 	// correctly terminate the small_data section
840 	item = item->Next();
841 	if (!item->IsLast(node))
842 		memset(item, 0, (uint8*)node + fVolume->InodeSize() - (uint8*)item);
843 
844 	// update all current iterators
845 	SinglyLinkedList<AttributeIterator>::Iterator iterator
846 		= fIterators.GetIterator();
847 	while (iterator.HasNext()) {
848 		iterator.Next()->Update(index, 1);
849 	}
850 
851 	return B_OK;
852 }
853 
854 
855 /*!	Iterates through the small_data section of an inode.
856 	To start at the beginning of this section, you let smallData
857 	point to NULL, like:
858 		small_data* data = NULL;
859 		while (inode->GetNextSmallData(&data) { ... }
860 
861 	This function is reentrant and doesn't allocate any memory;
862 	you can safely stop calling it at any point (you don't need
863 	to iterate through the whole list).
864 	You need to hold the fSmallDataLock when you call this method
865 */
866 status_t
867 Inode::_GetNextSmallData(bfs_inode* node, small_data** _smallData) const
868 {
869 	if (node == NULL)
870 		RETURN_ERROR(B_BAD_VALUE);
871 
872 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
873 
874 	small_data* data = *_smallData;
875 
876 	// begin from the start?
877 	if (data == NULL)
878 		data = node->SmallDataStart();
879 	else
880 		data = data->Next();
881 
882 	// is already last item?
883 	if (data->IsLast(node))
884 		return B_ENTRY_NOT_FOUND;
885 
886 	*_smallData = data;
887 
888 	return B_OK;
889 }
890 
891 
892 /*!	Finds the attribute "name" in the small data section, and
893 	returns a pointer to it (or NULL if it doesn't exist).
894 	You need to hold the fSmallDataLock when you call this method
895 */
896 small_data*
897 Inode::FindSmallData(const bfs_inode* node, const char* name) const
898 {
899 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
900 
901 	small_data* smallData = NULL;
902 	while (_GetNextSmallData(const_cast<bfs_inode*>(node), &smallData)
903 			== B_OK) {
904 		if (!strcmp(smallData->Name(), name))
905 			return smallData;
906 	}
907 	return NULL;
908 }
909 
910 
911 /*!	Returns a pointer to the node's name if present in the small data
912 	section, NULL otherwise.
913 	You need to hold the fSmallDataLock when you call this method
914 */
915 const char*
916 Inode::Name(const bfs_inode* node) const
917 {
918 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
919 
920 	small_data* smallData = NULL;
921 	while (_GetNextSmallData((bfs_inode*)node, &smallData) == B_OK) {
922 		if (*smallData->Name() == FILE_NAME_NAME
923 			&& smallData->NameSize() == FILE_NAME_NAME_LENGTH)
924 			return (const char*)smallData->Data();
925 	}
926 	return NULL;
927 }
928 
929 
930 /*!	Copies the node's name into the provided buffer.
931 	The buffer should be B_FILE_NAME_LENGTH bytes large.
932 */
933 status_t
934 Inode::GetName(char* buffer, size_t size) const
935 {
936 	NodeGetter node(fVolume, this);
937 	RecursiveLocker locker(fSmallDataLock);
938 
939 	const char* name = Name(node.Node());
940 	if (name == NULL)
941 		return B_ENTRY_NOT_FOUND;
942 
943 	strlcpy(buffer, name, size);
944 	return B_OK;
945 }
946 
947 
948 /*!	Changes or set the name of a file: in the inode small_data section only, it
949 	doesn't change it in the parent directory's b+tree.
950 	Note that you need to write back the inode yourself after having called
951 	that method. It suffers from the same API decision as AddSmallData() does
952 	(and for the same reason).
953 */
954 status_t
955 Inode::SetName(Transaction& transaction, const char* name)
956 {
957 	if (name == NULL || *name == '\0')
958 		return B_BAD_VALUE;
959 
960 	NodeGetter node(fVolume, transaction, this);
961 	const char nameTag[2] = {FILE_NAME_NAME, 0};
962 
963 	return _AddSmallData(transaction, node, nameTag, FILE_NAME_TYPE, 0,
964 		(uint8*)name, strlen(name), true);
965 }
966 
967 
968 status_t
969 Inode::_RemoveAttribute(Transaction& transaction, const char* name,
970 	bool hasIndex, Index* index)
971 {
972 	// remove the attribute file if it exists
973 	Vnode vnode(fVolume, Attributes());
974 	Inode* attributes;
975 	status_t status = vnode.Get(&attributes);
976 	if (status < B_OK)
977 		return status;
978 
979 	// update index
980 	if (index != NULL) {
981 		Inode* attribute;
982 		if ((hasIndex || fVolume->CheckForLiveQuery(name))
983 			&& GetAttribute(name, &attribute) == B_OK) {
984 			uint8 data[BPLUSTREE_MAX_KEY_LENGTH];
985 			size_t length = BPLUSTREE_MAX_KEY_LENGTH;
986 			if (attribute->ReadAt(0, data, &length) == B_OK) {
987 				index->Update(transaction, name, attribute->Type(), data,
988 					length, NULL, 0, this);
989 			}
990 
991 			ReleaseAttribute(attribute);
992 		}
993 	}
994 
995 	if ((status = attributes->Remove(transaction, name)) < B_OK)
996 		return status;
997 
998 	if (attributes->IsEmpty()) {
999 		attributes->WriteLockInTransaction(transaction);
1000 
1001 		// remove attribute directory (don't fail if that can't be done)
1002 		if (remove_vnode(fVolume->FSVolume(), attributes->ID()) == B_OK) {
1003 			// update the inode, so that no one will ever doubt it's deleted :-)
1004 			attributes->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1005 			if (attributes->WriteBack(transaction) == B_OK) {
1006 				Attributes().SetTo(0, 0, 0);
1007 				WriteBack(transaction);
1008 			} else {
1009 				unremove_vnode(fVolume->FSVolume(), attributes->ID());
1010 				attributes->Node().flags
1011 					&= ~HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1012 			}
1013 		}
1014 	}
1015 
1016 	return status;
1017 }
1018 
1019 
1020 /*!	Reads data from the specified attribute.
1021 	This is a high-level attribute function that understands attributes
1022 	in the small_data section as well as real attribute files.
1023 */
1024 status_t
1025 Inode::ReadAttribute(const char* name, int32 type, off_t pos, uint8* buffer,
1026 	size_t* _length)
1027 {
1028 	if (pos < 0)
1029 		pos = 0;
1030 
1031 	// search in the small_data section (which has to be locked first)
1032 	{
1033 		NodeGetter node(fVolume, this);
1034 		RecursiveLocker locker(fSmallDataLock);
1035 
1036 		small_data* smallData = FindSmallData(node.Node(), name);
1037 		if (smallData != NULL) {
1038 			size_t length = *_length;
1039 			if (pos >= smallData->data_size) {
1040 				*_length = 0;
1041 				return B_OK;
1042 			}
1043 			if (length + pos > smallData->DataSize())
1044 				length = smallData->DataSize() - pos;
1045 
1046 			memcpy(buffer, smallData->Data() + pos, length);
1047 			*_length = length;
1048 			return B_OK;
1049 		}
1050 	}
1051 
1052 	// search in the attribute directory
1053 	Inode* attribute;
1054 	status_t status = GetAttribute(name, &attribute);
1055 	if (status == B_OK) {
1056 		status = attribute->ReadAt(pos, (uint8*)buffer, _length);
1057 
1058 		ReleaseAttribute(attribute);
1059 	}
1060 
1061 	RETURN_ERROR(status);
1062 }
1063 
1064 
1065 /*!	Writes data to the specified attribute.
1066 	This is a high-level attribute function that understands attributes
1067 	in the small_data section as well as real attribute files.
1068 */
1069 status_t
1070 Inode::WriteAttribute(Transaction& transaction, const char* name, int32 type,
1071 	off_t pos, const uint8* buffer, size_t* _length, bool* _created)
1072 {
1073 	if (pos < 0)
1074 		return B_BAD_VALUE;
1075 
1076 	// needed to maintain the index
1077 	uint8 oldBuffer[BPLUSTREE_MAX_KEY_LENGTH];
1078 	uint8* oldData = NULL;
1079 	size_t oldLength = 0;
1080 	bool created = false;
1081 
1082 	// TODO: we actually depend on that the contents of "buffer" are constant.
1083 	// If they get changed during the write (hey, user programs), we may mess
1084 	// up our index trees!
1085 	// TODO: for attribute files, we need to log the first
1086 	// BPLUSTREE_MAX_KEY_LENGTH bytes of the data stream, or the same as above
1087 	// might happen.
1088 
1089 	Index index(fVolume);
1090 	bool hasIndex = index.SetTo(name) == B_OK;
1091 
1092 	Inode* attribute = NULL;
1093 	status_t status = B_OK;
1094 
1095 	if (GetAttribute(name, &attribute) != B_OK) {
1096 		// No attribute inode exists yet
1097 
1098 		// save the old attribute data
1099 		NodeGetter node(fVolume, transaction, this);
1100 		recursive_lock_lock(&fSmallDataLock);
1101 
1102 		small_data* smallData = FindSmallData(node.Node(), name);
1103 		if (smallData != NULL) {
1104 			oldLength = smallData->DataSize();
1105 			if (oldLength > 0) {
1106 				if (oldLength > BPLUSTREE_MAX_KEY_LENGTH)
1107 					oldLength = BPLUSTREE_MAX_KEY_LENGTH;
1108 				memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
1109 			}
1110 		} else
1111 			created = true;
1112 
1113 		recursive_lock_unlock(&fSmallDataLock);
1114 
1115 		// if the attribute doesn't exist yet (as a file), try to put it in the
1116 		// small_data section first - if that fails (due to insufficent space),
1117 		// create a real attribute file
1118 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1119 			*_length);
1120 		if (status == B_DEVICE_FULL) {
1121 			if (smallData != NULL) {
1122 				// remove the old attribute from the small data section - there
1123 				// is no space left for the new data
1124 				status = _RemoveSmallData(transaction, node, name);
1125 			} else
1126 				status = B_OK;
1127 
1128 			if (status == B_OK)
1129 				status = CreateAttribute(transaction, name, type, &attribute);
1130 			if (status != B_OK)
1131 				RETURN_ERROR(status);
1132 
1133 			created = true;
1134 		} else if (status == B_OK) {
1135 			// Update status time on attribute write
1136 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1137 				bfs_inode::ToInode(real_time_clock_usecs()));
1138 
1139 			status = WriteBack(transaction);
1140 		}
1141 	}
1142 
1143 	if (attribute != NULL) {
1144 		WriteLocker writeLocker(attribute->fLock);
1145 
1146 		if (hasIndex || fVolume->CheckForLiveQuery(name)) {
1147 			// Save the old attribute data (if this fails, oldLength will
1148 			// reflect it)
1149 			while (attribute->Size() > 0) {
1150 				bigtime_t oldModified = attribute->LastModified();
1151 				writeLocker.Unlock();
1152 
1153 				oldLength = BPLUSTREE_MAX_KEY_LENGTH;
1154 				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
1155 					oldData = oldBuffer;
1156 
1157 				writeLocker.Lock();
1158 
1159 				// Read until the data hasn't changed in between
1160 				if (oldModified == attribute->LastModified())
1161 					break;
1162 
1163 				oldLength = 0;
1164 			}
1165 		}
1166 
1167 		// check if the data fits into the small_data section again
1168 		NodeGetter node(fVolume, transaction, this);
1169 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1170 			*_length);
1171 
1172 		if (status == B_OK) {
1173 			// it does - remove its file
1174 			writeLocker.Unlock();
1175 			status = _RemoveAttribute(transaction, name, false, NULL);
1176 		} else {
1177 			// The attribute type might have been changed - we need to
1178 			// adopt the new one
1179 			attribute->Node().type = HOST_ENDIAN_TO_BFS_INT32(type);
1180 			status = attribute->WriteBack(transaction);
1181 			writeLocker.Unlock();
1182 
1183 			if (status == B_OK) {
1184 				status = attribute->WriteAt(transaction, pos, buffer,
1185 					_length);
1186 			}
1187 		}
1188 
1189 		if (status == B_OK) {
1190 			// Update status time on attribute write
1191 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1192 				bfs_inode::ToInode(real_time_clock_usecs()));
1193 
1194 			status = WriteBack(transaction);
1195 		}
1196 
1197 		attribute->WriteLockInTransaction(transaction);
1198 		ReleaseAttribute(attribute);
1199 	}
1200 
1201 	// TODO: find a better way than this "pos" thing (the begin of the old key
1202 	//	must be copied to the start of the new one for a comparison)
1203 	if (status == B_OK && pos == 0) {
1204 		// index only the first BPLUSTREE_MAX_KEY_LENGTH bytes
1205 		uint16 length = *_length;
1206 		if (length > BPLUSTREE_MAX_KEY_LENGTH)
1207 			length = BPLUSTREE_MAX_KEY_LENGTH;
1208 
1209 		// Update index. Note, Index::Update() may be called even if
1210 		// initializing the index failed - it will just update the live
1211 		// queries in this case
1212 		if (pos < length || (uint64)pos < (uint64)oldLength) {
1213 			index.Update(transaction, name, type, oldData, oldLength, buffer,
1214 				length, this);
1215 		}
1216 	}
1217 
1218 	if (_created != NULL)
1219 		*_created = created;
1220 
1221 	return status;
1222 }
1223 
1224 
1225 /*!	Removes the specified attribute from the inode.
1226 	This is a high-level attribute function that understands attributes
1227 	in the small_data section as well as real attribute files.
1228 */
1229 status_t
1230 Inode::RemoveAttribute(Transaction& transaction, const char* name)
1231 {
1232 	Index index(fVolume);
1233 	bool hasIndex = index.SetTo(name) == B_OK;
1234 	NodeGetter node(fVolume, this);
1235 
1236 	// update index for attributes in the small_data section
1237 	{
1238 		RecursiveLocker _(fSmallDataLock);
1239 
1240 		small_data* smallData = FindSmallData(node.Node(), name);
1241 		if (smallData != NULL) {
1242 			uint32 length = smallData->DataSize();
1243 			if (length > BPLUSTREE_MAX_KEY_LENGTH)
1244 				length = BPLUSTREE_MAX_KEY_LENGTH;
1245 			index.Update(transaction, name, smallData->Type(),
1246 				smallData->Data(), length, NULL, 0, this);
1247 		}
1248 	}
1249 
1250 	status_t status = _RemoveSmallData(transaction, node, name);
1251 	if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
1252 		// remove the attribute file if it exists
1253 		status = _RemoveAttribute(transaction, name, hasIndex, &index);
1254 		if (status == B_OK) {
1255 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1256 				bfs_inode::ToInode(real_time_clock_usecs()));
1257 			WriteBack(transaction);
1258 		}
1259 	}
1260 
1261 	return status;
1262 }
1263 
1264 
1265 /*!	Returns the attribute inode with the specified \a name, in case it exists.
1266 	This method can only return real attribute files; the attributes in the
1267 	small data section are ignored.
1268 */
1269 status_t
1270 Inode::GetAttribute(const char* name, Inode** _attribute)
1271 {
1272 	// does this inode even have attributes?
1273 	if (Attributes().IsZero())
1274 		return B_ENTRY_NOT_FOUND;
1275 
1276 	Vnode vnode(fVolume, Attributes());
1277 	Inode* attributes;
1278 	if (vnode.Get(&attributes) < B_OK) {
1279 		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n",
1280 			name));
1281 		return B_ERROR;
1282 	}
1283 
1284 	BPlusTree* tree = attributes->Tree();
1285 	if (tree == NULL)
1286 		return B_BAD_VALUE;
1287 
1288 	InodeReadLocker locker(attributes);
1289 
1290 	ino_t id;
1291 	status_t status = tree->Find((uint8*)name, (uint16)strlen(name), &id);
1292 	if (status == B_OK) {
1293 		Vnode vnode(fVolume, id);
1294 		Inode* inode;
1295 		// Check if the attribute is really an attribute
1296 		if (vnode.Get(&inode) != B_OK || !inode->IsAttribute())
1297 			return B_ERROR;
1298 
1299 		*_attribute = inode;
1300 		vnode.Keep();
1301 		return B_OK;
1302 	}
1303 
1304 	return status;
1305 }
1306 
1307 
1308 void
1309 Inode::ReleaseAttribute(Inode* attribute)
1310 {
1311 	if (attribute == NULL)
1312 		return;
1313 
1314 	put_vnode(fVolume->FSVolume(), attribute->ID());
1315 }
1316 
1317 
1318 status_t
1319 Inode::CreateAttribute(Transaction& transaction, const char* name, uint32 type,
1320 	Inode** attribute)
1321 {
1322 	// do we need to create the attribute directory first?
1323 	if (Attributes().IsZero()) {
1324 		status_t status = Inode::Create(transaction, this, NULL,
1325 			S_ATTR_DIR | S_DIRECTORY | 0666, 0, 0, NULL);
1326 		if (status < B_OK)
1327 			RETURN_ERROR(status);
1328 	}
1329 	Vnode vnode(fVolume, Attributes());
1330 	Inode* attributes;
1331 	if (vnode.Get(&attributes) < B_OK)
1332 		return B_ERROR;
1333 
1334 	// Inode::Create() locks the inode for us
1335 	return Inode::Create(transaction, attributes, name,
1336 		S_ATTR | S_FILE | 0666, 0, type, NULL, NULL, attribute);
1337 }
1338 
1339 
1340 //	#pragma mark - directory tree
1341 
1342 
1343 bool
1344 Inode::IsEmpty()
1345 {
1346 	TreeIterator iterator(fTree);
1347 
1348 	// index and attribute directories are really empty when they are
1349 	// empty - directories for standard files always contain ".", and
1350 	// "..", so we need to ignore those two
1351 
1352 	uint32 count = 0;
1353 	char name[BPLUSTREE_MAX_KEY_LENGTH];
1354 	uint16 length;
1355 	ino_t id;
1356 	while (iterator.GetNextEntry(name, &length, B_FILE_NAME_LENGTH,
1357 			&id) == B_OK) {
1358 		if (Mode() & (S_ATTR_DIR | S_INDEX_DIR))
1359 			return false;
1360 
1361 		if (++count > 2 || (strcmp(".", name) && strcmp("..", name)))
1362 			return false;
1363 	}
1364 	return true;
1365 }
1366 
1367 
1368 status_t
1369 Inode::ContainerContentsChanged(Transaction& transaction)
1370 {
1371 	ASSERT(!InLastModifiedIndex());
1372 
1373 	Node().last_modified_time = Node().status_change_time
1374 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1375 
1376 	return WriteBack(transaction);
1377 }
1378 
1379 
1380 //	#pragma mark - data stream
1381 
1382 
1383 /*!	Computes the number of bytes this inode occupies in the file system.
1384 	This includes the file meta blocks used for maintaining its data stream.
1385 
1386 	TODO: However, the attributes in extra files are not really accounted for;
1387 	depending on the speed penalty, this should be changed, though (the value
1388 	could be cached in the inode structure or Inode object, though).
1389 */
1390 off_t
1391 Inode::AllocatedSize() const
1392 {
1393 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0) {
1394 		// This symlink does not have a data stream
1395 		return Node().InodeSize();
1396 	}
1397 
1398 	const data_stream& data = Node().data;
1399 	uint32 blockSize = fVolume->BlockSize();
1400 	off_t size = blockSize;
1401 
1402 	if (data.MaxDoubleIndirectRange() != 0) {
1403 		off_t doubleIndirectSize = data.MaxDoubleIndirectRange()
1404 			- data.MaxIndirectRange();
1405 		int32 indirectSize = double_indirect_max_indirect_size(
1406 			data.double_indirect.Length(), fVolume->BlockSize());
1407 
1408 		size += (2 * data.double_indirect.Length()
1409 				+ doubleIndirectSize / indirectSize)
1410 			* blockSize + data.MaxDoubleIndirectRange();
1411 	} else if (data.MaxIndirectRange() != 0)
1412 		size += data.indirect.Length() + data.MaxIndirectRange();
1413 	else
1414 		size += data.MaxDirectRange();
1415 
1416 	if (!Node().attributes.IsZero()) {
1417 		// TODO: to make this exact, we'd had to count all attributes
1418 		size += 2 * blockSize;
1419 			// 2 blocks, one for the attributes inode, one for its B+tree
1420 	}
1421 
1422 	return size;
1423 }
1424 
1425 
1426 /*!	Finds the block_run where "pos" is located in the data_stream of
1427 	the inode.
1428 	If successful, "offset" will then be set to the file offset
1429 	of the block_run returned; so "pos - offset" is for the block_run
1430 	what "pos" is for the whole stream.
1431 	The caller has to make sure that "pos" is inside the stream.
1432 */
1433 status_t
1434 Inode::FindBlockRun(off_t pos, block_run& run, off_t& offset)
1435 {
1436 	data_stream* data = &Node().data;
1437 
1438 	// find matching block run
1439 
1440 	if (data->MaxIndirectRange() > 0 && pos >= data->MaxDirectRange()) {
1441 		if (data->MaxDoubleIndirectRange() > 0
1442 			&& pos >= data->MaxIndirectRange()) {
1443 			// access to double indirect blocks
1444 
1445 			CachedBlock cached(fVolume);
1446 
1447 			int32 runsPerBlock;
1448 			int32 directSize;
1449 			int32 indirectSize;
1450 			get_double_indirect_sizes(data->double_indirect.Length(),
1451 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1452 
1453 			off_t start = pos - data->MaxIndirectRange();
1454 			int32 index = start / indirectSize;
1455 
1456 			block_run* indirect = (block_run*)cached.SetTo(
1457 				fVolume->ToBlock(data->double_indirect) + index / runsPerBlock);
1458 			if (indirect == NULL)
1459 				RETURN_ERROR(B_ERROR);
1460 
1461 			int32 current = (start % indirectSize) / directSize;
1462 
1463 			indirect = (block_run*)cached.SetTo(
1464 				fVolume->ToBlock(indirect[index % runsPerBlock])
1465 				+ current / runsPerBlock);
1466 			if (indirect == NULL)
1467 				RETURN_ERROR(B_ERROR);
1468 
1469 			run = indirect[current % runsPerBlock];
1470 			if (run.Length() != data->double_indirect.Length())
1471 				RETURN_ERROR(B_BAD_DATA);
1472 
1473 			offset = data->MaxIndirectRange() + (index * indirectSize)
1474 				+ (current * directSize);
1475 		} else {
1476 			// access to indirect blocks
1477 
1478 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1479 			off_t runBlockEnd = data->MaxDirectRange();
1480 
1481 			CachedBlock cached(fVolume);
1482 			off_t block = fVolume->ToBlock(data->indirect);
1483 
1484 			for (int32 i = 0; i < data->indirect.Length(); i++) {
1485 				block_run* indirect = (block_run*)cached.SetTo(block + i);
1486 				if (indirect == NULL)
1487 					RETURN_ERROR(B_IO_ERROR);
1488 
1489 				int32 current = -1;
1490 				while (++current < runsPerBlock) {
1491 					if (indirect[current].IsZero())
1492 						break;
1493 
1494 					runBlockEnd += (uint32)indirect[current].Length()
1495 						<< cached.BlockShift();
1496 					if (runBlockEnd > pos) {
1497 						run = indirect[current];
1498 						offset = runBlockEnd - ((uint32)run.Length()
1499 							<< cached.BlockShift());
1500 						return fVolume->ValidateBlockRun(run);
1501 					}
1502 				}
1503 			}
1504 			RETURN_ERROR(B_ERROR);
1505 		}
1506 	} else {
1507 		// access from direct blocks
1508 
1509 		off_t runBlockEnd = 0LL;
1510 		int32 current = -1;
1511 
1512 		while (++current < NUM_DIRECT_BLOCKS) {
1513 			if (data->direct[current].IsZero())
1514 				break;
1515 
1516 			runBlockEnd += (uint32)data->direct[current].Length()
1517 				<< fVolume->BlockShift();
1518 			if (runBlockEnd > pos) {
1519 				run = data->direct[current];
1520 				offset = runBlockEnd
1521 					- ((uint32)run.Length() << fVolume->BlockShift());
1522 				return fVolume->ValidateBlockRun(run);
1523 			}
1524 		}
1525 
1526 		return B_ENTRY_NOT_FOUND;
1527 	}
1528 	return fVolume->ValidateBlockRun(run);
1529 }
1530 
1531 
1532 status_t
1533 Inode::ReadAt(off_t pos, uint8* buffer, size_t* _length)
1534 {
1535 	size_t length = *_length;
1536 
1537 	// set/check boundaries for pos/length
1538 	if (pos < 0)
1539 		return B_BAD_VALUE;
1540 
1541 	InodeReadLocker locker(this);
1542 
1543 	if (pos >= Size() || length == 0) {
1544 		*_length = 0;
1545 		return B_NO_ERROR;
1546 	}
1547 
1548 	locker.Unlock();
1549 
1550 	return file_cache_read(FileCache(), NULL, pos, buffer, _length);
1551 }
1552 
1553 
1554 status_t
1555 Inode::WriteAt(Transaction& transaction, off_t pos, const uint8* buffer,
1556 	size_t* _length)
1557 {
1558 	InodeReadLocker locker(this);
1559 
1560 	// update the last modification time in memory, it will be written
1561 	// back to the inode, and the index when the file is closed
1562 	// TODO: should update the internal last modified time only at this point!
1563 	Node().last_modified_time = Node().status_change_time
1564 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1565 
1566 	// TODO: support INODE_LOGGED!
1567 
1568 	size_t length = *_length;
1569 	bool changeSize = (uint64)pos + (uint64)length > (uint64)Size();
1570 
1571 	// set/check boundaries for pos/length
1572 	if (pos < 0)
1573 		return B_BAD_VALUE;
1574 
1575 	locker.Unlock();
1576 
1577 	// the transaction doesn't have to be started already
1578 	if (changeSize && !transaction.IsStarted())
1579 		transaction.Start(fVolume, BlockNumber());
1580 
1581 	WriteLocker writeLocker(fLock);
1582 
1583 	// Work around possible race condition: Someone might have shrunken the file
1584 	// while we had no lock.
1585 	if (!transaction.IsStarted()
1586 		&& (uint64)pos + (uint64)length > (uint64)Size()) {
1587 		writeLocker.Unlock();
1588 		transaction.Start(fVolume, BlockNumber());
1589 		writeLocker.Lock();
1590 	}
1591 
1592 	off_t oldSize = Size();
1593 
1594 	if ((uint64)pos + (uint64)length > (uint64)oldSize) {
1595 		// let's grow the data stream to the size needed
1596 		status_t status = SetFileSize(transaction, pos + length);
1597 		if (status != B_OK) {
1598 			*_length = 0;
1599 			WriteLockInTransaction(transaction);
1600 			RETURN_ERROR(status);
1601 		}
1602 		// TODO: In theory we would need to update the file size
1603 		// index here as part of the current transaction - this might just
1604 		// be a bit too expensive, but worth a try.
1605 
1606 		// we need to write back the inode here because it has to
1607 		// go into this transaction (we cannot wait until the file
1608 		// is closed)
1609 		status = WriteBack(transaction);
1610 		if (status != B_OK) {
1611 			WriteLockInTransaction(transaction);
1612 			return status;
1613 		}
1614 	}
1615 
1616 	writeLocker.Unlock();
1617 
1618 	if (oldSize < pos)
1619 		FillGapWithZeros(oldSize, pos);
1620 
1621 	// If we don't want to write anything, we can now return (we may
1622 	// just have changed the file size using the position parameter)
1623 	if (length == 0)
1624 		return B_OK;
1625 
1626 	status_t status = file_cache_write(FileCache(), NULL, pos, buffer, _length);
1627 
1628 	if (transaction.IsStarted())
1629 		WriteLockInTransaction(transaction);
1630 
1631 	return status;
1632 }
1633 
1634 
1635 /*!	Fills the gap between the old file size and the new file size
1636 	with zeros.
1637 	It's more or less a copy of Inode::WriteAt() but it can handle
1638 	length differences of more than just 4 GB, and it never uses
1639 	the log, even if the INODE_LOGGED flag is set.
1640 */
1641 status_t
1642 Inode::FillGapWithZeros(off_t pos, off_t newSize)
1643 {
1644 	while (pos < newSize) {
1645 		size_t size;
1646 		if (newSize > pos + 1024 * 1024 * 1024)
1647 			size = 1024 * 1024 * 1024;
1648 		else
1649 			size = newSize - pos;
1650 
1651 		status_t status = file_cache_write(FileCache(), NULL, pos, NULL, &size);
1652 		if (status < B_OK)
1653 			return status;
1654 
1655 		pos += size;
1656 	}
1657 
1658 	return B_OK;
1659 }
1660 
1661 
1662 /*!	Allocates \a length blocks, and clears their contents. Growing
1663 	the indirect and double indirect range uses this method.
1664 	The allocated block_run is saved in "run"
1665 */
1666 status_t
1667 Inode::_AllocateBlockArray(Transaction& transaction, block_run& run,
1668 	size_t length, bool variableSize)
1669 {
1670 	if (!run.IsZero())
1671 		return B_BAD_VALUE;
1672 
1673 	status_t status = fVolume->Allocate(transaction, this, length, run,
1674 		variableSize ? 1 : length);
1675 	if (status != B_OK)
1676 		return status;
1677 
1678 	// make sure those blocks are empty
1679 	CachedBlock cached(fVolume);
1680 	off_t block = fVolume->ToBlock(run);
1681 
1682 	for (int32 i = 0; i < run.Length(); i++) {
1683 		block_run* runs = (block_run*)cached.SetToWritable(transaction,
1684 			block + i, true);
1685 		if (runs == NULL)
1686 			return B_IO_ERROR;
1687 	}
1688 	return B_OK;
1689 }
1690 
1691 
1692 /*!	Grows the stream to \a size, and fills the direct/indirect/double indirect
1693 	ranges with the runs.
1694 	This method will also determine the size of the preallocation, if any.
1695 */
1696 status_t
1697 Inode::_GrowStream(Transaction& transaction, off_t size)
1698 {
1699 	data_stream* data = &Node().data;
1700 
1701 	// is the data stream already large enough to hold the new size?
1702 	// (can be the case with preallocated blocks)
1703 	if (size < data->MaxDirectRange()
1704 		|| size < data->MaxIndirectRange()
1705 		|| size < data->MaxDoubleIndirectRange()) {
1706 		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1707 		return B_OK;
1708 	}
1709 
1710 	// how many bytes are still needed? (unused ranges are always zero)
1711 	uint16 minimum = 1;
1712 	off_t bytes;
1713 	if (data->Size() < data->MaxDoubleIndirectRange()) {
1714 		bytes = size - data->MaxDoubleIndirectRange();
1715 		// The double indirect range can only handle multiples of
1716 		// its base length
1717 		minimum = data->double_indirect.Length();
1718 	} else if (data->Size() < data->MaxIndirectRange())
1719 		bytes = size - data->MaxIndirectRange();
1720 	else if (data->Size() < data->MaxDirectRange())
1721 		bytes = size - data->MaxDirectRange();
1722 	else {
1723 		// no preallocation left to be used
1724 		bytes = size - data->Size();
1725 		if (data->MaxDoubleIndirectRange() > 0)
1726 			minimum = data->double_indirect.Length();
1727 	}
1728 
1729 	// do we have enough free blocks on the disk?
1730 	off_t blocksNeeded = (bytes + fVolume->BlockSize() - 1)
1731 		>> fVolume->BlockShift();
1732 	if (blocksNeeded > fVolume->FreeBlocks())
1733 		return B_DEVICE_FULL;
1734 
1735 	off_t blocksRequested = blocksNeeded;
1736 		// because of preallocations and partial allocations, the number of
1737 		// blocks we need to allocate may be different from the one we request
1738 		// from the block allocator
1739 
1740 	// Should we preallocate some blocks?
1741 	// Attributes, attribute directories, and long symlinks usually won't get
1742 	// that big, and should stay close to the inode - preallocating could be
1743 	// counterproductive.
1744 	// Also, if free disk space is tight, don't preallocate.
1745 	if (!IsAttribute() && !IsAttributeDirectory() && !IsSymLink()
1746 		&& fVolume->FreeBlocks() > 128) {
1747 		off_t roundTo = 0;
1748 		if (IsFile()) {
1749 			// Request preallocated blocks depending on the file size and growth
1750 			if (size < 1 * 1024 * 1024 && bytes < 512 * 1024) {
1751 				// Preallocate 64 KB for file sizes <1 MB and grow rates <512 KB
1752 				roundTo = 65536 >> fVolume->BlockShift();
1753 			} else if (size < 32 * 1024 * 1024 && bytes <= 1 * 1024 * 1024) {
1754 				// Preallocate 512 KB for file sizes between 1 MB and 32 MB, and
1755 				// grow rates smaller than 1 MB
1756 				roundTo = (512 * 1024) >> fVolume->BlockShift();
1757 			} else {
1758 				// Preallocate 1/16 of the file size (ie. 4 MB for 64 MB,
1759 				// 64 MB for 1 GB)
1760 				roundTo = size >> (fVolume->BlockShift() + 4);
1761 			}
1762 		} else if (IsIndex()) {
1763 			// Always preallocate 64 KB for index directories
1764 			roundTo = 65536 >> fVolume->BlockShift();
1765 		} else {
1766 			// Preallocate only 4 KB - directories only get trimmed when their
1767 			// vnode is flushed, which might not happen very often.
1768 			roundTo = 4096 >> fVolume->BlockShift();
1769 		}
1770 		if (roundTo > 1) {
1771 			// Round to next "roundTo" block count
1772 			blocksRequested = ((blocksNeeded + roundTo) / roundTo) * roundTo;
1773 		}
1774 	}
1775 
1776 	while (blocksNeeded > 0) {
1777 		// the requested blocks do not need to be returned with a
1778 		// single allocation, so we need to iterate until we have
1779 		// enough blocks allocated
1780 		if (minimum > 1) {
1781 			// make sure that "blocks" is a multiple of minimum
1782 			blocksRequested = round_up(blocksRequested, minimum);
1783 		}
1784 
1785 		block_run run;
1786 		status_t status = fVolume->Allocate(transaction, this, blocksRequested,
1787 			run, minimum);
1788 		if (status != B_OK)
1789 			return status;
1790 
1791 		// okay, we have the needed blocks, so just distribute them to the
1792 		// different ranges of the stream (direct, indirect & double indirect)
1793 
1794 		blocksNeeded -= run.Length();
1795 		// don't preallocate if the first allocation was already too small
1796 		blocksRequested = blocksNeeded;
1797 
1798 		// Direct block range
1799 
1800 		if (data->Size() <= data->MaxDirectRange()) {
1801 			// let's try to put them into the direct block range
1802 			int32 free = 0;
1803 			for (; free < NUM_DIRECT_BLOCKS; free++) {
1804 				if (data->direct[free].IsZero())
1805 					break;
1806 			}
1807 
1808 			if (free < NUM_DIRECT_BLOCKS) {
1809 				// can we merge the last allocated run with the new one?
1810 				int32 last = free - 1;
1811 				if (free > 0 && data->direct[last].MergeableWith(run)) {
1812 					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(
1813 						data->direct[last].Length() + run.Length());
1814 				} else
1815 					data->direct[free] = run;
1816 
1817 				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(
1818 					data->MaxDirectRange()
1819 					+ run.Length() * fVolume->BlockSize());
1820 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1821 					? data->max_direct_range : size);
1822 				continue;
1823 			}
1824 		}
1825 
1826 		// Indirect block range
1827 
1828 		if (data->Size() <= data->MaxIndirectRange()
1829 			|| !data->MaxIndirectRange()) {
1830 			CachedBlock cached(fVolume);
1831 			block_run* runs = NULL;
1832 			uint32 free = 0;
1833 			off_t block;
1834 
1835 			// if there is no indirect block yet, create one
1836 			if (data->indirect.IsZero()) {
1837 				status = _AllocateBlockArray(transaction, data->indirect,
1838 					NUM_ARRAY_BLOCKS, true);
1839 				if (status != B_OK)
1840 					return status;
1841 
1842 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1843 					data->MaxDirectRange());
1844 				// insert the block_run in the first block
1845 				runs = (block_run*)cached.SetTo(data->indirect);
1846 			} else {
1847 				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1848 				block = fVolume->ToBlock(data->indirect);
1849 
1850 				// search first empty entry
1851 				int32 i = 0;
1852 				for (; i < data->indirect.Length(); i++) {
1853 					if ((runs = (block_run*)cached.SetTo(block + i)) == NULL)
1854 						return B_IO_ERROR;
1855 
1856 					for (free = 0; free < numberOfRuns; free++)
1857 						if (runs[free].IsZero())
1858 							break;
1859 
1860 					if (free < numberOfRuns)
1861 						break;
1862 				}
1863 				if (i == data->indirect.Length())
1864 					runs = NULL;
1865 			}
1866 
1867 			if (runs != NULL) {
1868 				// try to insert the run to the last one - note that this
1869 				// doesn't take block borders into account, so it could be
1870 				// further optimized
1871 				cached.MakeWritable(transaction);
1872 
1873 				int32 last = free - 1;
1874 				if (free > 0 && runs[last].MergeableWith(run)) {
1875 					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(
1876 						runs[last].Length() + run.Length());
1877 				} else
1878 					runs[free] = run;
1879 
1880 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1881 					data->MaxIndirectRange()
1882 					+ ((uint32)run.Length() << fVolume->BlockShift()));
1883 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1884 					? data->MaxIndirectRange() : size);
1885 				continue;
1886 			}
1887 		}
1888 
1889 		// Double indirect block range
1890 
1891 		if (data->Size() <= data->MaxDoubleIndirectRange()
1892 			|| !data->max_double_indirect_range) {
1893 			// We make sure here that we have this minimum allocated, so if
1894 			// the allocation succeeds, we don't run into an endless loop.
1895 			if (!data->max_double_indirect_range)
1896 				minimum = _DoubleIndirectBlockLength();
1897 			else
1898 				minimum = data->double_indirect.Length();
1899 
1900 			if ((run.Length() % minimum) != 0) {
1901 				// The number of allocated blocks isn't a multiple of 'minimum',
1902 				// so we have to change this. This can happen the first time the
1903 				// stream grows into the double indirect range.
1904 				// First, free the remaining blocks that don't fit into this
1905 				// multiple.
1906 				int32 rest = run.Length() % minimum;
1907 				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1908 
1909 				status = fVolume->Free(transaction,
1910 					block_run::Run(run.AllocationGroup(),
1911 					run.Start() + run.Length(), rest));
1912 				if (status != B_OK)
1913 					return status;
1914 
1915 				blocksNeeded += rest;
1916 				blocksRequested = round_up(blocksNeeded, minimum);
1917 
1918 				// Are there any blocks left in the run? If not, allocate
1919 				// a new one
1920 				if (run.length == 0)
1921 					continue;
1922 			}
1923 
1924 			// if there is no double indirect block yet, create one
1925 			if (data->double_indirect.IsZero()) {
1926 				status = _AllocateBlockArray(transaction,
1927 					data->double_indirect, _DoubleIndirectBlockLength());
1928 				if (status != B_OK)
1929 					return status;
1930 
1931 				data->max_double_indirect_range = data->max_indirect_range;
1932 			}
1933 
1934 			// calculate the index where to insert the new blocks
1935 
1936 			int32 runsPerBlock;
1937 			int32 directSize;
1938 			int32 indirectSize;
1939 			get_double_indirect_sizes(data->double_indirect.Length(),
1940 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1941 
1942 			off_t start = data->MaxDoubleIndirectRange()
1943 				- data->MaxIndirectRange();
1944 			int32 indirectIndex = start / indirectSize;
1945 			int32 index = (start % indirectSize) / directSize;
1946 			int32 runsPerArray = runsPerBlock * minimum;
1947 
1948 			// distribute the blocks to the array and allocate
1949 			// new array blocks when needed
1950 
1951 			CachedBlock cached(fVolume);
1952 			CachedBlock cachedDirect(fVolume);
1953 			block_run* array = NULL;
1954 			uint32 runLength = run.Length();
1955 
1956 			while (run.length != 0) {
1957 				// get the indirect array block
1958 				if (array == NULL) {
1959 					uint32 block = indirectIndex / runsPerBlock;
1960 					if (block >= minimum)
1961 						return EFBIG;
1962 
1963 					array = (block_run*)cached.SetTo(fVolume->ToBlock(
1964 						data->double_indirect) + block);
1965 					if (array == NULL)
1966 						return B_IO_ERROR;
1967 				}
1968 
1969 				do {
1970 					// do we need a new array block?
1971 					if (array[indirectIndex % runsPerBlock].IsZero()) {
1972 						cached.MakeWritable(transaction);
1973 
1974 						status = _AllocateBlockArray(transaction,
1975 							array[indirectIndex % runsPerBlock],
1976 							data->double_indirect.Length());
1977 						if (status != B_OK)
1978 							return status;
1979 					}
1980 
1981 					block_run* runs = (block_run*)cachedDirect.SetToWritable(
1982 						transaction, fVolume->ToBlock(array[indirectIndex
1983 							% runsPerBlock]) + index / runsPerBlock);
1984 					if (runs == NULL)
1985 						return B_IO_ERROR;
1986 
1987 					do {
1988 						// insert the block_run into the array
1989 						runs[index % runsPerBlock] = run;
1990 						runs[index % runsPerBlock].length
1991 							= HOST_ENDIAN_TO_BFS_INT16(minimum);
1992 
1993 						// alter the remaining block_run
1994 						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start()
1995 							+ minimum);
1996 						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
1997 							- minimum);
1998 					} while ((++index % runsPerBlock) != 0 && run.length);
1999 				} while ((index % runsPerArray) != 0 && run.length);
2000 
2001 				if (index == runsPerArray)
2002 					index = 0;
2003 				if (++indirectIndex % runsPerBlock == 0) {
2004 					array = NULL;
2005 					index = 0;
2006 				}
2007 			}
2008 
2009 			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
2010 				data->MaxDoubleIndirectRange()
2011 				+ (runLength << fVolume->BlockShift()));
2012 			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(
2013 				data->max_double_indirect_range) : size;
2014 
2015 			continue;
2016 		}
2017 
2018 		RETURN_ERROR(EFBIG);
2019 	}
2020 	// update the size of the data stream
2021 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2022 
2023 	return B_OK;
2024 }
2025 
2026 
2027 size_t
2028 Inode::_DoubleIndirectBlockLength() const
2029 {
2030 	if (fVolume->BlockSize() > DOUBLE_INDIRECT_ARRAY_SIZE)
2031 		return 1;
2032 
2033 	return DOUBLE_INDIRECT_ARRAY_SIZE / fVolume->BlockSize();
2034 }
2035 
2036 
2037 /*!	Frees the statically sized array of the double indirect part of a data
2038 	stream.
2039 */
2040 status_t
2041 Inode::_FreeStaticStreamArray(Transaction& transaction, int32 level,
2042 	block_run run, off_t size, off_t offset, off_t& max)
2043 {
2044 	int32 indirectSize;
2045 	if (level == 0) {
2046 		indirectSize = double_indirect_max_indirect_size(run.Length(),
2047 			fVolume->BlockSize());
2048 	} else {
2049 		indirectSize = double_indirect_max_direct_size(run.Length(),
2050 			fVolume->BlockSize());
2051 	}
2052 
2053 	off_t start;
2054 	if (size > offset)
2055 		start = size - offset;
2056 	else
2057 		start = 0;
2058 
2059 	int32 index = start / indirectSize;
2060 	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
2061 
2062 	CachedBlock cached(fVolume);
2063 	off_t blockNumber = fVolume->ToBlock(run);
2064 
2065 	// set the file offset to the current block run
2066 	offset += (off_t)index * indirectSize;
2067 
2068 	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
2069 		block_run* array = (block_run*)cached.SetToWritable(transaction,
2070 			blockNumber + i);
2071 		if (array == NULL)
2072 			RETURN_ERROR(B_ERROR);
2073 
2074 		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
2075 			if (array[index].IsZero()) {
2076 				// we also want to break out of the outer loop
2077 				i = run.Length();
2078 				break;
2079 			}
2080 
2081 			status_t status = B_OK;
2082 			if (level == 0) {
2083 				status = _FreeStaticStreamArray(transaction, 1, array[index],
2084 					size, offset, max);
2085 			} else if (offset >= size)
2086 				status = fVolume->Free(transaction, array[index]);
2087 			else
2088 				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
2089 
2090 			if (status < B_OK)
2091 				RETURN_ERROR(status);
2092 
2093 			if (offset >= size)
2094 				array[index].SetTo(0, 0, 0);
2095 
2096 			offset += indirectSize;
2097 		}
2098 		index = 0;
2099 	}
2100 	return B_OK;
2101 }
2102 
2103 
2104 /*!	Frees all block_runs in the array which come after the specified size.
2105 	It also trims the last block_run that contain the size.
2106 	"offset" and "max" are maintained until the last block_run that doesn't
2107 	have to be freed - after this, the values won't be correct anymore, but
2108 	will still assure correct function for all subsequent calls.
2109 	"max" is considered to be in file system byte order.
2110 */
2111 status_t
2112 Inode::_FreeStreamArray(Transaction& transaction, block_run* array,
2113 	uint32 arrayLength, off_t size, off_t& offset, off_t& max)
2114 {
2115 	PRINT(("FreeStreamArray: arrayLength %lu, size %Ld, offset %Ld, max %Ld\n",
2116 		arrayLength, size, offset, max));
2117 
2118 	off_t newOffset = offset;
2119 	uint32 i = 0;
2120 	for (; i < arrayLength; i++, offset = newOffset) {
2121 		if (array[i].IsZero())
2122 			break;
2123 
2124 		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
2125 		if (newOffset <= size)
2126 			continue;
2127 
2128 		block_run run = array[i];
2129 
2130 		// determine the block_run to be freed
2131 		if (newOffset > size && offset < size) {
2132 			// free partial block_run (and update the original block_run)
2133 			run.start = HOST_ENDIAN_TO_BFS_INT16(array[i].Start()
2134 				+ ((size + fVolume->BlockSize() - 1 - offset)
2135 					>> fVolume->BlockShift()));
2136 				// round to next block
2137 			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2138 				- array[i].Start());
2139 			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2140 				- array[i].Length());
2141 
2142 			if (run.length == 0)
2143 				continue;
2144 
2145 			// update maximum range
2146 			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length()
2147 				<< fVolume->BlockShift()));
2148 		} else {
2149 			// free the whole block_run
2150 			array[i].SetTo(0, 0, 0);
2151 
2152 			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
2153 				max = HOST_ENDIAN_TO_BFS_INT64(offset);
2154 		}
2155 
2156 		if (fVolume->Free(transaction, run) < B_OK)
2157 			return B_IO_ERROR;
2158 	}
2159 	return B_OK;
2160 }
2161 
2162 
2163 status_t
2164 Inode::_ShrinkStream(Transaction& transaction, off_t size)
2165 {
2166 	data_stream* data = &Node().data;
2167 	status_t status;
2168 
2169 	if (data->MaxDoubleIndirectRange() > size) {
2170 		off_t* maxDoubleIndirect = &data->max_double_indirect_range;
2171 			// gcc 4 work-around: "error: cannot bind packed field
2172 			// 'data->data_stream::max_double_indirect_range' to 'off_t&'"
2173 		status = _FreeStaticStreamArray(transaction, 0, data->double_indirect,
2174 			size, data->MaxIndirectRange(), *maxDoubleIndirect);
2175 		if (status != B_OK)
2176 			return status;
2177 
2178 		if (size <= data->MaxIndirectRange()) {
2179 			fVolume->Free(transaction, data->double_indirect);
2180 			data->double_indirect.SetTo(0, 0, 0);
2181 			data->max_double_indirect_range = 0;
2182 		}
2183 	}
2184 
2185 	if (data->MaxIndirectRange() > size) {
2186 		CachedBlock cached(fVolume);
2187 		off_t block = fVolume->ToBlock(data->indirect);
2188 		off_t offset = data->MaxDirectRange();
2189 
2190 		for (int32 i = 0; i < data->indirect.Length(); i++) {
2191 			block_run* array = (block_run*)cached.SetToWritable(transaction,
2192 				block + i);
2193 			if (array == NULL)
2194 				break;
2195 
2196 			off_t* maxIndirect = &data->max_indirect_range;
2197 				// gcc 4 work-around: "error: cannot bind packed field
2198 				// 'data->data_stream::max_indirect_range' to 'off_t&'"
2199 			if (_FreeStreamArray(transaction, array, fVolume->BlockSize()
2200 					/ sizeof(block_run), size, offset, *maxIndirect) != B_OK)
2201 				return B_IO_ERROR;
2202 		}
2203 		if (data->max_direct_range == data->max_indirect_range) {
2204 			fVolume->Free(transaction, data->indirect);
2205 			data->indirect.SetTo(0, 0, 0);
2206 			data->max_indirect_range = 0;
2207 		}
2208 	}
2209 
2210 	if (data->MaxDirectRange() > size) {
2211 		off_t offset = 0;
2212 		off_t *maxDirect = &data->max_direct_range;
2213 			// gcc 4 work-around: "error: cannot bind packed field
2214 			// 'data->data_stream::max_direct_range' to 'off_t&'"
2215 		status = _FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS,
2216 			size, offset, *maxDirect);
2217 		if (status < B_OK)
2218 			return status;
2219 	}
2220 
2221 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2222 	return B_OK;
2223 }
2224 
2225 
2226 status_t
2227 Inode::SetFileSize(Transaction& transaction, off_t size)
2228 {
2229 	if (size < 0)
2230 		return B_BAD_VALUE;
2231 
2232 	off_t oldSize = Size();
2233 
2234 	if (size == oldSize)
2235 		return B_OK;
2236 
2237 	T(Resize(this, oldSize, size, false));
2238 
2239 	// should the data stream grow or shrink?
2240 	status_t status;
2241 	if (size > oldSize) {
2242 		status = _GrowStream(transaction, size);
2243 		if (status < B_OK) {
2244 			// if the growing of the stream fails, the whole operation
2245 			// fails, so we should shrink the stream to its former size
2246 			_ShrinkStream(transaction, oldSize);
2247 		}
2248 	} else
2249 		status = _ShrinkStream(transaction, size);
2250 
2251 	if (status < B_OK)
2252 		return status;
2253 
2254 	file_cache_set_size(FileCache(), size);
2255 	file_map_set_size(Map(), size);
2256 
2257 	return WriteBack(transaction);
2258 }
2259 
2260 
2261 status_t
2262 Inode::Append(Transaction& transaction, off_t bytes)
2263 {
2264 	return SetFileSize(transaction, Size() + bytes);
2265 }
2266 
2267 
2268 /*!	Checks whether or not this inode's data stream needs to be trimmed
2269 	because of an earlier preallocation.
2270 	Returns true if there are any blocks to be trimmed.
2271 */
2272 bool
2273 Inode::NeedsTrimming() const
2274 {
2275 	// We never trim preallocated index blocks to make them grow as smooth as
2276 	// possible. There are only few indices anyway, so this doesn't hurt.
2277 	// Also, if an inode is already in deleted state, we don't bother trimming
2278 	// it.
2279 	if (IsIndex() || IsDeleted()
2280 		|| (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0))
2281 		return false;
2282 
2283 	off_t roundedSize = round_up(Size(), fVolume->BlockSize());
2284 
2285 	return Node().data.MaxDirectRange() > roundedSize
2286 		|| Node().data.MaxIndirectRange() > roundedSize
2287 		|| Node().data.MaxDoubleIndirectRange() > roundedSize;
2288 }
2289 
2290 
2291 status_t
2292 Inode::TrimPreallocation(Transaction& transaction)
2293 {
2294 	T(Resize(this, max_c(Node().data.MaxDirectRange(),
2295 		Node().data.MaxIndirectRange()), Size(), true));
2296 
2297 	status_t status = _ShrinkStream(transaction, Size());
2298 	if (status < B_OK)
2299 		return status;
2300 
2301 	return WriteBack(transaction);
2302 }
2303 
2304 
2305 //!	Frees the file's data stream and removes all attributes
2306 status_t
2307 Inode::Free(Transaction& transaction)
2308 {
2309 	FUNCTION();
2310 
2311 	// Perhaps there should be an implementation of Inode::ShrinkStream() that
2312 	// just frees the data_stream, but doesn't change the inode (since it is
2313 	// freed anyway) - that would make an undelete command possible
2314 	if (!IsSymLink() || (Flags() & INODE_LONG_SYMLINK) != 0) {
2315 		status_t status = SetFileSize(transaction, 0);
2316 		if (status < B_OK)
2317 			return status;
2318 	}
2319 
2320 	// Free all attributes, and remove their indices
2321 	{
2322 		// We have to limit the scope of AttributeIterator, so that its
2323 		// destructor is not called after the inode is deleted
2324 		AttributeIterator iterator(this);
2325 
2326 		char name[B_FILE_NAME_LENGTH];
2327 		uint32 type;
2328 		size_t length;
2329 		ino_t id;
2330 		while (iterator.GetNext(name, &length, &type, &id) == B_OK) {
2331 			RemoveAttribute(transaction, name);
2332 		}
2333 	}
2334 
2335 	if (WriteBack(transaction) < B_OK)
2336 		return B_IO_ERROR;
2337 
2338 	return fVolume->Free(transaction, BlockRun());
2339 }
2340 
2341 
2342 //!	Synchronizes (writes back to disk) the file stream of the inode.
2343 status_t
2344 Inode::Sync()
2345 {
2346 	if (FileCache())
2347 		return file_cache_sync(FileCache());
2348 
2349 	// We may also want to flush the attribute's data stream to
2350 	// disk here... (do we?)
2351 
2352 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0)
2353 		return B_OK;
2354 
2355 	InodeReadLocker locker(this);
2356 
2357 	data_stream* data = &Node().data;
2358 	status_t status = B_OK;
2359 
2360 	// flush direct range
2361 
2362 	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
2363 		if (data->direct[i].IsZero())
2364 			return B_OK;
2365 
2366 		status = block_cache_sync_etc(fVolume->BlockCache(),
2367 			fVolume->ToBlock(data->direct[i]), data->direct[i].Length());
2368 		if (status != B_OK)
2369 			return status;
2370 	}
2371 
2372 	// flush indirect range
2373 
2374 	if (data->max_indirect_range == 0)
2375 		return B_OK;
2376 
2377 	CachedBlock cached(fVolume);
2378 	off_t block = fVolume->ToBlock(data->indirect);
2379 	int32 count = fVolume->BlockSize() / sizeof(block_run);
2380 
2381 	for (int32 j = 0; j < data->indirect.Length(); j++) {
2382 		block_run* runs = (block_run*)cached.SetTo(block + j);
2383 		if (runs == NULL)
2384 			break;
2385 
2386 		for (int32 i = 0; i < count; i++) {
2387 			if (runs[i].IsZero())
2388 				return B_OK;
2389 
2390 			status = block_cache_sync_etc(fVolume->BlockCache(),
2391 				fVolume->ToBlock(runs[i]), runs[i].Length());
2392 			if (status != B_OK)
2393 				return status;
2394 		}
2395 	}
2396 
2397 	// flush double indirect range
2398 
2399 	if (data->max_double_indirect_range == 0)
2400 		return B_OK;
2401 
2402 	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
2403 
2404 	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
2405 		block_run* indirectRuns = (block_run*)cached.SetTo(indirectBlock + l);
2406 		if (indirectRuns == NULL)
2407 			return B_FILE_ERROR;
2408 
2409 		CachedBlock directCached(fVolume);
2410 
2411 		for (int32 k = 0; k < count; k++) {
2412 			if (indirectRuns[k].IsZero())
2413 				return B_OK;
2414 
2415 			block = fVolume->ToBlock(indirectRuns[k]);
2416 			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
2417 				block_run* runs = (block_run*)directCached.SetTo(block + j);
2418 				if (runs == NULL)
2419 					return B_FILE_ERROR;
2420 
2421 				for (int32 i = 0; i < count; i++) {
2422 					if (runs[i].IsZero())
2423 						return B_OK;
2424 
2425 					// TODO: combine single block_runs to bigger ones when
2426 					// they are adjacent
2427 					status = block_cache_sync_etc(fVolume->BlockCache(),
2428 						fVolume->ToBlock(runs[i]), runs[i].Length());
2429 					if (status != B_OK)
2430 						return status;
2431 				}
2432 			}
2433 		}
2434 	}
2435 	return B_OK;
2436 }
2437 
2438 
2439 //	#pragma mark - TransactionListener implementation
2440 
2441 
2442 void
2443 Inode::TransactionDone(bool success)
2444 {
2445 	if (!success) {
2446 		// revert any changes made to the cached bfs_inode
2447 		UpdateNodeFromDisk();
2448 	}
2449 }
2450 
2451 
2452 void
2453 Inode::RemovedFromTransaction()
2454 {
2455 	Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
2456 
2457 	// See AddInode() why we do this here
2458 	if ((Flags() & INODE_DELETED) != 0)
2459 		fVolume->RemovedInodes().Add(this);
2460 
2461 	rw_lock_write_unlock(&Lock());
2462 
2463 	if (!fVolume->IsInitializing())
2464 		put_vnode(fVolume->FSVolume(), ID());
2465 }
2466 
2467 
2468 //	#pragma mark - creation/deletion
2469 
2470 
2471 status_t
2472 Inode::Remove(Transaction& transaction, const char* name, ino_t* _id,
2473 	bool isDirectory, bool force)
2474 {
2475 	if (fTree == NULL)
2476 		RETURN_ERROR(B_BAD_VALUE);
2477 
2478 	WriteLockInTransaction(transaction);
2479 
2480 	// does the file even exist?
2481 	off_t id;
2482 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) < B_OK)
2483 		return B_ENTRY_NOT_FOUND;
2484 
2485 	if (_id)
2486 		*_id = id;
2487 
2488 	Vnode vnode(fVolume, id);
2489 	Inode* inode;
2490 	status_t status = vnode.Get(&inode);
2491 	if (status < B_OK) {
2492 		REPORT_ERROR(status);
2493 		return B_ENTRY_NOT_FOUND;
2494 	}
2495 
2496 	T(Remove(inode, name));
2497 	inode->WriteLockInTransaction(transaction);
2498 
2499 	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
2500 	// bit is set for indices in BFS, not for attribute directories) - but you
2501 	// should really be able to do whatever you want with your indices
2502 	// without having to remove all files first :)
2503 	if (!inode->IsIndex() && !force) {
2504 		// if it's not of the correct type, don't delete it!
2505 		if (inode->IsContainer() != isDirectory)
2506 			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
2507 
2508 		// only delete empty directories
2509 		if (isDirectory && !inode->IsEmpty())
2510 			return B_DIRECTORY_NOT_EMPTY;
2511 	}
2512 
2513 	// remove_vnode() allows the inode to be accessed until the last put_vnode()
2514 	status = remove_vnode(fVolume->FSVolume(), id);
2515 	if (status != B_OK)
2516 		return status;
2517 
2518 	if (fTree->Remove(transaction, name, id) != B_OK && !force) {
2519 		unremove_vnode(fVolume->FSVolume(), id);
2520 		RETURN_ERROR(B_ERROR);
2521 	}
2522 
2523 #ifdef DEBUG
2524 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) == B_OK) {
2525 		DIE(("deleted entry still there"));
2526 	}
2527 #endif
2528 
2529 	ContainerContentsChanged(transaction);
2530 
2531 	// update the inode, so that no one will ever doubt it's deleted :-)
2532 	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
2533 	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
2534 
2535 	// In balance to the Inode::Create() method, the main indices
2536 	// are updated here (name, size, & last_modified)
2537 
2538 	Index index(fVolume);
2539 	if (inode->InNameIndex()) {
2540 		index.RemoveName(transaction, name, inode);
2541 			// If removing from the index fails, it is not regarded as a
2542 			// fatal error and will not be reported back!
2543 			// Deleted inodes won't be visible in queries anyway.
2544 	}
2545 
2546 	if (inode->InSizeIndex())
2547 		index.RemoveSize(transaction, inode);
2548 	if (inode->InLastModifiedIndex())
2549 		index.RemoveLastModified(transaction, inode);
2550 
2551 	return inode->WriteBack(transaction);
2552 }
2553 
2554 
2555 /*!	Creates the inode with the specified \a parent directory, and automatically
2556 	adds the created inode to that parent directory. If an attribute directory
2557 	is created, it will also automatically  be added to the \a parent inode as
2558 	such. However, the indices root node, and the regular root node won't be
2559 	added to the superblock.
2560 	It will also create the initial B+tree for the inode if it's a directory
2561 	of any kind.
2562 	\a name may be \c NULL, but only if no \a parent is given.
2563 	If the "_id" or "_inode" variable is given and non-NULL to store the
2564 	inode's ID, the inode stays locked - you have to call put_vnode() if you
2565 	don't use it anymore.
2566 	If the node already exists, this method will fail if \c O_EXCL is set, or
2567 	it's a directory or a symlink. Otherwise, it will just be returned.
2568 	If \c O_TRUNC has been specified, the file will also be truncated.
2569 */
2570 status_t
2571 Inode::Create(Transaction& transaction, Inode* parent, const char* name,
2572 	int32 mode, int openMode, uint32 type, bool* _created, ino_t* _id,
2573 	Inode** _inode, fs_vnode_ops* vnodeOps, uint32 publishFlags)
2574 {
2575 	FUNCTION_START(("name = %s, mode = %ld\n", name, mode));
2576 
2577 	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
2578 	Volume* volume = transaction.GetVolume();
2579 	BPlusTree* tree = NULL;
2580 
2581 	if (parent != NULL && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
2582 		// check if the file already exists in the directory
2583 		tree = parent->Tree();
2584 	}
2585 
2586 	if (parent != NULL) {
2587 		// the parent directory is locked during the whole inode creation
2588 		parent->WriteLockInTransaction(transaction);
2589 	}
2590 
2591 	if (parent != NULL && !volume->IsInitializing() && parent->IsContainer()) {
2592 		// don't create anything in removed directories
2593 		bool removed;
2594 		if (get_vnode_removed(volume->FSVolume(), parent->ID(), &removed)
2595 				== B_OK && removed) {
2596 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
2597 		}
2598 	}
2599 
2600 	if (tree != NULL) {
2601 		// Does the file already exist?
2602 		off_t offset;
2603 		if (tree->Find((uint8*)name, (uint16)strlen(name), &offset) == B_OK) {
2604 			// Return if the file should be a directory/link or opened in
2605 			// exclusive mode
2606 			if (S_ISDIR(mode) || S_ISLNK(mode) || (openMode & O_EXCL) != 0)
2607 				return B_FILE_EXISTS;
2608 
2609 			Vnode vnode(volume, offset);
2610 			Inode* inode;
2611 			status_t status = vnode.Get(&inode);
2612 			if (status != B_OK) {
2613 				REPORT_ERROR(status);
2614 				return B_ENTRY_NOT_FOUND;
2615 			}
2616 
2617 			if (inode->IsDirectory() && (openMode & O_RWMASK) != O_RDONLY)
2618 				return B_IS_A_DIRECTORY;
2619 			if ((openMode & O_DIRECTORY) != 0 && !inode->IsDirectory())
2620 				return B_NOT_A_DIRECTORY;
2621 
2622 			// we want to open the file, so we should have the rights to do so
2623 			if (inode->CheckPermissions(open_mode_to_access(openMode)
2624 					| ((openMode & O_TRUNC) != 0 ? W_OK : 0)) != B_OK)
2625 				return B_NOT_ALLOWED;
2626 
2627 			if ((openMode & O_TRUNC) != 0) {
2628 				// truncate the existing file
2629 				inode->WriteLockInTransaction(transaction);
2630 
2631 				status_t status = inode->SetFileSize(transaction, 0);
2632 				if (status == B_OK)
2633 					status = inode->WriteBack(transaction);
2634 
2635 				if (status != B_OK)
2636 					return status;
2637 			}
2638 
2639 			if (_created)
2640 				*_created = false;
2641 			if (_id)
2642 				*_id = inode->ID();
2643 			if (_inode)
2644 				*_inode = inode;
2645 
2646 			// Only keep the vnode in memory if the _id or _inode pointer is
2647 			// provided
2648 			if (_id != NULL || _inode != NULL)
2649 				vnode.Keep();
2650 
2651 			return B_OK;
2652 		}
2653 	} else if (parent != NULL && (mode & S_ATTR_DIR) == 0) {
2654 		return B_BAD_VALUE;
2655 	} else if ((openMode & O_DIRECTORY) != 0) {
2656 		// TODO: we might need to return B_NOT_A_DIRECTORY here
2657 		return B_ENTRY_NOT_FOUND;
2658 	}
2659 
2660 	status_t status;
2661 
2662 	// do we have the power to create new files at all?
2663 	if (parent != NULL && (status = parent->CheckPermissions(W_OK)) != B_OK)
2664 		RETURN_ERROR(status);
2665 
2666 	// allocate space for the new inode
2667 	InodeAllocator allocator(transaction);
2668 	block_run run;
2669 	Inode* inode;
2670 	status = allocator.New(&parentRun, mode, run, vnodeOps, &inode);
2671 	if (status < B_OK)
2672 		return status;
2673 
2674 	T(Create(inode, parent, name, mode, openMode, type));
2675 
2676 	// Initialize the parts of the bfs_inode structure that
2677 	// InodeAllocator::New() hasn't touched yet
2678 
2679 	bfs_inode* node = &inode->Node();
2680 
2681 	if (parent == NULL) {
2682 		// we set the parent to itself in this case
2683 		// (only happens for the root and indices node)
2684 		node->parent = run;
2685 	} else
2686 		node->parent = parentRun;
2687 
2688 	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
2689 	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent
2690 		? parent->Node().GroupID() : getegid());
2691 		// the group ID is inherited from the parent, if available
2692 
2693 	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
2694 
2695 	inode->WriteBack(transaction);
2696 		// make sure the initialized node is available to others
2697 
2698 	// only add the name to regular files, directories, or symlinks
2699 	// don't add it to attributes, or indices
2700 	if (tree && inode->IsRegularNode()
2701 		&& inode->SetName(transaction, name) != B_OK)
2702 		return B_ERROR;
2703 
2704 	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2705 	// a standard directory for files - not for attributes or indices)
2706 	if (inode->IsContainer()) {
2707 		status = allocator.CreateTree();
2708 		if (status != B_OK)
2709 			return status;
2710 	}
2711 
2712 	// Add a link to the inode from the parent, depending on its type
2713 	// (the vnode is not published yet, so it is safe to make the inode
2714 	// accessable to the file system here)
2715 	if (tree != NULL) {
2716 		status = tree->Insert(transaction, name, inode->ID());
2717 	} else if (parent != NULL && (mode & S_ATTR_DIR) != 0) {
2718 		parent->Attributes() = run;
2719 		status = parent->WriteBack(transaction);
2720 	}
2721 
2722 	// Note, we only care if the inode could be made accessable for the
2723 	// two cases above; the root node or the indices root node must
2724 	// handle this case on their own (or other cases where "parent" is
2725 	// NULL)
2726 	if (status != B_OK)
2727 		RETURN_ERROR(status);
2728 
2729 	// Update the main indices (name, size & last_modified)
2730 	// (live queries might want to access us after this)
2731 
2732 	Index index(volume);
2733 	if (inode->InNameIndex() && name != NULL) {
2734 		// the name index only contains regular files
2735 		// (but not the root node where name == NULL)
2736 		status = index.InsertName(transaction, name, inode);
2737 		if (status != B_OK && status != B_BAD_INDEX) {
2738 			// We have to remove the node from the parent at this point,
2739 			// because the InodeAllocator destructor can't handle this
2740 			// case (and if it fails, we can't do anything about it...)
2741 			if (tree)
2742 				tree->Remove(transaction, name, inode->ID());
2743 			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2744 				parent->Node().attributes.SetTo(0, 0, 0);
2745 
2746 			RETURN_ERROR(status);
2747 		}
2748 	}
2749 
2750 	if (parent != NULL && parent->IsContainer())
2751 		parent->ContainerContentsChanged(transaction);
2752 
2753 	inode->UpdateOldLastModified();
2754 
2755 	// The "size" & "last_modified" indices don't contain directories.
2756 	// If adding to these indices fails, the inode creation will not be
2757 	// harmed; they are considered less important than the "name" index.
2758 	if (inode->InSizeIndex())
2759 		index.InsertSize(transaction, inode);
2760 	if (inode->InLastModifiedIndex())
2761 		index.InsertLastModified(transaction, inode);
2762 
2763 	if (inode->NeedsFileCache()) {
2764 		inode->SetFileCache(file_cache_create(volume->ID(), inode->ID(),
2765 			inode->Size()));
2766 		inode->SetMap(file_map_create(volume->ID(), inode->ID(),
2767 			inode->Size()));
2768 
2769 		if (inode->FileCache() == NULL || inode->Map() == NULL)
2770 			return B_NO_MEMORY;
2771 	}
2772 
2773 	// Everything worked well until this point, we have a fully
2774 	// initialized inode, and we want to keep it
2775 	allocator.Keep(vnodeOps, publishFlags);
2776 
2777 	if (_created)
2778 		*_created = true;
2779 	if (_id != NULL)
2780 		*_id = inode->ID();
2781 	if (_inode != NULL)
2782 		*_inode = inode;
2783 
2784 	// if either _id or _inode is passed, we will keep the inode locked
2785 	if (_id == NULL && _inode == NULL)
2786 		put_vnode(volume->FSVolume(), inode->ID());
2787 
2788 	return B_OK;
2789 }
2790 
2791 
2792 //	#pragma mark - AttributeIterator
2793 
2794 
2795 AttributeIterator::AttributeIterator(Inode* inode)
2796 	:
2797 	fCurrentSmallData(0),
2798 	fInode(inode),
2799 	fAttributes(NULL),
2800 	fIterator(NULL),
2801 	fBuffer(NULL)
2802 {
2803 	inode->_AddIterator(this);
2804 }
2805 
2806 
2807 AttributeIterator::~AttributeIterator()
2808 {
2809 	if (fAttributes)
2810 		put_vnode(fAttributes->GetVolume()->FSVolume(), fAttributes->ID());
2811 
2812 	delete fIterator;
2813 	fInode->_RemoveIterator(this);
2814 }
2815 
2816 
2817 status_t
2818 AttributeIterator::Rewind()
2819 {
2820 	fCurrentSmallData = 0;
2821 
2822 	if (fIterator != NULL)
2823 		fIterator->Rewind();
2824 
2825 	return B_OK;
2826 }
2827 
2828 
2829 status_t
2830 AttributeIterator::GetNext(char* name, size_t* _length, uint32* _type,
2831 	ino_t* _id)
2832 {
2833 	// read attributes out of the small data section
2834 
2835 	if (fCurrentSmallData >= 0) {
2836 		NodeGetter nodeGetter(fInode->GetVolume(), fInode);
2837 		const bfs_inode* node = nodeGetter.Node();
2838 		const small_data* item = ((bfs_inode*)node)->SmallDataStart();
2839 
2840 		RecursiveLocker _(&fInode->SmallDataLock());
2841 
2842 		int32 index = 0;
2843 		for (; !item->IsLast(node); item = item->Next(), index++) {
2844 			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2845 				&& *item->Name() == FILE_NAME_NAME)
2846 				continue;
2847 
2848 			if (index >= fCurrentSmallData)
2849 				break;
2850 		}
2851 
2852 		if (!item->IsLast(node)) {
2853 			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2854 			*_type = item->Type();
2855 			*_length = item->NameSize();
2856 			*_id = (ino_t)index;
2857 
2858 			fCurrentSmallData = index + 1;
2859 			return B_OK;
2860 		}
2861 
2862 		// stop traversing the small_data section
2863 		fCurrentSmallData = -1;
2864 	}
2865 
2866 	// read attributes out of the attribute directory
2867 
2868 	if (fInode->Attributes().IsZero())
2869 		return B_ENTRY_NOT_FOUND;
2870 
2871 	Volume* volume = fInode->GetVolume();
2872 
2873 	// if you haven't yet access to the attributes directory, get it
2874 	if (fAttributes == NULL) {
2875 		if (get_vnode(volume->FSVolume(), volume->ToVnode(fInode->Attributes()),
2876 				(void**)&fAttributes) != B_OK) {
2877 			FATAL(("get_vnode() failed in AttributeIterator::GetNext(ino_t"
2878 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2879 			return B_ENTRY_NOT_FOUND;
2880 		}
2881 
2882 		BPlusTree* tree = fAttributes->Tree();
2883 		if (tree == NULL
2884 			|| (fIterator = new TreeIterator(tree)) == NULL) {
2885 			FATAL(("could not get tree in AttributeIterator::GetNext(ino_t"
2886 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2887 			return B_ENTRY_NOT_FOUND;
2888 		}
2889 	}
2890 
2891 	uint16 length;
2892 	ino_t id;
2893 	status_t status = fIterator->GetNextEntry(name, &length,
2894 		B_FILE_NAME_LENGTH, &id);
2895 	if (status < B_OK)
2896 		return status;
2897 
2898 	Vnode vnode(volume, id);
2899 	Inode* attribute;
2900 	if ((status = vnode.Get(&attribute)) == B_OK) {
2901 		*_type = attribute->Type();
2902 		*_length = length;
2903 		*_id = id;
2904 	}
2905 
2906 	return status;
2907 }
2908 
2909 
2910 void
2911 AttributeIterator::Update(uint16 index, int8 change)
2912 {
2913 	// fCurrentSmallData points already to the next item
2914 	if (index < fCurrentSmallData)
2915 		fCurrentSmallData += change;
2916 }
2917 
2918