xref: /haiku/src/add-ons/kernel/file_systems/bfs/Inode.cpp (revision 97901ec593ec4dd50ac115c1c35a6d72f6e489a5)
1 /*
2  * Copyright 2001-2010, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Inode access functions
8 
9 
10 #include "Debug.h"
11 #include "Inode.h"
12 #include "BPlusTree.h"
13 #include "Index.h"
14 
15 
16 #if BFS_TRACING && !defined(BFS_SHELL) && !defined(_BOOT_MODE)
17 namespace BFSInodeTracing {
18 
19 class Create : public AbstractTraceEntry {
20 public:
21 	Create(Inode* inode, Inode* parent, const char* name, int32 mode,
22 			int openMode, uint32 type)
23 		:
24 		fInode(inode),
25 		fID(inode->ID()),
26 		fParent(parent),
27 		fParentID(parent != NULL ? parent->ID() : 0),
28 		fMode(mode),
29 		fOpenMode(openMode),
30 		fType(type)
31 	{
32 		if (name != NULL)
33 			strlcpy(fName, name, sizeof(fName));
34 		else
35 			fName[0] = '\0';
36 
37 		Initialized();
38 	}
39 
40 	virtual void AddDump(TraceOutput& out)
41 	{
42 		out.Print("bfs:Create %Ld (%p), parent %Ld (%p), \"%s\", "
43 			"mode %lx, omode %x, type %lx", fID, fInode, fParentID,
44 			fParent, fName, fMode, fOpenMode, fType);
45 	}
46 
47 private:
48 	Inode*	fInode;
49 	ino_t	fID;
50 	Inode*	fParent;
51 	ino_t	fParentID;
52 	char	fName[32];
53 	int32	fMode;
54 	int		fOpenMode;
55 	uint32	fType;
56 };
57 
58 class Remove : public AbstractTraceEntry {
59 public:
60 	Remove(Inode* inode, const char* name)
61 		:
62 		fInode(inode),
63 		fID(inode->ID())
64 	{
65 		strlcpy(fName, name, sizeof(fName));
66 		Initialized();
67 	}
68 
69 	virtual void AddDump(TraceOutput& out)
70 	{
71 		out.Print("bfs:Remove %Ld (%p), \"%s\"", fID, fInode, fName);
72 	}
73 
74 private:
75 	Inode*	fInode;
76 	ino_t	fID;
77 	char	fName[32];
78 };
79 
80 class Action : public AbstractTraceEntry {
81 public:
82 	Action(const char* action, Inode* inode)
83 		:
84 		fInode(inode),
85 		fID(inode->ID())
86 	{
87 		strlcpy(fAction, action, sizeof(fAction));
88 		Initialized();
89 	}
90 
91 	virtual void AddDump(TraceOutput& out)
92 	{
93 		out.Print("bfs:%s %Ld (%p)\n", fAction, fID, fInode);
94 	}
95 
96 private:
97 	Inode*	fInode;
98 	ino_t	fID;
99 	char	fAction[16];
100 };
101 
102 class Resize : public AbstractTraceEntry {
103 public:
104 	Resize(Inode* inode, off_t oldSize, off_t newSize, bool trim)
105 		:
106 		fInode(inode),
107 		fID(inode->ID()),
108 		fOldSize(oldSize),
109 		fNewSize(newSize),
110 		fTrim(trim)
111 	{
112 		Initialized();
113 	}
114 
115 	virtual void AddDump(TraceOutput& out)
116 	{
117 		out.Print("bfs:%s %Ld (%p), %Ld -> %Ld", fTrim ? "Trim" : "Resize",
118 			fID, fInode, fOldSize, fNewSize);
119 	}
120 
121 private:
122 	Inode*	fInode;
123 	ino_t	fID;
124 	off_t	fOldSize;
125 	off_t	fNewSize;
126 	bool	fTrim;
127 };
128 
129 }	// namespace BFSInodeTracing
130 
131 #	define T(x) new(std::nothrow) BFSInodeTracing::x;
132 #else
133 #	define T(x) ;
134 #endif
135 
136 
137 /*!	A helper class used by Inode::Create() to keep track of the belongings
138 	of an inode creation in progress.
139 	This class will make sure everything is cleaned up properly.
140 */
141 class InodeAllocator {
142 public:
143 							InodeAllocator(Transaction& transaction);
144 							~InodeAllocator();
145 
146 			status_t		New(block_run* parentRun, mode_t mode,
147 								block_run& run, fs_vnode_ops* vnodeOps,
148 								Inode** _inode);
149 			status_t		CreateTree();
150 			status_t		Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags);
151 
152 private:
153 	static	void			_TransactionListener(int32 id, int32 event,
154 								void* _inode);
155 
156 			Transaction*	fTransaction;
157 			block_run		fRun;
158 			Inode*			fInode;
159 };
160 
161 
162 InodeAllocator::InodeAllocator(Transaction& transaction)
163 	:
164 	fTransaction(&transaction),
165 	fInode(NULL)
166 {
167 }
168 
169 
170 InodeAllocator::~InodeAllocator()
171 {
172 	if (fTransaction != NULL) {
173 		Volume* volume = fTransaction->GetVolume();
174 
175 		if (fInode != NULL) {
176 			fInode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
177 				// this unblocks any pending bfs_read_vnode() calls
178 			fInode->Free(*fTransaction);
179 			fTransaction->RemoveListener(fInode);
180 
181 			remove_vnode(volume->FSVolume(), fInode->ID());
182 		} else
183 			volume->Free(*fTransaction, fRun);
184 	}
185 
186 	delete fInode;
187 }
188 
189 
190 status_t
191 InodeAllocator::New(block_run* parentRun, mode_t mode, block_run& run,
192 	fs_vnode_ops* vnodeOps, Inode** _inode)
193 {
194 	Volume* volume = fTransaction->GetVolume();
195 
196 	status_t status = volume->AllocateForInode(*fTransaction, parentRun, mode,
197 		fRun);
198 	if (status < B_OK) {
199 		// don't free the space in the destructor, because
200 		// the allocation failed
201 		fTransaction = NULL;
202 		RETURN_ERROR(status);
203 	}
204 
205 	run = fRun;
206 	fInode = new Inode(volume, *fTransaction, volume->ToVnode(run), mode, run);
207 	if (fInode == NULL)
208 		RETURN_ERROR(B_NO_MEMORY);
209 
210 	if (!volume->IsInitializing()) {
211 		status = new_vnode(volume->FSVolume(), fInode->ID(), fInode,
212 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps);
213 		if (status < B_OK) {
214 			delete fInode;
215 			fInode = NULL;
216 			RETURN_ERROR(status);
217 		}
218 	}
219 
220 	fInode->WriteLockInTransaction(*fTransaction);
221 	*_inode = fInode;
222 	return B_OK;
223 }
224 
225 
226 status_t
227 InodeAllocator::CreateTree()
228 {
229 	Volume* volume = fTransaction->GetVolume();
230 
231 	// force S_STR_INDEX to be set, if no type is set
232 	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
233 		fInode->Node().mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
234 
235 	BPlusTree* tree = fInode->fTree = new BPlusTree(*fTransaction, fInode);
236 	if (tree == NULL || tree->InitCheck() < B_OK)
237 		return B_ERROR;
238 
239 	if (fInode->IsRegularNode()) {
240 		if (tree->Insert(*fTransaction, ".", fInode->ID()) < B_OK
241 			|| tree->Insert(*fTransaction, "..",
242 					volume->ToVnode(fInode->Parent())) < B_OK)
243 			return B_ERROR;
244 	}
245 	return B_OK;
246 }
247 
248 
249 status_t
250 InodeAllocator::Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags)
251 {
252 	ASSERT(fInode != NULL && fTransaction != NULL);
253 	Volume* volume = fTransaction->GetVolume();
254 
255 	status_t status = fInode->WriteBack(*fTransaction);
256 	if (status < B_OK) {
257 		FATAL(("writing new inode %" B_PRIdINO " failed!\n", fInode->ID()));
258 		return status;
259 	}
260 
261 	// Symbolic links are not published -- the caller needs to do this once
262 	// the contents have been written.
263 	if (!fInode->IsSymLink() && volume->ID() >= 0) {
264 		status = publish_vnode(volume->FSVolume(), fInode->ID(), fInode,
265 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps, fInode->Mode(),
266 			publishFlags);
267 	}
268 
269 	if (status == B_OK) {
270 		cache_add_transaction_listener(volume->BlockCache(), fTransaction->ID(),
271 			TRANSACTION_ABORTED, &_TransactionListener, fInode);
272 	}
273 
274 	fTransaction = NULL;
275 	fInode = NULL;
276 
277 	return status;
278 }
279 
280 
281 /*static*/ void
282 InodeAllocator::_TransactionListener(int32 id, int32 event, void* _inode)
283 {
284 	Inode* inode = (Inode*)_inode;
285 
286 	if (event == TRANSACTION_ABORTED)
287 		panic("transaction %d aborted, inode %p still around!\n", (int)id, inode);
288 }
289 
290 
291 //	#pragma mark -
292 
293 
294 status_t
295 bfs_inode::InitCheck(Volume* volume) const
296 {
297 	if (Magic1() != INODE_MAGIC1
298 		|| !(Flags() & INODE_IN_USE)
299 		|| inode_num.Length() != 1
300 		// matches inode size?
301 		|| (uint32)InodeSize() != volume->InodeSize()
302 		// parent resides on disk?
303 		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
304 		|| parent.AllocationGroup() < 0
305 		|| parent.Start() > (1L << volume->AllocationGroupShift())
306 		|| parent.Length() != 1
307 		// attributes, too?
308 		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
309 		|| attributes.AllocationGroup() < 0
310 		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
311 		RETURN_ERROR(B_BAD_DATA);
312 
313 	if (Flags() & INODE_DELETED)
314 		return B_NOT_ALLOWED;
315 
316 	// TODO: Add some tests to check the integrity of the other stuff here,
317 	// especially for the data_stream!
318 
319 	return B_OK;
320 }
321 
322 
323 //	#pragma mark - Inode
324 
325 
326 Inode::Inode(Volume* volume, ino_t id)
327 	:
328 	fVolume(volume),
329 	fID(id),
330 	fTree(NULL),
331 	fAttributes(NULL),
332 	fCache(NULL),
333 	fMap(NULL)
334 {
335 	PRINT(("Inode::Inode(volume = %p, id = %Ld) @ %p\n", volume, id, this));
336 
337 	UpdateNodeFromDisk();
338 
339 	rw_lock_init(&fLock, "bfs inode");
340 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
341 
342 	// these two will help to maintain the indices
343 	fOldSize = Size();
344 	fOldLastModified = LastModified();
345 
346 	if (IsContainer())
347 		fTree = new BPlusTree(this);
348 	if (NeedsFileCache()) {
349 		SetFileCache(file_cache_create(fVolume->ID(), ID(), Size()));
350 		SetMap(file_map_create(volume->ID(), ID(), Size()));
351 	}
352 }
353 
354 
355 Inode::Inode(Volume* volume, Transaction& transaction, ino_t id, mode_t mode,
356 		block_run& run)
357 	:
358 	fVolume(volume),
359 	fID(id),
360 	fTree(NULL),
361 	fAttributes(NULL),
362 	fCache(NULL),
363 	fMap(NULL)
364 {
365 	PRINT(("Inode::Inode(volume = %p, transaction = %p, id = %Ld) @ %p\n",
366 		volume, &transaction, id, this));
367 
368 	rw_lock_init(&fLock, "bfs inode");
369 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
370 
371 	NodeGetter node(volume, transaction, this, true);
372 	memset(&fNode, 0, sizeof(bfs_inode));
373 
374 	// Initialize the bfs_inode structure -- it's not written back to disk
375 	// here, because it will be done so already when the inode could be
376 	// created completely.
377 
378 	Node().magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
379 	Node().inode_num = run;
380 	Node().mode = HOST_ENDIAN_TO_BFS_INT32(mode);
381 	Node().flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
382 
383 	Node().create_time = Node().last_modified_time = Node().status_change_time
384 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
385 
386 	Node().inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
387 
388 	// these two will help to maintain the indices
389 	fOldSize = Size();
390 	fOldLastModified = LastModified();
391 }
392 
393 
394 Inode::~Inode()
395 {
396 	PRINT(("Inode::~Inode() @ %p\n", this));
397 
398 	file_cache_delete(FileCache());
399 	file_map_delete(Map());
400 	delete fTree;
401 
402 	rw_lock_destroy(&fLock);
403 }
404 
405 
406 status_t
407 Inode::InitCheck(bool checkNode) const
408 {
409 	// test inode magic and flags
410 	if (checkNode) {
411 		status_t status = Node().InitCheck(fVolume);
412 		if (status == B_BUSY)
413 			return B_BUSY;
414 
415 		if (status != B_OK) {
416 			FATAL(("inode at block %" B_PRIdOFF " corrupt!\n", BlockNumber()));
417 			RETURN_ERROR(B_BAD_DATA);
418 		}
419 	}
420 
421 	if (IsContainer()) {
422 		// inodes that have a B+tree
423 		if (fTree == NULL)
424 			RETURN_ERROR(B_NO_MEMORY);
425 
426 		status_t status = fTree->InitCheck();
427 		if (status != B_OK) {
428 			FATAL(("inode tree at block %" B_PRIdOFF " corrupt!\n",
429 				BlockNumber()));
430 			RETURN_ERROR(B_BAD_DATA);
431 		}
432 	}
433 
434 	if (NeedsFileCache() && (fCache == NULL || fMap == NULL))
435 		return B_NO_MEMORY;
436 
437 	return B_OK;
438 }
439 
440 
441 /*!	Adds this inode to the specified transaction. This means that the inode will
442 	be write locked until the transaction ended.
443 	To ensure that the inode will stay valid until that point, an extra reference
444 	is acquired to it as long as this transaction stays active.
445 */
446 void
447 Inode::WriteLockInTransaction(Transaction& transaction)
448 {
449 	// These flags can only change while holding the transaction lock
450 	if ((Flags() & INODE_IN_TRANSACTION) != 0)
451 		return;
452 
453 	// We share the same list link with the removed list, so we have to remove
454 	// the inode from that list here (and add it back when we no longer need it)
455 	if ((Flags() & INODE_DELETED) != 0)
456 		fVolume->RemovedInodes().Remove(this);
457 
458 	if (!fVolume->IsInitializing())
459 		acquire_vnode(fVolume->FSVolume(), ID());
460 
461 	rw_lock_write_lock(&Lock());
462 	Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
463 
464 	transaction.AddListener(this);
465 }
466 
467 
468 status_t
469 Inode::WriteBack(Transaction& transaction)
470 {
471 	NodeGetter node(fVolume, transaction, this);
472 	if (node.WritableNode() == NULL)
473 		return B_IO_ERROR;
474 
475 	memcpy(node.WritableNode(), &Node(), sizeof(bfs_inode));
476 	return B_OK;
477 }
478 
479 
480 void
481 Inode::UpdateNodeFromDisk()
482 {
483 	NodeGetter node(fVolume, this);
484 	memcpy(&fNode, node.Node(), sizeof(bfs_inode));
485 	fNode.flags &= HOST_ENDIAN_TO_BFS_INT32(INODE_PERMANENT_FLAGS);
486 }
487 
488 
489 status_t
490 Inode::CheckPermissions(int accessMode) const
491 {
492 	uid_t user = geteuid();
493 	gid_t group = getegid();
494 
495 	// you never have write access to a read-only volume
496 	if ((accessMode & W_OK) != 0 && fVolume->IsReadOnly())
497 		return B_READ_ONLY_DEVICE;
498 
499 	// root users always have full access (but they can't execute files without
500 	// any execute permissions set)
501 	if (user == 0) {
502 		if (!((accessMode & X_OK) != 0 && (Mode() & S_IXUSR) == 0)
503 			|| (Mode() & S_DIRECTORY) != 0) {
504 			return B_OK;
505 		}
506 	}
507 
508 	// shift mode bits, to check directly against accessMode
509 	mode_t mode = Mode();
510 	if (user == (uid_t)fNode.UserID())
511 		mode >>= 6;
512 	else if (group == (gid_t)fNode.GroupID())
513 		mode >>= 3;
514 
515 	if (accessMode & ~(mode & S_IRWXO))
516 		return B_NOT_ALLOWED;
517 
518 	return B_OK;
519 }
520 
521 
522 //	#pragma mark - attributes
523 
524 
525 void
526 Inode::_AddIterator(AttributeIterator* iterator)
527 {
528 	RecursiveLocker _(fSmallDataLock);
529 	fIterators.Add(iterator);
530 }
531 
532 
533 void
534 Inode::_RemoveIterator(AttributeIterator* iterator)
535 {
536 	RecursiveLocker _(fSmallDataLock);
537 	fIterators.Remove(iterator);
538 }
539 
540 
541 /*!	Tries to free up "bytes" space in the small_data section by moving
542 	attributes to real files. Used for system attributes like the name.
543 	You need to hold the fSmallDataLock when you call this method
544 */
545 status_t
546 Inode::_MakeSpaceForSmallData(Transaction& transaction, bfs_inode* node,
547 	const char* name, int32 bytes)
548 {
549 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
550 
551 	while (bytes > 0) {
552 		small_data* item = node->SmallDataStart();
553 		small_data* max = NULL;
554 		int32 index = 0, maxIndex = 0;
555 		for (; !item->IsLast(node); item = item->Next(), index++) {
556 			// should not remove those
557 			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
558 				continue;
559 
560 			if (max == NULL || max->Size() < item->Size()) {
561 				maxIndex = index;
562 				max = item;
563 			}
564 
565 			// Remove the first one large enough to free the needed amount of
566 			// bytes
567 			if (bytes < (int32)item->Size())
568 				break;
569 		}
570 
571 		if (item->IsLast(node) || (int32)item->Size() < bytes)
572 			return B_ERROR;
573 
574 		bytes -= max->Size();
575 
576 		// Move the attribute to a real attribute file
577 		// Luckily, this doesn't cause any index updates
578 
579 		Inode* attribute;
580 		status_t status = CreateAttribute(transaction, item->Name(),
581 			item->Type(), &attribute);
582 		if (status != B_OK)
583 			RETURN_ERROR(status);
584 
585 		size_t length = item->DataSize();
586 		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
587 
588 		ReleaseAttribute(attribute);
589 
590 		if (status != B_OK) {
591 			Vnode vnode(fVolume, Attributes());
592 			Inode* attributes;
593 			if (vnode.Get(&attributes) < B_OK
594 				|| attributes->Remove(transaction, name) < B_OK) {
595 				FATAL(("Could not remove newly created attribute!\n"));
596 			}
597 
598 			RETURN_ERROR(status);
599 		}
600 
601 		_RemoveSmallData(node, max, maxIndex);
602 	}
603 	return B_OK;
604 }
605 
606 
607 /*!	Private function which removes the given attribute from the small_data
608 	section.
609 	You need to hold the fSmallDataLock when you call this method
610 */
611 status_t
612 Inode::_RemoveSmallData(bfs_inode* node, small_data* item, int32 index)
613 {
614 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
615 
616 	small_data* next = item->Next();
617 	if (!next->IsLast(node)) {
618 		// find the last attribute
619 		small_data* last = next;
620 		while (!last->IsLast(node))
621 			last = last->Next();
622 
623 		int32 size = (uint8*)last - (uint8*)next;
624 		if (size < 0
625 			|| size > (uint8*)node + fVolume->BlockSize() - (uint8*)next)
626 			return B_BAD_DATA;
627 
628 		memmove(item, next, size);
629 
630 		// Move the "last" one to its new location and
631 		// correctly terminate the small_data section
632 		last = (small_data*)((uint8*)last - ((uint8*)next - (uint8*)item));
633 		memset(last, 0, (uint8*)node + fVolume->BlockSize() - (uint8*)last);
634 	} else
635 		memset(item, 0, item->Size());
636 
637 	// update all current iterators
638 	SinglyLinkedList<AttributeIterator>::Iterator iterator
639 		= fIterators.GetIterator();
640 	while (iterator.HasNext()) {
641 		iterator.Next()->Update(index, -1);
642 	}
643 
644 	return B_OK;
645 }
646 
647 
648 //!	Removes the given attribute from the small_data section.
649 status_t
650 Inode::_RemoveSmallData(Transaction& transaction, NodeGetter& nodeGetter,
651 	const char* name)
652 {
653 	if (name == NULL)
654 		return B_BAD_VALUE;
655 
656 	bfs_inode* node = nodeGetter.WritableNode();
657 	RecursiveLocker locker(fSmallDataLock);
658 
659 	// search for the small_data item
660 
661 	small_data* item = node->SmallDataStart();
662 	int32 index = 0;
663 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
664 		item = item->Next();
665 		index++;
666 	}
667 
668 	if (item->IsLast(node))
669 		return B_ENTRY_NOT_FOUND;
670 
671 	nodeGetter.MakeWritable(transaction);
672 
673 	status_t status = _RemoveSmallData(node, item, index);
674 	if (status == B_OK) {
675 		Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
676 			bfs_inode::ToInode(real_time_clock_usecs()));
677 
678 		status = WriteBack(transaction);
679 	}
680 
681 	return status;
682 }
683 
684 
685 /*!	Try to place the given attribute in the small_data section - if the
686 	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
687 	In that case, the attribute should be written to a real attribute file;
688 	it's the caller's responsibility to remove any existing attributes in the
689 	small data section if that's the case.
690 
691 	Note that you need to write back the inode yourself after having called that
692 	method - it's a bad API decision that it needs a transaction but enforces
693 	you to write back the inode all by yourself, but it's just more efficient
694 	in most cases...
695 */
696 status_t
697 Inode::_AddSmallData(Transaction& transaction, NodeGetter& nodeGetter,
698 	const char* name, uint32 type, off_t pos, const uint8* data, size_t length,
699 	bool force)
700 {
701 	bfs_inode* node = nodeGetter.WritableNode();
702 
703 	if (node == NULL || name == NULL || data == NULL)
704 		return B_BAD_VALUE;
705 
706 	// reject any requests that can't fit into the small_data section
707 	uint32 nameLength = strlen(name);
708 	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + pos + length + 1;
709 	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
710 		return B_DEVICE_FULL;
711 
712 	nodeGetter.MakeWritable(transaction);
713 	RecursiveLocker locker(fSmallDataLock);
714 
715 	// Find the last item or one with the same name we have to add
716 	small_data* item = node->SmallDataStart();
717 	int32 index = 0;
718 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
719 		item = item->Next();
720 		index++;
721 	}
722 
723 	// is the attribute already in the small_data section?
724 	// then just replace the data part of that one
725 	if (!item->IsLast(node)) {
726 		// find last attribute
727 		small_data* last = item;
728 		while (!last->IsLast(node))
729 			last = last->Next();
730 
731 		// try to change the attributes value
732 		if (item->data_size > pos + length
733 			|| force
734 			|| ((uint8*)last + pos + length - item->DataSize())
735 					<= ((uint8*)node + fVolume->InodeSize())) {
736 			// Make room for the new attribute if needed (and we are forced
737 			// to do so)
738 			if (force && ((uint8*)last + pos + length - item->DataSize())
739 					> ((uint8*)node + fVolume->InodeSize())) {
740 				// We also take the free space at the end of the small_data
741 				// section into account, and request only what's really needed
742 				uint32 needed = pos + length - item->DataSize() -
743 					(uint32)((uint8*)node + fVolume->InodeSize()
744 						- (uint8*)last);
745 
746 				if (_MakeSpaceForSmallData(transaction, node, name, needed)
747 						!= B_OK)
748 					return B_ERROR;
749 
750 				// reset our pointers
751 				item = node->SmallDataStart();
752 				index = 0;
753 				while (!item->IsLast(node) && strcmp(item->Name(), name)) {
754 					item = item->Next();
755 					index++;
756 				}
757 
758 				last = item;
759 				while (!last->IsLast(node))
760 					last = last->Next();
761 			}
762 
763 			size_t oldDataSize = item->DataSize();
764 
765 			// Normally, we can just overwrite the attribute data as the size
766 			// is specified by the type and does not change that often
767 			if (pos + length != item->DataSize()) {
768 				// move the attributes after the current one
769 				small_data* next = item->Next();
770 				if (!next->IsLast(node)) {
771 					memmove((uint8*)item + spaceNeeded, next,
772 						(uint8*)last - (uint8*)next);
773 				}
774 
775 				// Move the "last" one to its new location and
776 				// correctly terminate the small_data section
777 				last = (small_data*)((uint8*)last
778 					- ((uint8*)next - ((uint8*)item + spaceNeeded)));
779 				if ((uint8*)last < (uint8*)node + fVolume->BlockSize()) {
780 					memset(last, 0, (uint8*)node + fVolume->BlockSize()
781 						- (uint8*)last);
782 				}
783 
784 				item->data_size = HOST_ENDIAN_TO_BFS_INT16(pos + length);
785 			}
786 
787 			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
788 
789 			if (oldDataSize < pos) {
790 				// Fill gap with zeros
791 				memset(item->Data() + oldDataSize, 0, pos - oldDataSize);
792 			}
793 			memcpy(item->Data() + pos, data, length);
794 			item->Data()[pos + length] = '\0';
795 
796 			return B_OK;
797 		}
798 
799 		return B_DEVICE_FULL;
800 	}
801 
802 	// try to add the new attribute!
803 
804 	if ((uint8*)item + spaceNeeded > (uint8*)node + fVolume->InodeSize()) {
805 		// there is not enough space for it!
806 		if (!force)
807 			return B_DEVICE_FULL;
808 
809 		// make room for the new attribute
810 		if (_MakeSpaceForSmallData(transaction, node, name, spaceNeeded) < B_OK)
811 			return B_ERROR;
812 
813 		// get new last item!
814 		item = node->SmallDataStart();
815 		index = 0;
816 		while (!item->IsLast(node)) {
817 			item = item->Next();
818 			index++;
819 		}
820 	}
821 
822 	memset(item, 0, spaceNeeded);
823 	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
824 	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
825 	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
826 	strcpy(item->Name(), name);
827 	memcpy(item->Data() + pos, data, length);
828 
829 	// correctly terminate the small_data section
830 	item = item->Next();
831 	if (!item->IsLast(node))
832 		memset(item, 0, (uint8*)node + fVolume->InodeSize() - (uint8*)item);
833 
834 	// update all current iterators
835 	SinglyLinkedList<AttributeIterator>::Iterator iterator
836 		= fIterators.GetIterator();
837 	while (iterator.HasNext()) {
838 		iterator.Next()->Update(index, 1);
839 	}
840 
841 	return B_OK;
842 }
843 
844 
845 /*!	Iterates through the small_data section of an inode.
846 	To start at the beginning of this section, you let smallData
847 	point to NULL, like:
848 		small_data* data = NULL;
849 		while (inode->GetNextSmallData(&data) { ... }
850 
851 	This function is reentrant and doesn't allocate any memory;
852 	you can safely stop calling it at any point (you don't need
853 	to iterate through the whole list).
854 	You need to hold the fSmallDataLock when you call this method
855 */
856 status_t
857 Inode::_GetNextSmallData(bfs_inode* node, small_data** _smallData) const
858 {
859 	if (node == NULL)
860 		RETURN_ERROR(B_BAD_VALUE);
861 
862 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
863 
864 	small_data* data = *_smallData;
865 
866 	// begin from the start?
867 	if (data == NULL)
868 		data = node->SmallDataStart();
869 	else
870 		data = data->Next();
871 
872 	// is already last item?
873 	if (data->IsLast(node))
874 		return B_ENTRY_NOT_FOUND;
875 
876 	*_smallData = data;
877 
878 	return B_OK;
879 }
880 
881 
882 /*!	Finds the attribute "name" in the small data section, and
883 	returns a pointer to it (or NULL if it doesn't exist).
884 	You need to hold the fSmallDataLock when you call this method
885 */
886 small_data*
887 Inode::FindSmallData(const bfs_inode* node, const char* name) const
888 {
889 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
890 
891 	small_data* smallData = NULL;
892 	while (_GetNextSmallData(const_cast<bfs_inode*>(node), &smallData)
893 			== B_OK) {
894 		if (!strcmp(smallData->Name(), name))
895 			return smallData;
896 	}
897 	return NULL;
898 }
899 
900 
901 /*!	Returns a pointer to the node's name if present in the small data
902 	section, NULL otherwise.
903 	You need to hold the fSmallDataLock when you call this method
904 */
905 const char*
906 Inode::Name(const bfs_inode* node) const
907 {
908 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
909 
910 	small_data* smallData = NULL;
911 	while (_GetNextSmallData((bfs_inode*)node, &smallData) == B_OK) {
912 		if (*smallData->Name() == FILE_NAME_NAME
913 			&& smallData->NameSize() == FILE_NAME_NAME_LENGTH)
914 			return (const char*)smallData->Data();
915 	}
916 	return NULL;
917 }
918 
919 
920 /*!	Copies the node's name into the provided buffer.
921 	The buffer should be B_FILE_NAME_LENGTH bytes large.
922 */
923 status_t
924 Inode::GetName(char* buffer, size_t size) const
925 {
926 	NodeGetter node(fVolume, this);
927 	RecursiveLocker locker(fSmallDataLock);
928 
929 	const char* name = Name(node.Node());
930 	if (name == NULL)
931 		return B_ENTRY_NOT_FOUND;
932 
933 	strlcpy(buffer, name, size);
934 	return B_OK;
935 }
936 
937 
938 /*!	Changes or set the name of a file: in the inode small_data section only, it
939 	doesn't change it in the parent directory's b+tree.
940 	Note that you need to write back the inode yourself after having called
941 	that method. It suffers from the same API decision as AddSmallData() does
942 	(and for the same reason).
943 */
944 status_t
945 Inode::SetName(Transaction& transaction, const char* name)
946 {
947 	if (name == NULL || *name == '\0')
948 		return B_BAD_VALUE;
949 
950 	NodeGetter node(fVolume, transaction, this);
951 	const char nameTag[2] = {FILE_NAME_NAME, 0};
952 
953 	return _AddSmallData(transaction, node, nameTag, FILE_NAME_TYPE, 0,
954 		(uint8*)name, strlen(name), true);
955 }
956 
957 
958 status_t
959 Inode::_RemoveAttribute(Transaction& transaction, const char* name,
960 	bool hasIndex, Index* index)
961 {
962 	// remove the attribute file if it exists
963 	Vnode vnode(fVolume, Attributes());
964 	Inode* attributes;
965 	status_t status = vnode.Get(&attributes);
966 	if (status < B_OK)
967 		return status;
968 
969 	// update index
970 	if (index != NULL) {
971 		Inode* attribute;
972 		if ((hasIndex || fVolume->CheckForLiveQuery(name))
973 			&& GetAttribute(name, &attribute) == B_OK) {
974 			uint8 data[BPLUSTREE_MAX_KEY_LENGTH];
975 			size_t length = BPLUSTREE_MAX_KEY_LENGTH;
976 			if (attribute->ReadAt(0, data, &length) == B_OK) {
977 				index->Update(transaction, name, attribute->Type(), data,
978 					length, NULL, 0, this);
979 			}
980 
981 			ReleaseAttribute(attribute);
982 		}
983 	}
984 
985 	if ((status = attributes->Remove(transaction, name)) < B_OK)
986 		return status;
987 
988 	if (attributes->IsEmpty()) {
989 		attributes->WriteLockInTransaction(transaction);
990 
991 		// remove attribute directory (don't fail if that can't be done)
992 		if (remove_vnode(fVolume->FSVolume(), attributes->ID()) == B_OK) {
993 			// update the inode, so that no one will ever doubt it's deleted :-)
994 			attributes->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
995 			if (attributes->WriteBack(transaction) == B_OK) {
996 				Attributes().SetTo(0, 0, 0);
997 				WriteBack(transaction);
998 			} else {
999 				unremove_vnode(fVolume->FSVolume(), attributes->ID());
1000 				attributes->Node().flags
1001 					&= ~HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1002 			}
1003 		}
1004 	}
1005 
1006 	return status;
1007 }
1008 
1009 
1010 /*!	Reads data from the specified attribute.
1011 	This is a high-level attribute function that understands attributes
1012 	in the small_data section as well as real attribute files.
1013 */
1014 status_t
1015 Inode::ReadAttribute(const char* name, int32 type, off_t pos, uint8* buffer,
1016 	size_t* _length)
1017 {
1018 	if (pos < 0)
1019 		pos = 0;
1020 
1021 	// search in the small_data section (which has to be locked first)
1022 	{
1023 		NodeGetter node(fVolume, this);
1024 		RecursiveLocker locker(fSmallDataLock);
1025 
1026 		small_data* smallData = FindSmallData(node.Node(), name);
1027 		if (smallData != NULL) {
1028 			size_t length = *_length;
1029 			if (pos >= smallData->data_size) {
1030 				*_length = 0;
1031 				return B_OK;
1032 			}
1033 			if (length + pos > smallData->DataSize())
1034 				length = smallData->DataSize() - pos;
1035 
1036 			memcpy(buffer, smallData->Data() + pos, length);
1037 			*_length = length;
1038 			return B_OK;
1039 		}
1040 	}
1041 
1042 	// search in the attribute directory
1043 	Inode* attribute;
1044 	status_t status = GetAttribute(name, &attribute);
1045 	if (status == B_OK) {
1046 		status = attribute->ReadAt(pos, (uint8*)buffer, _length);
1047 
1048 		ReleaseAttribute(attribute);
1049 	}
1050 
1051 	RETURN_ERROR(status);
1052 }
1053 
1054 
1055 /*!	Writes data to the specified attribute.
1056 	This is a high-level attribute function that understands attributes
1057 	in the small_data section as well as real attribute files.
1058 */
1059 status_t
1060 Inode::WriteAttribute(Transaction& transaction, const char* name, int32 type,
1061 	off_t pos, const uint8* buffer, size_t* _length, bool* _created)
1062 {
1063 	if (pos < 0)
1064 		return B_BAD_VALUE;
1065 
1066 	// needed to maintain the index
1067 	uint8 oldBuffer[BPLUSTREE_MAX_KEY_LENGTH];
1068 	uint8* oldData = NULL;
1069 	size_t oldLength = 0;
1070 	bool created = false;
1071 
1072 	// TODO: we actually depend on that the contents of "buffer" are constant.
1073 	// If they get changed during the write (hey, user programs), we may mess
1074 	// up our index trees!
1075 	// TODO: for attribute files, we need to log the first
1076 	// BPLUSTREE_MAX_KEY_LENGTH bytes of the data stream, or the same as above
1077 	// might happen.
1078 
1079 	Index index(fVolume);
1080 	bool hasIndex = index.SetTo(name) == B_OK;
1081 
1082 	Inode* attribute = NULL;
1083 	status_t status = B_OK;
1084 
1085 	if (GetAttribute(name, &attribute) != B_OK) {
1086 		// No attribute inode exists yet
1087 
1088 		// save the old attribute data
1089 		NodeGetter node(fVolume, transaction, this);
1090 		recursive_lock_lock(&fSmallDataLock);
1091 
1092 		small_data* smallData = FindSmallData(node.Node(), name);
1093 		if (smallData != NULL) {
1094 			oldLength = smallData->DataSize();
1095 			if (oldLength > 0) {
1096 				if (oldLength > BPLUSTREE_MAX_KEY_LENGTH)
1097 					oldLength = BPLUSTREE_MAX_KEY_LENGTH;
1098 				memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
1099 			}
1100 		} else
1101 			created = true;
1102 
1103 		recursive_lock_unlock(&fSmallDataLock);
1104 
1105 		// if the attribute doesn't exist yet (as a file), try to put it in the
1106 		// small_data section first - if that fails (due to insufficent space),
1107 		// create a real attribute file
1108 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1109 			*_length);
1110 		if (status == B_DEVICE_FULL) {
1111 			if (smallData != NULL) {
1112 				// remove the old attribute from the small data section - there
1113 				// is no space left for the new data
1114 				status = _RemoveSmallData(transaction, node, name);
1115 			} else
1116 				status = B_OK;
1117 
1118 			if (status == B_OK)
1119 				status = CreateAttribute(transaction, name, type, &attribute);
1120 			if (status != B_OK)
1121 				RETURN_ERROR(status);
1122 
1123 			created = true;
1124 		} else if (status == B_OK) {
1125 			// Update status time on attribute write
1126 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1127 				bfs_inode::ToInode(real_time_clock_usecs()));
1128 
1129 			status = WriteBack(transaction);
1130 		}
1131 	}
1132 
1133 	if (attribute != NULL) {
1134 		WriteLocker writeLocker(attribute->fLock);
1135 
1136 		if (hasIndex || fVolume->CheckForLiveQuery(name)) {
1137 			// Save the old attribute data (if this fails, oldLength will
1138 			// reflect it)
1139 			while (attribute->Size() > 0) {
1140 				bigtime_t oldModified = attribute->LastModified();
1141 				writeLocker.Unlock();
1142 
1143 				oldLength = BPLUSTREE_MAX_KEY_LENGTH;
1144 				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
1145 					oldData = oldBuffer;
1146 
1147 				writeLocker.Lock();
1148 
1149 				// Read until the data hasn't changed in between
1150 				if (oldModified == attribute->LastModified())
1151 					break;
1152 
1153 				oldLength = 0;
1154 			}
1155 		}
1156 
1157 		// check if the data fits into the small_data section again
1158 		NodeGetter node(fVolume, transaction, this);
1159 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1160 			*_length);
1161 
1162 		if (status == B_OK) {
1163 			// it does - remove its file
1164 			writeLocker.Unlock();
1165 			status = _RemoveAttribute(transaction, name, false, NULL);
1166 		} else {
1167 			// The attribute type might have been changed - we need to
1168 			// adopt the new one
1169 			attribute->Node().type = HOST_ENDIAN_TO_BFS_INT32(type);
1170 			status = attribute->WriteBack(transaction);
1171 			writeLocker.Unlock();
1172 
1173 			if (status == B_OK) {
1174 				status = attribute->WriteAt(transaction, pos, buffer,
1175 					_length);
1176 			}
1177 		}
1178 
1179 		if (status == B_OK) {
1180 			// Update status time on attribute write
1181 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1182 				bfs_inode::ToInode(real_time_clock_usecs()));
1183 
1184 			status = WriteBack(transaction);
1185 		}
1186 
1187 		attribute->WriteLockInTransaction(transaction);
1188 		ReleaseAttribute(attribute);
1189 	}
1190 
1191 	// TODO: find a better way than this "pos" thing (the begin of the old key
1192 	//	must be copied to the start of the new one for a comparison)
1193 	if (status == B_OK && pos == 0) {
1194 		// index only the first BPLUSTREE_MAX_KEY_LENGTH bytes
1195 		uint16 length = *_length;
1196 		if (length > BPLUSTREE_MAX_KEY_LENGTH)
1197 			length = BPLUSTREE_MAX_KEY_LENGTH;
1198 
1199 		// Update index. Note, Index::Update() may be called even if
1200 		// initializing the index failed - it will just update the live
1201 		// queries in this case
1202 		if (pos < length || pos < oldLength) {
1203 			index.Update(transaction, name, type, oldData, oldLength, buffer,
1204 				length, this);
1205 		}
1206 	}
1207 
1208 	if (_created != NULL)
1209 		*_created = created;
1210 
1211 	return status;
1212 }
1213 
1214 
1215 /*!	Removes the specified attribute from the inode.
1216 	This is a high-level attribute function that understands attributes
1217 	in the small_data section as well as real attribute files.
1218 */
1219 status_t
1220 Inode::RemoveAttribute(Transaction& transaction, const char* name)
1221 {
1222 	Index index(fVolume);
1223 	bool hasIndex = index.SetTo(name) == B_OK;
1224 	NodeGetter node(fVolume, this);
1225 
1226 	// update index for attributes in the small_data section
1227 	{
1228 		RecursiveLocker _(fSmallDataLock);
1229 
1230 		small_data* smallData = FindSmallData(node.Node(), name);
1231 		if (smallData != NULL) {
1232 			uint32 length = smallData->DataSize();
1233 			if (length > BPLUSTREE_MAX_KEY_LENGTH)
1234 				length = BPLUSTREE_MAX_KEY_LENGTH;
1235 			index.Update(transaction, name, smallData->Type(),
1236 				smallData->Data(), length, NULL, 0, this);
1237 		}
1238 	}
1239 
1240 	status_t status = _RemoveSmallData(transaction, node, name);
1241 	if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
1242 		// remove the attribute file if it exists
1243 		status = _RemoveAttribute(transaction, name, hasIndex, &index);
1244 		if (status == B_OK) {
1245 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1246 				bfs_inode::ToInode(real_time_clock_usecs()));
1247 			WriteBack(transaction);
1248 		}
1249 	}
1250 
1251 	return status;
1252 }
1253 
1254 
1255 /*!	Returns the attribute inode with the specified \a name, in case it exists.
1256 	This method can only return real attribute files; the attributes in the
1257 	small data section are ignored.
1258 */
1259 status_t
1260 Inode::GetAttribute(const char* name, Inode** _attribute)
1261 {
1262 	// does this inode even have attributes?
1263 	if (Attributes().IsZero())
1264 		return B_ENTRY_NOT_FOUND;
1265 
1266 	Vnode vnode(fVolume, Attributes());
1267 	Inode* attributes;
1268 	if (vnode.Get(&attributes) < B_OK) {
1269 		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n",
1270 			name));
1271 		return B_ERROR;
1272 	}
1273 
1274 	BPlusTree* tree = attributes->Tree();
1275 	if (tree == NULL)
1276 		return B_BAD_VALUE;
1277 
1278 	InodeReadLocker locker(attributes);
1279 
1280 	ino_t id;
1281 	status_t status = tree->Find((uint8*)name, (uint16)strlen(name), &id);
1282 	if (status == B_OK) {
1283 		Vnode vnode(fVolume, id);
1284 		Inode* inode;
1285 		// Check if the attribute is really an attribute
1286 		if (vnode.Get(&inode) != B_OK || !inode->IsAttribute())
1287 			return B_ERROR;
1288 
1289 		*_attribute = inode;
1290 		vnode.Keep();
1291 		return B_OK;
1292 	}
1293 
1294 	return status;
1295 }
1296 
1297 
1298 void
1299 Inode::ReleaseAttribute(Inode* attribute)
1300 {
1301 	if (attribute == NULL)
1302 		return;
1303 
1304 	put_vnode(fVolume->FSVolume(), attribute->ID());
1305 }
1306 
1307 
1308 status_t
1309 Inode::CreateAttribute(Transaction& transaction, const char* name, uint32 type,
1310 	Inode** attribute)
1311 {
1312 	// do we need to create the attribute directory first?
1313 	if (Attributes().IsZero()) {
1314 		status_t status = Inode::Create(transaction, this, NULL,
1315 			S_ATTR_DIR | S_DIRECTORY | 0666, 0, 0, NULL);
1316 		if (status < B_OK)
1317 			RETURN_ERROR(status);
1318 	}
1319 	Vnode vnode(fVolume, Attributes());
1320 	Inode* attributes;
1321 	if (vnode.Get(&attributes) < B_OK)
1322 		return B_ERROR;
1323 
1324 	// Inode::Create() locks the inode for us
1325 	return Inode::Create(transaction, attributes, name,
1326 		S_ATTR | S_FILE | 0666, 0, type, NULL, NULL, attribute);
1327 }
1328 
1329 
1330 //	#pragma mark - directory tree
1331 
1332 
1333 bool
1334 Inode::IsEmpty()
1335 {
1336 	TreeIterator iterator(fTree);
1337 
1338 	// index and attribute directories are really empty when they are
1339 	// empty - directories for standard files always contain ".", and
1340 	// "..", so we need to ignore those two
1341 
1342 	uint32 count = 0;
1343 	char name[BPLUSTREE_MAX_KEY_LENGTH];
1344 	uint16 length;
1345 	ino_t id;
1346 	while (iterator.GetNextEntry(name, &length, B_FILE_NAME_LENGTH,
1347 			&id) == B_OK) {
1348 		if (Mode() & (S_ATTR_DIR | S_INDEX_DIR))
1349 			return false;
1350 
1351 		if (++count > 2 || (strcmp(".", name) && strcmp("..", name)))
1352 			return false;
1353 	}
1354 	return true;
1355 }
1356 
1357 
1358 status_t
1359 Inode::ContainerContentsChanged(Transaction& transaction)
1360 {
1361 	ASSERT(!InLastModifiedIndex());
1362 
1363 	Node().last_modified_time = Node().status_change_time
1364 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1365 
1366 	return WriteBack(transaction);
1367 }
1368 
1369 
1370 //	#pragma mark - data stream
1371 
1372 
1373 /*!	Computes the number of bytes this inode occupies in the file system.
1374 	This includes the file meta blocks used for maintaining its data stream.
1375 
1376 	TODO: However, the attributes in extra files are not really accounted for;
1377 	depending on the speed penalty, this should be changed, though (the value
1378 	could be cached in the inode structure or Inode object, though).
1379 */
1380 off_t
1381 Inode::AllocatedSize() const
1382 {
1383 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0) {
1384 		// This symlink does not have a data stream
1385 		return Node().InodeSize();
1386 	}
1387 
1388 	const data_stream& data = Node().data;
1389 	uint32 blockSize = fVolume->BlockSize();
1390 	off_t size = blockSize;
1391 
1392 	if (data.MaxDoubleIndirectRange() != 0) {
1393 		off_t doubleIndirectSize = data.MaxDoubleIndirectRange()
1394 			- data.MaxIndirectRange();
1395 		int32 indirectSize = double_indirect_max_indirect_size(
1396 			data.double_indirect.Length(), fVolume->BlockSize());
1397 
1398 		size += (2 * data.double_indirect.Length()
1399 				+ doubleIndirectSize / indirectSize)
1400 			* blockSize + data.MaxDoubleIndirectRange();
1401 	} else if (data.MaxIndirectRange() != 0)
1402 		size += data.indirect.Length() + data.MaxIndirectRange();
1403 	else
1404 		size += data.MaxDirectRange();
1405 
1406 	if (!Node().attributes.IsZero()) {
1407 		// TODO: to make this exact, we'd had to count all attributes
1408 		size += 2 * blockSize;
1409 			// 2 blocks, one for the attributes inode, one for its B+tree
1410 	}
1411 
1412 	return size;
1413 }
1414 
1415 
1416 /*!	Finds the block_run where "pos" is located in the data_stream of
1417 	the inode.
1418 	If successful, "offset" will then be set to the file offset
1419 	of the block_run returned; so "pos - offset" is for the block_run
1420 	what "pos" is for the whole stream.
1421 	The caller has to make sure that "pos" is inside the stream.
1422 */
1423 status_t
1424 Inode::FindBlockRun(off_t pos, block_run& run, off_t& offset)
1425 {
1426 	data_stream* data = &Node().data;
1427 
1428 	// find matching block run
1429 
1430 	if (data->MaxIndirectRange() > 0 && pos >= data->MaxDirectRange()) {
1431 		if (data->MaxDoubleIndirectRange() > 0
1432 			&& pos >= data->MaxIndirectRange()) {
1433 			// access to double indirect blocks
1434 
1435 			CachedBlock cached(fVolume);
1436 
1437 			int32 runsPerBlock;
1438 			int32 directSize;
1439 			int32 indirectSize;
1440 			get_double_indirect_sizes(data->double_indirect.Length(),
1441 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1442 
1443 			off_t start = pos - data->MaxIndirectRange();
1444 			int32 index = start / indirectSize;
1445 
1446 			block_run* indirect = (block_run*)cached.SetTo(
1447 				fVolume->ToBlock(data->double_indirect) + index / runsPerBlock);
1448 			if (indirect == NULL)
1449 				RETURN_ERROR(B_ERROR);
1450 
1451 			int32 current = (start % indirectSize) / directSize;
1452 
1453 			indirect = (block_run*)cached.SetTo(
1454 				fVolume->ToBlock(indirect[index % runsPerBlock])
1455 				+ current / runsPerBlock);
1456 			if (indirect == NULL)
1457 				RETURN_ERROR(B_ERROR);
1458 
1459 			run = indirect[current % runsPerBlock];
1460 			if (run.Length() != data->double_indirect.Length())
1461 				RETURN_ERROR(B_BAD_DATA);
1462 
1463 			offset = data->MaxIndirectRange() + (index * indirectSize)
1464 				+ (current * directSize);
1465 		} else {
1466 			// access to indirect blocks
1467 
1468 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1469 			off_t runBlockEnd = data->MaxDirectRange();
1470 
1471 			CachedBlock cached(fVolume);
1472 			off_t block = fVolume->ToBlock(data->indirect);
1473 
1474 			for (int32 i = 0; i < data->indirect.Length(); i++) {
1475 				block_run* indirect = (block_run*)cached.SetTo(block + i);
1476 				if (indirect == NULL)
1477 					RETURN_ERROR(B_IO_ERROR);
1478 
1479 				int32 current = -1;
1480 				while (++current < runsPerBlock) {
1481 					if (indirect[current].IsZero())
1482 						break;
1483 
1484 					runBlockEnd += indirect[current].Length()
1485 						<< cached.BlockShift();
1486 					if (runBlockEnd > pos) {
1487 						run = indirect[current];
1488 						offset = runBlockEnd - (run.Length()
1489 							<< cached.BlockShift());
1490 						return fVolume->ValidateBlockRun(run);
1491 					}
1492 				}
1493 			}
1494 			RETURN_ERROR(B_ERROR);
1495 		}
1496 	} else {
1497 		// access from direct blocks
1498 
1499 		off_t runBlockEnd = 0LL;
1500 		int32 current = -1;
1501 
1502 		while (++current < NUM_DIRECT_BLOCKS) {
1503 			if (data->direct[current].IsZero())
1504 				break;
1505 
1506 			runBlockEnd += data->direct[current].Length()
1507 				<< fVolume->BlockShift();
1508 			if (runBlockEnd > pos) {
1509 				run = data->direct[current];
1510 				offset = runBlockEnd - (run.Length() << fVolume->BlockShift());
1511 				return fVolume->ValidateBlockRun(run);
1512 			}
1513 		}
1514 
1515 		return B_ENTRY_NOT_FOUND;
1516 	}
1517 	return fVolume->ValidateBlockRun(run);
1518 }
1519 
1520 
1521 status_t
1522 Inode::ReadAt(off_t pos, uint8* buffer, size_t* _length)
1523 {
1524 	size_t length = *_length;
1525 
1526 	// set/check boundaries for pos/length
1527 	if (pos < 0)
1528 		return B_BAD_VALUE;
1529 
1530 	InodeReadLocker locker(this);
1531 
1532 	if (pos >= Size() || length == 0) {
1533 		*_length = 0;
1534 		return B_NO_ERROR;
1535 	}
1536 
1537 	locker.Unlock();
1538 
1539 	return file_cache_read(FileCache(), NULL, pos, buffer, _length);
1540 }
1541 
1542 
1543 status_t
1544 Inode::WriteAt(Transaction& transaction, off_t pos, const uint8* buffer,
1545 	size_t* _length)
1546 {
1547 	InodeReadLocker locker(this);
1548 
1549 	// update the last modification time in memory, it will be written
1550 	// back to the inode, and the index when the file is closed
1551 	// TODO: should update the internal last modified time only at this point!
1552 	Node().last_modified_time = Node().status_change_time
1553 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1554 
1555 	// TODO: support INODE_LOGGED!
1556 
1557 	size_t length = *_length;
1558 	bool changeSize = pos + length > Size();
1559 
1560 	// set/check boundaries for pos/length
1561 	if (pos < 0)
1562 		return B_BAD_VALUE;
1563 
1564 	locker.Unlock();
1565 
1566 	// the transaction doesn't have to be started already
1567 	if (changeSize && !transaction.IsStarted())
1568 		transaction.Start(fVolume, BlockNumber());
1569 
1570 	WriteLocker writeLocker(fLock);
1571 
1572 	// Work around possible race condition: Someone might have shrunken the file
1573 	// while we had no lock.
1574 	if (!transaction.IsStarted() && pos + length > Size()) {
1575 		writeLocker.Unlock();
1576 		transaction.Start(fVolume, BlockNumber());
1577 		writeLocker.Lock();
1578 	}
1579 
1580 	off_t oldSize = Size();
1581 
1582 	if (pos + length > oldSize) {
1583 		// let's grow the data stream to the size needed
1584 		status_t status = SetFileSize(transaction, pos + length);
1585 		if (status != B_OK) {
1586 			*_length = 0;
1587 			WriteLockInTransaction(transaction);
1588 			RETURN_ERROR(status);
1589 		}
1590 		// TODO: In theory we would need to update the file size
1591 		// index here as part of the current transaction - this might just
1592 		// be a bit too expensive, but worth a try.
1593 
1594 		// we need to write back the inode here because it has to
1595 		// go into this transaction (we cannot wait until the file
1596 		// is closed)
1597 		status = WriteBack(transaction);
1598 		if (status != B_OK) {
1599 			WriteLockInTransaction(transaction);
1600 			return status;
1601 		}
1602 	}
1603 
1604 	writeLocker.Unlock();
1605 
1606 	if (oldSize < pos)
1607 		FillGapWithZeros(oldSize, pos);
1608 
1609 	// If we don't want to write anything, we can now return (we may
1610 	// just have changed the file size using the position parameter)
1611 	if (length == 0)
1612 		return B_OK;
1613 
1614 	status_t status = file_cache_write(FileCache(), NULL, pos, buffer, _length);
1615 
1616 	if (transaction.IsStarted())
1617 		WriteLockInTransaction(transaction);
1618 
1619 	return status;
1620 }
1621 
1622 
1623 /*!	Fills the gap between the old file size and the new file size
1624 	with zeros.
1625 	It's more or less a copy of Inode::WriteAt() but it can handle
1626 	length differences of more than just 4 GB, and it never uses
1627 	the log, even if the INODE_LOGGED flag is set.
1628 */
1629 status_t
1630 Inode::FillGapWithZeros(off_t pos, off_t newSize)
1631 {
1632 	while (pos < newSize) {
1633 		size_t size;
1634 		if (newSize > pos + 1024 * 1024 * 1024)
1635 			size = 1024 * 1024 * 1024;
1636 		else
1637 			size = newSize - pos;
1638 
1639 		status_t status = file_cache_write(FileCache(), NULL, pos, NULL, &size);
1640 		if (status < B_OK)
1641 			return status;
1642 
1643 		pos += size;
1644 	}
1645 
1646 	return B_OK;
1647 }
1648 
1649 
1650 /*!	Allocates \a length blocks, and clears their contents. Growing
1651 	the indirect and double indirect range uses this method.
1652 	The allocated block_run is saved in "run"
1653 */
1654 status_t
1655 Inode::_AllocateBlockArray(Transaction& transaction, block_run& run,
1656 	size_t length, bool variableSize)
1657 {
1658 	if (!run.IsZero())
1659 		return B_BAD_VALUE;
1660 
1661 	status_t status = fVolume->Allocate(transaction, this, length, run,
1662 		variableSize ? 1 : length);
1663 	if (status != B_OK)
1664 		return status;
1665 
1666 	// make sure those blocks are empty
1667 	CachedBlock cached(fVolume);
1668 	off_t block = fVolume->ToBlock(run);
1669 
1670 	for (int32 i = 0; i < run.Length(); i++) {
1671 		block_run* runs = (block_run*)cached.SetToWritable(transaction,
1672 			block + i, true);
1673 		if (runs == NULL)
1674 			return B_IO_ERROR;
1675 	}
1676 	return B_OK;
1677 }
1678 
1679 
1680 /*!	Grows the stream to \a size, and fills the direct/indirect/double indirect
1681 	ranges with the runs.
1682 	This method will also determine the size of the preallocation, if any.
1683 */
1684 status_t
1685 Inode::_GrowStream(Transaction& transaction, off_t size)
1686 {
1687 	data_stream* data = &Node().data;
1688 
1689 	// is the data stream already large enough to hold the new size?
1690 	// (can be the case with preallocated blocks)
1691 	if (size < data->MaxDirectRange()
1692 		|| size < data->MaxIndirectRange()
1693 		|| size < data->MaxDoubleIndirectRange()) {
1694 		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1695 		return B_OK;
1696 	}
1697 
1698 	// how many bytes are still needed? (unused ranges are always zero)
1699 	uint16 minimum = 1;
1700 	off_t bytes;
1701 	if (data->Size() < data->MaxDoubleIndirectRange()) {
1702 		bytes = size - data->MaxDoubleIndirectRange();
1703 		// The double indirect range can only handle multiples of
1704 		// its base length
1705 		minimum = data->double_indirect.Length();
1706 	} else if (data->Size() < data->MaxIndirectRange())
1707 		bytes = size - data->MaxIndirectRange();
1708 	else if (data->Size() < data->MaxDirectRange())
1709 		bytes = size - data->MaxDirectRange();
1710 	else {
1711 		// no preallocation left to be used
1712 		bytes = size - data->Size();
1713 		if (data->MaxDoubleIndirectRange() > 0)
1714 			minimum = data->double_indirect.Length();
1715 	}
1716 
1717 	// do we have enough free blocks on the disk?
1718 	off_t blocksNeeded = (bytes + fVolume->BlockSize() - 1)
1719 		>> fVolume->BlockShift();
1720 	if (blocksNeeded > fVolume->FreeBlocks())
1721 		return B_DEVICE_FULL;
1722 
1723 	off_t blocksRequested = blocksNeeded;
1724 		// because of preallocations and partial allocations, the number of
1725 		// blocks we need to allocate may be different from the one we request
1726 		// from the block allocator
1727 
1728 	// Should we preallocate some blocks?
1729 	// Attributes, attribute directories, and long symlinks usually won't get
1730 	// that big, and should stay close to the inode - preallocating could be
1731 	// counterproductive.
1732 	// Also, if free disk space is tight, don't preallocate.
1733 	if (!IsAttribute() && !IsAttributeDirectory() && !IsSymLink()
1734 		&& fVolume->FreeBlocks() > 128) {
1735 		off_t roundTo = 0;
1736 		if (IsFile()) {
1737 			// Request preallocated blocks depending on the file size and growth
1738 			if (size < 1 * 1024 * 1024 && bytes < 512 * 1024) {
1739 				// Preallocate 64 KB for file sizes <1 MB and grow rates <512 KB
1740 				roundTo = 65536 >> fVolume->BlockShift();
1741 			} else if (size < 32 * 1024 * 1024 && bytes <= 1 * 1024 * 1024) {
1742 				// Preallocate 512 KB for file sizes between 1 MB and 32 MB, and
1743 				// grow rates smaller than 1 MB
1744 				roundTo = (512 * 1024) >> fVolume->BlockShift();
1745 			} else {
1746 				// Preallocate 1/16 of the file size (ie. 4 MB for 64 MB,
1747 				// 64 MB for 1 GB)
1748 				roundTo = size >> (fVolume->BlockShift() + 4);
1749 			}
1750 		} else if (IsIndex()) {
1751 			// Always preallocate 64 KB for index directories
1752 			roundTo = 65536 >> fVolume->BlockShift();
1753 		} else {
1754 			// Preallocate only 4 KB - directories only get trimmed when their
1755 			// vnode is flushed, which might not happen very often.
1756 			roundTo = 4096 >> fVolume->BlockShift();
1757 		}
1758 		if (roundTo > 1) {
1759 			// Round to next "roundTo" block count
1760 			blocksRequested = ((blocksNeeded + roundTo) / roundTo) * roundTo;
1761 		}
1762 	}
1763 
1764 	while (blocksNeeded > 0) {
1765 		// the requested blocks do not need to be returned with a
1766 		// single allocation, so we need to iterate until we have
1767 		// enough blocks allocated
1768 		if (minimum > 1) {
1769 			// make sure that "blocks" is a multiple of minimum
1770 			blocksRequested = round_up(blocksRequested, minimum);
1771 		}
1772 
1773 		block_run run;
1774 		status_t status = fVolume->Allocate(transaction, this, blocksRequested,
1775 			run, minimum);
1776 		if (status != B_OK)
1777 			return status;
1778 
1779 		// okay, we have the needed blocks, so just distribute them to the
1780 		// different ranges of the stream (direct, indirect & double indirect)
1781 
1782 		blocksNeeded -= run.Length();
1783 		// don't preallocate if the first allocation was already too small
1784 		blocksRequested = blocksNeeded;
1785 
1786 		// Direct block range
1787 
1788 		if (data->Size() <= data->MaxDirectRange()) {
1789 			// let's try to put them into the direct block range
1790 			int32 free = 0;
1791 			for (; free < NUM_DIRECT_BLOCKS; free++) {
1792 				if (data->direct[free].IsZero())
1793 					break;
1794 			}
1795 
1796 			if (free < NUM_DIRECT_BLOCKS) {
1797 				// can we merge the last allocated run with the new one?
1798 				int32 last = free - 1;
1799 				if (free > 0 && data->direct[last].MergeableWith(run)) {
1800 					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(
1801 						data->direct[last].Length() + run.Length());
1802 				} else
1803 					data->direct[free] = run;
1804 
1805 				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(
1806 					data->MaxDirectRange()
1807 					+ run.Length() * fVolume->BlockSize());
1808 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1809 					? data->max_direct_range : size);
1810 				continue;
1811 			}
1812 		}
1813 
1814 		// Indirect block range
1815 
1816 		if (data->Size() <= data->MaxIndirectRange()
1817 			|| !data->MaxIndirectRange()) {
1818 			CachedBlock cached(fVolume);
1819 			block_run* runs = NULL;
1820 			uint32 free = 0;
1821 			off_t block;
1822 
1823 			// if there is no indirect block yet, create one
1824 			if (data->indirect.IsZero()) {
1825 				status = _AllocateBlockArray(transaction, data->indirect,
1826 					NUM_ARRAY_BLOCKS, true);
1827 				if (status != B_OK)
1828 					return status;
1829 
1830 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1831 					data->MaxDirectRange());
1832 				// insert the block_run in the first block
1833 				runs = (block_run*)cached.SetTo(data->indirect);
1834 			} else {
1835 				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1836 				block = fVolume->ToBlock(data->indirect);
1837 
1838 				// search first empty entry
1839 				int32 i = 0;
1840 				for (; i < data->indirect.Length(); i++) {
1841 					if ((runs = (block_run*)cached.SetTo(block + i)) == NULL)
1842 						return B_IO_ERROR;
1843 
1844 					for (free = 0; free < numberOfRuns; free++)
1845 						if (runs[free].IsZero())
1846 							break;
1847 
1848 					if (free < numberOfRuns)
1849 						break;
1850 				}
1851 				if (i == data->indirect.Length())
1852 					runs = NULL;
1853 			}
1854 
1855 			if (runs != NULL) {
1856 				// try to insert the run to the last one - note that this
1857 				// doesn't take block borders into account, so it could be
1858 				// further optimized
1859 				cached.MakeWritable(transaction);
1860 
1861 				int32 last = free - 1;
1862 				if (free > 0 && runs[last].MergeableWith(run)) {
1863 					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(
1864 						runs[last].Length() + run.Length());
1865 				} else
1866 					runs[free] = run;
1867 
1868 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1869 					data->MaxIndirectRange()
1870 					+ (run.Length() << fVolume->BlockShift()));
1871 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1872 					? data->MaxIndirectRange() : size);
1873 				continue;
1874 			}
1875 		}
1876 
1877 		// Double indirect block range
1878 
1879 		if (data->Size() <= data->MaxDoubleIndirectRange()
1880 			|| !data->max_double_indirect_range) {
1881 			// We make sure here that we have this minimum allocated, so if
1882 			// the allocation succeeds, we don't run into an endless loop.
1883 			if (!data->max_double_indirect_range)
1884 				minimum = _DoubleIndirectBlockLength();
1885 			else
1886 				minimum = data->double_indirect.Length();
1887 
1888 			if ((run.Length() % minimum) != 0) {
1889 				// The number of allocated blocks isn't a multiple of 'minimum',
1890 				// so we have to change this. This can happen the first time the
1891 				// stream grows into the double indirect range.
1892 				// First, free the remaining blocks that don't fit into this
1893 				// multiple.
1894 				int32 rest = run.Length() % minimum;
1895 				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1896 
1897 				status = fVolume->Free(transaction,
1898 					block_run::Run(run.AllocationGroup(),
1899 					run.Start() + run.Length(), rest));
1900 				if (status != B_OK)
1901 					return status;
1902 
1903 				blocksNeeded += rest;
1904 				blocksRequested = round_up(blocksNeeded, minimum);
1905 
1906 				// Are there any blocks left in the run? If not, allocate
1907 				// a new one
1908 				if (run.length == 0)
1909 					continue;
1910 			}
1911 
1912 			// if there is no double indirect block yet, create one
1913 			if (data->double_indirect.IsZero()) {
1914 				status = _AllocateBlockArray(transaction,
1915 					data->double_indirect, _DoubleIndirectBlockLength());
1916 				if (status != B_OK)
1917 					return status;
1918 
1919 				data->max_double_indirect_range = data->max_indirect_range;
1920 			}
1921 
1922 			// calculate the index where to insert the new blocks
1923 
1924 			int32 runsPerBlock;
1925 			int32 directSize;
1926 			int32 indirectSize;
1927 			get_double_indirect_sizes(data->double_indirect.Length(),
1928 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1929 
1930 			off_t start = data->MaxDoubleIndirectRange()
1931 				- data->MaxIndirectRange();
1932 			int32 indirectIndex = start / indirectSize;
1933 			int32 index = (start % indirectSize) / directSize;
1934 			int32 runsPerArray = runsPerBlock * minimum;
1935 
1936 			// distribute the blocks to the array and allocate
1937 			// new array blocks when needed
1938 
1939 			CachedBlock cached(fVolume);
1940 			CachedBlock cachedDirect(fVolume);
1941 			block_run* array = NULL;
1942 			uint32 runLength = run.Length();
1943 
1944 			while (run.length != 0) {
1945 				// get the indirect array block
1946 				if (array == NULL) {
1947 					uint32 block = indirectIndex / runsPerBlock;
1948 					if (block >= minimum)
1949 						return EFBIG;
1950 
1951 					array = (block_run*)cached.SetTo(fVolume->ToBlock(
1952 						data->double_indirect) + block);
1953 					if (array == NULL)
1954 						return B_IO_ERROR;
1955 				}
1956 
1957 				do {
1958 					// do we need a new array block?
1959 					if (array[indirectIndex % runsPerBlock].IsZero()) {
1960 						cached.MakeWritable(transaction);
1961 
1962 						status = _AllocateBlockArray(transaction,
1963 							array[indirectIndex % runsPerBlock],
1964 							data->double_indirect.Length());
1965 						if (status != B_OK)
1966 							return status;
1967 					}
1968 
1969 					block_run* runs = (block_run*)cachedDirect.SetToWritable(
1970 						transaction, fVolume->ToBlock(array[indirectIndex
1971 							% runsPerBlock]) + index / runsPerBlock);
1972 					if (runs == NULL)
1973 						return B_IO_ERROR;
1974 
1975 					do {
1976 						// insert the block_run into the array
1977 						runs[index % runsPerBlock] = run;
1978 						runs[index % runsPerBlock].length
1979 							= HOST_ENDIAN_TO_BFS_INT16(minimum);
1980 
1981 						// alter the remaining block_run
1982 						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start()
1983 							+ minimum);
1984 						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
1985 							- minimum);
1986 					} while ((++index % runsPerBlock) != 0 && run.length);
1987 				} while ((index % runsPerArray) != 0 && run.length);
1988 
1989 				if (index == runsPerArray)
1990 					index = 0;
1991 				if (++indirectIndex % runsPerBlock == 0) {
1992 					array = NULL;
1993 					index = 0;
1994 				}
1995 			}
1996 
1997 			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1998 				data->MaxDoubleIndirectRange()
1999 				+ (runLength << fVolume->BlockShift()));
2000 			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(
2001 				data->max_double_indirect_range) : size;
2002 
2003 			continue;
2004 		}
2005 
2006 		RETURN_ERROR(EFBIG);
2007 	}
2008 	// update the size of the data stream
2009 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2010 
2011 	return B_OK;
2012 }
2013 
2014 
2015 size_t
2016 Inode::_DoubleIndirectBlockLength() const
2017 {
2018 	if (fVolume->BlockSize() > DOUBLE_INDIRECT_ARRAY_SIZE)
2019 		return 1;
2020 
2021 	return DOUBLE_INDIRECT_ARRAY_SIZE / fVolume->BlockSize();
2022 }
2023 
2024 
2025 /*!	Frees the statically sized array of the double indirect part of a data
2026 	stream.
2027 */
2028 status_t
2029 Inode::_FreeStaticStreamArray(Transaction& transaction, int32 level,
2030 	block_run run, off_t size, off_t offset, off_t& max)
2031 {
2032 	int32 indirectSize;
2033 	if (level == 0) {
2034 		indirectSize = double_indirect_max_indirect_size(run.Length(),
2035 			fVolume->BlockSize());
2036 	} else {
2037 		indirectSize = double_indirect_max_direct_size(run.Length(),
2038 			fVolume->BlockSize());
2039 	}
2040 
2041 	off_t start;
2042 	if (size > offset)
2043 		start = size - offset;
2044 	else
2045 		start = 0;
2046 
2047 	int32 index = start / indirectSize;
2048 	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
2049 
2050 	CachedBlock cached(fVolume);
2051 	off_t blockNumber = fVolume->ToBlock(run);
2052 
2053 	// set the file offset to the current block run
2054 	offset += (off_t)index * indirectSize;
2055 
2056 	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
2057 		block_run* array = (block_run*)cached.SetToWritable(transaction,
2058 			blockNumber + i);
2059 		if (array == NULL)
2060 			RETURN_ERROR(B_ERROR);
2061 
2062 		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
2063 			if (array[index].IsZero()) {
2064 				// we also want to break out of the outer loop
2065 				i = run.Length();
2066 				break;
2067 			}
2068 
2069 			status_t status = B_OK;
2070 			if (level == 0) {
2071 				status = _FreeStaticStreamArray(transaction, 1, array[index],
2072 					size, offset, max);
2073 			} else if (offset >= size)
2074 				status = fVolume->Free(transaction, array[index]);
2075 			else
2076 				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
2077 
2078 			if (status < B_OK)
2079 				RETURN_ERROR(status);
2080 
2081 			if (offset >= size)
2082 				array[index].SetTo(0, 0, 0);
2083 
2084 			offset += indirectSize;
2085 		}
2086 		index = 0;
2087 	}
2088 	return B_OK;
2089 }
2090 
2091 
2092 /*!	Frees all block_runs in the array which come after the specified size.
2093 	It also trims the last block_run that contain the size.
2094 	"offset" and "max" are maintained until the last block_run that doesn't
2095 	have to be freed - after this, the values won't be correct anymore, but
2096 	will still assure correct function for all subsequent calls.
2097 	"max" is considered to be in file system byte order.
2098 */
2099 status_t
2100 Inode::_FreeStreamArray(Transaction& transaction, block_run* array,
2101 	uint32 arrayLength, off_t size, off_t& offset, off_t& max)
2102 {
2103 	PRINT(("FreeStreamArray: arrayLength %lu, size %Ld, offset %Ld, max %Ld\n",
2104 		arrayLength, size, offset, max));
2105 
2106 	off_t newOffset = offset;
2107 	uint32 i = 0;
2108 	for (; i < arrayLength; i++, offset = newOffset) {
2109 		if (array[i].IsZero())
2110 			break;
2111 
2112 		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
2113 		if (newOffset <= size)
2114 			continue;
2115 
2116 		block_run run = array[i];
2117 
2118 		// determine the block_run to be freed
2119 		if (newOffset > size && offset < size) {
2120 			// free partial block_run (and update the original block_run)
2121 			run.start = HOST_ENDIAN_TO_BFS_INT16(array[i].Start()
2122 				+ ((size + fVolume->BlockSize() - 1 - offset)
2123 					>> fVolume->BlockShift()));
2124 				// round to next block
2125 			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2126 				- array[i].Start());
2127 			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2128 				- array[i].Length());
2129 
2130 			if (run.length == 0)
2131 				continue;
2132 
2133 			// update maximum range
2134 			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length()
2135 				<< fVolume->BlockShift()));
2136 		} else {
2137 			// free the whole block_run
2138 			array[i].SetTo(0, 0, 0);
2139 
2140 			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
2141 				max = HOST_ENDIAN_TO_BFS_INT64(offset);
2142 		}
2143 
2144 		if (fVolume->Free(transaction, run) < B_OK)
2145 			return B_IO_ERROR;
2146 	}
2147 	return B_OK;
2148 }
2149 
2150 
2151 status_t
2152 Inode::_ShrinkStream(Transaction& transaction, off_t size)
2153 {
2154 	data_stream* data = &Node().data;
2155 	status_t status;
2156 
2157 	if (data->MaxDoubleIndirectRange() > size) {
2158 		off_t* maxDoubleIndirect = &data->max_double_indirect_range;
2159 			// gcc 4 work-around: "error: cannot bind packed field
2160 			// 'data->data_stream::max_double_indirect_range' to 'off_t&'"
2161 		status = _FreeStaticStreamArray(transaction, 0, data->double_indirect,
2162 			size, data->MaxIndirectRange(), *maxDoubleIndirect);
2163 		if (status != B_OK)
2164 			return status;
2165 
2166 		if (size <= data->MaxIndirectRange()) {
2167 			fVolume->Free(transaction, data->double_indirect);
2168 			data->double_indirect.SetTo(0, 0, 0);
2169 			data->max_double_indirect_range = 0;
2170 		}
2171 	}
2172 
2173 	if (data->MaxIndirectRange() > size) {
2174 		CachedBlock cached(fVolume);
2175 		off_t block = fVolume->ToBlock(data->indirect);
2176 		off_t offset = data->MaxDirectRange();
2177 
2178 		for (int32 i = 0; i < data->indirect.Length(); i++) {
2179 			block_run* array = (block_run*)cached.SetToWritable(transaction,
2180 				block + i);
2181 			if (array == NULL)
2182 				break;
2183 
2184 			off_t* maxIndirect = &data->max_indirect_range;
2185 				// gcc 4 work-around: "error: cannot bind packed field
2186 				// 'data->data_stream::max_indirect_range' to 'off_t&'"
2187 			if (_FreeStreamArray(transaction, array, fVolume->BlockSize()
2188 					/ sizeof(block_run), size, offset, *maxIndirect) != B_OK)
2189 				return B_IO_ERROR;
2190 		}
2191 		if (data->max_direct_range == data->max_indirect_range) {
2192 			fVolume->Free(transaction, data->indirect);
2193 			data->indirect.SetTo(0, 0, 0);
2194 			data->max_indirect_range = 0;
2195 		}
2196 	}
2197 
2198 	if (data->MaxDirectRange() > size) {
2199 		off_t offset = 0;
2200 		off_t *maxDirect = &data->max_direct_range;
2201 			// gcc 4 work-around: "error: cannot bind packed field
2202 			// 'data->data_stream::max_direct_range' to 'off_t&'"
2203 		status = _FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS,
2204 			size, offset, *maxDirect);
2205 		if (status < B_OK)
2206 			return status;
2207 	}
2208 
2209 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2210 	return B_OK;
2211 }
2212 
2213 
2214 status_t
2215 Inode::SetFileSize(Transaction& transaction, off_t size)
2216 {
2217 	if (size < 0)
2218 		return B_BAD_VALUE;
2219 
2220 	off_t oldSize = Size();
2221 
2222 	if (size == oldSize)
2223 		return B_OK;
2224 
2225 	T(Resize(this, oldSize, size, false));
2226 
2227 	// should the data stream grow or shrink?
2228 	status_t status;
2229 	if (size > oldSize) {
2230 		status = _GrowStream(transaction, size);
2231 		if (status < B_OK) {
2232 			// if the growing of the stream fails, the whole operation
2233 			// fails, so we should shrink the stream to its former size
2234 			_ShrinkStream(transaction, oldSize);
2235 		}
2236 	} else
2237 		status = _ShrinkStream(transaction, size);
2238 
2239 	if (status < B_OK)
2240 		return status;
2241 
2242 	file_cache_set_size(FileCache(), size);
2243 	file_map_set_size(Map(), size);
2244 
2245 	return WriteBack(transaction);
2246 }
2247 
2248 
2249 status_t
2250 Inode::Append(Transaction& transaction, off_t bytes)
2251 {
2252 	return SetFileSize(transaction, Size() + bytes);
2253 }
2254 
2255 
2256 /*!	Checks whether or not this inode's data stream needs to be trimmed
2257 	because of an earlier preallocation.
2258 	Returns true if there are any blocks to be trimmed.
2259 */
2260 bool
2261 Inode::NeedsTrimming() const
2262 {
2263 	// We never trim preallocated index blocks to make them grow as smooth as
2264 	// possible. There are only few indices anyway, so this doesn't hurt.
2265 	// Also, if an inode is already in deleted state, we don't bother trimming
2266 	// it.
2267 	if (IsIndex() || IsDeleted()
2268 		|| (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0))
2269 		return false;
2270 
2271 	off_t roundedSize = round_up(Size(), fVolume->BlockSize());
2272 
2273 	return Node().data.MaxDirectRange() > roundedSize
2274 		|| Node().data.MaxIndirectRange() > roundedSize
2275 		|| Node().data.MaxDoubleIndirectRange() > roundedSize;
2276 }
2277 
2278 
2279 status_t
2280 Inode::TrimPreallocation(Transaction& transaction)
2281 {
2282 	T(Resize(this, max_c(Node().data.MaxDirectRange(),
2283 		Node().data.MaxIndirectRange()), Size(), true));
2284 
2285 	status_t status = _ShrinkStream(transaction, Size());
2286 	if (status < B_OK)
2287 		return status;
2288 
2289 	return WriteBack(transaction);
2290 }
2291 
2292 
2293 //!	Frees the file's data stream and removes all attributes
2294 status_t
2295 Inode::Free(Transaction& transaction)
2296 {
2297 	FUNCTION();
2298 
2299 	// Perhaps there should be an implementation of Inode::ShrinkStream() that
2300 	// just frees the data_stream, but doesn't change the inode (since it is
2301 	// freed anyway) - that would make an undelete command possible
2302 	if (!IsSymLink() || (Flags() & INODE_LONG_SYMLINK) != 0) {
2303 		status_t status = SetFileSize(transaction, 0);
2304 		if (status < B_OK)
2305 			return status;
2306 	}
2307 
2308 	// Free all attributes, and remove their indices
2309 	{
2310 		// We have to limit the scope of AttributeIterator, so that its
2311 		// destructor is not called after the inode is deleted
2312 		AttributeIterator iterator(this);
2313 
2314 		char name[B_FILE_NAME_LENGTH];
2315 		uint32 type;
2316 		size_t length;
2317 		ino_t id;
2318 		while (iterator.GetNext(name, &length, &type, &id) == B_OK) {
2319 			RemoveAttribute(transaction, name);
2320 		}
2321 	}
2322 
2323 	if (WriteBack(transaction) < B_OK)
2324 		return B_IO_ERROR;
2325 
2326 	return fVolume->Free(transaction, BlockRun());
2327 }
2328 
2329 
2330 //!	Synchronizes (writes back to disk) the file stream of the inode.
2331 status_t
2332 Inode::Sync()
2333 {
2334 	if (FileCache())
2335 		return file_cache_sync(FileCache());
2336 
2337 	// We may also want to flush the attribute's data stream to
2338 	// disk here... (do we?)
2339 
2340 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0)
2341 		return B_OK;
2342 
2343 	InodeReadLocker locker(this);
2344 
2345 	data_stream* data = &Node().data;
2346 	status_t status = B_OK;
2347 
2348 	// flush direct range
2349 
2350 	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
2351 		if (data->direct[i].IsZero())
2352 			return B_OK;
2353 
2354 		status = block_cache_sync_etc(fVolume->BlockCache(),
2355 			fVolume->ToBlock(data->direct[i]), data->direct[i].Length());
2356 		if (status != B_OK)
2357 			return status;
2358 	}
2359 
2360 	// flush indirect range
2361 
2362 	if (data->max_indirect_range == 0)
2363 		return B_OK;
2364 
2365 	CachedBlock cached(fVolume);
2366 	off_t block = fVolume->ToBlock(data->indirect);
2367 	int32 count = fVolume->BlockSize() / sizeof(block_run);
2368 
2369 	for (int32 j = 0; j < data->indirect.Length(); j++) {
2370 		block_run* runs = (block_run*)cached.SetTo(block + j);
2371 		if (runs == NULL)
2372 			break;
2373 
2374 		for (int32 i = 0; i < count; i++) {
2375 			if (runs[i].IsZero())
2376 				return B_OK;
2377 
2378 			status = block_cache_sync_etc(fVolume->BlockCache(),
2379 				fVolume->ToBlock(runs[i]), runs[i].Length());
2380 			if (status != B_OK)
2381 				return status;
2382 		}
2383 	}
2384 
2385 	// flush double indirect range
2386 
2387 	if (data->max_double_indirect_range == 0)
2388 		return B_OK;
2389 
2390 	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
2391 
2392 	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
2393 		block_run* indirectRuns = (block_run*)cached.SetTo(indirectBlock + l);
2394 		if (indirectRuns == NULL)
2395 			return B_FILE_ERROR;
2396 
2397 		CachedBlock directCached(fVolume);
2398 
2399 		for (int32 k = 0; k < count; k++) {
2400 			if (indirectRuns[k].IsZero())
2401 				return B_OK;
2402 
2403 			block = fVolume->ToBlock(indirectRuns[k]);
2404 			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
2405 				block_run* runs = (block_run*)directCached.SetTo(block + j);
2406 				if (runs == NULL)
2407 					return B_FILE_ERROR;
2408 
2409 				for (int32 i = 0; i < count; i++) {
2410 					if (runs[i].IsZero())
2411 						return B_OK;
2412 
2413 					// TODO: combine single block_runs to bigger ones when
2414 					// they are adjacent
2415 					status = block_cache_sync_etc(fVolume->BlockCache(),
2416 						fVolume->ToBlock(runs[i]), runs[i].Length());
2417 					if (status != B_OK)
2418 						return status;
2419 				}
2420 			}
2421 		}
2422 	}
2423 	return B_OK;
2424 }
2425 
2426 
2427 //	#pragma mark - TransactionListener implementation
2428 
2429 
2430 void
2431 Inode::TransactionDone(bool success)
2432 {
2433 	if (!success) {
2434 		// revert any changes made to the cached bfs_inode
2435 		UpdateNodeFromDisk();
2436 	}
2437 }
2438 
2439 
2440 void
2441 Inode::RemovedFromTransaction()
2442 {
2443 	Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
2444 
2445 	// See AddInode() why we do this here
2446 	if ((Flags() & INODE_DELETED) != 0)
2447 		fVolume->RemovedInodes().Add(this);
2448 
2449 	rw_lock_write_unlock(&Lock());
2450 
2451 	if (!fVolume->IsInitializing())
2452 		put_vnode(fVolume->FSVolume(), ID());
2453 }
2454 
2455 
2456 //	#pragma mark - creation/deletion
2457 
2458 
2459 status_t
2460 Inode::Remove(Transaction& transaction, const char* name, ino_t* _id,
2461 	bool isDirectory, bool force)
2462 {
2463 	if (fTree == NULL)
2464 		RETURN_ERROR(B_BAD_VALUE);
2465 
2466 	WriteLockInTransaction(transaction);
2467 
2468 	// does the file even exist?
2469 	off_t id;
2470 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) < B_OK)
2471 		return B_ENTRY_NOT_FOUND;
2472 
2473 	if (_id)
2474 		*_id = id;
2475 
2476 	Vnode vnode(fVolume, id);
2477 	Inode* inode;
2478 	status_t status = vnode.Get(&inode);
2479 	if (status < B_OK) {
2480 		REPORT_ERROR(status);
2481 		return B_ENTRY_NOT_FOUND;
2482 	}
2483 
2484 	T(Remove(inode, name));
2485 	inode->WriteLockInTransaction(transaction);
2486 
2487 	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
2488 	// bit is set for indices in BFS, not for attribute directories) - but you
2489 	// should really be able to do whatever you want with your indices
2490 	// without having to remove all files first :)
2491 	if (!inode->IsIndex() && !force) {
2492 		// if it's not of the correct type, don't delete it!
2493 		if (inode->IsContainer() != isDirectory)
2494 			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
2495 
2496 		// only delete empty directories
2497 		if (isDirectory && !inode->IsEmpty())
2498 			return B_DIRECTORY_NOT_EMPTY;
2499 	}
2500 
2501 	// remove_vnode() allows the inode to be accessed until the last put_vnode()
2502 	status = remove_vnode(fVolume->FSVolume(), id);
2503 	if (status != B_OK)
2504 		return status;
2505 
2506 	if (fTree->Remove(transaction, name, id) != B_OK && !force) {
2507 		unremove_vnode(fVolume->FSVolume(), id);
2508 		RETURN_ERROR(B_ERROR);
2509 	}
2510 
2511 #ifdef DEBUG
2512 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) == B_OK) {
2513 		DIE(("deleted entry still there"));
2514 	}
2515 #endif
2516 
2517 	ContainerContentsChanged(transaction);
2518 
2519 	// update the inode, so that no one will ever doubt it's deleted :-)
2520 	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
2521 	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
2522 
2523 	// In balance to the Inode::Create() method, the main indices
2524 	// are updated here (name, size, & last_modified)
2525 
2526 	Index index(fVolume);
2527 	if (inode->InNameIndex()) {
2528 		index.RemoveName(transaction, name, inode);
2529 			// If removing from the index fails, it is not regarded as a
2530 			// fatal error and will not be reported back!
2531 			// Deleted inodes won't be visible in queries anyway.
2532 	}
2533 
2534 	if (inode->InSizeIndex())
2535 		index.RemoveSize(transaction, inode);
2536 	if (inode->InLastModifiedIndex())
2537 		index.RemoveLastModified(transaction, inode);
2538 
2539 	return inode->WriteBack(transaction);
2540 }
2541 
2542 
2543 /*!	Creates the inode with the specified \a parent directory, and automatically
2544 	adds the created inode to that parent directory. If an attribute directory
2545 	is created, it will also automatically  be added to the \a parent inode as
2546 	such. However, the indices root node, and the regular root node won't be
2547 	added to the super block.
2548 	It will also create the initial B+tree for the inode if it's a directory
2549 	of any kind.
2550 	\a name may be \c NULL, but only if no \a parent is given.
2551 	If the "_id" or "_inode" variable is given and non-NULL to store the
2552 	inode's ID, the inode stays locked - you have to call put_vnode() if you
2553 	don't use it anymore.
2554 	If the node already exists, this method will fail if \c O_EXCL is set, or
2555 	it's a directory or a symlink. Otherwise, it will just be returned.
2556 	If \c O_TRUNC has been specified, the file will also be truncated.
2557 */
2558 status_t
2559 Inode::Create(Transaction& transaction, Inode* parent, const char* name,
2560 	int32 mode, int openMode, uint32 type, bool* _created, ino_t* _id,
2561 	Inode** _inode, fs_vnode_ops* vnodeOps, uint32 publishFlags)
2562 {
2563 	FUNCTION_START(("name = %s, mode = %ld\n", name, mode));
2564 
2565 	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
2566 	Volume* volume = transaction.GetVolume();
2567 	BPlusTree* tree = NULL;
2568 
2569 	if (parent != NULL && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
2570 		// check if the file already exists in the directory
2571 		tree = parent->Tree();
2572 	}
2573 
2574 	if (parent != NULL) {
2575 		// the parent directory is locked during the whole inode creation
2576 		parent->WriteLockInTransaction(transaction);
2577 	}
2578 
2579 	if (parent != NULL && !volume->IsInitializing() && parent->IsContainer()) {
2580 		// don't create anything in removed directories
2581 		bool removed;
2582 		if (get_vnode_removed(volume->FSVolume(), parent->ID(), &removed)
2583 				== B_OK && removed) {
2584 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
2585 		}
2586 	}
2587 
2588 	if (tree != NULL) {
2589 		// Does the file already exist?
2590 		off_t offset;
2591 		if (tree->Find((uint8*)name, (uint16)strlen(name), &offset) == B_OK) {
2592 			// Return if the file should be a directory/link or opened in
2593 			// exclusive mode
2594 			if (S_ISDIR(mode) || S_ISLNK(mode) || (openMode & O_EXCL) != 0)
2595 				return B_FILE_EXISTS;
2596 
2597 			Vnode vnode(volume, offset);
2598 			Inode* inode;
2599 			status_t status = vnode.Get(&inode);
2600 			if (status != B_OK) {
2601 				REPORT_ERROR(status);
2602 				return B_ENTRY_NOT_FOUND;
2603 			}
2604 
2605 			if (inode->IsDirectory() && (openMode & O_RWMASK) != O_RDONLY)
2606 				return B_IS_A_DIRECTORY;
2607 			if ((openMode & O_DIRECTORY) != 0 && !inode->IsDirectory())
2608 				return B_NOT_A_DIRECTORY;
2609 
2610 			// we want to open the file, so we should have the rights to do so
2611 			if (inode->CheckPermissions(open_mode_to_access(openMode)
2612 					| ((openMode & O_TRUNC) != 0 ? W_OK : 0)) != B_OK)
2613 				return B_NOT_ALLOWED;
2614 
2615 			if ((openMode & O_TRUNC) != 0) {
2616 				// truncate the existing file
2617 				inode->WriteLockInTransaction(transaction);
2618 
2619 				status_t status = inode->SetFileSize(transaction, 0);
2620 				if (status == B_OK)
2621 					status = inode->WriteBack(transaction);
2622 
2623 				if (status != B_OK)
2624 					return status;
2625 			}
2626 
2627 			if (_created)
2628 				*_created = false;
2629 			if (_id)
2630 				*_id = inode->ID();
2631 			if (_inode)
2632 				*_inode = inode;
2633 
2634 			// Only keep the vnode in memory if the _id or _inode pointer is
2635 			// provided
2636 			if (_id != NULL || _inode != NULL)
2637 				vnode.Keep();
2638 
2639 			return B_OK;
2640 		}
2641 	} else if (parent != NULL && (mode & S_ATTR_DIR) == 0) {
2642 		return B_BAD_VALUE;
2643 	} else if ((openMode & O_DIRECTORY) != 0) {
2644 		// TODO: we might need to return B_NOT_A_DIRECTORY here
2645 		return B_ENTRY_NOT_FOUND;
2646 	}
2647 
2648 	status_t status;
2649 
2650 	// do we have the power to create new files at all?
2651 	if (parent != NULL && (status = parent->CheckPermissions(W_OK)) != B_OK)
2652 		RETURN_ERROR(status);
2653 
2654 	// allocate space for the new inode
2655 	InodeAllocator allocator(transaction);
2656 	block_run run;
2657 	Inode* inode;
2658 	status = allocator.New(&parentRun, mode, run, vnodeOps, &inode);
2659 	if (status < B_OK)
2660 		return status;
2661 
2662 	T(Create(inode, parent, name, mode, openMode, type));
2663 
2664 	// Initialize the parts of the bfs_inode structure that
2665 	// InodeAllocator::New() hasn't touched yet
2666 
2667 	bfs_inode* node = &inode->Node();
2668 
2669 	if (parent == NULL) {
2670 		// we set the parent to itself in this case
2671 		// (only happens for the root and indices node)
2672 		node->parent = run;
2673 	} else
2674 		node->parent = parentRun;
2675 
2676 	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
2677 	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent
2678 		? parent->Node().GroupID() : getegid());
2679 		// the group ID is inherited from the parent, if available
2680 
2681 	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
2682 
2683 	inode->WriteBack(transaction);
2684 		// make sure the initialized node is available to others
2685 
2686 	// only add the name to regular files, directories, or symlinks
2687 	// don't add it to attributes, or indices
2688 	if (tree && inode->IsRegularNode()
2689 		&& inode->SetName(transaction, name) != B_OK)
2690 		return B_ERROR;
2691 
2692 	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2693 	// a standard directory for files - not for attributes or indices)
2694 	if (inode->IsContainer()) {
2695 		status = allocator.CreateTree();
2696 		if (status != B_OK)
2697 			return status;
2698 	}
2699 
2700 	// Add a link to the inode from the parent, depending on its type
2701 	// (the vnode is not published yet, so it is safe to make the inode
2702 	// accessable to the file system here)
2703 	if (tree != NULL) {
2704 		status = tree->Insert(transaction, name, inode->ID());
2705 	} else if (parent != NULL && (mode & S_ATTR_DIR) != 0) {
2706 		parent->Attributes() = run;
2707 		status = parent->WriteBack(transaction);
2708 	}
2709 
2710 	// Note, we only care if the inode could be made accessable for the
2711 	// two cases above; the root node or the indices root node must
2712 	// handle this case on their own (or other cases where "parent" is
2713 	// NULL)
2714 	if (status != B_OK)
2715 		RETURN_ERROR(status);
2716 
2717 	// Update the main indices (name, size & last_modified)
2718 	// (live queries might want to access us after this)
2719 
2720 	Index index(volume);
2721 	if (inode->InNameIndex() && name != NULL) {
2722 		// the name index only contains regular files
2723 		// (but not the root node where name == NULL)
2724 		status = index.InsertName(transaction, name, inode);
2725 		if (status != B_OK && status != B_BAD_INDEX) {
2726 			// We have to remove the node from the parent at this point,
2727 			// because the InodeAllocator destructor can't handle this
2728 			// case (and if it fails, we can't do anything about it...)
2729 			if (tree)
2730 				tree->Remove(transaction, name, inode->ID());
2731 			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2732 				parent->Node().attributes.SetTo(0, 0, 0);
2733 
2734 			RETURN_ERROR(status);
2735 		}
2736 	}
2737 
2738 	if (parent != NULL && parent->IsContainer())
2739 		parent->ContainerContentsChanged(transaction);
2740 
2741 	inode->UpdateOldLastModified();
2742 
2743 	// The "size" & "last_modified" indices don't contain directories.
2744 	// If adding to these indices fails, the inode creation will not be
2745 	// harmed; they are considered less important than the "name" index.
2746 	if (inode->InSizeIndex())
2747 		index.InsertSize(transaction, inode);
2748 	if (inode->InLastModifiedIndex())
2749 		index.InsertLastModified(transaction, inode);
2750 
2751 	if (inode->NeedsFileCache()) {
2752 		inode->SetFileCache(file_cache_create(volume->ID(), inode->ID(),
2753 			inode->Size()));
2754 		inode->SetMap(file_map_create(volume->ID(), inode->ID(),
2755 			inode->Size()));
2756 
2757 		if (inode->FileCache() == NULL || inode->Map() == NULL)
2758 			return B_NO_MEMORY;
2759 	}
2760 
2761 	// Everything worked well until this point, we have a fully
2762 	// initialized inode, and we want to keep it
2763 	allocator.Keep(vnodeOps, publishFlags);
2764 
2765 	if (_created)
2766 		*_created = true;
2767 	if (_id != NULL)
2768 		*_id = inode->ID();
2769 	if (_inode != NULL)
2770 		*_inode = inode;
2771 
2772 	// if either _id or _inode is passed, we will keep the inode locked
2773 	if (_id == NULL && _inode == NULL)
2774 		put_vnode(volume->FSVolume(), inode->ID());
2775 
2776 	return B_OK;
2777 }
2778 
2779 
2780 //	#pragma mark - AttributeIterator
2781 
2782 
2783 AttributeIterator::AttributeIterator(Inode* inode)
2784 	:
2785 	fCurrentSmallData(0),
2786 	fInode(inode),
2787 	fAttributes(NULL),
2788 	fIterator(NULL),
2789 	fBuffer(NULL)
2790 {
2791 	inode->_AddIterator(this);
2792 }
2793 
2794 
2795 AttributeIterator::~AttributeIterator()
2796 {
2797 	if (fAttributes)
2798 		put_vnode(fAttributes->GetVolume()->FSVolume(), fAttributes->ID());
2799 
2800 	delete fIterator;
2801 	fInode->_RemoveIterator(this);
2802 }
2803 
2804 
2805 status_t
2806 AttributeIterator::Rewind()
2807 {
2808 	fCurrentSmallData = 0;
2809 
2810 	if (fIterator != NULL)
2811 		fIterator->Rewind();
2812 
2813 	return B_OK;
2814 }
2815 
2816 
2817 status_t
2818 AttributeIterator::GetNext(char* name, size_t* _length, uint32* _type,
2819 	ino_t* _id)
2820 {
2821 	// read attributes out of the small data section
2822 
2823 	if (fCurrentSmallData >= 0) {
2824 		NodeGetter nodeGetter(fInode->GetVolume(), fInode);
2825 		const bfs_inode* node = nodeGetter.Node();
2826 		const small_data* item = ((bfs_inode*)node)->SmallDataStart();
2827 
2828 		RecursiveLocker _(&fInode->SmallDataLock());
2829 
2830 		int32 index = 0;
2831 		for (; !item->IsLast(node); item = item->Next(), index++) {
2832 			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2833 				&& *item->Name() == FILE_NAME_NAME)
2834 				continue;
2835 
2836 			if (index >= fCurrentSmallData)
2837 				break;
2838 		}
2839 
2840 		if (!item->IsLast(node)) {
2841 			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2842 			*_type = item->Type();
2843 			*_length = item->NameSize();
2844 			*_id = (ino_t)index;
2845 
2846 			fCurrentSmallData = index + 1;
2847 			return B_OK;
2848 		}
2849 
2850 		// stop traversing the small_data section
2851 		fCurrentSmallData = -1;
2852 	}
2853 
2854 	// read attributes out of the attribute directory
2855 
2856 	if (fInode->Attributes().IsZero())
2857 		return B_ENTRY_NOT_FOUND;
2858 
2859 	Volume* volume = fInode->GetVolume();
2860 
2861 	// if you haven't yet access to the attributes directory, get it
2862 	if (fAttributes == NULL) {
2863 		if (get_vnode(volume->FSVolume(), volume->ToVnode(fInode->Attributes()),
2864 				(void**)&fAttributes) != B_OK) {
2865 			FATAL(("get_vnode() failed in AttributeIterator::GetNext(ino_t"
2866 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2867 			return B_ENTRY_NOT_FOUND;
2868 		}
2869 
2870 		BPlusTree* tree = fAttributes->Tree();
2871 		if (tree == NULL
2872 			|| (fIterator = new TreeIterator(tree)) == NULL) {
2873 			FATAL(("could not get tree in AttributeIterator::GetNext(ino_t"
2874 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2875 			return B_ENTRY_NOT_FOUND;
2876 		}
2877 	}
2878 
2879 	uint16 length;
2880 	ino_t id;
2881 	status_t status = fIterator->GetNextEntry(name, &length,
2882 		B_FILE_NAME_LENGTH, &id);
2883 	if (status < B_OK)
2884 		return status;
2885 
2886 	Vnode vnode(volume, id);
2887 	Inode* attribute;
2888 	if ((status = vnode.Get(&attribute)) == B_OK) {
2889 		*_type = attribute->Type();
2890 		*_length = length;
2891 		*_id = id;
2892 	}
2893 
2894 	return status;
2895 }
2896 
2897 
2898 void
2899 AttributeIterator::Update(uint16 index, int8 change)
2900 {
2901 	// fCurrentSmallData points already to the next item
2902 	if (index < fCurrentSmallData)
2903 		fCurrentSmallData += change;
2904 }
2905 
2906