xref: /haiku/src/add-ons/kernel/file_systems/bfs/Inode.cpp (revision bd7645a12ab594d5eb1c6d6a70c3a82a429410dd)
1 /*
2  * Copyright 2001-2010, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Inode access functions
8 
9 
10 #include "Debug.h"
11 #include "Inode.h"
12 #include "BPlusTree.h"
13 #include "Index.h"
14 
15 
16 #if BFS_TRACING && !defined(BFS_SHELL) && !defined(_BOOT_MODE)
17 namespace BFSInodeTracing {
18 
19 class Create : public AbstractTraceEntry {
20 public:
21 	Create(Inode* inode, Inode* parent, const char* name, int32 mode,
22 			int openMode, uint32 type)
23 		:
24 		fInode(inode),
25 		fID(inode->ID()),
26 		fParent(parent),
27 		fParentID(parent != NULL ? parent->ID() : 0),
28 		fMode(mode),
29 		fOpenMode(openMode),
30 		fType(type)
31 	{
32 		if (name != NULL)
33 			strlcpy(fName, name, sizeof(fName));
34 		else
35 			fName[0] = '\0';
36 
37 		Initialized();
38 	}
39 
40 	virtual void AddDump(TraceOutput& out)
41 	{
42 		out.Print("bfs:Create %Ld (%p), parent %Ld (%p), \"%s\", "
43 			"mode %lx, omode %x, type %lx", fID, fInode, fParentID,
44 			fParent, fName, fMode, fOpenMode, fType);
45 	}
46 
47 private:
48 	Inode*	fInode;
49 	ino_t	fID;
50 	Inode*	fParent;
51 	ino_t	fParentID;
52 	char	fName[32];
53 	int32	fMode;
54 	int		fOpenMode;
55 	uint32	fType;
56 };
57 
58 class Remove : public AbstractTraceEntry {
59 public:
60 	Remove(Inode* inode, const char* name)
61 		:
62 		fInode(inode),
63 		fID(inode->ID())
64 	{
65 		strlcpy(fName, name, sizeof(fName));
66 		Initialized();
67 	}
68 
69 	virtual void AddDump(TraceOutput& out)
70 	{
71 		out.Print("bfs:Remove %Ld (%p), \"%s\"", fID, fInode, fName);
72 	}
73 
74 private:
75 	Inode*	fInode;
76 	ino_t	fID;
77 	char	fName[32];
78 };
79 
80 class Action : public AbstractTraceEntry {
81 public:
82 	Action(const char* action, Inode* inode)
83 		:
84 		fInode(inode),
85 		fID(inode->ID())
86 	{
87 		strlcpy(fAction, action, sizeof(fAction));
88 		Initialized();
89 	}
90 
91 	virtual void AddDump(TraceOutput& out)
92 	{
93 		out.Print("bfs:%s %Ld (%p)\n", fAction, fID, fInode);
94 	}
95 
96 private:
97 	Inode*	fInode;
98 	ino_t	fID;
99 	char	fAction[16];
100 };
101 
102 class Resize : public AbstractTraceEntry {
103 public:
104 	Resize(Inode* inode, off_t oldSize, off_t newSize, bool trim)
105 		:
106 		fInode(inode),
107 		fID(inode->ID()),
108 		fOldSize(oldSize),
109 		fNewSize(newSize),
110 		fTrim(trim)
111 	{
112 		Initialized();
113 	}
114 
115 	virtual void AddDump(TraceOutput& out)
116 	{
117 		out.Print("bfs:%s %Ld (%p), %Ld -> %Ld", fTrim ? "Trim" : "Resize",
118 			fID, fInode, fOldSize, fNewSize);
119 	}
120 
121 private:
122 	Inode*	fInode;
123 	ino_t	fID;
124 	off_t	fOldSize;
125 	off_t	fNewSize;
126 	bool	fTrim;
127 };
128 
129 }	// namespace BFSInodeTracing
130 
131 #	define T(x) new(std::nothrow) BFSInodeTracing::x;
132 #else
133 #	define T(x) ;
134 #endif
135 
136 
137 /*!	A helper class used by Inode::Create() to keep track of the belongings
138 	of an inode creation in progress.
139 	This class will make sure everything is cleaned up properly.
140 */
141 class InodeAllocator {
142 public:
143 							InodeAllocator(Transaction& transaction);
144 							~InodeAllocator();
145 
146 			status_t		New(block_run* parentRun, mode_t mode,
147 								block_run& run, fs_vnode_ops* vnodeOps,
148 								Inode** _inode);
149 			status_t		CreateTree();
150 			status_t		Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags);
151 
152 private:
153 	static	void			_TransactionListener(int32 id, int32 event,
154 								void* _inode);
155 
156 			Transaction*	fTransaction;
157 			block_run		fRun;
158 			Inode*			fInode;
159 };
160 
161 
162 InodeAllocator::InodeAllocator(Transaction& transaction)
163 	:
164 	fTransaction(&transaction),
165 	fInode(NULL)
166 {
167 }
168 
169 
170 InodeAllocator::~InodeAllocator()
171 {
172 	if (fTransaction != NULL) {
173 		Volume* volume = fTransaction->GetVolume();
174 
175 		if (fInode != NULL) {
176 			fInode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
177 				// this unblocks any pending bfs_read_vnode() calls
178 			fInode->Free(*fTransaction);
179 			fTransaction->RemoveListener(fInode);
180 
181 			remove_vnode(volume->FSVolume(), fInode->ID());
182 		} else
183 			volume->Free(*fTransaction, fRun);
184 	}
185 
186 	delete fInode;
187 }
188 
189 
190 status_t
191 InodeAllocator::New(block_run* parentRun, mode_t mode, block_run& run,
192 	fs_vnode_ops* vnodeOps, Inode** _inode)
193 {
194 	Volume* volume = fTransaction->GetVolume();
195 
196 	status_t status = volume->AllocateForInode(*fTransaction, parentRun, mode,
197 		fRun);
198 	if (status < B_OK) {
199 		// don't free the space in the destructor, because
200 		// the allocation failed
201 		fTransaction = NULL;
202 		RETURN_ERROR(status);
203 	}
204 
205 	run = fRun;
206 	fInode = new Inode(volume, *fTransaction, volume->ToVnode(run), mode, run);
207 	if (fInode == NULL)
208 		RETURN_ERROR(B_NO_MEMORY);
209 
210 	if (!volume->IsInitializing()) {
211 		status = new_vnode(volume->FSVolume(), fInode->ID(), fInode,
212 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps);
213 		if (status < B_OK) {
214 			delete fInode;
215 			fInode = NULL;
216 			RETURN_ERROR(status);
217 		}
218 	}
219 
220 	fInode->WriteLockInTransaction(*fTransaction);
221 	*_inode = fInode;
222 	return B_OK;
223 }
224 
225 
226 status_t
227 InodeAllocator::CreateTree()
228 {
229 	Volume* volume = fTransaction->GetVolume();
230 
231 	// force S_STR_INDEX to be set, if no type is set
232 	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
233 		fInode->Node().mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
234 
235 	BPlusTree* tree = fInode->fTree = new BPlusTree(*fTransaction, fInode);
236 	if (tree == NULL || tree->InitCheck() < B_OK)
237 		return B_ERROR;
238 
239 	if (fInode->IsRegularNode()) {
240 		if (tree->Insert(*fTransaction, ".", fInode->ID()) < B_OK
241 			|| tree->Insert(*fTransaction, "..",
242 					volume->ToVnode(fInode->Parent())) < B_OK)
243 			return B_ERROR;
244 	}
245 	return B_OK;
246 }
247 
248 
249 status_t
250 InodeAllocator::Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags)
251 {
252 	ASSERT(fInode != NULL && fTransaction != NULL);
253 	Volume* volume = fTransaction->GetVolume();
254 
255 	status_t status = fInode->WriteBack(*fTransaction);
256 	if (status < B_OK) {
257 		FATAL(("writing new inode %" B_PRIdINO " failed!\n", fInode->ID()));
258 		return status;
259 	}
260 
261 	// Symbolic links are not published -- the caller needs to do this once
262 	// the contents have been written.
263 	if (!fInode->IsSymLink() && volume->ID() >= 0) {
264 		status = publish_vnode(volume->FSVolume(), fInode->ID(), fInode,
265 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps, fInode->Mode(),
266 			publishFlags);
267 	}
268 
269 	if (status == B_OK) {
270 		cache_add_transaction_listener(volume->BlockCache(), fTransaction->ID(),
271 			TRANSACTION_ABORTED, &_TransactionListener, fInode);
272 	}
273 
274 	fTransaction = NULL;
275 	fInode = NULL;
276 
277 	return status;
278 }
279 
280 
281 /*static*/ void
282 InodeAllocator::_TransactionListener(int32 id, int32 event, void* _inode)
283 {
284 	Inode* inode = (Inode*)_inode;
285 
286 	if (event == TRANSACTION_ABORTED)
287 		panic("transaction %d aborted, inode %p still around!\n", (int)id, inode);
288 }
289 
290 
291 //	#pragma mark -
292 
293 
294 status_t
295 bfs_inode::InitCheck(Volume* volume) const
296 {
297 	if (Magic1() != INODE_MAGIC1
298 		|| !(Flags() & INODE_IN_USE)
299 		|| inode_num.Length() != 1
300 		// matches inode size?
301 		|| (uint32)InodeSize() != volume->InodeSize()
302 		// parent resides on disk?
303 		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
304 		|| parent.AllocationGroup() < 0
305 		|| parent.Start() > (1L << volume->AllocationGroupShift())
306 		|| parent.Length() != 1
307 		// attributes, too?
308 		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
309 		|| attributes.AllocationGroup() < 0
310 		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
311 		RETURN_ERROR(B_BAD_DATA);
312 
313 	if (Flags() & INODE_DELETED)
314 		return B_NOT_ALLOWED;
315 
316 	// TODO: Add some tests to check the integrity of the other stuff here,
317 	// especially for the data_stream!
318 
319 	return B_OK;
320 }
321 
322 
323 //	#pragma mark - Inode
324 
325 
326 Inode::Inode(Volume* volume, ino_t id)
327 	:
328 	fVolume(volume),
329 	fID(id),
330 	fTree(NULL),
331 	fAttributes(NULL),
332 	fCache(NULL),
333 	fMap(NULL)
334 {
335 	PRINT(("Inode::Inode(volume = %p, id = %Ld) @ %p\n", volume, id, this));
336 
337 	UpdateNodeFromDisk();
338 
339 	rw_lock_init(&fLock, "bfs inode");
340 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
341 
342 	// these two will help to maintain the indices
343 	fOldSize = Size();
344 	fOldLastModified = LastModified();
345 
346 	if (IsContainer())
347 		fTree = new BPlusTree(this);
348 	if (NeedsFileCache()) {
349 		SetFileCache(file_cache_create(fVolume->ID(), ID(), Size()));
350 		SetMap(file_map_create(volume->ID(), ID(), Size()));
351 	}
352 }
353 
354 
355 Inode::Inode(Volume* volume, Transaction& transaction, ino_t id, mode_t mode,
356 		block_run& run)
357 	:
358 	fVolume(volume),
359 	fID(id),
360 	fTree(NULL),
361 	fAttributes(NULL),
362 	fCache(NULL),
363 	fMap(NULL)
364 {
365 	PRINT(("Inode::Inode(volume = %p, transaction = %p, id = %Ld) @ %p\n",
366 		volume, &transaction, id, this));
367 
368 	rw_lock_init(&fLock, "bfs inode");
369 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
370 
371 	NodeGetter node(volume, transaction, this, true);
372 	memset(&fNode, 0, sizeof(bfs_inode));
373 
374 	// Initialize the bfs_inode structure -- it's not written back to disk
375 	// here, because it will be done so already when the inode could be
376 	// created completely.
377 
378 	Node().magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
379 	Node().inode_num = run;
380 	Node().mode = HOST_ENDIAN_TO_BFS_INT32(mode);
381 	Node().flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
382 
383 	Node().create_time = Node().last_modified_time = Node().status_change_time
384 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
385 
386 	Node().inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
387 
388 	// these two will help to maintain the indices
389 	fOldSize = Size();
390 	fOldLastModified = LastModified();
391 }
392 
393 
394 Inode::~Inode()
395 {
396 	PRINT(("Inode::~Inode() @ %p\n", this));
397 
398 	file_cache_delete(FileCache());
399 	file_map_delete(Map());
400 	delete fTree;
401 
402 	rw_lock_destroy(&fLock);
403 }
404 
405 
406 status_t
407 Inode::InitCheck(bool checkNode) const
408 {
409 	// test inode magic and flags
410 	if (checkNode) {
411 		status_t status = Node().InitCheck(fVolume);
412 		if (status == B_BUSY)
413 			return B_BUSY;
414 
415 		if (status != B_OK) {
416 			FATAL(("inode at block %" B_PRIdOFF " corrupt!\n", BlockNumber()));
417 			RETURN_ERROR(B_BAD_DATA);
418 		}
419 	}
420 
421 	if (IsContainer()) {
422 		// inodes that have a B+tree
423 		if (fTree == NULL)
424 			RETURN_ERROR(B_NO_MEMORY);
425 
426 		status_t status = fTree->InitCheck();
427 		if (status != B_OK) {
428 			FATAL(("inode tree at block %" B_PRIdOFF " corrupt!\n",
429 				BlockNumber()));
430 			RETURN_ERROR(B_BAD_DATA);
431 		}
432 	}
433 
434 	if (NeedsFileCache() && (fCache == NULL || fMap == NULL))
435 		return B_NO_MEMORY;
436 
437 	return B_OK;
438 }
439 
440 
441 /*!	Adds this inode to the specified transaction. This means that the inode will
442 	be write locked until the transaction ended.
443 	To ensure that the inode will stay valid until that point, an extra reference
444 	is acquired to it as long as this transaction stays active.
445 */
446 void
447 Inode::WriteLockInTransaction(Transaction& transaction)
448 {
449 	// These flags can only change while holding the transaction lock
450 	if ((Flags() & INODE_IN_TRANSACTION) != 0)
451 		return;
452 
453 	// We share the same list link with the removed list, so we have to remove
454 	// the inode from that list here (and add it back when we no longer need it)
455 	if ((Flags() & INODE_DELETED) != 0)
456 		fVolume->RemovedInodes().Remove(this);
457 
458 	if (!fVolume->IsInitializing())
459 		acquire_vnode(fVolume->FSVolume(), ID());
460 
461 	rw_lock_write_lock(&Lock());
462 	Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
463 
464 	transaction.AddListener(this);
465 }
466 
467 
468 status_t
469 Inode::WriteBack(Transaction& transaction)
470 {
471 	NodeGetter node(fVolume, transaction, this);
472 	if (node.WritableNode() == NULL)
473 		return B_IO_ERROR;
474 
475 	memcpy(node.WritableNode(), &Node(), sizeof(bfs_inode));
476 	return B_OK;
477 }
478 
479 
480 void
481 Inode::UpdateNodeFromDisk()
482 {
483 	NodeGetter node(fVolume, this);
484 	memcpy(&fNode, node.Node(), sizeof(bfs_inode));
485 	fNode.flags &= HOST_ENDIAN_TO_BFS_INT32(INODE_PERMANENT_FLAGS);
486 }
487 
488 
489 status_t
490 Inode::CheckPermissions(int accessMode) const
491 {
492 	uid_t user = geteuid();
493 	gid_t group = getegid();
494 
495 	// you never have write access to a read-only volume
496 	if ((accessMode & W_OK) != 0 && fVolume->IsReadOnly())
497 		return B_READ_ONLY_DEVICE;
498 
499 	// root users always have full access (but they can't execute files without
500 	// any execute permissions set)
501 	if (user == 0) {
502 		if (!((accessMode & X_OK) != 0 && (Mode() & S_IXUSR) == 0)
503 			|| (Mode() & S_DIRECTORY) != 0) {
504 			return B_OK;
505 		}
506 	}
507 
508 	// shift mode bits, to check directly against accessMode
509 	mode_t mode = Mode();
510 	if (user == (uid_t)fNode.UserID())
511 		mode >>= 6;
512 	else if (group == (gid_t)fNode.GroupID())
513 		mode >>= 3;
514 
515 	if (accessMode & ~(mode & S_IRWXO))
516 		return B_NOT_ALLOWED;
517 
518 	return B_OK;
519 }
520 
521 
522 //	#pragma mark - attributes
523 
524 
525 void
526 Inode::_AddIterator(AttributeIterator* iterator)
527 {
528 	RecursiveLocker _(fSmallDataLock);
529 	fIterators.Add(iterator);
530 }
531 
532 
533 void
534 Inode::_RemoveIterator(AttributeIterator* iterator)
535 {
536 	RecursiveLocker _(fSmallDataLock);
537 	fIterators.Remove(iterator);
538 }
539 
540 
541 /*!	Tries to free up "bytes" space in the small_data section by moving
542 	attributes to real files. Used for system attributes like the name.
543 	You need to hold the fSmallDataLock when you call this method
544 */
545 status_t
546 Inode::_MakeSpaceForSmallData(Transaction& transaction, bfs_inode* node,
547 	const char* name, int32 bytes)
548 {
549 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
550 
551 	while (bytes > 0) {
552 		small_data* item = node->SmallDataStart();
553 		small_data* max = NULL;
554 		int32 index = 0, maxIndex = 0;
555 		for (; !item->IsLast(node); item = item->Next(), index++) {
556 			// should not remove those
557 			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
558 				continue;
559 
560 			if (max == NULL || max->Size() < item->Size()) {
561 				maxIndex = index;
562 				max = item;
563 			}
564 
565 			// Remove the first one large enough to free the needed amount of
566 			// bytes
567 			if (bytes < (int32)item->Size())
568 				break;
569 		}
570 
571 		if (item->IsLast(node) || (int32)item->Size() < bytes)
572 			return B_ERROR;
573 
574 		bytes -= max->Size();
575 
576 		// Move the attribute to a real attribute file
577 		// Luckily, this doesn't cause any index updates
578 
579 		Inode* attribute;
580 		status_t status = CreateAttribute(transaction, item->Name(),
581 			item->Type(), &attribute);
582 		if (status != B_OK)
583 			RETURN_ERROR(status);
584 
585 		size_t length = item->DataSize();
586 		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
587 
588 		ReleaseAttribute(attribute);
589 
590 		if (status != B_OK) {
591 			Vnode vnode(fVolume, Attributes());
592 			Inode* attributes;
593 			if (vnode.Get(&attributes) < B_OK
594 				|| attributes->Remove(transaction, name) < B_OK) {
595 				FATAL(("Could not remove newly created attribute!\n"));
596 			}
597 
598 			RETURN_ERROR(status);
599 		}
600 
601 		_RemoveSmallData(node, max, maxIndex);
602 	}
603 	return B_OK;
604 }
605 
606 
607 /*!	Private function which removes the given attribute from the small_data
608 	section.
609 	You need to hold the fSmallDataLock when you call this method
610 */
611 status_t
612 Inode::_RemoveSmallData(bfs_inode* node, small_data* item, int32 index)
613 {
614 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
615 
616 	small_data* next = item->Next();
617 	if (!next->IsLast(node)) {
618 		// find the last attribute
619 		small_data* last = next;
620 		while (!last->IsLast(node))
621 			last = last->Next();
622 
623 		int32 size = (uint8*)last - (uint8*)next;
624 		if (size < 0
625 			|| size > (uint8*)node + fVolume->BlockSize() - (uint8*)next)
626 			return B_BAD_DATA;
627 
628 		memmove(item, next, size);
629 
630 		// Move the "last" one to its new location and
631 		// correctly terminate the small_data section
632 		last = (small_data*)((uint8*)last - ((uint8*)next - (uint8*)item));
633 		memset(last, 0, (uint8*)node + fVolume->BlockSize() - (uint8*)last);
634 	} else
635 		memset(item, 0, item->Size());
636 
637 	// update all current iterators
638 	SinglyLinkedList<AttributeIterator>::Iterator iterator
639 		= fIterators.GetIterator();
640 	while (iterator.HasNext()) {
641 		iterator.Next()->Update(index, -1);
642 	}
643 
644 	return B_OK;
645 }
646 
647 
648 //!	Removes the given attribute from the small_data section.
649 status_t
650 Inode::_RemoveSmallData(Transaction& transaction, NodeGetter& nodeGetter,
651 	const char* name)
652 {
653 	if (name == NULL)
654 		return B_BAD_VALUE;
655 
656 	bfs_inode* node = nodeGetter.WritableNode();
657 	RecursiveLocker locker(fSmallDataLock);
658 
659 	// search for the small_data item
660 
661 	small_data* item = node->SmallDataStart();
662 	int32 index = 0;
663 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
664 		item = item->Next();
665 		index++;
666 	}
667 
668 	if (item->IsLast(node))
669 		return B_ENTRY_NOT_FOUND;
670 
671 	nodeGetter.MakeWritable(transaction);
672 
673 	status_t status = _RemoveSmallData(node, item, index);
674 	if (status == B_OK) {
675 		Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
676 			bfs_inode::ToInode(real_time_clock_usecs()));
677 
678 		status = WriteBack(transaction);
679 	}
680 
681 	return status;
682 }
683 
684 
685 /*!	Try to place the given attribute in the small_data section - if the
686 	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
687 	In that case, the attribute should be written to a real attribute file;
688 	it's the caller's responsibility to remove any existing attributes in the
689 	small data section if that's the case.
690 
691 	Note that you need to write back the inode yourself after having called that
692 	method - it's a bad API decision that it needs a transaction but enforces
693 	you to write back the inode all by yourself, but it's just more efficient
694 	in most cases...
695 */
696 status_t
697 Inode::_AddSmallData(Transaction& transaction, NodeGetter& nodeGetter,
698 	const char* name, uint32 type, off_t pos, const uint8* data, size_t length,
699 	bool force)
700 {
701 	bfs_inode* node = nodeGetter.WritableNode();
702 
703 	if (node == NULL || name == NULL || data == NULL)
704 		return B_BAD_VALUE;
705 
706 	// reject any requests that can't fit into the small_data section
707 	uint32 nameLength = strlen(name);
708 	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + pos + length + 1;
709 	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
710 		return B_DEVICE_FULL;
711 
712 	nodeGetter.MakeWritable(transaction);
713 	RecursiveLocker locker(fSmallDataLock);
714 
715 	// Find the last item or one with the same name we have to add
716 	small_data* item = node->SmallDataStart();
717 	int32 index = 0;
718 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
719 		item = item->Next();
720 		index++;
721 	}
722 
723 	// is the attribute already in the small_data section?
724 	// then just replace the data part of that one
725 	if (!item->IsLast(node)) {
726 		// find last attribute
727 		small_data* last = item;
728 		while (!last->IsLast(node))
729 			last = last->Next();
730 
731 		// try to change the attributes value
732 		if (item->data_size > pos + length
733 			|| force
734 			|| ((uint8*)last + pos + length - item->DataSize())
735 					<= ((uint8*)node + fVolume->InodeSize())) {
736 			// Make room for the new attribute if needed (and we are forced
737 			// to do so)
738 			if (force && ((uint8*)last + pos + length - item->DataSize())
739 					> ((uint8*)node + fVolume->InodeSize())) {
740 				// We also take the free space at the end of the small_data
741 				// section into account, and request only what's really needed
742 				uint32 needed = pos + length - item->DataSize() -
743 					(uint32)((uint8*)node + fVolume->InodeSize()
744 						- (uint8*)last);
745 
746 				if (_MakeSpaceForSmallData(transaction, node, name, needed)
747 						!= B_OK)
748 					return B_ERROR;
749 
750 				// reset our pointers
751 				item = node->SmallDataStart();
752 				index = 0;
753 				while (!item->IsLast(node) && strcmp(item->Name(), name)) {
754 					item = item->Next();
755 					index++;
756 				}
757 
758 				last = item;
759 				while (!last->IsLast(node))
760 					last = last->Next();
761 			}
762 
763 			size_t oldDataSize = item->DataSize();
764 
765 			// Normally, we can just overwrite the attribute data as the size
766 			// is specified by the type and does not change that often
767 			if (pos + length != item->DataSize()) {
768 				// move the attributes after the current one
769 				small_data* next = item->Next();
770 				if (!next->IsLast(node)) {
771 					memmove((uint8*)item + spaceNeeded, next,
772 						(uint8*)last - (uint8*)next);
773 				}
774 
775 				// Move the "last" one to its new location and
776 				// correctly terminate the small_data section
777 				last = (small_data*)((uint8*)last
778 					- ((uint8*)next - ((uint8*)item + spaceNeeded)));
779 				if ((uint8*)last < (uint8*)node + fVolume->BlockSize()) {
780 					memset(last, 0, (uint8*)node + fVolume->BlockSize()
781 						- (uint8*)last);
782 				}
783 
784 				item->data_size = HOST_ENDIAN_TO_BFS_INT16(pos + length);
785 			}
786 
787 			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
788 
789 			if (oldDataSize < pos) {
790 				// Fill gap with zeros
791 				memset(item->Data() + oldDataSize, 0, pos - oldDataSize);
792 			}
793 			memcpy(item->Data() + pos, data, length);
794 			item->Data()[pos + length] = '\0';
795 
796 			return B_OK;
797 		}
798 
799 		return B_DEVICE_FULL;
800 	}
801 
802 	// try to add the new attribute!
803 
804 	if ((uint8*)item + spaceNeeded > (uint8*)node + fVolume->InodeSize()) {
805 		// there is not enough space for it!
806 		if (!force)
807 			return B_DEVICE_FULL;
808 
809 		// make room for the new attribute
810 		if (_MakeSpaceForSmallData(transaction, node, name, spaceNeeded) < B_OK)
811 			return B_ERROR;
812 
813 		// get new last item!
814 		item = node->SmallDataStart();
815 		index = 0;
816 		while (!item->IsLast(node)) {
817 			item = item->Next();
818 			index++;
819 		}
820 	}
821 
822 	memset(item, 0, spaceNeeded);
823 	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
824 	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
825 	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
826 	strcpy(item->Name(), name);
827 	memcpy(item->Data() + pos, data, length);
828 
829 	// correctly terminate the small_data section
830 	item = item->Next();
831 	if (!item->IsLast(node))
832 		memset(item, 0, (uint8*)node + fVolume->InodeSize() - (uint8*)item);
833 
834 	// update all current iterators
835 	SinglyLinkedList<AttributeIterator>::Iterator iterator
836 		= fIterators.GetIterator();
837 	while (iterator.HasNext()) {
838 		iterator.Next()->Update(index, 1);
839 	}
840 
841 	return B_OK;
842 }
843 
844 
845 /*!	Iterates through the small_data section of an inode.
846 	To start at the beginning of this section, you let smallData
847 	point to NULL, like:
848 		small_data* data = NULL;
849 		while (inode->GetNextSmallData(&data) { ... }
850 
851 	This function is reentrant and doesn't allocate any memory;
852 	you can safely stop calling it at any point (you don't need
853 	to iterate through the whole list).
854 	You need to hold the fSmallDataLock when you call this method
855 */
856 status_t
857 Inode::_GetNextSmallData(bfs_inode* node, small_data** _smallData) const
858 {
859 	if (node == NULL)
860 		RETURN_ERROR(B_BAD_VALUE);
861 
862 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
863 
864 	small_data* data = *_smallData;
865 
866 	// begin from the start?
867 	if (data == NULL)
868 		data = node->SmallDataStart();
869 	else
870 		data = data->Next();
871 
872 	// is already last item?
873 	if (data->IsLast(node))
874 		return B_ENTRY_NOT_FOUND;
875 
876 	*_smallData = data;
877 
878 	return B_OK;
879 }
880 
881 
882 /*!	Finds the attribute "name" in the small data section, and
883 	returns a pointer to it (or NULL if it doesn't exist).
884 	You need to hold the fSmallDataLock when you call this method
885 */
886 small_data*
887 Inode::FindSmallData(const bfs_inode* node, const char* name) const
888 {
889 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
890 
891 	small_data* smallData = NULL;
892 	while (_GetNextSmallData(const_cast<bfs_inode*>(node), &smallData)
893 			== B_OK) {
894 		if (!strcmp(smallData->Name(), name))
895 			return smallData;
896 	}
897 	return NULL;
898 }
899 
900 
901 /*!	Returns a pointer to the node's name if present in the small data
902 	section, NULL otherwise.
903 	You need to hold the fSmallDataLock when you call this method
904 */
905 const char*
906 Inode::Name(const bfs_inode* node) const
907 {
908 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
909 
910 	small_data* smallData = NULL;
911 	while (_GetNextSmallData((bfs_inode*)node, &smallData) == B_OK) {
912 		if (*smallData->Name() == FILE_NAME_NAME
913 			&& smallData->NameSize() == FILE_NAME_NAME_LENGTH)
914 			return (const char*)smallData->Data();
915 	}
916 	return NULL;
917 }
918 
919 
920 /*!	Copies the node's name into the provided buffer.
921 	The buffer should be B_FILE_NAME_LENGTH bytes large.
922 */
923 status_t
924 Inode::GetName(char* buffer, size_t size) const
925 {
926 	NodeGetter node(fVolume, this);
927 	RecursiveLocker locker(fSmallDataLock);
928 
929 	const char* name = Name(node.Node());
930 	if (name == NULL)
931 		return B_ENTRY_NOT_FOUND;
932 
933 	strlcpy(buffer, name, size);
934 	return B_OK;
935 }
936 
937 
938 /*!	Changes or set the name of a file: in the inode small_data section only, it
939 	doesn't change it in the parent directory's b+tree.
940 	Note that you need to write back the inode yourself after having called
941 	that method. It suffers from the same API decision as AddSmallData() does
942 	(and for the same reason).
943 */
944 status_t
945 Inode::SetName(Transaction& transaction, const char* name)
946 {
947 	if (name == NULL || *name == '\0')
948 		return B_BAD_VALUE;
949 
950 	NodeGetter node(fVolume, transaction, this);
951 	const char nameTag[2] = {FILE_NAME_NAME, 0};
952 
953 	return _AddSmallData(transaction, node, nameTag, FILE_NAME_TYPE, 0,
954 		(uint8*)name, strlen(name), true);
955 }
956 
957 
958 status_t
959 Inode::_RemoveAttribute(Transaction& transaction, const char* name,
960 	bool hasIndex, Index* index)
961 {
962 	// remove the attribute file if it exists
963 	Vnode vnode(fVolume, Attributes());
964 	Inode* attributes;
965 	status_t status = vnode.Get(&attributes);
966 	if (status < B_OK)
967 		return status;
968 
969 	// update index
970 	if (index != NULL) {
971 		Inode* attribute;
972 		if ((hasIndex || fVolume->CheckForLiveQuery(name))
973 			&& GetAttribute(name, &attribute) == B_OK) {
974 			uint8 data[BPLUSTREE_MAX_KEY_LENGTH];
975 			size_t length = BPLUSTREE_MAX_KEY_LENGTH;
976 			if (attribute->ReadAt(0, data, &length) == B_OK) {
977 				index->Update(transaction, name, attribute->Type(), data,
978 					length, NULL, 0, this);
979 			}
980 
981 			ReleaseAttribute(attribute);
982 		}
983 	}
984 
985 	if ((status = attributes->Remove(transaction, name)) < B_OK)
986 		return status;
987 
988 	if (attributes->IsEmpty()) {
989 		attributes->WriteLockInTransaction(transaction);
990 
991 		// remove attribute directory (don't fail if that can't be done)
992 		if (remove_vnode(fVolume->FSVolume(), attributes->ID()) == B_OK) {
993 			// update the inode, so that no one will ever doubt it's deleted :-)
994 			attributes->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
995 			if (attributes->WriteBack(transaction) == B_OK) {
996 				Attributes().SetTo(0, 0, 0);
997 				WriteBack(transaction);
998 			} else {
999 				unremove_vnode(fVolume->FSVolume(), attributes->ID());
1000 				attributes->Node().flags
1001 					&= ~HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1002 			}
1003 		}
1004 	}
1005 
1006 	return status;
1007 }
1008 
1009 
1010 /*!	Reads data from the specified attribute.
1011 	This is a high-level attribute function that understands attributes
1012 	in the small_data section as well as real attribute files.
1013 */
1014 status_t
1015 Inode::ReadAttribute(const char* name, int32 type, off_t pos, uint8* buffer,
1016 	size_t* _length)
1017 {
1018 	if (pos < 0)
1019 		pos = 0;
1020 
1021 	// search in the small_data section (which has to be locked first)
1022 	{
1023 		NodeGetter node(fVolume, this);
1024 		RecursiveLocker locker(fSmallDataLock);
1025 
1026 		small_data* smallData = FindSmallData(node.Node(), name);
1027 		if (smallData != NULL) {
1028 			size_t length = *_length;
1029 			if (pos >= smallData->data_size) {
1030 				*_length = 0;
1031 				return B_OK;
1032 			}
1033 			if (length + pos > smallData->DataSize())
1034 				length = smallData->DataSize() - pos;
1035 
1036 			memcpy(buffer, smallData->Data() + pos, length);
1037 			*_length = length;
1038 			return B_OK;
1039 		}
1040 	}
1041 
1042 	// search in the attribute directory
1043 	Inode* attribute;
1044 	status_t status = GetAttribute(name, &attribute);
1045 	if (status == B_OK) {
1046 		status = attribute->ReadAt(pos, (uint8*)buffer, _length);
1047 
1048 		ReleaseAttribute(attribute);
1049 	}
1050 
1051 	RETURN_ERROR(status);
1052 }
1053 
1054 
1055 /*!	Writes data to the specified attribute.
1056 	This is a high-level attribute function that understands attributes
1057 	in the small_data section as well as real attribute files.
1058 */
1059 status_t
1060 Inode::WriteAttribute(Transaction& transaction, const char* name, int32 type,
1061 	off_t pos, const uint8* buffer, size_t* _length, bool* _created)
1062 {
1063 	if (pos < 0)
1064 		return B_BAD_VALUE;
1065 
1066 	// needed to maintain the index
1067 	uint8 oldBuffer[BPLUSTREE_MAX_KEY_LENGTH];
1068 	uint8* oldData = NULL;
1069 	size_t oldLength = 0;
1070 	bool created = false;
1071 
1072 	// TODO: we actually depend on that the contents of "buffer" are constant.
1073 	// If they get changed during the write (hey, user programs), we may mess
1074 	// up our index trees!
1075 	// TODO: for attribute files, we need to log the first
1076 	// BPLUSTREE_MAX_KEY_LENGTH bytes of the data stream, or the same as above
1077 	// might happen.
1078 
1079 	Index index(fVolume);
1080 	index.SetTo(name);
1081 
1082 	Inode* attribute = NULL;
1083 	status_t status = B_OK;
1084 
1085 	if (GetAttribute(name, &attribute) != B_OK) {
1086 		// No attribute inode exists yet
1087 
1088 		// save the old attribute data
1089 		NodeGetter node(fVolume, transaction, this);
1090 		recursive_lock_lock(&fSmallDataLock);
1091 
1092 		small_data* smallData = FindSmallData(node.Node(), name);
1093 		if (smallData != NULL) {
1094 			oldLength = smallData->DataSize();
1095 			if (oldLength > 0) {
1096 				if (oldLength > BPLUSTREE_MAX_KEY_LENGTH)
1097 					oldLength = BPLUSTREE_MAX_KEY_LENGTH;
1098 				memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
1099 			}
1100 		} else
1101 			created = true;
1102 
1103 		recursive_lock_unlock(&fSmallDataLock);
1104 
1105 		// if the attribute doesn't exist yet (as a file), try to put it in the
1106 		// small_data section first - if that fails (due to insufficent space),
1107 		// create a real attribute file
1108 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1109 			*_length);
1110 		if (status == B_DEVICE_FULL) {
1111 			if (smallData != NULL) {
1112 				// remove the old attribute from the small data section - there
1113 				// is no space left for the new data
1114 				status = _RemoveSmallData(transaction, node, name);
1115 			} else
1116 				status = B_OK;
1117 
1118 			if (status == B_OK)
1119 				status = CreateAttribute(transaction, name, type, &attribute);
1120 			if (status != B_OK)
1121 				RETURN_ERROR(status);
1122 
1123 			created = true;
1124 		} else if (status == B_OK) {
1125 			// Update status time on attribute write
1126 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1127 				bfs_inode::ToInode(real_time_clock_usecs()));
1128 
1129 			status = WriteBack(transaction);
1130 		}
1131 	}
1132 
1133 	if (attribute != NULL) {
1134 		// TODO: we need to lock the inode in the transaction, see WriteAt()!
1135 		if (rw_lock_write_lock(&attribute->fLock) == B_OK) {
1136 			// Save the old attribute data (if this fails, oldLength will
1137 			// reflect it)
1138 			if (fVolume->CheckForLiveQuery(name) && attribute->Size() > 0) {
1139 				oldLength = BPLUSTREE_MAX_KEY_LENGTH;
1140 				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
1141 					oldData = oldBuffer;
1142 			}
1143 
1144 			// check if the data fits into the small_data section again
1145 			NodeGetter node(fVolume, transaction, this);
1146 			status = _AddSmallData(transaction, node, name, type, pos, buffer,
1147 				*_length);
1148 
1149 			if (status == B_OK) {
1150 				// it does - remove its file
1151 				rw_lock_write_unlock(&attribute->fLock);
1152 				status = _RemoveAttribute(transaction, name, false, NULL);
1153 			} else {
1154 				// The attribute type might have been changed - we need to
1155 				// adopt the new one
1156 				attribute->Node().type = HOST_ENDIAN_TO_BFS_INT32(type);
1157 				status = attribute->WriteBack(transaction);
1158 				rw_lock_write_unlock(&attribute->fLock);
1159 
1160 				if (status == B_OK) {
1161 					status = attribute->WriteAt(transaction, pos, buffer,
1162 						_length);
1163 				}
1164 			}
1165 
1166 			if (status == B_OK) {
1167 				// Update status time on attribute write
1168 				Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1169 					bfs_inode::ToInode(real_time_clock_usecs()));
1170 
1171 				status = WriteBack(transaction);
1172 			}
1173 		} else
1174 			status = B_ERROR;
1175 
1176 		ReleaseAttribute(attribute);
1177 	}
1178 
1179 	// TODO: find a better way than this "pos" thing (the begin of the old key
1180 	//	must be copied to the start of the new one for a comparison)
1181 	if (status == B_OK && pos == 0) {
1182 		// index only the first BPLUSTREE_MAX_KEY_LENGTH bytes
1183 		uint16 length = *_length;
1184 		if (length > BPLUSTREE_MAX_KEY_LENGTH)
1185 			length = BPLUSTREE_MAX_KEY_LENGTH;
1186 
1187 		// Update index. Note, Index::Update() may be called even if
1188 		// initializing the index failed - it will just update the live
1189 		// queries in this case
1190 		if (pos < length || pos < oldLength) {
1191 			index.Update(transaction, name, type, oldData, oldLength, buffer,
1192 				length, this);
1193 		}
1194 	}
1195 
1196 	if (_created != NULL)
1197 		*_created = created;
1198 
1199 	return status;
1200 }
1201 
1202 
1203 /*!	Removes the specified attribute from the inode.
1204 	This is a high-level attribute function that understands attributes
1205 	in the small_data section as well as real attribute files.
1206 */
1207 status_t
1208 Inode::RemoveAttribute(Transaction& transaction, const char* name)
1209 {
1210 	Index index(fVolume);
1211 	bool hasIndex = index.SetTo(name) == B_OK;
1212 	NodeGetter node(fVolume, this);
1213 
1214 	// update index for attributes in the small_data section
1215 	{
1216 		RecursiveLocker _(fSmallDataLock);
1217 
1218 		small_data* smallData = FindSmallData(node.Node(), name);
1219 		if (smallData != NULL) {
1220 			uint32 length = smallData->DataSize();
1221 			if (length > BPLUSTREE_MAX_KEY_LENGTH)
1222 				length = BPLUSTREE_MAX_KEY_LENGTH;
1223 			index.Update(transaction, name, smallData->Type(),
1224 				smallData->Data(), length, NULL, 0, this);
1225 		}
1226 	}
1227 
1228 	status_t status = _RemoveSmallData(transaction, node, name);
1229 	if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
1230 		// remove the attribute file if it exists
1231 		status = _RemoveAttribute(transaction, name, hasIndex, &index);
1232 		if (status == B_OK) {
1233 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1234 				bfs_inode::ToInode(real_time_clock_usecs()));
1235 			WriteBack(transaction);
1236 		}
1237 	}
1238 
1239 	return status;
1240 }
1241 
1242 
1243 /*!	Returns the attribute inode with the specified \a name, in case it exists.
1244 	This method can only return real attribute files; the attributes in the
1245 	small data section are ignored.
1246 */
1247 status_t
1248 Inode::GetAttribute(const char* name, Inode** _attribute)
1249 {
1250 	// does this inode even have attributes?
1251 	if (Attributes().IsZero())
1252 		return B_ENTRY_NOT_FOUND;
1253 
1254 	Vnode vnode(fVolume, Attributes());
1255 	Inode* attributes;
1256 	if (vnode.Get(&attributes) < B_OK) {
1257 		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n",
1258 			name));
1259 		return B_ERROR;
1260 	}
1261 
1262 	BPlusTree* tree = attributes->Tree();
1263 	if (tree == NULL)
1264 		return B_BAD_VALUE;
1265 
1266 	InodeReadLocker locker(attributes);
1267 
1268 	ino_t id;
1269 	status_t status = tree->Find((uint8*)name, (uint16)strlen(name), &id);
1270 	if (status == B_OK) {
1271 		Vnode vnode(fVolume, id);
1272 		Inode* inode;
1273 		// Check if the attribute is really an attribute
1274 		if (vnode.Get(&inode) != B_OK || !inode->IsAttribute())
1275 			return B_ERROR;
1276 
1277 		*_attribute = inode;
1278 		vnode.Keep();
1279 		return B_OK;
1280 	}
1281 
1282 	return status;
1283 }
1284 
1285 
1286 void
1287 Inode::ReleaseAttribute(Inode* attribute)
1288 {
1289 	if (attribute == NULL)
1290 		return;
1291 
1292 	put_vnode(fVolume->FSVolume(), attribute->ID());
1293 }
1294 
1295 
1296 status_t
1297 Inode::CreateAttribute(Transaction& transaction, const char* name, uint32 type,
1298 	Inode** attribute)
1299 {
1300 	// do we need to create the attribute directory first?
1301 	if (Attributes().IsZero()) {
1302 		status_t status = Inode::Create(transaction, this, NULL,
1303 			S_ATTR_DIR | S_DIRECTORY | 0666, 0, 0, NULL);
1304 		if (status < B_OK)
1305 			RETURN_ERROR(status);
1306 	}
1307 	Vnode vnode(fVolume, Attributes());
1308 	Inode* attributes;
1309 	if (vnode.Get(&attributes) < B_OK)
1310 		return B_ERROR;
1311 
1312 	// Inode::Create() locks the inode for us
1313 	return Inode::Create(transaction, attributes, name,
1314 		S_ATTR | S_FILE | 0666, 0, type, NULL, NULL, attribute);
1315 }
1316 
1317 
1318 //	#pragma mark - directory tree
1319 
1320 
1321 bool
1322 Inode::IsEmpty()
1323 {
1324 	TreeIterator iterator(fTree);
1325 
1326 	// index and attribute directories are really empty when they are
1327 	// empty - directories for standard files always contain ".", and
1328 	// "..", so we need to ignore those two
1329 
1330 	uint32 count = 0;
1331 	char name[BPLUSTREE_MAX_KEY_LENGTH];
1332 	uint16 length;
1333 	ino_t id;
1334 	while (iterator.GetNextEntry(name, &length, B_FILE_NAME_LENGTH,
1335 			&id) == B_OK) {
1336 		if (Mode() & (S_ATTR_DIR | S_INDEX_DIR))
1337 			return false;
1338 
1339 		if (++count > 2 || (strcmp(".", name) && strcmp("..", name)))
1340 			return false;
1341 	}
1342 	return true;
1343 }
1344 
1345 
1346 status_t
1347 Inode::ContainerContentsChanged(Transaction& transaction)
1348 {
1349 	ASSERT(!InLastModifiedIndex());
1350 
1351 	Node().last_modified_time = Node().status_change_time
1352 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1353 
1354 	return WriteBack(transaction);
1355 }
1356 
1357 
1358 //	#pragma mark - data stream
1359 
1360 
1361 /*!	Computes the number of bytes this inode occupies in the file system.
1362 	This includes the file meta blocks used for maintaining its data stream.
1363 
1364 	TODO: However, the attributes in extra files are not really accounted for;
1365 	depending on the speed penalty, this should be changed, though (the value
1366 	could be cached in the inode structure or Inode object, though).
1367 */
1368 off_t
1369 Inode::AllocatedSize() const
1370 {
1371 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0) {
1372 		// This symlink does not have a data stream
1373 		return Node().InodeSize();
1374 	}
1375 
1376 	const data_stream& data = Node().data;
1377 	uint32 blockSize = fVolume->BlockSize();
1378 	off_t size = blockSize;
1379 
1380 	if (data.MaxDoubleIndirectRange() != 0) {
1381 		off_t doubleIndirectSize = data.MaxDoubleIndirectRange()
1382 			- data.MaxIndirectRange();
1383 		int32 indirectSize = double_indirect_max_indirect_size(
1384 			data.double_indirect.Length(), fVolume->BlockSize());
1385 
1386 		size += (2 * data.double_indirect.Length()
1387 				+ doubleIndirectSize / indirectSize)
1388 			* blockSize + data.MaxDoubleIndirectRange();
1389 	} else if (data.MaxIndirectRange() != 0)
1390 		size += data.indirect.Length() + data.MaxIndirectRange();
1391 	else
1392 		size += data.MaxDirectRange();
1393 
1394 	if (!Node().attributes.IsZero()) {
1395 		// TODO: to make this exact, we'd had to count all attributes
1396 		size += 2 * blockSize;
1397 			// 2 blocks, one for the attributes inode, one for its B+tree
1398 	}
1399 
1400 	return size;
1401 }
1402 
1403 
1404 /*!	Finds the block_run where "pos" is located in the data_stream of
1405 	the inode.
1406 	If successful, "offset" will then be set to the file offset
1407 	of the block_run returned; so "pos - offset" is for the block_run
1408 	what "pos" is for the whole stream.
1409 	The caller has to make sure that "pos" is inside the stream.
1410 */
1411 status_t
1412 Inode::FindBlockRun(off_t pos, block_run& run, off_t& offset)
1413 {
1414 	data_stream* data = &Node().data;
1415 
1416 	// find matching block run
1417 
1418 	if (data->MaxIndirectRange() > 0 && pos >= data->MaxDirectRange()) {
1419 		if (data->MaxDoubleIndirectRange() > 0
1420 			&& pos >= data->MaxIndirectRange()) {
1421 			// access to double indirect blocks
1422 
1423 			CachedBlock cached(fVolume);
1424 
1425 			int32 runsPerBlock;
1426 			int32 directSize;
1427 			int32 indirectSize;
1428 			get_double_indirect_sizes(data->double_indirect.Length(),
1429 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1430 
1431 			off_t start = pos - data->MaxIndirectRange();
1432 			int32 index = start / indirectSize;
1433 
1434 			block_run* indirect = (block_run*)cached.SetTo(
1435 				fVolume->ToBlock(data->double_indirect) + index / runsPerBlock);
1436 			if (indirect == NULL)
1437 				RETURN_ERROR(B_ERROR);
1438 
1439 			int32 current = (start % indirectSize) / directSize;
1440 
1441 			indirect = (block_run*)cached.SetTo(
1442 				fVolume->ToBlock(indirect[index % runsPerBlock])
1443 				+ current / runsPerBlock);
1444 			if (indirect == NULL)
1445 				RETURN_ERROR(B_ERROR);
1446 
1447 			run = indirect[current % runsPerBlock];
1448 			if (run.Length() != data->double_indirect.Length())
1449 				RETURN_ERROR(B_BAD_DATA);
1450 
1451 			offset = data->MaxIndirectRange() + (index * indirectSize)
1452 				+ (current * directSize);
1453 		} else {
1454 			// access to indirect blocks
1455 
1456 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1457 			off_t runBlockEnd = data->MaxDirectRange();
1458 
1459 			CachedBlock cached(fVolume);
1460 			off_t block = fVolume->ToBlock(data->indirect);
1461 
1462 			for (int32 i = 0; i < data->indirect.Length(); i++) {
1463 				block_run* indirect = (block_run*)cached.SetTo(block + i);
1464 				if (indirect == NULL)
1465 					RETURN_ERROR(B_IO_ERROR);
1466 
1467 				int32 current = -1;
1468 				while (++current < runsPerBlock) {
1469 					if (indirect[current].IsZero())
1470 						break;
1471 
1472 					runBlockEnd += indirect[current].Length()
1473 						<< cached.BlockShift();
1474 					if (runBlockEnd > pos) {
1475 						run = indirect[current];
1476 						offset = runBlockEnd - (run.Length()
1477 							<< cached.BlockShift());
1478 						return fVolume->ValidateBlockRun(run);
1479 					}
1480 				}
1481 			}
1482 			RETURN_ERROR(B_ERROR);
1483 		}
1484 	} else {
1485 		// access from direct blocks
1486 
1487 		off_t runBlockEnd = 0LL;
1488 		int32 current = -1;
1489 
1490 		while (++current < NUM_DIRECT_BLOCKS) {
1491 			if (data->direct[current].IsZero())
1492 				break;
1493 
1494 			runBlockEnd += data->direct[current].Length()
1495 				<< fVolume->BlockShift();
1496 			if (runBlockEnd > pos) {
1497 				run = data->direct[current];
1498 				offset = runBlockEnd - (run.Length() << fVolume->BlockShift());
1499 				return fVolume->ValidateBlockRun(run);
1500 			}
1501 		}
1502 
1503 		return B_ENTRY_NOT_FOUND;
1504 	}
1505 	return fVolume->ValidateBlockRun(run);
1506 }
1507 
1508 
1509 status_t
1510 Inode::ReadAt(off_t pos, uint8* buffer, size_t* _length)
1511 {
1512 	size_t length = *_length;
1513 
1514 	// set/check boundaries for pos/length
1515 	if (pos < 0)
1516 		return B_BAD_VALUE;
1517 
1518 	InodeReadLocker locker(this);
1519 
1520 	if (pos >= Size() || length == 0) {
1521 		*_length = 0;
1522 		return B_NO_ERROR;
1523 	}
1524 
1525 	locker.Unlock();
1526 
1527 	return file_cache_read(FileCache(), NULL, pos, buffer, _length);
1528 }
1529 
1530 
1531 status_t
1532 Inode::WriteAt(Transaction& transaction, off_t pos, const uint8* buffer,
1533 	size_t* _length)
1534 {
1535 	InodeReadLocker locker(this);
1536 
1537 	// update the last modification time in memory, it will be written
1538 	// back to the inode, and the index when the file is closed
1539 	// TODO: should update the internal last modified time only at this point!
1540 	Node().last_modified_time = Node().status_change_time
1541 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1542 
1543 	// TODO: support INODE_LOGGED!
1544 
1545 	size_t length = *_length;
1546 	bool changeSize = pos + length > Size();
1547 
1548 	// set/check boundaries for pos/length
1549 	if (pos < 0)
1550 		return B_BAD_VALUE;
1551 
1552 	locker.Unlock();
1553 
1554 	// the transaction doesn't have to be started already
1555 	if (changeSize && !transaction.IsStarted())
1556 		transaction.Start(fVolume, BlockNumber());
1557 
1558 	WriteLocker writeLocker(fLock);
1559 
1560 	// Work around possible race condition: Someone might have shrunken the file
1561 	// while we had no lock.
1562 	if (!transaction.IsStarted() && pos + length > Size()) {
1563 		writeLocker.Unlock();
1564 		transaction.Start(fVolume, BlockNumber());
1565 		writeLocker.Lock();
1566 	}
1567 
1568 	if (pos + length > Size()) {
1569 		// let's grow the data stream to the size needed
1570 		status_t status = SetFileSize(transaction, pos + length);
1571 		if (status != B_OK) {
1572 			*_length = 0;
1573 			WriteLockInTransaction(transaction);
1574 			RETURN_ERROR(status);
1575 		}
1576 		// TODO: In theory we would need to update the file size
1577 		// index here as part of the current transaction - this might just
1578 		// be a bit too expensive, but worth a try.
1579 
1580 		// we need to write back the inode here because it has to
1581 		// go into this transaction (we cannot wait until the file
1582 		// is closed)
1583 		status = WriteBack(transaction);
1584 		if (status != B_OK) {
1585 			WriteLockInTransaction(transaction);
1586 			return status;
1587 		}
1588 	}
1589 
1590 	writeLocker.Unlock();
1591 
1592 	// If we don't want to write anything, we can now return (we may
1593 	// just have changed the file size using the position parameter)
1594 	if (length == 0)
1595 		return B_OK;
1596 
1597 	status_t status = file_cache_write(FileCache(), NULL, pos, buffer, _length);
1598 
1599 	if (transaction.IsStarted())
1600 		WriteLockInTransaction(transaction);
1601 
1602 	return status;
1603 }
1604 
1605 
1606 /*!	Fills the gap between the old file size and the new file size
1607 	with zeros.
1608 	It's more or less a copy of Inode::WriteAt() but it can handle
1609 	length differences of more than just 4 GB, and it never uses
1610 	the log, even if the INODE_LOGGED flag is set.
1611 */
1612 status_t
1613 Inode::FillGapWithZeros(off_t pos, off_t newSize)
1614 {
1615 	while (pos < newSize) {
1616 		size_t size;
1617 		if (newSize > pos + 1024 * 1024 * 1024)
1618 			size = 1024 * 1024 * 1024;
1619 		else
1620 			size = newSize - pos;
1621 
1622 		status_t status = file_cache_write(FileCache(), NULL, pos, NULL, &size);
1623 		if (status < B_OK)
1624 			return status;
1625 
1626 		pos += size;
1627 	}
1628 
1629 	return B_OK;
1630 }
1631 
1632 
1633 /*!	Allocates \a length blocks, and clears their contents. Growing
1634 	the indirect and double indirect range uses this method.
1635 	The allocated block_run is saved in "run"
1636 */
1637 status_t
1638 Inode::_AllocateBlockArray(Transaction& transaction, block_run& run,
1639 	size_t length, bool variableSize)
1640 {
1641 	if (!run.IsZero())
1642 		return B_BAD_VALUE;
1643 
1644 	status_t status = fVolume->Allocate(transaction, this, length, run,
1645 		variableSize ? 1 : length);
1646 	if (status != B_OK)
1647 		return status;
1648 
1649 	// make sure those blocks are empty
1650 	CachedBlock cached(fVolume);
1651 	off_t block = fVolume->ToBlock(run);
1652 
1653 	for (int32 i = 0; i < run.Length(); i++) {
1654 		block_run* runs = (block_run*)cached.SetToWritable(transaction,
1655 			block + i, true);
1656 		if (runs == NULL)
1657 			return B_IO_ERROR;
1658 	}
1659 	return B_OK;
1660 }
1661 
1662 
1663 /*!	Grows the stream to \a size, and fills the direct/indirect/double indirect
1664 	ranges with the runs.
1665 	This method will also determine the size of the preallocation, if any.
1666 */
1667 status_t
1668 Inode::_GrowStream(Transaction& transaction, off_t size)
1669 {
1670 	data_stream* data = &Node().data;
1671 
1672 	// is the data stream already large enough to hold the new size?
1673 	// (can be the case with preallocated blocks)
1674 	if (size < data->MaxDirectRange()
1675 		|| size < data->MaxIndirectRange()
1676 		|| size < data->MaxDoubleIndirectRange()) {
1677 		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1678 		return B_OK;
1679 	}
1680 
1681 	// how many bytes are still needed? (unused ranges are always zero)
1682 	uint16 minimum = 1;
1683 	off_t bytes;
1684 	if (data->Size() < data->MaxDoubleIndirectRange()) {
1685 		bytes = size - data->MaxDoubleIndirectRange();
1686 		// The double indirect range can only handle multiples of
1687 		// its base length
1688 		minimum = data->double_indirect.Length();
1689 	} else if (data->Size() < data->MaxIndirectRange())
1690 		bytes = size - data->MaxIndirectRange();
1691 	else if (data->Size() < data->MaxDirectRange())
1692 		bytes = size - data->MaxDirectRange();
1693 	else {
1694 		// no preallocation left to be used
1695 		bytes = size - data->Size();
1696 		if (data->MaxDoubleIndirectRange() > 0)
1697 			minimum = data->double_indirect.Length();
1698 	}
1699 
1700 	// do we have enough free blocks on the disk?
1701 	off_t blocksNeeded = (bytes + fVolume->BlockSize() - 1)
1702 		>> fVolume->BlockShift();
1703 	if (blocksNeeded > fVolume->FreeBlocks())
1704 		return B_DEVICE_FULL;
1705 
1706 	off_t blocksRequested = blocksNeeded;
1707 		// because of preallocations and partial allocations, the number of
1708 		// blocks we need to allocate may be different from the one we request
1709 		// from the block allocator
1710 
1711 	// Should we preallocate some blocks?
1712 	// Attributes, attribute directories, and long symlinks usually won't get
1713 	// that big, and should stay close to the inode - preallocating could be
1714 	// counterproductive.
1715 	// Also, if free disk space is tight, don't preallocate.
1716 	if (!IsAttribute() && !IsAttributeDirectory() && !IsSymLink()
1717 		&& fVolume->FreeBlocks() > 128) {
1718 		off_t roundTo = 0;
1719 		if (IsFile()) {
1720 			// Request preallocated blocks depending on the file size and growth
1721 			if (size < 1 * 1024 * 1024 && bytes < 512 * 1024) {
1722 				// Preallocate 64 KB for file sizes <1 MB and grow rates <512 KB
1723 				roundTo = 65536 >> fVolume->BlockShift();
1724 			} else if (size < 32 * 1024 * 1024 && bytes <= 1 * 1024 * 1024) {
1725 				// Preallocate 512 KB for file sizes between 1 MB and 32 MB, and
1726 				// grow rates smaller than 1 MB
1727 				roundTo = (512 * 1024) >> fVolume->BlockShift();
1728 			} else {
1729 				// Preallocate 1/16 of the file size (ie. 4 MB for 64 MB,
1730 				// 64 MB for 1 GB)
1731 				roundTo = size >> (fVolume->BlockShift() + 4);
1732 			}
1733 		} else if (IsIndex()) {
1734 			// Always preallocate 64 KB for index directories
1735 			roundTo = 65536 >> fVolume->BlockShift();
1736 		} else {
1737 			// Preallocate only 4 KB - directories only get trimmed when their
1738 			// vnode is flushed, which might not happen very often.
1739 			roundTo = 4096 >> fVolume->BlockShift();
1740 		}
1741 		if (roundTo > 1) {
1742 			// Round to next "roundTo" block count
1743 			blocksRequested = ((blocksNeeded + roundTo) / roundTo) * roundTo;
1744 		}
1745 	}
1746 
1747 	while (blocksNeeded > 0) {
1748 		// the requested blocks do not need to be returned with a
1749 		// single allocation, so we need to iterate until we have
1750 		// enough blocks allocated
1751 		if (minimum > 1) {
1752 			// make sure that "blocks" is a multiple of minimum
1753 			blocksRequested = round_up(blocksRequested, minimum);
1754 		}
1755 
1756 		block_run run;
1757 		status_t status = fVolume->Allocate(transaction, this, blocksRequested,
1758 			run, minimum);
1759 		if (status != B_OK)
1760 			return status;
1761 
1762 		// okay, we have the needed blocks, so just distribute them to the
1763 		// different ranges of the stream (direct, indirect & double indirect)
1764 
1765 		blocksNeeded -= run.Length();
1766 		// don't preallocate if the first allocation was already too small
1767 		blocksRequested = blocksNeeded;
1768 
1769 		// Direct block range
1770 
1771 		if (data->Size() <= data->MaxDirectRange()) {
1772 			// let's try to put them into the direct block range
1773 			int32 free = 0;
1774 			for (; free < NUM_DIRECT_BLOCKS; free++) {
1775 				if (data->direct[free].IsZero())
1776 					break;
1777 			}
1778 
1779 			if (free < NUM_DIRECT_BLOCKS) {
1780 				// can we merge the last allocated run with the new one?
1781 				int32 last = free - 1;
1782 				if (free > 0 && data->direct[last].MergeableWith(run)) {
1783 					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(
1784 						data->direct[last].Length() + run.Length());
1785 				} else
1786 					data->direct[free] = run;
1787 
1788 				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(
1789 					data->MaxDirectRange()
1790 					+ run.Length() * fVolume->BlockSize());
1791 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1792 					? data->max_direct_range : size);
1793 				continue;
1794 			}
1795 		}
1796 
1797 		// Indirect block range
1798 
1799 		if (data->Size() <= data->MaxIndirectRange()
1800 			|| !data->MaxIndirectRange()) {
1801 			CachedBlock cached(fVolume);
1802 			block_run* runs = NULL;
1803 			uint32 free = 0;
1804 			off_t block;
1805 
1806 			// if there is no indirect block yet, create one
1807 			if (data->indirect.IsZero()) {
1808 				status = _AllocateBlockArray(transaction, data->indirect,
1809 					NUM_ARRAY_BLOCKS, true);
1810 				if (status != B_OK)
1811 					return status;
1812 
1813 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1814 					data->MaxDirectRange());
1815 				// insert the block_run in the first block
1816 				runs = (block_run*)cached.SetTo(data->indirect);
1817 			} else {
1818 				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1819 				block = fVolume->ToBlock(data->indirect);
1820 
1821 				// search first empty entry
1822 				int32 i = 0;
1823 				for (; i < data->indirect.Length(); i++) {
1824 					if ((runs = (block_run*)cached.SetTo(block + i)) == NULL)
1825 						return B_IO_ERROR;
1826 
1827 					for (free = 0; free < numberOfRuns; free++)
1828 						if (runs[free].IsZero())
1829 							break;
1830 
1831 					if (free < numberOfRuns)
1832 						break;
1833 				}
1834 				if (i == data->indirect.Length())
1835 					runs = NULL;
1836 			}
1837 
1838 			if (runs != NULL) {
1839 				// try to insert the run to the last one - note that this
1840 				// doesn't take block borders into account, so it could be
1841 				// further optimized
1842 				cached.MakeWritable(transaction);
1843 
1844 				int32 last = free - 1;
1845 				if (free > 0 && runs[last].MergeableWith(run)) {
1846 					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(
1847 						runs[last].Length() + run.Length());
1848 				} else
1849 					runs[free] = run;
1850 
1851 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1852 					data->MaxIndirectRange()
1853 					+ (run.Length() << fVolume->BlockShift()));
1854 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1855 					? data->MaxIndirectRange() : size);
1856 				continue;
1857 			}
1858 		}
1859 
1860 		// Double indirect block range
1861 
1862 		if (data->Size() <= data->MaxDoubleIndirectRange()
1863 			|| !data->max_double_indirect_range) {
1864 			// We make sure here that we have this minimum allocated, so if
1865 			// the allocation succeeds, we don't run into an endless loop.
1866 			if (!data->max_double_indirect_range)
1867 				minimum = _DoubleIndirectBlockLength();
1868 			else
1869 				minimum = data->double_indirect.Length();
1870 
1871 			if ((run.Length() % minimum) != 0) {
1872 				// The number of allocated blocks isn't a multiple of 'minimum',
1873 				// so we have to change this. This can happen the first time the
1874 				// stream grows into the double indirect range.
1875 				// First, free the remaining blocks that don't fit into this
1876 				// multiple.
1877 				int32 rest = run.Length() % minimum;
1878 				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1879 
1880 				status = fVolume->Free(transaction,
1881 					block_run::Run(run.AllocationGroup(),
1882 					run.Start() + run.Length(), rest));
1883 				if (status != B_OK)
1884 					return status;
1885 
1886 				blocksNeeded += rest;
1887 				blocksRequested = round_up(blocksNeeded, minimum);
1888 
1889 				// Are there any blocks left in the run? If not, allocate
1890 				// a new one
1891 				if (run.length == 0)
1892 					continue;
1893 			}
1894 
1895 			// if there is no double indirect block yet, create one
1896 			if (data->double_indirect.IsZero()) {
1897 				status = _AllocateBlockArray(transaction,
1898 					data->double_indirect, _DoubleIndirectBlockLength());
1899 				if (status != B_OK)
1900 					return status;
1901 
1902 				data->max_double_indirect_range = data->max_indirect_range;
1903 			}
1904 
1905 			// calculate the index where to insert the new blocks
1906 
1907 			int32 runsPerBlock;
1908 			int32 directSize;
1909 			int32 indirectSize;
1910 			get_double_indirect_sizes(data->double_indirect.Length(),
1911 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1912 
1913 			off_t start = data->MaxDoubleIndirectRange()
1914 				- data->MaxIndirectRange();
1915 			int32 indirectIndex = start / indirectSize;
1916 			int32 index = (start % indirectSize) / directSize;
1917 			int32 runsPerArray = runsPerBlock * minimum;
1918 
1919 			// distribute the blocks to the array and allocate
1920 			// new array blocks when needed
1921 
1922 			CachedBlock cached(fVolume);
1923 			CachedBlock cachedDirect(fVolume);
1924 			block_run* array = NULL;
1925 			uint32 runLength = run.Length();
1926 
1927 			while (run.length != 0) {
1928 				// get the indirect array block
1929 				if (array == NULL) {
1930 					uint32 block = indirectIndex / runsPerBlock;
1931 					if (block >= minimum)
1932 						return EFBIG;
1933 
1934 					array = (block_run*)cached.SetTo(fVolume->ToBlock(
1935 						data->double_indirect) + block);
1936 					if (array == NULL)
1937 						return B_IO_ERROR;
1938 				}
1939 
1940 				do {
1941 					// do we need a new array block?
1942 					if (array[indirectIndex % runsPerBlock].IsZero()) {
1943 						cached.MakeWritable(transaction);
1944 
1945 						status = _AllocateBlockArray(transaction,
1946 							array[indirectIndex % runsPerBlock],
1947 							data->double_indirect.Length());
1948 						if (status != B_OK)
1949 							return status;
1950 					}
1951 
1952 					block_run* runs = (block_run*)cachedDirect.SetToWritable(
1953 						transaction, fVolume->ToBlock(array[indirectIndex
1954 							% runsPerBlock]) + index / runsPerBlock);
1955 					if (runs == NULL)
1956 						return B_IO_ERROR;
1957 
1958 					do {
1959 						// insert the block_run into the array
1960 						runs[index % runsPerBlock] = run;
1961 						runs[index % runsPerBlock].length
1962 							= HOST_ENDIAN_TO_BFS_INT16(minimum);
1963 
1964 						// alter the remaining block_run
1965 						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start()
1966 							+ minimum);
1967 						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
1968 							- minimum);
1969 					} while ((++index % runsPerBlock) != 0 && run.length);
1970 				} while ((index % runsPerArray) != 0 && run.length);
1971 
1972 				if (index == runsPerArray)
1973 					index = 0;
1974 				if (++indirectIndex % runsPerBlock == 0) {
1975 					array = NULL;
1976 					index = 0;
1977 				}
1978 			}
1979 
1980 			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1981 				data->MaxDoubleIndirectRange()
1982 				+ (runLength << fVolume->BlockShift()));
1983 			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(
1984 				data->max_double_indirect_range) : size;
1985 
1986 			continue;
1987 		}
1988 
1989 		RETURN_ERROR(EFBIG);
1990 	}
1991 	// update the size of the data stream
1992 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1993 
1994 	return B_OK;
1995 }
1996 
1997 
1998 size_t
1999 Inode::_DoubleIndirectBlockLength() const
2000 {
2001 	if (fVolume->BlockSize() > DOUBLE_INDIRECT_ARRAY_SIZE)
2002 		return 1;
2003 
2004 	return DOUBLE_INDIRECT_ARRAY_SIZE / fVolume->BlockSize();
2005 }
2006 
2007 
2008 /*!	Frees the statically sized array of the double indirect part of a data
2009 	stream.
2010 */
2011 status_t
2012 Inode::_FreeStaticStreamArray(Transaction& transaction, int32 level,
2013 	block_run run, off_t size, off_t offset, off_t& max)
2014 {
2015 	int32 indirectSize;
2016 	if (level == 0) {
2017 		indirectSize = double_indirect_max_indirect_size(run.Length(),
2018 			fVolume->BlockSize());
2019 	} else {
2020 		indirectSize = double_indirect_max_direct_size(run.Length(),
2021 			fVolume->BlockSize());
2022 	}
2023 
2024 	off_t start;
2025 	if (size > offset)
2026 		start = size - offset;
2027 	else
2028 		start = 0;
2029 
2030 	int32 index = start / indirectSize;
2031 	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
2032 
2033 	CachedBlock cached(fVolume);
2034 	off_t blockNumber = fVolume->ToBlock(run);
2035 
2036 	// set the file offset to the current block run
2037 	offset += (off_t)index * indirectSize;
2038 
2039 	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
2040 		block_run* array = (block_run*)cached.SetToWritable(transaction,
2041 			blockNumber + i);
2042 		if (array == NULL)
2043 			RETURN_ERROR(B_ERROR);
2044 
2045 		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
2046 			if (array[index].IsZero()) {
2047 				// we also want to break out of the outer loop
2048 				i = run.Length();
2049 				break;
2050 			}
2051 
2052 			status_t status = B_OK;
2053 			if (level == 0) {
2054 				status = _FreeStaticStreamArray(transaction, 1, array[index],
2055 					size, offset, max);
2056 			} else if (offset >= size)
2057 				status = fVolume->Free(transaction, array[index]);
2058 			else
2059 				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
2060 
2061 			if (status < B_OK)
2062 				RETURN_ERROR(status);
2063 
2064 			if (offset >= size)
2065 				array[index].SetTo(0, 0, 0);
2066 
2067 			offset += indirectSize;
2068 		}
2069 		index = 0;
2070 	}
2071 	return B_OK;
2072 }
2073 
2074 
2075 /*!	Frees all block_runs in the array which come after the specified size.
2076 	It also trims the last block_run that contain the size.
2077 	"offset" and "max" are maintained until the last block_run that doesn't
2078 	have to be freed - after this, the values won't be correct anymore, but
2079 	will still assure correct function for all subsequent calls.
2080 	"max" is considered to be in file system byte order.
2081 */
2082 status_t
2083 Inode::_FreeStreamArray(Transaction& transaction, block_run* array,
2084 	uint32 arrayLength, off_t size, off_t& offset, off_t& max)
2085 {
2086 	PRINT(("FreeStreamArray: arrayLength %lu, size %Ld, offset %Ld, max %Ld\n",
2087 		arrayLength, size, offset, max));
2088 
2089 	off_t newOffset = offset;
2090 	uint32 i = 0;
2091 	for (; i < arrayLength; i++, offset = newOffset) {
2092 		if (array[i].IsZero())
2093 			break;
2094 
2095 		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
2096 		if (newOffset <= size)
2097 			continue;
2098 
2099 		block_run run = array[i];
2100 
2101 		// determine the block_run to be freed
2102 		if (newOffset > size && offset < size) {
2103 			// free partial block_run (and update the original block_run)
2104 			run.start = HOST_ENDIAN_TO_BFS_INT16(array[i].Start()
2105 				+ ((size + fVolume->BlockSize() - 1 - offset)
2106 					>> fVolume->BlockShift()));
2107 				// round to next block
2108 			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2109 				- array[i].Start());
2110 			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2111 				- array[i].Length());
2112 
2113 			if (run.length == 0)
2114 				continue;
2115 
2116 			// update maximum range
2117 			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length()
2118 				<< fVolume->BlockShift()));
2119 		} else {
2120 			// free the whole block_run
2121 			array[i].SetTo(0, 0, 0);
2122 
2123 			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
2124 				max = HOST_ENDIAN_TO_BFS_INT64(offset);
2125 		}
2126 
2127 		if (fVolume->Free(transaction, run) < B_OK)
2128 			return B_IO_ERROR;
2129 	}
2130 	return B_OK;
2131 }
2132 
2133 
2134 status_t
2135 Inode::_ShrinkStream(Transaction& transaction, off_t size)
2136 {
2137 	data_stream* data = &Node().data;
2138 	status_t status;
2139 
2140 	if (data->MaxDoubleIndirectRange() > size) {
2141 		off_t* maxDoubleIndirect = &data->max_double_indirect_range;
2142 			// gcc 4 work-around: "error: cannot bind packed field
2143 			// 'data->data_stream::max_double_indirect_range' to 'off_t&'"
2144 		status = _FreeStaticStreamArray(transaction, 0, data->double_indirect,
2145 			size, data->MaxIndirectRange(), *maxDoubleIndirect);
2146 		if (status != B_OK)
2147 			return status;
2148 
2149 		if (size <= data->MaxIndirectRange()) {
2150 			fVolume->Free(transaction, data->double_indirect);
2151 			data->double_indirect.SetTo(0, 0, 0);
2152 			data->max_double_indirect_range = 0;
2153 		}
2154 	}
2155 
2156 	if (data->MaxIndirectRange() > size) {
2157 		CachedBlock cached(fVolume);
2158 		off_t block = fVolume->ToBlock(data->indirect);
2159 		off_t offset = data->MaxDirectRange();
2160 
2161 		for (int32 i = 0; i < data->indirect.Length(); i++) {
2162 			block_run* array = (block_run*)cached.SetToWritable(transaction,
2163 				block + i);
2164 			if (array == NULL)
2165 				break;
2166 
2167 			off_t* maxIndirect = &data->max_indirect_range;
2168 				// gcc 4 work-around: "error: cannot bind packed field
2169 				// 'data->data_stream::max_indirect_range' to 'off_t&'"
2170 			if (_FreeStreamArray(transaction, array, fVolume->BlockSize()
2171 					/ sizeof(block_run), size, offset, *maxIndirect) != B_OK)
2172 				return B_IO_ERROR;
2173 		}
2174 		if (data->max_direct_range == data->max_indirect_range) {
2175 			fVolume->Free(transaction, data->indirect);
2176 			data->indirect.SetTo(0, 0, 0);
2177 			data->max_indirect_range = 0;
2178 		}
2179 	}
2180 
2181 	if (data->MaxDirectRange() > size) {
2182 		off_t offset = 0;
2183 		off_t *maxDirect = &data->max_direct_range;
2184 			// gcc 4 work-around: "error: cannot bind packed field
2185 			// 'data->data_stream::max_direct_range' to 'off_t&'"
2186 		status = _FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS,
2187 			size, offset, *maxDirect);
2188 		if (status < B_OK)
2189 			return status;
2190 	}
2191 
2192 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2193 	return B_OK;
2194 }
2195 
2196 
2197 status_t
2198 Inode::SetFileSize(Transaction& transaction, off_t size)
2199 {
2200 	if (size < 0)
2201 		return B_BAD_VALUE;
2202 
2203 	off_t oldSize = Size();
2204 
2205 	if (size == oldSize)
2206 		return B_OK;
2207 
2208 	T(Resize(this, oldSize, size, false));
2209 
2210 	// should the data stream grow or shrink?
2211 	status_t status;
2212 	if (size > oldSize) {
2213 		status = _GrowStream(transaction, size);
2214 		if (status < B_OK) {
2215 			// if the growing of the stream fails, the whole operation
2216 			// fails, so we should shrink the stream to its former size
2217 			_ShrinkStream(transaction, oldSize);
2218 		}
2219 	} else
2220 		status = _ShrinkStream(transaction, size);
2221 
2222 	if (status < B_OK)
2223 		return status;
2224 
2225 	file_cache_set_size(FileCache(), size);
2226 	file_map_set_size(Map(), size);
2227 
2228 	return WriteBack(transaction);
2229 }
2230 
2231 
2232 status_t
2233 Inode::Append(Transaction& transaction, off_t bytes)
2234 {
2235 	return SetFileSize(transaction, Size() + bytes);
2236 }
2237 
2238 
2239 /*!	Checks whether or not this inode's data stream needs to be trimmed
2240 	because of an earlier preallocation.
2241 	Returns true if there are any blocks to be trimmed.
2242 */
2243 bool
2244 Inode::NeedsTrimming() const
2245 {
2246 	// We never trim preallocated index blocks to make them grow as smooth as
2247 	// possible. There are only few indices anyway, so this doesn't hurt.
2248 	// Also, if an inode is already in deleted state, we don't bother trimming
2249 	// it.
2250 	if (IsIndex() || IsDeleted()
2251 		|| (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0))
2252 		return false;
2253 
2254 	off_t roundedSize = round_up(Size(), fVolume->BlockSize());
2255 
2256 	return Node().data.MaxDirectRange() > roundedSize
2257 		|| Node().data.MaxIndirectRange() > roundedSize
2258 		|| Node().data.MaxDoubleIndirectRange() > roundedSize;
2259 }
2260 
2261 
2262 status_t
2263 Inode::TrimPreallocation(Transaction& transaction)
2264 {
2265 	T(Resize(this, max_c(Node().data.MaxDirectRange(),
2266 		Node().data.MaxIndirectRange()), Size(), true));
2267 
2268 	status_t status = _ShrinkStream(transaction, Size());
2269 	if (status < B_OK)
2270 		return status;
2271 
2272 	return WriteBack(transaction);
2273 }
2274 
2275 
2276 //!	Frees the file's data stream and removes all attributes
2277 status_t
2278 Inode::Free(Transaction& transaction)
2279 {
2280 	FUNCTION();
2281 
2282 	// Perhaps there should be an implementation of Inode::ShrinkStream() that
2283 	// just frees the data_stream, but doesn't change the inode (since it is
2284 	// freed anyway) - that would make an undelete command possible
2285 	if (!IsSymLink() || (Flags() & INODE_LONG_SYMLINK) != 0) {
2286 		status_t status = SetFileSize(transaction, 0);
2287 		if (status < B_OK)
2288 			return status;
2289 	}
2290 
2291 	// Free all attributes, and remove their indices
2292 	{
2293 		// We have to limit the scope of AttributeIterator, so that its
2294 		// destructor is not called after the inode is deleted
2295 		AttributeIterator iterator(this);
2296 
2297 		char name[B_FILE_NAME_LENGTH];
2298 		uint32 type;
2299 		size_t length;
2300 		ino_t id;
2301 		while (iterator.GetNext(name, &length, &type, &id) == B_OK) {
2302 			RemoveAttribute(transaction, name);
2303 		}
2304 	}
2305 
2306 	if (WriteBack(transaction) < B_OK)
2307 		return B_IO_ERROR;
2308 
2309 	return fVolume->Free(transaction, BlockRun());
2310 }
2311 
2312 
2313 //!	Synchronizes (writes back to disk) the file stream of the inode.
2314 status_t
2315 Inode::Sync()
2316 {
2317 	if (FileCache())
2318 		return file_cache_sync(FileCache());
2319 
2320 	// We may also want to flush the attribute's data stream to
2321 	// disk here... (do we?)
2322 
2323 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0)
2324 		return B_OK;
2325 
2326 	InodeReadLocker locker(this);
2327 
2328 	data_stream* data = &Node().data;
2329 	status_t status = B_OK;
2330 
2331 	// flush direct range
2332 
2333 	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
2334 		if (data->direct[i].IsZero())
2335 			return B_OK;
2336 
2337 		status = block_cache_sync_etc(fVolume->BlockCache(),
2338 			fVolume->ToBlock(data->direct[i]), data->direct[i].Length());
2339 		if (status != B_OK)
2340 			return status;
2341 	}
2342 
2343 	// flush indirect range
2344 
2345 	if (data->max_indirect_range == 0)
2346 		return B_OK;
2347 
2348 	CachedBlock cached(fVolume);
2349 	off_t block = fVolume->ToBlock(data->indirect);
2350 	int32 count = fVolume->BlockSize() / sizeof(block_run);
2351 
2352 	for (int32 j = 0; j < data->indirect.Length(); j++) {
2353 		block_run* runs = (block_run*)cached.SetTo(block + j);
2354 		if (runs == NULL)
2355 			break;
2356 
2357 		for (int32 i = 0; i < count; i++) {
2358 			if (runs[i].IsZero())
2359 				return B_OK;
2360 
2361 			status = block_cache_sync_etc(fVolume->BlockCache(),
2362 				fVolume->ToBlock(runs[i]), runs[i].Length());
2363 			if (status != B_OK)
2364 				return status;
2365 		}
2366 	}
2367 
2368 	// flush double indirect range
2369 
2370 	if (data->max_double_indirect_range == 0)
2371 		return B_OK;
2372 
2373 	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
2374 
2375 	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
2376 		block_run* indirectRuns = (block_run*)cached.SetTo(indirectBlock + l);
2377 		if (indirectRuns == NULL)
2378 			return B_FILE_ERROR;
2379 
2380 		CachedBlock directCached(fVolume);
2381 
2382 		for (int32 k = 0; k < count; k++) {
2383 			if (indirectRuns[k].IsZero())
2384 				return B_OK;
2385 
2386 			block = fVolume->ToBlock(indirectRuns[k]);
2387 			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
2388 				block_run* runs = (block_run*)directCached.SetTo(block + j);
2389 				if (runs == NULL)
2390 					return B_FILE_ERROR;
2391 
2392 				for (int32 i = 0; i < count; i++) {
2393 					if (runs[i].IsZero())
2394 						return B_OK;
2395 
2396 					// TODO: combine single block_runs to bigger ones when
2397 					// they are adjacent
2398 					status = block_cache_sync_etc(fVolume->BlockCache(),
2399 						fVolume->ToBlock(runs[i]), runs[i].Length());
2400 					if (status != B_OK)
2401 						return status;
2402 				}
2403 			}
2404 		}
2405 	}
2406 	return B_OK;
2407 }
2408 
2409 
2410 //	#pragma mark - TransactionListener implementation
2411 
2412 
2413 void
2414 Inode::TransactionDone(bool success)
2415 {
2416 	if (!success) {
2417 		// revert any changes made to the cached bfs_inode
2418 		UpdateNodeFromDisk();
2419 	}
2420 }
2421 
2422 
2423 void
2424 Inode::RemovedFromTransaction()
2425 {
2426 	Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
2427 
2428 	// See AddInode() why we do this here
2429 	if ((Flags() & INODE_DELETED) != 0)
2430 		fVolume->RemovedInodes().Add(this);
2431 
2432 	rw_lock_write_unlock(&Lock());
2433 
2434 	if (!fVolume->IsInitializing())
2435 		put_vnode(fVolume->FSVolume(), ID());
2436 }
2437 
2438 
2439 //	#pragma mark - creation/deletion
2440 
2441 
2442 status_t
2443 Inode::Remove(Transaction& transaction, const char* name, ino_t* _id,
2444 	bool isDirectory, bool force)
2445 {
2446 	if (fTree == NULL)
2447 		RETURN_ERROR(B_BAD_VALUE);
2448 
2449 	WriteLockInTransaction(transaction);
2450 
2451 	// does the file even exist?
2452 	off_t id;
2453 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) < B_OK)
2454 		return B_ENTRY_NOT_FOUND;
2455 
2456 	if (_id)
2457 		*_id = id;
2458 
2459 	Vnode vnode(fVolume, id);
2460 	Inode* inode;
2461 	status_t status = vnode.Get(&inode);
2462 	if (status < B_OK) {
2463 		REPORT_ERROR(status);
2464 		return B_ENTRY_NOT_FOUND;
2465 	}
2466 
2467 	T(Remove(inode, name));
2468 	inode->WriteLockInTransaction(transaction);
2469 
2470 	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
2471 	// bit is set for indices in BFS, not for attribute directories) - but you
2472 	// should really be able to do whatever you want with your indices
2473 	// without having to remove all files first :)
2474 	if (!inode->IsIndex() && !force) {
2475 		// if it's not of the correct type, don't delete it!
2476 		if (inode->IsContainer() != isDirectory)
2477 			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
2478 
2479 		// only delete empty directories
2480 		if (isDirectory && !inode->IsEmpty())
2481 			return B_DIRECTORY_NOT_EMPTY;
2482 	}
2483 
2484 	// remove_vnode() allows the inode to be accessed until the last put_vnode()
2485 	status = remove_vnode(fVolume->FSVolume(), id);
2486 	if (status != B_OK)
2487 		return status;
2488 
2489 	if (fTree->Remove(transaction, name, id) != B_OK && !force) {
2490 		unremove_vnode(fVolume->FSVolume(), id);
2491 		RETURN_ERROR(B_ERROR);
2492 	}
2493 
2494 #ifdef DEBUG
2495 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) == B_OK) {
2496 		DIE(("deleted entry still there"));
2497 	}
2498 #endif
2499 
2500 	ContainerContentsChanged(transaction);
2501 
2502 	// update the inode, so that no one will ever doubt it's deleted :-)
2503 	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
2504 	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
2505 
2506 	// In balance to the Inode::Create() method, the main indices
2507 	// are updated here (name, size, & last_modified)
2508 
2509 	Index index(fVolume);
2510 	if (inode->InNameIndex()) {
2511 		index.RemoveName(transaction, name, inode);
2512 			// If removing from the index fails, it is not regarded as a
2513 			// fatal error and will not be reported back!
2514 			// Deleted inodes won't be visible in queries anyway.
2515 	}
2516 
2517 	if (inode->InSizeIndex())
2518 		index.RemoveSize(transaction, inode);
2519 	if (inode->InLastModifiedIndex())
2520 		index.RemoveLastModified(transaction, inode);
2521 
2522 	return inode->WriteBack(transaction);
2523 }
2524 
2525 
2526 /*!	Creates the inode with the specified \a parent directory, and automatically
2527 	adds the created inode to that parent directory. If an attribute directory
2528 	is created, it will also automatically  be added to the \a parent inode as
2529 	such. However, the indices root node, and the regular root node won't be
2530 	added to the super block.
2531 	It will also create the initial B+tree for the inode if it's a directory
2532 	of any kind.
2533 	\a name may be \c NULL, but only if no \a parent is given.
2534 	If the "_id" or "_inode" variable is given and non-NULL to store the
2535 	inode's ID, the inode stays locked - you have to call put_vnode() if you
2536 	don't use it anymore.
2537 	If the node already exists, this method will fail if \c O_EXCL is set, or
2538 	it's a directory or a symlink. Otherwise, it will just be returned.
2539 	If \c O_TRUNC has been specified, the file will also be truncated.
2540 */
2541 status_t
2542 Inode::Create(Transaction& transaction, Inode* parent, const char* name,
2543 	int32 mode, int openMode, uint32 type, bool* _created, ino_t* _id,
2544 	Inode** _inode, fs_vnode_ops* vnodeOps, uint32 publishFlags)
2545 {
2546 	FUNCTION_START(("name = %s, mode = %ld\n", name, mode));
2547 
2548 	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
2549 	Volume* volume = transaction.GetVolume();
2550 	BPlusTree* tree = NULL;
2551 
2552 	if (parent != NULL && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
2553 		// check if the file already exists in the directory
2554 		tree = parent->Tree();
2555 	}
2556 
2557 	if (parent != NULL) {
2558 		// the parent directory is locked during the whole inode creation
2559 		parent->WriteLockInTransaction(transaction);
2560 	}
2561 
2562 	if (parent != NULL && !volume->IsInitializing() && parent->IsContainer()) {
2563 		// don't create anything in removed directories
2564 		bool removed;
2565 		if (get_vnode_removed(volume->FSVolume(), parent->ID(), &removed)
2566 				== B_OK && removed) {
2567 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
2568 		}
2569 	}
2570 
2571 	if (tree != NULL) {
2572 		// Does the file already exist?
2573 		off_t offset;
2574 		if (tree->Find((uint8*)name, (uint16)strlen(name), &offset) == B_OK) {
2575 			// Return if the file should be a directory/link or opened in
2576 			// exclusive mode
2577 			if (S_ISDIR(mode) || S_ISLNK(mode) || (openMode & O_EXCL) != 0)
2578 				return B_FILE_EXISTS;
2579 
2580 			Vnode vnode(volume, offset);
2581 			Inode* inode;
2582 			status_t status = vnode.Get(&inode);
2583 			if (status != B_OK) {
2584 				REPORT_ERROR(status);
2585 				return B_ENTRY_NOT_FOUND;
2586 			}
2587 
2588 			if (inode->IsDirectory() && (openMode & O_RWMASK) != O_RDONLY)
2589 				return B_IS_A_DIRECTORY;
2590 			if ((openMode & O_DIRECTORY) != 0 && !inode->IsDirectory())
2591 				return B_NOT_A_DIRECTORY;
2592 
2593 			// we want to open the file, so we should have the rights to do so
2594 			if (inode->CheckPermissions(open_mode_to_access(openMode)
2595 					| ((openMode & O_TRUNC) != 0 ? W_OK : 0)) != B_OK)
2596 				return B_NOT_ALLOWED;
2597 
2598 			if ((openMode & O_TRUNC) != 0) {
2599 				// truncate the existing file
2600 				inode->WriteLockInTransaction(transaction);
2601 
2602 				status_t status = inode->SetFileSize(transaction, 0);
2603 				if (status == B_OK)
2604 					status = inode->WriteBack(transaction);
2605 
2606 				if (status != B_OK)
2607 					return status;
2608 			}
2609 
2610 			if (_created)
2611 				*_created = false;
2612 			if (_id)
2613 				*_id = inode->ID();
2614 			if (_inode)
2615 				*_inode = inode;
2616 
2617 			// Only keep the vnode in memory if the _id or _inode pointer is
2618 			// provided
2619 			if (_id != NULL || _inode != NULL)
2620 				vnode.Keep();
2621 
2622 			return B_OK;
2623 		}
2624 	} else if (parent != NULL && (mode & S_ATTR_DIR) == 0) {
2625 		return B_BAD_VALUE;
2626 	} else if ((openMode & O_DIRECTORY) != 0) {
2627 		// TODO: we might need to return B_NOT_A_DIRECTORY here
2628 		return B_ENTRY_NOT_FOUND;
2629 	}
2630 
2631 	status_t status;
2632 
2633 	// do we have the power to create new files at all?
2634 	if (parent != NULL && (status = parent->CheckPermissions(W_OK)) != B_OK)
2635 		RETURN_ERROR(status);
2636 
2637 	// allocate space for the new inode
2638 	InodeAllocator allocator(transaction);
2639 	block_run run;
2640 	Inode* inode;
2641 	status = allocator.New(&parentRun, mode, run, vnodeOps, &inode);
2642 	if (status < B_OK)
2643 		return status;
2644 
2645 	T(Create(inode, parent, name, mode, openMode, type));
2646 
2647 	// Initialize the parts of the bfs_inode structure that
2648 	// InodeAllocator::New() hasn't touched yet
2649 
2650 	bfs_inode* node = &inode->Node();
2651 
2652 	if (parent == NULL) {
2653 		// we set the parent to itself in this case
2654 		// (only happens for the root and indices node)
2655 		node->parent = run;
2656 	} else
2657 		node->parent = parentRun;
2658 
2659 	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
2660 	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent
2661 		? parent->Node().GroupID() : getegid());
2662 		// the group ID is inherited from the parent, if available
2663 
2664 	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
2665 
2666 	inode->WriteBack(transaction);
2667 		// make sure the initialized node is available to others
2668 
2669 	// only add the name to regular files, directories, or symlinks
2670 	// don't add it to attributes, or indices
2671 	if (tree && inode->IsRegularNode()
2672 		&& inode->SetName(transaction, name) != B_OK)
2673 		return B_ERROR;
2674 
2675 	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2676 	// a standard directory for files - not for attributes or indices)
2677 	if (inode->IsContainer()) {
2678 		status = allocator.CreateTree();
2679 		if (status != B_OK)
2680 			return status;
2681 	}
2682 
2683 	// Add a link to the inode from the parent, depending on its type
2684 	// (the vnode is not published yet, so it is safe to make the inode
2685 	// accessable to the file system here)
2686 	if (tree != NULL) {
2687 		status = tree->Insert(transaction, name, inode->ID());
2688 	} else if (parent != NULL && (mode & S_ATTR_DIR) != 0) {
2689 		parent->Attributes() = run;
2690 		status = parent->WriteBack(transaction);
2691 	}
2692 
2693 	// Note, we only care if the inode could be made accessable for the
2694 	// two cases above; the root node or the indices root node must
2695 	// handle this case on their own (or other cases where "parent" is
2696 	// NULL)
2697 	if (status != B_OK)
2698 		RETURN_ERROR(status);
2699 
2700 	// Update the main indices (name, size & last_modified)
2701 	// (live queries might want to access us after this)
2702 
2703 	Index index(volume);
2704 	if (inode->InNameIndex() && name != NULL) {
2705 		// the name index only contains regular files
2706 		// (but not the root node where name == NULL)
2707 		status = index.InsertName(transaction, name, inode);
2708 		if (status != B_OK && status != B_BAD_INDEX) {
2709 			// We have to remove the node from the parent at this point,
2710 			// because the InodeAllocator destructor can't handle this
2711 			// case (and if it fails, we can't do anything about it...)
2712 			if (tree)
2713 				tree->Remove(transaction, name, inode->ID());
2714 			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2715 				parent->Node().attributes.SetTo(0, 0, 0);
2716 
2717 			RETURN_ERROR(status);
2718 		}
2719 	}
2720 
2721 	if (parent != NULL && parent->IsContainer())
2722 		parent->ContainerContentsChanged(transaction);
2723 
2724 	inode->UpdateOldLastModified();
2725 
2726 	// The "size" & "last_modified" indices don't contain directories.
2727 	// If adding to these indices fails, the inode creation will not be
2728 	// harmed; they are considered less important than the "name" index.
2729 	if (inode->InSizeIndex())
2730 		index.InsertSize(transaction, inode);
2731 	if (inode->InLastModifiedIndex())
2732 		index.InsertLastModified(transaction, inode);
2733 
2734 	if (inode->NeedsFileCache()) {
2735 		inode->SetFileCache(file_cache_create(volume->ID(), inode->ID(),
2736 			inode->Size()));
2737 		inode->SetMap(file_map_create(volume->ID(), inode->ID(),
2738 			inode->Size()));
2739 
2740 		if (inode->FileCache() == NULL || inode->Map() == NULL)
2741 			return B_NO_MEMORY;
2742 	}
2743 
2744 	// Everything worked well until this point, we have a fully
2745 	// initialized inode, and we want to keep it
2746 	allocator.Keep(vnodeOps, publishFlags);
2747 
2748 	if (_created)
2749 		*_created = true;
2750 	if (_id != NULL)
2751 		*_id = inode->ID();
2752 	if (_inode != NULL)
2753 		*_inode = inode;
2754 
2755 	// if either _id or _inode is passed, we will keep the inode locked
2756 	if (_id == NULL && _inode == NULL)
2757 		put_vnode(volume->FSVolume(), inode->ID());
2758 
2759 	return B_OK;
2760 }
2761 
2762 
2763 //	#pragma mark - AttributeIterator
2764 
2765 
2766 AttributeIterator::AttributeIterator(Inode* inode)
2767 	:
2768 	fCurrentSmallData(0),
2769 	fInode(inode),
2770 	fAttributes(NULL),
2771 	fIterator(NULL),
2772 	fBuffer(NULL)
2773 {
2774 	inode->_AddIterator(this);
2775 }
2776 
2777 
2778 AttributeIterator::~AttributeIterator()
2779 {
2780 	if (fAttributes)
2781 		put_vnode(fAttributes->GetVolume()->FSVolume(), fAttributes->ID());
2782 
2783 	delete fIterator;
2784 	fInode->_RemoveIterator(this);
2785 }
2786 
2787 
2788 status_t
2789 AttributeIterator::Rewind()
2790 {
2791 	fCurrentSmallData = 0;
2792 
2793 	if (fIterator != NULL)
2794 		fIterator->Rewind();
2795 
2796 	return B_OK;
2797 }
2798 
2799 
2800 status_t
2801 AttributeIterator::GetNext(char* name, size_t* _length, uint32* _type,
2802 	ino_t* _id)
2803 {
2804 	// read attributes out of the small data section
2805 
2806 	if (fCurrentSmallData >= 0) {
2807 		NodeGetter nodeGetter(fInode->GetVolume(), fInode);
2808 		const bfs_inode* node = nodeGetter.Node();
2809 		const small_data* item = ((bfs_inode*)node)->SmallDataStart();
2810 
2811 		RecursiveLocker _(&fInode->SmallDataLock());
2812 
2813 		int32 index = 0;
2814 		for (; !item->IsLast(node); item = item->Next(), index++) {
2815 			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2816 				&& *item->Name() == FILE_NAME_NAME)
2817 				continue;
2818 
2819 			if (index >= fCurrentSmallData)
2820 				break;
2821 		}
2822 
2823 		if (!item->IsLast(node)) {
2824 			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2825 			*_type = item->Type();
2826 			*_length = item->NameSize();
2827 			*_id = (ino_t)index;
2828 
2829 			fCurrentSmallData = index + 1;
2830 			return B_OK;
2831 		}
2832 
2833 		// stop traversing the small_data section
2834 		fCurrentSmallData = -1;
2835 	}
2836 
2837 	// read attributes out of the attribute directory
2838 
2839 	if (fInode->Attributes().IsZero())
2840 		return B_ENTRY_NOT_FOUND;
2841 
2842 	Volume* volume = fInode->GetVolume();
2843 
2844 	// if you haven't yet access to the attributes directory, get it
2845 	if (fAttributes == NULL) {
2846 		if (get_vnode(volume->FSVolume(), volume->ToVnode(fInode->Attributes()),
2847 				(void**)&fAttributes) != B_OK) {
2848 			FATAL(("get_vnode() failed in AttributeIterator::GetNext(ino_t"
2849 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2850 			return B_ENTRY_NOT_FOUND;
2851 		}
2852 
2853 		BPlusTree* tree = fAttributes->Tree();
2854 		if (tree == NULL
2855 			|| (fIterator = new TreeIterator(tree)) == NULL) {
2856 			FATAL(("could not get tree in AttributeIterator::GetNext(ino_t"
2857 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2858 			return B_ENTRY_NOT_FOUND;
2859 		}
2860 	}
2861 
2862 	uint16 length;
2863 	ino_t id;
2864 	status_t status = fIterator->GetNextEntry(name, &length,
2865 		B_FILE_NAME_LENGTH, &id);
2866 	if (status < B_OK)
2867 		return status;
2868 
2869 	Vnode vnode(volume, id);
2870 	Inode* attribute;
2871 	if ((status = vnode.Get(&attribute)) == B_OK) {
2872 		*_type = attribute->Type();
2873 		*_length = length;
2874 		*_id = id;
2875 	}
2876 
2877 	return status;
2878 }
2879 
2880 
2881 void
2882 AttributeIterator::Update(uint16 index, int8 change)
2883 {
2884 	// fCurrentSmallData points already to the next item
2885 	if (index < fCurrentSmallData)
2886 		fCurrentSmallData += change;
2887 }
2888 
2889