xref: /haiku/src/add-ons/kernel/file_systems/bfs/Inode.cpp (revision a5bf12376daeded4049521eb17a6cc41192250d9)
1 /*
2  * Copyright 2001-2010, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Inode access functions
8 
9 
10 #include "Debug.h"
11 #include "Inode.h"
12 #include "BPlusTree.h"
13 #include "Index.h"
14 
15 
16 #if BFS_TRACING && !defined(BFS_SHELL) && !defined(_BOOT_MODE)
17 namespace BFSInodeTracing {
18 
19 class Create : public AbstractTraceEntry {
20 public:
21 	Create(Inode* inode, Inode* parent, const char* name, int32 mode,
22 			int openMode, uint32 type)
23 		:
24 		fInode(inode),
25 		fID(inode->ID()),
26 		fParent(parent),
27 		fParentID(parent != NULL ? parent->ID() : 0),
28 		fMode(mode),
29 		fOpenMode(openMode),
30 		fType(type)
31 	{
32 		if (name != NULL)
33 			strlcpy(fName, name, sizeof(fName));
34 		else
35 			fName[0] = '\0';
36 
37 		Initialized();
38 	}
39 
40 	virtual void AddDump(TraceOutput& out)
41 	{
42 		out.Print("bfs:Create %Ld (%p), parent %Ld (%p), \"%s\", "
43 			"mode %lx, omode %x, type %lx", fID, fInode, fParentID,
44 			fParent, fName, fMode, fOpenMode, fType);
45 	}
46 
47 private:
48 	Inode*	fInode;
49 	ino_t	fID;
50 	Inode*	fParent;
51 	ino_t	fParentID;
52 	char	fName[32];
53 	int32	fMode;
54 	int		fOpenMode;
55 	uint32	fType;
56 };
57 
58 class Remove : public AbstractTraceEntry {
59 public:
60 	Remove(Inode* inode, const char* name)
61 		:
62 		fInode(inode),
63 		fID(inode->ID())
64 	{
65 		strlcpy(fName, name, sizeof(fName));
66 		Initialized();
67 	}
68 
69 	virtual void AddDump(TraceOutput& out)
70 	{
71 		out.Print("bfs:Remove %Ld (%p), \"%s\"", fID, fInode, fName);
72 	}
73 
74 private:
75 	Inode*	fInode;
76 	ino_t	fID;
77 	char	fName[32];
78 };
79 
80 class Action : public AbstractTraceEntry {
81 public:
82 	Action(const char* action, Inode* inode)
83 		:
84 		fInode(inode),
85 		fID(inode->ID())
86 	{
87 		strlcpy(fAction, action, sizeof(fAction));
88 		Initialized();
89 	}
90 
91 	virtual void AddDump(TraceOutput& out)
92 	{
93 		out.Print("bfs:%s %Ld (%p)\n", fAction, fID, fInode);
94 	}
95 
96 private:
97 	Inode*	fInode;
98 	ino_t	fID;
99 	char	fAction[16];
100 };
101 
102 class Resize : public AbstractTraceEntry {
103 public:
104 	Resize(Inode* inode, off_t oldSize, off_t newSize, bool trim)
105 		:
106 		fInode(inode),
107 		fID(inode->ID()),
108 		fOldSize(oldSize),
109 		fNewSize(newSize),
110 		fTrim(trim)
111 	{
112 		Initialized();
113 	}
114 
115 	virtual void AddDump(TraceOutput& out)
116 	{
117 		out.Print("bfs:%s %Ld (%p), %Ld -> %Ld", fTrim ? "Trim" : "Resize",
118 			fID, fInode, fOldSize, fNewSize);
119 	}
120 
121 private:
122 	Inode*	fInode;
123 	ino_t	fID;
124 	off_t	fOldSize;
125 	off_t	fNewSize;
126 	bool	fTrim;
127 };
128 
129 }	// namespace BFSInodeTracing
130 
131 #	define T(x) new(std::nothrow) BFSInodeTracing::x;
132 #else
133 #	define T(x) ;
134 #endif
135 
136 
137 /*!	A helper class used by Inode::Create() to keep track of the belongings
138 	of an inode creation in progress.
139 	This class will make sure everything is cleaned up properly.
140 */
141 class InodeAllocator {
142 public:
143 							InodeAllocator(Transaction& transaction);
144 							~InodeAllocator();
145 
146 			status_t		New(block_run* parentRun, mode_t mode,
147 								block_run& run, fs_vnode_ops* vnodeOps,
148 								Inode** _inode);
149 			status_t		CreateTree();
150 			status_t		Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags);
151 
152 private:
153 	static	void			_TransactionListener(int32 id, int32 event,
154 								void* _inode);
155 
156 			Transaction*	fTransaction;
157 			block_run		fRun;
158 			Inode*			fInode;
159 };
160 
161 
162 InodeAllocator::InodeAllocator(Transaction& transaction)
163 	:
164 	fTransaction(&transaction),
165 	fInode(NULL)
166 {
167 }
168 
169 
170 InodeAllocator::~InodeAllocator()
171 {
172 	if (fTransaction != NULL) {
173 		Volume* volume = fTransaction->GetVolume();
174 
175 		if (fInode != NULL) {
176 			fInode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
177 				// this unblocks any pending bfs_read_vnode() calls
178 			fInode->Free(*fTransaction);
179 			fTransaction->RemoveListener(fInode);
180 
181 			remove_vnode(volume->FSVolume(), fInode->ID());
182 		} else
183 			volume->Free(*fTransaction, fRun);
184 	}
185 
186 	delete fInode;
187 }
188 
189 
190 status_t
191 InodeAllocator::New(block_run* parentRun, mode_t mode, block_run& run,
192 	fs_vnode_ops* vnodeOps, Inode** _inode)
193 {
194 	Volume* volume = fTransaction->GetVolume();
195 
196 	status_t status = volume->AllocateForInode(*fTransaction, parentRun, mode,
197 		fRun);
198 	if (status < B_OK) {
199 		// don't free the space in the destructor, because
200 		// the allocation failed
201 		fTransaction = NULL;
202 		RETURN_ERROR(status);
203 	}
204 
205 	run = fRun;
206 	fInode = new Inode(volume, *fTransaction, volume->ToVnode(run), mode, run);
207 	if (fInode == NULL)
208 		RETURN_ERROR(B_NO_MEMORY);
209 
210 	if (!volume->IsInitializing()) {
211 		status = new_vnode(volume->FSVolume(), fInode->ID(), fInode,
212 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps);
213 		if (status < B_OK) {
214 			delete fInode;
215 			fInode = NULL;
216 			RETURN_ERROR(status);
217 		}
218 	}
219 
220 	fInode->WriteLockInTransaction(*fTransaction);
221 	*_inode = fInode;
222 	return B_OK;
223 }
224 
225 
226 status_t
227 InodeAllocator::CreateTree()
228 {
229 	Volume* volume = fTransaction->GetVolume();
230 
231 	// force S_STR_INDEX to be set, if no type is set
232 	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
233 		fInode->Node().mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
234 
235 	BPlusTree* tree = fInode->fTree = new BPlusTree(*fTransaction, fInode);
236 	if (tree == NULL || tree->InitCheck() < B_OK)
237 		return B_ERROR;
238 
239 	if (fInode->IsRegularNode()) {
240 		if (tree->Insert(*fTransaction, ".", fInode->ID()) < B_OK
241 			|| tree->Insert(*fTransaction, "..",
242 					volume->ToVnode(fInode->Parent())) < B_OK)
243 			return B_ERROR;
244 	}
245 	return B_OK;
246 }
247 
248 
249 status_t
250 InodeAllocator::Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags)
251 {
252 	ASSERT(fInode != NULL && fTransaction != NULL);
253 	Volume* volume = fTransaction->GetVolume();
254 
255 	status_t status = fInode->WriteBack(*fTransaction);
256 	if (status < B_OK) {
257 		FATAL(("writing new inode %" B_PRIdINO " failed!\n", fInode->ID()));
258 		return status;
259 	}
260 
261 	// Symbolic links are not published -- the caller needs to do this once
262 	// the contents have been written.
263 	if (!fInode->IsSymLink() && volume->ID() >= 0) {
264 		status = publish_vnode(volume->FSVolume(), fInode->ID(), fInode,
265 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps, fInode->Mode(),
266 			publishFlags);
267 	}
268 
269 	if (status == B_OK) {
270 		cache_add_transaction_listener(volume->BlockCache(), fTransaction->ID(),
271 			TRANSACTION_ABORTED, &_TransactionListener, fInode);
272 	}
273 
274 	fTransaction = NULL;
275 	fInode = NULL;
276 
277 	return status;
278 }
279 
280 
281 /*static*/ void
282 InodeAllocator::_TransactionListener(int32 id, int32 event, void* _inode)
283 {
284 	Inode* inode = (Inode*)_inode;
285 
286 	if (event == TRANSACTION_ABORTED)
287 		panic("transaction %d aborted, inode %p still around!\n", (int)id, inode);
288 }
289 
290 
291 //	#pragma mark -
292 
293 
294 status_t
295 bfs_inode::InitCheck(Volume* volume) const
296 {
297 	if (Magic1() != INODE_MAGIC1
298 		|| !(Flags() & INODE_IN_USE)
299 		|| inode_num.Length() != 1
300 		// matches inode size?
301 		|| (uint32)InodeSize() != volume->InodeSize()
302 		// parent resides on disk?
303 		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
304 		|| parent.AllocationGroup() < 0
305 		|| parent.Start() > (1L << volume->AllocationGroupShift())
306 		|| parent.Length() != 1
307 		// attributes, too?
308 		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
309 		|| attributes.AllocationGroup() < 0
310 		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
311 		RETURN_ERROR(B_BAD_DATA);
312 
313 	if (Flags() & INODE_DELETED)
314 		return B_NOT_ALLOWED;
315 
316 	// TODO: Add some tests to check the integrity of the other stuff here,
317 	// especially for the data_stream!
318 
319 	return B_OK;
320 }
321 
322 
323 //	#pragma mark - Inode
324 
325 
326 Inode::Inode(Volume* volume, ino_t id)
327 	:
328 	fVolume(volume),
329 	fID(id),
330 	fTree(NULL),
331 	fAttributes(NULL),
332 	fCache(NULL),
333 	fMap(NULL)
334 {
335 	PRINT(("Inode::Inode(volume = %p, id = %Ld) @ %p\n", volume, id, this));
336 
337 	UpdateNodeFromDisk();
338 
339 	rw_lock_init(&fLock, "bfs inode");
340 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
341 
342 	// these two will help to maintain the indices
343 	fOldSize = Size();
344 	fOldLastModified = LastModified();
345 
346 	if (IsContainer())
347 		fTree = new BPlusTree(this);
348 	if (NeedsFileCache()) {
349 		SetFileCache(file_cache_create(fVolume->ID(), ID(), Size()));
350 		SetMap(file_map_create(volume->ID(), ID(), Size()));
351 	}
352 }
353 
354 
355 Inode::Inode(Volume* volume, Transaction& transaction, ino_t id, mode_t mode,
356 		block_run& run)
357 	:
358 	fVolume(volume),
359 	fID(id),
360 	fTree(NULL),
361 	fAttributes(NULL),
362 	fCache(NULL),
363 	fMap(NULL)
364 {
365 	PRINT(("Inode::Inode(volume = %p, transaction = %p, id = %Ld) @ %p\n",
366 		volume, &transaction, id, this));
367 
368 	rw_lock_init(&fLock, "bfs inode");
369 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
370 
371 	NodeGetter node(volume, transaction, this, true);
372 	memset(&fNode, 0, sizeof(bfs_inode));
373 
374 	// Initialize the bfs_inode structure -- it's not written back to disk
375 	// here, because it will be done so already when the inode could be
376 	// created completely.
377 
378 	Node().magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
379 	Node().inode_num = run;
380 	Node().mode = HOST_ENDIAN_TO_BFS_INT32(mode);
381 	Node().flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
382 
383 	Node().create_time = Node().last_modified_time = Node().status_change_time
384 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
385 
386 	Node().inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
387 
388 	// these two will help to maintain the indices
389 	fOldSize = Size();
390 	fOldLastModified = LastModified();
391 }
392 
393 
394 Inode::~Inode()
395 {
396 	PRINT(("Inode::~Inode() @ %p\n", this));
397 
398 	file_cache_delete(FileCache());
399 	file_map_delete(Map());
400 	delete fTree;
401 
402 	rw_lock_destroy(&fLock);
403 }
404 
405 
406 status_t
407 Inode::InitCheck(bool checkNode) const
408 {
409 	// test inode magic and flags
410 	if (checkNode) {
411 		status_t status = Node().InitCheck(fVolume);
412 		if (status == B_BUSY)
413 			return B_BUSY;
414 
415 		if (status != B_OK) {
416 			FATAL(("inode at block %" B_PRIdOFF " corrupt!\n", BlockNumber()));
417 			RETURN_ERROR(B_BAD_DATA);
418 		}
419 	}
420 
421 	if (IsContainer()) {
422 		// inodes that have a B+tree
423 		if (fTree == NULL)
424 			RETURN_ERROR(B_NO_MEMORY);
425 
426 		status_t status = fTree->InitCheck();
427 		if (status != B_OK) {
428 			FATAL(("inode tree at block %" B_PRIdOFF " corrupt!\n",
429 				BlockNumber()));
430 			RETURN_ERROR(B_BAD_DATA);
431 		}
432 	}
433 
434 	if (NeedsFileCache() && (fCache == NULL || fMap == NULL))
435 		return B_NO_MEMORY;
436 
437 	return B_OK;
438 }
439 
440 
441 /*!	Adds this inode to the specified transaction. This means that the inode will
442 	be write locked until the transaction ended.
443 	To ensure that the inode will stay valid until that point, an extra reference
444 	is acquired to it as long as this transaction stays active.
445 */
446 void
447 Inode::WriteLockInTransaction(Transaction& transaction)
448 {
449 	// These flags can only change while holding the transaction lock
450 	if ((Flags() & INODE_IN_TRANSACTION) != 0)
451 		return;
452 
453 	// We share the same list link with the removed list, so we have to remove
454 	// the inode from that list here (and add it back when we no longer need it)
455 	if ((Flags() & INODE_DELETED) != 0)
456 		fVolume->RemovedInodes().Remove(this);
457 
458 	if (!fVolume->IsInitializing())
459 		acquire_vnode(fVolume->FSVolume(), ID());
460 
461 	rw_lock_write_lock(&Lock());
462 	Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
463 
464 	transaction.AddListener(this);
465 }
466 
467 
468 status_t
469 Inode::WriteBack(Transaction& transaction)
470 {
471 	NodeGetter node(fVolume, transaction, this);
472 	if (node.WritableNode() == NULL)
473 		return B_IO_ERROR;
474 
475 	memcpy(node.WritableNode(), &Node(), sizeof(bfs_inode));
476 	return B_OK;
477 }
478 
479 
480 void
481 Inode::UpdateNodeFromDisk()
482 {
483 	NodeGetter node(fVolume, this);
484 	memcpy(&fNode, node.Node(), sizeof(bfs_inode));
485 	fNode.flags &= HOST_ENDIAN_TO_BFS_INT32(INODE_PERMANENT_FLAGS);
486 }
487 
488 
489 status_t
490 Inode::CheckPermissions(int accessMode) const
491 {
492 	uid_t user = geteuid();
493 	gid_t group = getegid();
494 
495 	// you never have write access to a read-only volume
496 	if ((accessMode & W_OK) != 0 && fVolume->IsReadOnly())
497 		return B_READ_ONLY_DEVICE;
498 
499 	// root users always have full access (but they can't execute files without
500 	// any execute permissions set)
501 	if (user == 0) {
502 		if (!((accessMode & X_OK) != 0 && (Mode() & S_IXUSR) == 0)
503 			|| (Mode() & S_DIRECTORY) != 0) {
504 			return B_OK;
505 		}
506 	}
507 
508 	// shift mode bits, to check directly against accessMode
509 	mode_t mode = Mode();
510 	if (user == (uid_t)fNode.UserID())
511 		mode >>= 6;
512 	else if (group == (gid_t)fNode.GroupID())
513 		mode >>= 3;
514 
515 	if (accessMode & ~(mode & S_IRWXO))
516 		return B_NOT_ALLOWED;
517 
518 	return B_OK;
519 }
520 
521 
522 //	#pragma mark - attributes
523 
524 
525 void
526 Inode::_AddIterator(AttributeIterator* iterator)
527 {
528 	RecursiveLocker _(fSmallDataLock);
529 	fIterators.Add(iterator);
530 }
531 
532 
533 void
534 Inode::_RemoveIterator(AttributeIterator* iterator)
535 {
536 	RecursiveLocker _(fSmallDataLock);
537 	fIterators.Remove(iterator);
538 }
539 
540 
541 /*!	Tries to free up "bytes" space in the small_data section by moving
542 	attributes to real files. Used for system attributes like the name.
543 	You need to hold the fSmallDataLock when you call this method
544 */
545 status_t
546 Inode::_MakeSpaceForSmallData(Transaction& transaction, bfs_inode* node,
547 	const char* name, int32 bytes)
548 {
549 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
550 
551 	while (bytes > 0) {
552 		small_data* item = node->SmallDataStart();
553 		small_data* max = NULL;
554 		int32 index = 0, maxIndex = 0;
555 		for (; !item->IsLast(node); item = item->Next(), index++) {
556 			// should not remove those
557 			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
558 				continue;
559 
560 			if (max == NULL || max->Size() < item->Size()) {
561 				maxIndex = index;
562 				max = item;
563 			}
564 
565 			// Remove the first one large enough to free the needed amount of
566 			// bytes
567 			if (bytes < (int32)item->Size())
568 				break;
569 		}
570 
571 		if (item->IsLast(node) || (int32)item->Size() < bytes)
572 			return B_ERROR;
573 
574 		bytes -= max->Size();
575 
576 		// Move the attribute to a real attribute file
577 		// Luckily, this doesn't cause any index updates
578 
579 		Inode* attribute;
580 		status_t status = CreateAttribute(transaction, item->Name(),
581 			item->Type(), &attribute);
582 		if (status != B_OK)
583 			RETURN_ERROR(status);
584 
585 		size_t length = item->DataSize();
586 		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
587 
588 		ReleaseAttribute(attribute);
589 
590 		if (status != B_OK) {
591 			Vnode vnode(fVolume, Attributes());
592 			Inode* attributes;
593 			if (vnode.Get(&attributes) < B_OK
594 				|| attributes->Remove(transaction, name) < B_OK) {
595 				FATAL(("Could not remove newly created attribute!\n"));
596 			}
597 
598 			RETURN_ERROR(status);
599 		}
600 
601 		_RemoveSmallData(node, max, maxIndex);
602 	}
603 	return B_OK;
604 }
605 
606 
607 /*!	Private function which removes the given attribute from the small_data
608 	section.
609 	You need to hold the fSmallDataLock when you call this method
610 */
611 status_t
612 Inode::_RemoveSmallData(bfs_inode* node, small_data* item, int32 index)
613 {
614 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
615 
616 	small_data* next = item->Next();
617 	if (!next->IsLast(node)) {
618 		// find the last attribute
619 		small_data* last = next;
620 		while (!last->IsLast(node))
621 			last = last->Next();
622 
623 		int32 size = (uint8*)last - (uint8*)next;
624 		if (size < 0
625 			|| size > (uint8*)node + fVolume->BlockSize() - (uint8*)next)
626 			return B_BAD_DATA;
627 
628 		memmove(item, next, size);
629 
630 		// Move the "last" one to its new location and
631 		// correctly terminate the small_data section
632 		last = (small_data*)((uint8*)last - ((uint8*)next - (uint8*)item));
633 		memset(last, 0, (uint8*)node + fVolume->BlockSize() - (uint8*)last);
634 	} else
635 		memset(item, 0, item->Size());
636 
637 	// update all current iterators
638 	SinglyLinkedList<AttributeIterator>::Iterator iterator
639 		= fIterators.GetIterator();
640 	while (iterator.HasNext()) {
641 		iterator.Next()->Update(index, -1);
642 	}
643 
644 	return B_OK;
645 }
646 
647 
648 //!	Removes the given attribute from the small_data section.
649 status_t
650 Inode::_RemoveSmallData(Transaction& transaction, NodeGetter& nodeGetter,
651 	const char* name)
652 {
653 	if (name == NULL)
654 		return B_BAD_VALUE;
655 
656 	bfs_inode* node = nodeGetter.WritableNode();
657 	RecursiveLocker locker(fSmallDataLock);
658 
659 	// search for the small_data item
660 
661 	small_data* item = node->SmallDataStart();
662 	int32 index = 0;
663 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
664 		item = item->Next();
665 		index++;
666 	}
667 
668 	if (item->IsLast(node))
669 		return B_ENTRY_NOT_FOUND;
670 
671 	nodeGetter.MakeWritable(transaction);
672 
673 	status_t status = _RemoveSmallData(node, item, index);
674 	if (status == B_OK) {
675 		Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
676 			bfs_inode::ToInode(real_time_clock_usecs()));
677 
678 		status = WriteBack(transaction);
679 	}
680 
681 	return status;
682 }
683 
684 
685 /*!	Try to place the given attribute in the small_data section - if the
686 	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
687 	In that case, the attribute should be written to a real attribute file;
688 	it's the caller's responsibility to remove any existing attributes in the
689 	small data section if that's the case.
690 
691 	Note that you need to write back the inode yourself after having called that
692 	method - it's a bad API decision that it needs a transaction but enforces
693 	you to write back the inode all by yourself, but it's just more efficient
694 	in most cases...
695 */
696 status_t
697 Inode::_AddSmallData(Transaction& transaction, NodeGetter& nodeGetter,
698 	const char* name, uint32 type, off_t pos, const uint8* data, size_t length,
699 	bool force)
700 {
701 	bfs_inode* node = nodeGetter.WritableNode();
702 
703 	if (node == NULL || name == NULL || data == NULL)
704 		return B_BAD_VALUE;
705 
706 	// reject any requests that can't fit into the small_data section
707 	uint32 nameLength = strlen(name);
708 	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + pos + length + 1;
709 	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
710 		return B_DEVICE_FULL;
711 
712 	nodeGetter.MakeWritable(transaction);
713 	RecursiveLocker locker(fSmallDataLock);
714 
715 	// Find the last item or one with the same name we have to add
716 	small_data* item = node->SmallDataStart();
717 	int32 index = 0;
718 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
719 		item = item->Next();
720 		index++;
721 	}
722 
723 	// is the attribute already in the small_data section?
724 	// then just replace the data part of that one
725 	if (!item->IsLast(node)) {
726 		// find last attribute
727 		small_data* last = item;
728 		while (!last->IsLast(node))
729 			last = last->Next();
730 
731 		// try to change the attributes value
732 		if (item->data_size > pos + length
733 			|| force
734 			|| ((uint8*)last + pos + length - item->DataSize())
735 					<= ((uint8*)node + fVolume->InodeSize())) {
736 			// Make room for the new attribute if needed (and we are forced
737 			// to do so)
738 			if (force && ((uint8*)last + pos + length - item->DataSize())
739 					> ((uint8*)node + fVolume->InodeSize())) {
740 				// We also take the free space at the end of the small_data
741 				// section into account, and request only what's really needed
742 				uint32 needed = pos + length - item->DataSize() -
743 					(uint32)((uint8*)node + fVolume->InodeSize()
744 						- (uint8*)last);
745 
746 				if (_MakeSpaceForSmallData(transaction, node, name, needed)
747 						!= B_OK)
748 					return B_ERROR;
749 
750 				// reset our pointers
751 				item = node->SmallDataStart();
752 				index = 0;
753 				while (!item->IsLast(node) && strcmp(item->Name(), name)) {
754 					item = item->Next();
755 					index++;
756 				}
757 
758 				last = item;
759 				while (!last->IsLast(node))
760 					last = last->Next();
761 			}
762 
763 			size_t oldDataSize = item->DataSize();
764 
765 			// Normally, we can just overwrite the attribute data as the size
766 			// is specified by the type and does not change that often
767 			if (pos + length != item->DataSize()) {
768 				// move the attributes after the current one
769 				small_data* next = item->Next();
770 				if (!next->IsLast(node)) {
771 					memmove((uint8*)item + spaceNeeded, next,
772 						(uint8*)last - (uint8*)next);
773 				}
774 
775 				// Move the "last" one to its new location and
776 				// correctly terminate the small_data section
777 				last = (small_data*)((uint8*)last
778 					- ((uint8*)next - ((uint8*)item + spaceNeeded)));
779 				if ((uint8*)last < (uint8*)node + fVolume->BlockSize()) {
780 					memset(last, 0, (uint8*)node + fVolume->BlockSize()
781 						- (uint8*)last);
782 				}
783 
784 				item->data_size = HOST_ENDIAN_TO_BFS_INT16(pos + length);
785 			}
786 
787 			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
788 
789 			if ((uint64)oldDataSize < (uint64)pos) {
790 				// Fill gap with zeros
791 				memset(item->Data() + oldDataSize, 0, pos - oldDataSize);
792 			}
793 			memcpy(item->Data() + pos, data, length);
794 			item->Data()[pos + length] = '\0';
795 
796 			return B_OK;
797 		}
798 
799 		return B_DEVICE_FULL;
800 	}
801 
802 	// try to add the new attribute!
803 
804 	if ((uint8*)item + spaceNeeded > (uint8*)node + fVolume->InodeSize()) {
805 		// there is not enough space for it!
806 		if (!force)
807 			return B_DEVICE_FULL;
808 
809 		// make room for the new attribute
810 		if (_MakeSpaceForSmallData(transaction, node, name, spaceNeeded) < B_OK)
811 			return B_ERROR;
812 
813 		// get new last item!
814 		item = node->SmallDataStart();
815 		index = 0;
816 		while (!item->IsLast(node)) {
817 			item = item->Next();
818 			index++;
819 		}
820 	}
821 
822 	memset(item, 0, spaceNeeded);
823 	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
824 	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
825 	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
826 	strcpy(item->Name(), name);
827 	memcpy(item->Data() + pos, data, length);
828 
829 	// correctly terminate the small_data section
830 	item = item->Next();
831 	if (!item->IsLast(node))
832 		memset(item, 0, (uint8*)node + fVolume->InodeSize() - (uint8*)item);
833 
834 	// update all current iterators
835 	SinglyLinkedList<AttributeIterator>::Iterator iterator
836 		= fIterators.GetIterator();
837 	while (iterator.HasNext()) {
838 		iterator.Next()->Update(index, 1);
839 	}
840 
841 	return B_OK;
842 }
843 
844 
845 /*!	Iterates through the small_data section of an inode.
846 	To start at the beginning of this section, you let smallData
847 	point to NULL, like:
848 		small_data* data = NULL;
849 		while (inode->GetNextSmallData(&data) { ... }
850 
851 	This function is reentrant and doesn't allocate any memory;
852 	you can safely stop calling it at any point (you don't need
853 	to iterate through the whole list).
854 	You need to hold the fSmallDataLock when you call this method
855 */
856 status_t
857 Inode::_GetNextSmallData(bfs_inode* node, small_data** _smallData) const
858 {
859 	if (node == NULL)
860 		RETURN_ERROR(B_BAD_VALUE);
861 
862 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
863 
864 	small_data* data = *_smallData;
865 
866 	// begin from the start?
867 	if (data == NULL)
868 		data = node->SmallDataStart();
869 	else
870 		data = data->Next();
871 
872 	// is already last item?
873 	if (data->IsLast(node))
874 		return B_ENTRY_NOT_FOUND;
875 
876 	*_smallData = data;
877 
878 	return B_OK;
879 }
880 
881 
882 /*!	Finds the attribute "name" in the small data section, and
883 	returns a pointer to it (or NULL if it doesn't exist).
884 	You need to hold the fSmallDataLock when you call this method
885 */
886 small_data*
887 Inode::FindSmallData(const bfs_inode* node, const char* name) const
888 {
889 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
890 
891 	small_data* smallData = NULL;
892 	while (_GetNextSmallData(const_cast<bfs_inode*>(node), &smallData)
893 			== B_OK) {
894 		if (!strcmp(smallData->Name(), name))
895 			return smallData;
896 	}
897 	return NULL;
898 }
899 
900 
901 /*!	Returns a pointer to the node's name if present in the small data
902 	section, NULL otherwise.
903 	You need to hold the fSmallDataLock when you call this method
904 */
905 const char*
906 Inode::Name(const bfs_inode* node) const
907 {
908 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
909 
910 	small_data* smallData = NULL;
911 	while (_GetNextSmallData((bfs_inode*)node, &smallData) == B_OK) {
912 		if (*smallData->Name() == FILE_NAME_NAME
913 			&& smallData->NameSize() == FILE_NAME_NAME_LENGTH)
914 			return (const char*)smallData->Data();
915 	}
916 	return NULL;
917 }
918 
919 
920 /*!	Copies the node's name into the provided buffer.
921 	The buffer should be B_FILE_NAME_LENGTH bytes large.
922 */
923 status_t
924 Inode::GetName(char* buffer, size_t size) const
925 {
926 	NodeGetter node(fVolume, this);
927 	RecursiveLocker locker(fSmallDataLock);
928 
929 	const char* name = Name(node.Node());
930 	if (name == NULL)
931 		return B_ENTRY_NOT_FOUND;
932 
933 	strlcpy(buffer, name, size);
934 	return B_OK;
935 }
936 
937 
938 /*!	Changes or set the name of a file: in the inode small_data section only, it
939 	doesn't change it in the parent directory's b+tree.
940 	Note that you need to write back the inode yourself after having called
941 	that method. It suffers from the same API decision as AddSmallData() does
942 	(and for the same reason).
943 */
944 status_t
945 Inode::SetName(Transaction& transaction, const char* name)
946 {
947 	if (name == NULL || *name == '\0')
948 		return B_BAD_VALUE;
949 
950 	NodeGetter node(fVolume, transaction, this);
951 	const char nameTag[2] = {FILE_NAME_NAME, 0};
952 
953 	return _AddSmallData(transaction, node, nameTag, FILE_NAME_TYPE, 0,
954 		(uint8*)name, strlen(name), true);
955 }
956 
957 
958 status_t
959 Inode::_RemoveAttribute(Transaction& transaction, const char* name,
960 	bool hasIndex, Index* index)
961 {
962 	// remove the attribute file if it exists
963 	Vnode vnode(fVolume, Attributes());
964 	Inode* attributes;
965 	status_t status = vnode.Get(&attributes);
966 	if (status < B_OK)
967 		return status;
968 
969 	// update index
970 	if (index != NULL) {
971 		Inode* attribute;
972 		if ((hasIndex || fVolume->CheckForLiveQuery(name))
973 			&& GetAttribute(name, &attribute) == B_OK) {
974 			uint8 data[BPLUSTREE_MAX_KEY_LENGTH];
975 			size_t length = BPLUSTREE_MAX_KEY_LENGTH;
976 			if (attribute->ReadAt(0, data, &length) == B_OK) {
977 				index->Update(transaction, name, attribute->Type(), data,
978 					length, NULL, 0, this);
979 			}
980 
981 			ReleaseAttribute(attribute);
982 		}
983 	}
984 
985 	if ((status = attributes->Remove(transaction, name)) < B_OK)
986 		return status;
987 
988 	if (attributes->IsEmpty()) {
989 		attributes->WriteLockInTransaction(transaction);
990 
991 		// remove attribute directory (don't fail if that can't be done)
992 		if (remove_vnode(fVolume->FSVolume(), attributes->ID()) == B_OK) {
993 			// update the inode, so that no one will ever doubt it's deleted :-)
994 			attributes->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
995 			if (attributes->WriteBack(transaction) == B_OK) {
996 				Attributes().SetTo(0, 0, 0);
997 				WriteBack(transaction);
998 			} else {
999 				unremove_vnode(fVolume->FSVolume(), attributes->ID());
1000 				attributes->Node().flags
1001 					&= ~HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1002 			}
1003 		}
1004 	}
1005 
1006 	return status;
1007 }
1008 
1009 
1010 /*!	Reads data from the specified attribute.
1011 	This is a high-level attribute function that understands attributes
1012 	in the small_data section as well as real attribute files.
1013 */
1014 status_t
1015 Inode::ReadAttribute(const char* name, int32 type, off_t pos, uint8* buffer,
1016 	size_t* _length)
1017 {
1018 	if (pos < 0)
1019 		pos = 0;
1020 
1021 	// search in the small_data section (which has to be locked first)
1022 	{
1023 		NodeGetter node(fVolume, this);
1024 		RecursiveLocker locker(fSmallDataLock);
1025 
1026 		small_data* smallData = FindSmallData(node.Node(), name);
1027 		if (smallData != NULL) {
1028 			size_t length = *_length;
1029 			if (pos >= smallData->data_size) {
1030 				*_length = 0;
1031 				return B_OK;
1032 			}
1033 			if (length + pos > smallData->DataSize())
1034 				length = smallData->DataSize() - pos;
1035 
1036 			memcpy(buffer, smallData->Data() + pos, length);
1037 			*_length = length;
1038 			return B_OK;
1039 		}
1040 	}
1041 
1042 	// search in the attribute directory
1043 	Inode* attribute;
1044 	status_t status = GetAttribute(name, &attribute);
1045 	if (status == B_OK) {
1046 		status = attribute->ReadAt(pos, (uint8*)buffer, _length);
1047 
1048 		ReleaseAttribute(attribute);
1049 	}
1050 
1051 	RETURN_ERROR(status);
1052 }
1053 
1054 
1055 /*!	Writes data to the specified attribute.
1056 	This is a high-level attribute function that understands attributes
1057 	in the small_data section as well as real attribute files.
1058 */
1059 status_t
1060 Inode::WriteAttribute(Transaction& transaction, const char* name, int32 type,
1061 	off_t pos, const uint8* buffer, size_t* _length, bool* _created)
1062 {
1063 	if (pos < 0)
1064 		return B_BAD_VALUE;
1065 
1066 	// needed to maintain the index
1067 	uint8 oldBuffer[BPLUSTREE_MAX_KEY_LENGTH];
1068 	uint8* oldData = NULL;
1069 	size_t oldLength = 0;
1070 	bool created = false;
1071 
1072 	// TODO: we actually depend on that the contents of "buffer" are constant.
1073 	// If they get changed during the write (hey, user programs), we may mess
1074 	// up our index trees!
1075 	// TODO: for attribute files, we need to log the first
1076 	// BPLUSTREE_MAX_KEY_LENGTH bytes of the data stream, or the same as above
1077 	// might happen.
1078 
1079 	Index index(fVolume);
1080 	bool hasIndex = index.SetTo(name) == B_OK;
1081 
1082 	Inode* attribute = NULL;
1083 	status_t status = B_OK;
1084 
1085 	if (GetAttribute(name, &attribute) != B_OK) {
1086 		// No attribute inode exists yet
1087 
1088 		// save the old attribute data
1089 		NodeGetter node(fVolume, transaction, this);
1090 		recursive_lock_lock(&fSmallDataLock);
1091 
1092 		small_data* smallData = FindSmallData(node.Node(), name);
1093 		if (smallData != NULL) {
1094 			oldLength = smallData->DataSize();
1095 			if (oldLength > 0) {
1096 				if (oldLength > BPLUSTREE_MAX_KEY_LENGTH)
1097 					oldLength = BPLUSTREE_MAX_KEY_LENGTH;
1098 				memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
1099 			}
1100 		} else
1101 			created = true;
1102 
1103 		recursive_lock_unlock(&fSmallDataLock);
1104 
1105 		// if the attribute doesn't exist yet (as a file), try to put it in the
1106 		// small_data section first - if that fails (due to insufficent space),
1107 		// create a real attribute file
1108 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1109 			*_length);
1110 		if (status == B_DEVICE_FULL) {
1111 			if (smallData != NULL) {
1112 				// remove the old attribute from the small data section - there
1113 				// is no space left for the new data
1114 				status = _RemoveSmallData(transaction, node, name);
1115 			} else
1116 				status = B_OK;
1117 
1118 			if (status == B_OK)
1119 				status = CreateAttribute(transaction, name, type, &attribute);
1120 			if (status != B_OK)
1121 				RETURN_ERROR(status);
1122 
1123 			created = true;
1124 		} else if (status == B_OK) {
1125 			// Update status time on attribute write
1126 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1127 				bfs_inode::ToInode(real_time_clock_usecs()));
1128 
1129 			status = WriteBack(transaction);
1130 		}
1131 	}
1132 
1133 	if (attribute != NULL) {
1134 		WriteLocker writeLocker(attribute->fLock);
1135 
1136 		if (hasIndex || fVolume->CheckForLiveQuery(name)) {
1137 			// Save the old attribute data (if this fails, oldLength will
1138 			// reflect it)
1139 			while (attribute->Size() > 0) {
1140 				bigtime_t oldModified = attribute->LastModified();
1141 				writeLocker.Unlock();
1142 
1143 				oldLength = BPLUSTREE_MAX_KEY_LENGTH;
1144 				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
1145 					oldData = oldBuffer;
1146 
1147 				writeLocker.Lock();
1148 
1149 				// Read until the data hasn't changed in between
1150 				if (oldModified == attribute->LastModified())
1151 					break;
1152 
1153 				oldLength = 0;
1154 			}
1155 		}
1156 
1157 		// check if the data fits into the small_data section again
1158 		NodeGetter node(fVolume, transaction, this);
1159 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1160 			*_length);
1161 
1162 		if (status == B_OK) {
1163 			// it does - remove its file
1164 			writeLocker.Unlock();
1165 			status = _RemoveAttribute(transaction, name, false, NULL);
1166 		} else {
1167 			// The attribute type might have been changed - we need to
1168 			// adopt the new one
1169 			attribute->Node().type = HOST_ENDIAN_TO_BFS_INT32(type);
1170 			status = attribute->WriteBack(transaction);
1171 			writeLocker.Unlock();
1172 
1173 			if (status == B_OK) {
1174 				status = attribute->WriteAt(transaction, pos, buffer,
1175 					_length);
1176 			}
1177 		}
1178 
1179 		if (status == B_OK) {
1180 			// Update status time on attribute write
1181 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1182 				bfs_inode::ToInode(real_time_clock_usecs()));
1183 
1184 			status = WriteBack(transaction);
1185 		}
1186 
1187 		attribute->WriteLockInTransaction(transaction);
1188 		ReleaseAttribute(attribute);
1189 	}
1190 
1191 	// TODO: find a better way than this "pos" thing (the begin of the old key
1192 	//	must be copied to the start of the new one for a comparison)
1193 	if (status == B_OK && pos == 0) {
1194 		// index only the first BPLUSTREE_MAX_KEY_LENGTH bytes
1195 		uint16 length = *_length;
1196 		if (length > BPLUSTREE_MAX_KEY_LENGTH)
1197 			length = BPLUSTREE_MAX_KEY_LENGTH;
1198 
1199 		// Update index. Note, Index::Update() may be called even if
1200 		// initializing the index failed - it will just update the live
1201 		// queries in this case
1202 		if (pos < length || (uint64)pos < (uint64)oldLength) {
1203 			index.Update(transaction, name, type, oldData, oldLength, buffer,
1204 				length, this);
1205 		}
1206 	}
1207 
1208 	if (_created != NULL)
1209 		*_created = created;
1210 
1211 	return status;
1212 }
1213 
1214 
1215 /*!	Removes the specified attribute from the inode.
1216 	This is a high-level attribute function that understands attributes
1217 	in the small_data section as well as real attribute files.
1218 */
1219 status_t
1220 Inode::RemoveAttribute(Transaction& transaction, const char* name)
1221 {
1222 	Index index(fVolume);
1223 	bool hasIndex = index.SetTo(name) == B_OK;
1224 	NodeGetter node(fVolume, this);
1225 
1226 	// update index for attributes in the small_data section
1227 	{
1228 		RecursiveLocker _(fSmallDataLock);
1229 
1230 		small_data* smallData = FindSmallData(node.Node(), name);
1231 		if (smallData != NULL) {
1232 			uint32 length = smallData->DataSize();
1233 			if (length > BPLUSTREE_MAX_KEY_LENGTH)
1234 				length = BPLUSTREE_MAX_KEY_LENGTH;
1235 			index.Update(transaction, name, smallData->Type(),
1236 				smallData->Data(), length, NULL, 0, this);
1237 		}
1238 	}
1239 
1240 	status_t status = _RemoveSmallData(transaction, node, name);
1241 	if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
1242 		// remove the attribute file if it exists
1243 		status = _RemoveAttribute(transaction, name, hasIndex, &index);
1244 		if (status == B_OK) {
1245 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1246 				bfs_inode::ToInode(real_time_clock_usecs()));
1247 			WriteBack(transaction);
1248 		}
1249 	}
1250 
1251 	return status;
1252 }
1253 
1254 
1255 /*!	Returns the attribute inode with the specified \a name, in case it exists.
1256 	This method can only return real attribute files; the attributes in the
1257 	small data section are ignored.
1258 */
1259 status_t
1260 Inode::GetAttribute(const char* name, Inode** _attribute)
1261 {
1262 	// does this inode even have attributes?
1263 	if (Attributes().IsZero())
1264 		return B_ENTRY_NOT_FOUND;
1265 
1266 	Vnode vnode(fVolume, Attributes());
1267 	Inode* attributes;
1268 	if (vnode.Get(&attributes) < B_OK) {
1269 		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n",
1270 			name));
1271 		return B_ERROR;
1272 	}
1273 
1274 	BPlusTree* tree = attributes->Tree();
1275 	if (tree == NULL)
1276 		return B_BAD_VALUE;
1277 
1278 	InodeReadLocker locker(attributes);
1279 
1280 	ino_t id;
1281 	status_t status = tree->Find((uint8*)name, (uint16)strlen(name), &id);
1282 	if (status == B_OK) {
1283 		Vnode vnode(fVolume, id);
1284 		Inode* inode;
1285 		// Check if the attribute is really an attribute
1286 		if (vnode.Get(&inode) != B_OK || !inode->IsAttribute())
1287 			return B_ERROR;
1288 
1289 		*_attribute = inode;
1290 		vnode.Keep();
1291 		return B_OK;
1292 	}
1293 
1294 	return status;
1295 }
1296 
1297 
1298 void
1299 Inode::ReleaseAttribute(Inode* attribute)
1300 {
1301 	if (attribute == NULL)
1302 		return;
1303 
1304 	put_vnode(fVolume->FSVolume(), attribute->ID());
1305 }
1306 
1307 
1308 status_t
1309 Inode::CreateAttribute(Transaction& transaction, const char* name, uint32 type,
1310 	Inode** attribute)
1311 {
1312 	// do we need to create the attribute directory first?
1313 	if (Attributes().IsZero()) {
1314 		status_t status = Inode::Create(transaction, this, NULL,
1315 			S_ATTR_DIR | S_DIRECTORY | 0666, 0, 0, NULL);
1316 		if (status < B_OK)
1317 			RETURN_ERROR(status);
1318 	}
1319 	Vnode vnode(fVolume, Attributes());
1320 	Inode* attributes;
1321 	if (vnode.Get(&attributes) < B_OK)
1322 		return B_ERROR;
1323 
1324 	// Inode::Create() locks the inode for us
1325 	return Inode::Create(transaction, attributes, name,
1326 		S_ATTR | S_FILE | 0666, 0, type, NULL, NULL, attribute);
1327 }
1328 
1329 
1330 //	#pragma mark - directory tree
1331 
1332 
1333 bool
1334 Inode::IsEmpty()
1335 {
1336 	TreeIterator iterator(fTree);
1337 
1338 	// index and attribute directories are really empty when they are
1339 	// empty - directories for standard files always contain ".", and
1340 	// "..", so we need to ignore those two
1341 
1342 	uint32 count = 0;
1343 	char name[BPLUSTREE_MAX_KEY_LENGTH];
1344 	uint16 length;
1345 	ino_t id;
1346 	while (iterator.GetNextEntry(name, &length, B_FILE_NAME_LENGTH,
1347 			&id) == B_OK) {
1348 		if (Mode() & (S_ATTR_DIR | S_INDEX_DIR))
1349 			return false;
1350 
1351 		if (++count > 2 || (strcmp(".", name) && strcmp("..", name)))
1352 			return false;
1353 	}
1354 	return true;
1355 }
1356 
1357 
1358 status_t
1359 Inode::ContainerContentsChanged(Transaction& transaction)
1360 {
1361 	ASSERT(!InLastModifiedIndex());
1362 
1363 	Node().last_modified_time = Node().status_change_time
1364 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1365 
1366 	return WriteBack(transaction);
1367 }
1368 
1369 
1370 //	#pragma mark - data stream
1371 
1372 
1373 /*!	Computes the number of bytes this inode occupies in the file system.
1374 	This includes the file meta blocks used for maintaining its data stream.
1375 
1376 	TODO: However, the attributes in extra files are not really accounted for;
1377 	depending on the speed penalty, this should be changed, though (the value
1378 	could be cached in the inode structure or Inode object, though).
1379 */
1380 off_t
1381 Inode::AllocatedSize() const
1382 {
1383 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0) {
1384 		// This symlink does not have a data stream
1385 		return Node().InodeSize();
1386 	}
1387 
1388 	const data_stream& data = Node().data;
1389 	uint32 blockSize = fVolume->BlockSize();
1390 	off_t size = blockSize;
1391 
1392 	if (data.MaxDoubleIndirectRange() != 0) {
1393 		off_t doubleIndirectSize = data.MaxDoubleIndirectRange()
1394 			- data.MaxIndirectRange();
1395 		int32 indirectSize = double_indirect_max_indirect_size(
1396 			data.double_indirect.Length(), fVolume->BlockSize());
1397 
1398 		size += (2 * data.double_indirect.Length()
1399 				+ doubleIndirectSize / indirectSize)
1400 			* blockSize + data.MaxDoubleIndirectRange();
1401 	} else if (data.MaxIndirectRange() != 0)
1402 		size += data.indirect.Length() + data.MaxIndirectRange();
1403 	else
1404 		size += data.MaxDirectRange();
1405 
1406 	if (!Node().attributes.IsZero()) {
1407 		// TODO: to make this exact, we'd had to count all attributes
1408 		size += 2 * blockSize;
1409 			// 2 blocks, one for the attributes inode, one for its B+tree
1410 	}
1411 
1412 	return size;
1413 }
1414 
1415 
1416 /*!	Finds the block_run where "pos" is located in the data_stream of
1417 	the inode.
1418 	If successful, "offset" will then be set to the file offset
1419 	of the block_run returned; so "pos - offset" is for the block_run
1420 	what "pos" is for the whole stream.
1421 	The caller has to make sure that "pos" is inside the stream.
1422 */
1423 status_t
1424 Inode::FindBlockRun(off_t pos, block_run& run, off_t& offset)
1425 {
1426 	data_stream* data = &Node().data;
1427 
1428 	// find matching block run
1429 
1430 	if (data->MaxIndirectRange() > 0 && pos >= data->MaxDirectRange()) {
1431 		if (data->MaxDoubleIndirectRange() > 0
1432 			&& pos >= data->MaxIndirectRange()) {
1433 			// access to double indirect blocks
1434 
1435 			CachedBlock cached(fVolume);
1436 
1437 			int32 runsPerBlock;
1438 			int32 directSize;
1439 			int32 indirectSize;
1440 			get_double_indirect_sizes(data->double_indirect.Length(),
1441 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1442 
1443 			off_t start = pos - data->MaxIndirectRange();
1444 			int32 index = start / indirectSize;
1445 
1446 			block_run* indirect = (block_run*)cached.SetTo(
1447 				fVolume->ToBlock(data->double_indirect) + index / runsPerBlock);
1448 			if (indirect == NULL)
1449 				RETURN_ERROR(B_ERROR);
1450 
1451 			int32 current = (start % indirectSize) / directSize;
1452 
1453 			indirect = (block_run*)cached.SetTo(
1454 				fVolume->ToBlock(indirect[index % runsPerBlock])
1455 				+ current / runsPerBlock);
1456 			if (indirect == NULL)
1457 				RETURN_ERROR(B_ERROR);
1458 
1459 			run = indirect[current % runsPerBlock];
1460 			if (run.Length() != data->double_indirect.Length())
1461 				RETURN_ERROR(B_BAD_DATA);
1462 
1463 			offset = data->MaxIndirectRange() + (index * indirectSize)
1464 				+ (current * directSize);
1465 		} else {
1466 			// access to indirect blocks
1467 
1468 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1469 			off_t runBlockEnd = data->MaxDirectRange();
1470 
1471 			CachedBlock cached(fVolume);
1472 			off_t block = fVolume->ToBlock(data->indirect);
1473 
1474 			for (int32 i = 0; i < data->indirect.Length(); i++) {
1475 				block_run* indirect = (block_run*)cached.SetTo(block + i);
1476 				if (indirect == NULL)
1477 					RETURN_ERROR(B_IO_ERROR);
1478 
1479 				int32 current = -1;
1480 				while (++current < runsPerBlock) {
1481 					if (indirect[current].IsZero())
1482 						break;
1483 
1484 					runBlockEnd += indirect[current].Length()
1485 						<< cached.BlockShift();
1486 					if (runBlockEnd > pos) {
1487 						run = indirect[current];
1488 						offset = runBlockEnd - (run.Length()
1489 							<< cached.BlockShift());
1490 						return fVolume->ValidateBlockRun(run);
1491 					}
1492 				}
1493 			}
1494 			RETURN_ERROR(B_ERROR);
1495 		}
1496 	} else {
1497 		// access from direct blocks
1498 
1499 		off_t runBlockEnd = 0LL;
1500 		int32 current = -1;
1501 
1502 		while (++current < NUM_DIRECT_BLOCKS) {
1503 			if (data->direct[current].IsZero())
1504 				break;
1505 
1506 			runBlockEnd += data->direct[current].Length()
1507 				<< fVolume->BlockShift();
1508 			if (runBlockEnd > pos) {
1509 				run = data->direct[current];
1510 				offset = runBlockEnd - (run.Length() << fVolume->BlockShift());
1511 				return fVolume->ValidateBlockRun(run);
1512 			}
1513 		}
1514 
1515 		return B_ENTRY_NOT_FOUND;
1516 	}
1517 	return fVolume->ValidateBlockRun(run);
1518 }
1519 
1520 
1521 status_t
1522 Inode::ReadAt(off_t pos, uint8* buffer, size_t* _length)
1523 {
1524 	size_t length = *_length;
1525 
1526 	// set/check boundaries for pos/length
1527 	if (pos < 0)
1528 		return B_BAD_VALUE;
1529 
1530 	InodeReadLocker locker(this);
1531 
1532 	if (pos >= Size() || length == 0) {
1533 		*_length = 0;
1534 		return B_NO_ERROR;
1535 	}
1536 
1537 	locker.Unlock();
1538 
1539 	return file_cache_read(FileCache(), NULL, pos, buffer, _length);
1540 }
1541 
1542 
1543 status_t
1544 Inode::WriteAt(Transaction& transaction, off_t pos, const uint8* buffer,
1545 	size_t* _length)
1546 {
1547 	InodeReadLocker locker(this);
1548 
1549 	// update the last modification time in memory, it will be written
1550 	// back to the inode, and the index when the file is closed
1551 	// TODO: should update the internal last modified time only at this point!
1552 	Node().last_modified_time = Node().status_change_time
1553 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1554 
1555 	// TODO: support INODE_LOGGED!
1556 
1557 	size_t length = *_length;
1558 	bool changeSize = (uint64)pos + (uint64)length > (uint64)Size();
1559 
1560 	// set/check boundaries for pos/length
1561 	if (pos < 0)
1562 		return B_BAD_VALUE;
1563 
1564 	locker.Unlock();
1565 
1566 	// the transaction doesn't have to be started already
1567 	if (changeSize && !transaction.IsStarted())
1568 		transaction.Start(fVolume, BlockNumber());
1569 
1570 	WriteLocker writeLocker(fLock);
1571 
1572 	// Work around possible race condition: Someone might have shrunken the file
1573 	// while we had no lock.
1574 	if (!transaction.IsStarted()
1575 		&& (uint64)pos + (uint64)length > (uint64)Size()) {
1576 		writeLocker.Unlock();
1577 		transaction.Start(fVolume, BlockNumber());
1578 		writeLocker.Lock();
1579 	}
1580 
1581 	off_t oldSize = Size();
1582 
1583 	if ((uint64)pos + (uint64)length > (uint64)oldSize) {
1584 		// let's grow the data stream to the size needed
1585 		status_t status = SetFileSize(transaction, pos + length);
1586 		if (status != B_OK) {
1587 			*_length = 0;
1588 			WriteLockInTransaction(transaction);
1589 			RETURN_ERROR(status);
1590 		}
1591 		// TODO: In theory we would need to update the file size
1592 		// index here as part of the current transaction - this might just
1593 		// be a bit too expensive, but worth a try.
1594 
1595 		// we need to write back the inode here because it has to
1596 		// go into this transaction (we cannot wait until the file
1597 		// is closed)
1598 		status = WriteBack(transaction);
1599 		if (status != B_OK) {
1600 			WriteLockInTransaction(transaction);
1601 			return status;
1602 		}
1603 	}
1604 
1605 	writeLocker.Unlock();
1606 
1607 	if (oldSize < pos)
1608 		FillGapWithZeros(oldSize, pos);
1609 
1610 	// If we don't want to write anything, we can now return (we may
1611 	// just have changed the file size using the position parameter)
1612 	if (length == 0)
1613 		return B_OK;
1614 
1615 	status_t status = file_cache_write(FileCache(), NULL, pos, buffer, _length);
1616 
1617 	if (transaction.IsStarted())
1618 		WriteLockInTransaction(transaction);
1619 
1620 	return status;
1621 }
1622 
1623 
1624 /*!	Fills the gap between the old file size and the new file size
1625 	with zeros.
1626 	It's more or less a copy of Inode::WriteAt() but it can handle
1627 	length differences of more than just 4 GB, and it never uses
1628 	the log, even if the INODE_LOGGED flag is set.
1629 */
1630 status_t
1631 Inode::FillGapWithZeros(off_t pos, off_t newSize)
1632 {
1633 	while (pos < newSize) {
1634 		size_t size;
1635 		if (newSize > pos + 1024 * 1024 * 1024)
1636 			size = 1024 * 1024 * 1024;
1637 		else
1638 			size = newSize - pos;
1639 
1640 		status_t status = file_cache_write(FileCache(), NULL, pos, NULL, &size);
1641 		if (status < B_OK)
1642 			return status;
1643 
1644 		pos += size;
1645 	}
1646 
1647 	return B_OK;
1648 }
1649 
1650 
1651 /*!	Allocates \a length blocks, and clears their contents. Growing
1652 	the indirect and double indirect range uses this method.
1653 	The allocated block_run is saved in "run"
1654 */
1655 status_t
1656 Inode::_AllocateBlockArray(Transaction& transaction, block_run& run,
1657 	size_t length, bool variableSize)
1658 {
1659 	if (!run.IsZero())
1660 		return B_BAD_VALUE;
1661 
1662 	status_t status = fVolume->Allocate(transaction, this, length, run,
1663 		variableSize ? 1 : length);
1664 	if (status != B_OK)
1665 		return status;
1666 
1667 	// make sure those blocks are empty
1668 	CachedBlock cached(fVolume);
1669 	off_t block = fVolume->ToBlock(run);
1670 
1671 	for (int32 i = 0; i < run.Length(); i++) {
1672 		block_run* runs = (block_run*)cached.SetToWritable(transaction,
1673 			block + i, true);
1674 		if (runs == NULL)
1675 			return B_IO_ERROR;
1676 	}
1677 	return B_OK;
1678 }
1679 
1680 
1681 /*!	Grows the stream to \a size, and fills the direct/indirect/double indirect
1682 	ranges with the runs.
1683 	This method will also determine the size of the preallocation, if any.
1684 */
1685 status_t
1686 Inode::_GrowStream(Transaction& transaction, off_t size)
1687 {
1688 	data_stream* data = &Node().data;
1689 
1690 	// is the data stream already large enough to hold the new size?
1691 	// (can be the case with preallocated blocks)
1692 	if (size < data->MaxDirectRange()
1693 		|| size < data->MaxIndirectRange()
1694 		|| size < data->MaxDoubleIndirectRange()) {
1695 		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1696 		return B_OK;
1697 	}
1698 
1699 	// how many bytes are still needed? (unused ranges are always zero)
1700 	uint16 minimum = 1;
1701 	off_t bytes;
1702 	if (data->Size() < data->MaxDoubleIndirectRange()) {
1703 		bytes = size - data->MaxDoubleIndirectRange();
1704 		// The double indirect range can only handle multiples of
1705 		// its base length
1706 		minimum = data->double_indirect.Length();
1707 	} else if (data->Size() < data->MaxIndirectRange())
1708 		bytes = size - data->MaxIndirectRange();
1709 	else if (data->Size() < data->MaxDirectRange())
1710 		bytes = size - data->MaxDirectRange();
1711 	else {
1712 		// no preallocation left to be used
1713 		bytes = size - data->Size();
1714 		if (data->MaxDoubleIndirectRange() > 0)
1715 			minimum = data->double_indirect.Length();
1716 	}
1717 
1718 	// do we have enough free blocks on the disk?
1719 	off_t blocksNeeded = (bytes + fVolume->BlockSize() - 1)
1720 		>> fVolume->BlockShift();
1721 	if (blocksNeeded > fVolume->FreeBlocks())
1722 		return B_DEVICE_FULL;
1723 
1724 	off_t blocksRequested = blocksNeeded;
1725 		// because of preallocations and partial allocations, the number of
1726 		// blocks we need to allocate may be different from the one we request
1727 		// from the block allocator
1728 
1729 	// Should we preallocate some blocks?
1730 	// Attributes, attribute directories, and long symlinks usually won't get
1731 	// that big, and should stay close to the inode - preallocating could be
1732 	// counterproductive.
1733 	// Also, if free disk space is tight, don't preallocate.
1734 	if (!IsAttribute() && !IsAttributeDirectory() && !IsSymLink()
1735 		&& fVolume->FreeBlocks() > 128) {
1736 		off_t roundTo = 0;
1737 		if (IsFile()) {
1738 			// Request preallocated blocks depending on the file size and growth
1739 			if (size < 1 * 1024 * 1024 && bytes < 512 * 1024) {
1740 				// Preallocate 64 KB for file sizes <1 MB and grow rates <512 KB
1741 				roundTo = 65536 >> fVolume->BlockShift();
1742 			} else if (size < 32 * 1024 * 1024 && bytes <= 1 * 1024 * 1024) {
1743 				// Preallocate 512 KB for file sizes between 1 MB and 32 MB, and
1744 				// grow rates smaller than 1 MB
1745 				roundTo = (512 * 1024) >> fVolume->BlockShift();
1746 			} else {
1747 				// Preallocate 1/16 of the file size (ie. 4 MB for 64 MB,
1748 				// 64 MB for 1 GB)
1749 				roundTo = size >> (fVolume->BlockShift() + 4);
1750 			}
1751 		} else if (IsIndex()) {
1752 			// Always preallocate 64 KB for index directories
1753 			roundTo = 65536 >> fVolume->BlockShift();
1754 		} else {
1755 			// Preallocate only 4 KB - directories only get trimmed when their
1756 			// vnode is flushed, which might not happen very often.
1757 			roundTo = 4096 >> fVolume->BlockShift();
1758 		}
1759 		if (roundTo > 1) {
1760 			// Round to next "roundTo" block count
1761 			blocksRequested = ((blocksNeeded + roundTo) / roundTo) * roundTo;
1762 		}
1763 	}
1764 
1765 	while (blocksNeeded > 0) {
1766 		// the requested blocks do not need to be returned with a
1767 		// single allocation, so we need to iterate until we have
1768 		// enough blocks allocated
1769 		if (minimum > 1) {
1770 			// make sure that "blocks" is a multiple of minimum
1771 			blocksRequested = round_up(blocksRequested, minimum);
1772 		}
1773 
1774 		block_run run;
1775 		status_t status = fVolume->Allocate(transaction, this, blocksRequested,
1776 			run, minimum);
1777 		if (status != B_OK)
1778 			return status;
1779 
1780 		// okay, we have the needed blocks, so just distribute them to the
1781 		// different ranges of the stream (direct, indirect & double indirect)
1782 
1783 		blocksNeeded -= run.Length();
1784 		// don't preallocate if the first allocation was already too small
1785 		blocksRequested = blocksNeeded;
1786 
1787 		// Direct block range
1788 
1789 		if (data->Size() <= data->MaxDirectRange()) {
1790 			// let's try to put them into the direct block range
1791 			int32 free = 0;
1792 			for (; free < NUM_DIRECT_BLOCKS; free++) {
1793 				if (data->direct[free].IsZero())
1794 					break;
1795 			}
1796 
1797 			if (free < NUM_DIRECT_BLOCKS) {
1798 				// can we merge the last allocated run with the new one?
1799 				int32 last = free - 1;
1800 				if (free > 0 && data->direct[last].MergeableWith(run)) {
1801 					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(
1802 						data->direct[last].Length() + run.Length());
1803 				} else
1804 					data->direct[free] = run;
1805 
1806 				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(
1807 					data->MaxDirectRange()
1808 					+ run.Length() * fVolume->BlockSize());
1809 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1810 					? data->max_direct_range : size);
1811 				continue;
1812 			}
1813 		}
1814 
1815 		// Indirect block range
1816 
1817 		if (data->Size() <= data->MaxIndirectRange()
1818 			|| !data->MaxIndirectRange()) {
1819 			CachedBlock cached(fVolume);
1820 			block_run* runs = NULL;
1821 			uint32 free = 0;
1822 			off_t block;
1823 
1824 			// if there is no indirect block yet, create one
1825 			if (data->indirect.IsZero()) {
1826 				status = _AllocateBlockArray(transaction, data->indirect,
1827 					NUM_ARRAY_BLOCKS, true);
1828 				if (status != B_OK)
1829 					return status;
1830 
1831 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1832 					data->MaxDirectRange());
1833 				// insert the block_run in the first block
1834 				runs = (block_run*)cached.SetTo(data->indirect);
1835 			} else {
1836 				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1837 				block = fVolume->ToBlock(data->indirect);
1838 
1839 				// search first empty entry
1840 				int32 i = 0;
1841 				for (; i < data->indirect.Length(); i++) {
1842 					if ((runs = (block_run*)cached.SetTo(block + i)) == NULL)
1843 						return B_IO_ERROR;
1844 
1845 					for (free = 0; free < numberOfRuns; free++)
1846 						if (runs[free].IsZero())
1847 							break;
1848 
1849 					if (free < numberOfRuns)
1850 						break;
1851 				}
1852 				if (i == data->indirect.Length())
1853 					runs = NULL;
1854 			}
1855 
1856 			if (runs != NULL) {
1857 				// try to insert the run to the last one - note that this
1858 				// doesn't take block borders into account, so it could be
1859 				// further optimized
1860 				cached.MakeWritable(transaction);
1861 
1862 				int32 last = free - 1;
1863 				if (free > 0 && runs[last].MergeableWith(run)) {
1864 					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(
1865 						runs[last].Length() + run.Length());
1866 				} else
1867 					runs[free] = run;
1868 
1869 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1870 					data->MaxIndirectRange()
1871 					+ (run.Length() << fVolume->BlockShift()));
1872 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1873 					? data->MaxIndirectRange() : size);
1874 				continue;
1875 			}
1876 		}
1877 
1878 		// Double indirect block range
1879 
1880 		if (data->Size() <= data->MaxDoubleIndirectRange()
1881 			|| !data->max_double_indirect_range) {
1882 			// We make sure here that we have this minimum allocated, so if
1883 			// the allocation succeeds, we don't run into an endless loop.
1884 			if (!data->max_double_indirect_range)
1885 				minimum = _DoubleIndirectBlockLength();
1886 			else
1887 				minimum = data->double_indirect.Length();
1888 
1889 			if ((run.Length() % minimum) != 0) {
1890 				// The number of allocated blocks isn't a multiple of 'minimum',
1891 				// so we have to change this. This can happen the first time the
1892 				// stream grows into the double indirect range.
1893 				// First, free the remaining blocks that don't fit into this
1894 				// multiple.
1895 				int32 rest = run.Length() % minimum;
1896 				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1897 
1898 				status = fVolume->Free(transaction,
1899 					block_run::Run(run.AllocationGroup(),
1900 					run.Start() + run.Length(), rest));
1901 				if (status != B_OK)
1902 					return status;
1903 
1904 				blocksNeeded += rest;
1905 				blocksRequested = round_up(blocksNeeded, minimum);
1906 
1907 				// Are there any blocks left in the run? If not, allocate
1908 				// a new one
1909 				if (run.length == 0)
1910 					continue;
1911 			}
1912 
1913 			// if there is no double indirect block yet, create one
1914 			if (data->double_indirect.IsZero()) {
1915 				status = _AllocateBlockArray(transaction,
1916 					data->double_indirect, _DoubleIndirectBlockLength());
1917 				if (status != B_OK)
1918 					return status;
1919 
1920 				data->max_double_indirect_range = data->max_indirect_range;
1921 			}
1922 
1923 			// calculate the index where to insert the new blocks
1924 
1925 			int32 runsPerBlock;
1926 			int32 directSize;
1927 			int32 indirectSize;
1928 			get_double_indirect_sizes(data->double_indirect.Length(),
1929 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1930 
1931 			off_t start = data->MaxDoubleIndirectRange()
1932 				- data->MaxIndirectRange();
1933 			int32 indirectIndex = start / indirectSize;
1934 			int32 index = (start % indirectSize) / directSize;
1935 			int32 runsPerArray = runsPerBlock * minimum;
1936 
1937 			// distribute the blocks to the array and allocate
1938 			// new array blocks when needed
1939 
1940 			CachedBlock cached(fVolume);
1941 			CachedBlock cachedDirect(fVolume);
1942 			block_run* array = NULL;
1943 			uint32 runLength = run.Length();
1944 
1945 			while (run.length != 0) {
1946 				// get the indirect array block
1947 				if (array == NULL) {
1948 					uint32 block = indirectIndex / runsPerBlock;
1949 					if (block >= minimum)
1950 						return EFBIG;
1951 
1952 					array = (block_run*)cached.SetTo(fVolume->ToBlock(
1953 						data->double_indirect) + block);
1954 					if (array == NULL)
1955 						return B_IO_ERROR;
1956 				}
1957 
1958 				do {
1959 					// do we need a new array block?
1960 					if (array[indirectIndex % runsPerBlock].IsZero()) {
1961 						cached.MakeWritable(transaction);
1962 
1963 						status = _AllocateBlockArray(transaction,
1964 							array[indirectIndex % runsPerBlock],
1965 							data->double_indirect.Length());
1966 						if (status != B_OK)
1967 							return status;
1968 					}
1969 
1970 					block_run* runs = (block_run*)cachedDirect.SetToWritable(
1971 						transaction, fVolume->ToBlock(array[indirectIndex
1972 							% runsPerBlock]) + index / runsPerBlock);
1973 					if (runs == NULL)
1974 						return B_IO_ERROR;
1975 
1976 					do {
1977 						// insert the block_run into the array
1978 						runs[index % runsPerBlock] = run;
1979 						runs[index % runsPerBlock].length
1980 							= HOST_ENDIAN_TO_BFS_INT16(minimum);
1981 
1982 						// alter the remaining block_run
1983 						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start()
1984 							+ minimum);
1985 						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
1986 							- minimum);
1987 					} while ((++index % runsPerBlock) != 0 && run.length);
1988 				} while ((index % runsPerArray) != 0 && run.length);
1989 
1990 				if (index == runsPerArray)
1991 					index = 0;
1992 				if (++indirectIndex % runsPerBlock == 0) {
1993 					array = NULL;
1994 					index = 0;
1995 				}
1996 			}
1997 
1998 			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1999 				data->MaxDoubleIndirectRange()
2000 				+ (runLength << fVolume->BlockShift()));
2001 			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(
2002 				data->max_double_indirect_range) : size;
2003 
2004 			continue;
2005 		}
2006 
2007 		RETURN_ERROR(EFBIG);
2008 	}
2009 	// update the size of the data stream
2010 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2011 
2012 	return B_OK;
2013 }
2014 
2015 
2016 size_t
2017 Inode::_DoubleIndirectBlockLength() const
2018 {
2019 	if (fVolume->BlockSize() > DOUBLE_INDIRECT_ARRAY_SIZE)
2020 		return 1;
2021 
2022 	return DOUBLE_INDIRECT_ARRAY_SIZE / fVolume->BlockSize();
2023 }
2024 
2025 
2026 /*!	Frees the statically sized array of the double indirect part of a data
2027 	stream.
2028 */
2029 status_t
2030 Inode::_FreeStaticStreamArray(Transaction& transaction, int32 level,
2031 	block_run run, off_t size, off_t offset, off_t& max)
2032 {
2033 	int32 indirectSize;
2034 	if (level == 0) {
2035 		indirectSize = double_indirect_max_indirect_size(run.Length(),
2036 			fVolume->BlockSize());
2037 	} else {
2038 		indirectSize = double_indirect_max_direct_size(run.Length(),
2039 			fVolume->BlockSize());
2040 	}
2041 
2042 	off_t start;
2043 	if (size > offset)
2044 		start = size - offset;
2045 	else
2046 		start = 0;
2047 
2048 	int32 index = start / indirectSize;
2049 	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
2050 
2051 	CachedBlock cached(fVolume);
2052 	off_t blockNumber = fVolume->ToBlock(run);
2053 
2054 	// set the file offset to the current block run
2055 	offset += (off_t)index * indirectSize;
2056 
2057 	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
2058 		block_run* array = (block_run*)cached.SetToWritable(transaction,
2059 			blockNumber + i);
2060 		if (array == NULL)
2061 			RETURN_ERROR(B_ERROR);
2062 
2063 		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
2064 			if (array[index].IsZero()) {
2065 				// we also want to break out of the outer loop
2066 				i = run.Length();
2067 				break;
2068 			}
2069 
2070 			status_t status = B_OK;
2071 			if (level == 0) {
2072 				status = _FreeStaticStreamArray(transaction, 1, array[index],
2073 					size, offset, max);
2074 			} else if (offset >= size)
2075 				status = fVolume->Free(transaction, array[index]);
2076 			else
2077 				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
2078 
2079 			if (status < B_OK)
2080 				RETURN_ERROR(status);
2081 
2082 			if (offset >= size)
2083 				array[index].SetTo(0, 0, 0);
2084 
2085 			offset += indirectSize;
2086 		}
2087 		index = 0;
2088 	}
2089 	return B_OK;
2090 }
2091 
2092 
2093 /*!	Frees all block_runs in the array which come after the specified size.
2094 	It also trims the last block_run that contain the size.
2095 	"offset" and "max" are maintained until the last block_run that doesn't
2096 	have to be freed - after this, the values won't be correct anymore, but
2097 	will still assure correct function for all subsequent calls.
2098 	"max" is considered to be in file system byte order.
2099 */
2100 status_t
2101 Inode::_FreeStreamArray(Transaction& transaction, block_run* array,
2102 	uint32 arrayLength, off_t size, off_t& offset, off_t& max)
2103 {
2104 	PRINT(("FreeStreamArray: arrayLength %lu, size %Ld, offset %Ld, max %Ld\n",
2105 		arrayLength, size, offset, max));
2106 
2107 	off_t newOffset = offset;
2108 	uint32 i = 0;
2109 	for (; i < arrayLength; i++, offset = newOffset) {
2110 		if (array[i].IsZero())
2111 			break;
2112 
2113 		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
2114 		if (newOffset <= size)
2115 			continue;
2116 
2117 		block_run run = array[i];
2118 
2119 		// determine the block_run to be freed
2120 		if (newOffset > size && offset < size) {
2121 			// free partial block_run (and update the original block_run)
2122 			run.start = HOST_ENDIAN_TO_BFS_INT16(array[i].Start()
2123 				+ ((size + fVolume->BlockSize() - 1 - offset)
2124 					>> fVolume->BlockShift()));
2125 				// round to next block
2126 			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2127 				- array[i].Start());
2128 			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2129 				- array[i].Length());
2130 
2131 			if (run.length == 0)
2132 				continue;
2133 
2134 			// update maximum range
2135 			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length()
2136 				<< fVolume->BlockShift()));
2137 		} else {
2138 			// free the whole block_run
2139 			array[i].SetTo(0, 0, 0);
2140 
2141 			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
2142 				max = HOST_ENDIAN_TO_BFS_INT64(offset);
2143 		}
2144 
2145 		if (fVolume->Free(transaction, run) < B_OK)
2146 			return B_IO_ERROR;
2147 	}
2148 	return B_OK;
2149 }
2150 
2151 
2152 status_t
2153 Inode::_ShrinkStream(Transaction& transaction, off_t size)
2154 {
2155 	data_stream* data = &Node().data;
2156 	status_t status;
2157 
2158 	if (data->MaxDoubleIndirectRange() > size) {
2159 		off_t* maxDoubleIndirect = &data->max_double_indirect_range;
2160 			// gcc 4 work-around: "error: cannot bind packed field
2161 			// 'data->data_stream::max_double_indirect_range' to 'off_t&'"
2162 		status = _FreeStaticStreamArray(transaction, 0, data->double_indirect,
2163 			size, data->MaxIndirectRange(), *maxDoubleIndirect);
2164 		if (status != B_OK)
2165 			return status;
2166 
2167 		if (size <= data->MaxIndirectRange()) {
2168 			fVolume->Free(transaction, data->double_indirect);
2169 			data->double_indirect.SetTo(0, 0, 0);
2170 			data->max_double_indirect_range = 0;
2171 		}
2172 	}
2173 
2174 	if (data->MaxIndirectRange() > size) {
2175 		CachedBlock cached(fVolume);
2176 		off_t block = fVolume->ToBlock(data->indirect);
2177 		off_t offset = data->MaxDirectRange();
2178 
2179 		for (int32 i = 0; i < data->indirect.Length(); i++) {
2180 			block_run* array = (block_run*)cached.SetToWritable(transaction,
2181 				block + i);
2182 			if (array == NULL)
2183 				break;
2184 
2185 			off_t* maxIndirect = &data->max_indirect_range;
2186 				// gcc 4 work-around: "error: cannot bind packed field
2187 				// 'data->data_stream::max_indirect_range' to 'off_t&'"
2188 			if (_FreeStreamArray(transaction, array, fVolume->BlockSize()
2189 					/ sizeof(block_run), size, offset, *maxIndirect) != B_OK)
2190 				return B_IO_ERROR;
2191 		}
2192 		if (data->max_direct_range == data->max_indirect_range) {
2193 			fVolume->Free(transaction, data->indirect);
2194 			data->indirect.SetTo(0, 0, 0);
2195 			data->max_indirect_range = 0;
2196 		}
2197 	}
2198 
2199 	if (data->MaxDirectRange() > size) {
2200 		off_t offset = 0;
2201 		off_t *maxDirect = &data->max_direct_range;
2202 			// gcc 4 work-around: "error: cannot bind packed field
2203 			// 'data->data_stream::max_direct_range' to 'off_t&'"
2204 		status = _FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS,
2205 			size, offset, *maxDirect);
2206 		if (status < B_OK)
2207 			return status;
2208 	}
2209 
2210 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2211 	return B_OK;
2212 }
2213 
2214 
2215 status_t
2216 Inode::SetFileSize(Transaction& transaction, off_t size)
2217 {
2218 	if (size < 0)
2219 		return B_BAD_VALUE;
2220 
2221 	off_t oldSize = Size();
2222 
2223 	if (size == oldSize)
2224 		return B_OK;
2225 
2226 	T(Resize(this, oldSize, size, false));
2227 
2228 	// should the data stream grow or shrink?
2229 	status_t status;
2230 	if (size > oldSize) {
2231 		status = _GrowStream(transaction, size);
2232 		if (status < B_OK) {
2233 			// if the growing of the stream fails, the whole operation
2234 			// fails, so we should shrink the stream to its former size
2235 			_ShrinkStream(transaction, oldSize);
2236 		}
2237 	} else
2238 		status = _ShrinkStream(transaction, size);
2239 
2240 	if (status < B_OK)
2241 		return status;
2242 
2243 	file_cache_set_size(FileCache(), size);
2244 	file_map_set_size(Map(), size);
2245 
2246 	return WriteBack(transaction);
2247 }
2248 
2249 
2250 status_t
2251 Inode::Append(Transaction& transaction, off_t bytes)
2252 {
2253 	return SetFileSize(transaction, Size() + bytes);
2254 }
2255 
2256 
2257 /*!	Checks whether or not this inode's data stream needs to be trimmed
2258 	because of an earlier preallocation.
2259 	Returns true if there are any blocks to be trimmed.
2260 */
2261 bool
2262 Inode::NeedsTrimming() const
2263 {
2264 	// We never trim preallocated index blocks to make them grow as smooth as
2265 	// possible. There are only few indices anyway, so this doesn't hurt.
2266 	// Also, if an inode is already in deleted state, we don't bother trimming
2267 	// it.
2268 	if (IsIndex() || IsDeleted()
2269 		|| (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0))
2270 		return false;
2271 
2272 	off_t roundedSize = round_up(Size(), fVolume->BlockSize());
2273 
2274 	return Node().data.MaxDirectRange() > roundedSize
2275 		|| Node().data.MaxIndirectRange() > roundedSize
2276 		|| Node().data.MaxDoubleIndirectRange() > roundedSize;
2277 }
2278 
2279 
2280 status_t
2281 Inode::TrimPreallocation(Transaction& transaction)
2282 {
2283 	T(Resize(this, max_c(Node().data.MaxDirectRange(),
2284 		Node().data.MaxIndirectRange()), Size(), true));
2285 
2286 	status_t status = _ShrinkStream(transaction, Size());
2287 	if (status < B_OK)
2288 		return status;
2289 
2290 	return WriteBack(transaction);
2291 }
2292 
2293 
2294 //!	Frees the file's data stream and removes all attributes
2295 status_t
2296 Inode::Free(Transaction& transaction)
2297 {
2298 	FUNCTION();
2299 
2300 	// Perhaps there should be an implementation of Inode::ShrinkStream() that
2301 	// just frees the data_stream, but doesn't change the inode (since it is
2302 	// freed anyway) - that would make an undelete command possible
2303 	if (!IsSymLink() || (Flags() & INODE_LONG_SYMLINK) != 0) {
2304 		status_t status = SetFileSize(transaction, 0);
2305 		if (status < B_OK)
2306 			return status;
2307 	}
2308 
2309 	// Free all attributes, and remove their indices
2310 	{
2311 		// We have to limit the scope of AttributeIterator, so that its
2312 		// destructor is not called after the inode is deleted
2313 		AttributeIterator iterator(this);
2314 
2315 		char name[B_FILE_NAME_LENGTH];
2316 		uint32 type;
2317 		size_t length;
2318 		ino_t id;
2319 		while (iterator.GetNext(name, &length, &type, &id) == B_OK) {
2320 			RemoveAttribute(transaction, name);
2321 		}
2322 	}
2323 
2324 	if (WriteBack(transaction) < B_OK)
2325 		return B_IO_ERROR;
2326 
2327 	return fVolume->Free(transaction, BlockRun());
2328 }
2329 
2330 
2331 //!	Synchronizes (writes back to disk) the file stream of the inode.
2332 status_t
2333 Inode::Sync()
2334 {
2335 	if (FileCache())
2336 		return file_cache_sync(FileCache());
2337 
2338 	// We may also want to flush the attribute's data stream to
2339 	// disk here... (do we?)
2340 
2341 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0)
2342 		return B_OK;
2343 
2344 	InodeReadLocker locker(this);
2345 
2346 	data_stream* data = &Node().data;
2347 	status_t status = B_OK;
2348 
2349 	// flush direct range
2350 
2351 	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
2352 		if (data->direct[i].IsZero())
2353 			return B_OK;
2354 
2355 		status = block_cache_sync_etc(fVolume->BlockCache(),
2356 			fVolume->ToBlock(data->direct[i]), data->direct[i].Length());
2357 		if (status != B_OK)
2358 			return status;
2359 	}
2360 
2361 	// flush indirect range
2362 
2363 	if (data->max_indirect_range == 0)
2364 		return B_OK;
2365 
2366 	CachedBlock cached(fVolume);
2367 	off_t block = fVolume->ToBlock(data->indirect);
2368 	int32 count = fVolume->BlockSize() / sizeof(block_run);
2369 
2370 	for (int32 j = 0; j < data->indirect.Length(); j++) {
2371 		block_run* runs = (block_run*)cached.SetTo(block + j);
2372 		if (runs == NULL)
2373 			break;
2374 
2375 		for (int32 i = 0; i < count; i++) {
2376 			if (runs[i].IsZero())
2377 				return B_OK;
2378 
2379 			status = block_cache_sync_etc(fVolume->BlockCache(),
2380 				fVolume->ToBlock(runs[i]), runs[i].Length());
2381 			if (status != B_OK)
2382 				return status;
2383 		}
2384 	}
2385 
2386 	// flush double indirect range
2387 
2388 	if (data->max_double_indirect_range == 0)
2389 		return B_OK;
2390 
2391 	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
2392 
2393 	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
2394 		block_run* indirectRuns = (block_run*)cached.SetTo(indirectBlock + l);
2395 		if (indirectRuns == NULL)
2396 			return B_FILE_ERROR;
2397 
2398 		CachedBlock directCached(fVolume);
2399 
2400 		for (int32 k = 0; k < count; k++) {
2401 			if (indirectRuns[k].IsZero())
2402 				return B_OK;
2403 
2404 			block = fVolume->ToBlock(indirectRuns[k]);
2405 			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
2406 				block_run* runs = (block_run*)directCached.SetTo(block + j);
2407 				if (runs == NULL)
2408 					return B_FILE_ERROR;
2409 
2410 				for (int32 i = 0; i < count; i++) {
2411 					if (runs[i].IsZero())
2412 						return B_OK;
2413 
2414 					// TODO: combine single block_runs to bigger ones when
2415 					// they are adjacent
2416 					status = block_cache_sync_etc(fVolume->BlockCache(),
2417 						fVolume->ToBlock(runs[i]), runs[i].Length());
2418 					if (status != B_OK)
2419 						return status;
2420 				}
2421 			}
2422 		}
2423 	}
2424 	return B_OK;
2425 }
2426 
2427 
2428 //	#pragma mark - TransactionListener implementation
2429 
2430 
2431 void
2432 Inode::TransactionDone(bool success)
2433 {
2434 	if (!success) {
2435 		// revert any changes made to the cached bfs_inode
2436 		UpdateNodeFromDisk();
2437 	}
2438 }
2439 
2440 
2441 void
2442 Inode::RemovedFromTransaction()
2443 {
2444 	Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
2445 
2446 	// See AddInode() why we do this here
2447 	if ((Flags() & INODE_DELETED) != 0)
2448 		fVolume->RemovedInodes().Add(this);
2449 
2450 	rw_lock_write_unlock(&Lock());
2451 
2452 	if (!fVolume->IsInitializing())
2453 		put_vnode(fVolume->FSVolume(), ID());
2454 }
2455 
2456 
2457 //	#pragma mark - creation/deletion
2458 
2459 
2460 status_t
2461 Inode::Remove(Transaction& transaction, const char* name, ino_t* _id,
2462 	bool isDirectory, bool force)
2463 {
2464 	if (fTree == NULL)
2465 		RETURN_ERROR(B_BAD_VALUE);
2466 
2467 	WriteLockInTransaction(transaction);
2468 
2469 	// does the file even exist?
2470 	off_t id;
2471 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) < B_OK)
2472 		return B_ENTRY_NOT_FOUND;
2473 
2474 	if (_id)
2475 		*_id = id;
2476 
2477 	Vnode vnode(fVolume, id);
2478 	Inode* inode;
2479 	status_t status = vnode.Get(&inode);
2480 	if (status < B_OK) {
2481 		REPORT_ERROR(status);
2482 		return B_ENTRY_NOT_FOUND;
2483 	}
2484 
2485 	T(Remove(inode, name));
2486 	inode->WriteLockInTransaction(transaction);
2487 
2488 	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
2489 	// bit is set for indices in BFS, not for attribute directories) - but you
2490 	// should really be able to do whatever you want with your indices
2491 	// without having to remove all files first :)
2492 	if (!inode->IsIndex() && !force) {
2493 		// if it's not of the correct type, don't delete it!
2494 		if (inode->IsContainer() != isDirectory)
2495 			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
2496 
2497 		// only delete empty directories
2498 		if (isDirectory && !inode->IsEmpty())
2499 			return B_DIRECTORY_NOT_EMPTY;
2500 	}
2501 
2502 	// remove_vnode() allows the inode to be accessed until the last put_vnode()
2503 	status = remove_vnode(fVolume->FSVolume(), id);
2504 	if (status != B_OK)
2505 		return status;
2506 
2507 	if (fTree->Remove(transaction, name, id) != B_OK && !force) {
2508 		unremove_vnode(fVolume->FSVolume(), id);
2509 		RETURN_ERROR(B_ERROR);
2510 	}
2511 
2512 #ifdef DEBUG
2513 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) == B_OK) {
2514 		DIE(("deleted entry still there"));
2515 	}
2516 #endif
2517 
2518 	ContainerContentsChanged(transaction);
2519 
2520 	// update the inode, so that no one will ever doubt it's deleted :-)
2521 	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
2522 	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
2523 
2524 	// In balance to the Inode::Create() method, the main indices
2525 	// are updated here (name, size, & last_modified)
2526 
2527 	Index index(fVolume);
2528 	if (inode->InNameIndex()) {
2529 		index.RemoveName(transaction, name, inode);
2530 			// If removing from the index fails, it is not regarded as a
2531 			// fatal error and will not be reported back!
2532 			// Deleted inodes won't be visible in queries anyway.
2533 	}
2534 
2535 	if (inode->InSizeIndex())
2536 		index.RemoveSize(transaction, inode);
2537 	if (inode->InLastModifiedIndex())
2538 		index.RemoveLastModified(transaction, inode);
2539 
2540 	return inode->WriteBack(transaction);
2541 }
2542 
2543 
2544 /*!	Creates the inode with the specified \a parent directory, and automatically
2545 	adds the created inode to that parent directory. If an attribute directory
2546 	is created, it will also automatically  be added to the \a parent inode as
2547 	such. However, the indices root node, and the regular root node won't be
2548 	added to the super block.
2549 	It will also create the initial B+tree for the inode if it's a directory
2550 	of any kind.
2551 	\a name may be \c NULL, but only if no \a parent is given.
2552 	If the "_id" or "_inode" variable is given and non-NULL to store the
2553 	inode's ID, the inode stays locked - you have to call put_vnode() if you
2554 	don't use it anymore.
2555 	If the node already exists, this method will fail if \c O_EXCL is set, or
2556 	it's a directory or a symlink. Otherwise, it will just be returned.
2557 	If \c O_TRUNC has been specified, the file will also be truncated.
2558 */
2559 status_t
2560 Inode::Create(Transaction& transaction, Inode* parent, const char* name,
2561 	int32 mode, int openMode, uint32 type, bool* _created, ino_t* _id,
2562 	Inode** _inode, fs_vnode_ops* vnodeOps, uint32 publishFlags)
2563 {
2564 	FUNCTION_START(("name = %s, mode = %ld\n", name, mode));
2565 
2566 	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
2567 	Volume* volume = transaction.GetVolume();
2568 	BPlusTree* tree = NULL;
2569 
2570 	if (parent != NULL && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
2571 		// check if the file already exists in the directory
2572 		tree = parent->Tree();
2573 	}
2574 
2575 	if (parent != NULL) {
2576 		// the parent directory is locked during the whole inode creation
2577 		parent->WriteLockInTransaction(transaction);
2578 	}
2579 
2580 	if (parent != NULL && !volume->IsInitializing() && parent->IsContainer()) {
2581 		// don't create anything in removed directories
2582 		bool removed;
2583 		if (get_vnode_removed(volume->FSVolume(), parent->ID(), &removed)
2584 				== B_OK && removed) {
2585 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
2586 		}
2587 	}
2588 
2589 	if (tree != NULL) {
2590 		// Does the file already exist?
2591 		off_t offset;
2592 		if (tree->Find((uint8*)name, (uint16)strlen(name), &offset) == B_OK) {
2593 			// Return if the file should be a directory/link or opened in
2594 			// exclusive mode
2595 			if (S_ISDIR(mode) || S_ISLNK(mode) || (openMode & O_EXCL) != 0)
2596 				return B_FILE_EXISTS;
2597 
2598 			Vnode vnode(volume, offset);
2599 			Inode* inode;
2600 			status_t status = vnode.Get(&inode);
2601 			if (status != B_OK) {
2602 				REPORT_ERROR(status);
2603 				return B_ENTRY_NOT_FOUND;
2604 			}
2605 
2606 			if (inode->IsDirectory() && (openMode & O_RWMASK) != O_RDONLY)
2607 				return B_IS_A_DIRECTORY;
2608 			if ((openMode & O_DIRECTORY) != 0 && !inode->IsDirectory())
2609 				return B_NOT_A_DIRECTORY;
2610 
2611 			// we want to open the file, so we should have the rights to do so
2612 			if (inode->CheckPermissions(open_mode_to_access(openMode)
2613 					| ((openMode & O_TRUNC) != 0 ? W_OK : 0)) != B_OK)
2614 				return B_NOT_ALLOWED;
2615 
2616 			if ((openMode & O_TRUNC) != 0) {
2617 				// truncate the existing file
2618 				inode->WriteLockInTransaction(transaction);
2619 
2620 				status_t status = inode->SetFileSize(transaction, 0);
2621 				if (status == B_OK)
2622 					status = inode->WriteBack(transaction);
2623 
2624 				if (status != B_OK)
2625 					return status;
2626 			}
2627 
2628 			if (_created)
2629 				*_created = false;
2630 			if (_id)
2631 				*_id = inode->ID();
2632 			if (_inode)
2633 				*_inode = inode;
2634 
2635 			// Only keep the vnode in memory if the _id or _inode pointer is
2636 			// provided
2637 			if (_id != NULL || _inode != NULL)
2638 				vnode.Keep();
2639 
2640 			return B_OK;
2641 		}
2642 	} else if (parent != NULL && (mode & S_ATTR_DIR) == 0) {
2643 		return B_BAD_VALUE;
2644 	} else if ((openMode & O_DIRECTORY) != 0) {
2645 		// TODO: we might need to return B_NOT_A_DIRECTORY here
2646 		return B_ENTRY_NOT_FOUND;
2647 	}
2648 
2649 	status_t status;
2650 
2651 	// do we have the power to create new files at all?
2652 	if (parent != NULL && (status = parent->CheckPermissions(W_OK)) != B_OK)
2653 		RETURN_ERROR(status);
2654 
2655 	// allocate space for the new inode
2656 	InodeAllocator allocator(transaction);
2657 	block_run run;
2658 	Inode* inode;
2659 	status = allocator.New(&parentRun, mode, run, vnodeOps, &inode);
2660 	if (status < B_OK)
2661 		return status;
2662 
2663 	T(Create(inode, parent, name, mode, openMode, type));
2664 
2665 	// Initialize the parts of the bfs_inode structure that
2666 	// InodeAllocator::New() hasn't touched yet
2667 
2668 	bfs_inode* node = &inode->Node();
2669 
2670 	if (parent == NULL) {
2671 		// we set the parent to itself in this case
2672 		// (only happens for the root and indices node)
2673 		node->parent = run;
2674 	} else
2675 		node->parent = parentRun;
2676 
2677 	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
2678 	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent
2679 		? parent->Node().GroupID() : getegid());
2680 		// the group ID is inherited from the parent, if available
2681 
2682 	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
2683 
2684 	inode->WriteBack(transaction);
2685 		// make sure the initialized node is available to others
2686 
2687 	// only add the name to regular files, directories, or symlinks
2688 	// don't add it to attributes, or indices
2689 	if (tree && inode->IsRegularNode()
2690 		&& inode->SetName(transaction, name) != B_OK)
2691 		return B_ERROR;
2692 
2693 	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2694 	// a standard directory for files - not for attributes or indices)
2695 	if (inode->IsContainer()) {
2696 		status = allocator.CreateTree();
2697 		if (status != B_OK)
2698 			return status;
2699 	}
2700 
2701 	// Add a link to the inode from the parent, depending on its type
2702 	// (the vnode is not published yet, so it is safe to make the inode
2703 	// accessable to the file system here)
2704 	if (tree != NULL) {
2705 		status = tree->Insert(transaction, name, inode->ID());
2706 	} else if (parent != NULL && (mode & S_ATTR_DIR) != 0) {
2707 		parent->Attributes() = run;
2708 		status = parent->WriteBack(transaction);
2709 	}
2710 
2711 	// Note, we only care if the inode could be made accessable for the
2712 	// two cases above; the root node or the indices root node must
2713 	// handle this case on their own (or other cases where "parent" is
2714 	// NULL)
2715 	if (status != B_OK)
2716 		RETURN_ERROR(status);
2717 
2718 	// Update the main indices (name, size & last_modified)
2719 	// (live queries might want to access us after this)
2720 
2721 	Index index(volume);
2722 	if (inode->InNameIndex() && name != NULL) {
2723 		// the name index only contains regular files
2724 		// (but not the root node where name == NULL)
2725 		status = index.InsertName(transaction, name, inode);
2726 		if (status != B_OK && status != B_BAD_INDEX) {
2727 			// We have to remove the node from the parent at this point,
2728 			// because the InodeAllocator destructor can't handle this
2729 			// case (and if it fails, we can't do anything about it...)
2730 			if (tree)
2731 				tree->Remove(transaction, name, inode->ID());
2732 			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2733 				parent->Node().attributes.SetTo(0, 0, 0);
2734 
2735 			RETURN_ERROR(status);
2736 		}
2737 	}
2738 
2739 	if (parent != NULL && parent->IsContainer())
2740 		parent->ContainerContentsChanged(transaction);
2741 
2742 	inode->UpdateOldLastModified();
2743 
2744 	// The "size" & "last_modified" indices don't contain directories.
2745 	// If adding to these indices fails, the inode creation will not be
2746 	// harmed; they are considered less important than the "name" index.
2747 	if (inode->InSizeIndex())
2748 		index.InsertSize(transaction, inode);
2749 	if (inode->InLastModifiedIndex())
2750 		index.InsertLastModified(transaction, inode);
2751 
2752 	if (inode->NeedsFileCache()) {
2753 		inode->SetFileCache(file_cache_create(volume->ID(), inode->ID(),
2754 			inode->Size()));
2755 		inode->SetMap(file_map_create(volume->ID(), inode->ID(),
2756 			inode->Size()));
2757 
2758 		if (inode->FileCache() == NULL || inode->Map() == NULL)
2759 			return B_NO_MEMORY;
2760 	}
2761 
2762 	// Everything worked well until this point, we have a fully
2763 	// initialized inode, and we want to keep it
2764 	allocator.Keep(vnodeOps, publishFlags);
2765 
2766 	if (_created)
2767 		*_created = true;
2768 	if (_id != NULL)
2769 		*_id = inode->ID();
2770 	if (_inode != NULL)
2771 		*_inode = inode;
2772 
2773 	// if either _id or _inode is passed, we will keep the inode locked
2774 	if (_id == NULL && _inode == NULL)
2775 		put_vnode(volume->FSVolume(), inode->ID());
2776 
2777 	return B_OK;
2778 }
2779 
2780 
2781 //	#pragma mark - AttributeIterator
2782 
2783 
2784 AttributeIterator::AttributeIterator(Inode* inode)
2785 	:
2786 	fCurrentSmallData(0),
2787 	fInode(inode),
2788 	fAttributes(NULL),
2789 	fIterator(NULL),
2790 	fBuffer(NULL)
2791 {
2792 	inode->_AddIterator(this);
2793 }
2794 
2795 
2796 AttributeIterator::~AttributeIterator()
2797 {
2798 	if (fAttributes)
2799 		put_vnode(fAttributes->GetVolume()->FSVolume(), fAttributes->ID());
2800 
2801 	delete fIterator;
2802 	fInode->_RemoveIterator(this);
2803 }
2804 
2805 
2806 status_t
2807 AttributeIterator::Rewind()
2808 {
2809 	fCurrentSmallData = 0;
2810 
2811 	if (fIterator != NULL)
2812 		fIterator->Rewind();
2813 
2814 	return B_OK;
2815 }
2816 
2817 
2818 status_t
2819 AttributeIterator::GetNext(char* name, size_t* _length, uint32* _type,
2820 	ino_t* _id)
2821 {
2822 	// read attributes out of the small data section
2823 
2824 	if (fCurrentSmallData >= 0) {
2825 		NodeGetter nodeGetter(fInode->GetVolume(), fInode);
2826 		const bfs_inode* node = nodeGetter.Node();
2827 		const small_data* item = ((bfs_inode*)node)->SmallDataStart();
2828 
2829 		RecursiveLocker _(&fInode->SmallDataLock());
2830 
2831 		int32 index = 0;
2832 		for (; !item->IsLast(node); item = item->Next(), index++) {
2833 			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2834 				&& *item->Name() == FILE_NAME_NAME)
2835 				continue;
2836 
2837 			if (index >= fCurrentSmallData)
2838 				break;
2839 		}
2840 
2841 		if (!item->IsLast(node)) {
2842 			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2843 			*_type = item->Type();
2844 			*_length = item->NameSize();
2845 			*_id = (ino_t)index;
2846 
2847 			fCurrentSmallData = index + 1;
2848 			return B_OK;
2849 		}
2850 
2851 		// stop traversing the small_data section
2852 		fCurrentSmallData = -1;
2853 	}
2854 
2855 	// read attributes out of the attribute directory
2856 
2857 	if (fInode->Attributes().IsZero())
2858 		return B_ENTRY_NOT_FOUND;
2859 
2860 	Volume* volume = fInode->GetVolume();
2861 
2862 	// if you haven't yet access to the attributes directory, get it
2863 	if (fAttributes == NULL) {
2864 		if (get_vnode(volume->FSVolume(), volume->ToVnode(fInode->Attributes()),
2865 				(void**)&fAttributes) != B_OK) {
2866 			FATAL(("get_vnode() failed in AttributeIterator::GetNext(ino_t"
2867 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2868 			return B_ENTRY_NOT_FOUND;
2869 		}
2870 
2871 		BPlusTree* tree = fAttributes->Tree();
2872 		if (tree == NULL
2873 			|| (fIterator = new TreeIterator(tree)) == NULL) {
2874 			FATAL(("could not get tree in AttributeIterator::GetNext(ino_t"
2875 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2876 			return B_ENTRY_NOT_FOUND;
2877 		}
2878 	}
2879 
2880 	uint16 length;
2881 	ino_t id;
2882 	status_t status = fIterator->GetNextEntry(name, &length,
2883 		B_FILE_NAME_LENGTH, &id);
2884 	if (status < B_OK)
2885 		return status;
2886 
2887 	Vnode vnode(volume, id);
2888 	Inode* attribute;
2889 	if ((status = vnode.Get(&attribute)) == B_OK) {
2890 		*_type = attribute->Type();
2891 		*_length = length;
2892 		*_id = id;
2893 	}
2894 
2895 	return status;
2896 }
2897 
2898 
2899 void
2900 AttributeIterator::Update(uint16 index, int8 change)
2901 {
2902 	// fCurrentSmallData points already to the next item
2903 	if (index < fCurrentSmallData)
2904 		fCurrentSmallData += change;
2905 }
2906 
2907