xref: /haiku/src/add-ons/kernel/file_systems/bfs/Inode.cpp (revision 1e60bdeab63fa7a57bc9a55b032052e95a18bd2c)
1 /*
2  * Copyright 2001-2017, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Inode access functions
8 
9 
10 #include "Debug.h"
11 #include "Inode.h"
12 #include "BPlusTree.h"
13 #include "Index.h"
14 
15 
16 #if BFS_TRACING && !defined(FS_SHELL) && !defined(_BOOT_MODE)
17 namespace BFSInodeTracing {
18 
19 class Create : public AbstractTraceEntry {
20 public:
21 	Create(Inode* inode, Inode* parent, const char* name, int32 mode,
22 			int openMode, uint32 type)
23 		:
24 		fInode(inode),
25 		fID(inode->ID()),
26 		fParent(parent),
27 		fParentID(parent != NULL ? parent->ID() : 0),
28 		fMode(mode),
29 		fOpenMode(openMode),
30 		fType(type)
31 	{
32 		if (name != NULL)
33 			strlcpy(fName, name, sizeof(fName));
34 		else
35 			fName[0] = '\0';
36 
37 		Initialized();
38 	}
39 
40 	virtual void AddDump(TraceOutput& out)
41 	{
42 		out.Print("bfs:Create %Ld (%p), parent %Ld (%p), \"%s\", "
43 			"mode %lx, omode %x, type %lx", fID, fInode, fParentID,
44 			fParent, fName, fMode, fOpenMode, fType);
45 	}
46 
47 private:
48 	Inode*	fInode;
49 	ino_t	fID;
50 	Inode*	fParent;
51 	ino_t	fParentID;
52 	char	fName[32];
53 	int32	fMode;
54 	int		fOpenMode;
55 	uint32	fType;
56 };
57 
58 class Remove : public AbstractTraceEntry {
59 public:
60 	Remove(Inode* inode, const char* name)
61 		:
62 		fInode(inode),
63 		fID(inode->ID())
64 	{
65 		strlcpy(fName, name, sizeof(fName));
66 		Initialized();
67 	}
68 
69 	virtual void AddDump(TraceOutput& out)
70 	{
71 		out.Print("bfs:Remove %Ld (%p), \"%s\"", fID, fInode, fName);
72 	}
73 
74 private:
75 	Inode*	fInode;
76 	ino_t	fID;
77 	char	fName[32];
78 };
79 
80 class Action : public AbstractTraceEntry {
81 public:
82 	Action(const char* action, Inode* inode)
83 		:
84 		fInode(inode),
85 		fID(inode->ID())
86 	{
87 		strlcpy(fAction, action, sizeof(fAction));
88 		Initialized();
89 	}
90 
91 	virtual void AddDump(TraceOutput& out)
92 	{
93 		out.Print("bfs:%s %Ld (%p)\n", fAction, fID, fInode);
94 	}
95 
96 private:
97 	Inode*	fInode;
98 	ino_t	fID;
99 	char	fAction[16];
100 };
101 
102 class Resize : public AbstractTraceEntry {
103 public:
104 	Resize(Inode* inode, off_t oldSize, off_t newSize, bool trim)
105 		:
106 		fInode(inode),
107 		fID(inode->ID()),
108 		fOldSize(oldSize),
109 		fNewSize(newSize),
110 		fTrim(trim)
111 	{
112 		Initialized();
113 	}
114 
115 	virtual void AddDump(TraceOutput& out)
116 	{
117 		out.Print("bfs:%s %Ld (%p), %Ld -> %Ld", fTrim ? "Trim" : "Resize",
118 			fID, fInode, fOldSize, fNewSize);
119 	}
120 
121 private:
122 	Inode*	fInode;
123 	ino_t	fID;
124 	off_t	fOldSize;
125 	off_t	fNewSize;
126 	bool	fTrim;
127 };
128 
129 }	// namespace BFSInodeTracing
130 
131 #	define T(x) new(std::nothrow) BFSInodeTracing::x;
132 #else
133 #	define T(x) ;
134 #endif
135 
136 
137 /*!	A helper class used by Inode::Create() to keep track of the belongings
138 	of an inode creation in progress.
139 	This class will make sure everything is cleaned up properly.
140 */
141 class InodeAllocator {
142 public:
143 							InodeAllocator(Transaction& transaction);
144 							~InodeAllocator();
145 
146 			status_t		New(block_run* parentRun, mode_t mode, uint32 flags,
147 								block_run& run, fs_vnode_ops* vnodeOps,
148 								Inode** _inode);
149 			status_t		CreateTree();
150 			status_t		Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags);
151 
152 private:
153 	static	void			_TransactionListener(int32 id, int32 event,
154 								void* _inode);
155 
156 			Transaction*	fTransaction;
157 			block_run		fRun;
158 			Inode*			fInode;
159 };
160 
161 
162 InodeAllocator::InodeAllocator(Transaction& transaction)
163 	:
164 	fTransaction(&transaction),
165 	fInode(NULL)
166 {
167 }
168 
169 
170 InodeAllocator::~InodeAllocator()
171 {
172 	if (fTransaction != NULL) {
173 		Volume* volume = fTransaction->GetVolume();
174 
175 		if (fInode != NULL) {
176 			fInode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
177 				// this unblocks any pending bfs_read_vnode() calls
178 			fInode->Free(*fTransaction);
179 
180 			if (fInode->fTree != NULL)
181 				fTransaction->RemoveListener(fInode->fTree);
182 			fTransaction->RemoveListener(fInode);
183 
184 			remove_vnode(volume->FSVolume(), fInode->ID());
185 		} else
186 			volume->Free(*fTransaction, fRun);
187 	}
188 
189 	delete fInode;
190 }
191 
192 
193 status_t
194 InodeAllocator::New(block_run* parentRun, mode_t mode, uint32 publishFlags,
195 	block_run& run, fs_vnode_ops* vnodeOps, Inode** _inode)
196 {
197 	Volume* volume = fTransaction->GetVolume();
198 
199 	status_t status = volume->AllocateForInode(*fTransaction, parentRun, mode,
200 		fRun);
201 	if (status < B_OK) {
202 		// don't free the space in the destructor, because
203 		// the allocation failed
204 		fTransaction = NULL;
205 		RETURN_ERROR(status);
206 	}
207 
208 	run = fRun;
209 	fInode = new(std::nothrow) Inode(volume, *fTransaction,
210 		volume->ToVnode(run), mode, run);
211 	if (fInode == NULL)
212 		RETURN_ERROR(B_NO_MEMORY);
213 
214 	if (!volume->IsInitializing()
215 		&& (publishFlags & BFS_DO_NOT_PUBLISH_VNODE) == 0) {
216 		status = new_vnode(volume->FSVolume(), fInode->ID(), fInode,
217 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps);
218 		if (status < B_OK) {
219 			delete fInode;
220 			fInode = NULL;
221 			RETURN_ERROR(status);
222 		}
223 	}
224 
225 	fInode->WriteLockInTransaction(*fTransaction);
226 	*_inode = fInode;
227 	return B_OK;
228 }
229 
230 
231 status_t
232 InodeAllocator::CreateTree()
233 {
234 	Volume* volume = fTransaction->GetVolume();
235 
236 	// force S_STR_INDEX to be set, if no type is set
237 	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
238 		fInode->Node().mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
239 
240 	BPlusTree* tree = new(std::nothrow) BPlusTree(*fTransaction, fInode);
241 	if (tree == NULL)
242 		return B_ERROR;
243 
244 	status_t status = tree->InitCheck();
245 	if (status != B_OK) {
246 		delete tree;
247 		return status;
248 	}
249 
250 	fInode->fTree = tree;
251 
252 	if (fInode->IsRegularNode()) {
253 		if (tree->Insert(*fTransaction, ".", fInode->ID()) < B_OK
254 			|| tree->Insert(*fTransaction, "..",
255 					volume->ToVnode(fInode->Parent())) < B_OK)
256 			return B_ERROR;
257 	}
258 	return B_OK;
259 }
260 
261 
262 status_t
263 InodeAllocator::Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags)
264 {
265 	ASSERT(fInode != NULL && fTransaction != NULL);
266 	Volume* volume = fTransaction->GetVolume();
267 
268 	status_t status = fInode->WriteBack(*fTransaction);
269 	if (status < B_OK) {
270 		FATAL(("writing new inode %" B_PRIdINO " failed!\n", fInode->ID()));
271 		return status;
272 	}
273 
274 	// Symbolic links are not published -- the caller needs to do this once
275 	// the contents have been written.
276 	if (!fInode->IsSymLink() && !volume->IsInitializing()
277 		&& (publishFlags & BFS_DO_NOT_PUBLISH_VNODE) == 0) {
278 		status = publish_vnode(volume->FSVolume(), fInode->ID(), fInode,
279 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps, fInode->Mode(),
280 			publishFlags);
281 	}
282 
283 	if (status == B_OK) {
284 		cache_add_transaction_listener(volume->BlockCache(), fTransaction->ID(),
285 			TRANSACTION_ABORTED, &_TransactionListener, fInode);
286 	}
287 
288 	fTransaction = NULL;
289 	fInode = NULL;
290 
291 	return status;
292 }
293 
294 
295 /*static*/ void
296 InodeAllocator::_TransactionListener(int32 id, int32 event, void* _inode)
297 {
298 	Inode* inode = (Inode*)_inode;
299 
300 	if (event == TRANSACTION_ABORTED)
301 		panic("transaction %d aborted, inode %p still around!\n", (int)id, inode);
302 }
303 
304 
305 //	#pragma mark -
306 
307 
308 status_t
309 bfs_inode::InitCheck(Volume* volume) const
310 {
311 	if (Magic1() != INODE_MAGIC1
312 		|| !(Flags() & INODE_IN_USE)
313 		|| inode_num.Length() != 1
314 		// matches inode size?
315 		|| (uint32)InodeSize() != volume->InodeSize()
316 		// parent resides on disk?
317 		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
318 		|| parent.AllocationGroup() < 0
319 		|| parent.Start() > (1L << volume->AllocationGroupShift())
320 		|| parent.Length() != 1
321 		// attributes, too?
322 		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
323 		|| attributes.AllocationGroup() < 0
324 		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
325 		RETURN_ERROR(B_BAD_DATA);
326 
327 	if (Flags() & INODE_DELETED)
328 		return B_NOT_ALLOWED;
329 
330 	// TODO: Add some tests to check the integrity of the other stuff here,
331 	// especially for the data_stream!
332 
333 	return B_OK;
334 }
335 
336 
337 //	#pragma mark - Inode
338 
339 
340 Inode::Inode(Volume* volume, ino_t id)
341 	:
342 	fVolume(volume),
343 	fID(id),
344 	fTree(NULL),
345 	fAttributes(NULL),
346 	fCache(NULL),
347 	fMap(NULL)
348 {
349 	PRINT(("Inode::Inode(volume = %p, id = %Ld) @ %p\n", volume, id, this));
350 
351 	rw_lock_init(&fLock, "bfs inode");
352 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
353 
354 	if (UpdateNodeFromDisk() != B_OK) {
355 		// TODO: the error code gets eaten
356 		return;
357 	}
358 
359 	// these two will help to maintain the indices
360 	fOldSize = Size();
361 	fOldLastModified = LastModified();
362 
363 	if (IsContainer())
364 		fTree = new(std::nothrow) BPlusTree(this);
365 	if (NeedsFileCache()) {
366 		SetFileCache(file_cache_create(fVolume->ID(), ID(), Size()));
367 		SetMap(file_map_create(volume->ID(), ID(), Size()));
368 	}
369 }
370 
371 
372 Inode::Inode(Volume* volume, Transaction& transaction, ino_t id, mode_t mode,
373 		block_run& run)
374 	:
375 	fVolume(volume),
376 	fID(id),
377 	fTree(NULL),
378 	fAttributes(NULL),
379 	fCache(NULL),
380 	fMap(NULL)
381 {
382 	PRINT(("Inode::Inode(volume = %p, transaction = %p, id = %Ld) @ %p\n",
383 		volume, &transaction, id, this));
384 
385 	rw_lock_init(&fLock, "bfs inode");
386 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
387 
388 	NodeGetter node(volume, transaction, this, true);
389 	if (node.Node() == NULL) {
390 		FATAL(("Could not read inode block %" B_PRId64 "!\n", BlockNumber()));
391 		return;
392 	}
393 
394 	memset(&fNode, 0, sizeof(bfs_inode));
395 
396 	// Initialize the bfs_inode structure -- it's not written back to disk
397 	// here, because it will be done so already when the inode could be
398 	// created completely.
399 
400 	Node().magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
401 	Node().inode_num = run;
402 	Node().mode = HOST_ENDIAN_TO_BFS_INT32(mode);
403 	Node().flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
404 
405 	Node().create_time = Node().last_modified_time = Node().status_change_time
406 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
407 
408 	Node().inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
409 
410 	// these two will help to maintain the indices
411 	fOldSize = Size();
412 	fOldLastModified = LastModified();
413 }
414 
415 
416 Inode::~Inode()
417 {
418 	PRINT(("Inode::~Inode() @ %p\n", this));
419 
420 	file_cache_delete(FileCache());
421 	file_map_delete(Map());
422 	delete fTree;
423 
424 	rw_lock_destroy(&fLock);
425 	recursive_lock_destroy(&fSmallDataLock);
426 }
427 
428 
429 status_t
430 Inode::InitCheck(bool checkNode) const
431 {
432 	// test inode magic and flags
433 	if (checkNode) {
434 		status_t status = Node().InitCheck(fVolume);
435 		if (status == B_BUSY)
436 			return B_BUSY;
437 
438 		if (status != B_OK) {
439 			FATAL(("inode at block %" B_PRIdOFF " corrupt!\n", BlockNumber()));
440 			RETURN_ERROR(B_BAD_DATA);
441 		}
442 	}
443 
444 	if (IsContainer()) {
445 		// inodes that have a B+tree
446 		if (fTree == NULL)
447 			RETURN_ERROR(B_NO_MEMORY);
448 
449 		status_t status = fTree->InitCheck();
450 		if (status != B_OK) {
451 			FATAL(("inode tree at block %" B_PRIdOFF " corrupt!\n",
452 				BlockNumber()));
453 			RETURN_ERROR(B_BAD_DATA);
454 		}
455 	}
456 
457 	if (NeedsFileCache() && (fCache == NULL || fMap == NULL))
458 		return B_NO_MEMORY;
459 
460 	return B_OK;
461 }
462 
463 
464 /*!	Adds this inode to the specified transaction. This means that the inode will
465 	be write locked until the transaction ended.
466 	To ensure that the inode will stay valid until that point, an extra reference
467 	is acquired to it as long as this transaction stays active.
468 */
469 void
470 Inode::WriteLockInTransaction(Transaction& transaction)
471 {
472 	// These flags can only change while holding the transaction lock
473 	if ((Flags() & INODE_IN_TRANSACTION) != 0)
474 		return;
475 
476 	// We share the same list link with the removed list, so we have to remove
477 	// the inode from that list here (and add it back when we no longer need it)
478 	if ((Flags() & INODE_DELETED) != 0)
479 		fVolume->RemovedInodes().Remove(this);
480 
481 	if (!fVolume->IsInitializing())
482 		acquire_vnode(fVolume->FSVolume(), ID());
483 
484 	rw_lock_write_lock(&Lock());
485 	Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
486 
487 	transaction.AddListener(this);
488 }
489 
490 
491 status_t
492 Inode::WriteBack(Transaction& transaction)
493 {
494 	NodeGetter node(fVolume, transaction, this);
495 	if (node.WritableNode() == NULL)
496 		return B_IO_ERROR;
497 
498 	memcpy(node.WritableNode(), &Node(), sizeof(bfs_inode));
499 	return B_OK;
500 }
501 
502 
503 status_t
504 Inode::UpdateNodeFromDisk()
505 {
506 	NodeGetter node(fVolume, this);
507 	if (node.Node() == NULL) {
508 		FATAL(("Failed to read block %" B_PRId64 " from disk!\n",
509 			BlockNumber()));
510 		return B_IO_ERROR;
511 	}
512 
513 	memcpy(&fNode, node.Node(), sizeof(bfs_inode));
514 	fNode.flags &= HOST_ENDIAN_TO_BFS_INT32(INODE_PERMANENT_FLAGS);
515 	return B_OK;
516 }
517 
518 
519 status_t
520 Inode::CheckPermissions(int accessMode) const
521 {
522 	// you never have write access to a read-only volume
523 	if ((accessMode & W_OK) != 0 && fVolume->IsReadOnly())
524 		return B_READ_ONLY_DEVICE;
525 
526 	return check_access_permissions(accessMode, Mode(), (gid_t)fNode.GroupID(),
527 		(uid_t)fNode.UserID());
528 }
529 
530 
531 //	#pragma mark - attributes
532 
533 
534 void
535 Inode::_AddIterator(AttributeIterator* iterator)
536 {
537 	RecursiveLocker _(fSmallDataLock);
538 	fIterators.Add(iterator);
539 }
540 
541 
542 void
543 Inode::_RemoveIterator(AttributeIterator* iterator)
544 {
545 	RecursiveLocker _(fSmallDataLock);
546 	fIterators.Remove(iterator);
547 }
548 
549 
550 /*!	Tries to free up "bytes" space in the small_data section by moving
551 	attributes to real files. Used for system attributes like the name.
552 	You need to hold the fSmallDataLock when you call this method
553 */
554 status_t
555 Inode::_MakeSpaceForSmallData(Transaction& transaction, bfs_inode* node,
556 	const char* name, int32 bytes)
557 {
558 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
559 
560 	while (bytes > 0) {
561 		small_data* item = node->SmallDataStart();
562 		small_data* max = NULL;
563 		int32 index = 0, maxIndex = 0;
564 		for (; !item->IsLast(node); item = item->Next(), index++) {
565 			// should not remove those
566 			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
567 				continue;
568 
569 			if (max == NULL || max->Size() < item->Size()) {
570 				maxIndex = index;
571 				max = item;
572 			}
573 
574 			// Remove the first one large enough to free the needed amount of
575 			// bytes
576 			if (bytes < (int32)item->Size())
577 				break;
578 		}
579 
580 		if (item->IsLast(node) || (int32)item->Size() < bytes)
581 			return B_ERROR;
582 
583 		bytes -= max->Size();
584 
585 		// Move the attribute to a real attribute file
586 		// Luckily, this doesn't cause any index updates
587 
588 		Inode* attribute;
589 		status_t status = CreateAttribute(transaction, item->Name(),
590 			item->Type(), &attribute);
591 		if (status != B_OK)
592 			RETURN_ERROR(status);
593 
594 		size_t length = item->DataSize();
595 		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
596 
597 		ReleaseAttribute(attribute);
598 
599 		if (status != B_OK) {
600 			Vnode vnode(fVolume, Attributes());
601 			Inode* attributes;
602 			if (vnode.Get(&attributes) < B_OK
603 				|| attributes->Remove(transaction, name) < B_OK) {
604 				FATAL(("Could not remove newly created attribute!\n"));
605 			}
606 
607 			RETURN_ERROR(status);
608 		}
609 
610 		_RemoveSmallData(node, max, maxIndex);
611 	}
612 	return B_OK;
613 }
614 
615 
616 /*!	Private function which removes the given attribute from the small_data
617 	section.
618 	You need to hold the fSmallDataLock when you call this method
619 */
620 status_t
621 Inode::_RemoveSmallData(bfs_inode* node, small_data* item, int32 index)
622 {
623 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
624 
625 	small_data* next = item->Next();
626 	if (!next->IsLast(node)) {
627 		// find the last attribute
628 		small_data* last = next;
629 		while (!last->IsLast(node))
630 			last = last->Next();
631 
632 		int32 size = (uint8*)last - (uint8*)next;
633 		if (size < 0
634 			|| size > (uint8*)node + fVolume->BlockSize() - (uint8*)next)
635 			return B_BAD_DATA;
636 
637 		memmove(item, next, size);
638 
639 		// Move the "last" one to its new location and
640 		// correctly terminate the small_data section
641 		last = (small_data*)((uint8*)last - ((uint8*)next - (uint8*)item));
642 		memset(last, 0, (uint8*)node + fVolume->BlockSize() - (uint8*)last);
643 	} else
644 		memset(item, 0, item->Size());
645 
646 	// update all current iterators
647 	SinglyLinkedList<AttributeIterator>::Iterator iterator
648 		= fIterators.GetIterator();
649 	while (iterator.HasNext()) {
650 		iterator.Next()->Update(index, -1);
651 	}
652 
653 	return B_OK;
654 }
655 
656 
657 //!	Removes the given attribute from the small_data section.
658 status_t
659 Inode::_RemoveSmallData(Transaction& transaction, NodeGetter& nodeGetter,
660 	const char* name)
661 {
662 	if (name == NULL)
663 		return B_BAD_VALUE;
664 
665 	bfs_inode* node = nodeGetter.WritableNode();
666 	RecursiveLocker locker(fSmallDataLock);
667 
668 	// search for the small_data item
669 
670 	small_data* item = node->SmallDataStart();
671 	int32 index = 0;
672 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
673 		item = item->Next();
674 		index++;
675 	}
676 
677 	if (item->IsLast(node))
678 		return B_ENTRY_NOT_FOUND;
679 
680 	nodeGetter.MakeWritable(transaction);
681 
682 	status_t status = _RemoveSmallData(node, item, index);
683 	if (status == B_OK) {
684 		Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
685 			bfs_inode::ToInode(real_time_clock_usecs()));
686 
687 		status = WriteBack(transaction);
688 	}
689 
690 	return status;
691 }
692 
693 
694 /*!	Try to place the given attribute in the small_data section - if the
695 	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
696 	In that case, the attribute should be written to a real attribute file;
697 	it's the caller's responsibility to remove any existing attributes in the
698 	small data section if that's the case.
699 
700 	Note that you need to write back the inode yourself after having called that
701 	method - it's a bad API decision that it needs a transaction but enforces
702 	you to write back the inode all by yourself, but it's just more efficient
703 	in most cases...
704 */
705 status_t
706 Inode::_AddSmallData(Transaction& transaction, NodeGetter& nodeGetter,
707 	const char* name, uint32 type, off_t pos, const uint8* data, size_t length,
708 	bool force)
709 {
710 	bfs_inode* node = nodeGetter.WritableNode();
711 
712 	if (node == NULL || name == NULL || data == NULL)
713 		return B_BAD_VALUE;
714 
715 	// reject any requests that can't fit into the small_data section
716 	uint32 nameLength = strlen(name);
717 	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + pos + length + 1;
718 	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
719 		return B_DEVICE_FULL;
720 
721 	nodeGetter.MakeWritable(transaction);
722 	RecursiveLocker locker(fSmallDataLock);
723 
724 	// Find the last item or one with the same name we have to add
725 	small_data* item = node->SmallDataStart();
726 	int32 index = 0;
727 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
728 		item = item->Next();
729 		index++;
730 	}
731 
732 	// is the attribute already in the small_data section?
733 	// then just replace the data part of that one
734 	if (!item->IsLast(node)) {
735 		// find last attribute
736 		small_data* last = item;
737 		while (!last->IsLast(node))
738 			last = last->Next();
739 
740 		// try to change the attributes value
741 		if (item->data_size > pos + length
742 			|| force
743 			|| ((uint8*)last + pos + length - item->DataSize())
744 					<= ((uint8*)node + fVolume->InodeSize())) {
745 			// Make room for the new attribute if needed (and we are forced
746 			// to do so)
747 			if (force && ((uint8*)last + pos + length - item->DataSize())
748 					> ((uint8*)node + fVolume->InodeSize())) {
749 				// We also take the free space at the end of the small_data
750 				// section into account, and request only what's really needed
751 				uint32 needed = pos + length - item->DataSize() -
752 					(uint32)((uint8*)node + fVolume->InodeSize()
753 						- (uint8*)last);
754 
755 				if (_MakeSpaceForSmallData(transaction, node, name, needed)
756 						!= B_OK)
757 					return B_ERROR;
758 
759 				// reset our pointers
760 				item = node->SmallDataStart();
761 				index = 0;
762 				while (!item->IsLast(node) && strcmp(item->Name(), name)) {
763 					item = item->Next();
764 					index++;
765 				}
766 
767 				last = item;
768 				while (!last->IsLast(node))
769 					last = last->Next();
770 			}
771 
772 			size_t oldDataSize = item->DataSize();
773 
774 			// Normally, we can just overwrite the attribute data as the size
775 			// is specified by the type and does not change that often
776 			if (pos + length != item->DataSize()) {
777 				// move the attributes after the current one
778 				small_data* next = item->Next();
779 				if (!next->IsLast(node)) {
780 					memmove((uint8*)item + spaceNeeded, next,
781 						(uint8*)last - (uint8*)next);
782 				}
783 
784 				// Move the "last" one to its new location and
785 				// correctly terminate the small_data section
786 				last = (small_data*)((uint8*)last
787 					- ((uint8*)next - ((uint8*)item + spaceNeeded)));
788 				if ((uint8*)last < (uint8*)node + fVolume->BlockSize()) {
789 					memset(last, 0, (uint8*)node + fVolume->BlockSize()
790 						- (uint8*)last);
791 				}
792 
793 				item->data_size = HOST_ENDIAN_TO_BFS_INT16(pos + length);
794 			}
795 
796 			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
797 
798 			if ((uint64)oldDataSize < (uint64)pos) {
799 				// Fill gap with zeros
800 				memset(item->Data() + oldDataSize, 0, pos - oldDataSize);
801 			}
802 			if (user_memcpy(item->Data() + pos, data, length) < B_OK)
803 				return B_BAD_ADDRESS;
804 			item->Data()[pos + length] = '\0';
805 
806 			return B_OK;
807 		}
808 
809 		return B_DEVICE_FULL;
810 	}
811 
812 	// try to add the new attribute!
813 
814 	if ((uint8*)item + spaceNeeded > (uint8*)node + fVolume->InodeSize()) {
815 		// there is not enough space for it!
816 		if (!force)
817 			return B_DEVICE_FULL;
818 
819 		// make room for the new attribute
820 		if (_MakeSpaceForSmallData(transaction, node, name, spaceNeeded) < B_OK)
821 			return B_ERROR;
822 
823 		// get new last item!
824 		item = node->SmallDataStart();
825 		index = 0;
826 		while (!item->IsLast(node)) {
827 			item = item->Next();
828 			index++;
829 		}
830 	}
831 
832 	memset(item, 0, spaceNeeded);
833 	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
834 	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
835 	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
836 	strcpy(item->Name(), name);
837 	if (user_memcpy(item->Data() + pos, data, length) < B_OK)
838 		return B_BAD_ADDRESS;
839 
840 	// correctly terminate the small_data section
841 	item = item->Next();
842 	if (!item->IsLast(node))
843 		memset(item, 0, (uint8*)node + fVolume->InodeSize() - (uint8*)item);
844 
845 	// update all current iterators
846 	SinglyLinkedList<AttributeIterator>::Iterator iterator
847 		= fIterators.GetIterator();
848 	while (iterator.HasNext()) {
849 		iterator.Next()->Update(index, 1);
850 	}
851 
852 	return B_OK;
853 }
854 
855 
856 /*!	Iterates through the small_data section of an inode.
857 	To start at the beginning of this section, you let smallData
858 	point to NULL, like:
859 		small_data* data = NULL;
860 		while (inode->GetNextSmallData(&data) { ... }
861 
862 	This function is reentrant and doesn't allocate any memory;
863 	you can safely stop calling it at any point (you don't need
864 	to iterate through the whole list).
865 	You need to hold the fSmallDataLock when you call this method
866 */
867 status_t
868 Inode::_GetNextSmallData(bfs_inode* node, small_data** _smallData) const
869 {
870 	if (node == NULL)
871 		RETURN_ERROR(B_BAD_VALUE);
872 
873 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
874 
875 	small_data* data = *_smallData;
876 
877 	// begin from the start?
878 	if (data == NULL)
879 		data = node->SmallDataStart();
880 	else
881 		data = data->Next();
882 
883 	// is already last item?
884 	if (data->IsLast(node))
885 		return B_ENTRY_NOT_FOUND;
886 
887 	*_smallData = data;
888 
889 	return B_OK;
890 }
891 
892 
893 /*!	Finds the attribute "name" in the small data section, and
894 	returns a pointer to it (or NULL if it doesn't exist).
895 	You need to hold the fSmallDataLock when you call this method
896 */
897 small_data*
898 Inode::FindSmallData(const bfs_inode* node, const char* name) const
899 {
900 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
901 
902 	small_data* smallData = NULL;
903 	while (_GetNextSmallData(const_cast<bfs_inode*>(node), &smallData)
904 			== B_OK) {
905 		if (!strcmp(smallData->Name(), name))
906 			return smallData;
907 	}
908 	return NULL;
909 }
910 
911 
912 /*!	Returns a pointer to the node's name if present in the small data
913 	section, NULL otherwise.
914 	You need to hold the fSmallDataLock when you call this method
915 */
916 const char*
917 Inode::Name(const bfs_inode* node) const
918 {
919 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
920 
921 	small_data* smallData = NULL;
922 	while (_GetNextSmallData((bfs_inode*)node, &smallData) == B_OK) {
923 		if (*smallData->Name() == FILE_NAME_NAME
924 			&& smallData->NameSize() == FILE_NAME_NAME_LENGTH)
925 			return (const char*)smallData->Data();
926 	}
927 	return NULL;
928 }
929 
930 
931 /*!	Copies the node's name into the provided buffer.
932 	The buffer should be B_FILE_NAME_LENGTH bytes large.
933 */
934 status_t
935 Inode::GetName(char* buffer, size_t size) const
936 {
937 	NodeGetter node(fVolume, this);
938 	if (node.Node() == NULL)
939 		return B_IO_ERROR;
940 
941 	RecursiveLocker locker(fSmallDataLock);
942 
943 	const char* name = Name(node.Node());
944 	if (name == NULL)
945 		return B_ENTRY_NOT_FOUND;
946 
947 	strlcpy(buffer, name, size);
948 	return B_OK;
949 }
950 
951 
952 /*!	Changes or set the name of a file: in the inode small_data section only, it
953 	doesn't change it in the parent directory's b+tree.
954 	Note that you need to write back the inode yourself after having called
955 	that method. It suffers from the same API decision as AddSmallData() does
956 	(and for the same reason).
957 */
958 status_t
959 Inode::SetName(Transaction& transaction, const char* name)
960 {
961 	if (name == NULL || *name == '\0')
962 		return B_BAD_VALUE;
963 
964 	NodeGetter node(fVolume, transaction, this);
965 	if (node.Node() == NULL)
966 		return B_IO_ERROR;
967 
968 	const char nameTag[2] = {FILE_NAME_NAME, 0};
969 
970 	return _AddSmallData(transaction, node, nameTag, FILE_NAME_TYPE, 0,
971 		(uint8*)name, strlen(name), true);
972 }
973 
974 
975 status_t
976 Inode::_RemoveAttribute(Transaction& transaction, const char* name,
977 	bool hasIndex, Index* index)
978 {
979 	// remove the attribute file if it exists
980 	Vnode vnode(fVolume, Attributes());
981 	Inode* attributes;
982 	status_t status = vnode.Get(&attributes);
983 	if (status < B_OK)
984 		return status;
985 
986 	// update index
987 	if (index != NULL) {
988 		Inode* attribute;
989 		if ((hasIndex || fVolume->CheckForLiveQuery(name))
990 			&& GetAttribute(name, &attribute) == B_OK) {
991 			uint8 data[MAX_INDEX_KEY_LENGTH];
992 			size_t length = MAX_INDEX_KEY_LENGTH;
993 			if (attribute->ReadAt(0, data, &length) == B_OK) {
994 				index->Update(transaction, name, attribute->Type(), data,
995 					length, NULL, 0, this);
996 			}
997 
998 			ReleaseAttribute(attribute);
999 		}
1000 	}
1001 
1002 	if ((status = attributes->Remove(transaction, name)) < B_OK)
1003 		return status;
1004 
1005 	if (attributes->IsEmpty()) {
1006 		attributes->WriteLockInTransaction(transaction);
1007 
1008 		// remove attribute directory (don't fail if that can't be done)
1009 		if (remove_vnode(fVolume->FSVolume(), attributes->ID()) == B_OK) {
1010 			// update the inode, so that no one will ever doubt it's deleted :-)
1011 			attributes->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1012 			if (attributes->WriteBack(transaction) == B_OK) {
1013 				Attributes().SetTo(0, 0, 0);
1014 				WriteBack(transaction);
1015 			} else {
1016 				unremove_vnode(fVolume->FSVolume(), attributes->ID());
1017 				attributes->Node().flags
1018 					&= ~HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1019 			}
1020 		}
1021 	}
1022 
1023 	return status;
1024 }
1025 
1026 
1027 /*!	Reads data from the specified attribute.
1028 	This is a high-level attribute function that understands attributes
1029 	in the small_data section as well as real attribute files.
1030 */
1031 status_t
1032 Inode::ReadAttribute(const char* name, int32 type, off_t pos, uint8* buffer,
1033 	size_t* _length)
1034 {
1035 	if (pos < 0)
1036 		pos = 0;
1037 
1038 	// search in the small_data section (which has to be locked first)
1039 	{
1040 		NodeGetter node(fVolume, this);
1041 		if (node.Node() == NULL)
1042 			return B_IO_ERROR;
1043 
1044 		RecursiveLocker locker(fSmallDataLock);
1045 
1046 		small_data* smallData = FindSmallData(node.Node(), name);
1047 		if (smallData != NULL) {
1048 			size_t length = *_length;
1049 			if (pos >= smallData->data_size) {
1050 				*_length = 0;
1051 				return B_OK;
1052 			}
1053 			if (length + pos > smallData->DataSize())
1054 				length = smallData->DataSize() - pos;
1055 
1056 			status_t error = user_memcpy(buffer, smallData->Data() + pos,
1057 				length);
1058 			*_length = length;
1059 			return error;
1060 		}
1061 	}
1062 
1063 	// search in the attribute directory
1064 	Inode* attribute;
1065 	status_t status = GetAttribute(name, &attribute);
1066 	if (status == B_OK) {
1067 		status = attribute->ReadAt(pos, (uint8*)buffer, _length);
1068 
1069 		ReleaseAttribute(attribute);
1070 	}
1071 
1072 	RETURN_ERROR(status);
1073 }
1074 
1075 
1076 /*!	Writes data to the specified attribute.
1077 	This is a high-level attribute function that understands attributes
1078 	in the small_data section as well as real attribute files.
1079 */
1080 status_t
1081 Inode::WriteAttribute(Transaction& transaction, const char* name, int32 type,
1082 	off_t pos, const uint8* buffer, size_t* _length, bool* _created)
1083 {
1084 	if (pos < 0)
1085 		return B_BAD_VALUE;
1086 
1087 	// needed to maintain the index
1088 	uint8 oldBuffer[MAX_INDEX_KEY_LENGTH];
1089 	uint8* oldData = NULL;
1090 	size_t oldLength = 0;
1091 	bool created = false;
1092 
1093 	// TODO: we actually depend on that the contents of "buffer" are constant.
1094 	// If they get changed during the write (hey, user programs), we may mess
1095 	// up our index trees!
1096 	// TODO: for attribute files, we need to log the first
1097 	// MAX_INDEX_KEY_LENGTH bytes of the data stream, or the same as above
1098 	// might happen.
1099 
1100 	Index index(fVolume);
1101 	bool hasIndex = index.SetTo(name) == B_OK;
1102 
1103 	Inode* attribute = NULL;
1104 	status_t status = B_OK;
1105 
1106 	if (GetAttribute(name, &attribute) != B_OK) {
1107 		// No attribute inode exists yet
1108 
1109 		// save the old attribute data
1110 		NodeGetter node(fVolume, transaction, this);
1111 		if (node.Node() == NULL)
1112 			return B_IO_ERROR;
1113 
1114 		recursive_lock_lock(&fSmallDataLock);
1115 
1116 		small_data* smallData = FindSmallData(node.Node(), name);
1117 		if (smallData != NULL) {
1118 			oldLength = smallData->DataSize();
1119 			if (oldLength > 0) {
1120 				if (oldLength > MAX_INDEX_KEY_LENGTH)
1121 					oldLength = MAX_INDEX_KEY_LENGTH;
1122 				memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
1123 			}
1124 		} else
1125 			created = true;
1126 
1127 		recursive_lock_unlock(&fSmallDataLock);
1128 
1129 		// if the attribute doesn't exist yet (as a file), try to put it in the
1130 		// small_data section first - if that fails (due to insufficent space),
1131 		// create a real attribute file
1132 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1133 			*_length);
1134 		if (status == B_DEVICE_FULL) {
1135 			if (smallData != NULL) {
1136 				// remove the old attribute from the small data section - there
1137 				// is no space left for the new data
1138 				status = _RemoveSmallData(transaction, node, name);
1139 			} else
1140 				status = B_OK;
1141 
1142 			if (status == B_OK)
1143 				status = CreateAttribute(transaction, name, type, &attribute);
1144 			if (status != B_OK)
1145 				RETURN_ERROR(status);
1146 
1147 			created = true;
1148 		} else if (status == B_OK) {
1149 			// Update status time on attribute write
1150 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1151 				bfs_inode::ToInode(real_time_clock_usecs()));
1152 
1153 			status = WriteBack(transaction);
1154 		}
1155 	}
1156 
1157 	if (attribute != NULL) {
1158 		WriteLocker writeLocker(attribute->fLock);
1159 
1160 		if (hasIndex || fVolume->CheckForLiveQuery(name)) {
1161 			// Save the old attribute data (if this fails, oldLength will
1162 			// reflect it)
1163 			while (attribute->Size() > 0) {
1164 				bigtime_t oldModified = attribute->LastModified();
1165 				writeLocker.Unlock();
1166 
1167 				oldLength = MAX_INDEX_KEY_LENGTH;
1168 				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
1169 					oldData = oldBuffer;
1170 
1171 				writeLocker.Lock();
1172 
1173 				// Read until the data hasn't changed in between
1174 				if (oldModified == attribute->LastModified())
1175 					break;
1176 
1177 				oldLength = 0;
1178 			}
1179 		}
1180 
1181 		// check if the data fits into the small_data section again
1182 		NodeGetter node(fVolume, transaction, this);
1183 		if (node.Node() == NULL)
1184 			return B_IO_ERROR;
1185 
1186 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1187 			*_length);
1188 
1189 		if (status == B_OK) {
1190 			// it does - remove its file
1191 			writeLocker.Unlock();
1192 			status = _RemoveAttribute(transaction, name, false, NULL);
1193 		} else {
1194 			// The attribute type might have been changed - we need to
1195 			// adopt the new one
1196 			attribute->Node().type = HOST_ENDIAN_TO_BFS_INT32(type);
1197 			status = attribute->WriteBack(transaction);
1198 			writeLocker.Unlock();
1199 
1200 			if (status == B_OK) {
1201 				status = attribute->WriteAt(transaction, pos, buffer,
1202 					_length);
1203 			}
1204 		}
1205 
1206 		if (status == B_OK) {
1207 			// Update status time on attribute write
1208 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1209 				bfs_inode::ToInode(real_time_clock_usecs()));
1210 
1211 			status = WriteBack(transaction);
1212 		}
1213 
1214 		attribute->WriteLockInTransaction(transaction);
1215 		ReleaseAttribute(attribute);
1216 	}
1217 
1218 	// TODO: find a better way than this "pos" thing (the begin of the old key
1219 	//	must be copied to the start of the new one for a comparison)
1220 	if (status == B_OK && pos == 0) {
1221 		// Index only the first MAX_INDEX_KEY_LENGTH bytes
1222 		uint16 length = *_length;
1223 		if (length > MAX_INDEX_KEY_LENGTH)
1224 			length = MAX_INDEX_KEY_LENGTH;
1225 
1226 		uint8 indexBuffer[MAX_INDEX_KEY_LENGTH];
1227 		// _AddSmallData() already read the buffer
1228 		user_memcpy(indexBuffer, buffer, length);
1229 
1230 		// Update index. Note, Index::Update() may be called even if
1231 		// initializing the index failed - it will just update the live
1232 		// queries in this case
1233 		if (pos < length || (uint64)pos < (uint64)oldLength) {
1234 			index.Update(transaction, name, type, oldData, oldLength,
1235 				indexBuffer, length, this);
1236 		}
1237 	}
1238 
1239 	if (_created != NULL)
1240 		*_created = created;
1241 
1242 	return status;
1243 }
1244 
1245 
1246 /*!	Removes the specified attribute from the inode.
1247 	This is a high-level attribute function that understands attributes
1248 	in the small_data section as well as real attribute files.
1249 */
1250 status_t
1251 Inode::RemoveAttribute(Transaction& transaction, const char* name)
1252 {
1253 	Index index(fVolume);
1254 	bool hasIndex = index.SetTo(name) == B_OK;
1255 	NodeGetter node(fVolume, this);
1256 	if (node.Node() == NULL)
1257 		return B_IO_ERROR;
1258 
1259 	// update index for attributes in the small_data section
1260 	{
1261 		RecursiveLocker _(fSmallDataLock);
1262 
1263 		small_data* smallData = FindSmallData(node.Node(), name);
1264 		if (smallData != NULL) {
1265 			uint32 length = smallData->DataSize();
1266 			if (length > MAX_INDEX_KEY_LENGTH)
1267 				length = MAX_INDEX_KEY_LENGTH;
1268 			index.Update(transaction, name, smallData->Type(),
1269 				smallData->Data(), length, NULL, 0, this);
1270 		}
1271 	}
1272 
1273 	status_t status = _RemoveSmallData(transaction, node, name);
1274 	if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
1275 		// remove the attribute file if it exists
1276 		status = _RemoveAttribute(transaction, name, hasIndex, &index);
1277 		if (status == B_OK) {
1278 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1279 				bfs_inode::ToInode(real_time_clock_usecs()));
1280 			WriteBack(transaction);
1281 		}
1282 	}
1283 
1284 	return status;
1285 }
1286 
1287 
1288 /*!	Returns the attribute inode with the specified \a name, in case it exists.
1289 	This method can only return real attribute files; the attributes in the
1290 	small data section are ignored.
1291 */
1292 status_t
1293 Inode::GetAttribute(const char* name, Inode** _attribute)
1294 {
1295 	// does this inode even have attributes?
1296 	if (Attributes().IsZero())
1297 		return B_ENTRY_NOT_FOUND;
1298 
1299 	Vnode vnode(fVolume, Attributes());
1300 	Inode* attributes;
1301 	if (vnode.Get(&attributes) < B_OK) {
1302 		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n",
1303 			name));
1304 		return B_ERROR;
1305 	}
1306 
1307 	BPlusTree* tree = attributes->Tree();
1308 	if (tree == NULL)
1309 		return B_BAD_VALUE;
1310 
1311 	InodeReadLocker locker(attributes);
1312 
1313 	ino_t id;
1314 	status_t status = tree->Find((uint8*)name, (uint16)strlen(name), &id);
1315 	if (status == B_OK) {
1316 		Vnode vnode(fVolume, id);
1317 		Inode* inode;
1318 		// Check if the attribute is really an attribute
1319 		if (vnode.Get(&inode) != B_OK || !inode->IsAttribute())
1320 			return B_ERROR;
1321 
1322 		*_attribute = inode;
1323 		vnode.Keep();
1324 		return B_OK;
1325 	}
1326 
1327 	return status;
1328 }
1329 
1330 
1331 void
1332 Inode::ReleaseAttribute(Inode* attribute)
1333 {
1334 	if (attribute == NULL)
1335 		return;
1336 
1337 	put_vnode(fVolume->FSVolume(), attribute->ID());
1338 }
1339 
1340 
1341 status_t
1342 Inode::CreateAttribute(Transaction& transaction, const char* name, uint32 type,
1343 	Inode** attribute)
1344 {
1345 	// do we need to create the attribute directory first?
1346 	if (Attributes().IsZero()) {
1347 		status_t status = Inode::Create(transaction, this, NULL,
1348 			S_ATTR_DIR | S_DIRECTORY | 0666, 0, 0, NULL);
1349 		if (status < B_OK)
1350 			RETURN_ERROR(status);
1351 	}
1352 	Vnode vnode(fVolume, Attributes());
1353 	Inode* attributes;
1354 	if (vnode.Get(&attributes) < B_OK)
1355 		return B_ERROR;
1356 
1357 	// Inode::Create() locks the inode for us
1358 	return Inode::Create(transaction, attributes, name,
1359 		S_ATTR | S_FILE | 0666, 0, type, NULL, NULL, attribute);
1360 }
1361 
1362 
1363 //	#pragma mark - directory tree
1364 
1365 
1366 bool
1367 Inode::IsEmpty()
1368 {
1369 	TreeIterator iterator(fTree);
1370 
1371 	uint32 count = 0;
1372 	char name[MAX_INDEX_KEY_LENGTH + 1];
1373 	uint16 length;
1374 	ino_t id;
1375 	while (iterator.GetNextEntry(name, &length, MAX_INDEX_KEY_LENGTH + 1,
1376 			&id) == B_OK) {
1377 		if ((Mode() & (S_ATTR_DIR | S_INDEX_DIR)) != 0)
1378 			return false;
1379 
1380 		// Unlike index and attribute directories, directories
1381 		// for standard files always contain ".", and "..", so
1382 		// we need to ignore those two
1383 		if (++count > 2 || (strcmp(".", name) != 0 && strcmp("..", name) != 0))
1384 			return false;
1385 	}
1386 	return true;
1387 }
1388 
1389 
1390 status_t
1391 Inode::ContainerContentsChanged(Transaction& transaction)
1392 {
1393 	ASSERT(!InLastModifiedIndex());
1394 
1395 	Node().last_modified_time = Node().status_change_time
1396 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1397 
1398 	return WriteBack(transaction);
1399 }
1400 
1401 
1402 //	#pragma mark - data stream
1403 
1404 
1405 /*!	Computes the number of bytes this inode occupies in the file system.
1406 	This includes the file meta blocks used for maintaining its data stream.
1407 
1408 	TODO: However, the attributes in extra files are not really accounted for;
1409 	depending on the speed penalty, this should be changed, though (the value
1410 	could be cached in the inode structure or Inode object, though).
1411 */
1412 off_t
1413 Inode::AllocatedSize() const
1414 {
1415 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0) {
1416 		// This symlink does not have a data stream
1417 		return Node().InodeSize();
1418 	}
1419 
1420 	const data_stream& data = Node().data;
1421 	uint32 blockSize = fVolume->BlockSize();
1422 	off_t size = blockSize;
1423 
1424 	if (data.MaxDoubleIndirectRange() != 0) {
1425 		off_t doubleIndirectSize = data.MaxDoubleIndirectRange()
1426 			- data.MaxIndirectRange();
1427 		int32 indirectSize = double_indirect_max_indirect_size(
1428 			data.double_indirect.Length(), fVolume->BlockSize());
1429 
1430 		size += (2 * data.double_indirect.Length()
1431 				+ doubleIndirectSize / indirectSize)
1432 			* blockSize + data.MaxDoubleIndirectRange();
1433 	} else if (data.MaxIndirectRange() != 0)
1434 		size += data.indirect.Length() + data.MaxIndirectRange();
1435 	else
1436 		size += data.MaxDirectRange();
1437 
1438 	if (!Node().attributes.IsZero()) {
1439 		// TODO: to make this exact, we'd had to count all attributes
1440 		size += 2 * blockSize;
1441 			// 2 blocks, one for the attributes inode, one for its B+tree
1442 	}
1443 
1444 	return size;
1445 }
1446 
1447 
1448 /*!	Finds the block_run where "pos" is located in the data_stream of
1449 	the inode.
1450 	If successful, "offset" will then be set to the file offset
1451 	of the block_run returned; so "pos - offset" is for the block_run
1452 	what "pos" is for the whole stream.
1453 	The caller has to make sure that "pos" is inside the stream.
1454 */
1455 status_t
1456 Inode::FindBlockRun(off_t pos, block_run& run, off_t& offset)
1457 {
1458 	data_stream* data = &Node().data;
1459 
1460 	// find matching block run
1461 
1462 	if (data->MaxIndirectRange() > 0 && pos >= data->MaxDirectRange()) {
1463 		if (data->MaxDoubleIndirectRange() > 0
1464 			&& pos >= data->MaxIndirectRange()) {
1465 			// access to double indirect blocks
1466 
1467 			CachedBlock cached(fVolume);
1468 
1469 			int32 runsPerBlock;
1470 			int32 directSize;
1471 			int32 indirectSize;
1472 			get_double_indirect_sizes(data->double_indirect.Length(),
1473 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1474 			if (directSize <= 0 || indirectSize <= 0)
1475 				RETURN_ERROR(B_BAD_DATA);
1476 
1477 			off_t start = pos - data->MaxIndirectRange();
1478 			int32 index = start / indirectSize;
1479 
1480 			block_run* indirect = (block_run*)cached.SetTo(
1481 				fVolume->ToBlock(data->double_indirect) + index / runsPerBlock);
1482 			if (indirect == NULL)
1483 				RETURN_ERROR(B_ERROR);
1484 
1485 			int32 current = (start % indirectSize) / directSize;
1486 
1487 			indirect = (block_run*)cached.SetTo(
1488 				fVolume->ToBlock(indirect[index % runsPerBlock])
1489 				+ current / runsPerBlock);
1490 			if (indirect == NULL)
1491 				RETURN_ERROR(B_ERROR);
1492 
1493 			run = indirect[current % runsPerBlock];
1494 			if (run.Length() != data->double_indirect.Length())
1495 				RETURN_ERROR(B_BAD_DATA);
1496 
1497 			offset = data->MaxIndirectRange() + (index * indirectSize)
1498 				+ (current * directSize);
1499 		} else {
1500 			// access to indirect blocks
1501 
1502 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1503 			off_t runBlockEnd = data->MaxDirectRange();
1504 
1505 			CachedBlock cached(fVolume);
1506 			off_t block = fVolume->ToBlock(data->indirect);
1507 
1508 			for (int32 i = 0; i < data->indirect.Length(); i++) {
1509 				block_run* indirect = (block_run*)cached.SetTo(block + i);
1510 				if (indirect == NULL)
1511 					RETURN_ERROR(B_IO_ERROR);
1512 
1513 				int32 current = -1;
1514 				while (++current < runsPerBlock) {
1515 					if (indirect[current].IsZero())
1516 						break;
1517 
1518 					runBlockEnd += (uint32)indirect[current].Length()
1519 						<< cached.BlockShift();
1520 					if (runBlockEnd > pos) {
1521 						run = indirect[current];
1522 						offset = runBlockEnd - ((uint32)run.Length()
1523 							<< cached.BlockShift());
1524 						return fVolume->ValidateBlockRun(run);
1525 					}
1526 				}
1527 			}
1528 			RETURN_ERROR(B_ERROR);
1529 		}
1530 	} else {
1531 		// access from direct blocks
1532 
1533 		off_t runBlockEnd = 0LL;
1534 		int32 current = -1;
1535 
1536 		while (++current < NUM_DIRECT_BLOCKS) {
1537 			if (data->direct[current].IsZero())
1538 				break;
1539 
1540 			runBlockEnd += (uint32)data->direct[current].Length()
1541 				<< fVolume->BlockShift();
1542 			if (runBlockEnd > pos) {
1543 				run = data->direct[current];
1544 				offset = runBlockEnd
1545 					- ((uint32)run.Length() << fVolume->BlockShift());
1546 				return fVolume->ValidateBlockRun(run);
1547 			}
1548 		}
1549 
1550 		return B_ENTRY_NOT_FOUND;
1551 	}
1552 	return fVolume->ValidateBlockRun(run);
1553 }
1554 
1555 
1556 status_t
1557 Inode::ReadAt(off_t pos, uint8* buffer, size_t* _length)
1558 {
1559 	size_t length = *_length;
1560 
1561 	// set/check boundaries for pos/length
1562 	if (pos < 0)
1563 		return B_BAD_VALUE;
1564 
1565 	InodeReadLocker locker(this);
1566 
1567 	if (pos >= Size() || length == 0) {
1568 		*_length = 0;
1569 		return B_NO_ERROR;
1570 	}
1571 
1572 	locker.Unlock();
1573 
1574 	return file_cache_read(FileCache(), NULL, pos, buffer, _length);
1575 }
1576 
1577 
1578 status_t
1579 Inode::WriteAt(Transaction& transaction, off_t pos, const uint8* buffer,
1580 	size_t* _length)
1581 {
1582 	InodeReadLocker locker(this);
1583 
1584 	// update the last modification time in memory, it will be written
1585 	// back to the inode, and the index when the file is closed
1586 	// TODO: should update the internal last modified time only at this point!
1587 	Node().last_modified_time = Node().status_change_time
1588 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1589 
1590 	// TODO: support INODE_LOGGED!
1591 
1592 	size_t length = *_length;
1593 	bool changeSize = (uint64)pos + (uint64)length > (uint64)Size();
1594 
1595 	// set/check boundaries for pos/length
1596 	if (pos < 0)
1597 		return B_BAD_VALUE;
1598 
1599 	locker.Unlock();
1600 
1601 	// the transaction doesn't have to be started already
1602 	if (changeSize && !transaction.IsStarted())
1603 		transaction.Start(fVolume, BlockNumber());
1604 
1605 	WriteLocker writeLocker(fLock);
1606 
1607 	// Work around possible race condition: Someone might have shrunken the file
1608 	// while we had no lock.
1609 	if (!transaction.IsStarted()
1610 		&& (uint64)pos + (uint64)length > (uint64)Size()) {
1611 		writeLocker.Unlock();
1612 		transaction.Start(fVolume, BlockNumber());
1613 		writeLocker.Lock();
1614 	}
1615 
1616 	off_t oldSize = Size();
1617 
1618 	if ((uint64)pos + (uint64)length > (uint64)oldSize) {
1619 		// let's grow the data stream to the size needed
1620 		status_t status = SetFileSize(transaction, pos + length);
1621 		if (status != B_OK) {
1622 			*_length = 0;
1623 			WriteLockInTransaction(transaction);
1624 			RETURN_ERROR(status);
1625 		}
1626 		// TODO: In theory we would need to update the file size
1627 		// index here as part of the current transaction - this might just
1628 		// be a bit too expensive, but worth a try.
1629 
1630 		// we need to write back the inode here because it has to
1631 		// go into this transaction (we cannot wait until the file
1632 		// is closed)
1633 		status = WriteBack(transaction);
1634 		if (status != B_OK) {
1635 			WriteLockInTransaction(transaction);
1636 			return status;
1637 		}
1638 	}
1639 
1640 	writeLocker.Unlock();
1641 
1642 	if (oldSize < pos)
1643 		FillGapWithZeros(oldSize, pos);
1644 
1645 	// If we don't want to write anything, we can now return (we may
1646 	// just have changed the file size using the position parameter)
1647 	if (length == 0)
1648 		return B_OK;
1649 
1650 	status_t status = file_cache_write(FileCache(), NULL, pos, buffer, _length);
1651 
1652 	if (transaction.IsStarted())
1653 		WriteLockInTransaction(transaction);
1654 
1655 	return status;
1656 }
1657 
1658 
1659 /*!	Fills the gap between the old file size and the new file size
1660 	with zeros.
1661 	It's more or less a copy of Inode::WriteAt() but it can handle
1662 	length differences of more than just 4 GB, and it never uses
1663 	the log, even if the INODE_LOGGED flag is set.
1664 */
1665 status_t
1666 Inode::FillGapWithZeros(off_t pos, off_t newSize)
1667 {
1668 	while (pos < newSize) {
1669 		size_t size;
1670 		if (newSize > pos + 1024 * 1024 * 1024)
1671 			size = 1024 * 1024 * 1024;
1672 		else
1673 			size = newSize - pos;
1674 
1675 		status_t status = file_cache_write(FileCache(), NULL, pos, NULL, &size);
1676 		if (status < B_OK)
1677 			return status;
1678 
1679 		pos += size;
1680 	}
1681 
1682 	return B_OK;
1683 }
1684 
1685 
1686 /*!	Allocates \a length blocks, and clears their contents. Growing
1687 	the indirect and double indirect range uses this method.
1688 	The allocated block_run is saved in "run"
1689 */
1690 status_t
1691 Inode::_AllocateBlockArray(Transaction& transaction, block_run& run,
1692 	size_t length, bool variableSize)
1693 {
1694 	if (!run.IsZero())
1695 		return B_BAD_VALUE;
1696 
1697 	status_t status = fVolume->Allocate(transaction, this, length, run,
1698 		variableSize ? 1 : length);
1699 	if (status != B_OK)
1700 		return status;
1701 
1702 	// make sure those blocks are empty
1703 	CachedBlock cached(fVolume);
1704 	off_t block = fVolume->ToBlock(run);
1705 
1706 	for (int32 i = 0; i < run.Length(); i++) {
1707 		block_run* runs = (block_run*)cached.SetToWritable(transaction,
1708 			block + i, true);
1709 		if (runs == NULL)
1710 			return B_IO_ERROR;
1711 	}
1712 	return B_OK;
1713 }
1714 
1715 
1716 /*!	Grows the stream to \a size, and fills the direct/indirect/double indirect
1717 	ranges with the runs.
1718 	This method will also determine the size of the preallocation, if any.
1719 */
1720 status_t
1721 Inode::_GrowStream(Transaction& transaction, off_t size)
1722 {
1723 	data_stream* data = &Node().data;
1724 
1725 	// is the data stream already large enough to hold the new size?
1726 	// (can be the case with preallocated blocks)
1727 	if (size < data->MaxDirectRange()
1728 		|| size < data->MaxIndirectRange()
1729 		|| size < data->MaxDoubleIndirectRange()) {
1730 		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1731 		return B_OK;
1732 	}
1733 
1734 	// how many bytes are still needed? (unused ranges are always zero)
1735 	uint16 minimum = 1;
1736 	off_t bytes;
1737 	if (data->Size() < data->MaxDoubleIndirectRange()) {
1738 		bytes = size - data->MaxDoubleIndirectRange();
1739 		// The double indirect range can only handle multiples of
1740 		// its base length
1741 		minimum = data->double_indirect.Length();
1742 	} else if (data->Size() < data->MaxIndirectRange())
1743 		bytes = size - data->MaxIndirectRange();
1744 	else if (data->Size() < data->MaxDirectRange())
1745 		bytes = size - data->MaxDirectRange();
1746 	else {
1747 		// no preallocation left to be used
1748 		bytes = size - data->Size();
1749 		if (data->MaxDoubleIndirectRange() > 0)
1750 			minimum = data->double_indirect.Length();
1751 	}
1752 
1753 	// do we have enough free blocks on the disk?
1754 	off_t blocksNeeded = (bytes + fVolume->BlockSize() - 1)
1755 		>> fVolume->BlockShift();
1756 	if (blocksNeeded > fVolume->FreeBlocks())
1757 		return B_DEVICE_FULL;
1758 
1759 	off_t blocksRequested = blocksNeeded;
1760 		// because of preallocations and partial allocations, the number of
1761 		// blocks we need to allocate may be different from the one we request
1762 		// from the block allocator
1763 
1764 	// Should we preallocate some blocks?
1765 	// Attributes, attribute directories, and long symlinks usually won't get
1766 	// that big, and should stay close to the inode - preallocating could be
1767 	// counterproductive.
1768 	// Also, if free disk space is tight, don't preallocate.
1769 	if (!IsAttribute() && !IsAttributeDirectory() && !IsSymLink()
1770 		&& fVolume->FreeBlocks() > 128) {
1771 		off_t roundTo = 0;
1772 		if (IsFile()) {
1773 			// Request preallocated blocks depending on the file size and growth
1774 			if (size < 1 * 1024 * 1024 && bytes < 512 * 1024) {
1775 				// Preallocate 64 KB for file sizes <1 MB and grow rates <512 KB
1776 				roundTo = 65536 >> fVolume->BlockShift();
1777 			} else if (size < 32 * 1024 * 1024 && bytes <= 1 * 1024 * 1024) {
1778 				// Preallocate 512 KB for file sizes between 1 MB and 32 MB, and
1779 				// grow rates smaller than 1 MB
1780 				roundTo = (512 * 1024) >> fVolume->BlockShift();
1781 			} else {
1782 				// Preallocate 1/16 of the file size (ie. 4 MB for 64 MB,
1783 				// 64 MB for 1 GB)
1784 				roundTo = size >> (fVolume->BlockShift() + 4);
1785 			}
1786 		} else if (IsIndex()) {
1787 			// Always preallocate 64 KB for index directories
1788 			roundTo = 65536 >> fVolume->BlockShift();
1789 		} else {
1790 			// Preallocate only 4 KB - directories only get trimmed when their
1791 			// vnode is flushed, which might not happen very often.
1792 			roundTo = 4096 >> fVolume->BlockShift();
1793 		}
1794 		if (roundTo > 1) {
1795 			// Round to next "roundTo" block count
1796 			blocksRequested = ((blocksNeeded + roundTo) / roundTo) * roundTo;
1797 		}
1798 	}
1799 
1800 	while (blocksNeeded > 0) {
1801 		// the requested blocks do not need to be returned with a
1802 		// single allocation, so we need to iterate until we have
1803 		// enough blocks allocated
1804 		if (minimum > 1) {
1805 			// make sure that "blocks" is a multiple of minimum
1806 			blocksRequested = round_up(blocksRequested, minimum);
1807 		}
1808 
1809 		block_run run;
1810 		status_t status = fVolume->Allocate(transaction, this, blocksRequested,
1811 			run, minimum);
1812 		if (status != B_OK)
1813 			return status;
1814 
1815 		// okay, we have the needed blocks, so just distribute them to the
1816 		// different ranges of the stream (direct, indirect & double indirect)
1817 
1818 		blocksNeeded -= run.Length();
1819 		// don't preallocate if the first allocation was already too small
1820 		blocksRequested = blocksNeeded;
1821 
1822 		// Direct block range
1823 
1824 		if (data->Size() <= data->MaxDirectRange()) {
1825 			// let's try to put them into the direct block range
1826 			int32 free = 0;
1827 			for (; free < NUM_DIRECT_BLOCKS; free++) {
1828 				if (data->direct[free].IsZero())
1829 					break;
1830 			}
1831 
1832 			if (free < NUM_DIRECT_BLOCKS) {
1833 				// can we merge the last allocated run with the new one?
1834 				int32 last = free - 1;
1835 				if (free > 0 && data->direct[last].MergeableWith(run)) {
1836 					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(
1837 						data->direct[last].Length() + run.Length());
1838 				} else
1839 					data->direct[free] = run;
1840 
1841 				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(
1842 					data->MaxDirectRange()
1843 					+ run.Length() * fVolume->BlockSize());
1844 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1845 					? data->max_direct_range : size);
1846 				continue;
1847 			}
1848 		}
1849 
1850 		// Indirect block range
1851 
1852 		if (data->Size() <= data->MaxIndirectRange()
1853 			|| !data->MaxIndirectRange()) {
1854 			CachedBlock cached(fVolume);
1855 			block_run* runs = NULL;
1856 			uint32 free = 0;
1857 			off_t block;
1858 
1859 			// if there is no indirect block yet, create one
1860 			if (data->indirect.IsZero()) {
1861 				status = _AllocateBlockArray(transaction, data->indirect,
1862 					NUM_ARRAY_BLOCKS, true);
1863 				if (status != B_OK)
1864 					return status;
1865 
1866 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1867 					data->MaxDirectRange());
1868 				// insert the block_run in the first block
1869 				runs = (block_run*)cached.SetTo(data->indirect);
1870 			} else {
1871 				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1872 				block = fVolume->ToBlock(data->indirect);
1873 
1874 				// search first empty entry
1875 				int32 i = 0;
1876 				for (; i < data->indirect.Length(); i++) {
1877 					if ((runs = (block_run*)cached.SetTo(block + i)) == NULL)
1878 						return B_IO_ERROR;
1879 
1880 					for (free = 0; free < numberOfRuns; free++)
1881 						if (runs[free].IsZero())
1882 							break;
1883 
1884 					if (free < numberOfRuns)
1885 						break;
1886 				}
1887 				if (i == data->indirect.Length())
1888 					runs = NULL;
1889 			}
1890 
1891 			if (runs != NULL) {
1892 				// try to insert the run to the last one - note that this
1893 				// doesn't take block borders into account, so it could be
1894 				// further optimized
1895 				cached.MakeWritable(transaction);
1896 
1897 				int32 last = free - 1;
1898 				if (free > 0 && runs[last].MergeableWith(run)) {
1899 					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(
1900 						runs[last].Length() + run.Length());
1901 				} else
1902 					runs[free] = run;
1903 
1904 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1905 					data->MaxIndirectRange()
1906 					+ ((uint32)run.Length() << fVolume->BlockShift()));
1907 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1908 					? data->MaxIndirectRange() : size);
1909 				continue;
1910 			}
1911 		}
1912 
1913 		// Double indirect block range
1914 
1915 		if (data->Size() <= data->MaxDoubleIndirectRange()
1916 			|| !data->max_double_indirect_range) {
1917 			// We make sure here that we have this minimum allocated, so if
1918 			// the allocation succeeds, we don't run into an endless loop.
1919 			if (!data->max_double_indirect_range)
1920 				minimum = _DoubleIndirectBlockLength();
1921 			else
1922 				minimum = data->double_indirect.Length();
1923 
1924 			if ((run.Length() % minimum) != 0) {
1925 				// The number of allocated blocks isn't a multiple of 'minimum',
1926 				// so we have to change this. This can happen the first time the
1927 				// stream grows into the double indirect range.
1928 				// First, free the remaining blocks that don't fit into this
1929 				// multiple.
1930 				int32 rest = run.Length() % minimum;
1931 				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1932 
1933 				status = fVolume->Free(transaction,
1934 					block_run::Run(run.AllocationGroup(),
1935 					run.Start() + run.Length(), rest));
1936 				if (status != B_OK)
1937 					return status;
1938 
1939 				blocksNeeded += rest;
1940 				blocksRequested = round_up(blocksNeeded, minimum);
1941 
1942 				// Are there any blocks left in the run? If not, allocate
1943 				// a new one
1944 				if (run.length == 0)
1945 					continue;
1946 			}
1947 
1948 			// if there is no double indirect block yet, create one
1949 			if (data->double_indirect.IsZero()) {
1950 				status = _AllocateBlockArray(transaction,
1951 					data->double_indirect, _DoubleIndirectBlockLength());
1952 				if (status != B_OK)
1953 					return status;
1954 
1955 				data->max_double_indirect_range = data->max_indirect_range;
1956 			}
1957 
1958 			// calculate the index where to insert the new blocks
1959 
1960 			int32 runsPerBlock;
1961 			int32 directSize;
1962 			int32 indirectSize;
1963 			get_double_indirect_sizes(data->double_indirect.Length(),
1964 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1965 			if (directSize <= 0 || indirectSize <= 0)
1966 				return B_BAD_DATA;
1967 
1968 			off_t start = data->MaxDoubleIndirectRange()
1969 				- data->MaxIndirectRange();
1970 			int32 indirectIndex = start / indirectSize;
1971 			int32 index = (start % indirectSize) / directSize;
1972 			int32 runsPerArray = runsPerBlock * minimum;
1973 
1974 			// distribute the blocks to the array and allocate
1975 			// new array blocks when needed
1976 
1977 			CachedBlock cached(fVolume);
1978 			CachedBlock cachedDirect(fVolume);
1979 			block_run* array = NULL;
1980 			uint32 runLength = run.Length();
1981 
1982 			while (run.length != 0) {
1983 				// get the indirect array block
1984 				if (array == NULL) {
1985 					uint32 block = indirectIndex / runsPerBlock;
1986 					if (block >= minimum)
1987 						return EFBIG;
1988 
1989 					array = (block_run*)cached.SetTo(fVolume->ToBlock(
1990 						data->double_indirect) + block);
1991 					if (array == NULL)
1992 						return B_IO_ERROR;
1993 				}
1994 
1995 				do {
1996 					// do we need a new array block?
1997 					if (array[indirectIndex % runsPerBlock].IsZero()) {
1998 						cached.MakeWritable(transaction);
1999 
2000 						status = _AllocateBlockArray(transaction,
2001 							array[indirectIndex % runsPerBlock],
2002 							data->double_indirect.Length());
2003 						if (status != B_OK)
2004 							return status;
2005 					}
2006 
2007 					block_run* runs = (block_run*)cachedDirect.SetToWritable(
2008 						transaction, fVolume->ToBlock(array[indirectIndex
2009 							% runsPerBlock]) + index / runsPerBlock);
2010 					if (runs == NULL)
2011 						return B_IO_ERROR;
2012 
2013 					do {
2014 						// insert the block_run into the array
2015 						runs[index % runsPerBlock] = run;
2016 						runs[index % runsPerBlock].length
2017 							= HOST_ENDIAN_TO_BFS_INT16(minimum);
2018 
2019 						// alter the remaining block_run
2020 						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2021 							+ minimum);
2022 						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2023 							- minimum);
2024 					} while ((++index % runsPerBlock) != 0 && run.length);
2025 				} while ((index % runsPerArray) != 0 && run.length);
2026 
2027 				if (index == runsPerArray)
2028 					index = 0;
2029 				if (++indirectIndex % runsPerBlock == 0) {
2030 					array = NULL;
2031 					index = 0;
2032 				}
2033 			}
2034 
2035 			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
2036 				data->MaxDoubleIndirectRange()
2037 				+ (runLength << fVolume->BlockShift()));
2038 			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(
2039 				data->max_double_indirect_range) : size;
2040 
2041 			continue;
2042 		}
2043 
2044 		RETURN_ERROR(EFBIG);
2045 	}
2046 	// update the size of the data stream
2047 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2048 
2049 	return B_OK;
2050 }
2051 
2052 
2053 size_t
2054 Inode::_DoubleIndirectBlockLength() const
2055 {
2056 	if (fVolume->BlockSize() > DOUBLE_INDIRECT_ARRAY_SIZE)
2057 		return 1;
2058 
2059 	return DOUBLE_INDIRECT_ARRAY_SIZE / fVolume->BlockSize();
2060 }
2061 
2062 
2063 /*!	Frees the statically sized array of the double indirect part of a data
2064 	stream.
2065 */
2066 status_t
2067 Inode::_FreeStaticStreamArray(Transaction& transaction, int32 level,
2068 	block_run run, off_t size, off_t offset, off_t& max)
2069 {
2070 	int32 indirectSize;
2071 	if (level == 0) {
2072 		indirectSize = double_indirect_max_indirect_size(run.Length(),
2073 			fVolume->BlockSize());
2074 	} else {
2075 		indirectSize = double_indirect_max_direct_size(run.Length(),
2076 			fVolume->BlockSize());
2077 	}
2078 	if (indirectSize <= 0)
2079 		return B_BAD_DATA;
2080 
2081 	off_t start;
2082 	if (size > offset)
2083 		start = size - offset;
2084 	else
2085 		start = 0;
2086 
2087 	int32 index = start / indirectSize;
2088 	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
2089 
2090 	CachedBlock cached(fVolume);
2091 	off_t blockNumber = fVolume->ToBlock(run);
2092 
2093 	// set the file offset to the current block run
2094 	offset += (off_t)index * indirectSize;
2095 
2096 	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
2097 		block_run* array = (block_run*)cached.SetToWritable(transaction,
2098 			blockNumber + i);
2099 		if (array == NULL)
2100 			RETURN_ERROR(B_ERROR);
2101 
2102 		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
2103 			if (array[index].IsZero()) {
2104 				// we also want to break out of the outer loop
2105 				i = run.Length();
2106 				break;
2107 			}
2108 
2109 			status_t status = B_OK;
2110 			if (level == 0) {
2111 				status = _FreeStaticStreamArray(transaction, 1, array[index],
2112 					size, offset, max);
2113 			} else if (offset >= size)
2114 				status = fVolume->Free(transaction, array[index]);
2115 			else
2116 				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
2117 
2118 			if (status < B_OK)
2119 				RETURN_ERROR(status);
2120 
2121 			if (offset >= size)
2122 				array[index].SetTo(0, 0, 0);
2123 
2124 			offset += indirectSize;
2125 		}
2126 		index = 0;
2127 	}
2128 	return B_OK;
2129 }
2130 
2131 
2132 /*!	Frees all block_runs in the array which come after the specified size.
2133 	It also trims the last block_run that contain the size.
2134 	"offset" and "max" are maintained until the last block_run that doesn't
2135 	have to be freed - after this, the values won't be correct anymore, but
2136 	will still assure correct function for all subsequent calls.
2137 	"max" is considered to be in file system byte order.
2138 */
2139 status_t
2140 Inode::_FreeStreamArray(Transaction& transaction, block_run* array,
2141 	uint32 arrayLength, off_t size, off_t& offset, off_t& max)
2142 {
2143 	PRINT(("FreeStreamArray: arrayLength %lu, size %Ld, offset %Ld, max %Ld\n",
2144 		arrayLength, size, offset, max));
2145 
2146 	off_t newOffset = offset;
2147 	uint32 i = 0;
2148 	for (; i < arrayLength; i++, offset = newOffset) {
2149 		if (array[i].IsZero())
2150 			break;
2151 
2152 		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
2153 		if (newOffset <= size)
2154 			continue;
2155 
2156 		block_run run = array[i];
2157 
2158 		// determine the block_run to be freed
2159 		if (newOffset > size && offset < size) {
2160 			// free partial block_run (and update the original block_run)
2161 			run.start = HOST_ENDIAN_TO_BFS_INT16(array[i].Start()
2162 				+ ((size + fVolume->BlockSize() - 1 - offset)
2163 					>> fVolume->BlockShift()));
2164 				// round to next block
2165 			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2166 				- array[i].Start());
2167 			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2168 				- array[i].Length());
2169 
2170 			if (run.length == 0)
2171 				continue;
2172 
2173 			// update maximum range
2174 			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length()
2175 				<< fVolume->BlockShift()));
2176 		} else {
2177 			// free the whole block_run
2178 			array[i].SetTo(0, 0, 0);
2179 
2180 			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
2181 				max = HOST_ENDIAN_TO_BFS_INT64(offset);
2182 		}
2183 
2184 		if (fVolume->Free(transaction, run) < B_OK)
2185 			return B_IO_ERROR;
2186 	}
2187 	return B_OK;
2188 }
2189 
2190 
2191 status_t
2192 Inode::_ShrinkStream(Transaction& transaction, off_t size)
2193 {
2194 	data_stream* data = &Node().data;
2195 	status_t status;
2196 
2197 	if (data->MaxDoubleIndirectRange() > size) {
2198 		off_t* maxDoubleIndirect = &data->max_double_indirect_range;
2199 			// gcc 4 work-around: "error: cannot bind packed field
2200 			// 'data->data_stream::max_double_indirect_range' to 'off_t&'"
2201 		status = _FreeStaticStreamArray(transaction, 0, data->double_indirect,
2202 			size, data->MaxIndirectRange(), *maxDoubleIndirect);
2203 		if (status != B_OK)
2204 			return status;
2205 
2206 		if (size <= data->MaxIndirectRange()) {
2207 			fVolume->Free(transaction, data->double_indirect);
2208 			data->double_indirect.SetTo(0, 0, 0);
2209 			data->max_double_indirect_range = 0;
2210 		}
2211 	}
2212 
2213 	if (data->MaxIndirectRange() > size) {
2214 		CachedBlock cached(fVolume);
2215 		off_t block = fVolume->ToBlock(data->indirect);
2216 		off_t offset = data->MaxDirectRange();
2217 
2218 		for (int32 i = 0; i < data->indirect.Length(); i++) {
2219 			block_run* array = (block_run*)cached.SetToWritable(transaction,
2220 				block + i);
2221 			if (array == NULL)
2222 				break;
2223 
2224 			off_t* maxIndirect = &data->max_indirect_range;
2225 				// gcc 4 work-around: "error: cannot bind packed field
2226 				// 'data->data_stream::max_indirect_range' to 'off_t&'"
2227 			if (_FreeStreamArray(transaction, array, fVolume->BlockSize()
2228 					/ sizeof(block_run), size, offset, *maxIndirect) != B_OK)
2229 				return B_IO_ERROR;
2230 		}
2231 		if (data->max_direct_range == data->max_indirect_range) {
2232 			fVolume->Free(transaction, data->indirect);
2233 			data->indirect.SetTo(0, 0, 0);
2234 			data->max_indirect_range = 0;
2235 		}
2236 	}
2237 
2238 	if (data->MaxDirectRange() > size) {
2239 		off_t offset = 0;
2240 		off_t *maxDirect = &data->max_direct_range;
2241 			// gcc 4 work-around: "error: cannot bind packed field
2242 			// 'data->data_stream::max_direct_range' to 'off_t&'"
2243 		status = _FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS,
2244 			size, offset, *maxDirect);
2245 		if (status < B_OK)
2246 			return status;
2247 	}
2248 
2249 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2250 	return B_OK;
2251 }
2252 
2253 
2254 status_t
2255 Inode::SetFileSize(Transaction& transaction, off_t size)
2256 {
2257 	if (size < 0)
2258 		return B_BAD_VALUE;
2259 
2260 	off_t oldSize = Size();
2261 
2262 	if (size == oldSize)
2263 		return B_OK;
2264 
2265 	T(Resize(this, oldSize, size, false));
2266 
2267 	// should the data stream grow or shrink?
2268 	status_t status;
2269 	if (size > oldSize) {
2270 		status = _GrowStream(transaction, size);
2271 		if (status < B_OK) {
2272 			// if the growing of the stream fails, the whole operation
2273 			// fails, so we should shrink the stream to its former size
2274 			_ShrinkStream(transaction, oldSize);
2275 		}
2276 	} else
2277 		status = _ShrinkStream(transaction, size);
2278 
2279 	if (status < B_OK)
2280 		return status;
2281 
2282 	file_cache_set_size(FileCache(), size);
2283 	file_map_set_size(Map(), size);
2284 
2285 	return WriteBack(transaction);
2286 }
2287 
2288 
2289 status_t
2290 Inode::Append(Transaction& transaction, off_t bytes)
2291 {
2292 	return SetFileSize(transaction, Size() + bytes);
2293 }
2294 
2295 
2296 /*!	Checks whether or not this inode's data stream needs to be trimmed
2297 	because of an earlier preallocation.
2298 	Returns true if there are any blocks to be trimmed.
2299 */
2300 bool
2301 Inode::NeedsTrimming() const
2302 {
2303 	// We never trim preallocated index blocks to make them grow as smooth as
2304 	// possible. There are only few indices anyway, so this doesn't hurt.
2305 	// Also, if an inode is already in deleted state, we don't bother trimming
2306 	// it.
2307 	if (IsIndex() || IsDeleted()
2308 		|| (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0))
2309 		return false;
2310 
2311 	off_t roundedSize = round_up(Size(), fVolume->BlockSize());
2312 
2313 	return Node().data.MaxDirectRange() > roundedSize
2314 		|| Node().data.MaxIndirectRange() > roundedSize
2315 		|| Node().data.MaxDoubleIndirectRange() > roundedSize;
2316 }
2317 
2318 
2319 status_t
2320 Inode::TrimPreallocation(Transaction& transaction)
2321 {
2322 	T(Resize(this, max_c(Node().data.MaxDirectRange(),
2323 		Node().data.MaxIndirectRange()), Size(), true));
2324 
2325 	status_t status = _ShrinkStream(transaction, Size());
2326 	if (status < B_OK)
2327 		return status;
2328 
2329 	return WriteBack(transaction);
2330 }
2331 
2332 
2333 //!	Frees the file's data stream and removes all attributes
2334 status_t
2335 Inode::Free(Transaction& transaction)
2336 {
2337 	FUNCTION();
2338 
2339 	// Perhaps there should be an implementation of Inode::ShrinkStream() that
2340 	// just frees the data_stream, but doesn't change the inode (since it is
2341 	// freed anyway) - that would make an undelete command possible
2342 	if (!IsSymLink() || (Flags() & INODE_LONG_SYMLINK) != 0) {
2343 		status_t status = SetFileSize(transaction, 0);
2344 		if (status < B_OK)
2345 			return status;
2346 	}
2347 
2348 	// Free all attributes, and remove their indices
2349 	{
2350 		// We have to limit the scope of AttributeIterator, so that its
2351 		// destructor is not called after the inode is deleted
2352 		AttributeIterator iterator(this);
2353 
2354 		char name[B_FILE_NAME_LENGTH];
2355 		uint32 type;
2356 		size_t length;
2357 		ino_t id;
2358 		while (iterator.GetNext(name, &length, &type, &id) == B_OK) {
2359 			RemoveAttribute(transaction, name);
2360 		}
2361 	}
2362 
2363 	if (WriteBack(transaction) < B_OK)
2364 		return B_IO_ERROR;
2365 
2366 	return fVolume->Free(transaction, BlockRun());
2367 }
2368 
2369 
2370 //!	Synchronizes (writes back to disk) the file stream of the inode.
2371 status_t
2372 Inode::Sync()
2373 {
2374 	if (FileCache())
2375 		return file_cache_sync(FileCache());
2376 
2377 	// We may also want to flush the attribute's data stream to
2378 	// disk here... (do we?)
2379 
2380 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0)
2381 		return B_OK;
2382 
2383 	InodeReadLocker locker(this);
2384 
2385 	data_stream* data = &Node().data;
2386 	status_t status = B_OK;
2387 
2388 	// flush direct range
2389 
2390 	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
2391 		if (data->direct[i].IsZero())
2392 			return B_OK;
2393 
2394 		status = block_cache_sync_etc(fVolume->BlockCache(),
2395 			fVolume->ToBlock(data->direct[i]), data->direct[i].Length());
2396 		if (status != B_OK)
2397 			return status;
2398 	}
2399 
2400 	// flush indirect range
2401 
2402 	if (data->max_indirect_range == 0)
2403 		return B_OK;
2404 
2405 	CachedBlock cached(fVolume);
2406 	off_t block = fVolume->ToBlock(data->indirect);
2407 	int32 count = fVolume->BlockSize() / sizeof(block_run);
2408 
2409 	for (int32 j = 0; j < data->indirect.Length(); j++) {
2410 		block_run* runs = (block_run*)cached.SetTo(block + j);
2411 		if (runs == NULL)
2412 			break;
2413 
2414 		for (int32 i = 0; i < count; i++) {
2415 			if (runs[i].IsZero())
2416 				return B_OK;
2417 
2418 			status = block_cache_sync_etc(fVolume->BlockCache(),
2419 				fVolume->ToBlock(runs[i]), runs[i].Length());
2420 			if (status != B_OK)
2421 				return status;
2422 		}
2423 	}
2424 
2425 	// flush double indirect range
2426 
2427 	if (data->max_double_indirect_range == 0)
2428 		return B_OK;
2429 
2430 	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
2431 
2432 	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
2433 		block_run* indirectRuns = (block_run*)cached.SetTo(indirectBlock + l);
2434 		if (indirectRuns == NULL)
2435 			return B_FILE_ERROR;
2436 
2437 		CachedBlock directCached(fVolume);
2438 
2439 		for (int32 k = 0; k < count; k++) {
2440 			if (indirectRuns[k].IsZero())
2441 				return B_OK;
2442 
2443 			block = fVolume->ToBlock(indirectRuns[k]);
2444 			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
2445 				block_run* runs = (block_run*)directCached.SetTo(block + j);
2446 				if (runs == NULL)
2447 					return B_FILE_ERROR;
2448 
2449 				for (int32 i = 0; i < count; i++) {
2450 					if (runs[i].IsZero())
2451 						return B_OK;
2452 
2453 					// TODO: combine single block_runs to bigger ones when
2454 					// they are adjacent
2455 					status = block_cache_sync_etc(fVolume->BlockCache(),
2456 						fVolume->ToBlock(runs[i]), runs[i].Length());
2457 					if (status != B_OK)
2458 						return status;
2459 				}
2460 			}
2461 		}
2462 	}
2463 	return B_OK;
2464 }
2465 
2466 
2467 //	#pragma mark - TransactionListener implementation
2468 
2469 
2470 void
2471 Inode::TransactionDone(bool success)
2472 {
2473 	if (!success) {
2474 		// Revert any changes made to the cached bfs_inode
2475 		// TODO: return code gets eaten
2476 		UpdateNodeFromDisk();
2477 	}
2478 }
2479 
2480 
2481 void
2482 Inode::RemovedFromTransaction()
2483 {
2484 	Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
2485 
2486 	// See AddInode() why we do this here
2487 	if ((Flags() & INODE_DELETED) != 0)
2488 		fVolume->RemovedInodes().Add(this);
2489 
2490 	rw_lock_write_unlock(&Lock());
2491 
2492 	if (!fVolume->IsInitializing())
2493 		put_vnode(fVolume->FSVolume(), ID());
2494 }
2495 
2496 
2497 //	#pragma mark - creation/deletion
2498 
2499 
2500 status_t
2501 Inode::Remove(Transaction& transaction, const char* name, ino_t* _id,
2502 	bool isDirectory, bool force)
2503 {
2504 	if (fTree == NULL)
2505 		RETURN_ERROR(B_BAD_VALUE);
2506 
2507 	WriteLockInTransaction(transaction);
2508 
2509 	// does the file even exist?
2510 	off_t id;
2511 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) < B_OK)
2512 		return B_ENTRY_NOT_FOUND;
2513 
2514 	if (_id)
2515 		*_id = id;
2516 
2517 	Vnode vnode(fVolume, id);
2518 	Inode* inode;
2519 	status_t status = vnode.Get(&inode);
2520 	if (status < B_OK) {
2521 		REPORT_ERROR(status);
2522 		return B_ENTRY_NOT_FOUND;
2523 	}
2524 
2525 	T(Remove(inode, name));
2526 	inode->WriteLockInTransaction(transaction);
2527 
2528 	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
2529 	// bit is set for indices in BFS, not for attribute directories) - but you
2530 	// should really be able to do whatever you want with your indices
2531 	// without having to remove all files first :)
2532 	if (!inode->IsIndex() && !force) {
2533 		// if it's not of the correct type, don't delete it!
2534 		if (inode->IsContainer() != isDirectory)
2535 			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
2536 
2537 		// only delete empty directories
2538 		if (isDirectory && !inode->IsEmpty())
2539 			return B_DIRECTORY_NOT_EMPTY;
2540 	}
2541 
2542 	// remove_vnode() allows the inode to be accessed until the last put_vnode()
2543 	status = remove_vnode(fVolume->FSVolume(), id);
2544 	if (status != B_OK)
2545 		return status;
2546 
2547 	if (fTree->Remove(transaction, name, id) != B_OK && !force) {
2548 		unremove_vnode(fVolume->FSVolume(), id);
2549 		RETURN_ERROR(B_ERROR);
2550 	}
2551 
2552 #ifdef DEBUG
2553 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) == B_OK) {
2554 		DIE(("deleted entry still there"));
2555 	}
2556 #endif
2557 
2558 	ContainerContentsChanged(transaction);
2559 
2560 	// update the inode, so that no one will ever doubt it's deleted :-)
2561 	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
2562 	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
2563 
2564 	// In balance to the Inode::Create() method, the main indices
2565 	// are updated here (name, size, & last_modified)
2566 
2567 	Index index(fVolume);
2568 	if (inode->InNameIndex()) {
2569 		index.RemoveName(transaction, name, inode);
2570 			// If removing from the index fails, it is not regarded as a
2571 			// fatal error and will not be reported back!
2572 			// Deleted inodes won't be visible in queries anyway.
2573 	}
2574 
2575 	if (inode->InSizeIndex())
2576 		index.RemoveSize(transaction, inode);
2577 	if (inode->InLastModifiedIndex())
2578 		index.RemoveLastModified(transaction, inode);
2579 
2580 	return inode->WriteBack(transaction);
2581 }
2582 
2583 
2584 /*!	Creates the inode with the specified \a parent directory, and automatically
2585 	adds the created inode to that parent directory. If an attribute directory
2586 	is created, it will also automatically  be added to the \a parent inode as
2587 	such. However, the indices root node, and the regular root node won't be
2588 	added to the superblock.
2589 	It will also create the initial B+tree for the inode if it's a directory
2590 	of any kind.
2591 	\a name may be \c NULL, but only if no \a parent is given.
2592 	If the "_id" or "_inode" variable is given and non-NULL to store the
2593 	inode's ID, the inode stays locked - you have to call put_vnode() if you
2594 	don't use it anymore.
2595 	If the node already exists, this method will fail if \c O_EXCL is set, or
2596 	it's a directory or a symlink. Otherwise, it will just be returned.
2597 	If \c O_TRUNC has been specified, the file will also be truncated.
2598 */
2599 status_t
2600 Inode::Create(Transaction& transaction, Inode* parent, const char* name,
2601 	int32 mode, int openMode, uint32 type, bool* _created, ino_t* _id,
2602 	Inode** _inode, fs_vnode_ops* vnodeOps, uint32 publishFlags)
2603 {
2604 	FUNCTION_START(("name = %s, mode = %ld\n", name, mode));
2605 
2606 	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
2607 	Volume* volume = transaction.GetVolume();
2608 	BPlusTree* tree = NULL;
2609 
2610 	if (parent != NULL && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
2611 		// check if the file already exists in the directory
2612 		tree = parent->Tree();
2613 	}
2614 
2615 	if (parent != NULL) {
2616 		// the parent directory is locked during the whole inode creation
2617 		parent->WriteLockInTransaction(transaction);
2618 	}
2619 
2620 	if (parent != NULL && !volume->IsInitializing() && parent->IsContainer()) {
2621 		// don't create anything in removed directories
2622 		bool removed;
2623 		if (get_vnode_removed(volume->FSVolume(), parent->ID(), &removed)
2624 				== B_OK && removed) {
2625 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
2626 		}
2627 	}
2628 
2629 	if (tree != NULL) {
2630 		// Does the file already exist?
2631 		off_t offset;
2632 		if (tree->Find((uint8*)name, (uint16)strlen(name), &offset) == B_OK) {
2633 			// Return if the file should be a directory/link or opened in
2634 			// exclusive mode
2635 			if (S_ISDIR(mode) || S_ISLNK(mode) || (openMode & O_EXCL) != 0)
2636 				return B_FILE_EXISTS;
2637 
2638 			Vnode vnode(volume, offset);
2639 			Inode* inode;
2640 			status_t status = vnode.Get(&inode);
2641 			if (status != B_OK) {
2642 				REPORT_ERROR(status);
2643 				return B_ENTRY_NOT_FOUND;
2644 			}
2645 
2646 			if (inode->IsDirectory() && (openMode & O_RWMASK) != O_RDONLY)
2647 				return B_IS_A_DIRECTORY;
2648 			if ((openMode & O_DIRECTORY) != 0 && !inode->IsDirectory())
2649 				return B_NOT_A_DIRECTORY;
2650 
2651 			// we want to open the file, so we should have the rights to do so
2652 			if (inode->CheckPermissions(open_mode_to_access(openMode)
2653 					| ((openMode & O_TRUNC) != 0 ? W_OK : 0)) != B_OK)
2654 				return B_NOT_ALLOWED;
2655 
2656 			if ((openMode & O_TRUNC) != 0) {
2657 				// truncate the existing file
2658 				inode->WriteLockInTransaction(transaction);
2659 
2660 				status_t status = inode->SetFileSize(transaction, 0);
2661 				if (status == B_OK)
2662 					status = inode->WriteBack(transaction);
2663 
2664 				if (status != B_OK)
2665 					return status;
2666 			}
2667 
2668 			if (_created)
2669 				*_created = false;
2670 			if (_id)
2671 				*_id = inode->ID();
2672 			if (_inode)
2673 				*_inode = inode;
2674 
2675 			// Only keep the vnode in memory if the _id or _inode pointer is
2676 			// provided
2677 			if (_id != NULL || _inode != NULL)
2678 				vnode.Keep();
2679 
2680 			return B_OK;
2681 		}
2682 	} else if (parent != NULL && (mode & S_ATTR_DIR) == 0) {
2683 		return B_BAD_VALUE;
2684 	} else if ((openMode & O_DIRECTORY) != 0) {
2685 		// TODO: we might need to return B_NOT_A_DIRECTORY here
2686 		return B_ENTRY_NOT_FOUND;
2687 	}
2688 
2689 	status_t status;
2690 
2691 	// do we have the power to create new files at all?
2692 	if (parent != NULL && (status = parent->CheckPermissions(W_OK)) != B_OK)
2693 		RETURN_ERROR(status);
2694 
2695 	// allocate space for the new inode
2696 	InodeAllocator allocator(transaction);
2697 	block_run run;
2698 	Inode* inode;
2699 	status = allocator.New(&parentRun, mode, publishFlags, run, vnodeOps,
2700 		&inode);
2701 	if (status < B_OK)
2702 		return status;
2703 
2704 	T(Create(inode, parent, name, mode, openMode, type));
2705 
2706 	// Initialize the parts of the bfs_inode structure that
2707 	// InodeAllocator::New() hasn't touched yet
2708 
2709 	bfs_inode* node = &inode->Node();
2710 
2711 	if (parent == NULL) {
2712 		// we set the parent to itself in this case
2713 		// (only happens for the root and indices node)
2714 		node->parent = run;
2715 	} else
2716 		node->parent = parentRun;
2717 
2718 	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
2719 	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent
2720 		? parent->Node().GroupID() : getegid());
2721 		// the group ID is inherited from the parent, if available
2722 
2723 	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
2724 
2725 	inode->WriteBack(transaction);
2726 		// make sure the initialized node is available to others
2727 
2728 	// only add the name to regular files, directories, or symlinks
2729 	// don't add it to attributes, or indices
2730 	if (tree && inode->IsRegularNode()
2731 		&& inode->SetName(transaction, name) != B_OK)
2732 		return B_ERROR;
2733 
2734 	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2735 	// a standard directory for files - not for attributes or indices)
2736 	if (inode->IsContainer()) {
2737 		status = allocator.CreateTree();
2738 		if (status != B_OK)
2739 			return status;
2740 	}
2741 
2742 	// Add a link to the inode from the parent, depending on its type
2743 	// (the vnode is not published yet, so it is safe to make the inode
2744 	// accessable to the file system here)
2745 	if (tree != NULL) {
2746 		status = tree->Insert(transaction, name, inode->ID());
2747 	} else if (parent != NULL && (mode & S_ATTR_DIR) != 0) {
2748 		parent->Attributes() = run;
2749 		status = parent->WriteBack(transaction);
2750 	}
2751 
2752 	// Note, we only care if the inode could be made accessable for the
2753 	// two cases above; the root node or the indices root node must
2754 	// handle this case on their own (or other cases where "parent" is
2755 	// NULL)
2756 	if (status != B_OK)
2757 		RETURN_ERROR(status);
2758 
2759 	// Update the main indices (name, size & last_modified)
2760 	// (live queries might want to access us after this)
2761 
2762 	Index index(volume);
2763 	if (inode->InNameIndex() && name != NULL) {
2764 		// the name index only contains regular files
2765 		// (but not the root node where name == NULL)
2766 		status = index.InsertName(transaction, name, inode);
2767 		if (status != B_OK && status != B_BAD_INDEX) {
2768 			// We have to remove the node from the parent at this point,
2769 			// because the InodeAllocator destructor can't handle this
2770 			// case (and if it fails, we can't do anything about it...)
2771 			if (tree)
2772 				tree->Remove(transaction, name, inode->ID());
2773 			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2774 				parent->Node().attributes.SetTo(0, 0, 0);
2775 
2776 			RETURN_ERROR(status);
2777 		}
2778 	}
2779 
2780 	if (parent != NULL && parent->IsContainer())
2781 		parent->ContainerContentsChanged(transaction);
2782 
2783 	inode->UpdateOldLastModified();
2784 
2785 	// The "size" & "last_modified" indices don't contain directories.
2786 	// If adding to these indices fails, the inode creation will not be
2787 	// harmed; they are considered less important than the "name" index.
2788 	if (inode->InSizeIndex())
2789 		index.InsertSize(transaction, inode);
2790 	if (inode->InLastModifiedIndex())
2791 		index.InsertLastModified(transaction, inode);
2792 
2793 	if (inode->NeedsFileCache()) {
2794 		inode->SetFileCache(file_cache_create(volume->ID(), inode->ID(),
2795 			inode->Size()));
2796 		inode->SetMap(file_map_create(volume->ID(), inode->ID(),
2797 			inode->Size()));
2798 
2799 		if (inode->FileCache() == NULL || inode->Map() == NULL)
2800 			return B_NO_MEMORY;
2801 	}
2802 
2803 	// Everything worked well until this point, we have a fully
2804 	// initialized inode, and we want to keep it
2805 	allocator.Keep(vnodeOps, publishFlags);
2806 
2807 	if (_created)
2808 		*_created = true;
2809 	if (_id != NULL)
2810 		*_id = inode->ID();
2811 	if (_inode != NULL)
2812 		*_inode = inode;
2813 
2814 	// if either _id or _inode is passed, we will keep the inode locked
2815 	if (_id == NULL && _inode == NULL)
2816 		put_vnode(volume->FSVolume(), inode->ID());
2817 
2818 	return B_OK;
2819 }
2820 
2821 
2822 //	#pragma mark - AttributeIterator
2823 
2824 
2825 AttributeIterator::AttributeIterator(Inode* inode)
2826 	:
2827 	fCurrentSmallData(0),
2828 	fInode(inode),
2829 	fAttributes(NULL),
2830 	fIterator(NULL),
2831 	fBuffer(NULL)
2832 {
2833 	inode->_AddIterator(this);
2834 }
2835 
2836 
2837 AttributeIterator::~AttributeIterator()
2838 {
2839 	if (fAttributes)
2840 		put_vnode(fAttributes->GetVolume()->FSVolume(), fAttributes->ID());
2841 
2842 	delete fIterator;
2843 	fInode->_RemoveIterator(this);
2844 }
2845 
2846 
2847 status_t
2848 AttributeIterator::Rewind()
2849 {
2850 	fCurrentSmallData = 0;
2851 
2852 	if (fIterator != NULL)
2853 		fIterator->Rewind();
2854 
2855 	return B_OK;
2856 }
2857 
2858 
2859 status_t
2860 AttributeIterator::GetNext(char* name, size_t* _length, uint32* _type,
2861 	ino_t* _id)
2862 {
2863 	// read attributes out of the small data section
2864 
2865 	if (fCurrentSmallData >= 0) {
2866 		NodeGetter nodeGetter(fInode->GetVolume(), fInode);
2867 		if (nodeGetter.Node() == NULL)
2868 			return B_IO_ERROR;
2869 
2870 		const bfs_inode* node = nodeGetter.Node();
2871 		const small_data* item = ((bfs_inode*)node)->SmallDataStart();
2872 
2873 		RecursiveLocker _(&fInode->SmallDataLock());
2874 
2875 		int32 index = 0;
2876 		for (; !item->IsLast(node); item = item->Next(), index++) {
2877 			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2878 				&& *item->Name() == FILE_NAME_NAME)
2879 				continue;
2880 
2881 			if (index >= fCurrentSmallData)
2882 				break;
2883 		}
2884 
2885 		if (!item->IsLast(node)) {
2886 			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2887 			*_type = item->Type();
2888 			*_length = item->NameSize();
2889 			*_id = (ino_t)index;
2890 
2891 			fCurrentSmallData = index + 1;
2892 			return B_OK;
2893 		}
2894 
2895 		// stop traversing the small_data section
2896 		fCurrentSmallData = -1;
2897 	}
2898 
2899 	// read attributes out of the attribute directory
2900 
2901 	if (fInode->Attributes().IsZero())
2902 		return B_ENTRY_NOT_FOUND;
2903 
2904 	Volume* volume = fInode->GetVolume();
2905 
2906 	// if you haven't yet access to the attributes directory, get it
2907 	if (fAttributes == NULL) {
2908 		if (get_vnode(volume->FSVolume(), volume->ToVnode(fInode->Attributes()),
2909 				(void**)&fAttributes) != B_OK) {
2910 			FATAL(("get_vnode() failed in AttributeIterator::GetNext(ino_t"
2911 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2912 			return B_ENTRY_NOT_FOUND;
2913 		}
2914 
2915 		BPlusTree* tree = fAttributes->Tree();
2916 		if (tree == NULL
2917 			|| (fIterator = new(std::nothrow) TreeIterator(tree)) == NULL) {
2918 			FATAL(("could not get tree in AttributeIterator::GetNext(ino_t"
2919 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2920 			return B_ENTRY_NOT_FOUND;
2921 		}
2922 	}
2923 
2924 	uint16 length;
2925 	ino_t id;
2926 	status_t status = fIterator->GetNextEntry(name, &length,
2927 		B_FILE_NAME_LENGTH, &id);
2928 	if (status != B_OK)
2929 		return status;
2930 
2931 	Vnode vnode(volume, id);
2932 	Inode* attribute;
2933 	if ((status = vnode.Get(&attribute)) == B_OK) {
2934 		*_type = attribute->Type();
2935 		*_length = length;
2936 		*_id = id;
2937 	}
2938 
2939 	return status;
2940 }
2941 
2942 
2943 void
2944 AttributeIterator::Update(uint16 index, int8 change)
2945 {
2946 	// fCurrentSmallData points already to the next item
2947 	if (index < fCurrentSmallData)
2948 		fCurrentSmallData += change;
2949 }
2950 
2951