xref: /haiku/src/add-ons/kernel/file_systems/bfs/Inode.cpp (revision 1a3518cf757c2da8006753f83962da5935bbc82b)
1 /*
2  * Copyright 2001-2020, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Inode access functions
8 
9 
10 #include "Debug.h"
11 #include "Inode.h"
12 #include "BPlusTree.h"
13 #include "Index.h"
14 
15 
16 #if BFS_TRACING && !defined(FS_SHELL) && !defined(_BOOT_MODE)
17 namespace BFSInodeTracing {
18 
19 class Create : public AbstractTraceEntry {
20 public:
21 	Create(Inode* inode, Inode* parent, const char* name, int32 mode,
22 			int openMode, uint32 type)
23 		:
24 		fInode(inode),
25 		fID(inode->ID()),
26 		fParent(parent),
27 		fParentID(parent != NULL ? parent->ID() : 0),
28 		fMode(mode),
29 		fOpenMode(openMode),
30 		fType(type)
31 	{
32 		if (name != NULL)
33 			strlcpy(fName, name, sizeof(fName));
34 		else
35 			fName[0] = '\0';
36 
37 		Initialized();
38 	}
39 
40 	virtual void AddDump(TraceOutput& out)
41 	{
42 		out.Print("bfs:Create %Ld (%p), parent %Ld (%p), \"%s\", "
43 			"mode %lx, omode %x, type %lx", fID, fInode, fParentID,
44 			fParent, fName, fMode, fOpenMode, fType);
45 	}
46 
47 private:
48 	Inode*	fInode;
49 	ino_t	fID;
50 	Inode*	fParent;
51 	ino_t	fParentID;
52 	char	fName[32];
53 	int32	fMode;
54 	int		fOpenMode;
55 	uint32	fType;
56 };
57 
58 class Remove : public AbstractTraceEntry {
59 public:
60 	Remove(Inode* inode, const char* name)
61 		:
62 		fInode(inode),
63 		fID(inode->ID())
64 	{
65 		strlcpy(fName, name, sizeof(fName));
66 		Initialized();
67 	}
68 
69 	virtual void AddDump(TraceOutput& out)
70 	{
71 		out.Print("bfs:Remove %Ld (%p), \"%s\"", fID, fInode, fName);
72 	}
73 
74 private:
75 	Inode*	fInode;
76 	ino_t	fID;
77 	char	fName[32];
78 };
79 
80 class Action : public AbstractTraceEntry {
81 public:
82 	Action(const char* action, Inode* inode)
83 		:
84 		fInode(inode),
85 		fID(inode->ID())
86 	{
87 		strlcpy(fAction, action, sizeof(fAction));
88 		Initialized();
89 	}
90 
91 	virtual void AddDump(TraceOutput& out)
92 	{
93 		out.Print("bfs:%s %Ld (%p)\n", fAction, fID, fInode);
94 	}
95 
96 private:
97 	Inode*	fInode;
98 	ino_t	fID;
99 	char	fAction[16];
100 };
101 
102 class Resize : public AbstractTraceEntry {
103 public:
104 	Resize(Inode* inode, off_t oldSize, off_t newSize, bool trim)
105 		:
106 		fInode(inode),
107 		fID(inode->ID()),
108 		fOldSize(oldSize),
109 		fNewSize(newSize),
110 		fTrim(trim)
111 	{
112 		Initialized();
113 	}
114 
115 	virtual void AddDump(TraceOutput& out)
116 	{
117 		out.Print("bfs:%s %Ld (%p), %Ld -> %Ld", fTrim ? "Trim" : "Resize",
118 			fID, fInode, fOldSize, fNewSize);
119 	}
120 
121 private:
122 	Inode*	fInode;
123 	ino_t	fID;
124 	off_t	fOldSize;
125 	off_t	fNewSize;
126 	bool	fTrim;
127 };
128 
129 }	// namespace BFSInodeTracing
130 
131 #	define T(x) new(std::nothrow) BFSInodeTracing::x;
132 #else
133 #	define T(x) ;
134 #endif
135 
136 
137 /*!	A helper class used by Inode::Create() to keep track of the belongings
138 	of an inode creation in progress.
139 	This class will make sure everything is cleaned up properly.
140 */
141 class InodeAllocator {
142 public:
143 							InodeAllocator(Transaction& transaction);
144 							~InodeAllocator();
145 
146 			status_t		New(block_run* parentRun, mode_t mode, uint32 flags,
147 								block_run& run, fs_vnode_ops* vnodeOps,
148 								Inode** _inode);
149 			status_t		CreateTree();
150 			status_t		Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags);
151 
152 private:
153 	static	void			_TransactionListener(int32 id, int32 event,
154 								void* _inode);
155 
156 			Transaction*	fTransaction;
157 			block_run		fRun;
158 			Inode*			fInode;
159 };
160 
161 
162 InodeAllocator::InodeAllocator(Transaction& transaction)
163 	:
164 	fTransaction(&transaction),
165 	fInode(NULL)
166 {
167 }
168 
169 
170 InodeAllocator::~InodeAllocator()
171 {
172 	if (fTransaction != NULL) {
173 		Volume* volume = fTransaction->GetVolume();
174 
175 		if (fInode != NULL) {
176 			fInode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
177 				// this unblocks any pending bfs_read_vnode() calls
178 			fInode->Free(*fTransaction);
179 
180 			if (fInode->fTree != NULL)
181 				fTransaction->RemoveListener(fInode->fTree);
182 			fTransaction->RemoveListener(fInode);
183 
184 			remove_vnode(volume->FSVolume(), fInode->ID());
185 		} else
186 			volume->Free(*fTransaction, fRun);
187 	}
188 
189 	delete fInode;
190 }
191 
192 
193 status_t
194 InodeAllocator::New(block_run* parentRun, mode_t mode, uint32 publishFlags,
195 	block_run& run, fs_vnode_ops* vnodeOps, Inode** _inode)
196 {
197 	Volume* volume = fTransaction->GetVolume();
198 
199 	status_t status = volume->AllocateForInode(*fTransaction, parentRun, mode,
200 		fRun);
201 	if (status < B_OK) {
202 		// don't free the space in the destructor, because
203 		// the allocation failed
204 		fTransaction = NULL;
205 		RETURN_ERROR(status);
206 	}
207 
208 	run = fRun;
209 	fInode = new(std::nothrow) Inode(volume, *fTransaction,
210 		volume->ToVnode(run), mode, run);
211 	if (fInode == NULL)
212 		RETURN_ERROR(B_NO_MEMORY);
213 
214 	if (!volume->IsInitializing()
215 		&& (publishFlags & BFS_DO_NOT_PUBLISH_VNODE) == 0) {
216 		status = new_vnode(volume->FSVolume(), fInode->ID(), fInode,
217 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps);
218 		if (status < B_OK) {
219 			delete fInode;
220 			fInode = NULL;
221 			RETURN_ERROR(status);
222 		}
223 	}
224 
225 	fInode->WriteLockInTransaction(*fTransaction);
226 	*_inode = fInode;
227 	return B_OK;
228 }
229 
230 
231 status_t
232 InodeAllocator::CreateTree()
233 {
234 	Volume* volume = fTransaction->GetVolume();
235 
236 	// force S_STR_INDEX to be set, if no type is set
237 	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
238 		fInode->Node().mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
239 
240 	BPlusTree* tree = new(std::nothrow) BPlusTree(*fTransaction, fInode);
241 	if (tree == NULL)
242 		return B_ERROR;
243 
244 	status_t status = tree->InitCheck();
245 	if (status != B_OK) {
246 		delete tree;
247 		return status;
248 	}
249 
250 	fInode->fTree = tree;
251 
252 	if (fInode->IsRegularNode()) {
253 		if (tree->Insert(*fTransaction, ".", fInode->ID()) < B_OK
254 			|| tree->Insert(*fTransaction, "..",
255 					volume->ToVnode(fInode->Parent())) < B_OK)
256 			return B_ERROR;
257 	}
258 	return B_OK;
259 }
260 
261 
262 status_t
263 InodeAllocator::Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags)
264 {
265 	ASSERT(fInode != NULL && fTransaction != NULL);
266 	Volume* volume = fTransaction->GetVolume();
267 
268 	status_t status = fInode->WriteBack(*fTransaction);
269 	if (status < B_OK) {
270 		FATAL(("writing new inode %" B_PRIdINO " failed!\n", fInode->ID()));
271 		return status;
272 	}
273 
274 	// Symbolic links are not published -- the caller needs to do this once
275 	// the contents have been written.
276 	if (!fInode->IsSymLink() && !volume->IsInitializing()
277 		&& (publishFlags & BFS_DO_NOT_PUBLISH_VNODE) == 0) {
278 		status = publish_vnode(volume->FSVolume(), fInode->ID(), fInode,
279 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps, fInode->Mode(),
280 			publishFlags);
281 	}
282 
283 	if (status == B_OK) {
284 		cache_add_transaction_listener(volume->BlockCache(), fTransaction->ID(),
285 			TRANSACTION_ABORTED, &_TransactionListener, fInode);
286 	}
287 
288 	fTransaction = NULL;
289 	fInode = NULL;
290 
291 	return status;
292 }
293 
294 
295 /*static*/ void
296 InodeAllocator::_TransactionListener(int32 id, int32 event, void* _inode)
297 {
298 	Inode* inode = (Inode*)_inode;
299 
300 	if (event == TRANSACTION_ABORTED)
301 		panic("transaction %d aborted, inode %p still around!\n", (int)id, inode);
302 }
303 
304 
305 //	#pragma mark -
306 
307 
308 status_t
309 bfs_inode::InitCheck(Volume* volume) const
310 {
311 	if (Magic1() != INODE_MAGIC1
312 		|| !(Flags() & INODE_IN_USE)
313 		|| inode_num.Length() != 1
314 		// matches inode size?
315 		|| (uint32)InodeSize() != volume->InodeSize()
316 		// parent resides on disk?
317 		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
318 		|| parent.AllocationGroup() < 0
319 		|| parent.Start() > (1L << volume->AllocationGroupShift())
320 		|| parent.Length() != 1
321 		// attributes, too?
322 		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
323 		|| attributes.AllocationGroup() < 0
324 		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
325 		RETURN_ERROR(B_BAD_DATA);
326 
327 	if (Flags() & INODE_DELETED)
328 		return B_NOT_ALLOWED;
329 
330 	// TODO: Add some tests to check the integrity of the other stuff here,
331 	// especially for the data_stream!
332 
333 	return B_OK;
334 }
335 
336 
337 //	#pragma mark - Inode
338 
339 
340 Inode::Inode(Volume* volume, ino_t id)
341 	:
342 	fVolume(volume),
343 	fID(id),
344 	fTree(NULL),
345 	fAttributes(NULL),
346 	fCache(NULL),
347 	fMap(NULL)
348 {
349 	PRINT(("Inode::Inode(volume = %p, id = %Ld) @ %p\n", volume, id, this));
350 
351 	rw_lock_init(&fLock, "bfs inode");
352 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
353 
354 	if (UpdateNodeFromDisk() != B_OK) {
355 		// TODO: the error code gets eaten
356 		return;
357 	}
358 
359 	// these two will help to maintain the indices
360 	fOldSize = Size();
361 	fOldLastModified = LastModified();
362 
363 	if (IsContainer())
364 		fTree = new(std::nothrow) BPlusTree(this);
365 	if (NeedsFileCache()) {
366 		SetFileCache(file_cache_create(fVolume->ID(), ID(), Size()));
367 		SetMap(file_map_create(volume->ID(), ID(), Size()));
368 	}
369 }
370 
371 
372 Inode::Inode(Volume* volume, Transaction& transaction, ino_t id, mode_t mode,
373 		block_run& run)
374 	:
375 	fVolume(volume),
376 	fID(id),
377 	fTree(NULL),
378 	fAttributes(NULL),
379 	fCache(NULL),
380 	fMap(NULL)
381 {
382 	PRINT(("Inode::Inode(volume = %p, transaction = %p, id = %Ld) @ %p\n",
383 		volume, &transaction, id, this));
384 
385 	rw_lock_init(&fLock, "bfs inode");
386 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
387 
388 	NodeGetter node(volume);
389 	status_t status = node.SetToWritable(transaction, this, true);
390 	if (status != B_OK) {
391 		FATAL(("Could not read inode block %" B_PRId64 ": %s!\n", BlockNumber(),
392 			strerror(status)));
393 		return;
394 	}
395 
396 	memset(&fNode, 0, sizeof(bfs_inode));
397 
398 	// Initialize the bfs_inode structure -- it's not written back to disk
399 	// here, because it will be done so already when the inode could be
400 	// created completely.
401 
402 	Node().magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
403 	Node().inode_num = run;
404 	Node().mode = HOST_ENDIAN_TO_BFS_INT32(mode);
405 	Node().flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
406 
407 	Node().create_time = Node().last_modified_time = Node().status_change_time
408 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
409 
410 	Node().inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
411 
412 	// these two will help to maintain the indices
413 	fOldSize = Size();
414 	fOldLastModified = LastModified();
415 }
416 
417 
418 Inode::~Inode()
419 {
420 	PRINT(("Inode::~Inode() @ %p\n", this));
421 
422 	file_cache_delete(FileCache());
423 	file_map_delete(Map());
424 	delete fTree;
425 
426 	rw_lock_destroy(&fLock);
427 	recursive_lock_destroy(&fSmallDataLock);
428 }
429 
430 
431 status_t
432 Inode::InitCheck(bool checkNode) const
433 {
434 	// test inode magic and flags
435 	if (checkNode) {
436 		status_t status = Node().InitCheck(fVolume);
437 		if (status == B_BUSY)
438 			return B_BUSY;
439 
440 		if (status != B_OK) {
441 			FATAL(("inode at block %" B_PRIdOFF " corrupt!\n", BlockNumber()));
442 			RETURN_ERROR(B_BAD_DATA);
443 		}
444 	}
445 
446 	if (IsContainer()) {
447 		// inodes that have a B+tree
448 		if (fTree == NULL)
449 			RETURN_ERROR(B_NO_MEMORY);
450 
451 		status_t status = fTree->InitCheck();
452 		if (status != B_OK) {
453 			FATAL(("inode tree at block %" B_PRIdOFF " corrupt!\n",
454 				BlockNumber()));
455 			RETURN_ERROR(B_BAD_DATA);
456 		}
457 	}
458 
459 	if (NeedsFileCache() && (fCache == NULL || fMap == NULL))
460 		return B_NO_MEMORY;
461 
462 	return B_OK;
463 }
464 
465 
466 /*!	Adds this inode to the specified transaction. This means that the inode will
467 	be write locked until the transaction ended.
468 	To ensure that the inode will stay valid until that point, an extra reference
469 	is acquired to it as long as this transaction stays active.
470 */
471 void
472 Inode::WriteLockInTransaction(Transaction& transaction)
473 {
474 	// These flags can only change while holding the transaction lock
475 	if ((Flags() & INODE_IN_TRANSACTION) != 0)
476 		return;
477 
478 	// We share the same list link with the removed list, so we have to remove
479 	// the inode from that list here (and add it back when we no longer need it)
480 	if ((Flags() & INODE_DELETED) != 0)
481 		fVolume->RemovedInodes().Remove(this);
482 
483 	if (!fVolume->IsInitializing())
484 		acquire_vnode(fVolume->FSVolume(), ID());
485 
486 	rw_lock_write_lock(&Lock());
487 	Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
488 
489 	transaction.AddListener(this);
490 }
491 
492 
493 status_t
494 Inode::WriteBack(Transaction& transaction)
495 {
496 	NodeGetter node(fVolume);
497 	status_t status = node.SetToWritable(transaction, this);
498 	if (status != B_OK)
499 		return status;
500 
501 	memcpy(node.WritableNode(), &Node(), sizeof(bfs_inode));
502 	return B_OK;
503 }
504 
505 
506 status_t
507 Inode::UpdateNodeFromDisk()
508 {
509 	NodeGetter node(fVolume);
510 	status_t status = node.SetTo(this);
511 	if (status != B_OK) {
512 		FATAL(("Failed to read block %" B_PRId64 " from disk: %s!\n",
513 			BlockNumber(), strerror(status)));
514 		return status;
515 	}
516 
517 	memcpy(&fNode, node.Node(), sizeof(bfs_inode));
518 	fNode.flags &= HOST_ENDIAN_TO_BFS_INT32(INODE_PERMANENT_FLAGS);
519 	return B_OK;
520 }
521 
522 
523 status_t
524 Inode::CheckPermissions(int accessMode) const
525 {
526 	// you never have write access to a read-only volume
527 	if ((accessMode & W_OK) != 0 && fVolume->IsReadOnly())
528 		return B_READ_ONLY_DEVICE;
529 
530 	return check_access_permissions(accessMode, Mode(), (gid_t)fNode.GroupID(),
531 		(uid_t)fNode.UserID());
532 }
533 
534 
535 //	#pragma mark - attributes
536 
537 
538 void
539 Inode::_AddIterator(AttributeIterator* iterator)
540 {
541 	RecursiveLocker _(fSmallDataLock);
542 	fIterators.Add(iterator);
543 }
544 
545 
546 void
547 Inode::_RemoveIterator(AttributeIterator* iterator)
548 {
549 	RecursiveLocker _(fSmallDataLock);
550 	fIterators.Remove(iterator);
551 }
552 
553 
554 /*!	Tries to free up "bytes" space in the small_data section by moving
555 	attributes to real files. Used for system attributes like the name.
556 	You need to hold the fSmallDataLock when you call this method
557 */
558 status_t
559 Inode::_MakeSpaceForSmallData(Transaction& transaction, bfs_inode* node,
560 	const char* name, int32 bytes)
561 {
562 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
563 
564 	while (bytes > 0) {
565 		small_data* item = node->SmallDataStart();
566 		small_data* max = NULL;
567 		int32 index = 0, maxIndex = 0;
568 		for (; !item->IsLast(node); item = item->Next(), index++) {
569 			// should not remove those
570 			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
571 				continue;
572 
573 			if (max == NULL || max->Size() < item->Size()) {
574 				maxIndex = index;
575 				max = item;
576 			}
577 
578 			// Remove the first one large enough to free the needed amount of
579 			// bytes
580 			if (bytes < (int32)item->Size())
581 				break;
582 		}
583 
584 		if (item->IsLast(node) || (int32)item->Size() < bytes)
585 			return B_ERROR;
586 
587 		bytes -= max->Size();
588 
589 		// Move the attribute to a real attribute file
590 		// Luckily, this doesn't cause any index updates
591 
592 		Inode* attribute;
593 		status_t status = CreateAttribute(transaction, item->Name(),
594 			item->Type(), &attribute);
595 		if (status != B_OK)
596 			RETURN_ERROR(status);
597 
598 		size_t length = item->DataSize();
599 		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
600 
601 		ReleaseAttribute(attribute);
602 
603 		if (status != B_OK) {
604 			Vnode vnode(fVolume, Attributes());
605 			Inode* attributes;
606 			if (vnode.Get(&attributes) < B_OK
607 				|| attributes->Remove(transaction, name) < B_OK) {
608 				FATAL(("Could not remove newly created attribute!\n"));
609 			}
610 
611 			RETURN_ERROR(status);
612 		}
613 
614 		_RemoveSmallData(node, max, maxIndex);
615 	}
616 	return B_OK;
617 }
618 
619 
620 /*!	Private function which removes the given attribute from the small_data
621 	section.
622 	You need to hold the fSmallDataLock when you call this method
623 */
624 status_t
625 Inode::_RemoveSmallData(bfs_inode* node, small_data* item, int32 index)
626 {
627 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
628 
629 	small_data* next = item->Next();
630 	if (!next->IsLast(node)) {
631 		// find the last attribute
632 		small_data* last = next;
633 		while (!last->IsLast(node))
634 			last = last->Next();
635 
636 		int32 size = (uint8*)last - (uint8*)next;
637 		if (size < 0
638 			|| size > (uint8*)node + fVolume->BlockSize() - (uint8*)next)
639 			return B_BAD_DATA;
640 
641 		memmove(item, next, size);
642 
643 		// Move the "last" one to its new location and
644 		// correctly terminate the small_data section
645 		last = (small_data*)((uint8*)last - ((uint8*)next - (uint8*)item));
646 		memset(last, 0, (uint8*)node + fVolume->BlockSize() - (uint8*)last);
647 	} else
648 		memset(item, 0, item->Size());
649 
650 	// update all current iterators
651 	SinglyLinkedList<AttributeIterator>::Iterator iterator
652 		= fIterators.GetIterator();
653 	while (iterator.HasNext()) {
654 		iterator.Next()->Update(index, -1);
655 	}
656 
657 	return B_OK;
658 }
659 
660 
661 //!	Removes the given attribute from the small_data section.
662 status_t
663 Inode::_RemoveSmallData(Transaction& transaction, NodeGetter& nodeGetter,
664 	const char* name)
665 {
666 	if (name == NULL)
667 		return B_BAD_VALUE;
668 
669 	bfs_inode* node = nodeGetter.WritableNode();
670 	RecursiveLocker locker(fSmallDataLock);
671 
672 	// search for the small_data item
673 
674 	small_data* item = node->SmallDataStart();
675 	int32 index = 0;
676 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
677 		item = item->Next();
678 		index++;
679 	}
680 
681 	if (item->IsLast(node))
682 		return B_ENTRY_NOT_FOUND;
683 
684 	status_t status = nodeGetter.MakeWritable(transaction);
685 	if (status != B_OK)
686 		return status;
687 
688 	status = _RemoveSmallData(node, item, index);
689 	if (status == B_OK) {
690 		Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
691 			bfs_inode::ToInode(real_time_clock_usecs()));
692 
693 		status = WriteBack(transaction);
694 	}
695 
696 	return status;
697 }
698 
699 
700 /*!	Try to place the given attribute in the small_data section - if the
701 	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
702 	In that case, the attribute should be written to a real attribute file;
703 	it's the caller's responsibility to remove any existing attributes in the
704 	small data section if that's the case.
705 
706 	Note that you need to write back the inode yourself after having called that
707 	method - it's a bad API decision that it needs a transaction but enforces
708 	you to write back the inode all by yourself, but it's just more efficient
709 	in most cases...
710 */
711 status_t
712 Inode::_AddSmallData(Transaction& transaction, NodeGetter& nodeGetter,
713 	const char* name, uint32 type, off_t pos, const uint8* data, size_t length,
714 	bool force)
715 {
716 	bfs_inode* node = nodeGetter.WritableNode();
717 
718 	if (node == NULL || name == NULL || data == NULL)
719 		return B_BAD_VALUE;
720 
721 	// reject any requests that can't fit into the small_data section
722 	uint32 nameLength = strlen(name);
723 	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + pos + length + 1;
724 	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
725 		return B_DEVICE_FULL;
726 
727 	status_t status = nodeGetter.MakeWritable(transaction);
728 	if (status != B_OK)
729 		return status;
730 
731 	RecursiveLocker locker(fSmallDataLock);
732 
733 	// Find the last item or one with the same name we have to add
734 	small_data* item = node->SmallDataStart();
735 	int32 index = 0;
736 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
737 		item = item->Next();
738 		index++;
739 	}
740 
741 	// is the attribute already in the small_data section?
742 	// then just replace the data part of that one
743 	if (!item->IsLast(node)) {
744 		// find last attribute
745 		small_data* last = item;
746 		while (!last->IsLast(node))
747 			last = last->Next();
748 
749 		// try to change the attributes value
750 		if (item->data_size > pos + length
751 			|| force
752 			|| ((uint8*)last + pos + length - item->DataSize())
753 					<= ((uint8*)node + fVolume->InodeSize())) {
754 			// Make room for the new attribute if needed (and we are forced
755 			// to do so)
756 			if (force && ((uint8*)last + pos + length - item->DataSize())
757 					> ((uint8*)node + fVolume->InodeSize())) {
758 				// We also take the free space at the end of the small_data
759 				// section into account, and request only what's really needed
760 				uint32 needed = pos + length - item->DataSize() -
761 					(uint32)((uint8*)node + fVolume->InodeSize()
762 						- (uint8*)last);
763 
764 				if (_MakeSpaceForSmallData(transaction, node, name, needed)
765 						!= B_OK)
766 					return B_ERROR;
767 
768 				// reset our pointers
769 				item = node->SmallDataStart();
770 				index = 0;
771 				while (!item->IsLast(node) && strcmp(item->Name(), name)) {
772 					item = item->Next();
773 					index++;
774 				}
775 
776 				last = item;
777 				while (!last->IsLast(node))
778 					last = last->Next();
779 			}
780 
781 			size_t oldDataSize = item->DataSize();
782 
783 			// Normally, we can just overwrite the attribute data as the size
784 			// is specified by the type and does not change that often
785 			if (pos + length != item->DataSize()) {
786 				// move the attributes after the current one
787 				small_data* next = item->Next();
788 				if (!next->IsLast(node)) {
789 					memmove((uint8*)item + spaceNeeded, next,
790 						(uint8*)last - (uint8*)next);
791 				}
792 
793 				// Move the "last" one to its new location and
794 				// correctly terminate the small_data section
795 				last = (small_data*)((uint8*)last
796 					- ((uint8*)next - ((uint8*)item + spaceNeeded)));
797 				if ((uint8*)last < (uint8*)node + fVolume->BlockSize()) {
798 					memset(last, 0, (uint8*)node + fVolume->BlockSize()
799 						- (uint8*)last);
800 				}
801 
802 				item->data_size = HOST_ENDIAN_TO_BFS_INT16(pos + length);
803 			}
804 
805 			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
806 
807 			if ((uint64)oldDataSize < (uint64)pos) {
808 				// Fill gap with zeros
809 				memset(item->Data() + oldDataSize, 0, pos - oldDataSize);
810 			}
811 			if (user_memcpy(item->Data() + pos, data, length) < B_OK)
812 				return B_BAD_ADDRESS;
813 			item->Data()[pos + length] = '\0';
814 
815 			return B_OK;
816 		}
817 
818 		return B_DEVICE_FULL;
819 	}
820 
821 	// try to add the new attribute!
822 
823 	if ((uint8*)item + spaceNeeded > (uint8*)node + fVolume->InodeSize()) {
824 		// there is not enough space for it!
825 		if (!force)
826 			return B_DEVICE_FULL;
827 
828 		// make room for the new attribute
829 		if (_MakeSpaceForSmallData(transaction, node, name, spaceNeeded) < B_OK)
830 			return B_ERROR;
831 
832 		// get new last item!
833 		item = node->SmallDataStart();
834 		index = 0;
835 		while (!item->IsLast(node)) {
836 			item = item->Next();
837 			index++;
838 		}
839 	}
840 
841 	memset(item, 0, spaceNeeded);
842 	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
843 	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
844 	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
845 	strcpy(item->Name(), name);
846 	if (user_memcpy(item->Data() + pos, data, length) < B_OK)
847 		return B_BAD_ADDRESS;
848 
849 	// correctly terminate the small_data section
850 	item = item->Next();
851 	if (!item->IsLast(node))
852 		memset(item, 0, (uint8*)node + fVolume->InodeSize() - (uint8*)item);
853 
854 	// update all current iterators
855 	SinglyLinkedList<AttributeIterator>::Iterator iterator
856 		= fIterators.GetIterator();
857 	while (iterator.HasNext()) {
858 		iterator.Next()->Update(index, 1);
859 	}
860 
861 	return B_OK;
862 }
863 
864 
865 /*!	Iterates through the small_data section of an inode.
866 	To start at the beginning of this section, you let smallData
867 	point to NULL, like:
868 		small_data* data = NULL;
869 		while (inode->GetNextSmallData(&data) { ... }
870 
871 	This function is reentrant and doesn't allocate any memory;
872 	you can safely stop calling it at any point (you don't need
873 	to iterate through the whole list).
874 	You need to hold the fSmallDataLock when you call this method
875 */
876 status_t
877 Inode::_GetNextSmallData(bfs_inode* node, small_data** _smallData) const
878 {
879 	if (node == NULL)
880 		RETURN_ERROR(B_BAD_VALUE);
881 
882 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
883 
884 	small_data* data = *_smallData;
885 
886 	// begin from the start?
887 	if (data == NULL)
888 		data = node->SmallDataStart();
889 	else
890 		data = data->Next();
891 
892 	// is already last item?
893 	if (data->IsLast(node))
894 		return B_ENTRY_NOT_FOUND;
895 
896 	*_smallData = data;
897 
898 	return B_OK;
899 }
900 
901 
902 /*!	Finds the attribute "name" in the small data section, and
903 	returns a pointer to it (or NULL if it doesn't exist).
904 	You need to hold the fSmallDataLock when you call this method
905 */
906 small_data*
907 Inode::FindSmallData(const bfs_inode* node, const char* name) const
908 {
909 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
910 
911 	small_data* smallData = NULL;
912 	while (_GetNextSmallData(const_cast<bfs_inode*>(node), &smallData)
913 			== B_OK) {
914 		if (!strcmp(smallData->Name(), name))
915 			return smallData;
916 	}
917 	return NULL;
918 }
919 
920 
921 /*!	Returns a pointer to the node's name if present in the small data
922 	section, NULL otherwise.
923 	You need to hold the fSmallDataLock when you call this method
924 */
925 const char*
926 Inode::Name(const bfs_inode* node) const
927 {
928 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
929 
930 	small_data* smallData = NULL;
931 	while (_GetNextSmallData((bfs_inode*)node, &smallData) == B_OK) {
932 		if (*smallData->Name() == FILE_NAME_NAME
933 			&& smallData->NameSize() == FILE_NAME_NAME_LENGTH)
934 			return (const char*)smallData->Data();
935 	}
936 	return NULL;
937 }
938 
939 
940 /*!	Copies the node's name into the provided buffer.
941 	The buffer should be B_FILE_NAME_LENGTH bytes large.
942 */
943 status_t
944 Inode::GetName(char* buffer, size_t size) const
945 {
946 	NodeGetter node(fVolume);
947 	status_t status = node.SetTo(this);
948 	if (status != B_OK)
949 		return status;
950 
951 	RecursiveLocker locker(fSmallDataLock);
952 
953 	const char* name = Name(node.Node());
954 	if (name == NULL)
955 		return B_ENTRY_NOT_FOUND;
956 
957 	strlcpy(buffer, name, size);
958 	return B_OK;
959 }
960 
961 
962 /*!	Changes or set the name of a file: in the inode small_data section only, it
963 	doesn't change it in the parent directory's b+tree.
964 	Note that you need to write back the inode yourself after having called
965 	that method. It suffers from the same API decision as AddSmallData() does
966 	(and for the same reason).
967 */
968 status_t
969 Inode::SetName(Transaction& transaction, const char* name)
970 {
971 	if (name == NULL || *name == '\0')
972 		return B_BAD_VALUE;
973 
974 	NodeGetter node(fVolume);
975 	status_t status = node.SetToWritable(transaction, this);
976 	if (status != B_OK)
977 		return status;
978 
979 	const char nameTag[2] = {FILE_NAME_NAME, 0};
980 
981 	return _AddSmallData(transaction, node, nameTag, FILE_NAME_TYPE, 0,
982 		(uint8*)name, strlen(name), true);
983 }
984 
985 
986 status_t
987 Inode::_RemoveAttribute(Transaction& transaction, const char* name,
988 	bool hasIndex, Index* index)
989 {
990 	// remove the attribute file if it exists
991 	Vnode vnode(fVolume, Attributes());
992 	Inode* attributes;
993 	status_t status = vnode.Get(&attributes);
994 	if (status < B_OK)
995 		return status;
996 
997 	// update index
998 	if (index != NULL) {
999 		Inode* attribute;
1000 		if ((hasIndex || fVolume->CheckForLiveQuery(name))
1001 			&& GetAttribute(name, &attribute) == B_OK) {
1002 			uint8 data[MAX_INDEX_KEY_LENGTH];
1003 			size_t length = MAX_INDEX_KEY_LENGTH;
1004 			if (attribute->ReadAt(0, data, &length) == B_OK) {
1005 				index->Update(transaction, name, attribute->Type(), data,
1006 					length, NULL, 0, this);
1007 			}
1008 
1009 			ReleaseAttribute(attribute);
1010 		}
1011 	}
1012 
1013 	if ((status = attributes->Remove(transaction, name)) < B_OK)
1014 		return status;
1015 
1016 	if (attributes->IsEmpty()) {
1017 		attributes->WriteLockInTransaction(transaction);
1018 
1019 		// remove attribute directory (don't fail if that can't be done)
1020 		if (remove_vnode(fVolume->FSVolume(), attributes->ID()) == B_OK) {
1021 			// update the inode, so that no one will ever doubt it's deleted :-)
1022 			attributes->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1023 			if (attributes->WriteBack(transaction) == B_OK) {
1024 				Attributes().SetTo(0, 0, 0);
1025 				WriteBack(transaction);
1026 			} else {
1027 				unremove_vnode(fVolume->FSVolume(), attributes->ID());
1028 				attributes->Node().flags
1029 					&= ~HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1030 			}
1031 		}
1032 	}
1033 
1034 	return status;
1035 }
1036 
1037 
1038 /*!	Reads data from the specified attribute.
1039 	This is a high-level attribute function that understands attributes
1040 	in the small_data section as well as real attribute files.
1041 */
1042 status_t
1043 Inode::ReadAttribute(const char* name, int32 type, off_t pos, uint8* buffer,
1044 	size_t* _length)
1045 {
1046 	if (pos < 0)
1047 		pos = 0;
1048 
1049 	// search in the small_data section (which has to be locked first)
1050 	{
1051 		NodeGetter node(fVolume);
1052 		status_t status = node.SetTo(this);
1053 		if (status != B_OK)
1054 			return status;
1055 
1056 		RecursiveLocker locker(fSmallDataLock);
1057 
1058 		small_data* smallData = FindSmallData(node.Node(), name);
1059 		if (smallData != NULL) {
1060 			size_t length = *_length;
1061 			if (pos >= smallData->data_size) {
1062 				*_length = 0;
1063 				return B_OK;
1064 			}
1065 			if (length + pos > smallData->DataSize())
1066 				length = smallData->DataSize() - pos;
1067 
1068 			status_t error = user_memcpy(buffer, smallData->Data() + pos,
1069 				length);
1070 			*_length = length;
1071 			return error;
1072 		}
1073 	}
1074 
1075 	// search in the attribute directory
1076 	Inode* attribute;
1077 	status_t status = GetAttribute(name, &attribute);
1078 	if (status == B_OK) {
1079 		status = attribute->ReadAt(pos, (uint8*)buffer, _length);
1080 
1081 		ReleaseAttribute(attribute);
1082 	}
1083 
1084 	RETURN_ERROR(status);
1085 }
1086 
1087 
1088 /*!	Writes data to the specified attribute.
1089 	This is a high-level attribute function that understands attributes
1090 	in the small_data section as well as real attribute files.
1091 */
1092 status_t
1093 Inode::WriteAttribute(Transaction& transaction, const char* name, int32 type,
1094 	off_t pos, const uint8* buffer, size_t* _length, bool* _created)
1095 {
1096 	if (pos < 0)
1097 		return B_BAD_VALUE;
1098 
1099 	// needed to maintain the index
1100 	uint8 oldBuffer[MAX_INDEX_KEY_LENGTH];
1101 	uint8* oldData = NULL;
1102 	size_t oldLength = 0;
1103 	bool created = false;
1104 
1105 	// TODO: we actually depend on that the contents of "buffer" are constant.
1106 	// If they get changed during the write (hey, user programs), we may mess
1107 	// up our index trees!
1108 	// TODO: for attribute files, we need to log the first
1109 	// MAX_INDEX_KEY_LENGTH bytes of the data stream, or the same as above
1110 	// might happen.
1111 
1112 	Index index(fVolume);
1113 	bool hasIndex = index.SetTo(name) == B_OK;
1114 
1115 	Inode* attribute = NULL;
1116 	status_t status = B_OK;
1117 
1118 	if (GetAttribute(name, &attribute) != B_OK) {
1119 		// No attribute inode exists yet
1120 
1121 		// save the old attribute data
1122 		NodeGetter node(fVolume);
1123 		status = node.SetToWritable(transaction, this);
1124 		if (status != B_OK)
1125 			return status;
1126 
1127 		recursive_lock_lock(&fSmallDataLock);
1128 
1129 		small_data* smallData = FindSmallData(node.Node(), name);
1130 		if (smallData != NULL) {
1131 			oldLength = smallData->DataSize();
1132 			if (oldLength > 0) {
1133 				if (oldLength > MAX_INDEX_KEY_LENGTH)
1134 					oldLength = MAX_INDEX_KEY_LENGTH;
1135 				memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
1136 			}
1137 		} else
1138 			created = true;
1139 
1140 		recursive_lock_unlock(&fSmallDataLock);
1141 
1142 		// if the attribute doesn't exist yet (as a file), try to put it in the
1143 		// small_data section first - if that fails (due to insufficent space),
1144 		// create a real attribute file
1145 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1146 			*_length);
1147 		if (status == B_DEVICE_FULL) {
1148 			if (smallData != NULL) {
1149 				// remove the old attribute from the small data section - there
1150 				// is no space left for the new data
1151 				status = _RemoveSmallData(transaction, node, name);
1152 			} else
1153 				status = B_OK;
1154 
1155 			if (status == B_OK)
1156 				status = CreateAttribute(transaction, name, type, &attribute);
1157 			if (status != B_OK)
1158 				RETURN_ERROR(status);
1159 
1160 			created = true;
1161 		} else if (status == B_OK) {
1162 			// Update status time on attribute write
1163 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1164 				bfs_inode::ToInode(real_time_clock_usecs()));
1165 
1166 			status = WriteBack(transaction);
1167 		}
1168 	}
1169 
1170 	if (attribute != NULL) {
1171 		WriteLocker writeLocker(attribute->fLock);
1172 
1173 		if (hasIndex || fVolume->CheckForLiveQuery(name)) {
1174 			// Save the old attribute data (if this fails, oldLength will
1175 			// reflect it)
1176 			while (attribute->Size() > 0) {
1177 				bigtime_t oldModified = attribute->LastModified();
1178 				writeLocker.Unlock();
1179 
1180 				oldLength = MAX_INDEX_KEY_LENGTH;
1181 				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
1182 					oldData = oldBuffer;
1183 
1184 				writeLocker.Lock();
1185 
1186 				// Read until the data hasn't changed in between
1187 				if (oldModified == attribute->LastModified())
1188 					break;
1189 
1190 				oldLength = 0;
1191 			}
1192 		}
1193 
1194 		// check if the data fits into the small_data section again
1195 		NodeGetter node(fVolume);
1196 		status = node.SetToWritable(transaction, this);
1197 		if (status != B_OK)
1198 			return status;
1199 
1200 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1201 			*_length);
1202 
1203 		if (status == B_OK) {
1204 			// it does - remove its file
1205 			writeLocker.Unlock();
1206 			status = _RemoveAttribute(transaction, name, false, NULL);
1207 		} else {
1208 			// The attribute type might have been changed - we need to
1209 			// adopt the new one
1210 			attribute->Node().type = HOST_ENDIAN_TO_BFS_INT32(type);
1211 			status = attribute->WriteBack(transaction);
1212 			writeLocker.Unlock();
1213 
1214 			if (status == B_OK) {
1215 				status = attribute->WriteAt(transaction, pos, buffer,
1216 					_length);
1217 			}
1218 		}
1219 
1220 		if (status == B_OK) {
1221 			// Update status time on attribute write
1222 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1223 				bfs_inode::ToInode(real_time_clock_usecs()));
1224 
1225 			status = WriteBack(transaction);
1226 		}
1227 
1228 		attribute->WriteLockInTransaction(transaction);
1229 		ReleaseAttribute(attribute);
1230 	}
1231 
1232 	// TODO: find a better way than this "pos" thing (the begin of the old key
1233 	//	must be copied to the start of the new one for a comparison)
1234 	if (status == B_OK && pos == 0) {
1235 		// Index only the first MAX_INDEX_KEY_LENGTH bytes
1236 		uint16 length = *_length;
1237 		if (length > MAX_INDEX_KEY_LENGTH)
1238 			length = MAX_INDEX_KEY_LENGTH;
1239 
1240 		uint8 indexBuffer[MAX_INDEX_KEY_LENGTH];
1241 		// _AddSmallData() already read the buffer
1242 		user_memcpy(indexBuffer, buffer, length);
1243 
1244 		// Update index. Note, Index::Update() may be called even if
1245 		// initializing the index failed - it will just update the live
1246 		// queries in this case
1247 		if (pos < length || (uint64)pos < (uint64)oldLength) {
1248 			index.Update(transaction, name, type, oldData, oldLength,
1249 				indexBuffer, length, this);
1250 		}
1251 	}
1252 
1253 	if (_created != NULL)
1254 		*_created = created;
1255 
1256 	return status;
1257 }
1258 
1259 
1260 /*!	Removes the specified attribute from the inode.
1261 	This is a high-level attribute function that understands attributes
1262 	in the small_data section as well as real attribute files.
1263 */
1264 status_t
1265 Inode::RemoveAttribute(Transaction& transaction, const char* name)
1266 {
1267 	Index index(fVolume);
1268 	bool hasIndex = index.SetTo(name) == B_OK;
1269 	NodeGetter node(fVolume);
1270 	status_t status = node.SetTo(this);
1271 	if (status != B_OK)
1272 		return status;
1273 
1274 	// update index for attributes in the small_data section
1275 	{
1276 		RecursiveLocker _(fSmallDataLock);
1277 
1278 		small_data* smallData = FindSmallData(node.Node(), name);
1279 		if (smallData != NULL) {
1280 			uint32 length = smallData->DataSize();
1281 			if (length > MAX_INDEX_KEY_LENGTH)
1282 				length = MAX_INDEX_KEY_LENGTH;
1283 			index.Update(transaction, name, smallData->Type(),
1284 				smallData->Data(), length, NULL, 0, this);
1285 		}
1286 	}
1287 
1288 	status = _RemoveSmallData(transaction, node, name);
1289 	if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
1290 		// remove the attribute file if it exists
1291 		status = _RemoveAttribute(transaction, name, hasIndex, &index);
1292 		if (status == B_OK) {
1293 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1294 				bfs_inode::ToInode(real_time_clock_usecs()));
1295 			WriteBack(transaction);
1296 		}
1297 	}
1298 
1299 	return status;
1300 }
1301 
1302 
1303 /*!	Returns the attribute inode with the specified \a name, in case it exists.
1304 	This method can only return real attribute files; the attributes in the
1305 	small data section are ignored.
1306 */
1307 status_t
1308 Inode::GetAttribute(const char* name, Inode** _attribute)
1309 {
1310 	// does this inode even have attributes?
1311 	if (Attributes().IsZero())
1312 		return B_ENTRY_NOT_FOUND;
1313 
1314 	Vnode vnode(fVolume, Attributes());
1315 	Inode* attributes;
1316 	if (vnode.Get(&attributes) < B_OK) {
1317 		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n",
1318 			name));
1319 		return B_ERROR;
1320 	}
1321 
1322 	BPlusTree* tree = attributes->Tree();
1323 	if (tree == NULL)
1324 		return B_BAD_VALUE;
1325 
1326 	InodeReadLocker locker(attributes);
1327 
1328 	ino_t id;
1329 	status_t status = tree->Find((uint8*)name, (uint16)strlen(name), &id);
1330 	if (status == B_OK) {
1331 		Vnode vnode(fVolume, id);
1332 		Inode* inode;
1333 		// Check if the attribute is really an attribute
1334 		if (vnode.Get(&inode) != B_OK || !inode->IsAttribute())
1335 			return B_ERROR;
1336 
1337 		*_attribute = inode;
1338 		vnode.Keep();
1339 		return B_OK;
1340 	}
1341 
1342 	return status;
1343 }
1344 
1345 
1346 void
1347 Inode::ReleaseAttribute(Inode* attribute)
1348 {
1349 	if (attribute == NULL)
1350 		return;
1351 
1352 	put_vnode(fVolume->FSVolume(), attribute->ID());
1353 }
1354 
1355 
1356 status_t
1357 Inode::CreateAttribute(Transaction& transaction, const char* name, uint32 type,
1358 	Inode** attribute)
1359 {
1360 	// do we need to create the attribute directory first?
1361 	if (Attributes().IsZero()) {
1362 		status_t status = Inode::Create(transaction, this, NULL,
1363 			S_ATTR_DIR | S_DIRECTORY | 0666, 0, 0, NULL);
1364 		if (status < B_OK)
1365 			RETURN_ERROR(status);
1366 	}
1367 	Vnode vnode(fVolume, Attributes());
1368 	Inode* attributes;
1369 	if (vnode.Get(&attributes) < B_OK)
1370 		return B_ERROR;
1371 
1372 	// Inode::Create() locks the inode for us
1373 	return Inode::Create(transaction, attributes, name,
1374 		S_ATTR | S_FILE | 0666, 0, type, NULL, NULL, attribute);
1375 }
1376 
1377 
1378 //	#pragma mark - directory tree
1379 
1380 
1381 bool
1382 Inode::IsEmpty()
1383 {
1384 	TreeIterator iterator(fTree);
1385 
1386 	uint32 count = 0;
1387 	char name[MAX_INDEX_KEY_LENGTH + 1];
1388 	uint16 length;
1389 	ino_t id;
1390 	while (iterator.GetNextEntry(name, &length, MAX_INDEX_KEY_LENGTH + 1,
1391 			&id) == B_OK) {
1392 		if ((Mode() & (S_ATTR_DIR | S_INDEX_DIR)) != 0)
1393 			return false;
1394 
1395 		// Unlike index and attribute directories, directories
1396 		// for standard files always contain ".", and "..", so
1397 		// we need to ignore those two
1398 		if (++count > 2 || (strcmp(".", name) != 0 && strcmp("..", name) != 0))
1399 			return false;
1400 	}
1401 	return true;
1402 }
1403 
1404 
1405 status_t
1406 Inode::ContainerContentsChanged(Transaction& transaction)
1407 {
1408 	ASSERT(!InLastModifiedIndex());
1409 
1410 	Node().last_modified_time = Node().status_change_time
1411 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1412 
1413 	return WriteBack(transaction);
1414 }
1415 
1416 
1417 //	#pragma mark - data stream
1418 
1419 
1420 /*!	Computes the number of bytes this inode occupies in the file system.
1421 	This includes the file meta blocks used for maintaining its data stream.
1422 
1423 	TODO: However, the attributes in extra files are not really accounted for;
1424 	depending on the speed penalty, this should be changed, though (the value
1425 	could be cached in the inode structure or Inode object, though).
1426 */
1427 off_t
1428 Inode::AllocatedSize() const
1429 {
1430 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0) {
1431 		// This symlink does not have a data stream
1432 		return Node().InodeSize();
1433 	}
1434 
1435 	const data_stream& data = Node().data;
1436 	uint32 blockSize = fVolume->BlockSize();
1437 	off_t size = blockSize;
1438 
1439 	if (data.MaxDoubleIndirectRange() != 0) {
1440 		off_t doubleIndirectSize = data.MaxDoubleIndirectRange()
1441 			- data.MaxIndirectRange();
1442 		int32 indirectSize = double_indirect_max_indirect_size(
1443 			data.double_indirect.Length(), fVolume->BlockSize());
1444 
1445 		size += (2 * data.double_indirect.Length()
1446 				+ doubleIndirectSize / indirectSize)
1447 			* blockSize + data.MaxDoubleIndirectRange();
1448 	} else if (data.MaxIndirectRange() != 0)
1449 		size += data.indirect.Length() + data.MaxIndirectRange();
1450 	else
1451 		size += data.MaxDirectRange();
1452 
1453 	if (!Node().attributes.IsZero()) {
1454 		// TODO: to make this exact, we'd had to count all attributes
1455 		size += 2 * blockSize;
1456 			// 2 blocks, one for the attributes inode, one for its B+tree
1457 	}
1458 
1459 	return size;
1460 }
1461 
1462 
1463 /*!	Finds the block_run where "pos" is located in the data_stream of
1464 	the inode.
1465 	If successful, "offset" will then be set to the file offset
1466 	of the block_run returned; so "pos - offset" is for the block_run
1467 	what "pos" is for the whole stream.
1468 	The caller has to make sure that "pos" is inside the stream.
1469 */
1470 status_t
1471 Inode::FindBlockRun(off_t pos, block_run& run, off_t& offset)
1472 {
1473 	data_stream* data = &Node().data;
1474 
1475 	// find matching block run
1476 
1477 	if (data->MaxIndirectRange() > 0 && pos >= data->MaxDirectRange()) {
1478 		if (data->MaxDoubleIndirectRange() > 0
1479 			&& pos >= data->MaxIndirectRange()) {
1480 			// access to double indirect blocks
1481 
1482 			CachedBlock cached(fVolume);
1483 
1484 			int32 runsPerBlock;
1485 			int32 directSize;
1486 			int32 indirectSize;
1487 			get_double_indirect_sizes(data->double_indirect.Length(),
1488 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1489 			if (directSize <= 0 || indirectSize <= 0)
1490 				RETURN_ERROR(B_BAD_DATA);
1491 
1492 			off_t start = pos - data->MaxIndirectRange();
1493 			int32 index = start / indirectSize;
1494 
1495 			status_t status = cached.SetTo(fVolume->ToBlock(
1496 				data->double_indirect) + index / runsPerBlock);
1497 			if (status != B_OK)
1498 				RETURN_ERROR(status);
1499 
1500 			block_run* indirect = (block_run*)cached.Block();
1501 			int32 current = (start % indirectSize) / directSize;
1502 
1503 			status = cached.SetTo(fVolume->ToBlock(indirect[
1504 				index % runsPerBlock]) + current / runsPerBlock);
1505 			if (status != B_OK)
1506 				RETURN_ERROR(status);
1507 
1508 			indirect = (block_run*)cached.Block();
1509 			run = indirect[current % runsPerBlock];
1510 			if (run.Length() != data->double_indirect.Length())
1511 				RETURN_ERROR(B_BAD_DATA);
1512 
1513 			offset = data->MaxIndirectRange() + (index * indirectSize)
1514 				+ (current * directSize);
1515 		} else {
1516 			// access to indirect blocks
1517 
1518 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1519 			off_t runBlockEnd = data->MaxDirectRange();
1520 
1521 			CachedBlock cached(fVolume);
1522 			off_t block = fVolume->ToBlock(data->indirect);
1523 
1524 			for (int32 i = 0; i < data->indirect.Length(); i++) {
1525 				status_t status = cached.SetTo(block + i);
1526 				if (status != B_OK)
1527 					RETURN_ERROR(status);
1528 
1529 				block_run* indirect = (block_run*)cached.Block();
1530 				int32 current = -1;
1531 				while (++current < runsPerBlock) {
1532 					if (indirect[current].IsZero())
1533 						break;
1534 
1535 					runBlockEnd += (uint32)indirect[current].Length()
1536 						<< cached.BlockShift();
1537 					if (runBlockEnd > pos) {
1538 						run = indirect[current];
1539 						offset = runBlockEnd - ((uint32)run.Length()
1540 							<< cached.BlockShift());
1541 						return fVolume->ValidateBlockRun(run);
1542 					}
1543 				}
1544 			}
1545 			RETURN_ERROR(B_ERROR);
1546 		}
1547 	} else {
1548 		// access from direct blocks
1549 
1550 		off_t runBlockEnd = 0LL;
1551 		int32 current = -1;
1552 
1553 		while (++current < NUM_DIRECT_BLOCKS) {
1554 			if (data->direct[current].IsZero())
1555 				break;
1556 
1557 			runBlockEnd += (uint32)data->direct[current].Length()
1558 				<< fVolume->BlockShift();
1559 			if (runBlockEnd > pos) {
1560 				run = data->direct[current];
1561 				offset = runBlockEnd
1562 					- ((uint32)run.Length() << fVolume->BlockShift());
1563 				return fVolume->ValidateBlockRun(run);
1564 			}
1565 		}
1566 
1567 		return B_ENTRY_NOT_FOUND;
1568 	}
1569 	return fVolume->ValidateBlockRun(run);
1570 }
1571 
1572 
1573 status_t
1574 Inode::ReadAt(off_t pos, uint8* buffer, size_t* _length)
1575 {
1576 	return file_cache_read(FileCache(), NULL, pos, buffer, _length);
1577 }
1578 
1579 
1580 status_t
1581 Inode::WriteAt(Transaction& transaction, off_t pos, const uint8* buffer,
1582 	size_t* _length)
1583 {
1584 	InodeReadLocker locker(this);
1585 
1586 	// update the last modification time in memory, it will be written
1587 	// back to the inode, and the index when the file is closed
1588 	// TODO: should update the internal last modified time only at this point!
1589 	Node().last_modified_time = Node().status_change_time
1590 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1591 
1592 	// TODO: support INODE_LOGGED!
1593 
1594 	size_t length = *_length;
1595 	bool changeSize = (uint64)pos + (uint64)length > (uint64)Size();
1596 
1597 	// set/check boundaries for pos/length
1598 	if (pos < 0)
1599 		return B_BAD_VALUE;
1600 
1601 	locker.Unlock();
1602 
1603 	// the transaction doesn't have to be started already
1604 	if (changeSize && !transaction.IsStarted())
1605 		transaction.Start(fVolume, BlockNumber());
1606 
1607 	WriteLocker writeLocker(fLock);
1608 
1609 	// Work around possible race condition: Someone might have shrunken the file
1610 	// while we had no lock.
1611 	if (!transaction.IsStarted()
1612 		&& (uint64)pos + (uint64)length > (uint64)Size()) {
1613 		writeLocker.Unlock();
1614 		transaction.Start(fVolume, BlockNumber());
1615 		writeLocker.Lock();
1616 	}
1617 
1618 	off_t oldSize = Size();
1619 
1620 	if ((uint64)pos + (uint64)length > (uint64)oldSize) {
1621 		// let's grow the data stream to the size needed
1622 		status_t status = SetFileSize(transaction, pos + length);
1623 		if (status != B_OK) {
1624 			*_length = 0;
1625 			WriteLockInTransaction(transaction);
1626 			RETURN_ERROR(status);
1627 		}
1628 		// TODO: In theory we would need to update the file size
1629 		// index here as part of the current transaction - this might just
1630 		// be a bit too expensive, but worth a try.
1631 
1632 		// we need to write back the inode here because it has to
1633 		// go into this transaction (we cannot wait until the file
1634 		// is closed)
1635 		status = WriteBack(transaction);
1636 		if (status != B_OK) {
1637 			WriteLockInTransaction(transaction);
1638 			return status;
1639 		}
1640 	}
1641 
1642 	writeLocker.Unlock();
1643 
1644 	if (oldSize < pos)
1645 		FillGapWithZeros(oldSize, pos);
1646 
1647 	// If we don't want to write anything, we can now return (we may
1648 	// just have changed the file size using the position parameter)
1649 	if (length == 0)
1650 		return B_OK;
1651 
1652 	status_t status = file_cache_write(FileCache(), NULL, pos, buffer, _length);
1653 
1654 	if (transaction.IsStarted())
1655 		WriteLockInTransaction(transaction);
1656 
1657 	return status;
1658 }
1659 
1660 
1661 /*!	Fills the gap between the old file size and the new file size
1662 	with zeros.
1663 	It's more or less a copy of Inode::WriteAt() but it can handle
1664 	length differences of more than just 4 GB, and it never uses
1665 	the log, even if the INODE_LOGGED flag is set.
1666 */
1667 status_t
1668 Inode::FillGapWithZeros(off_t pos, off_t newSize)
1669 {
1670 	while (pos < newSize) {
1671 		size_t size;
1672 		if (newSize > pos + 1024 * 1024 * 1024)
1673 			size = 1024 * 1024 * 1024;
1674 		else
1675 			size = newSize - pos;
1676 
1677 		status_t status = file_cache_write(FileCache(), NULL, pos, NULL, &size);
1678 		if (status < B_OK)
1679 			return status;
1680 
1681 		pos += size;
1682 	}
1683 
1684 	return B_OK;
1685 }
1686 
1687 
1688 /*!	Allocates \a length blocks, and clears their contents. Growing
1689 	the indirect and double indirect range uses this method.
1690 	The allocated block_run is saved in "run"
1691 */
1692 status_t
1693 Inode::_AllocateBlockArray(Transaction& transaction, block_run& run,
1694 	size_t length, bool variableSize)
1695 {
1696 	if (!run.IsZero())
1697 		return B_BAD_VALUE;
1698 
1699 	status_t status = fVolume->Allocate(transaction, this, length, run,
1700 		variableSize ? 1 : length);
1701 	if (status != B_OK)
1702 		return status;
1703 
1704 	// make sure those blocks are empty
1705 	CachedBlock cached(fVolume);
1706 	off_t block = fVolume->ToBlock(run);
1707 
1708 	for (int32 i = 0; i < run.Length(); i++) {
1709 		status = cached.SetToWritable(transaction, block + i, true);
1710 		if (status != B_OK)
1711 			return status;
1712 	}
1713 	return B_OK;
1714 }
1715 
1716 
1717 /*!	Grows the stream to \a size, and fills the direct/indirect/double indirect
1718 	ranges with the runs.
1719 	This method will also determine the size of the preallocation, if any.
1720 */
1721 status_t
1722 Inode::_GrowStream(Transaction& transaction, off_t size)
1723 {
1724 	data_stream* data = &Node().data;
1725 
1726 	// is the data stream already large enough to hold the new size?
1727 	// (can be the case with preallocated blocks)
1728 	if (size < data->MaxDirectRange()
1729 		|| size < data->MaxIndirectRange()
1730 		|| size < data->MaxDoubleIndirectRange()) {
1731 		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1732 		return B_OK;
1733 	}
1734 
1735 	// how many bytes are still needed? (unused ranges are always zero)
1736 	uint16 minimum = 1;
1737 	off_t bytes;
1738 	if (data->Size() < data->MaxDoubleIndirectRange()) {
1739 		bytes = size - data->MaxDoubleIndirectRange();
1740 		// The double indirect range can only handle multiples of
1741 		// its base length
1742 		minimum = data->double_indirect.Length();
1743 	} else if (data->Size() < data->MaxIndirectRange())
1744 		bytes = size - data->MaxIndirectRange();
1745 	else if (data->Size() < data->MaxDirectRange())
1746 		bytes = size - data->MaxDirectRange();
1747 	else {
1748 		// no preallocation left to be used
1749 		bytes = size - data->Size();
1750 		if (data->MaxDoubleIndirectRange() > 0)
1751 			minimum = data->double_indirect.Length();
1752 	}
1753 
1754 	// do we have enough free blocks on the disk?
1755 	off_t blocksNeeded = (bytes + fVolume->BlockSize() - 1)
1756 		>> fVolume->BlockShift();
1757 	if (blocksNeeded > fVolume->FreeBlocks())
1758 		return B_DEVICE_FULL;
1759 
1760 	off_t blocksRequested = blocksNeeded;
1761 		// because of preallocations and partial allocations, the number of
1762 		// blocks we need to allocate may be different from the one we request
1763 		// from the block allocator
1764 
1765 	// Should we preallocate some blocks?
1766 	// Attributes, attribute directories, and long symlinks usually won't get
1767 	// that big, and should stay close to the inode - preallocating could be
1768 	// counterproductive.
1769 	// Also, if free disk space is tight, don't preallocate.
1770 	if (!IsAttribute() && !IsAttributeDirectory() && !IsSymLink()
1771 		&& fVolume->FreeBlocks() > 128) {
1772 		off_t roundTo = 0;
1773 		if (IsFile()) {
1774 			// Request preallocated blocks depending on the file size and growth
1775 			if (size < 1 * 1024 * 1024 && bytes < 512 * 1024) {
1776 				// Preallocate 64 KB for file sizes <1 MB and grow rates <512 KB
1777 				roundTo = 65536 >> fVolume->BlockShift();
1778 			} else if (size < 32 * 1024 * 1024 && bytes <= 1 * 1024 * 1024) {
1779 				// Preallocate 512 KB for file sizes between 1 MB and 32 MB, and
1780 				// grow rates smaller than 1 MB
1781 				roundTo = (512 * 1024) >> fVolume->BlockShift();
1782 			} else {
1783 				// Preallocate 1/16 of the file size (ie. 4 MB for 64 MB,
1784 				// 64 MB for 1 GB)
1785 				roundTo = size >> (fVolume->BlockShift() + 4);
1786 			}
1787 		} else if (IsIndex()) {
1788 			// Always preallocate 64 KB for index directories
1789 			roundTo = 65536 >> fVolume->BlockShift();
1790 		} else {
1791 			// Preallocate only 4 KB - directories only get trimmed when their
1792 			// vnode is flushed, which might not happen very often.
1793 			roundTo = 4096 >> fVolume->BlockShift();
1794 		}
1795 		if (roundTo > 1) {
1796 			// Round to next "roundTo" block count
1797 			blocksRequested = ((blocksNeeded + roundTo) / roundTo) * roundTo;
1798 		}
1799 	}
1800 
1801 	while (blocksNeeded > 0) {
1802 		// the requested blocks do not need to be returned with a
1803 		// single allocation, so we need to iterate until we have
1804 		// enough blocks allocated
1805 		if (minimum > 1) {
1806 			// make sure that "blocks" is a multiple of minimum
1807 			blocksRequested = round_up(blocksRequested, minimum);
1808 		}
1809 
1810 		block_run run;
1811 		status_t status = fVolume->Allocate(transaction, this, blocksRequested,
1812 			run, minimum);
1813 		if (status != B_OK)
1814 			return status;
1815 
1816 		// okay, we have the needed blocks, so just distribute them to the
1817 		// different ranges of the stream (direct, indirect & double indirect)
1818 
1819 		blocksNeeded -= run.Length();
1820 		// don't preallocate if the first allocation was already too small
1821 		blocksRequested = blocksNeeded;
1822 
1823 		// Direct block range
1824 
1825 		if (data->Size() <= data->MaxDirectRange()) {
1826 			// let's try to put them into the direct block range
1827 			int32 free = 0;
1828 			for (; free < NUM_DIRECT_BLOCKS; free++) {
1829 				if (data->direct[free].IsZero())
1830 					break;
1831 			}
1832 
1833 			if (free < NUM_DIRECT_BLOCKS) {
1834 				// can we merge the last allocated run with the new one?
1835 				int32 last = free - 1;
1836 				if (free > 0 && data->direct[last].MergeableWith(run)) {
1837 					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(
1838 						data->direct[last].Length() + run.Length());
1839 				} else
1840 					data->direct[free] = run;
1841 
1842 				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(
1843 					data->MaxDirectRange()
1844 					+ run.Length() * fVolume->BlockSize());
1845 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1846 					? data->max_direct_range : size);
1847 				continue;
1848 			}
1849 		}
1850 
1851 		// Indirect block range
1852 
1853 		if (data->Size() <= data->MaxIndirectRange()
1854 			|| !data->MaxIndirectRange()) {
1855 			CachedBlock cached(fVolume);
1856 			block_run* runs = NULL;
1857 			uint32 free = 0;
1858 			off_t block;
1859 
1860 			// if there is no indirect block yet, create one
1861 			if (data->indirect.IsZero()) {
1862 				status = _AllocateBlockArray(transaction, data->indirect,
1863 					NUM_ARRAY_BLOCKS, true);
1864 				if (status != B_OK)
1865 					return status;
1866 
1867 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1868 					data->MaxDirectRange());
1869 				// insert the block_run in the first block
1870 				status = cached.SetTo(data->indirect);
1871 				if (status != B_OK)
1872 					return status;
1873 
1874 				runs = (block_run*)cached.Block();
1875 			} else {
1876 				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1877 				block = fVolume->ToBlock(data->indirect);
1878 
1879 				// search first empty entry
1880 				int32 i = 0;
1881 				for (; i < data->indirect.Length(); i++) {
1882 					status = cached.SetTo(block + i);
1883 					if (status != B_OK)
1884 						return status;
1885 
1886 					runs = (block_run*)cached.Block();
1887 					for (free = 0; free < numberOfRuns; free++)
1888 						if (runs[free].IsZero())
1889 							break;
1890 
1891 					if (free < numberOfRuns)
1892 						break;
1893 				}
1894 				if (i == data->indirect.Length())
1895 					runs = NULL;
1896 			}
1897 
1898 			if (runs != NULL) {
1899 				// try to insert the run to the last one - note that this
1900 				// doesn't take block borders into account, so it could be
1901 				// further optimized
1902 				cached.MakeWritable(transaction);
1903 
1904 				int32 last = free - 1;
1905 				if (free > 0 && runs[last].MergeableWith(run)) {
1906 					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(
1907 						runs[last].Length() + run.Length());
1908 				} else
1909 					runs[free] = run;
1910 
1911 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1912 					data->MaxIndirectRange()
1913 					+ ((uint32)run.Length() << fVolume->BlockShift()));
1914 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1915 					? data->MaxIndirectRange() : size);
1916 				continue;
1917 			}
1918 		}
1919 
1920 		// Double indirect block range
1921 
1922 		if (data->Size() <= data->MaxDoubleIndirectRange()
1923 			|| !data->max_double_indirect_range) {
1924 			// We make sure here that we have this minimum allocated, so if
1925 			// the allocation succeeds, we don't run into an endless loop.
1926 			if (!data->max_double_indirect_range)
1927 				minimum = _DoubleIndirectBlockLength();
1928 			else
1929 				minimum = data->double_indirect.Length();
1930 
1931 			if ((run.Length() % minimum) != 0) {
1932 				// The number of allocated blocks isn't a multiple of 'minimum',
1933 				// so we have to change this. This can happen the first time the
1934 				// stream grows into the double indirect range.
1935 				// First, free the remaining blocks that don't fit into this
1936 				// multiple.
1937 				int32 rest = run.Length() % minimum;
1938 				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1939 
1940 				status = fVolume->Free(transaction,
1941 					block_run::Run(run.AllocationGroup(),
1942 					run.Start() + run.Length(), rest));
1943 				if (status != B_OK)
1944 					return status;
1945 
1946 				blocksNeeded += rest;
1947 				blocksRequested = round_up(blocksNeeded, minimum);
1948 
1949 				// Are there any blocks left in the run? If not, allocate
1950 				// a new one
1951 				if (run.length == 0)
1952 					continue;
1953 			}
1954 
1955 			// if there is no double indirect block yet, create one
1956 			if (data->double_indirect.IsZero()) {
1957 				status = _AllocateBlockArray(transaction,
1958 					data->double_indirect, _DoubleIndirectBlockLength());
1959 				if (status != B_OK)
1960 					return status;
1961 
1962 				data->max_double_indirect_range = data->max_indirect_range;
1963 			}
1964 
1965 			// calculate the index where to insert the new blocks
1966 
1967 			int32 runsPerBlock;
1968 			int32 directSize;
1969 			int32 indirectSize;
1970 			get_double_indirect_sizes(data->double_indirect.Length(),
1971 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1972 			if (directSize <= 0 || indirectSize <= 0)
1973 				return B_BAD_DATA;
1974 
1975 			off_t start = data->MaxDoubleIndirectRange()
1976 				- data->MaxIndirectRange();
1977 			int32 indirectIndex = start / indirectSize;
1978 			int32 index = (start % indirectSize) / directSize;
1979 			int32 runsPerArray = runsPerBlock * minimum;
1980 
1981 			// distribute the blocks to the array and allocate
1982 			// new array blocks when needed
1983 
1984 			CachedBlock cached(fVolume);
1985 			CachedBlock cachedDirect(fVolume);
1986 			block_run* array = NULL;
1987 			uint32 runLength = run.Length();
1988 
1989 			while (run.length != 0) {
1990 				// get the indirect array block
1991 				if (array == NULL) {
1992 					uint32 block = indirectIndex / runsPerBlock;
1993 					if (block >= minimum)
1994 						return EFBIG;
1995 
1996 					status = cached.SetTo(fVolume->ToBlock(
1997 						data->double_indirect) + block);
1998 					if (status != B_OK)
1999 						return status;
2000 
2001 					array = (block_run*)cached.Block();
2002 				}
2003 
2004 				do {
2005 					// do we need a new array block?
2006 					if (array[indirectIndex % runsPerBlock].IsZero()) {
2007 						cached.MakeWritable(transaction);
2008 
2009 						status = _AllocateBlockArray(transaction,
2010 							array[indirectIndex % runsPerBlock],
2011 							data->double_indirect.Length());
2012 						if (status != B_OK)
2013 							return status;
2014 					}
2015 
2016 					status = cachedDirect.SetToWritable(transaction,
2017 						fVolume->ToBlock(array[indirectIndex
2018 							% runsPerBlock]) + index / runsPerBlock);
2019 					if (status != B_OK)
2020 						return status;
2021 
2022 					block_run* runs = (block_run*)cachedDirect.Block();
2023 
2024 					do {
2025 						// insert the block_run into the array
2026 						runs[index % runsPerBlock] = run;
2027 						runs[index % runsPerBlock].length
2028 							= HOST_ENDIAN_TO_BFS_INT16(minimum);
2029 
2030 						// alter the remaining block_run
2031 						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2032 							+ minimum);
2033 						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2034 							- minimum);
2035 					} while ((++index % runsPerBlock) != 0 && run.length);
2036 				} while ((index % runsPerArray) != 0 && run.length);
2037 
2038 				if (index == runsPerArray)
2039 					index = 0;
2040 				if (++indirectIndex % runsPerBlock == 0) {
2041 					array = NULL;
2042 					index = 0;
2043 				}
2044 			}
2045 
2046 			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
2047 				data->MaxDoubleIndirectRange()
2048 				+ (runLength << fVolume->BlockShift()));
2049 			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(
2050 				data->max_double_indirect_range) : size;
2051 
2052 			continue;
2053 		}
2054 
2055 		RETURN_ERROR(EFBIG);
2056 	}
2057 	// update the size of the data stream
2058 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2059 
2060 	return B_OK;
2061 }
2062 
2063 
2064 size_t
2065 Inode::_DoubleIndirectBlockLength() const
2066 {
2067 	if (fVolume->BlockSize() > DOUBLE_INDIRECT_ARRAY_SIZE)
2068 		return 1;
2069 
2070 	return DOUBLE_INDIRECT_ARRAY_SIZE / fVolume->BlockSize();
2071 }
2072 
2073 
2074 /*!	Frees the statically sized array of the double indirect part of a data
2075 	stream.
2076 */
2077 status_t
2078 Inode::_FreeStaticStreamArray(Transaction& transaction, int32 level,
2079 	block_run run, off_t size, off_t offset, off_t& max)
2080 {
2081 	int32 indirectSize;
2082 	if (level == 0) {
2083 		indirectSize = double_indirect_max_indirect_size(run.Length(),
2084 			fVolume->BlockSize());
2085 	} else {
2086 		indirectSize = double_indirect_max_direct_size(run.Length(),
2087 			fVolume->BlockSize());
2088 	}
2089 	if (indirectSize <= 0)
2090 		return B_BAD_DATA;
2091 
2092 	off_t start;
2093 	if (size > offset)
2094 		start = size - offset;
2095 	else
2096 		start = 0;
2097 
2098 	int32 index = start / indirectSize;
2099 	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
2100 
2101 	CachedBlock cached(fVolume);
2102 	off_t blockNumber = fVolume->ToBlock(run);
2103 
2104 	// set the file offset to the current block run
2105 	offset += (off_t)index * indirectSize;
2106 
2107 	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
2108 		status_t status = cached.SetToWritable(transaction, blockNumber + i);
2109 		if (status != B_OK)
2110 			RETURN_ERROR(status);
2111 
2112 		block_run* array = (block_run*)cached.WritableBlock();
2113 
2114 		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
2115 			if (array[index].IsZero()) {
2116 				// we also want to break out of the outer loop
2117 				i = run.Length();
2118 				break;
2119 			}
2120 
2121 			status_t status = B_OK;
2122 			if (level == 0) {
2123 				status = _FreeStaticStreamArray(transaction, 1, array[index],
2124 					size, offset, max);
2125 			} else if (offset >= size)
2126 				status = fVolume->Free(transaction, array[index]);
2127 			else
2128 				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
2129 
2130 			if (status < B_OK)
2131 				RETURN_ERROR(status);
2132 
2133 			if (offset >= size)
2134 				array[index].SetTo(0, 0, 0);
2135 
2136 			offset += indirectSize;
2137 		}
2138 		index = 0;
2139 	}
2140 	return B_OK;
2141 }
2142 
2143 
2144 /*!	Frees all block_runs in the array which come after the specified size.
2145 	It also trims the last block_run that contain the size.
2146 	"offset" and "max" are maintained until the last block_run that doesn't
2147 	have to be freed - after this, the values won't be correct anymore, but
2148 	will still assure correct function for all subsequent calls.
2149 	"max" is considered to be in file system byte order.
2150 */
2151 status_t
2152 Inode::_FreeStreamArray(Transaction& transaction, block_run* array,
2153 	uint32 arrayLength, off_t size, off_t& offset, off_t& max)
2154 {
2155 	PRINT(("FreeStreamArray: arrayLength %lu, size %Ld, offset %Ld, max %Ld\n",
2156 		arrayLength, size, offset, max));
2157 
2158 	off_t newOffset = offset;
2159 	uint32 i = 0;
2160 	for (; i < arrayLength; i++, offset = newOffset) {
2161 		if (array[i].IsZero())
2162 			break;
2163 
2164 		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
2165 		if (newOffset <= size)
2166 			continue;
2167 
2168 		block_run run = array[i];
2169 
2170 		// determine the block_run to be freed
2171 		if (newOffset > size && offset < size) {
2172 			// free partial block_run (and update the original block_run)
2173 			run.start = HOST_ENDIAN_TO_BFS_INT16(array[i].Start()
2174 				+ ((size + fVolume->BlockSize() - 1 - offset)
2175 					>> fVolume->BlockShift()));
2176 				// round to next block
2177 			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2178 				- array[i].Start());
2179 			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2180 				- array[i].Length());
2181 
2182 			if (run.length == 0)
2183 				continue;
2184 
2185 			// update maximum range
2186 			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length()
2187 				<< fVolume->BlockShift()));
2188 		} else {
2189 			// free the whole block_run
2190 			array[i].SetTo(0, 0, 0);
2191 
2192 			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
2193 				max = HOST_ENDIAN_TO_BFS_INT64(offset);
2194 		}
2195 
2196 		if (fVolume->Free(transaction, run) < B_OK)
2197 			return B_IO_ERROR;
2198 	}
2199 	return B_OK;
2200 }
2201 
2202 
2203 status_t
2204 Inode::_ShrinkStream(Transaction& transaction, off_t size)
2205 {
2206 	data_stream* data = &Node().data;
2207 	status_t status;
2208 
2209 	if (data->MaxDoubleIndirectRange() > size) {
2210 		off_t* maxDoubleIndirect = &data->max_double_indirect_range;
2211 			// gcc 4 work-around: "error: cannot bind packed field
2212 			// 'data->data_stream::max_double_indirect_range' to 'off_t&'"
2213 		status = _FreeStaticStreamArray(transaction, 0, data->double_indirect,
2214 			size, data->MaxIndirectRange(), *maxDoubleIndirect);
2215 		if (status != B_OK)
2216 			return status;
2217 
2218 		if (size <= data->MaxIndirectRange()) {
2219 			fVolume->Free(transaction, data->double_indirect);
2220 			data->double_indirect.SetTo(0, 0, 0);
2221 			data->max_double_indirect_range = 0;
2222 		}
2223 	}
2224 
2225 	if (data->MaxIndirectRange() > size) {
2226 		CachedBlock cached(fVolume);
2227 		off_t block = fVolume->ToBlock(data->indirect);
2228 		off_t offset = data->MaxDirectRange();
2229 
2230 		for (int32 i = 0; i < data->indirect.Length(); i++) {
2231 			status = cached.SetToWritable(transaction, block + i);
2232 			if (status != B_OK)
2233 				return status;
2234 
2235 			block_run* array = (block_run*)cached.WritableBlock();
2236 
2237 			off_t* maxIndirect = &data->max_indirect_range;
2238 				// gcc 4 work-around: "error: cannot bind packed field
2239 				// 'data->data_stream::max_indirect_range' to 'off_t&'"
2240 			if (_FreeStreamArray(transaction, array, fVolume->BlockSize()
2241 					/ sizeof(block_run), size, offset, *maxIndirect) != B_OK)
2242 				return B_IO_ERROR;
2243 		}
2244 		if (data->max_direct_range == data->max_indirect_range) {
2245 			fVolume->Free(transaction, data->indirect);
2246 			data->indirect.SetTo(0, 0, 0);
2247 			data->max_indirect_range = 0;
2248 		}
2249 	}
2250 
2251 	if (data->MaxDirectRange() > size) {
2252 		off_t offset = 0;
2253 		off_t *maxDirect = &data->max_direct_range;
2254 			// gcc 4 work-around: "error: cannot bind packed field
2255 			// 'data->data_stream::max_direct_range' to 'off_t&'"
2256 		status = _FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS,
2257 			size, offset, *maxDirect);
2258 		if (status != B_OK)
2259 			return status;
2260 	}
2261 
2262 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2263 	return B_OK;
2264 }
2265 
2266 
2267 status_t
2268 Inode::SetFileSize(Transaction& transaction, off_t size)
2269 {
2270 	if (size < 0)
2271 		return B_BAD_VALUE;
2272 
2273 	off_t oldSize = Size();
2274 
2275 	if (size == oldSize)
2276 		return B_OK;
2277 
2278 	T(Resize(this, oldSize, size, false));
2279 
2280 	// should the data stream grow or shrink?
2281 	status_t status;
2282 	if (size > oldSize) {
2283 		status = _GrowStream(transaction, size);
2284 		if (status < B_OK) {
2285 			// if the growing of the stream fails, the whole operation
2286 			// fails, so we should shrink the stream to its former size
2287 			_ShrinkStream(transaction, oldSize);
2288 		}
2289 	} else
2290 		status = _ShrinkStream(transaction, size);
2291 
2292 	if (status < B_OK)
2293 		return status;
2294 
2295 	file_cache_set_size(FileCache(), size);
2296 	file_map_set_size(Map(), size);
2297 
2298 	return WriteBack(transaction);
2299 }
2300 
2301 
2302 status_t
2303 Inode::Append(Transaction& transaction, off_t bytes)
2304 {
2305 	return SetFileSize(transaction, Size() + bytes);
2306 }
2307 
2308 
2309 /*!	Checks whether or not this inode's data stream needs to be trimmed
2310 	because of an earlier preallocation.
2311 	Returns true if there are any blocks to be trimmed.
2312 */
2313 bool
2314 Inode::NeedsTrimming() const
2315 {
2316 	// We never trim preallocated index blocks to make them grow as smooth as
2317 	// possible. There are only few indices anyway, so this doesn't hurt.
2318 	// Also, if an inode is already in deleted state, we don't bother trimming
2319 	// it.
2320 	if (IsIndex() || IsDeleted()
2321 		|| (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0))
2322 		return false;
2323 
2324 	off_t roundedSize = round_up(Size(), fVolume->BlockSize());
2325 
2326 	return Node().data.MaxDirectRange() > roundedSize
2327 		|| Node().data.MaxIndirectRange() > roundedSize
2328 		|| Node().data.MaxDoubleIndirectRange() > roundedSize;
2329 }
2330 
2331 
2332 status_t
2333 Inode::TrimPreallocation(Transaction& transaction)
2334 {
2335 	T(Resize(this, max_c(Node().data.MaxDirectRange(),
2336 		Node().data.MaxIndirectRange()), Size(), true));
2337 
2338 	status_t status = _ShrinkStream(transaction, Size());
2339 	if (status < B_OK)
2340 		return status;
2341 
2342 	return WriteBack(transaction);
2343 }
2344 
2345 
2346 //!	Frees the file's data stream and removes all attributes
2347 status_t
2348 Inode::Free(Transaction& transaction)
2349 {
2350 	FUNCTION();
2351 
2352 	// Perhaps there should be an implementation of Inode::ShrinkStream() that
2353 	// just frees the data_stream, but doesn't change the inode (since it is
2354 	// freed anyway) - that would make an undelete command possible
2355 	if (!IsSymLink() || (Flags() & INODE_LONG_SYMLINK) != 0) {
2356 		status_t status = SetFileSize(transaction, 0);
2357 		if (status < B_OK)
2358 			return status;
2359 	}
2360 
2361 	// Free all attributes, and remove their indices
2362 	{
2363 		// We have to limit the scope of AttributeIterator, so that its
2364 		// destructor is not called after the inode is deleted
2365 		AttributeIterator iterator(this);
2366 
2367 		char name[B_FILE_NAME_LENGTH];
2368 		uint32 type;
2369 		size_t length;
2370 		ino_t id;
2371 		while (iterator.GetNext(name, &length, &type, &id) == B_OK) {
2372 			RemoveAttribute(transaction, name);
2373 		}
2374 	}
2375 
2376 	if (WriteBack(transaction) < B_OK)
2377 		return B_IO_ERROR;
2378 
2379 	return fVolume->Free(transaction, BlockRun());
2380 }
2381 
2382 
2383 //!	Synchronizes (writes back to disk) the file stream of the inode.
2384 status_t
2385 Inode::Sync()
2386 {
2387 	if (FileCache())
2388 		return file_cache_sync(FileCache());
2389 
2390 	// We may also want to flush the attribute's data stream to
2391 	// disk here... (do we?)
2392 
2393 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0)
2394 		return B_OK;
2395 
2396 	InodeReadLocker locker(this);
2397 
2398 	data_stream* data = &Node().data;
2399 	status_t status = B_OK;
2400 
2401 	// flush direct range
2402 
2403 	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
2404 		if (data->direct[i].IsZero())
2405 			return B_OK;
2406 
2407 		status = block_cache_sync_etc(fVolume->BlockCache(),
2408 			fVolume->ToBlock(data->direct[i]), data->direct[i].Length());
2409 		if (status != B_OK)
2410 			return status;
2411 	}
2412 
2413 	// flush indirect range
2414 
2415 	if (data->max_indirect_range == 0)
2416 		return B_OK;
2417 
2418 	CachedBlock cached(fVolume);
2419 	off_t block = fVolume->ToBlock(data->indirect);
2420 	int32 count = fVolume->BlockSize() / sizeof(block_run);
2421 
2422 	for (int32 j = 0; j < data->indirect.Length(); j++) {
2423 		status = cached.SetTo(block + j);
2424 		if (status != B_OK)
2425 			return status;
2426 
2427 		block_run* runs = (block_run*)cached.Block();
2428 
2429 		for (int32 i = 0; i < count; i++) {
2430 			if (runs[i].IsZero())
2431 				return B_OK;
2432 
2433 			status = block_cache_sync_etc(fVolume->BlockCache(),
2434 				fVolume->ToBlock(runs[i]), runs[i].Length());
2435 			if (status != B_OK)
2436 				return status;
2437 		}
2438 	}
2439 
2440 	// flush double indirect range
2441 
2442 	if (data->max_double_indirect_range == 0)
2443 		return B_OK;
2444 
2445 	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
2446 
2447 	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
2448 		status = cached.SetTo(indirectBlock + l);
2449 		if (status != B_OK)
2450 			return status;
2451 
2452 		block_run* indirectRuns = (block_run*)cached.Block();
2453 		CachedBlock directCached(fVolume);
2454 
2455 		for (int32 k = 0; k < count; k++) {
2456 			if (indirectRuns[k].IsZero())
2457 				return B_OK;
2458 
2459 			block = fVolume->ToBlock(indirectRuns[k]);
2460 			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
2461 				status = directCached.SetTo(block + j);
2462 				if (status != B_OK)
2463 					return status;
2464 
2465 				block_run* runs = (block_run*)directCached.Block();
2466 
2467 				for (int32 i = 0; i < count; i++) {
2468 					if (runs[i].IsZero())
2469 						return B_OK;
2470 
2471 					// TODO: combine single block_runs to bigger ones when
2472 					// they are adjacent
2473 					status = block_cache_sync_etc(fVolume->BlockCache(),
2474 						fVolume->ToBlock(runs[i]), runs[i].Length());
2475 					if (status != B_OK)
2476 						return status;
2477 				}
2478 			}
2479 		}
2480 	}
2481 	return B_OK;
2482 }
2483 
2484 
2485 //	#pragma mark - TransactionListener implementation
2486 
2487 
2488 void
2489 Inode::TransactionDone(bool success)
2490 {
2491 	if (!success) {
2492 		// Revert any changes made to the cached bfs_inode
2493 		// TODO: return code gets eaten
2494 		UpdateNodeFromDisk();
2495 	}
2496 }
2497 
2498 
2499 void
2500 Inode::RemovedFromTransaction()
2501 {
2502 	Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
2503 
2504 	// See AddInode() why we do this here
2505 	if ((Flags() & INODE_DELETED) != 0)
2506 		fVolume->RemovedInodes().Add(this);
2507 
2508 	rw_lock_write_unlock(&Lock());
2509 
2510 	if (!fVolume->IsInitializing())
2511 		put_vnode(fVolume->FSVolume(), ID());
2512 }
2513 
2514 
2515 //	#pragma mark - creation/deletion
2516 
2517 
2518 status_t
2519 Inode::Remove(Transaction& transaction, const char* name, ino_t* _id,
2520 	bool isDirectory, bool force)
2521 {
2522 	if (fTree == NULL)
2523 		RETURN_ERROR(B_BAD_VALUE);
2524 
2525 	WriteLockInTransaction(transaction);
2526 
2527 	// does the file even exist?
2528 	off_t id;
2529 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) < B_OK)
2530 		return B_ENTRY_NOT_FOUND;
2531 
2532 	if (_id)
2533 		*_id = id;
2534 
2535 	Vnode vnode(fVolume, id);
2536 	Inode* inode;
2537 	status_t status = vnode.Get(&inode);
2538 	if (status < B_OK) {
2539 		REPORT_ERROR(status);
2540 		return fTree->Remove(transaction, name, id);
2541 	}
2542 
2543 	T(Remove(inode, name));
2544 	inode->WriteLockInTransaction(transaction);
2545 
2546 	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
2547 	// bit is set for indices in BFS, not for attribute directories) - but you
2548 	// should really be able to do whatever you want with your indices
2549 	// without having to remove all files first :)
2550 	if (!inode->IsIndex() && !force) {
2551 		// if it's not of the correct type, don't delete it!
2552 		if (inode->IsContainer() != isDirectory)
2553 			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
2554 
2555 		// only delete empty directories
2556 		if (isDirectory && !inode->IsEmpty())
2557 			return B_DIRECTORY_NOT_EMPTY;
2558 	}
2559 
2560 	// remove_vnode() allows the inode to be accessed until the last put_vnode()
2561 	status = remove_vnode(fVolume->FSVolume(), id);
2562 	if (status != B_OK)
2563 		return status;
2564 
2565 	if (fTree->Remove(transaction, name, id) != B_OK && !force) {
2566 		unremove_vnode(fVolume->FSVolume(), id);
2567 		RETURN_ERROR(B_ERROR);
2568 	}
2569 
2570 #ifdef DEBUG
2571 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) == B_OK) {
2572 		DIE(("deleted entry still there"));
2573 	}
2574 #endif
2575 
2576 	ContainerContentsChanged(transaction);
2577 
2578 	// update the inode, so that no one will ever doubt it's deleted :-)
2579 	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
2580 	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
2581 
2582 	// In balance to the Inode::Create() method, the main indices
2583 	// are updated here (name, size, & last_modified)
2584 
2585 	Index index(fVolume);
2586 	if (inode->InNameIndex()) {
2587 		index.RemoveName(transaction, name, inode);
2588 			// If removing from the index fails, it is not regarded as a
2589 			// fatal error and will not be reported back!
2590 			// Deleted inodes won't be visible in queries anyway.
2591 	}
2592 
2593 	if (inode->InSizeIndex())
2594 		index.RemoveSize(transaction, inode);
2595 	if (inode->InLastModifiedIndex())
2596 		index.RemoveLastModified(transaction, inode);
2597 
2598 	return inode->WriteBack(transaction);
2599 }
2600 
2601 
2602 /*!	Creates the inode with the specified \a parent directory, and automatically
2603 	adds the created inode to that parent directory. If an attribute directory
2604 	is created, it will also automatically  be added to the \a parent inode as
2605 	such. However, the indices root node, and the regular root node won't be
2606 	added to the superblock.
2607 	It will also create the initial B+tree for the inode if it's a directory
2608 	of any kind.
2609 	\a name may be \c NULL, but only if no \a parent is given.
2610 	If the "_id" or "_inode" variable is given and non-NULL to store the
2611 	inode's ID, the inode stays locked - you have to call put_vnode() if you
2612 	don't use it anymore.
2613 	If the node already exists, this method will fail if \c O_EXCL is set, or
2614 	it's a directory or a symlink. Otherwise, it will just be returned.
2615 	If \c O_TRUNC has been specified, the file will also be truncated.
2616 */
2617 status_t
2618 Inode::Create(Transaction& transaction, Inode* parent, const char* name,
2619 	int32 mode, int openMode, uint32 type, bool* _created, ino_t* _id,
2620 	Inode** _inode, fs_vnode_ops* vnodeOps, uint32 publishFlags)
2621 {
2622 	FUNCTION_START(("name = %s, mode = %ld\n", name, mode));
2623 
2624 	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
2625 	Volume* volume = transaction.GetVolume();
2626 	BPlusTree* tree = NULL;
2627 
2628 	if (parent != NULL && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
2629 		// check if the file already exists in the directory
2630 		tree = parent->Tree();
2631 	}
2632 
2633 	if (parent != NULL) {
2634 		// the parent directory is locked during the whole inode creation
2635 		parent->WriteLockInTransaction(transaction);
2636 	}
2637 
2638 	if (parent != NULL && !volume->IsInitializing() && parent->IsContainer()) {
2639 		// don't create anything in removed directories
2640 		bool removed;
2641 		if (get_vnode_removed(volume->FSVolume(), parent->ID(), &removed)
2642 				== B_OK && removed) {
2643 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
2644 		}
2645 	}
2646 
2647 	if (tree != NULL) {
2648 		// Does the file already exist?
2649 		off_t offset;
2650 		if (tree->Find((uint8*)name, (uint16)strlen(name), &offset) == B_OK) {
2651 			// Return if the file should be a directory/link or opened in
2652 			// exclusive mode
2653 			if (S_ISDIR(mode) || S_ISLNK(mode) || (openMode & O_EXCL) != 0)
2654 				return B_FILE_EXISTS;
2655 
2656 			Vnode vnode(volume, offset);
2657 			Inode* inode;
2658 			status_t status = vnode.Get(&inode);
2659 			if (status != B_OK) {
2660 				REPORT_ERROR(status);
2661 				return B_ENTRY_NOT_FOUND;
2662 			}
2663 
2664 			if (inode->IsDirectory() && (openMode & O_RWMASK) != O_RDONLY)
2665 				return B_IS_A_DIRECTORY;
2666 			if ((openMode & O_DIRECTORY) != 0 && !inode->IsDirectory())
2667 				return B_NOT_A_DIRECTORY;
2668 
2669 			// we want to open the file, so we should have the rights to do so
2670 			if (inode->CheckPermissions(open_mode_to_access(openMode)
2671 					| ((openMode & O_TRUNC) != 0 ? W_OK : 0)) != B_OK)
2672 				return B_NOT_ALLOWED;
2673 
2674 			if ((openMode & O_TRUNC) != 0) {
2675 				// truncate the existing file
2676 				inode->WriteLockInTransaction(transaction);
2677 
2678 				status_t status = inode->SetFileSize(transaction, 0);
2679 				if (status == B_OK)
2680 					status = inode->WriteBack(transaction);
2681 
2682 				if (status != B_OK)
2683 					return status;
2684 			}
2685 
2686 			if (_created)
2687 				*_created = false;
2688 			if (_id)
2689 				*_id = inode->ID();
2690 			if (_inode)
2691 				*_inode = inode;
2692 
2693 			// Only keep the vnode in memory if the _id or _inode pointer is
2694 			// provided
2695 			if (_id != NULL || _inode != NULL)
2696 				vnode.Keep();
2697 
2698 			return B_OK;
2699 		}
2700 	} else if (parent != NULL && (mode & S_ATTR_DIR) == 0) {
2701 		return B_BAD_VALUE;
2702 	} else if ((openMode & O_DIRECTORY) != 0) {
2703 		// TODO: we might need to return B_NOT_A_DIRECTORY here
2704 		return B_ENTRY_NOT_FOUND;
2705 	}
2706 
2707 	status_t status;
2708 
2709 	// do we have the power to create new files at all?
2710 	if (parent != NULL && (status = parent->CheckPermissions(W_OK)) != B_OK)
2711 		RETURN_ERROR(status);
2712 
2713 	// allocate space for the new inode
2714 	InodeAllocator allocator(transaction);
2715 	block_run run;
2716 	Inode* inode;
2717 	status = allocator.New(&parentRun, mode, publishFlags, run, vnodeOps,
2718 		&inode);
2719 	if (status < B_OK)
2720 		return status;
2721 
2722 	T(Create(inode, parent, name, mode, openMode, type));
2723 
2724 	// Initialize the parts of the bfs_inode structure that
2725 	// InodeAllocator::New() hasn't touched yet
2726 
2727 	bfs_inode* node = &inode->Node();
2728 
2729 	if (parent == NULL) {
2730 		// we set the parent to itself in this case
2731 		// (only happens for the root and indices node)
2732 		node->parent = run;
2733 	} else
2734 		node->parent = parentRun;
2735 
2736 	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
2737 	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent
2738 		? parent->Node().GroupID() : getegid());
2739 		// the group ID is inherited from the parent, if available
2740 
2741 	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
2742 
2743 	inode->WriteBack(transaction);
2744 		// make sure the initialized node is available to others
2745 
2746 	// only add the name to regular files, directories, or symlinks
2747 	// don't add it to attributes, or indices
2748 	if (tree && inode->IsRegularNode()
2749 		&& inode->SetName(transaction, name) != B_OK)
2750 		return B_ERROR;
2751 
2752 	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2753 	// a standard directory for files - not for attributes or indices)
2754 	if (inode->IsContainer()) {
2755 		status = allocator.CreateTree();
2756 		if (status != B_OK)
2757 			return status;
2758 	}
2759 
2760 	// Add a link to the inode from the parent, depending on its type
2761 	// (the vnode is not published yet, so it is safe to make the inode
2762 	// accessable to the file system here)
2763 	if (tree != NULL) {
2764 		status = tree->Insert(transaction, name, inode->ID());
2765 	} else if (parent != NULL && (mode & S_ATTR_DIR) != 0) {
2766 		parent->Attributes() = run;
2767 		status = parent->WriteBack(transaction);
2768 	}
2769 
2770 	// Note, we only care if the inode could be made accessable for the
2771 	// two cases above; the root node or the indices root node must
2772 	// handle this case on their own (or other cases where "parent" is
2773 	// NULL)
2774 	if (status != B_OK)
2775 		RETURN_ERROR(status);
2776 
2777 	// Update the main indices (name, size & last_modified)
2778 	// (live queries might want to access us after this)
2779 
2780 	Index index(volume);
2781 	if (inode->InNameIndex() && name != NULL) {
2782 		// the name index only contains regular files
2783 		// (but not the root node where name == NULL)
2784 		status = index.InsertName(transaction, name, inode);
2785 		if (status != B_OK && status != B_BAD_INDEX) {
2786 			// We have to remove the node from the parent at this point,
2787 			// because the InodeAllocator destructor can't handle this
2788 			// case (and if it fails, we can't do anything about it...)
2789 			if (tree)
2790 				tree->Remove(transaction, name, inode->ID());
2791 			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2792 				parent->Node().attributes.SetTo(0, 0, 0);
2793 
2794 			RETURN_ERROR(status);
2795 		}
2796 	}
2797 
2798 	if (parent != NULL && parent->IsContainer())
2799 		parent->ContainerContentsChanged(transaction);
2800 
2801 	inode->UpdateOldLastModified();
2802 
2803 	// The "size" & "last_modified" indices don't contain directories.
2804 	// If adding to these indices fails, the inode creation will not be
2805 	// harmed; they are considered less important than the "name" index.
2806 	if (inode->InSizeIndex())
2807 		index.InsertSize(transaction, inode);
2808 	if (inode->InLastModifiedIndex())
2809 		index.InsertLastModified(transaction, inode);
2810 
2811 	if (inode->NeedsFileCache()) {
2812 		inode->SetFileCache(file_cache_create(volume->ID(), inode->ID(),
2813 			inode->Size()));
2814 		inode->SetMap(file_map_create(volume->ID(), inode->ID(),
2815 			inode->Size()));
2816 
2817 		if (inode->FileCache() == NULL || inode->Map() == NULL)
2818 			return B_NO_MEMORY;
2819 	}
2820 
2821 	// Everything worked well until this point, we have a fully
2822 	// initialized inode, and we want to keep it
2823 	allocator.Keep(vnodeOps, publishFlags);
2824 
2825 	if (_created)
2826 		*_created = true;
2827 	if (_id != NULL)
2828 		*_id = inode->ID();
2829 	if (_inode != NULL)
2830 		*_inode = inode;
2831 
2832 	// if either _id or _inode is passed, we will keep the inode locked
2833 	if (_id == NULL && _inode == NULL)
2834 		put_vnode(volume->FSVolume(), inode->ID());
2835 
2836 	return B_OK;
2837 }
2838 
2839 
2840 //	#pragma mark - AttributeIterator
2841 
2842 
2843 AttributeIterator::AttributeIterator(Inode* inode)
2844 	:
2845 	fCurrentSmallData(0),
2846 	fInode(inode),
2847 	fAttributes(NULL),
2848 	fIterator(NULL),
2849 	fBuffer(NULL)
2850 {
2851 	inode->_AddIterator(this);
2852 }
2853 
2854 
2855 AttributeIterator::~AttributeIterator()
2856 {
2857 	if (fAttributes)
2858 		put_vnode(fAttributes->GetVolume()->FSVolume(), fAttributes->ID());
2859 
2860 	delete fIterator;
2861 	fInode->_RemoveIterator(this);
2862 }
2863 
2864 
2865 status_t
2866 AttributeIterator::Rewind()
2867 {
2868 	fCurrentSmallData = 0;
2869 
2870 	if (fIterator != NULL)
2871 		fIterator->Rewind();
2872 
2873 	return B_OK;
2874 }
2875 
2876 
2877 status_t
2878 AttributeIterator::GetNext(char* name, size_t* _length, uint32* _type,
2879 	ino_t* _id)
2880 {
2881 	// read attributes out of the small data section
2882 
2883 	if (fCurrentSmallData >= 0) {
2884 		NodeGetter nodeGetter(fInode->GetVolume());
2885 		status_t status = nodeGetter.SetTo(fInode);
2886 		if (status != B_OK)
2887 			return status;
2888 
2889 		const bfs_inode* node = nodeGetter.Node();
2890 		const small_data* item = ((bfs_inode*)node)->SmallDataStart();
2891 
2892 		RecursiveLocker _(&fInode->SmallDataLock());
2893 
2894 		int32 index = 0;
2895 		for (; !item->IsLast(node); item = item->Next(), index++) {
2896 			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2897 				&& *item->Name() == FILE_NAME_NAME)
2898 				continue;
2899 
2900 			if (index >= fCurrentSmallData)
2901 				break;
2902 		}
2903 
2904 		if (!item->IsLast(node)) {
2905 			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2906 			*_type = item->Type();
2907 			*_length = item->NameSize();
2908 			*_id = (ino_t)index;
2909 
2910 			fCurrentSmallData = index + 1;
2911 			return B_OK;
2912 		}
2913 
2914 		// stop traversing the small_data section
2915 		fCurrentSmallData = -1;
2916 	}
2917 
2918 	// read attributes out of the attribute directory
2919 
2920 	if (fInode->Attributes().IsZero())
2921 		return B_ENTRY_NOT_FOUND;
2922 
2923 	Volume* volume = fInode->GetVolume();
2924 
2925 	// if you haven't yet access to the attributes directory, get it
2926 	if (fAttributes == NULL) {
2927 		if (get_vnode(volume->FSVolume(), volume->ToVnode(fInode->Attributes()),
2928 				(void**)&fAttributes) != B_OK) {
2929 			FATAL(("get_vnode() failed in AttributeIterator::GetNext(ino_t"
2930 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2931 			return B_ENTRY_NOT_FOUND;
2932 		}
2933 
2934 		BPlusTree* tree = fAttributes->Tree();
2935 		if (tree == NULL
2936 			|| (fIterator = new(std::nothrow) TreeIterator(tree)) == NULL) {
2937 			FATAL(("could not get tree in AttributeIterator::GetNext(ino_t"
2938 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2939 			return B_ENTRY_NOT_FOUND;
2940 		}
2941 	}
2942 
2943 	uint16 length;
2944 	ino_t id;
2945 	status_t status = fIterator->GetNextEntry(name, &length,
2946 		B_FILE_NAME_LENGTH, &id);
2947 	if (status != B_OK)
2948 		return status;
2949 
2950 	Vnode vnode(volume, id);
2951 	Inode* attribute;
2952 	if ((status = vnode.Get(&attribute)) == B_OK) {
2953 		*_type = attribute->Type();
2954 		*_length = length;
2955 		*_id = id;
2956 	}
2957 
2958 	return status;
2959 }
2960 
2961 
2962 void
2963 AttributeIterator::Update(uint16 index, int8 change)
2964 {
2965 	// fCurrentSmallData points already to the next item
2966 	if (index < fCurrentSmallData)
2967 		fCurrentSmallData += change;
2968 }
2969 
2970