xref: /haiku/src/add-ons/kernel/file_systems/bfs/Inode.cpp (revision 3634f142352af2428aed187781fc9d75075e9140)
1 /*
2  * Copyright 2001-2020, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Inode access functions
8 
9 
10 #include "Debug.h"
11 #include "Inode.h"
12 #include "BPlusTree.h"
13 #include "Index.h"
14 
15 
16 #if BFS_TRACING && !defined(FS_SHELL) && !defined(_BOOT_MODE)
17 namespace BFSInodeTracing {
18 
19 class Create : public AbstractTraceEntry {
20 public:
21 	Create(Inode* inode, Inode* parent, const char* name, int32 mode,
22 			int openMode, uint32 type)
23 		:
24 		fInode(inode),
25 		fID(inode->ID()),
26 		fParent(parent),
27 		fParentID(parent != NULL ? parent->ID() : 0),
28 		fMode(mode),
29 		fOpenMode(openMode),
30 		fType(type)
31 	{
32 		if (name != NULL)
33 			strlcpy(fName, name, sizeof(fName));
34 		else
35 			fName[0] = '\0';
36 
37 		Initialized();
38 	}
39 
40 	virtual void AddDump(TraceOutput& out)
41 	{
42 		out.Print("bfs:Create %lld (%p), parent %lld (%p), \"%s\", "
43 			"mode %lx, omode %x, type %lx", fID, fInode, fParentID,
44 			fParent, fName, fMode, fOpenMode, fType);
45 	}
46 
47 private:
48 	Inode*	fInode;
49 	ino_t	fID;
50 	Inode*	fParent;
51 	ino_t	fParentID;
52 	char	fName[32];
53 	int32	fMode;
54 	int		fOpenMode;
55 	uint32	fType;
56 };
57 
58 class Remove : public AbstractTraceEntry {
59 public:
60 	Remove(Inode* inode, const char* name)
61 		:
62 		fInode(inode),
63 		fID(inode->ID())
64 	{
65 		strlcpy(fName, name, sizeof(fName));
66 		Initialized();
67 	}
68 
69 	virtual void AddDump(TraceOutput& out)
70 	{
71 		out.Print("bfs:Remove %lld (%p), \"%s\"", fID, fInode, fName);
72 	}
73 
74 private:
75 	Inode*	fInode;
76 	ino_t	fID;
77 	char	fName[32];
78 };
79 
80 class Action : public AbstractTraceEntry {
81 public:
82 	Action(const char* action, Inode* inode)
83 		:
84 		fInode(inode),
85 		fID(inode->ID())
86 	{
87 		strlcpy(fAction, action, sizeof(fAction));
88 		Initialized();
89 	}
90 
91 	virtual void AddDump(TraceOutput& out)
92 	{
93 		out.Print("bfs:%s %lld (%p)\n", fAction, fID, fInode);
94 	}
95 
96 private:
97 	Inode*	fInode;
98 	ino_t	fID;
99 	char	fAction[16];
100 };
101 
102 class Resize : public AbstractTraceEntry {
103 public:
104 	Resize(Inode* inode, off_t oldSize, off_t newSize, bool trim)
105 		:
106 		fInode(inode),
107 		fID(inode->ID()),
108 		fOldSize(oldSize),
109 		fNewSize(newSize),
110 		fTrim(trim)
111 	{
112 		Initialized();
113 	}
114 
115 	virtual void AddDump(TraceOutput& out)
116 	{
117 		out.Print("bfs:%s %lld (%p), %lld -> %lld", fTrim ? "Trim" : "Resize",
118 			fID, fInode, fOldSize, fNewSize);
119 	}
120 
121 private:
122 	Inode*	fInode;
123 	ino_t	fID;
124 	off_t	fOldSize;
125 	off_t	fNewSize;
126 	bool	fTrim;
127 };
128 
129 }	// namespace BFSInodeTracing
130 
131 #	define T(x) new(std::nothrow) BFSInodeTracing::x;
132 #else
133 #	define T(x) ;
134 #endif
135 
136 
137 /*!	A helper class used by Inode::Create() to keep track of the belongings
138 	of an inode creation in progress.
139 	This class will make sure everything is cleaned up properly.
140 */
141 class InodeAllocator {
142 public:
143 							InodeAllocator(Transaction& transaction);
144 							~InodeAllocator();
145 
146 			status_t		New(block_run* parentRun, mode_t mode, uint32 flags,
147 								block_run& run, fs_vnode_ops* vnodeOps,
148 								Inode** _inode);
149 			status_t		CreateTree();
150 			status_t		Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags);
151 
152 private:
153 	static	void			_TransactionListener(int32 id, int32 event,
154 								void* _inode);
155 
156 			Transaction*	fTransaction;
157 			block_run		fRun;
158 			Inode*			fInode;
159 };
160 
161 
162 InodeAllocator::InodeAllocator(Transaction& transaction)
163 	:
164 	fTransaction(&transaction),
165 	fInode(NULL)
166 {
167 }
168 
169 
170 InodeAllocator::~InodeAllocator()
171 {
172 	if (fTransaction != NULL) {
173 		Volume* volume = fTransaction->GetVolume();
174 
175 		if (fInode != NULL) {
176 			fInode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
177 				// this unblocks any pending bfs_read_vnode() calls
178 			fInode->Free(*fTransaction);
179 
180 			if (fInode->fTree != NULL)
181 				fTransaction->RemoveListener(fInode->fTree);
182 			fTransaction->RemoveListener(fInode);
183 
184 			remove_vnode(volume->FSVolume(), fInode->ID());
185 		} else
186 			volume->Free(*fTransaction, fRun);
187 	}
188 
189 	delete fInode;
190 }
191 
192 
193 status_t
194 InodeAllocator::New(block_run* parentRun, mode_t mode, uint32 publishFlags,
195 	block_run& run, fs_vnode_ops* vnodeOps, Inode** _inode)
196 {
197 	Volume* volume = fTransaction->GetVolume();
198 
199 	status_t status = volume->AllocateForInode(*fTransaction, parentRun, mode,
200 		fRun);
201 	if (status < B_OK) {
202 		// don't free the space in the destructor, because
203 		// the allocation failed
204 		fTransaction = NULL;
205 		RETURN_ERROR(status);
206 	}
207 
208 	run = fRun;
209 	fInode = new(std::nothrow) Inode(volume, *fTransaction,
210 		volume->ToVnode(run), mode, run);
211 	if (fInode == NULL)
212 		RETURN_ERROR(B_NO_MEMORY);
213 
214 	if (!volume->IsInitializing()
215 		&& (publishFlags & BFS_DO_NOT_PUBLISH_VNODE) == 0) {
216 		status = new_vnode(volume->FSVolume(), fInode->ID(), fInode,
217 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps);
218 		if (status < B_OK) {
219 			delete fInode;
220 			fInode = NULL;
221 			RETURN_ERROR(status);
222 		}
223 	}
224 
225 	fInode->WriteLockInTransaction(*fTransaction);
226 	*_inode = fInode;
227 	return B_OK;
228 }
229 
230 
231 status_t
232 InodeAllocator::CreateTree()
233 {
234 	Volume* volume = fTransaction->GetVolume();
235 
236 	// force S_STR_INDEX to be set, if no type is set
237 	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
238 		fInode->Node().mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
239 
240 	BPlusTree* tree = new(std::nothrow) BPlusTree(*fTransaction, fInode);
241 	if (tree == NULL)
242 		return B_ERROR;
243 
244 	status_t status = tree->InitCheck();
245 	if (status != B_OK) {
246 		delete tree;
247 		return status;
248 	}
249 
250 	fInode->fTree = tree;
251 
252 	if (fInode->IsRegularNode()) {
253 		if (tree->Insert(*fTransaction, ".", fInode->ID()) < B_OK
254 			|| tree->Insert(*fTransaction, "..",
255 					volume->ToVnode(fInode->Parent())) < B_OK)
256 			return B_ERROR;
257 	}
258 	return B_OK;
259 }
260 
261 
262 status_t
263 InodeAllocator::Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags)
264 {
265 	ASSERT(fInode != NULL && fTransaction != NULL);
266 	Volume* volume = fTransaction->GetVolume();
267 
268 	status_t status = fInode->WriteBack(*fTransaction);
269 	if (status < B_OK) {
270 		FATAL(("writing new inode %" B_PRIdINO " failed!\n", fInode->ID()));
271 		return status;
272 	}
273 
274 	// Symbolic links are not published -- the caller needs to do this once
275 	// the contents have been written.
276 	if (!fInode->IsSymLink() && !volume->IsInitializing()
277 		&& (publishFlags & BFS_DO_NOT_PUBLISH_VNODE) == 0) {
278 		status = publish_vnode(volume->FSVolume(), fInode->ID(), fInode,
279 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps, fInode->Mode(),
280 			publishFlags);
281 	}
282 
283 	if (status == B_OK) {
284 		cache_add_transaction_listener(volume->BlockCache(), fTransaction->ID(),
285 			TRANSACTION_ABORTED, &_TransactionListener, fInode);
286 	}
287 
288 	fTransaction = NULL;
289 	fInode = NULL;
290 
291 	return status;
292 }
293 
294 
295 /*static*/ void
296 InodeAllocator::_TransactionListener(int32 id, int32 event, void* _inode)
297 {
298 	Inode* inode = (Inode*)_inode;
299 
300 	if (event == TRANSACTION_ABORTED)
301 		panic("transaction %d aborted, inode %p still around!\n", (int)id, inode);
302 }
303 
304 
305 //	#pragma mark -
306 
307 
308 status_t
309 bfs_inode::InitCheck(Volume* volume) const
310 {
311 	if (Magic1() != INODE_MAGIC1
312 		|| !(Flags() & INODE_IN_USE)
313 		|| inode_num.Length() != 1
314 		// matches inode size?
315 		|| (uint32)InodeSize() != volume->InodeSize()
316 		// parent resides on disk?
317 		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
318 		|| parent.AllocationGroup() < 0
319 		|| parent.Start() > (1L << volume->AllocationGroupShift())
320 		|| parent.Length() != 1
321 		// attributes, too?
322 		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
323 		|| attributes.AllocationGroup() < 0
324 		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
325 		RETURN_ERROR(B_BAD_DATA);
326 
327 	if (Flags() & INODE_DELETED)
328 		return B_NOT_ALLOWED;
329 
330 	// TODO: Add some tests to check the integrity of the other stuff here,
331 	// especially for the data_stream!
332 
333 	return B_OK;
334 }
335 
336 
337 //	#pragma mark - Inode
338 
339 
340 Inode::Inode(Volume* volume, ino_t id)
341 	:
342 	fVolume(volume),
343 	fID(id),
344 	fTree(NULL),
345 	fAttributes(NULL),
346 	fCache(NULL),
347 	fMap(NULL)
348 {
349 	PRINT(("Inode::Inode(volume = %p, id = %" B_PRIdINO ") @ %p\n",
350 		volume, id, this));
351 
352 	rw_lock_init(&fLock, "bfs inode");
353 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
354 
355 	if (UpdateNodeFromDisk() != B_OK) {
356 		// TODO: the error code gets eaten
357 		return;
358 	}
359 
360 	// these two will help to maintain the indices
361 	fOldSize = Size();
362 	fOldLastModified = LastModified();
363 
364 	if (IsContainer())
365 		fTree = new(std::nothrow) BPlusTree(this);
366 	if (NeedsFileCache()) {
367 		SetFileCache(file_cache_create(fVolume->ID(), ID(), Size()));
368 		SetMap(file_map_create(volume->ID(), ID(), Size()));
369 	}
370 }
371 
372 
373 Inode::Inode(Volume* volume, Transaction& transaction, ino_t id, mode_t mode,
374 		block_run& run)
375 	:
376 	fVolume(volume),
377 	fID(id),
378 	fTree(NULL),
379 	fAttributes(NULL),
380 	fCache(NULL),
381 	fMap(NULL)
382 {
383 	PRINT(("Inode::Inode(volume = %p, transaction = %p, id = %" B_PRIdINO
384 		") @ %p\n", volume, &transaction, id, this));
385 
386 	rw_lock_init(&fLock, "bfs inode");
387 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
388 
389 	NodeGetter node(volume);
390 	status_t status = node.SetToWritable(transaction, this, true);
391 	if (status != B_OK) {
392 		FATAL(("Could not read inode block %" B_PRId64 ": %s!\n", BlockNumber(),
393 			strerror(status)));
394 		return;
395 	}
396 
397 	memset(&fNode, 0, sizeof(bfs_inode));
398 
399 	// Initialize the bfs_inode structure -- it's not written back to disk
400 	// here, because it will be done so already when the inode could be
401 	// created completely.
402 
403 	Node().magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
404 	Node().inode_num = run;
405 	Node().mode = HOST_ENDIAN_TO_BFS_INT32(mode);
406 	Node().flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
407 
408 	Node().create_time = Node().last_modified_time = Node().status_change_time
409 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
410 
411 	Node().inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
412 
413 	// these two will help to maintain the indices
414 	fOldSize = Size();
415 	fOldLastModified = LastModified();
416 }
417 
418 
419 Inode::~Inode()
420 {
421 	PRINT(("Inode::~Inode() @ %p\n", this));
422 
423 	file_cache_delete(FileCache());
424 	file_map_delete(Map());
425 	delete fTree;
426 
427 	rw_lock_destroy(&fLock);
428 	recursive_lock_destroy(&fSmallDataLock);
429 }
430 
431 
432 status_t
433 Inode::InitCheck(bool checkNode) const
434 {
435 	// test inode magic and flags
436 	if (checkNode) {
437 		status_t status = Node().InitCheck(fVolume);
438 		if (status == B_BUSY)
439 			return B_BUSY;
440 
441 		if (status != B_OK) {
442 			FATAL(("inode at block %" B_PRIdOFF " corrupt!\n", BlockNumber()));
443 			RETURN_ERROR(B_BAD_DATA);
444 		}
445 	}
446 
447 	if (IsContainer()) {
448 		// inodes that have a B+tree
449 		if (fTree == NULL)
450 			RETURN_ERROR(B_NO_MEMORY);
451 
452 		status_t status = fTree->InitCheck();
453 		if (status != B_OK) {
454 			FATAL(("inode tree at block %" B_PRIdOFF " corrupt!\n",
455 				BlockNumber()));
456 			RETURN_ERROR(B_BAD_DATA);
457 		}
458 	}
459 
460 	if (NeedsFileCache() && (fCache == NULL || fMap == NULL))
461 		return B_NO_MEMORY;
462 
463 	return B_OK;
464 }
465 
466 
467 /*!	Adds this inode to the specified transaction. This means that the inode will
468 	be write locked until the transaction ended.
469 	To ensure that the inode will stay valid until that point, an extra reference
470 	is acquired to it as long as this transaction stays active.
471 */
472 void
473 Inode::WriteLockInTransaction(Transaction& transaction)
474 {
475 	// These flags can only change while holding the transaction lock
476 	if ((Flags() & INODE_IN_TRANSACTION) != 0)
477 		return;
478 
479 	// We share the same list link with the removed list, so we have to remove
480 	// the inode from that list here (and add it back when we no longer need it)
481 	if ((Flags() & INODE_DELETED) != 0)
482 		fVolume->RemovedInodes().Remove(this);
483 
484 	if (!fVolume->IsInitializing())
485 		acquire_vnode(fVolume->FSVolume(), ID());
486 
487 	rw_lock_write_lock(&Lock());
488 	Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
489 
490 	transaction.AddListener(this);
491 }
492 
493 
494 status_t
495 Inode::WriteBack(Transaction& transaction)
496 {
497 	NodeGetter node(fVolume);
498 	status_t status = node.SetToWritable(transaction, this);
499 	if (status != B_OK)
500 		return status;
501 
502 	memcpy(node.WritableNode(), &Node(), sizeof(bfs_inode));
503 	return B_OK;
504 }
505 
506 
507 status_t
508 Inode::UpdateNodeFromDisk()
509 {
510 	NodeGetter node(fVolume);
511 	status_t status = node.SetTo(this);
512 	if (status != B_OK) {
513 		FATAL(("Failed to read block %" B_PRId64 " from disk: %s!\n",
514 			BlockNumber(), strerror(status)));
515 		return status;
516 	}
517 
518 	memcpy(&fNode, node.Node(), sizeof(bfs_inode));
519 	fNode.flags &= HOST_ENDIAN_TO_BFS_INT32(INODE_PERMANENT_FLAGS);
520 	return B_OK;
521 }
522 
523 
524 status_t
525 Inode::CheckPermissions(int accessMode) const
526 {
527 	// you never have write access to a read-only volume
528 	if ((accessMode & W_OK) != 0 && fVolume->IsReadOnly())
529 		return B_READ_ONLY_DEVICE;
530 
531 	return check_access_permissions(accessMode, Mode(), (gid_t)fNode.GroupID(),
532 		(uid_t)fNode.UserID());
533 }
534 
535 
536 //	#pragma mark - attributes
537 
538 
539 void
540 Inode::_AddIterator(AttributeIterator* iterator)
541 {
542 	RecursiveLocker _(fSmallDataLock);
543 	fIterators.Add(iterator);
544 }
545 
546 
547 void
548 Inode::_RemoveIterator(AttributeIterator* iterator)
549 {
550 	RecursiveLocker _(fSmallDataLock);
551 	fIterators.Remove(iterator);
552 }
553 
554 
555 /*!	Tries to free up "bytes" space in the small_data section by moving
556 	attributes to real files. Used for system attributes like the name.
557 	You need to hold the fSmallDataLock when you call this method
558 */
559 status_t
560 Inode::_MakeSpaceForSmallData(Transaction& transaction, bfs_inode* node,
561 	const char* name, int32 bytes)
562 {
563 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
564 
565 	while (bytes > 0) {
566 		small_data* item = node->SmallDataStart();
567 		small_data* max = NULL;
568 		int32 index = 0, maxIndex = 0;
569 		for (; !item->IsLast(node); item = item->Next(), index++) {
570 			// should not remove those
571 			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
572 				continue;
573 
574 			if (max == NULL || max->Size() < item->Size()) {
575 				maxIndex = index;
576 				max = item;
577 			}
578 
579 			// Remove the first one large enough to free the needed amount of
580 			// bytes
581 			if (bytes < (int32)item->Size())
582 				break;
583 		}
584 
585 		if (item->IsLast(node) || (int32)item->Size() < bytes || max == NULL)
586 			return B_ERROR;
587 
588 		bytes -= max->Size();
589 
590 		// Move the attribute to a real attribute file
591 		// Luckily, this doesn't cause any index updates
592 
593 		Inode* attribute;
594 		status_t status = CreateAttribute(transaction, item->Name(),
595 			item->Type(), &attribute);
596 		if (status != B_OK)
597 			RETURN_ERROR(status);
598 
599 		size_t length = item->DataSize();
600 		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
601 
602 		ReleaseAttribute(attribute);
603 
604 		if (status != B_OK) {
605 			Vnode vnode(fVolume, Attributes());
606 			Inode* attributes;
607 			if (vnode.Get(&attributes) < B_OK
608 				|| attributes->Remove(transaction, name) < B_OK) {
609 				FATAL(("Could not remove newly created attribute!\n"));
610 			}
611 
612 			RETURN_ERROR(status);
613 		}
614 
615 		_RemoveSmallData(node, max, maxIndex);
616 	}
617 	return B_OK;
618 }
619 
620 
621 /*!	Private function which removes the given attribute from the small_data
622 	section.
623 	You need to hold the fSmallDataLock when you call this method
624 */
625 status_t
626 Inode::_RemoveSmallData(bfs_inode* node, small_data* item, int32 index)
627 {
628 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
629 
630 	small_data* next = item->Next();
631 	if (!next->IsLast(node)) {
632 		// find the last attribute
633 		small_data* last = next;
634 		while (!last->IsLast(node))
635 			last = last->Next();
636 
637 		int32 size = (uint8*)last - (uint8*)next;
638 		if (size < 0
639 			|| size > (uint8*)node + fVolume->BlockSize() - (uint8*)next)
640 			return B_BAD_DATA;
641 
642 		memmove(item, next, size);
643 
644 		// Move the "last" one to its new location and
645 		// correctly terminate the small_data section
646 		last = (small_data*)((uint8*)last - ((uint8*)next - (uint8*)item));
647 		memset(last, 0, (uint8*)node + fVolume->BlockSize() - (uint8*)last);
648 	} else
649 		memset(item, 0, item->Size());
650 
651 	// update all current iterators
652 	SinglyLinkedList<AttributeIterator>::Iterator iterator
653 		= fIterators.GetIterator();
654 	while (iterator.HasNext()) {
655 		iterator.Next()->Update(index, -1);
656 	}
657 
658 	return B_OK;
659 }
660 
661 
662 //!	Removes the given attribute from the small_data section.
663 status_t
664 Inode::_RemoveSmallData(Transaction& transaction, NodeGetter& nodeGetter,
665 	const char* name)
666 {
667 	if (name == NULL)
668 		return B_BAD_VALUE;
669 
670 	bfs_inode* node = nodeGetter.WritableNode();
671 	RecursiveLocker locker(fSmallDataLock);
672 
673 	// search for the small_data item
674 
675 	small_data* item = node->SmallDataStart();
676 	int32 index = 0;
677 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
678 		item = item->Next();
679 		index++;
680 	}
681 
682 	if (item->IsLast(node))
683 		return B_ENTRY_NOT_FOUND;
684 
685 	status_t status = nodeGetter.MakeWritable(transaction);
686 	if (status != B_OK)
687 		return status;
688 
689 	status = _RemoveSmallData(node, item, index);
690 	if (status == B_OK) {
691 		Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
692 			bfs_inode::ToInode(real_time_clock_usecs()));
693 
694 		status = WriteBack(transaction);
695 	}
696 
697 	return status;
698 }
699 
700 
701 /*!	Try to place the given attribute in the small_data section - if the
702 	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
703 	In that case, the attribute should be written to a real attribute file;
704 	it's the caller's responsibility to remove any existing attributes in the
705 	small data section if that's the case.
706 
707 	Note that you need to write back the inode yourself after having called that
708 	method - it's a bad API decision that it needs a transaction but enforces
709 	you to write back the inode all by yourself, but it's just more efficient
710 	in most cases...
711 */
712 status_t
713 Inode::_AddSmallData(Transaction& transaction, NodeGetter& nodeGetter,
714 	const char* name, uint32 type, off_t pos, const uint8* data, size_t length,
715 	bool force)
716 {
717 	bfs_inode* node = nodeGetter.WritableNode();
718 
719 	if (node == NULL || name == NULL || data == NULL)
720 		return B_BAD_VALUE;
721 
722 	// reject any requests that can't fit into the small_data section
723 	uint32 nameLength = strlen(name);
724 	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + pos + length + 1;
725 	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
726 		return B_DEVICE_FULL;
727 
728 	status_t status = nodeGetter.MakeWritable(transaction);
729 	if (status != B_OK)
730 		return status;
731 
732 	RecursiveLocker locker(fSmallDataLock);
733 
734 	// Find the last item or one with the same name we have to add
735 	small_data* item = node->SmallDataStart();
736 	int32 index = 0;
737 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
738 		item = item->Next();
739 		index++;
740 	}
741 
742 	// is the attribute already in the small_data section?
743 	// then just replace the data part of that one
744 	if (!item->IsLast(node)) {
745 		// find last attribute
746 		small_data* last = item;
747 		while (!last->IsLast(node))
748 			last = last->Next();
749 
750 		// try to change the attributes value
751 		if (item->data_size > pos + length
752 			|| force
753 			|| ((uint8*)last + pos + length - item->DataSize())
754 					<= ((uint8*)node + fVolume->InodeSize())) {
755 			// Make room for the new attribute if needed (and we are forced
756 			// to do so)
757 			if (force && ((uint8*)last + pos + length - item->DataSize())
758 					> ((uint8*)node + fVolume->InodeSize())) {
759 				// We also take the free space at the end of the small_data
760 				// section into account, and request only what's really needed
761 				uint32 needed = pos + length - item->DataSize() -
762 					(uint32)((uint8*)node + fVolume->InodeSize()
763 						- (uint8*)last);
764 
765 				if (_MakeSpaceForSmallData(transaction, node, name, needed)
766 						!= B_OK)
767 					return B_ERROR;
768 
769 				// reset our pointers
770 				item = node->SmallDataStart();
771 				index = 0;
772 				while (!item->IsLast(node) && strcmp(item->Name(), name)) {
773 					item = item->Next();
774 					index++;
775 				}
776 
777 				last = item;
778 				while (!last->IsLast(node))
779 					last = last->Next();
780 			}
781 
782 			size_t oldDataSize = item->DataSize();
783 
784 			// Normally, we can just overwrite the attribute data as the size
785 			// is specified by the type and does not change that often
786 			if (pos + length != item->DataSize()) {
787 				// move the attributes after the current one
788 				small_data* next = item->Next();
789 				if (!next->IsLast(node)) {
790 					memmove((uint8*)item + spaceNeeded, next,
791 						(uint8*)last - (uint8*)next);
792 				}
793 
794 				// Move the "last" one to its new location and
795 				// correctly terminate the small_data section
796 				last = (small_data*)((uint8*)last
797 					- ((uint8*)next - ((uint8*)item + spaceNeeded)));
798 				if ((uint8*)last < (uint8*)node + fVolume->BlockSize()) {
799 					memset(last, 0, (uint8*)node + fVolume->BlockSize()
800 						- (uint8*)last);
801 				}
802 
803 				item->data_size = HOST_ENDIAN_TO_BFS_INT16(pos + length);
804 			}
805 
806 			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
807 
808 			if ((uint64)oldDataSize < (uint64)pos) {
809 				// Fill gap with zeros
810 				memset(item->Data() + oldDataSize, 0, pos - oldDataSize);
811 			}
812 			if (user_memcpy(item->Data() + pos, data, length) < B_OK)
813 				return B_BAD_ADDRESS;
814 			item->Data()[pos + length] = '\0';
815 
816 			return B_OK;
817 		}
818 
819 		return B_DEVICE_FULL;
820 	}
821 
822 	// try to add the new attribute!
823 
824 	if ((uint8*)item + spaceNeeded > (uint8*)node + fVolume->InodeSize()) {
825 		// there is not enough space for it!
826 		if (!force)
827 			return B_DEVICE_FULL;
828 
829 		// make room for the new attribute
830 		if (_MakeSpaceForSmallData(transaction, node, name, spaceNeeded) < B_OK)
831 			return B_ERROR;
832 
833 		// get new last item!
834 		item = node->SmallDataStart();
835 		index = 0;
836 		while (!item->IsLast(node)) {
837 			item = item->Next();
838 			index++;
839 		}
840 	}
841 
842 	memset(item, 0, spaceNeeded);
843 	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
844 	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
845 	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
846 	strcpy(item->Name(), name);
847 	if (user_memcpy(item->Data() + pos, data, length) < B_OK)
848 		return B_BAD_ADDRESS;
849 
850 	// correctly terminate the small_data section
851 	item = item->Next();
852 	if (!item->IsLast(node))
853 		memset(item, 0, (uint8*)node + fVolume->InodeSize() - (uint8*)item);
854 
855 	// update all current iterators
856 	SinglyLinkedList<AttributeIterator>::Iterator iterator
857 		= fIterators.GetIterator();
858 	while (iterator.HasNext()) {
859 		iterator.Next()->Update(index, 1);
860 	}
861 
862 	return B_OK;
863 }
864 
865 
866 /*!	Iterates through the small_data section of an inode.
867 	To start at the beginning of this section, you let smallData
868 	point to NULL, like:
869 		small_data* data = NULL;
870 		while (inode->GetNextSmallData(&data) { ... }
871 
872 	This function is reentrant and doesn't allocate any memory;
873 	you can safely stop calling it at any point (you don't need
874 	to iterate through the whole list).
875 	You need to hold the fSmallDataLock when you call this method
876 */
877 status_t
878 Inode::_GetNextSmallData(bfs_inode* node, small_data** _smallData) const
879 {
880 	if (node == NULL)
881 		RETURN_ERROR(B_BAD_VALUE);
882 
883 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
884 
885 	small_data* data = *_smallData;
886 
887 	// begin from the start?
888 	if (data == NULL)
889 		data = node->SmallDataStart();
890 	else
891 		data = data->Next();
892 
893 	// is already last item?
894 	if (data->IsLast(node))
895 		return B_ENTRY_NOT_FOUND;
896 
897 	*_smallData = data;
898 
899 	return B_OK;
900 }
901 
902 
903 /*!	Finds the attribute "name" in the small data section, and
904 	returns a pointer to it (or NULL if it doesn't exist).
905 	You need to hold the fSmallDataLock when you call this method
906 */
907 small_data*
908 Inode::FindSmallData(const bfs_inode* node, const char* name) const
909 {
910 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
911 
912 	small_data* smallData = NULL;
913 	while (_GetNextSmallData(const_cast<bfs_inode*>(node), &smallData)
914 			== B_OK) {
915 		if (!strcmp(smallData->Name(), name))
916 			return smallData;
917 	}
918 	return NULL;
919 }
920 
921 
922 /*!	Returns a pointer to the node's name if present in the small data
923 	section, NULL otherwise.
924 	You need to hold the fSmallDataLock when you call this method
925 */
926 const char*
927 Inode::Name(const bfs_inode* node) const
928 {
929 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
930 
931 	small_data* smallData = NULL;
932 	while (_GetNextSmallData((bfs_inode*)node, &smallData) == B_OK) {
933 		if (*smallData->Name() == FILE_NAME_NAME
934 			&& smallData->NameSize() == FILE_NAME_NAME_LENGTH)
935 			return (const char*)smallData->Data();
936 	}
937 	return NULL;
938 }
939 
940 
941 /*!	Copies the node's name into the provided buffer.
942 	The buffer should be B_FILE_NAME_LENGTH bytes large.
943 */
944 status_t
945 Inode::GetName(char* buffer, size_t size) const
946 {
947 	NodeGetter node(fVolume);
948 	status_t status = node.SetTo(this);
949 	if (status != B_OK)
950 		return status;
951 
952 	RecursiveLocker locker(fSmallDataLock);
953 
954 	const char* name = Name(node.Node());
955 	if (name == NULL)
956 		return B_ENTRY_NOT_FOUND;
957 
958 	strlcpy(buffer, name, size);
959 	return B_OK;
960 }
961 
962 
963 /*!	Changes or set the name of a file: in the inode small_data section only, it
964 	doesn't change it in the parent directory's b+tree.
965 	Note that you need to write back the inode yourself after having called
966 	that method. It suffers from the same API decision as AddSmallData() does
967 	(and for the same reason).
968 */
969 status_t
970 Inode::SetName(Transaction& transaction, const char* name)
971 {
972 	if (name == NULL || *name == '\0')
973 		return B_BAD_VALUE;
974 
975 	NodeGetter node(fVolume);
976 	status_t status = node.SetToWritable(transaction, this);
977 	if (status != B_OK)
978 		return status;
979 
980 	const char nameTag[2] = {FILE_NAME_NAME, 0};
981 
982 	return _AddSmallData(transaction, node, nameTag, FILE_NAME_TYPE, 0,
983 		(uint8*)name, strlen(name), true);
984 }
985 
986 
987 status_t
988 Inode::_RemoveAttribute(Transaction& transaction, const char* name,
989 	bool hasIndex, Index* index)
990 {
991 	// remove the attribute file if it exists
992 	Vnode vnode(fVolume, Attributes());
993 	Inode* attributes;
994 	status_t status = vnode.Get(&attributes);
995 	if (status < B_OK)
996 		return status;
997 
998 	// update index
999 	if (index != NULL) {
1000 		Inode* attribute;
1001 		if ((hasIndex || fVolume->CheckForLiveQuery(name))
1002 			&& GetAttribute(name, &attribute) == B_OK) {
1003 			uint8 data[MAX_INDEX_KEY_LENGTH];
1004 			size_t length = MAX_INDEX_KEY_LENGTH;
1005 			if (attribute->ReadAt(0, data, &length) == B_OK) {
1006 				index->Update(transaction, name, attribute->Type(), data,
1007 					length, NULL, 0, this);
1008 			}
1009 
1010 			ReleaseAttribute(attribute);
1011 		}
1012 	}
1013 
1014 	if ((status = attributes->Remove(transaction, name)) < B_OK)
1015 		return status;
1016 
1017 	if (attributes->IsEmpty()) {
1018 		attributes->WriteLockInTransaction(transaction);
1019 
1020 		// remove attribute directory (don't fail if that can't be done)
1021 		if (remove_vnode(fVolume->FSVolume(), attributes->ID()) == B_OK) {
1022 			// update the inode, so that no one will ever doubt it's deleted :-)
1023 			attributes->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1024 			if (attributes->WriteBack(transaction) == B_OK) {
1025 				Attributes().SetTo(0, 0, 0);
1026 				WriteBack(transaction);
1027 			} else {
1028 				unremove_vnode(fVolume->FSVolume(), attributes->ID());
1029 				attributes->Node().flags
1030 					&= ~HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1031 			}
1032 		}
1033 	}
1034 
1035 	return status;
1036 }
1037 
1038 
1039 /*!	Reads data from the specified attribute.
1040 	This is a high-level attribute function that understands attributes
1041 	in the small_data section as well as real attribute files.
1042 */
1043 status_t
1044 Inode::ReadAttribute(const char* name, int32 type, off_t pos, uint8* buffer,
1045 	size_t* _length)
1046 {
1047 	if (pos < 0)
1048 		pos = 0;
1049 
1050 	// search in the small_data section (which has to be locked first)
1051 	{
1052 		NodeGetter node(fVolume);
1053 		status_t status = node.SetTo(this);
1054 		if (status != B_OK)
1055 			return status;
1056 
1057 		RecursiveLocker locker(fSmallDataLock);
1058 
1059 		small_data* smallData = FindSmallData(node.Node(), name);
1060 		if (smallData != NULL) {
1061 			size_t length = *_length;
1062 			if (pos >= smallData->data_size) {
1063 				*_length = 0;
1064 				return B_OK;
1065 			}
1066 			if (length + pos > smallData->DataSize())
1067 				length = smallData->DataSize() - pos;
1068 
1069 			status_t error = user_memcpy(buffer, smallData->Data() + pos,
1070 				length);
1071 			*_length = length;
1072 			return error;
1073 		}
1074 	}
1075 
1076 	// search in the attribute directory
1077 	Inode* attribute;
1078 	status_t status = GetAttribute(name, &attribute);
1079 	if (status == B_OK) {
1080 		status = attribute->ReadAt(pos, (uint8*)buffer, _length);
1081 
1082 		ReleaseAttribute(attribute);
1083 	}
1084 
1085 	RETURN_ERROR(status);
1086 }
1087 
1088 
1089 /*!	Writes data to the specified attribute.
1090 	This is a high-level attribute function that understands attributes
1091 	in the small_data section as well as real attribute files.
1092 */
1093 status_t
1094 Inode::WriteAttribute(Transaction& transaction, const char* name, int32 type,
1095 	off_t pos, const uint8* buffer, size_t* _length, bool* _created)
1096 {
1097 	if (pos < 0)
1098 		return B_BAD_VALUE;
1099 
1100 	// needed to maintain the index
1101 	uint8 oldBuffer[MAX_INDEX_KEY_LENGTH];
1102 	uint8* oldData = NULL;
1103 	size_t oldLength = 0;
1104 	bool created = false;
1105 
1106 	// TODO: we actually depend on that the contents of "buffer" are constant.
1107 	// If they get changed during the write (hey, user programs), we may mess
1108 	// up our index trees!
1109 	// TODO: for attribute files, we need to log the first
1110 	// MAX_INDEX_KEY_LENGTH bytes of the data stream, or the same as above
1111 	// might happen.
1112 
1113 	Index index(fVolume);
1114 	bool hasIndex = index.SetTo(name) == B_OK;
1115 
1116 	Inode* attribute = NULL;
1117 	status_t status = B_OK;
1118 
1119 	if (GetAttribute(name, &attribute) != B_OK) {
1120 		// No attribute inode exists yet
1121 
1122 		// save the old attribute data
1123 		NodeGetter node(fVolume);
1124 		status = node.SetToWritable(transaction, this);
1125 		if (status != B_OK)
1126 			return status;
1127 
1128 		recursive_lock_lock(&fSmallDataLock);
1129 
1130 		small_data* smallData = FindSmallData(node.Node(), name);
1131 		if (smallData != NULL) {
1132 			oldLength = smallData->DataSize();
1133 			if (oldLength > 0) {
1134 				if (oldLength > MAX_INDEX_KEY_LENGTH)
1135 					oldLength = MAX_INDEX_KEY_LENGTH;
1136 				memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
1137 			}
1138 		} else
1139 			created = true;
1140 
1141 		recursive_lock_unlock(&fSmallDataLock);
1142 
1143 		// if the attribute doesn't exist yet (as a file), try to put it in the
1144 		// small_data section first - if that fails (due to insufficent space),
1145 		// create a real attribute file
1146 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1147 			*_length);
1148 		if (status == B_DEVICE_FULL) {
1149 			if (smallData != NULL) {
1150 				// remove the old attribute from the small data section - there
1151 				// is no space left for the new data
1152 				status = _RemoveSmallData(transaction, node, name);
1153 			} else
1154 				status = B_OK;
1155 
1156 			if (status == B_OK)
1157 				status = CreateAttribute(transaction, name, type, &attribute);
1158 			if (status != B_OK)
1159 				RETURN_ERROR(status);
1160 
1161 			created = true;
1162 		} else if (status == B_OK) {
1163 			// Update status time on attribute write
1164 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1165 				bfs_inode::ToInode(real_time_clock_usecs()));
1166 
1167 			status = WriteBack(transaction);
1168 		}
1169 	}
1170 
1171 	if (attribute != NULL) {
1172 		WriteLocker writeLocker(attribute->fLock);
1173 
1174 		if (hasIndex || fVolume->CheckForLiveQuery(name)) {
1175 			// Save the old attribute data (if this fails, oldLength will
1176 			// reflect it)
1177 			while (attribute->Size() > 0) {
1178 				bigtime_t oldModified = attribute->LastModified();
1179 				writeLocker.Unlock();
1180 
1181 				oldLength = MAX_INDEX_KEY_LENGTH;
1182 				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
1183 					oldData = oldBuffer;
1184 
1185 				writeLocker.Lock();
1186 
1187 				// Read until the data hasn't changed in between
1188 				if (oldModified == attribute->LastModified())
1189 					break;
1190 
1191 				oldLength = 0;
1192 			}
1193 		}
1194 
1195 		// check if the data fits into the small_data section again
1196 		NodeGetter node(fVolume);
1197 		status = node.SetToWritable(transaction, this);
1198 		if (status != B_OK)
1199 			return status;
1200 
1201 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1202 			*_length);
1203 
1204 		if (status == B_OK) {
1205 			// it does - remove its file
1206 			writeLocker.Unlock();
1207 			status = _RemoveAttribute(transaction, name, false, NULL);
1208 		} else {
1209 			// The attribute type might have been changed - we need to
1210 			// adopt the new one
1211 			attribute->Node().type = HOST_ENDIAN_TO_BFS_INT32(type);
1212 			status = attribute->WriteBack(transaction);
1213 			writeLocker.Unlock();
1214 
1215 			if (status == B_OK) {
1216 				status = attribute->WriteAt(transaction, pos, buffer,
1217 					_length);
1218 			}
1219 		}
1220 
1221 		if (status == B_OK) {
1222 			// Update status time on attribute write
1223 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1224 				bfs_inode::ToInode(real_time_clock_usecs()));
1225 
1226 			status = WriteBack(transaction);
1227 		}
1228 
1229 		attribute->WriteLockInTransaction(transaction);
1230 		ReleaseAttribute(attribute);
1231 	}
1232 
1233 	// TODO: find a better way than this "pos" thing (the begin of the old key
1234 	//	must be copied to the start of the new one for a comparison)
1235 	if (status == B_OK && pos == 0) {
1236 		// Index only the first MAX_INDEX_KEY_LENGTH bytes
1237 		uint16 length = *_length;
1238 		if (length > MAX_INDEX_KEY_LENGTH)
1239 			length = MAX_INDEX_KEY_LENGTH;
1240 
1241 		uint8 indexBuffer[MAX_INDEX_KEY_LENGTH];
1242 		// _AddSmallData() already read the buffer
1243 		user_memcpy(indexBuffer, buffer, length);
1244 
1245 		// Update index. Note, Index::Update() may be called even if
1246 		// initializing the index failed - it will just update the live
1247 		// queries in this case
1248 		if (pos < length || (uint64)pos < (uint64)oldLength) {
1249 			index.Update(transaction, name, type, oldData, oldLength,
1250 				indexBuffer, length, this);
1251 		}
1252 	}
1253 
1254 	if (_created != NULL)
1255 		*_created = created;
1256 
1257 	return status;
1258 }
1259 
1260 
1261 /*!	Removes the specified attribute from the inode.
1262 	This is a high-level attribute function that understands attributes
1263 	in the small_data section as well as real attribute files.
1264 */
1265 status_t
1266 Inode::RemoveAttribute(Transaction& transaction, const char* name)
1267 {
1268 	Index index(fVolume);
1269 	bool hasIndex = index.SetTo(name) == B_OK;
1270 	NodeGetter node(fVolume);
1271 	status_t status = node.SetTo(this);
1272 	if (status != B_OK)
1273 		return status;
1274 
1275 	// update index for attributes in the small_data section
1276 	{
1277 		RecursiveLocker _(fSmallDataLock);
1278 
1279 		small_data* smallData = FindSmallData(node.Node(), name);
1280 		if (smallData != NULL) {
1281 			uint32 length = smallData->DataSize();
1282 			if (length > MAX_INDEX_KEY_LENGTH)
1283 				length = MAX_INDEX_KEY_LENGTH;
1284 			index.Update(transaction, name, smallData->Type(),
1285 				smallData->Data(), length, NULL, 0, this);
1286 		}
1287 	}
1288 
1289 	status = _RemoveSmallData(transaction, node, name);
1290 	if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
1291 		// remove the attribute file if it exists
1292 		status = _RemoveAttribute(transaction, name, hasIndex, &index);
1293 		if (status == B_OK) {
1294 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1295 				bfs_inode::ToInode(real_time_clock_usecs()));
1296 			WriteBack(transaction);
1297 		}
1298 	}
1299 
1300 	return status;
1301 }
1302 
1303 
1304 /*!	Returns the attribute inode with the specified \a name, in case it exists.
1305 	This method can only return real attribute files; the attributes in the
1306 	small data section are ignored.
1307 */
1308 status_t
1309 Inode::GetAttribute(const char* name, Inode** _attribute)
1310 {
1311 	// does this inode even have attributes?
1312 	if (Attributes().IsZero())
1313 		return B_ENTRY_NOT_FOUND;
1314 
1315 	Vnode vnode(fVolume, Attributes());
1316 	Inode* attributes;
1317 	if (vnode.Get(&attributes) < B_OK) {
1318 		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n",
1319 			name));
1320 		return B_ERROR;
1321 	}
1322 
1323 	BPlusTree* tree = attributes->Tree();
1324 	if (tree == NULL)
1325 		return B_BAD_VALUE;
1326 
1327 	InodeReadLocker locker(attributes);
1328 
1329 	ino_t id;
1330 	status_t status = tree->Find((uint8*)name, (uint16)strlen(name), &id);
1331 	if (status == B_OK) {
1332 		Vnode vnode(fVolume, id);
1333 		Inode* inode;
1334 		// Check if the attribute is really an attribute
1335 		if (vnode.Get(&inode) != B_OK || !inode->IsAttribute())
1336 			return B_ERROR;
1337 
1338 		*_attribute = inode;
1339 		vnode.Keep();
1340 		return B_OK;
1341 	}
1342 
1343 	return status;
1344 }
1345 
1346 
1347 void
1348 Inode::ReleaseAttribute(Inode* attribute)
1349 {
1350 	if (attribute == NULL)
1351 		return;
1352 
1353 	put_vnode(fVolume->FSVolume(), attribute->ID());
1354 }
1355 
1356 
1357 status_t
1358 Inode::CreateAttribute(Transaction& transaction, const char* name, uint32 type,
1359 	Inode** attribute)
1360 {
1361 	// do we need to create the attribute directory first?
1362 	if (Attributes().IsZero()) {
1363 		status_t status = Inode::Create(transaction, this, NULL,
1364 			S_ATTR_DIR | S_DIRECTORY | 0666, 0, 0, NULL);
1365 		if (status < B_OK)
1366 			RETURN_ERROR(status);
1367 	}
1368 	Vnode vnode(fVolume, Attributes());
1369 	Inode* attributes;
1370 	if (vnode.Get(&attributes) < B_OK)
1371 		return B_ERROR;
1372 
1373 	// Inode::Create() locks the inode for us
1374 	return Inode::Create(transaction, attributes, name,
1375 		S_ATTR | S_FILE | 0666, 0, type, NULL, NULL, attribute);
1376 }
1377 
1378 
1379 //	#pragma mark - directory tree
1380 
1381 
1382 bool
1383 Inode::IsEmpty()
1384 {
1385 	TreeIterator iterator(fTree);
1386 
1387 	uint32 count = 0;
1388 	char name[MAX_INDEX_KEY_LENGTH + 1];
1389 	uint16 length;
1390 	ino_t id;
1391 	while (iterator.GetNextEntry(name, &length, MAX_INDEX_KEY_LENGTH + 1,
1392 			&id) == B_OK) {
1393 		if ((Mode() & (S_ATTR_DIR | S_INDEX_DIR)) != 0)
1394 			return false;
1395 
1396 		// Unlike index and attribute directories, directories
1397 		// for standard files always contain ".", and "..", so
1398 		// we need to ignore those two
1399 		if (++count > 2 || (strcmp(".", name) != 0 && strcmp("..", name) != 0))
1400 			return false;
1401 	}
1402 	return true;
1403 }
1404 
1405 
1406 status_t
1407 Inode::ContainerContentsChanged(Transaction& transaction)
1408 {
1409 	ASSERT(!InLastModifiedIndex());
1410 
1411 	Node().last_modified_time = Node().status_change_time
1412 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1413 
1414 	return WriteBack(transaction);
1415 }
1416 
1417 
1418 //	#pragma mark - data stream
1419 
1420 
1421 /*!	Computes the number of bytes this inode occupies in the file system.
1422 	This includes the file meta blocks used for maintaining its data stream.
1423 
1424 	TODO: However, the attributes in extra files are not really accounted for;
1425 	depending on the speed penalty, this should be changed, though (the value
1426 	could be cached in the inode structure or Inode object, though).
1427 */
1428 off_t
1429 Inode::AllocatedSize() const
1430 {
1431 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0) {
1432 		// This symlink does not have a data stream
1433 		return Node().InodeSize();
1434 	}
1435 
1436 	const data_stream& data = Node().data;
1437 	uint32 blockSize = fVolume->BlockSize();
1438 	off_t size = blockSize;
1439 
1440 	if (data.MaxDoubleIndirectRange() != 0) {
1441 		off_t doubleIndirectSize = data.MaxDoubleIndirectRange()
1442 			- data.MaxIndirectRange();
1443 		int32 indirectSize = double_indirect_max_indirect_size(
1444 			data.double_indirect.Length(), fVolume->BlockSize());
1445 
1446 		size += (2 * data.double_indirect.Length()
1447 				+ doubleIndirectSize / indirectSize)
1448 			* blockSize + data.MaxDoubleIndirectRange();
1449 	} else if (data.MaxIndirectRange() != 0)
1450 		size += data.indirect.Length() + data.MaxIndirectRange();
1451 	else
1452 		size += data.MaxDirectRange();
1453 
1454 	if (!Node().attributes.IsZero()) {
1455 		// TODO: to make this exact, we'd had to count all attributes
1456 		size += 2 * blockSize;
1457 			// 2 blocks, one for the attributes inode, one for its B+tree
1458 	}
1459 
1460 	return size;
1461 }
1462 
1463 
1464 /*!	Finds the block_run where "pos" is located in the data_stream of
1465 	the inode.
1466 	If successful, "offset" will then be set to the file offset
1467 	of the block_run returned; so "pos - offset" is for the block_run
1468 	what "pos" is for the whole stream.
1469 	The caller has to make sure that "pos" is inside the stream.
1470 */
1471 status_t
1472 Inode::FindBlockRun(off_t pos, block_run& run, off_t& offset)
1473 {
1474 	data_stream* data = &Node().data;
1475 
1476 	// find matching block run
1477 
1478 	if (data->MaxIndirectRange() > 0 && pos >= data->MaxDirectRange()) {
1479 		if (data->MaxDoubleIndirectRange() > 0
1480 			&& pos >= data->MaxIndirectRange()) {
1481 			// access to double indirect blocks
1482 
1483 			CachedBlock cached(fVolume);
1484 
1485 			int32 runsPerBlock;
1486 			int32 directSize;
1487 			int32 indirectSize;
1488 			get_double_indirect_sizes(data->double_indirect.Length(),
1489 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1490 			if (directSize <= 0 || indirectSize <= 0)
1491 				RETURN_ERROR(B_BAD_DATA);
1492 
1493 			off_t start = pos - data->MaxIndirectRange();
1494 			int32 index = start / indirectSize;
1495 
1496 			status_t status = cached.SetTo(fVolume->ToBlock(
1497 				data->double_indirect) + index / runsPerBlock);
1498 			if (status != B_OK)
1499 				RETURN_ERROR(status);
1500 
1501 			block_run* indirect = (block_run*)cached.Block();
1502 			int32 current = (start % indirectSize) / directSize;
1503 
1504 			status = cached.SetTo(fVolume->ToBlock(indirect[
1505 				index % runsPerBlock]) + current / runsPerBlock);
1506 			if (status != B_OK)
1507 				RETURN_ERROR(status);
1508 
1509 			indirect = (block_run*)cached.Block();
1510 			run = indirect[current % runsPerBlock];
1511 			if (run.Length() != data->double_indirect.Length())
1512 				RETURN_ERROR(B_BAD_DATA);
1513 
1514 			offset = data->MaxIndirectRange() + (index * indirectSize)
1515 				+ (current * directSize);
1516 		} else {
1517 			// access to indirect blocks
1518 
1519 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1520 			off_t runBlockEnd = data->MaxDirectRange();
1521 
1522 			CachedBlock cached(fVolume);
1523 			off_t block = fVolume->ToBlock(data->indirect);
1524 
1525 			for (int32 i = 0; i < data->indirect.Length(); i++) {
1526 				status_t status = cached.SetTo(block + i);
1527 				if (status != B_OK)
1528 					RETURN_ERROR(status);
1529 
1530 				block_run* indirect = (block_run*)cached.Block();
1531 				int32 current = -1;
1532 				while (++current < runsPerBlock) {
1533 					if (indirect[current].IsZero())
1534 						break;
1535 
1536 					runBlockEnd += (uint32)indirect[current].Length()
1537 						<< cached.BlockShift();
1538 					if (runBlockEnd > pos) {
1539 						run = indirect[current];
1540 						offset = runBlockEnd - ((uint32)run.Length()
1541 							<< cached.BlockShift());
1542 						return fVolume->ValidateBlockRun(run);
1543 					}
1544 				}
1545 			}
1546 			RETURN_ERROR(B_ERROR);
1547 		}
1548 	} else {
1549 		// access from direct blocks
1550 
1551 		off_t runBlockEnd = 0LL;
1552 		int32 current = -1;
1553 
1554 		while (++current < NUM_DIRECT_BLOCKS) {
1555 			if (data->direct[current].IsZero())
1556 				break;
1557 
1558 			runBlockEnd += (uint32)data->direct[current].Length()
1559 				<< fVolume->BlockShift();
1560 			if (runBlockEnd > pos) {
1561 				run = data->direct[current];
1562 				offset = runBlockEnd
1563 					- ((uint32)run.Length() << fVolume->BlockShift());
1564 				return fVolume->ValidateBlockRun(run);
1565 			}
1566 		}
1567 
1568 		return B_ENTRY_NOT_FOUND;
1569 	}
1570 	return fVolume->ValidateBlockRun(run);
1571 }
1572 
1573 
1574 status_t
1575 Inode::ReadAt(off_t pos, uint8* buffer, size_t* _length)
1576 {
1577 	return file_cache_read(FileCache(), NULL, pos, buffer, _length);
1578 }
1579 
1580 
1581 status_t
1582 Inode::WriteAt(Transaction& transaction, off_t pos, const uint8* buffer,
1583 	size_t* _length)
1584 {
1585 	InodeReadLocker locker(this);
1586 
1587 	// update the last modification time in memory, it will be written
1588 	// back to the inode, and the index when the file is closed
1589 	// TODO: should update the internal last modified time only at this point!
1590 	Node().last_modified_time = Node().status_change_time
1591 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1592 
1593 	// TODO: support INODE_LOGGED!
1594 
1595 	size_t length = *_length;
1596 	bool changeSize = (uint64)pos + (uint64)length > (uint64)Size();
1597 
1598 	// set/check boundaries for pos/length
1599 	if (pos < 0)
1600 		return B_BAD_VALUE;
1601 
1602 	locker.Unlock();
1603 
1604 	// the transaction doesn't have to be started already
1605 	if (changeSize && !transaction.IsStarted())
1606 		transaction.Start(fVolume, BlockNumber());
1607 
1608 	WriteLocker writeLocker(fLock);
1609 
1610 	// Work around possible race condition: Someone might have shrunken the file
1611 	// while we had no lock.
1612 	if (!transaction.IsStarted()
1613 		&& (uint64)pos + (uint64)length > (uint64)Size()) {
1614 		writeLocker.Unlock();
1615 		transaction.Start(fVolume, BlockNumber());
1616 		writeLocker.Lock();
1617 	}
1618 
1619 	off_t oldSize = Size();
1620 
1621 	if ((uint64)pos + (uint64)length > (uint64)oldSize) {
1622 		// let's grow the data stream to the size needed
1623 		status_t status = SetFileSize(transaction, pos + length);
1624 		if (status != B_OK) {
1625 			*_length = 0;
1626 			WriteLockInTransaction(transaction);
1627 			RETURN_ERROR(status);
1628 		}
1629 		// TODO: In theory we would need to update the file size
1630 		// index here as part of the current transaction - this might just
1631 		// be a bit too expensive, but worth a try.
1632 
1633 		// we need to write back the inode here because it has to
1634 		// go into this transaction (we cannot wait until the file
1635 		// is closed)
1636 		status = WriteBack(transaction);
1637 		if (status != B_OK) {
1638 			WriteLockInTransaction(transaction);
1639 			return status;
1640 		}
1641 	}
1642 
1643 	writeLocker.Unlock();
1644 
1645 	if (oldSize < pos)
1646 		FillGapWithZeros(oldSize, pos);
1647 
1648 	// If we don't want to write anything, we can now return (we may
1649 	// just have changed the file size using the position parameter)
1650 	if (length == 0)
1651 		return B_OK;
1652 
1653 	status_t status = file_cache_write(FileCache(), NULL, pos, buffer, _length);
1654 
1655 	if (transaction.IsStarted())
1656 		WriteLockInTransaction(transaction);
1657 
1658 	return status;
1659 }
1660 
1661 
1662 /*!	Fills the gap between the old file size and the new file size
1663 	with zeros.
1664 	It's more or less a copy of Inode::WriteAt() but it can handle
1665 	length differences of more than just 4 GB, and it never uses
1666 	the log, even if the INODE_LOGGED flag is set.
1667 */
1668 status_t
1669 Inode::FillGapWithZeros(off_t pos, off_t newSize)
1670 {
1671 	while (pos < newSize) {
1672 		size_t size;
1673 		if (newSize > pos + 1024 * 1024 * 1024)
1674 			size = 1024 * 1024 * 1024;
1675 		else
1676 			size = newSize - pos;
1677 
1678 		status_t status = file_cache_write(FileCache(), NULL, pos, NULL, &size);
1679 		if (status < B_OK)
1680 			return status;
1681 
1682 		pos += size;
1683 	}
1684 
1685 	return B_OK;
1686 }
1687 
1688 
1689 /*!	Allocates \a length blocks, and clears their contents. Growing
1690 	the indirect and double indirect range uses this method.
1691 	The allocated block_run is saved in "run"
1692 */
1693 status_t
1694 Inode::_AllocateBlockArray(Transaction& transaction, block_run& run,
1695 	size_t length, bool variableSize)
1696 {
1697 	if (!run.IsZero())
1698 		return B_BAD_VALUE;
1699 
1700 	status_t status = fVolume->Allocate(transaction, this, length, run,
1701 		variableSize ? 1 : length);
1702 	if (status != B_OK)
1703 		return status;
1704 
1705 	// make sure those blocks are empty
1706 	CachedBlock cached(fVolume);
1707 	off_t block = fVolume->ToBlock(run);
1708 
1709 	for (int32 i = 0; i < run.Length(); i++) {
1710 		status = cached.SetToWritable(transaction, block + i, true);
1711 		if (status != B_OK)
1712 			return status;
1713 	}
1714 	return B_OK;
1715 }
1716 
1717 
1718 /*!	Grows the stream to \a size, and fills the direct/indirect/double indirect
1719 	ranges with the runs.
1720 	This method will also determine the size of the preallocation, if any.
1721 */
1722 status_t
1723 Inode::_GrowStream(Transaction& transaction, off_t size)
1724 {
1725 	data_stream* data = &Node().data;
1726 
1727 	// is the data stream already large enough to hold the new size?
1728 	// (can be the case with preallocated blocks)
1729 	if (size < data->MaxDirectRange()
1730 		|| size < data->MaxIndirectRange()
1731 		|| size < data->MaxDoubleIndirectRange()) {
1732 		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1733 		return B_OK;
1734 	}
1735 
1736 	// how many bytes are still needed? (unused ranges are always zero)
1737 	uint16 minimum = 1;
1738 	off_t bytes;
1739 	if (data->Size() < data->MaxDoubleIndirectRange()) {
1740 		bytes = size - data->MaxDoubleIndirectRange();
1741 		// The double indirect range can only handle multiples of
1742 		// its base length
1743 		minimum = data->double_indirect.Length();
1744 	} else if (data->Size() < data->MaxIndirectRange())
1745 		bytes = size - data->MaxIndirectRange();
1746 	else if (data->Size() < data->MaxDirectRange())
1747 		bytes = size - data->MaxDirectRange();
1748 	else {
1749 		// no preallocation left to be used
1750 		bytes = size - data->Size();
1751 		if (data->MaxDoubleIndirectRange() > 0)
1752 			minimum = data->double_indirect.Length();
1753 	}
1754 
1755 	// do we have enough free blocks on the disk?
1756 	off_t blocksNeeded = (bytes + fVolume->BlockSize() - 1)
1757 		>> fVolume->BlockShift();
1758 	if (blocksNeeded > fVolume->FreeBlocks())
1759 		return B_DEVICE_FULL;
1760 
1761 	off_t blocksRequested = blocksNeeded;
1762 		// because of preallocations and partial allocations, the number of
1763 		// blocks we need to allocate may be different from the one we request
1764 		// from the block allocator
1765 
1766 	// Should we preallocate some blocks?
1767 	// Attributes, attribute directories, and long symlinks usually won't get
1768 	// that big, and should stay close to the inode - preallocating could be
1769 	// counterproductive.
1770 	// Also, if free disk space is tight, don't preallocate.
1771 	if (!IsAttribute() && !IsAttributeDirectory() && !IsSymLink()
1772 		&& fVolume->FreeBlocks() > 128) {
1773 		off_t roundTo = 0;
1774 		if (IsFile()) {
1775 			// Request preallocated blocks depending on the file size and growth
1776 			if (size < 1 * 1024 * 1024 && bytes < 512 * 1024) {
1777 				// Preallocate 64 KB for file sizes <1 MB and grow rates <512 KB
1778 				roundTo = 65536 >> fVolume->BlockShift();
1779 			} else if (size < 32 * 1024 * 1024 && bytes <= 1 * 1024 * 1024) {
1780 				// Preallocate 512 KB for file sizes between 1 MB and 32 MB, and
1781 				// grow rates smaller than 1 MB
1782 				roundTo = (512 * 1024) >> fVolume->BlockShift();
1783 			} else {
1784 				// Preallocate 1/16 of the file size (ie. 4 MB for 64 MB,
1785 				// 64 MB for 1 GB)
1786 				roundTo = size >> (fVolume->BlockShift() + 4);
1787 			}
1788 		} else if (IsIndex()) {
1789 			// Always preallocate 64 KB for index directories
1790 			roundTo = 65536 >> fVolume->BlockShift();
1791 		} else {
1792 			// Preallocate only 4 KB - directories only get trimmed when their
1793 			// vnode is flushed, which might not happen very often.
1794 			roundTo = 4096 >> fVolume->BlockShift();
1795 		}
1796 		if (roundTo > 1) {
1797 			// Round to next "roundTo" block count
1798 			blocksRequested = ((blocksNeeded + roundTo) / roundTo) * roundTo;
1799 		}
1800 	}
1801 
1802 	while (blocksNeeded > 0) {
1803 		// the requested blocks do not need to be returned with a
1804 		// single allocation, so we need to iterate until we have
1805 		// enough blocks allocated
1806 		if (minimum > 1) {
1807 			// make sure that "blocks" is a multiple of minimum
1808 			blocksRequested = round_up(blocksRequested, minimum);
1809 		}
1810 
1811 		block_run run;
1812 		status_t status = fVolume->Allocate(transaction, this, blocksRequested,
1813 			run, minimum);
1814 		if (status != B_OK)
1815 			return status;
1816 
1817 		// okay, we have the needed blocks, so just distribute them to the
1818 		// different ranges of the stream (direct, indirect & double indirect)
1819 
1820 		blocksNeeded -= run.Length();
1821 		// don't preallocate if the first allocation was already too small
1822 		blocksRequested = blocksNeeded;
1823 
1824 		// Direct block range
1825 
1826 		if (data->Size() <= data->MaxDirectRange()) {
1827 			// let's try to put them into the direct block range
1828 			int32 free = 0;
1829 			for (; free < NUM_DIRECT_BLOCKS; free++) {
1830 				if (data->direct[free].IsZero())
1831 					break;
1832 			}
1833 
1834 			if (free < NUM_DIRECT_BLOCKS) {
1835 				// can we merge the last allocated run with the new one?
1836 				int32 last = free - 1;
1837 				if (free > 0 && data->direct[last].MergeableWith(run)) {
1838 					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(
1839 						data->direct[last].Length() + run.Length());
1840 				} else
1841 					data->direct[free] = run;
1842 
1843 				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(
1844 					data->MaxDirectRange()
1845 					+ run.Length() * fVolume->BlockSize());
1846 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1847 					? data->max_direct_range : size);
1848 				continue;
1849 			}
1850 		}
1851 
1852 		// Indirect block range
1853 
1854 		if (data->Size() <= data->MaxIndirectRange()
1855 			|| !data->MaxIndirectRange()) {
1856 			CachedBlock cached(fVolume);
1857 			block_run* runs = NULL;
1858 			uint32 free = 0;
1859 			off_t block;
1860 
1861 			// if there is no indirect block yet, create one
1862 			if (data->indirect.IsZero()) {
1863 				status = _AllocateBlockArray(transaction, data->indirect,
1864 					NUM_ARRAY_BLOCKS, true);
1865 				if (status != B_OK)
1866 					return status;
1867 
1868 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1869 					data->MaxDirectRange());
1870 				// insert the block_run in the first block
1871 				status = cached.SetTo(data->indirect);
1872 				if (status != B_OK)
1873 					return status;
1874 
1875 				runs = (block_run*)cached.Block();
1876 			} else {
1877 				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1878 				block = fVolume->ToBlock(data->indirect);
1879 
1880 				// search first empty entry
1881 				int32 i = 0;
1882 				for (; i < data->indirect.Length(); i++) {
1883 					status = cached.SetTo(block + i);
1884 					if (status != B_OK)
1885 						return status;
1886 
1887 					runs = (block_run*)cached.Block();
1888 					for (free = 0; free < numberOfRuns; free++)
1889 						if (runs[free].IsZero())
1890 							break;
1891 
1892 					if (free < numberOfRuns)
1893 						break;
1894 				}
1895 				if (i == data->indirect.Length())
1896 					runs = NULL;
1897 			}
1898 
1899 			if (runs != NULL) {
1900 				// try to insert the run to the last one - note that this
1901 				// doesn't take block borders into account, so it could be
1902 				// further optimized
1903 				cached.MakeWritable(transaction);
1904 
1905 				int32 last = free - 1;
1906 				if (free > 0 && runs[last].MergeableWith(run)) {
1907 					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(
1908 						runs[last].Length() + run.Length());
1909 				} else
1910 					runs[free] = run;
1911 
1912 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1913 					data->MaxIndirectRange()
1914 					+ ((uint32)run.Length() << fVolume->BlockShift()));
1915 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1916 					? data->MaxIndirectRange() : size);
1917 				continue;
1918 			}
1919 		}
1920 
1921 		// Double indirect block range
1922 
1923 		if (data->Size() <= data->MaxDoubleIndirectRange()
1924 			|| !data->max_double_indirect_range) {
1925 			// We make sure here that we have this minimum allocated, so if
1926 			// the allocation succeeds, we don't run into an endless loop.
1927 			if (!data->max_double_indirect_range)
1928 				minimum = _DoubleIndirectBlockLength();
1929 			else
1930 				minimum = data->double_indirect.Length();
1931 
1932 			if ((run.Length() % minimum) != 0) {
1933 				// The number of allocated blocks isn't a multiple of 'minimum',
1934 				// so we have to change this. This can happen the first time the
1935 				// stream grows into the double indirect range.
1936 				// First, free the remaining blocks that don't fit into this
1937 				// multiple.
1938 				int32 rest = run.Length() % minimum;
1939 				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1940 
1941 				status = fVolume->Free(transaction,
1942 					block_run::Run(run.AllocationGroup(),
1943 					run.Start() + run.Length(), rest));
1944 				if (status != B_OK)
1945 					return status;
1946 
1947 				blocksNeeded += rest;
1948 				blocksRequested = round_up(blocksNeeded, minimum);
1949 
1950 				// Are there any blocks left in the run? If not, allocate
1951 				// a new one
1952 				if (run.length == 0)
1953 					continue;
1954 			}
1955 
1956 			// if there is no double indirect block yet, create one
1957 			if (data->double_indirect.IsZero()) {
1958 				status = _AllocateBlockArray(transaction,
1959 					data->double_indirect, _DoubleIndirectBlockLength());
1960 				if (status != B_OK)
1961 					return status;
1962 
1963 				data->max_double_indirect_range = data->max_indirect_range;
1964 			}
1965 
1966 			// calculate the index where to insert the new blocks
1967 
1968 			int32 runsPerBlock;
1969 			int32 directSize;
1970 			int32 indirectSize;
1971 			get_double_indirect_sizes(data->double_indirect.Length(),
1972 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1973 			if (directSize <= 0 || indirectSize <= 0)
1974 				return B_BAD_DATA;
1975 
1976 			off_t start = data->MaxDoubleIndirectRange()
1977 				- data->MaxIndirectRange();
1978 			int32 indirectIndex = start / indirectSize;
1979 			int32 index = (start % indirectSize) / directSize;
1980 			int32 runsPerArray = runsPerBlock * minimum;
1981 
1982 			// distribute the blocks to the array and allocate
1983 			// new array blocks when needed
1984 
1985 			CachedBlock cached(fVolume);
1986 			CachedBlock cachedDirect(fVolume);
1987 			block_run* array = NULL;
1988 			uint32 runLength = run.Length();
1989 
1990 			while (run.length != 0) {
1991 				// get the indirect array block
1992 				if (array == NULL) {
1993 					uint32 block = indirectIndex / runsPerBlock;
1994 					if (block >= minimum)
1995 						return EFBIG;
1996 
1997 					status = cached.SetTo(fVolume->ToBlock(
1998 						data->double_indirect) + block);
1999 					if (status != B_OK)
2000 						return status;
2001 
2002 					array = (block_run*)cached.Block();
2003 				}
2004 
2005 				do {
2006 					// do we need a new array block?
2007 					if (array[indirectIndex % runsPerBlock].IsZero()) {
2008 						cached.MakeWritable(transaction);
2009 
2010 						status = _AllocateBlockArray(transaction,
2011 							array[indirectIndex % runsPerBlock],
2012 							data->double_indirect.Length());
2013 						if (status != B_OK)
2014 							return status;
2015 					}
2016 
2017 					status = cachedDirect.SetToWritable(transaction,
2018 						fVolume->ToBlock(array[indirectIndex
2019 							% runsPerBlock]) + index / runsPerBlock);
2020 					if (status != B_OK)
2021 						return status;
2022 
2023 					block_run* runs = (block_run*)cachedDirect.Block();
2024 
2025 					do {
2026 						// insert the block_run into the array
2027 						runs[index % runsPerBlock] = run;
2028 						runs[index % runsPerBlock].length
2029 							= HOST_ENDIAN_TO_BFS_INT16(minimum);
2030 
2031 						// alter the remaining block_run
2032 						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2033 							+ minimum);
2034 						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2035 							- minimum);
2036 					} while ((++index % runsPerBlock) != 0 && run.length);
2037 				} while ((index % runsPerArray) != 0 && run.length);
2038 
2039 				if (index == runsPerArray)
2040 					index = 0;
2041 				if (++indirectIndex % runsPerBlock == 0) {
2042 					array = NULL;
2043 					index = 0;
2044 				}
2045 			}
2046 
2047 			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
2048 				data->MaxDoubleIndirectRange()
2049 				+ (runLength << fVolume->BlockShift()));
2050 			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(
2051 				data->max_double_indirect_range) : size;
2052 
2053 			continue;
2054 		}
2055 
2056 		RETURN_ERROR(EFBIG);
2057 	}
2058 	// update the size of the data stream
2059 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2060 
2061 	return B_OK;
2062 }
2063 
2064 
2065 size_t
2066 Inode::_DoubleIndirectBlockLength() const
2067 {
2068 	if (fVolume->BlockSize() > DOUBLE_INDIRECT_ARRAY_SIZE)
2069 		return 1;
2070 
2071 	return DOUBLE_INDIRECT_ARRAY_SIZE / fVolume->BlockSize();
2072 }
2073 
2074 
2075 /*!	Frees the statically sized array of the double indirect part of a data
2076 	stream.
2077 */
2078 status_t
2079 Inode::_FreeStaticStreamArray(Transaction& transaction, int32 level,
2080 	block_run run, off_t size, off_t offset, off_t& max)
2081 {
2082 	int32 indirectSize;
2083 	if (level == 0) {
2084 		indirectSize = double_indirect_max_indirect_size(run.Length(),
2085 			fVolume->BlockSize());
2086 	} else {
2087 		indirectSize = double_indirect_max_direct_size(run.Length(),
2088 			fVolume->BlockSize());
2089 	}
2090 	if (indirectSize <= 0)
2091 		return B_BAD_DATA;
2092 
2093 	off_t start;
2094 	if (size > offset)
2095 		start = size - offset;
2096 	else
2097 		start = 0;
2098 
2099 	int32 index = start / indirectSize;
2100 	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
2101 
2102 	CachedBlock cached(fVolume);
2103 	off_t blockNumber = fVolume->ToBlock(run);
2104 
2105 	// set the file offset to the current block run
2106 	offset += (off_t)index * indirectSize;
2107 
2108 	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
2109 		status_t status = cached.SetToWritable(transaction, blockNumber + i);
2110 		if (status != B_OK)
2111 			RETURN_ERROR(status);
2112 
2113 		block_run* array = (block_run*)cached.WritableBlock();
2114 
2115 		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
2116 			if (array[index].IsZero()) {
2117 				// we also want to break out of the outer loop
2118 				i = run.Length();
2119 				break;
2120 			}
2121 
2122 			status_t status = B_OK;
2123 			if (level == 0) {
2124 				status = _FreeStaticStreamArray(transaction, 1, array[index],
2125 					size, offset, max);
2126 			} else if (offset >= size)
2127 				status = fVolume->Free(transaction, array[index]);
2128 			else
2129 				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
2130 
2131 			if (status < B_OK)
2132 				RETURN_ERROR(status);
2133 
2134 			if (offset >= size)
2135 				array[index].SetTo(0, 0, 0);
2136 
2137 			offset += indirectSize;
2138 		}
2139 		index = 0;
2140 	}
2141 	return B_OK;
2142 }
2143 
2144 
2145 /*!	Frees all block_runs in the array which come after the specified size.
2146 	It also trims the last block_run that contain the size.
2147 	"offset" and "max" are maintained until the last block_run that doesn't
2148 	have to be freed - after this, the values won't be correct anymore, but
2149 	will still assure correct function for all subsequent calls.
2150 	"max" is considered to be in file system byte order.
2151 */
2152 status_t
2153 Inode::_FreeStreamArray(Transaction& transaction, block_run* array,
2154 	uint32 arrayLength, off_t size, off_t& offset, off_t& max)
2155 {
2156 	PRINT(("FreeStreamArray: arrayLength %" B_PRId32 ", size %" B_PRIdOFF
2157 		", offset %" B_PRIdOFF ", max %" B_PRIdOFF "\n", arrayLength, size,
2158 		offset, max));
2159 
2160 	off_t newOffset = offset;
2161 	uint32 i = 0;
2162 	for (; i < arrayLength; i++, offset = newOffset) {
2163 		if (array[i].IsZero())
2164 			break;
2165 
2166 		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
2167 		if (newOffset <= size)
2168 			continue;
2169 
2170 		block_run run = array[i];
2171 
2172 		// determine the block_run to be freed
2173 		if (newOffset > size && offset < size) {
2174 			// free partial block_run (and update the original block_run)
2175 			run.start = HOST_ENDIAN_TO_BFS_INT16(array[i].Start()
2176 				+ ((size + fVolume->BlockSize() - 1 - offset)
2177 					>> fVolume->BlockShift()));
2178 				// round to next block
2179 			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2180 				- array[i].Start());
2181 			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2182 				- array[i].Length());
2183 
2184 			if (run.length == 0)
2185 				continue;
2186 
2187 			// update maximum range
2188 			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length()
2189 				<< fVolume->BlockShift()));
2190 		} else {
2191 			// free the whole block_run
2192 			array[i].SetTo(0, 0, 0);
2193 
2194 			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
2195 				max = HOST_ENDIAN_TO_BFS_INT64(offset);
2196 		}
2197 
2198 		if (fVolume->Free(transaction, run) < B_OK)
2199 			return B_IO_ERROR;
2200 	}
2201 	return B_OK;
2202 }
2203 
2204 
2205 status_t
2206 Inode::_ShrinkStream(Transaction& transaction, off_t size)
2207 {
2208 	data_stream* data = &Node().data;
2209 	status_t status;
2210 
2211 	if (data->MaxDoubleIndirectRange() > size) {
2212 		off_t* maxDoubleIndirect = &data->max_double_indirect_range;
2213 			// gcc 4 work-around: "error: cannot bind packed field
2214 			// 'data->data_stream::max_double_indirect_range' to 'off_t&'"
2215 		status = _FreeStaticStreamArray(transaction, 0, data->double_indirect,
2216 			size, data->MaxIndirectRange(), *maxDoubleIndirect);
2217 		if (status != B_OK)
2218 			return status;
2219 
2220 		if (size <= data->MaxIndirectRange()) {
2221 			fVolume->Free(transaction, data->double_indirect);
2222 			data->double_indirect.SetTo(0, 0, 0);
2223 			data->max_double_indirect_range = 0;
2224 		}
2225 	}
2226 
2227 	if (data->MaxIndirectRange() > size) {
2228 		CachedBlock cached(fVolume);
2229 		off_t block = fVolume->ToBlock(data->indirect);
2230 		off_t offset = data->MaxDirectRange();
2231 
2232 		for (int32 i = 0; i < data->indirect.Length(); i++) {
2233 			status = cached.SetToWritable(transaction, block + i);
2234 			if (status != B_OK)
2235 				return status;
2236 
2237 			block_run* array = (block_run*)cached.WritableBlock();
2238 
2239 			off_t* maxIndirect = &data->max_indirect_range;
2240 				// gcc 4 work-around: "error: cannot bind packed field
2241 				// 'data->data_stream::max_indirect_range' to 'off_t&'"
2242 			if (_FreeStreamArray(transaction, array, fVolume->BlockSize()
2243 					/ sizeof(block_run), size, offset, *maxIndirect) != B_OK)
2244 				return B_IO_ERROR;
2245 		}
2246 		if (data->max_direct_range == data->max_indirect_range) {
2247 			fVolume->Free(transaction, data->indirect);
2248 			data->indirect.SetTo(0, 0, 0);
2249 			data->max_indirect_range = 0;
2250 		}
2251 	}
2252 
2253 	if (data->MaxDirectRange() > size) {
2254 		off_t offset = 0;
2255 		off_t *maxDirect = &data->max_direct_range;
2256 			// gcc 4 work-around: "error: cannot bind packed field
2257 			// 'data->data_stream::max_direct_range' to 'off_t&'"
2258 		status = _FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS,
2259 			size, offset, *maxDirect);
2260 		if (status != B_OK)
2261 			return status;
2262 	}
2263 
2264 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2265 	return B_OK;
2266 }
2267 
2268 
2269 status_t
2270 Inode::SetFileSize(Transaction& transaction, off_t size)
2271 {
2272 	if (size < 0)
2273 		return B_BAD_VALUE;
2274 
2275 	off_t oldSize = Size();
2276 
2277 	if (size == oldSize)
2278 		return B_OK;
2279 
2280 	T(Resize(this, oldSize, size, false));
2281 
2282 	// should the data stream grow or shrink?
2283 	status_t status;
2284 	if (size > oldSize) {
2285 		status = _GrowStream(transaction, size);
2286 		if (status < B_OK) {
2287 			// if the growing of the stream fails, the whole operation
2288 			// fails, so we should shrink the stream to its former size
2289 			_ShrinkStream(transaction, oldSize);
2290 		}
2291 	} else
2292 		status = _ShrinkStream(transaction, size);
2293 
2294 	if (status < B_OK)
2295 		return status;
2296 
2297 	file_cache_set_size(FileCache(), size);
2298 	file_map_set_size(Map(), size);
2299 
2300 	return WriteBack(transaction);
2301 }
2302 
2303 
2304 status_t
2305 Inode::Append(Transaction& transaction, off_t bytes)
2306 {
2307 	return SetFileSize(transaction, Size() + bytes);
2308 }
2309 
2310 
2311 /*!	Checks whether or not this inode's data stream needs to be trimmed
2312 	because of an earlier preallocation.
2313 	Returns true if there are any blocks to be trimmed.
2314 */
2315 bool
2316 Inode::NeedsTrimming() const
2317 {
2318 	// We never trim preallocated index blocks to make them grow as smooth as
2319 	// possible. There are only few indices anyway, so this doesn't hurt.
2320 	// Also, if an inode is already in deleted state, we don't bother trimming
2321 	// it.
2322 	if (IsIndex() || IsDeleted()
2323 		|| (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0))
2324 		return false;
2325 
2326 	off_t roundedSize = round_up(Size(), fVolume->BlockSize());
2327 
2328 	return Node().data.MaxDirectRange() > roundedSize
2329 		|| Node().data.MaxIndirectRange() > roundedSize
2330 		|| Node().data.MaxDoubleIndirectRange() > roundedSize;
2331 }
2332 
2333 
2334 status_t
2335 Inode::TrimPreallocation(Transaction& transaction)
2336 {
2337 	T(Resize(this, max_c(Node().data.MaxDirectRange(),
2338 		Node().data.MaxIndirectRange()), Size(), true));
2339 
2340 	status_t status = _ShrinkStream(transaction, Size());
2341 	if (status < B_OK)
2342 		return status;
2343 
2344 	return WriteBack(transaction);
2345 }
2346 
2347 
2348 //!	Frees the file's data stream and removes all attributes
2349 status_t
2350 Inode::Free(Transaction& transaction)
2351 {
2352 	FUNCTION();
2353 
2354 	// Perhaps there should be an implementation of Inode::ShrinkStream() that
2355 	// just frees the data_stream, but doesn't change the inode (since it is
2356 	// freed anyway) - that would make an undelete command possible
2357 	if (!IsSymLink() || (Flags() & INODE_LONG_SYMLINK) != 0) {
2358 		status_t status = SetFileSize(transaction, 0);
2359 		if (status < B_OK)
2360 			return status;
2361 	}
2362 
2363 	// Free all attributes, and remove their indices
2364 	{
2365 		// We have to limit the scope of AttributeIterator, so that its
2366 		// destructor is not called after the inode is deleted
2367 		AttributeIterator iterator(this);
2368 
2369 		char name[B_FILE_NAME_LENGTH];
2370 		uint32 type;
2371 		size_t length;
2372 		ino_t id;
2373 		while (iterator.GetNext(name, &length, &type, &id) == B_OK) {
2374 			RemoveAttribute(transaction, name);
2375 		}
2376 	}
2377 
2378 	if (WriteBack(transaction) < B_OK)
2379 		return B_IO_ERROR;
2380 
2381 	return fVolume->Free(transaction, BlockRun());
2382 }
2383 
2384 
2385 //!	Synchronizes (writes back to disk) the file stream of the inode.
2386 status_t
2387 Inode::Sync()
2388 {
2389 	if (FileCache())
2390 		return file_cache_sync(FileCache());
2391 
2392 	// We may also want to flush the attribute's data stream to
2393 	// disk here... (do we?)
2394 
2395 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0)
2396 		return B_OK;
2397 
2398 	InodeReadLocker locker(this);
2399 
2400 	data_stream* data = &Node().data;
2401 	status_t status = B_OK;
2402 
2403 	// flush direct range
2404 
2405 	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
2406 		if (data->direct[i].IsZero())
2407 			return B_OK;
2408 
2409 		status = block_cache_sync_etc(fVolume->BlockCache(),
2410 			fVolume->ToBlock(data->direct[i]), data->direct[i].Length());
2411 		if (status != B_OK)
2412 			return status;
2413 	}
2414 
2415 	// flush indirect range
2416 
2417 	if (data->max_indirect_range == 0)
2418 		return B_OK;
2419 
2420 	CachedBlock cached(fVolume);
2421 	off_t block = fVolume->ToBlock(data->indirect);
2422 	int32 count = fVolume->BlockSize() / sizeof(block_run);
2423 
2424 	for (int32 j = 0; j < data->indirect.Length(); j++) {
2425 		status = cached.SetTo(block + j);
2426 		if (status != B_OK)
2427 			return status;
2428 
2429 		block_run* runs = (block_run*)cached.Block();
2430 
2431 		for (int32 i = 0; i < count; i++) {
2432 			if (runs[i].IsZero())
2433 				return B_OK;
2434 
2435 			status = block_cache_sync_etc(fVolume->BlockCache(),
2436 				fVolume->ToBlock(runs[i]), runs[i].Length());
2437 			if (status != B_OK)
2438 				return status;
2439 		}
2440 	}
2441 
2442 	// flush double indirect range
2443 
2444 	if (data->max_double_indirect_range == 0)
2445 		return B_OK;
2446 
2447 	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
2448 
2449 	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
2450 		status = cached.SetTo(indirectBlock + l);
2451 		if (status != B_OK)
2452 			return status;
2453 
2454 		block_run* indirectRuns = (block_run*)cached.Block();
2455 		CachedBlock directCached(fVolume);
2456 
2457 		for (int32 k = 0; k < count; k++) {
2458 			if (indirectRuns[k].IsZero())
2459 				return B_OK;
2460 
2461 			block = fVolume->ToBlock(indirectRuns[k]);
2462 			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
2463 				status = directCached.SetTo(block + j);
2464 				if (status != B_OK)
2465 					return status;
2466 
2467 				block_run* runs = (block_run*)directCached.Block();
2468 
2469 				for (int32 i = 0; i < count; i++) {
2470 					if (runs[i].IsZero())
2471 						return B_OK;
2472 
2473 					// TODO: combine single block_runs to bigger ones when
2474 					// they are adjacent
2475 					status = block_cache_sync_etc(fVolume->BlockCache(),
2476 						fVolume->ToBlock(runs[i]), runs[i].Length());
2477 					if (status != B_OK)
2478 						return status;
2479 				}
2480 			}
2481 		}
2482 	}
2483 	return B_OK;
2484 }
2485 
2486 
2487 //	#pragma mark - TransactionListener implementation
2488 
2489 
2490 void
2491 Inode::TransactionDone(bool success)
2492 {
2493 	if (!success) {
2494 		// Revert any changes made to the cached bfs_inode
2495 		// TODO: return code gets eaten
2496 		UpdateNodeFromDisk();
2497 	}
2498 }
2499 
2500 
2501 void
2502 Inode::RemovedFromTransaction()
2503 {
2504 	Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
2505 
2506 	// See AddInode() why we do this here
2507 	if ((Flags() & INODE_DELETED) != 0)
2508 		fVolume->RemovedInodes().Add(this);
2509 
2510 	rw_lock_write_unlock(&Lock());
2511 
2512 	if (!fVolume->IsInitializing())
2513 		put_vnode(fVolume->FSVolume(), ID());
2514 }
2515 
2516 
2517 //	#pragma mark - creation/deletion
2518 
2519 
2520 status_t
2521 Inode::Remove(Transaction& transaction, const char* name, ino_t* _id,
2522 	bool isDirectory, bool force)
2523 {
2524 	if (fTree == NULL)
2525 		RETURN_ERROR(B_BAD_VALUE);
2526 
2527 	WriteLockInTransaction(transaction);
2528 
2529 	// does the file even exist?
2530 	off_t id;
2531 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) < B_OK)
2532 		return B_ENTRY_NOT_FOUND;
2533 
2534 	if (_id)
2535 		*_id = id;
2536 
2537 	Vnode vnode(fVolume, id);
2538 	Inode* inode;
2539 	status_t status = vnode.Get(&inode);
2540 	if (status < B_OK) {
2541 		REPORT_ERROR(status);
2542 		return fTree->Remove(transaction, name, id);
2543 	}
2544 
2545 	T(Remove(inode, name));
2546 	inode->WriteLockInTransaction(transaction);
2547 
2548 	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
2549 	// bit is set for indices in BFS, not for attribute directories) - but you
2550 	// should really be able to do whatever you want with your indices
2551 	// without having to remove all files first :)
2552 	if (!inode->IsIndex() && !force) {
2553 		// if it's not of the correct type, don't delete it!
2554 		if (inode->IsContainer() != isDirectory)
2555 			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
2556 
2557 		// only delete empty directories
2558 		if (isDirectory && !inode->IsEmpty())
2559 			return B_DIRECTORY_NOT_EMPTY;
2560 	}
2561 
2562 	// remove_vnode() allows the inode to be accessed until the last put_vnode()
2563 	status = remove_vnode(fVolume->FSVolume(), id);
2564 	if (status != B_OK)
2565 		return status;
2566 
2567 	if (fTree->Remove(transaction, name, id) != B_OK && !force) {
2568 		unremove_vnode(fVolume->FSVolume(), id);
2569 		RETURN_ERROR(B_ERROR);
2570 	}
2571 
2572 #ifdef DEBUG
2573 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) == B_OK) {
2574 		DIE(("deleted entry still there"));
2575 	}
2576 #endif
2577 
2578 	ContainerContentsChanged(transaction);
2579 
2580 	// update the inode, so that no one will ever doubt it's deleted :-)
2581 	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
2582 	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
2583 
2584 	// In balance to the Inode::Create() method, the main indices
2585 	// are updated here (name, size, & last_modified)
2586 
2587 	Index index(fVolume);
2588 	if (inode->InNameIndex()) {
2589 		index.RemoveName(transaction, name, inode);
2590 			// If removing from the index fails, it is not regarded as a
2591 			// fatal error and will not be reported back!
2592 			// Deleted inodes won't be visible in queries anyway.
2593 	}
2594 
2595 	if (inode->InSizeIndex())
2596 		index.RemoveSize(transaction, inode);
2597 	if (inode->InLastModifiedIndex())
2598 		index.RemoveLastModified(transaction, inode);
2599 
2600 	return inode->WriteBack(transaction);
2601 }
2602 
2603 
2604 /*!	Creates the inode with the specified \a parent directory, and automatically
2605 	adds the created inode to that parent directory. If an attribute directory
2606 	is created, it will also automatically  be added to the \a parent inode as
2607 	such. However, the indices root node, and the regular root node won't be
2608 	added to the superblock.
2609 	It will also create the initial B+tree for the inode if it's a directory
2610 	of any kind.
2611 	\a name may be \c NULL, but only if no \a parent is given.
2612 	If the "_id" or "_inode" variable is given and non-NULL to store the
2613 	inode's ID, the inode stays locked - you have to call put_vnode() if you
2614 	don't use it anymore.
2615 	If the node already exists, this method will fail if \c O_EXCL is set, or
2616 	it's a directory or a symlink. Otherwise, it will just be returned.
2617 	If \c O_TRUNC has been specified, the file will also be truncated.
2618 */
2619 status_t
2620 Inode::Create(Transaction& transaction, Inode* parent, const char* name,
2621 	int32 mode, int openMode, uint32 type, bool* _created, ino_t* _id,
2622 	Inode** _inode, fs_vnode_ops* vnodeOps, uint32 publishFlags)
2623 {
2624 	FUNCTION_START(("name = %s, mode = %" B_PRId32 "\n", name, mode));
2625 
2626 	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
2627 	Volume* volume = transaction.GetVolume();
2628 	BPlusTree* tree = NULL;
2629 
2630 	if (parent != NULL && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
2631 		// check if the file already exists in the directory
2632 		tree = parent->Tree();
2633 	}
2634 
2635 	if (parent != NULL) {
2636 		// the parent directory is locked during the whole inode creation
2637 		parent->WriteLockInTransaction(transaction);
2638 	}
2639 
2640 	if (parent != NULL && !volume->IsInitializing() && parent->IsContainer()) {
2641 		// don't create anything in removed directories
2642 		bool removed;
2643 		if (get_vnode_removed(volume->FSVolume(), parent->ID(), &removed)
2644 				== B_OK && removed) {
2645 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
2646 		}
2647 	}
2648 
2649 	if (tree != NULL) {
2650 		// Does the file already exist?
2651 		off_t offset;
2652 		if (tree->Find((uint8*)name, (uint16)strlen(name), &offset) == B_OK) {
2653 			// Return if the file should be a directory/link or opened in
2654 			// exclusive mode
2655 			if (S_ISDIR(mode) || S_ISLNK(mode) || (openMode & O_EXCL) != 0)
2656 				return B_FILE_EXISTS;
2657 
2658 			Vnode vnode(volume, offset);
2659 			Inode* inode;
2660 			status_t status = vnode.Get(&inode);
2661 			if (status != B_OK) {
2662 				REPORT_ERROR(status);
2663 				return B_ENTRY_NOT_FOUND;
2664 			}
2665 
2666 			if (inode->IsDirectory() && (openMode & O_RWMASK) != O_RDONLY)
2667 				return B_IS_A_DIRECTORY;
2668 			if ((openMode & O_DIRECTORY) != 0 && !inode->IsDirectory())
2669 				return B_NOT_A_DIRECTORY;
2670 
2671 			// we want to open the file, so we should have the rights to do so
2672 			if (inode->CheckPermissions(open_mode_to_access(openMode)
2673 					| ((openMode & O_TRUNC) != 0 ? W_OK : 0)) != B_OK)
2674 				return B_NOT_ALLOWED;
2675 
2676 			if ((openMode & O_TRUNC) != 0) {
2677 				// truncate the existing file
2678 				inode->WriteLockInTransaction(transaction);
2679 
2680 				status_t status = inode->SetFileSize(transaction, 0);
2681 				if (status == B_OK)
2682 					status = inode->WriteBack(transaction);
2683 
2684 				if (status != B_OK)
2685 					return status;
2686 			}
2687 
2688 			if (_created)
2689 				*_created = false;
2690 			if (_id)
2691 				*_id = inode->ID();
2692 			if (_inode)
2693 				*_inode = inode;
2694 
2695 			// Only keep the vnode in memory if the _id or _inode pointer is
2696 			// provided
2697 			if (_id != NULL || _inode != NULL)
2698 				vnode.Keep();
2699 
2700 			return B_OK;
2701 		}
2702 	} else if (parent != NULL && (mode & S_ATTR_DIR) == 0) {
2703 		return B_BAD_VALUE;
2704 	} else if ((openMode & O_DIRECTORY) != 0) {
2705 		// TODO: we might need to return B_NOT_A_DIRECTORY here
2706 		return B_ENTRY_NOT_FOUND;
2707 	}
2708 
2709 	status_t status;
2710 
2711 	// do we have the power to create new files at all?
2712 	if (parent != NULL && (status = parent->CheckPermissions(W_OK)) != B_OK)
2713 		RETURN_ERROR(status);
2714 
2715 	// allocate space for the new inode
2716 	InodeAllocator allocator(transaction);
2717 	block_run run;
2718 	Inode* inode;
2719 	status = allocator.New(&parentRun, mode, publishFlags, run, vnodeOps,
2720 		&inode);
2721 	if (status < B_OK)
2722 		return status;
2723 
2724 	T(Create(inode, parent, name, mode, openMode, type));
2725 
2726 	// Initialize the parts of the bfs_inode structure that
2727 	// InodeAllocator::New() hasn't touched yet
2728 
2729 	bfs_inode* node = &inode->Node();
2730 
2731 	if (parent == NULL) {
2732 		// we set the parent to itself in this case
2733 		// (only happens for the root and indices node)
2734 		node->parent = run;
2735 	} else
2736 		node->parent = parentRun;
2737 
2738 	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
2739 	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent
2740 		? parent->Node().GroupID() : getegid());
2741 		// the group ID is inherited from the parent, if available
2742 
2743 	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
2744 
2745 	inode->WriteBack(transaction);
2746 		// make sure the initialized node is available to others
2747 
2748 	// only add the name to regular files, directories, or symlinks
2749 	// don't add it to attributes, or indices
2750 	if (tree && inode->IsRegularNode()
2751 		&& inode->SetName(transaction, name) != B_OK)
2752 		return B_ERROR;
2753 
2754 	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2755 	// a standard directory for files - not for attributes or indices)
2756 	if (inode->IsContainer()) {
2757 		status = allocator.CreateTree();
2758 		if (status != B_OK)
2759 			return status;
2760 	}
2761 
2762 	// Add a link to the inode from the parent, depending on its type
2763 	// (the vnode is not published yet, so it is safe to make the inode
2764 	// accessable to the file system here)
2765 	if (tree != NULL) {
2766 		status = tree->Insert(transaction, name, inode->ID());
2767 	} else if (parent != NULL && (mode & S_ATTR_DIR) != 0) {
2768 		parent->Attributes() = run;
2769 		status = parent->WriteBack(transaction);
2770 	}
2771 
2772 	// Note, we only care if the inode could be made accessable for the
2773 	// two cases above; the root node or the indices root node must
2774 	// handle this case on their own (or other cases where "parent" is
2775 	// NULL)
2776 	if (status != B_OK)
2777 		RETURN_ERROR(status);
2778 
2779 	// Update the main indices (name, size & last_modified)
2780 	// (live queries might want to access us after this)
2781 
2782 	Index index(volume);
2783 	if (inode->InNameIndex() && name != NULL) {
2784 		// the name index only contains regular files
2785 		// (but not the root node where name == NULL)
2786 		status = index.InsertName(transaction, name, inode);
2787 		if (status != B_OK && status != B_BAD_INDEX) {
2788 			// We have to remove the node from the parent at this point,
2789 			// because the InodeAllocator destructor can't handle this
2790 			// case (and if it fails, we can't do anything about it...)
2791 			if (tree)
2792 				tree->Remove(transaction, name, inode->ID());
2793 			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2794 				parent->Node().attributes.SetTo(0, 0, 0);
2795 
2796 			RETURN_ERROR(status);
2797 		}
2798 	}
2799 
2800 	if (parent != NULL && parent->IsContainer())
2801 		parent->ContainerContentsChanged(transaction);
2802 
2803 	inode->UpdateOldLastModified();
2804 
2805 	// The "size" & "last_modified" indices don't contain directories.
2806 	// If adding to these indices fails, the inode creation will not be
2807 	// harmed; they are considered less important than the "name" index.
2808 	if (inode->InSizeIndex())
2809 		index.InsertSize(transaction, inode);
2810 	if (inode->InLastModifiedIndex())
2811 		index.InsertLastModified(transaction, inode);
2812 
2813 	if (inode->NeedsFileCache()) {
2814 		inode->SetFileCache(file_cache_create(volume->ID(), inode->ID(),
2815 			inode->Size()));
2816 		inode->SetMap(file_map_create(volume->ID(), inode->ID(),
2817 			inode->Size()));
2818 
2819 		if (inode->FileCache() == NULL || inode->Map() == NULL)
2820 			return B_NO_MEMORY;
2821 	}
2822 
2823 	// Everything worked well until this point, we have a fully
2824 	// initialized inode, and we want to keep it
2825 	allocator.Keep(vnodeOps, publishFlags);
2826 
2827 	if (_created)
2828 		*_created = true;
2829 	if (_id != NULL)
2830 		*_id = inode->ID();
2831 	if (_inode != NULL)
2832 		*_inode = inode;
2833 
2834 	// if either _id or _inode is passed, we will keep the inode locked
2835 	if (_id == NULL && _inode == NULL)
2836 		put_vnode(volume->FSVolume(), inode->ID());
2837 
2838 	return B_OK;
2839 }
2840 
2841 
2842 //	#pragma mark - AttributeIterator
2843 
2844 
2845 AttributeIterator::AttributeIterator(Inode* inode)
2846 	:
2847 	fCurrentSmallData(0),
2848 	fInode(inode),
2849 	fAttributes(NULL),
2850 	fIterator(NULL),
2851 	fBuffer(NULL)
2852 {
2853 	inode->_AddIterator(this);
2854 }
2855 
2856 
2857 AttributeIterator::~AttributeIterator()
2858 {
2859 	if (fAttributes)
2860 		put_vnode(fAttributes->GetVolume()->FSVolume(), fAttributes->ID());
2861 
2862 	delete fIterator;
2863 	fInode->_RemoveIterator(this);
2864 }
2865 
2866 
2867 status_t
2868 AttributeIterator::Rewind()
2869 {
2870 	fCurrentSmallData = 0;
2871 
2872 	if (fIterator != NULL)
2873 		fIterator->Rewind();
2874 
2875 	return B_OK;
2876 }
2877 
2878 
2879 status_t
2880 AttributeIterator::GetNext(char* name, size_t* _length, uint32* _type,
2881 	ino_t* _id)
2882 {
2883 	// read attributes out of the small data section
2884 
2885 	if (fCurrentSmallData >= 0) {
2886 		NodeGetter nodeGetter(fInode->GetVolume());
2887 		status_t status = nodeGetter.SetTo(fInode);
2888 		if (status != B_OK)
2889 			return status;
2890 
2891 		const bfs_inode* node = nodeGetter.Node();
2892 		const small_data* item = ((bfs_inode*)node)->SmallDataStart();
2893 
2894 		RecursiveLocker _(&fInode->SmallDataLock());
2895 
2896 		int32 index = 0;
2897 		for (; !item->IsLast(node); item = item->Next(), index++) {
2898 			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2899 				&& *item->Name() == FILE_NAME_NAME)
2900 				continue;
2901 
2902 			if (index >= fCurrentSmallData)
2903 				break;
2904 		}
2905 
2906 		if (!item->IsLast(node)) {
2907 			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2908 			*_type = item->Type();
2909 			*_length = item->NameSize();
2910 			*_id = (ino_t)index;
2911 
2912 			fCurrentSmallData = index + 1;
2913 			return B_OK;
2914 		}
2915 
2916 		// stop traversing the small_data section
2917 		fCurrentSmallData = -1;
2918 	}
2919 
2920 	// read attributes out of the attribute directory
2921 
2922 	if (fInode->Attributes().IsZero())
2923 		return B_ENTRY_NOT_FOUND;
2924 
2925 	Volume* volume = fInode->GetVolume();
2926 
2927 	// if you haven't yet access to the attributes directory, get it
2928 	if (fAttributes == NULL) {
2929 		if (get_vnode(volume->FSVolume(), volume->ToVnode(fInode->Attributes()),
2930 				(void**)&fAttributes) != B_OK) {
2931 			FATAL(("get_vnode() failed in AttributeIterator::GetNext(ino_t"
2932 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2933 			return B_ENTRY_NOT_FOUND;
2934 		}
2935 
2936 		BPlusTree* tree = fAttributes->Tree();
2937 		if (tree == NULL
2938 			|| (fIterator = new(std::nothrow) TreeIterator(tree)) == NULL) {
2939 			FATAL(("could not get tree in AttributeIterator::GetNext(ino_t"
2940 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2941 			return B_ENTRY_NOT_FOUND;
2942 		}
2943 	}
2944 
2945 	uint16 length;
2946 	ino_t id;
2947 	status_t status = fIterator->GetNextEntry(name, &length,
2948 		B_FILE_NAME_LENGTH, &id);
2949 	if (status != B_OK)
2950 		return status;
2951 
2952 	Vnode vnode(volume, id);
2953 	Inode* attribute;
2954 	if ((status = vnode.Get(&attribute)) == B_OK) {
2955 		*_type = attribute->Type();
2956 		*_length = length;
2957 		*_id = id;
2958 	}
2959 
2960 	return status;
2961 }
2962 
2963 
2964 void
2965 AttributeIterator::Update(uint16 index, int8 change)
2966 {
2967 	// fCurrentSmallData points already to the next item
2968 	if (index < fCurrentSmallData)
2969 		fCurrentSmallData += change;
2970 }
2971 
2972