xref: /haiku/src/add-ons/kernel/file_systems/bfs/Inode.cpp (revision 894526b51a3d931c423878fc0eb8da610fa1fb2a)
1 /*
2  * Copyright 2001-2014, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Inode access functions
8 
9 
10 #include "Debug.h"
11 #include "Inode.h"
12 #include "BPlusTree.h"
13 #include "Index.h"
14 
15 
16 #if BFS_TRACING && !defined(FS_SHELL) && !defined(_BOOT_MODE)
17 namespace BFSInodeTracing {
18 
19 class Create : public AbstractTraceEntry {
20 public:
21 	Create(Inode* inode, Inode* parent, const char* name, int32 mode,
22 			int openMode, uint32 type)
23 		:
24 		fInode(inode),
25 		fID(inode->ID()),
26 		fParent(parent),
27 		fParentID(parent != NULL ? parent->ID() : 0),
28 		fMode(mode),
29 		fOpenMode(openMode),
30 		fType(type)
31 	{
32 		if (name != NULL)
33 			strlcpy(fName, name, sizeof(fName));
34 		else
35 			fName[0] = '\0';
36 
37 		Initialized();
38 	}
39 
40 	virtual void AddDump(TraceOutput& out)
41 	{
42 		out.Print("bfs:Create %Ld (%p), parent %Ld (%p), \"%s\", "
43 			"mode %lx, omode %x, type %lx", fID, fInode, fParentID,
44 			fParent, fName, fMode, fOpenMode, fType);
45 	}
46 
47 private:
48 	Inode*	fInode;
49 	ino_t	fID;
50 	Inode*	fParent;
51 	ino_t	fParentID;
52 	char	fName[32];
53 	int32	fMode;
54 	int		fOpenMode;
55 	uint32	fType;
56 };
57 
58 class Remove : public AbstractTraceEntry {
59 public:
60 	Remove(Inode* inode, const char* name)
61 		:
62 		fInode(inode),
63 		fID(inode->ID())
64 	{
65 		strlcpy(fName, name, sizeof(fName));
66 		Initialized();
67 	}
68 
69 	virtual void AddDump(TraceOutput& out)
70 	{
71 		out.Print("bfs:Remove %Ld (%p), \"%s\"", fID, fInode, fName);
72 	}
73 
74 private:
75 	Inode*	fInode;
76 	ino_t	fID;
77 	char	fName[32];
78 };
79 
80 class Action : public AbstractTraceEntry {
81 public:
82 	Action(const char* action, Inode* inode)
83 		:
84 		fInode(inode),
85 		fID(inode->ID())
86 	{
87 		strlcpy(fAction, action, sizeof(fAction));
88 		Initialized();
89 	}
90 
91 	virtual void AddDump(TraceOutput& out)
92 	{
93 		out.Print("bfs:%s %Ld (%p)\n", fAction, fID, fInode);
94 	}
95 
96 private:
97 	Inode*	fInode;
98 	ino_t	fID;
99 	char	fAction[16];
100 };
101 
102 class Resize : public AbstractTraceEntry {
103 public:
104 	Resize(Inode* inode, off_t oldSize, off_t newSize, bool trim)
105 		:
106 		fInode(inode),
107 		fID(inode->ID()),
108 		fOldSize(oldSize),
109 		fNewSize(newSize),
110 		fTrim(trim)
111 	{
112 		Initialized();
113 	}
114 
115 	virtual void AddDump(TraceOutput& out)
116 	{
117 		out.Print("bfs:%s %Ld (%p), %Ld -> %Ld", fTrim ? "Trim" : "Resize",
118 			fID, fInode, fOldSize, fNewSize);
119 	}
120 
121 private:
122 	Inode*	fInode;
123 	ino_t	fID;
124 	off_t	fOldSize;
125 	off_t	fNewSize;
126 	bool	fTrim;
127 };
128 
129 }	// namespace BFSInodeTracing
130 
131 #	define T(x) new(std::nothrow) BFSInodeTracing::x;
132 #else
133 #	define T(x) ;
134 #endif
135 
136 
137 /*!	A helper class used by Inode::Create() to keep track of the belongings
138 	of an inode creation in progress.
139 	This class will make sure everything is cleaned up properly.
140 */
141 class InodeAllocator {
142 public:
143 							InodeAllocator(Transaction& transaction);
144 							~InodeAllocator();
145 
146 			status_t		New(block_run* parentRun, mode_t mode,
147 								block_run& run, fs_vnode_ops* vnodeOps,
148 								Inode** _inode);
149 			status_t		CreateTree();
150 			status_t		Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags);
151 
152 private:
153 	static	void			_TransactionListener(int32 id, int32 event,
154 								void* _inode);
155 
156 			Transaction*	fTransaction;
157 			block_run		fRun;
158 			Inode*			fInode;
159 };
160 
161 
162 InodeAllocator::InodeAllocator(Transaction& transaction)
163 	:
164 	fTransaction(&transaction),
165 	fInode(NULL)
166 {
167 }
168 
169 
170 InodeAllocator::~InodeAllocator()
171 {
172 	if (fTransaction != NULL) {
173 		Volume* volume = fTransaction->GetVolume();
174 
175 		if (fInode != NULL) {
176 			fInode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
177 				// this unblocks any pending bfs_read_vnode() calls
178 			fInode->Free(*fTransaction);
179 
180 			if (fInode->fTree != NULL)
181 				fTransaction->RemoveListener(fInode->fTree);
182 			fTransaction->RemoveListener(fInode);
183 
184 			remove_vnode(volume->FSVolume(), fInode->ID());
185 		} else
186 			volume->Free(*fTransaction, fRun);
187 	}
188 
189 	delete fInode;
190 }
191 
192 
193 status_t
194 InodeAllocator::New(block_run* parentRun, mode_t mode, block_run& run,
195 	fs_vnode_ops* vnodeOps, Inode** _inode)
196 {
197 	Volume* volume = fTransaction->GetVolume();
198 
199 	status_t status = volume->AllocateForInode(*fTransaction, parentRun, mode,
200 		fRun);
201 	if (status < B_OK) {
202 		// don't free the space in the destructor, because
203 		// the allocation failed
204 		fTransaction = NULL;
205 		RETURN_ERROR(status);
206 	}
207 
208 	run = fRun;
209 	fInode = new(std::nothrow) Inode(volume, *fTransaction,
210 		volume->ToVnode(run), mode, run);
211 	if (fInode == NULL)
212 		RETURN_ERROR(B_NO_MEMORY);
213 
214 	if (!volume->IsInitializing()) {
215 		status = new_vnode(volume->FSVolume(), fInode->ID(), fInode,
216 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps);
217 		if (status < B_OK) {
218 			delete fInode;
219 			fInode = NULL;
220 			RETURN_ERROR(status);
221 		}
222 	}
223 
224 	fInode->WriteLockInTransaction(*fTransaction);
225 	*_inode = fInode;
226 	return B_OK;
227 }
228 
229 
230 status_t
231 InodeAllocator::CreateTree()
232 {
233 	Volume* volume = fTransaction->GetVolume();
234 
235 	// force S_STR_INDEX to be set, if no type is set
236 	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
237 		fInode->Node().mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
238 
239 	BPlusTree* tree = new(std::nothrow) BPlusTree(*fTransaction, fInode);
240 	if (tree == NULL)
241 		return B_ERROR;
242 
243 	status_t status = tree->InitCheck();
244 	if (status != B_OK) {
245 		delete tree;
246 		return status;
247 	}
248 
249 	fInode->fTree = tree;
250 
251 	if (fInode->IsRegularNode()) {
252 		if (tree->Insert(*fTransaction, ".", fInode->ID()) < B_OK
253 			|| tree->Insert(*fTransaction, "..",
254 					volume->ToVnode(fInode->Parent())) < B_OK)
255 			return B_ERROR;
256 	}
257 	return B_OK;
258 }
259 
260 
261 status_t
262 InodeAllocator::Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags)
263 {
264 	ASSERT(fInode != NULL && fTransaction != NULL);
265 	Volume* volume = fTransaction->GetVolume();
266 
267 	status_t status = fInode->WriteBack(*fTransaction);
268 	if (status < B_OK) {
269 		FATAL(("writing new inode %" B_PRIdINO " failed!\n", fInode->ID()));
270 		return status;
271 	}
272 
273 	// Symbolic links are not published -- the caller needs to do this once
274 	// the contents have been written.
275 	if (!fInode->IsSymLink() && volume->ID() >= 0) {
276 		status = publish_vnode(volume->FSVolume(), fInode->ID(), fInode,
277 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps, fInode->Mode(),
278 			publishFlags);
279 	}
280 
281 	if (status == B_OK) {
282 		cache_add_transaction_listener(volume->BlockCache(), fTransaction->ID(),
283 			TRANSACTION_ABORTED, &_TransactionListener, fInode);
284 	}
285 
286 	fTransaction = NULL;
287 	fInode = NULL;
288 
289 	return status;
290 }
291 
292 
293 /*static*/ void
294 InodeAllocator::_TransactionListener(int32 id, int32 event, void* _inode)
295 {
296 	Inode* inode = (Inode*)_inode;
297 
298 	if (event == TRANSACTION_ABORTED)
299 		panic("transaction %d aborted, inode %p still around!\n", (int)id, inode);
300 }
301 
302 
303 //	#pragma mark -
304 
305 
306 status_t
307 bfs_inode::InitCheck(Volume* volume) const
308 {
309 	if (Magic1() != INODE_MAGIC1
310 		|| !(Flags() & INODE_IN_USE)
311 		|| inode_num.Length() != 1
312 		// matches inode size?
313 		|| (uint32)InodeSize() != volume->InodeSize()
314 		// parent resides on disk?
315 		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
316 		|| parent.AllocationGroup() < 0
317 		|| parent.Start() > (1L << volume->AllocationGroupShift())
318 		|| parent.Length() != 1
319 		// attributes, too?
320 		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
321 		|| attributes.AllocationGroup() < 0
322 		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
323 		RETURN_ERROR(B_BAD_DATA);
324 
325 	if (Flags() & INODE_DELETED)
326 		return B_NOT_ALLOWED;
327 
328 	// TODO: Add some tests to check the integrity of the other stuff here,
329 	// especially for the data_stream!
330 
331 	return B_OK;
332 }
333 
334 
335 //	#pragma mark - Inode
336 
337 
338 Inode::Inode(Volume* volume, ino_t id)
339 	:
340 	fVolume(volume),
341 	fID(id),
342 	fTree(NULL),
343 	fAttributes(NULL),
344 	fCache(NULL),
345 	fMap(NULL)
346 {
347 	PRINT(("Inode::Inode(volume = %p, id = %Ld) @ %p\n", volume, id, this));
348 
349 	UpdateNodeFromDisk();
350 
351 	rw_lock_init(&fLock, "bfs inode");
352 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
353 
354 	// these two will help to maintain the indices
355 	fOldSize = Size();
356 	fOldLastModified = LastModified();
357 
358 	if (IsContainer())
359 		fTree = new(std::nothrow) BPlusTree(this);
360 	if (NeedsFileCache()) {
361 		SetFileCache(file_cache_create(fVolume->ID(), ID(), Size()));
362 		SetMap(file_map_create(volume->ID(), ID(), Size()));
363 	}
364 }
365 
366 
367 Inode::Inode(Volume* volume, Transaction& transaction, ino_t id, mode_t mode,
368 		block_run& run)
369 	:
370 	fVolume(volume),
371 	fID(id),
372 	fTree(NULL),
373 	fAttributes(NULL),
374 	fCache(NULL),
375 	fMap(NULL)
376 {
377 	PRINT(("Inode::Inode(volume = %p, transaction = %p, id = %Ld) @ %p\n",
378 		volume, &transaction, id, this));
379 
380 	rw_lock_init(&fLock, "bfs inode");
381 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
382 
383 	NodeGetter node(volume, transaction, this, true);
384 	memset(&fNode, 0, sizeof(bfs_inode));
385 
386 	// Initialize the bfs_inode structure -- it's not written back to disk
387 	// here, because it will be done so already when the inode could be
388 	// created completely.
389 
390 	Node().magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
391 	Node().inode_num = run;
392 	Node().mode = HOST_ENDIAN_TO_BFS_INT32(mode);
393 	Node().flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
394 
395 	Node().create_time = Node().last_modified_time = Node().status_change_time
396 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
397 
398 	Node().inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
399 
400 	// these two will help to maintain the indices
401 	fOldSize = Size();
402 	fOldLastModified = LastModified();
403 }
404 
405 
406 Inode::~Inode()
407 {
408 	PRINT(("Inode::~Inode() @ %p\n", this));
409 
410 	file_cache_delete(FileCache());
411 	file_map_delete(Map());
412 	delete fTree;
413 
414 	rw_lock_destroy(&fLock);
415 	recursive_lock_destroy(&fSmallDataLock);
416 }
417 
418 
419 status_t
420 Inode::InitCheck(bool checkNode) const
421 {
422 	// test inode magic and flags
423 	if (checkNode) {
424 		status_t status = Node().InitCheck(fVolume);
425 		if (status == B_BUSY)
426 			return B_BUSY;
427 
428 		if (status != B_OK) {
429 			FATAL(("inode at block %" B_PRIdOFF " corrupt!\n", BlockNumber()));
430 			RETURN_ERROR(B_BAD_DATA);
431 		}
432 	}
433 
434 	if (IsContainer()) {
435 		// inodes that have a B+tree
436 		if (fTree == NULL)
437 			RETURN_ERROR(B_NO_MEMORY);
438 
439 		status_t status = fTree->InitCheck();
440 		if (status != B_OK) {
441 			FATAL(("inode tree at block %" B_PRIdOFF " corrupt!\n",
442 				BlockNumber()));
443 			RETURN_ERROR(B_BAD_DATA);
444 		}
445 	}
446 
447 	if (NeedsFileCache() && (fCache == NULL || fMap == NULL))
448 		return B_NO_MEMORY;
449 
450 	return B_OK;
451 }
452 
453 
454 /*!	Adds this inode to the specified transaction. This means that the inode will
455 	be write locked until the transaction ended.
456 	To ensure that the inode will stay valid until that point, an extra reference
457 	is acquired to it as long as this transaction stays active.
458 */
459 void
460 Inode::WriteLockInTransaction(Transaction& transaction)
461 {
462 	// These flags can only change while holding the transaction lock
463 	if ((Flags() & INODE_IN_TRANSACTION) != 0)
464 		return;
465 
466 	// We share the same list link with the removed list, so we have to remove
467 	// the inode from that list here (and add it back when we no longer need it)
468 	if ((Flags() & INODE_DELETED) != 0)
469 		fVolume->RemovedInodes().Remove(this);
470 
471 	if (!fVolume->IsInitializing())
472 		acquire_vnode(fVolume->FSVolume(), ID());
473 
474 	rw_lock_write_lock(&Lock());
475 	Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
476 
477 	transaction.AddListener(this);
478 }
479 
480 
481 status_t
482 Inode::WriteBack(Transaction& transaction)
483 {
484 	NodeGetter node(fVolume, transaction, this);
485 	if (node.WritableNode() == NULL)
486 		return B_IO_ERROR;
487 
488 	memcpy(node.WritableNode(), &Node(), sizeof(bfs_inode));
489 	return B_OK;
490 }
491 
492 
493 void
494 Inode::UpdateNodeFromDisk()
495 {
496 	NodeGetter node(fVolume, this);
497 	memcpy(&fNode, node.Node(), sizeof(bfs_inode));
498 	fNode.flags &= HOST_ENDIAN_TO_BFS_INT32(INODE_PERMANENT_FLAGS);
499 }
500 
501 
502 status_t
503 Inode::CheckPermissions(int accessMode) const
504 {
505 	// you never have write access to a read-only volume
506 	if ((accessMode & W_OK) != 0 && fVolume->IsReadOnly())
507 		return B_READ_ONLY_DEVICE;
508 
509 	return check_access_permissions(accessMode, Mode(), (gid_t)fNode.GroupID(),
510 		(uid_t)fNode.UserID());
511 }
512 
513 
514 //	#pragma mark - attributes
515 
516 
517 void
518 Inode::_AddIterator(AttributeIterator* iterator)
519 {
520 	RecursiveLocker _(fSmallDataLock);
521 	fIterators.Add(iterator);
522 }
523 
524 
525 void
526 Inode::_RemoveIterator(AttributeIterator* iterator)
527 {
528 	RecursiveLocker _(fSmallDataLock);
529 	fIterators.Remove(iterator);
530 }
531 
532 
533 /*!	Tries to free up "bytes" space in the small_data section by moving
534 	attributes to real files. Used for system attributes like the name.
535 	You need to hold the fSmallDataLock when you call this method
536 */
537 status_t
538 Inode::_MakeSpaceForSmallData(Transaction& transaction, bfs_inode* node,
539 	const char* name, int32 bytes)
540 {
541 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
542 
543 	while (bytes > 0) {
544 		small_data* item = node->SmallDataStart();
545 		small_data* max = NULL;
546 		int32 index = 0, maxIndex = 0;
547 		for (; !item->IsLast(node); item = item->Next(), index++) {
548 			// should not remove those
549 			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
550 				continue;
551 
552 			if (max == NULL || max->Size() < item->Size()) {
553 				maxIndex = index;
554 				max = item;
555 			}
556 
557 			// Remove the first one large enough to free the needed amount of
558 			// bytes
559 			if (bytes < (int32)item->Size())
560 				break;
561 		}
562 
563 		if (item->IsLast(node) || (int32)item->Size() < bytes)
564 			return B_ERROR;
565 
566 		bytes -= max->Size();
567 
568 		// Move the attribute to a real attribute file
569 		// Luckily, this doesn't cause any index updates
570 
571 		Inode* attribute;
572 		status_t status = CreateAttribute(transaction, item->Name(),
573 			item->Type(), &attribute);
574 		if (status != B_OK)
575 			RETURN_ERROR(status);
576 
577 		size_t length = item->DataSize();
578 		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
579 
580 		ReleaseAttribute(attribute);
581 
582 		if (status != B_OK) {
583 			Vnode vnode(fVolume, Attributes());
584 			Inode* attributes;
585 			if (vnode.Get(&attributes) < B_OK
586 				|| attributes->Remove(transaction, name) < B_OK) {
587 				FATAL(("Could not remove newly created attribute!\n"));
588 			}
589 
590 			RETURN_ERROR(status);
591 		}
592 
593 		_RemoveSmallData(node, max, maxIndex);
594 	}
595 	return B_OK;
596 }
597 
598 
599 /*!	Private function which removes the given attribute from the small_data
600 	section.
601 	You need to hold the fSmallDataLock when you call this method
602 */
603 status_t
604 Inode::_RemoveSmallData(bfs_inode* node, small_data* item, int32 index)
605 {
606 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
607 
608 	small_data* next = item->Next();
609 	if (!next->IsLast(node)) {
610 		// find the last attribute
611 		small_data* last = next;
612 		while (!last->IsLast(node))
613 			last = last->Next();
614 
615 		int32 size = (uint8*)last - (uint8*)next;
616 		if (size < 0
617 			|| size > (uint8*)node + fVolume->BlockSize() - (uint8*)next)
618 			return B_BAD_DATA;
619 
620 		memmove(item, next, size);
621 
622 		// Move the "last" one to its new location and
623 		// correctly terminate the small_data section
624 		last = (small_data*)((uint8*)last - ((uint8*)next - (uint8*)item));
625 		memset(last, 0, (uint8*)node + fVolume->BlockSize() - (uint8*)last);
626 	} else
627 		memset(item, 0, item->Size());
628 
629 	// update all current iterators
630 	SinglyLinkedList<AttributeIterator>::Iterator iterator
631 		= fIterators.GetIterator();
632 	while (iterator.HasNext()) {
633 		iterator.Next()->Update(index, -1);
634 	}
635 
636 	return B_OK;
637 }
638 
639 
640 //!	Removes the given attribute from the small_data section.
641 status_t
642 Inode::_RemoveSmallData(Transaction& transaction, NodeGetter& nodeGetter,
643 	const char* name)
644 {
645 	if (name == NULL)
646 		return B_BAD_VALUE;
647 
648 	bfs_inode* node = nodeGetter.WritableNode();
649 	RecursiveLocker locker(fSmallDataLock);
650 
651 	// search for the small_data item
652 
653 	small_data* item = node->SmallDataStart();
654 	int32 index = 0;
655 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
656 		item = item->Next();
657 		index++;
658 	}
659 
660 	if (item->IsLast(node))
661 		return B_ENTRY_NOT_FOUND;
662 
663 	nodeGetter.MakeWritable(transaction);
664 
665 	status_t status = _RemoveSmallData(node, item, index);
666 	if (status == B_OK) {
667 		Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
668 			bfs_inode::ToInode(real_time_clock_usecs()));
669 
670 		status = WriteBack(transaction);
671 	}
672 
673 	return status;
674 }
675 
676 
677 /*!	Try to place the given attribute in the small_data section - if the
678 	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
679 	In that case, the attribute should be written to a real attribute file;
680 	it's the caller's responsibility to remove any existing attributes in the
681 	small data section if that's the case.
682 
683 	Note that you need to write back the inode yourself after having called that
684 	method - it's a bad API decision that it needs a transaction but enforces
685 	you to write back the inode all by yourself, but it's just more efficient
686 	in most cases...
687 */
688 status_t
689 Inode::_AddSmallData(Transaction& transaction, NodeGetter& nodeGetter,
690 	const char* name, uint32 type, off_t pos, const uint8* data, size_t length,
691 	bool force)
692 {
693 	bfs_inode* node = nodeGetter.WritableNode();
694 
695 	if (node == NULL || name == NULL || data == NULL)
696 		return B_BAD_VALUE;
697 
698 	// reject any requests that can't fit into the small_data section
699 	uint32 nameLength = strlen(name);
700 	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + pos + length + 1;
701 	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
702 		return B_DEVICE_FULL;
703 
704 	nodeGetter.MakeWritable(transaction);
705 	RecursiveLocker locker(fSmallDataLock);
706 
707 	// Find the last item or one with the same name we have to add
708 	small_data* item = node->SmallDataStart();
709 	int32 index = 0;
710 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
711 		item = item->Next();
712 		index++;
713 	}
714 
715 	// is the attribute already in the small_data section?
716 	// then just replace the data part of that one
717 	if (!item->IsLast(node)) {
718 		// find last attribute
719 		small_data* last = item;
720 		while (!last->IsLast(node))
721 			last = last->Next();
722 
723 		// try to change the attributes value
724 		if (item->data_size > pos + length
725 			|| force
726 			|| ((uint8*)last + pos + length - item->DataSize())
727 					<= ((uint8*)node + fVolume->InodeSize())) {
728 			// Make room for the new attribute if needed (and we are forced
729 			// to do so)
730 			if (force && ((uint8*)last + pos + length - item->DataSize())
731 					> ((uint8*)node + fVolume->InodeSize())) {
732 				// We also take the free space at the end of the small_data
733 				// section into account, and request only what's really needed
734 				uint32 needed = pos + length - item->DataSize() -
735 					(uint32)((uint8*)node + fVolume->InodeSize()
736 						- (uint8*)last);
737 
738 				if (_MakeSpaceForSmallData(transaction, node, name, needed)
739 						!= B_OK)
740 					return B_ERROR;
741 
742 				// reset our pointers
743 				item = node->SmallDataStart();
744 				index = 0;
745 				while (!item->IsLast(node) && strcmp(item->Name(), name)) {
746 					item = item->Next();
747 					index++;
748 				}
749 
750 				last = item;
751 				while (!last->IsLast(node))
752 					last = last->Next();
753 			}
754 
755 			size_t oldDataSize = item->DataSize();
756 
757 			// Normally, we can just overwrite the attribute data as the size
758 			// is specified by the type and does not change that often
759 			if (pos + length != item->DataSize()) {
760 				// move the attributes after the current one
761 				small_data* next = item->Next();
762 				if (!next->IsLast(node)) {
763 					memmove((uint8*)item + spaceNeeded, next,
764 						(uint8*)last - (uint8*)next);
765 				}
766 
767 				// Move the "last" one to its new location and
768 				// correctly terminate the small_data section
769 				last = (small_data*)((uint8*)last
770 					- ((uint8*)next - ((uint8*)item + spaceNeeded)));
771 				if ((uint8*)last < (uint8*)node + fVolume->BlockSize()) {
772 					memset(last, 0, (uint8*)node + fVolume->BlockSize()
773 						- (uint8*)last);
774 				}
775 
776 				item->data_size = HOST_ENDIAN_TO_BFS_INT16(pos + length);
777 			}
778 
779 			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
780 
781 			if ((uint64)oldDataSize < (uint64)pos) {
782 				// Fill gap with zeros
783 				memset(item->Data() + oldDataSize, 0, pos - oldDataSize);
784 			}
785 			memcpy(item->Data() + pos, data, length);
786 			item->Data()[pos + length] = '\0';
787 
788 			return B_OK;
789 		}
790 
791 		return B_DEVICE_FULL;
792 	}
793 
794 	// try to add the new attribute!
795 
796 	if ((uint8*)item + spaceNeeded > (uint8*)node + fVolume->InodeSize()) {
797 		// there is not enough space for it!
798 		if (!force)
799 			return B_DEVICE_FULL;
800 
801 		// make room for the new attribute
802 		if (_MakeSpaceForSmallData(transaction, node, name, spaceNeeded) < B_OK)
803 			return B_ERROR;
804 
805 		// get new last item!
806 		item = node->SmallDataStart();
807 		index = 0;
808 		while (!item->IsLast(node)) {
809 			item = item->Next();
810 			index++;
811 		}
812 	}
813 
814 	memset(item, 0, spaceNeeded);
815 	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
816 	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
817 	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
818 	strcpy(item->Name(), name);
819 	memcpy(item->Data() + pos, data, length);
820 
821 	// correctly terminate the small_data section
822 	item = item->Next();
823 	if (!item->IsLast(node))
824 		memset(item, 0, (uint8*)node + fVolume->InodeSize() - (uint8*)item);
825 
826 	// update all current iterators
827 	SinglyLinkedList<AttributeIterator>::Iterator iterator
828 		= fIterators.GetIterator();
829 	while (iterator.HasNext()) {
830 		iterator.Next()->Update(index, 1);
831 	}
832 
833 	return B_OK;
834 }
835 
836 
837 /*!	Iterates through the small_data section of an inode.
838 	To start at the beginning of this section, you let smallData
839 	point to NULL, like:
840 		small_data* data = NULL;
841 		while (inode->GetNextSmallData(&data) { ... }
842 
843 	This function is reentrant and doesn't allocate any memory;
844 	you can safely stop calling it at any point (you don't need
845 	to iterate through the whole list).
846 	You need to hold the fSmallDataLock when you call this method
847 */
848 status_t
849 Inode::_GetNextSmallData(bfs_inode* node, small_data** _smallData) const
850 {
851 	if (node == NULL)
852 		RETURN_ERROR(B_BAD_VALUE);
853 
854 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
855 
856 	small_data* data = *_smallData;
857 
858 	// begin from the start?
859 	if (data == NULL)
860 		data = node->SmallDataStart();
861 	else
862 		data = data->Next();
863 
864 	// is already last item?
865 	if (data->IsLast(node))
866 		return B_ENTRY_NOT_FOUND;
867 
868 	*_smallData = data;
869 
870 	return B_OK;
871 }
872 
873 
874 /*!	Finds the attribute "name" in the small data section, and
875 	returns a pointer to it (or NULL if it doesn't exist).
876 	You need to hold the fSmallDataLock when you call this method
877 */
878 small_data*
879 Inode::FindSmallData(const bfs_inode* node, const char* name) const
880 {
881 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
882 
883 	small_data* smallData = NULL;
884 	while (_GetNextSmallData(const_cast<bfs_inode*>(node), &smallData)
885 			== B_OK) {
886 		if (!strcmp(smallData->Name(), name))
887 			return smallData;
888 	}
889 	return NULL;
890 }
891 
892 
893 /*!	Returns a pointer to the node's name if present in the small data
894 	section, NULL otherwise.
895 	You need to hold the fSmallDataLock when you call this method
896 */
897 const char*
898 Inode::Name(const bfs_inode* node) const
899 {
900 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
901 
902 	small_data* smallData = NULL;
903 	while (_GetNextSmallData((bfs_inode*)node, &smallData) == B_OK) {
904 		if (*smallData->Name() == FILE_NAME_NAME
905 			&& smallData->NameSize() == FILE_NAME_NAME_LENGTH)
906 			return (const char*)smallData->Data();
907 	}
908 	return NULL;
909 }
910 
911 
912 /*!	Copies the node's name into the provided buffer.
913 	The buffer should be B_FILE_NAME_LENGTH bytes large.
914 */
915 status_t
916 Inode::GetName(char* buffer, size_t size) const
917 {
918 	NodeGetter node(fVolume, this);
919 	RecursiveLocker locker(fSmallDataLock);
920 
921 	const char* name = Name(node.Node());
922 	if (name == NULL)
923 		return B_ENTRY_NOT_FOUND;
924 
925 	strlcpy(buffer, name, size);
926 	return B_OK;
927 }
928 
929 
930 /*!	Changes or set the name of a file: in the inode small_data section only, it
931 	doesn't change it in the parent directory's b+tree.
932 	Note that you need to write back the inode yourself after having called
933 	that method. It suffers from the same API decision as AddSmallData() does
934 	(and for the same reason).
935 */
936 status_t
937 Inode::SetName(Transaction& transaction, const char* name)
938 {
939 	if (name == NULL || *name == '\0')
940 		return B_BAD_VALUE;
941 
942 	NodeGetter node(fVolume, transaction, this);
943 	const char nameTag[2] = {FILE_NAME_NAME, 0};
944 
945 	return _AddSmallData(transaction, node, nameTag, FILE_NAME_TYPE, 0,
946 		(uint8*)name, strlen(name), true);
947 }
948 
949 
950 status_t
951 Inode::_RemoveAttribute(Transaction& transaction, const char* name,
952 	bool hasIndex, Index* index)
953 {
954 	// remove the attribute file if it exists
955 	Vnode vnode(fVolume, Attributes());
956 	Inode* attributes;
957 	status_t status = vnode.Get(&attributes);
958 	if (status < B_OK)
959 		return status;
960 
961 	// update index
962 	if (index != NULL) {
963 		Inode* attribute;
964 		if ((hasIndex || fVolume->CheckForLiveQuery(name))
965 			&& GetAttribute(name, &attribute) == B_OK) {
966 			uint8 data[BPLUSTREE_MAX_KEY_LENGTH];
967 			size_t length = BPLUSTREE_MAX_KEY_LENGTH;
968 			if (attribute->ReadAt(0, data, &length) == B_OK) {
969 				index->Update(transaction, name, attribute->Type(), data,
970 					length, NULL, 0, this);
971 			}
972 
973 			ReleaseAttribute(attribute);
974 		}
975 	}
976 
977 	if ((status = attributes->Remove(transaction, name)) < B_OK)
978 		return status;
979 
980 	if (attributes->IsEmpty()) {
981 		attributes->WriteLockInTransaction(transaction);
982 
983 		// remove attribute directory (don't fail if that can't be done)
984 		if (remove_vnode(fVolume->FSVolume(), attributes->ID()) == B_OK) {
985 			// update the inode, so that no one will ever doubt it's deleted :-)
986 			attributes->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
987 			if (attributes->WriteBack(transaction) == B_OK) {
988 				Attributes().SetTo(0, 0, 0);
989 				WriteBack(transaction);
990 			} else {
991 				unremove_vnode(fVolume->FSVolume(), attributes->ID());
992 				attributes->Node().flags
993 					&= ~HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
994 			}
995 		}
996 	}
997 
998 	return status;
999 }
1000 
1001 
1002 /*!	Reads data from the specified attribute.
1003 	This is a high-level attribute function that understands attributes
1004 	in the small_data section as well as real attribute files.
1005 */
1006 status_t
1007 Inode::ReadAttribute(const char* name, int32 type, off_t pos, uint8* buffer,
1008 	size_t* _length)
1009 {
1010 	if (pos < 0)
1011 		pos = 0;
1012 
1013 	// search in the small_data section (which has to be locked first)
1014 	{
1015 		NodeGetter node(fVolume, this);
1016 		RecursiveLocker locker(fSmallDataLock);
1017 
1018 		small_data* smallData = FindSmallData(node.Node(), name);
1019 		if (smallData != NULL) {
1020 			size_t length = *_length;
1021 			if (pos >= smallData->data_size) {
1022 				*_length = 0;
1023 				return B_OK;
1024 			}
1025 			if (length + pos > smallData->DataSize())
1026 				length = smallData->DataSize() - pos;
1027 
1028 			memcpy(buffer, smallData->Data() + pos, length);
1029 			*_length = length;
1030 			return B_OK;
1031 		}
1032 	}
1033 
1034 	// search in the attribute directory
1035 	Inode* attribute;
1036 	status_t status = GetAttribute(name, &attribute);
1037 	if (status == B_OK) {
1038 		status = attribute->ReadAt(pos, (uint8*)buffer, _length);
1039 
1040 		ReleaseAttribute(attribute);
1041 	}
1042 
1043 	RETURN_ERROR(status);
1044 }
1045 
1046 
1047 /*!	Writes data to the specified attribute.
1048 	This is a high-level attribute function that understands attributes
1049 	in the small_data section as well as real attribute files.
1050 */
1051 status_t
1052 Inode::WriteAttribute(Transaction& transaction, const char* name, int32 type,
1053 	off_t pos, const uint8* buffer, size_t* _length, bool* _created)
1054 {
1055 	if (pos < 0)
1056 		return B_BAD_VALUE;
1057 
1058 	// needed to maintain the index
1059 	uint8 oldBuffer[BPLUSTREE_MAX_KEY_LENGTH];
1060 	uint8* oldData = NULL;
1061 	size_t oldLength = 0;
1062 	bool created = false;
1063 
1064 	// TODO: we actually depend on that the contents of "buffer" are constant.
1065 	// If they get changed during the write (hey, user programs), we may mess
1066 	// up our index trees!
1067 	// TODO: for attribute files, we need to log the first
1068 	// BPLUSTREE_MAX_KEY_LENGTH bytes of the data stream, or the same as above
1069 	// might happen.
1070 
1071 	Index index(fVolume);
1072 	bool hasIndex = index.SetTo(name) == B_OK;
1073 
1074 	Inode* attribute = NULL;
1075 	status_t status = B_OK;
1076 
1077 	if (GetAttribute(name, &attribute) != B_OK) {
1078 		// No attribute inode exists yet
1079 
1080 		// save the old attribute data
1081 		NodeGetter node(fVolume, transaction, this);
1082 		recursive_lock_lock(&fSmallDataLock);
1083 
1084 		small_data* smallData = FindSmallData(node.Node(), name);
1085 		if (smallData != NULL) {
1086 			oldLength = smallData->DataSize();
1087 			if (oldLength > 0) {
1088 				if (oldLength > BPLUSTREE_MAX_KEY_LENGTH)
1089 					oldLength = BPLUSTREE_MAX_KEY_LENGTH;
1090 				memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
1091 			}
1092 		} else
1093 			created = true;
1094 
1095 		recursive_lock_unlock(&fSmallDataLock);
1096 
1097 		// if the attribute doesn't exist yet (as a file), try to put it in the
1098 		// small_data section first - if that fails (due to insufficent space),
1099 		// create a real attribute file
1100 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1101 			*_length);
1102 		if (status == B_DEVICE_FULL) {
1103 			if (smallData != NULL) {
1104 				// remove the old attribute from the small data section - there
1105 				// is no space left for the new data
1106 				status = _RemoveSmallData(transaction, node, name);
1107 			} else
1108 				status = B_OK;
1109 
1110 			if (status == B_OK)
1111 				status = CreateAttribute(transaction, name, type, &attribute);
1112 			if (status != B_OK)
1113 				RETURN_ERROR(status);
1114 
1115 			created = true;
1116 		} else if (status == B_OK) {
1117 			// Update status time on attribute write
1118 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1119 				bfs_inode::ToInode(real_time_clock_usecs()));
1120 
1121 			status = WriteBack(transaction);
1122 		}
1123 	}
1124 
1125 	if (attribute != NULL) {
1126 		WriteLocker writeLocker(attribute->fLock);
1127 
1128 		if (hasIndex || fVolume->CheckForLiveQuery(name)) {
1129 			// Save the old attribute data (if this fails, oldLength will
1130 			// reflect it)
1131 			while (attribute->Size() > 0) {
1132 				bigtime_t oldModified = attribute->LastModified();
1133 				writeLocker.Unlock();
1134 
1135 				oldLength = BPLUSTREE_MAX_KEY_LENGTH;
1136 				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
1137 					oldData = oldBuffer;
1138 
1139 				writeLocker.Lock();
1140 
1141 				// Read until the data hasn't changed in between
1142 				if (oldModified == attribute->LastModified())
1143 					break;
1144 
1145 				oldLength = 0;
1146 			}
1147 		}
1148 
1149 		// check if the data fits into the small_data section again
1150 		NodeGetter node(fVolume, transaction, this);
1151 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1152 			*_length);
1153 
1154 		if (status == B_OK) {
1155 			// it does - remove its file
1156 			writeLocker.Unlock();
1157 			status = _RemoveAttribute(transaction, name, false, NULL);
1158 		} else {
1159 			// The attribute type might have been changed - we need to
1160 			// adopt the new one
1161 			attribute->Node().type = HOST_ENDIAN_TO_BFS_INT32(type);
1162 			status = attribute->WriteBack(transaction);
1163 			writeLocker.Unlock();
1164 
1165 			if (status == B_OK) {
1166 				status = attribute->WriteAt(transaction, pos, buffer,
1167 					_length);
1168 			}
1169 		}
1170 
1171 		if (status == B_OK) {
1172 			// Update status time on attribute write
1173 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1174 				bfs_inode::ToInode(real_time_clock_usecs()));
1175 
1176 			status = WriteBack(transaction);
1177 		}
1178 
1179 		attribute->WriteLockInTransaction(transaction);
1180 		ReleaseAttribute(attribute);
1181 	}
1182 
1183 	// TODO: find a better way than this "pos" thing (the begin of the old key
1184 	//	must be copied to the start of the new one for a comparison)
1185 	if (status == B_OK && pos == 0) {
1186 		// index only the first BPLUSTREE_MAX_KEY_LENGTH bytes
1187 		uint16 length = *_length;
1188 		if (length > BPLUSTREE_MAX_KEY_LENGTH)
1189 			length = BPLUSTREE_MAX_KEY_LENGTH;
1190 
1191 		// Update index. Note, Index::Update() may be called even if
1192 		// initializing the index failed - it will just update the live
1193 		// queries in this case
1194 		if (pos < length || (uint64)pos < (uint64)oldLength) {
1195 			index.Update(transaction, name, type, oldData, oldLength, buffer,
1196 				length, this);
1197 		}
1198 	}
1199 
1200 	if (_created != NULL)
1201 		*_created = created;
1202 
1203 	return status;
1204 }
1205 
1206 
1207 /*!	Removes the specified attribute from the inode.
1208 	This is a high-level attribute function that understands attributes
1209 	in the small_data section as well as real attribute files.
1210 */
1211 status_t
1212 Inode::RemoveAttribute(Transaction& transaction, const char* name)
1213 {
1214 	Index index(fVolume);
1215 	bool hasIndex = index.SetTo(name) == B_OK;
1216 	NodeGetter node(fVolume, this);
1217 
1218 	// update index for attributes in the small_data section
1219 	{
1220 		RecursiveLocker _(fSmallDataLock);
1221 
1222 		small_data* smallData = FindSmallData(node.Node(), name);
1223 		if (smallData != NULL) {
1224 			uint32 length = smallData->DataSize();
1225 			if (length > BPLUSTREE_MAX_KEY_LENGTH)
1226 				length = BPLUSTREE_MAX_KEY_LENGTH;
1227 			index.Update(transaction, name, smallData->Type(),
1228 				smallData->Data(), length, NULL, 0, this);
1229 		}
1230 	}
1231 
1232 	status_t status = _RemoveSmallData(transaction, node, name);
1233 	if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
1234 		// remove the attribute file if it exists
1235 		status = _RemoveAttribute(transaction, name, hasIndex, &index);
1236 		if (status == B_OK) {
1237 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1238 				bfs_inode::ToInode(real_time_clock_usecs()));
1239 			WriteBack(transaction);
1240 		}
1241 	}
1242 
1243 	return status;
1244 }
1245 
1246 
1247 /*!	Returns the attribute inode with the specified \a name, in case it exists.
1248 	This method can only return real attribute files; the attributes in the
1249 	small data section are ignored.
1250 */
1251 status_t
1252 Inode::GetAttribute(const char* name, Inode** _attribute)
1253 {
1254 	// does this inode even have attributes?
1255 	if (Attributes().IsZero())
1256 		return B_ENTRY_NOT_FOUND;
1257 
1258 	Vnode vnode(fVolume, Attributes());
1259 	Inode* attributes;
1260 	if (vnode.Get(&attributes) < B_OK) {
1261 		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n",
1262 			name));
1263 		return B_ERROR;
1264 	}
1265 
1266 	BPlusTree* tree = attributes->Tree();
1267 	if (tree == NULL)
1268 		return B_BAD_VALUE;
1269 
1270 	InodeReadLocker locker(attributes);
1271 
1272 	ino_t id;
1273 	status_t status = tree->Find((uint8*)name, (uint16)strlen(name), &id);
1274 	if (status == B_OK) {
1275 		Vnode vnode(fVolume, id);
1276 		Inode* inode;
1277 		// Check if the attribute is really an attribute
1278 		if (vnode.Get(&inode) != B_OK || !inode->IsAttribute())
1279 			return B_ERROR;
1280 
1281 		*_attribute = inode;
1282 		vnode.Keep();
1283 		return B_OK;
1284 	}
1285 
1286 	return status;
1287 }
1288 
1289 
1290 void
1291 Inode::ReleaseAttribute(Inode* attribute)
1292 {
1293 	if (attribute == NULL)
1294 		return;
1295 
1296 	put_vnode(fVolume->FSVolume(), attribute->ID());
1297 }
1298 
1299 
1300 status_t
1301 Inode::CreateAttribute(Transaction& transaction, const char* name, uint32 type,
1302 	Inode** attribute)
1303 {
1304 	// do we need to create the attribute directory first?
1305 	if (Attributes().IsZero()) {
1306 		status_t status = Inode::Create(transaction, this, NULL,
1307 			S_ATTR_DIR | S_DIRECTORY | 0666, 0, 0, NULL);
1308 		if (status < B_OK)
1309 			RETURN_ERROR(status);
1310 	}
1311 	Vnode vnode(fVolume, Attributes());
1312 	Inode* attributes;
1313 	if (vnode.Get(&attributes) < B_OK)
1314 		return B_ERROR;
1315 
1316 	// Inode::Create() locks the inode for us
1317 	return Inode::Create(transaction, attributes, name,
1318 		S_ATTR | S_FILE | 0666, 0, type, NULL, NULL, attribute);
1319 }
1320 
1321 
1322 //	#pragma mark - directory tree
1323 
1324 
1325 bool
1326 Inode::IsEmpty()
1327 {
1328 	TreeIterator iterator(fTree);
1329 
1330 	// index and attribute directories are really empty when they are
1331 	// empty - directories for standard files always contain ".", and
1332 	// "..", so we need to ignore those two
1333 
1334 	uint32 count = 0;
1335 	char name[BPLUSTREE_MAX_KEY_LENGTH];
1336 	uint16 length;
1337 	ino_t id;
1338 	while (iterator.GetNextEntry(name, &length, B_FILE_NAME_LENGTH,
1339 			&id) == B_OK) {
1340 		if (Mode() & (S_ATTR_DIR | S_INDEX_DIR))
1341 			return false;
1342 
1343 		if (++count > 2 || (strcmp(".", name) && strcmp("..", name)))
1344 			return false;
1345 	}
1346 	return true;
1347 }
1348 
1349 
1350 status_t
1351 Inode::ContainerContentsChanged(Transaction& transaction)
1352 {
1353 	ASSERT(!InLastModifiedIndex());
1354 
1355 	Node().last_modified_time = Node().status_change_time
1356 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1357 
1358 	return WriteBack(transaction);
1359 }
1360 
1361 
1362 //	#pragma mark - data stream
1363 
1364 
1365 /*!	Computes the number of bytes this inode occupies in the file system.
1366 	This includes the file meta blocks used for maintaining its data stream.
1367 
1368 	TODO: However, the attributes in extra files are not really accounted for;
1369 	depending on the speed penalty, this should be changed, though (the value
1370 	could be cached in the inode structure or Inode object, though).
1371 */
1372 off_t
1373 Inode::AllocatedSize() const
1374 {
1375 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0) {
1376 		// This symlink does not have a data stream
1377 		return Node().InodeSize();
1378 	}
1379 
1380 	const data_stream& data = Node().data;
1381 	uint32 blockSize = fVolume->BlockSize();
1382 	off_t size = blockSize;
1383 
1384 	if (data.MaxDoubleIndirectRange() != 0) {
1385 		off_t doubleIndirectSize = data.MaxDoubleIndirectRange()
1386 			- data.MaxIndirectRange();
1387 		int32 indirectSize = double_indirect_max_indirect_size(
1388 			data.double_indirect.Length(), fVolume->BlockSize());
1389 
1390 		size += (2 * data.double_indirect.Length()
1391 				+ doubleIndirectSize / indirectSize)
1392 			* blockSize + data.MaxDoubleIndirectRange();
1393 	} else if (data.MaxIndirectRange() != 0)
1394 		size += data.indirect.Length() + data.MaxIndirectRange();
1395 	else
1396 		size += data.MaxDirectRange();
1397 
1398 	if (!Node().attributes.IsZero()) {
1399 		// TODO: to make this exact, we'd had to count all attributes
1400 		size += 2 * blockSize;
1401 			// 2 blocks, one for the attributes inode, one for its B+tree
1402 	}
1403 
1404 	return size;
1405 }
1406 
1407 
1408 /*!	Finds the block_run where "pos" is located in the data_stream of
1409 	the inode.
1410 	If successful, "offset" will then be set to the file offset
1411 	of the block_run returned; so "pos - offset" is for the block_run
1412 	what "pos" is for the whole stream.
1413 	The caller has to make sure that "pos" is inside the stream.
1414 */
1415 status_t
1416 Inode::FindBlockRun(off_t pos, block_run& run, off_t& offset)
1417 {
1418 	data_stream* data = &Node().data;
1419 
1420 	// find matching block run
1421 
1422 	if (data->MaxIndirectRange() > 0 && pos >= data->MaxDirectRange()) {
1423 		if (data->MaxDoubleIndirectRange() > 0
1424 			&& pos >= data->MaxIndirectRange()) {
1425 			// access to double indirect blocks
1426 
1427 			CachedBlock cached(fVolume);
1428 
1429 			int32 runsPerBlock;
1430 			int32 directSize;
1431 			int32 indirectSize;
1432 			get_double_indirect_sizes(data->double_indirect.Length(),
1433 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1434 
1435 			off_t start = pos - data->MaxIndirectRange();
1436 			int32 index = start / indirectSize;
1437 
1438 			block_run* indirect = (block_run*)cached.SetTo(
1439 				fVolume->ToBlock(data->double_indirect) + index / runsPerBlock);
1440 			if (indirect == NULL)
1441 				RETURN_ERROR(B_ERROR);
1442 
1443 			int32 current = (start % indirectSize) / directSize;
1444 
1445 			indirect = (block_run*)cached.SetTo(
1446 				fVolume->ToBlock(indirect[index % runsPerBlock])
1447 				+ current / runsPerBlock);
1448 			if (indirect == NULL)
1449 				RETURN_ERROR(B_ERROR);
1450 
1451 			run = indirect[current % runsPerBlock];
1452 			if (run.Length() != data->double_indirect.Length())
1453 				RETURN_ERROR(B_BAD_DATA);
1454 
1455 			offset = data->MaxIndirectRange() + (index * indirectSize)
1456 				+ (current * directSize);
1457 		} else {
1458 			// access to indirect blocks
1459 
1460 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1461 			off_t runBlockEnd = data->MaxDirectRange();
1462 
1463 			CachedBlock cached(fVolume);
1464 			off_t block = fVolume->ToBlock(data->indirect);
1465 
1466 			for (int32 i = 0; i < data->indirect.Length(); i++) {
1467 				block_run* indirect = (block_run*)cached.SetTo(block + i);
1468 				if (indirect == NULL)
1469 					RETURN_ERROR(B_IO_ERROR);
1470 
1471 				int32 current = -1;
1472 				while (++current < runsPerBlock) {
1473 					if (indirect[current].IsZero())
1474 						break;
1475 
1476 					runBlockEnd += (uint32)indirect[current].Length()
1477 						<< cached.BlockShift();
1478 					if (runBlockEnd > pos) {
1479 						run = indirect[current];
1480 						offset = runBlockEnd - ((uint32)run.Length()
1481 							<< cached.BlockShift());
1482 						return fVolume->ValidateBlockRun(run);
1483 					}
1484 				}
1485 			}
1486 			RETURN_ERROR(B_ERROR);
1487 		}
1488 	} else {
1489 		// access from direct blocks
1490 
1491 		off_t runBlockEnd = 0LL;
1492 		int32 current = -1;
1493 
1494 		while (++current < NUM_DIRECT_BLOCKS) {
1495 			if (data->direct[current].IsZero())
1496 				break;
1497 
1498 			runBlockEnd += (uint32)data->direct[current].Length()
1499 				<< fVolume->BlockShift();
1500 			if (runBlockEnd > pos) {
1501 				run = data->direct[current];
1502 				offset = runBlockEnd
1503 					- ((uint32)run.Length() << fVolume->BlockShift());
1504 				return fVolume->ValidateBlockRun(run);
1505 			}
1506 		}
1507 
1508 		return B_ENTRY_NOT_FOUND;
1509 	}
1510 	return fVolume->ValidateBlockRun(run);
1511 }
1512 
1513 
1514 status_t
1515 Inode::ReadAt(off_t pos, uint8* buffer, size_t* _length)
1516 {
1517 	size_t length = *_length;
1518 
1519 	// set/check boundaries for pos/length
1520 	if (pos < 0)
1521 		return B_BAD_VALUE;
1522 
1523 	InodeReadLocker locker(this);
1524 
1525 	if (pos >= Size() || length == 0) {
1526 		*_length = 0;
1527 		return B_NO_ERROR;
1528 	}
1529 
1530 	locker.Unlock();
1531 
1532 	return file_cache_read(FileCache(), NULL, pos, buffer, _length);
1533 }
1534 
1535 
1536 status_t
1537 Inode::WriteAt(Transaction& transaction, off_t pos, const uint8* buffer,
1538 	size_t* _length)
1539 {
1540 	InodeReadLocker locker(this);
1541 
1542 	// update the last modification time in memory, it will be written
1543 	// back to the inode, and the index when the file is closed
1544 	// TODO: should update the internal last modified time only at this point!
1545 	Node().last_modified_time = Node().status_change_time
1546 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1547 
1548 	// TODO: support INODE_LOGGED!
1549 
1550 	size_t length = *_length;
1551 	bool changeSize = (uint64)pos + (uint64)length > (uint64)Size();
1552 
1553 	// set/check boundaries for pos/length
1554 	if (pos < 0)
1555 		return B_BAD_VALUE;
1556 
1557 	locker.Unlock();
1558 
1559 	// the transaction doesn't have to be started already
1560 	if (changeSize && !transaction.IsStarted())
1561 		transaction.Start(fVolume, BlockNumber());
1562 
1563 	WriteLocker writeLocker(fLock);
1564 
1565 	// Work around possible race condition: Someone might have shrunken the file
1566 	// while we had no lock.
1567 	if (!transaction.IsStarted()
1568 		&& (uint64)pos + (uint64)length > (uint64)Size()) {
1569 		writeLocker.Unlock();
1570 		transaction.Start(fVolume, BlockNumber());
1571 		writeLocker.Lock();
1572 	}
1573 
1574 	off_t oldSize = Size();
1575 
1576 	if ((uint64)pos + (uint64)length > (uint64)oldSize) {
1577 		// let's grow the data stream to the size needed
1578 		status_t status = SetFileSize(transaction, pos + length);
1579 		if (status != B_OK) {
1580 			*_length = 0;
1581 			WriteLockInTransaction(transaction);
1582 			RETURN_ERROR(status);
1583 		}
1584 		// TODO: In theory we would need to update the file size
1585 		// index here as part of the current transaction - this might just
1586 		// be a bit too expensive, but worth a try.
1587 
1588 		// we need to write back the inode here because it has to
1589 		// go into this transaction (we cannot wait until the file
1590 		// is closed)
1591 		status = WriteBack(transaction);
1592 		if (status != B_OK) {
1593 			WriteLockInTransaction(transaction);
1594 			return status;
1595 		}
1596 	}
1597 
1598 	writeLocker.Unlock();
1599 
1600 	if (oldSize < pos)
1601 		FillGapWithZeros(oldSize, pos);
1602 
1603 	// If we don't want to write anything, we can now return (we may
1604 	// just have changed the file size using the position parameter)
1605 	if (length == 0)
1606 		return B_OK;
1607 
1608 	status_t status = file_cache_write(FileCache(), NULL, pos, buffer, _length);
1609 
1610 	if (transaction.IsStarted())
1611 		WriteLockInTransaction(transaction);
1612 
1613 	return status;
1614 }
1615 
1616 
1617 /*!	Fills the gap between the old file size and the new file size
1618 	with zeros.
1619 	It's more or less a copy of Inode::WriteAt() but it can handle
1620 	length differences of more than just 4 GB, and it never uses
1621 	the log, even if the INODE_LOGGED flag is set.
1622 */
1623 status_t
1624 Inode::FillGapWithZeros(off_t pos, off_t newSize)
1625 {
1626 	while (pos < newSize) {
1627 		size_t size;
1628 		if (newSize > pos + 1024 * 1024 * 1024)
1629 			size = 1024 * 1024 * 1024;
1630 		else
1631 			size = newSize - pos;
1632 
1633 		status_t status = file_cache_write(FileCache(), NULL, pos, NULL, &size);
1634 		if (status < B_OK)
1635 			return status;
1636 
1637 		pos += size;
1638 	}
1639 
1640 	return B_OK;
1641 }
1642 
1643 
1644 /*!	Allocates \a length blocks, and clears their contents. Growing
1645 	the indirect and double indirect range uses this method.
1646 	The allocated block_run is saved in "run"
1647 */
1648 status_t
1649 Inode::_AllocateBlockArray(Transaction& transaction, block_run& run,
1650 	size_t length, bool variableSize)
1651 {
1652 	if (!run.IsZero())
1653 		return B_BAD_VALUE;
1654 
1655 	status_t status = fVolume->Allocate(transaction, this, length, run,
1656 		variableSize ? 1 : length);
1657 	if (status != B_OK)
1658 		return status;
1659 
1660 	// make sure those blocks are empty
1661 	CachedBlock cached(fVolume);
1662 	off_t block = fVolume->ToBlock(run);
1663 
1664 	for (int32 i = 0; i < run.Length(); i++) {
1665 		block_run* runs = (block_run*)cached.SetToWritable(transaction,
1666 			block + i, true);
1667 		if (runs == NULL)
1668 			return B_IO_ERROR;
1669 	}
1670 	return B_OK;
1671 }
1672 
1673 
1674 /*!	Grows the stream to \a size, and fills the direct/indirect/double indirect
1675 	ranges with the runs.
1676 	This method will also determine the size of the preallocation, if any.
1677 */
1678 status_t
1679 Inode::_GrowStream(Transaction& transaction, off_t size)
1680 {
1681 	data_stream* data = &Node().data;
1682 
1683 	// is the data stream already large enough to hold the new size?
1684 	// (can be the case with preallocated blocks)
1685 	if (size < data->MaxDirectRange()
1686 		|| size < data->MaxIndirectRange()
1687 		|| size < data->MaxDoubleIndirectRange()) {
1688 		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1689 		return B_OK;
1690 	}
1691 
1692 	// how many bytes are still needed? (unused ranges are always zero)
1693 	uint16 minimum = 1;
1694 	off_t bytes;
1695 	if (data->Size() < data->MaxDoubleIndirectRange()) {
1696 		bytes = size - data->MaxDoubleIndirectRange();
1697 		// The double indirect range can only handle multiples of
1698 		// its base length
1699 		minimum = data->double_indirect.Length();
1700 	} else if (data->Size() < data->MaxIndirectRange())
1701 		bytes = size - data->MaxIndirectRange();
1702 	else if (data->Size() < data->MaxDirectRange())
1703 		bytes = size - data->MaxDirectRange();
1704 	else {
1705 		// no preallocation left to be used
1706 		bytes = size - data->Size();
1707 		if (data->MaxDoubleIndirectRange() > 0)
1708 			minimum = data->double_indirect.Length();
1709 	}
1710 
1711 	// do we have enough free blocks on the disk?
1712 	off_t blocksNeeded = (bytes + fVolume->BlockSize() - 1)
1713 		>> fVolume->BlockShift();
1714 	if (blocksNeeded > fVolume->FreeBlocks())
1715 		return B_DEVICE_FULL;
1716 
1717 	off_t blocksRequested = blocksNeeded;
1718 		// because of preallocations and partial allocations, the number of
1719 		// blocks we need to allocate may be different from the one we request
1720 		// from the block allocator
1721 
1722 	// Should we preallocate some blocks?
1723 	// Attributes, attribute directories, and long symlinks usually won't get
1724 	// that big, and should stay close to the inode - preallocating could be
1725 	// counterproductive.
1726 	// Also, if free disk space is tight, don't preallocate.
1727 	if (!IsAttribute() && !IsAttributeDirectory() && !IsSymLink()
1728 		&& fVolume->FreeBlocks() > 128) {
1729 		off_t roundTo = 0;
1730 		if (IsFile()) {
1731 			// Request preallocated blocks depending on the file size and growth
1732 			if (size < 1 * 1024 * 1024 && bytes < 512 * 1024) {
1733 				// Preallocate 64 KB for file sizes <1 MB and grow rates <512 KB
1734 				roundTo = 65536 >> fVolume->BlockShift();
1735 			} else if (size < 32 * 1024 * 1024 && bytes <= 1 * 1024 * 1024) {
1736 				// Preallocate 512 KB for file sizes between 1 MB and 32 MB, and
1737 				// grow rates smaller than 1 MB
1738 				roundTo = (512 * 1024) >> fVolume->BlockShift();
1739 			} else {
1740 				// Preallocate 1/16 of the file size (ie. 4 MB for 64 MB,
1741 				// 64 MB for 1 GB)
1742 				roundTo = size >> (fVolume->BlockShift() + 4);
1743 			}
1744 		} else if (IsIndex()) {
1745 			// Always preallocate 64 KB for index directories
1746 			roundTo = 65536 >> fVolume->BlockShift();
1747 		} else {
1748 			// Preallocate only 4 KB - directories only get trimmed when their
1749 			// vnode is flushed, which might not happen very often.
1750 			roundTo = 4096 >> fVolume->BlockShift();
1751 		}
1752 		if (roundTo > 1) {
1753 			// Round to next "roundTo" block count
1754 			blocksRequested = ((blocksNeeded + roundTo) / roundTo) * roundTo;
1755 		}
1756 	}
1757 
1758 	while (blocksNeeded > 0) {
1759 		// the requested blocks do not need to be returned with a
1760 		// single allocation, so we need to iterate until we have
1761 		// enough blocks allocated
1762 		if (minimum > 1) {
1763 			// make sure that "blocks" is a multiple of minimum
1764 			blocksRequested = round_up(blocksRequested, minimum);
1765 		}
1766 
1767 		block_run run;
1768 		status_t status = fVolume->Allocate(transaction, this, blocksRequested,
1769 			run, minimum);
1770 		if (status != B_OK)
1771 			return status;
1772 
1773 		// okay, we have the needed blocks, so just distribute them to the
1774 		// different ranges of the stream (direct, indirect & double indirect)
1775 
1776 		blocksNeeded -= run.Length();
1777 		// don't preallocate if the first allocation was already too small
1778 		blocksRequested = blocksNeeded;
1779 
1780 		// Direct block range
1781 
1782 		if (data->Size() <= data->MaxDirectRange()) {
1783 			// let's try to put them into the direct block range
1784 			int32 free = 0;
1785 			for (; free < NUM_DIRECT_BLOCKS; free++) {
1786 				if (data->direct[free].IsZero())
1787 					break;
1788 			}
1789 
1790 			if (free < NUM_DIRECT_BLOCKS) {
1791 				// can we merge the last allocated run with the new one?
1792 				int32 last = free - 1;
1793 				if (free > 0 && data->direct[last].MergeableWith(run)) {
1794 					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(
1795 						data->direct[last].Length() + run.Length());
1796 				} else
1797 					data->direct[free] = run;
1798 
1799 				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(
1800 					data->MaxDirectRange()
1801 					+ run.Length() * fVolume->BlockSize());
1802 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1803 					? data->max_direct_range : size);
1804 				continue;
1805 			}
1806 		}
1807 
1808 		// Indirect block range
1809 
1810 		if (data->Size() <= data->MaxIndirectRange()
1811 			|| !data->MaxIndirectRange()) {
1812 			CachedBlock cached(fVolume);
1813 			block_run* runs = NULL;
1814 			uint32 free = 0;
1815 			off_t block;
1816 
1817 			// if there is no indirect block yet, create one
1818 			if (data->indirect.IsZero()) {
1819 				status = _AllocateBlockArray(transaction, data->indirect,
1820 					NUM_ARRAY_BLOCKS, true);
1821 				if (status != B_OK)
1822 					return status;
1823 
1824 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1825 					data->MaxDirectRange());
1826 				// insert the block_run in the first block
1827 				runs = (block_run*)cached.SetTo(data->indirect);
1828 			} else {
1829 				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1830 				block = fVolume->ToBlock(data->indirect);
1831 
1832 				// search first empty entry
1833 				int32 i = 0;
1834 				for (; i < data->indirect.Length(); i++) {
1835 					if ((runs = (block_run*)cached.SetTo(block + i)) == NULL)
1836 						return B_IO_ERROR;
1837 
1838 					for (free = 0; free < numberOfRuns; free++)
1839 						if (runs[free].IsZero())
1840 							break;
1841 
1842 					if (free < numberOfRuns)
1843 						break;
1844 				}
1845 				if (i == data->indirect.Length())
1846 					runs = NULL;
1847 			}
1848 
1849 			if (runs != NULL) {
1850 				// try to insert the run to the last one - note that this
1851 				// doesn't take block borders into account, so it could be
1852 				// further optimized
1853 				cached.MakeWritable(transaction);
1854 
1855 				int32 last = free - 1;
1856 				if (free > 0 && runs[last].MergeableWith(run)) {
1857 					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(
1858 						runs[last].Length() + run.Length());
1859 				} else
1860 					runs[free] = run;
1861 
1862 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1863 					data->MaxIndirectRange()
1864 					+ ((uint32)run.Length() << fVolume->BlockShift()));
1865 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1866 					? data->MaxIndirectRange() : size);
1867 				continue;
1868 			}
1869 		}
1870 
1871 		// Double indirect block range
1872 
1873 		if (data->Size() <= data->MaxDoubleIndirectRange()
1874 			|| !data->max_double_indirect_range) {
1875 			// We make sure here that we have this minimum allocated, so if
1876 			// the allocation succeeds, we don't run into an endless loop.
1877 			if (!data->max_double_indirect_range)
1878 				minimum = _DoubleIndirectBlockLength();
1879 			else
1880 				minimum = data->double_indirect.Length();
1881 
1882 			if ((run.Length() % minimum) != 0) {
1883 				// The number of allocated blocks isn't a multiple of 'minimum',
1884 				// so we have to change this. This can happen the first time the
1885 				// stream grows into the double indirect range.
1886 				// First, free the remaining blocks that don't fit into this
1887 				// multiple.
1888 				int32 rest = run.Length() % minimum;
1889 				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1890 
1891 				status = fVolume->Free(transaction,
1892 					block_run::Run(run.AllocationGroup(),
1893 					run.Start() + run.Length(), rest));
1894 				if (status != B_OK)
1895 					return status;
1896 
1897 				blocksNeeded += rest;
1898 				blocksRequested = round_up(blocksNeeded, minimum);
1899 
1900 				// Are there any blocks left in the run? If not, allocate
1901 				// a new one
1902 				if (run.length == 0)
1903 					continue;
1904 			}
1905 
1906 			// if there is no double indirect block yet, create one
1907 			if (data->double_indirect.IsZero()) {
1908 				status = _AllocateBlockArray(transaction,
1909 					data->double_indirect, _DoubleIndirectBlockLength());
1910 				if (status != B_OK)
1911 					return status;
1912 
1913 				data->max_double_indirect_range = data->max_indirect_range;
1914 			}
1915 
1916 			// calculate the index where to insert the new blocks
1917 
1918 			int32 runsPerBlock;
1919 			int32 directSize;
1920 			int32 indirectSize;
1921 			get_double_indirect_sizes(data->double_indirect.Length(),
1922 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1923 
1924 			off_t start = data->MaxDoubleIndirectRange()
1925 				- data->MaxIndirectRange();
1926 			int32 indirectIndex = start / indirectSize;
1927 			int32 index = (start % indirectSize) / directSize;
1928 			int32 runsPerArray = runsPerBlock * minimum;
1929 
1930 			// distribute the blocks to the array and allocate
1931 			// new array blocks when needed
1932 
1933 			CachedBlock cached(fVolume);
1934 			CachedBlock cachedDirect(fVolume);
1935 			block_run* array = NULL;
1936 			uint32 runLength = run.Length();
1937 
1938 			while (run.length != 0) {
1939 				// get the indirect array block
1940 				if (array == NULL) {
1941 					uint32 block = indirectIndex / runsPerBlock;
1942 					if (block >= minimum)
1943 						return EFBIG;
1944 
1945 					array = (block_run*)cached.SetTo(fVolume->ToBlock(
1946 						data->double_indirect) + block);
1947 					if (array == NULL)
1948 						return B_IO_ERROR;
1949 				}
1950 
1951 				do {
1952 					// do we need a new array block?
1953 					if (array[indirectIndex % runsPerBlock].IsZero()) {
1954 						cached.MakeWritable(transaction);
1955 
1956 						status = _AllocateBlockArray(transaction,
1957 							array[indirectIndex % runsPerBlock],
1958 							data->double_indirect.Length());
1959 						if (status != B_OK)
1960 							return status;
1961 					}
1962 
1963 					block_run* runs = (block_run*)cachedDirect.SetToWritable(
1964 						transaction, fVolume->ToBlock(array[indirectIndex
1965 							% runsPerBlock]) + index / runsPerBlock);
1966 					if (runs == NULL)
1967 						return B_IO_ERROR;
1968 
1969 					do {
1970 						// insert the block_run into the array
1971 						runs[index % runsPerBlock] = run;
1972 						runs[index % runsPerBlock].length
1973 							= HOST_ENDIAN_TO_BFS_INT16(minimum);
1974 
1975 						// alter the remaining block_run
1976 						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start()
1977 							+ minimum);
1978 						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
1979 							- minimum);
1980 					} while ((++index % runsPerBlock) != 0 && run.length);
1981 				} while ((index % runsPerArray) != 0 && run.length);
1982 
1983 				if (index == runsPerArray)
1984 					index = 0;
1985 				if (++indirectIndex % runsPerBlock == 0) {
1986 					array = NULL;
1987 					index = 0;
1988 				}
1989 			}
1990 
1991 			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1992 				data->MaxDoubleIndirectRange()
1993 				+ (runLength << fVolume->BlockShift()));
1994 			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(
1995 				data->max_double_indirect_range) : size;
1996 
1997 			continue;
1998 		}
1999 
2000 		RETURN_ERROR(EFBIG);
2001 	}
2002 	// update the size of the data stream
2003 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2004 
2005 	return B_OK;
2006 }
2007 
2008 
2009 size_t
2010 Inode::_DoubleIndirectBlockLength() const
2011 {
2012 	if (fVolume->BlockSize() > DOUBLE_INDIRECT_ARRAY_SIZE)
2013 		return 1;
2014 
2015 	return DOUBLE_INDIRECT_ARRAY_SIZE / fVolume->BlockSize();
2016 }
2017 
2018 
2019 /*!	Frees the statically sized array of the double indirect part of a data
2020 	stream.
2021 */
2022 status_t
2023 Inode::_FreeStaticStreamArray(Transaction& transaction, int32 level,
2024 	block_run run, off_t size, off_t offset, off_t& max)
2025 {
2026 	int32 indirectSize;
2027 	if (level == 0) {
2028 		indirectSize = double_indirect_max_indirect_size(run.Length(),
2029 			fVolume->BlockSize());
2030 	} else {
2031 		indirectSize = double_indirect_max_direct_size(run.Length(),
2032 			fVolume->BlockSize());
2033 	}
2034 
2035 	off_t start;
2036 	if (size > offset)
2037 		start = size - offset;
2038 	else
2039 		start = 0;
2040 
2041 	int32 index = start / indirectSize;
2042 	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
2043 
2044 	CachedBlock cached(fVolume);
2045 	off_t blockNumber = fVolume->ToBlock(run);
2046 
2047 	// set the file offset to the current block run
2048 	offset += (off_t)index * indirectSize;
2049 
2050 	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
2051 		block_run* array = (block_run*)cached.SetToWritable(transaction,
2052 			blockNumber + i);
2053 		if (array == NULL)
2054 			RETURN_ERROR(B_ERROR);
2055 
2056 		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
2057 			if (array[index].IsZero()) {
2058 				// we also want to break out of the outer loop
2059 				i = run.Length();
2060 				break;
2061 			}
2062 
2063 			status_t status = B_OK;
2064 			if (level == 0) {
2065 				status = _FreeStaticStreamArray(transaction, 1, array[index],
2066 					size, offset, max);
2067 			} else if (offset >= size)
2068 				status = fVolume->Free(transaction, array[index]);
2069 			else
2070 				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
2071 
2072 			if (status < B_OK)
2073 				RETURN_ERROR(status);
2074 
2075 			if (offset >= size)
2076 				array[index].SetTo(0, 0, 0);
2077 
2078 			offset += indirectSize;
2079 		}
2080 		index = 0;
2081 	}
2082 	return B_OK;
2083 }
2084 
2085 
2086 /*!	Frees all block_runs in the array which come after the specified size.
2087 	It also trims the last block_run that contain the size.
2088 	"offset" and "max" are maintained until the last block_run that doesn't
2089 	have to be freed - after this, the values won't be correct anymore, but
2090 	will still assure correct function for all subsequent calls.
2091 	"max" is considered to be in file system byte order.
2092 */
2093 status_t
2094 Inode::_FreeStreamArray(Transaction& transaction, block_run* array,
2095 	uint32 arrayLength, off_t size, off_t& offset, off_t& max)
2096 {
2097 	PRINT(("FreeStreamArray: arrayLength %lu, size %Ld, offset %Ld, max %Ld\n",
2098 		arrayLength, size, offset, max));
2099 
2100 	off_t newOffset = offset;
2101 	uint32 i = 0;
2102 	for (; i < arrayLength; i++, offset = newOffset) {
2103 		if (array[i].IsZero())
2104 			break;
2105 
2106 		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
2107 		if (newOffset <= size)
2108 			continue;
2109 
2110 		block_run run = array[i];
2111 
2112 		// determine the block_run to be freed
2113 		if (newOffset > size && offset < size) {
2114 			// free partial block_run (and update the original block_run)
2115 			run.start = HOST_ENDIAN_TO_BFS_INT16(array[i].Start()
2116 				+ ((size + fVolume->BlockSize() - 1 - offset)
2117 					>> fVolume->BlockShift()));
2118 				// round to next block
2119 			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2120 				- array[i].Start());
2121 			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2122 				- array[i].Length());
2123 
2124 			if (run.length == 0)
2125 				continue;
2126 
2127 			// update maximum range
2128 			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length()
2129 				<< fVolume->BlockShift()));
2130 		} else {
2131 			// free the whole block_run
2132 			array[i].SetTo(0, 0, 0);
2133 
2134 			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
2135 				max = HOST_ENDIAN_TO_BFS_INT64(offset);
2136 		}
2137 
2138 		if (fVolume->Free(transaction, run) < B_OK)
2139 			return B_IO_ERROR;
2140 	}
2141 	return B_OK;
2142 }
2143 
2144 
2145 status_t
2146 Inode::_ShrinkStream(Transaction& transaction, off_t size)
2147 {
2148 	data_stream* data = &Node().data;
2149 	status_t status;
2150 
2151 	if (data->MaxDoubleIndirectRange() > size) {
2152 		off_t* maxDoubleIndirect = &data->max_double_indirect_range;
2153 			// gcc 4 work-around: "error: cannot bind packed field
2154 			// 'data->data_stream::max_double_indirect_range' to 'off_t&'"
2155 		status = _FreeStaticStreamArray(transaction, 0, data->double_indirect,
2156 			size, data->MaxIndirectRange(), *maxDoubleIndirect);
2157 		if (status != B_OK)
2158 			return status;
2159 
2160 		if (size <= data->MaxIndirectRange()) {
2161 			fVolume->Free(transaction, data->double_indirect);
2162 			data->double_indirect.SetTo(0, 0, 0);
2163 			data->max_double_indirect_range = 0;
2164 		}
2165 	}
2166 
2167 	if (data->MaxIndirectRange() > size) {
2168 		CachedBlock cached(fVolume);
2169 		off_t block = fVolume->ToBlock(data->indirect);
2170 		off_t offset = data->MaxDirectRange();
2171 
2172 		for (int32 i = 0; i < data->indirect.Length(); i++) {
2173 			block_run* array = (block_run*)cached.SetToWritable(transaction,
2174 				block + i);
2175 			if (array == NULL)
2176 				break;
2177 
2178 			off_t* maxIndirect = &data->max_indirect_range;
2179 				// gcc 4 work-around: "error: cannot bind packed field
2180 				// 'data->data_stream::max_indirect_range' to 'off_t&'"
2181 			if (_FreeStreamArray(transaction, array, fVolume->BlockSize()
2182 					/ sizeof(block_run), size, offset, *maxIndirect) != B_OK)
2183 				return B_IO_ERROR;
2184 		}
2185 		if (data->max_direct_range == data->max_indirect_range) {
2186 			fVolume->Free(transaction, data->indirect);
2187 			data->indirect.SetTo(0, 0, 0);
2188 			data->max_indirect_range = 0;
2189 		}
2190 	}
2191 
2192 	if (data->MaxDirectRange() > size) {
2193 		off_t offset = 0;
2194 		off_t *maxDirect = &data->max_direct_range;
2195 			// gcc 4 work-around: "error: cannot bind packed field
2196 			// 'data->data_stream::max_direct_range' to 'off_t&'"
2197 		status = _FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS,
2198 			size, offset, *maxDirect);
2199 		if (status < B_OK)
2200 			return status;
2201 	}
2202 
2203 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2204 	return B_OK;
2205 }
2206 
2207 
2208 status_t
2209 Inode::SetFileSize(Transaction& transaction, off_t size)
2210 {
2211 	if (size < 0)
2212 		return B_BAD_VALUE;
2213 
2214 	off_t oldSize = Size();
2215 
2216 	if (size == oldSize)
2217 		return B_OK;
2218 
2219 	T(Resize(this, oldSize, size, false));
2220 
2221 	// should the data stream grow or shrink?
2222 	status_t status;
2223 	if (size > oldSize) {
2224 		status = _GrowStream(transaction, size);
2225 		if (status < B_OK) {
2226 			// if the growing of the stream fails, the whole operation
2227 			// fails, so we should shrink the stream to its former size
2228 			_ShrinkStream(transaction, oldSize);
2229 		}
2230 	} else
2231 		status = _ShrinkStream(transaction, size);
2232 
2233 	if (status < B_OK)
2234 		return status;
2235 
2236 	file_cache_set_size(FileCache(), size);
2237 	file_map_set_size(Map(), size);
2238 
2239 	return WriteBack(transaction);
2240 }
2241 
2242 
2243 status_t
2244 Inode::Append(Transaction& transaction, off_t bytes)
2245 {
2246 	return SetFileSize(transaction, Size() + bytes);
2247 }
2248 
2249 
2250 /*!	Checks whether or not this inode's data stream needs to be trimmed
2251 	because of an earlier preallocation.
2252 	Returns true if there are any blocks to be trimmed.
2253 */
2254 bool
2255 Inode::NeedsTrimming() const
2256 {
2257 	// We never trim preallocated index blocks to make them grow as smooth as
2258 	// possible. There are only few indices anyway, so this doesn't hurt.
2259 	// Also, if an inode is already in deleted state, we don't bother trimming
2260 	// it.
2261 	if (IsIndex() || IsDeleted()
2262 		|| (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0))
2263 		return false;
2264 
2265 	off_t roundedSize = round_up(Size(), fVolume->BlockSize());
2266 
2267 	return Node().data.MaxDirectRange() > roundedSize
2268 		|| Node().data.MaxIndirectRange() > roundedSize
2269 		|| Node().data.MaxDoubleIndirectRange() > roundedSize;
2270 }
2271 
2272 
2273 status_t
2274 Inode::TrimPreallocation(Transaction& transaction)
2275 {
2276 	T(Resize(this, max_c(Node().data.MaxDirectRange(),
2277 		Node().data.MaxIndirectRange()), Size(), true));
2278 
2279 	status_t status = _ShrinkStream(transaction, Size());
2280 	if (status < B_OK)
2281 		return status;
2282 
2283 	return WriteBack(transaction);
2284 }
2285 
2286 
2287 //!	Frees the file's data stream and removes all attributes
2288 status_t
2289 Inode::Free(Transaction& transaction)
2290 {
2291 	FUNCTION();
2292 
2293 	// Perhaps there should be an implementation of Inode::ShrinkStream() that
2294 	// just frees the data_stream, but doesn't change the inode (since it is
2295 	// freed anyway) - that would make an undelete command possible
2296 	if (!IsSymLink() || (Flags() & INODE_LONG_SYMLINK) != 0) {
2297 		status_t status = SetFileSize(transaction, 0);
2298 		if (status < B_OK)
2299 			return status;
2300 	}
2301 
2302 	// Free all attributes, and remove their indices
2303 	{
2304 		// We have to limit the scope of AttributeIterator, so that its
2305 		// destructor is not called after the inode is deleted
2306 		AttributeIterator iterator(this);
2307 
2308 		char name[B_FILE_NAME_LENGTH];
2309 		uint32 type;
2310 		size_t length;
2311 		ino_t id;
2312 		while (iterator.GetNext(name, &length, &type, &id) == B_OK) {
2313 			RemoveAttribute(transaction, name);
2314 		}
2315 	}
2316 
2317 	if (WriteBack(transaction) < B_OK)
2318 		return B_IO_ERROR;
2319 
2320 	return fVolume->Free(transaction, BlockRun());
2321 }
2322 
2323 
2324 //!	Synchronizes (writes back to disk) the file stream of the inode.
2325 status_t
2326 Inode::Sync()
2327 {
2328 	if (FileCache())
2329 		return file_cache_sync(FileCache());
2330 
2331 	// We may also want to flush the attribute's data stream to
2332 	// disk here... (do we?)
2333 
2334 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0)
2335 		return B_OK;
2336 
2337 	InodeReadLocker locker(this);
2338 
2339 	data_stream* data = &Node().data;
2340 	status_t status = B_OK;
2341 
2342 	// flush direct range
2343 
2344 	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
2345 		if (data->direct[i].IsZero())
2346 			return B_OK;
2347 
2348 		status = block_cache_sync_etc(fVolume->BlockCache(),
2349 			fVolume->ToBlock(data->direct[i]), data->direct[i].Length());
2350 		if (status != B_OK)
2351 			return status;
2352 	}
2353 
2354 	// flush indirect range
2355 
2356 	if (data->max_indirect_range == 0)
2357 		return B_OK;
2358 
2359 	CachedBlock cached(fVolume);
2360 	off_t block = fVolume->ToBlock(data->indirect);
2361 	int32 count = fVolume->BlockSize() / sizeof(block_run);
2362 
2363 	for (int32 j = 0; j < data->indirect.Length(); j++) {
2364 		block_run* runs = (block_run*)cached.SetTo(block + j);
2365 		if (runs == NULL)
2366 			break;
2367 
2368 		for (int32 i = 0; i < count; i++) {
2369 			if (runs[i].IsZero())
2370 				return B_OK;
2371 
2372 			status = block_cache_sync_etc(fVolume->BlockCache(),
2373 				fVolume->ToBlock(runs[i]), runs[i].Length());
2374 			if (status != B_OK)
2375 				return status;
2376 		}
2377 	}
2378 
2379 	// flush double indirect range
2380 
2381 	if (data->max_double_indirect_range == 0)
2382 		return B_OK;
2383 
2384 	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
2385 
2386 	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
2387 		block_run* indirectRuns = (block_run*)cached.SetTo(indirectBlock + l);
2388 		if (indirectRuns == NULL)
2389 			return B_FILE_ERROR;
2390 
2391 		CachedBlock directCached(fVolume);
2392 
2393 		for (int32 k = 0; k < count; k++) {
2394 			if (indirectRuns[k].IsZero())
2395 				return B_OK;
2396 
2397 			block = fVolume->ToBlock(indirectRuns[k]);
2398 			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
2399 				block_run* runs = (block_run*)directCached.SetTo(block + j);
2400 				if (runs == NULL)
2401 					return B_FILE_ERROR;
2402 
2403 				for (int32 i = 0; i < count; i++) {
2404 					if (runs[i].IsZero())
2405 						return B_OK;
2406 
2407 					// TODO: combine single block_runs to bigger ones when
2408 					// they are adjacent
2409 					status = block_cache_sync_etc(fVolume->BlockCache(),
2410 						fVolume->ToBlock(runs[i]), runs[i].Length());
2411 					if (status != B_OK)
2412 						return status;
2413 				}
2414 			}
2415 		}
2416 	}
2417 	return B_OK;
2418 }
2419 
2420 
2421 //	#pragma mark - TransactionListener implementation
2422 
2423 
2424 void
2425 Inode::TransactionDone(bool success)
2426 {
2427 	if (!success) {
2428 		// revert any changes made to the cached bfs_inode
2429 		UpdateNodeFromDisk();
2430 	}
2431 }
2432 
2433 
2434 void
2435 Inode::RemovedFromTransaction()
2436 {
2437 	Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
2438 
2439 	// See AddInode() why we do this here
2440 	if ((Flags() & INODE_DELETED) != 0)
2441 		fVolume->RemovedInodes().Add(this);
2442 
2443 	rw_lock_write_unlock(&Lock());
2444 
2445 	if (!fVolume->IsInitializing())
2446 		put_vnode(fVolume->FSVolume(), ID());
2447 }
2448 
2449 
2450 //	#pragma mark - creation/deletion
2451 
2452 
2453 status_t
2454 Inode::Remove(Transaction& transaction, const char* name, ino_t* _id,
2455 	bool isDirectory, bool force)
2456 {
2457 	if (fTree == NULL)
2458 		RETURN_ERROR(B_BAD_VALUE);
2459 
2460 	WriteLockInTransaction(transaction);
2461 
2462 	// does the file even exist?
2463 	off_t id;
2464 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) < B_OK)
2465 		return B_ENTRY_NOT_FOUND;
2466 
2467 	if (_id)
2468 		*_id = id;
2469 
2470 	Vnode vnode(fVolume, id);
2471 	Inode* inode;
2472 	status_t status = vnode.Get(&inode);
2473 	if (status < B_OK) {
2474 		REPORT_ERROR(status);
2475 		return B_ENTRY_NOT_FOUND;
2476 	}
2477 
2478 	T(Remove(inode, name));
2479 	inode->WriteLockInTransaction(transaction);
2480 
2481 	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
2482 	// bit is set for indices in BFS, not for attribute directories) - but you
2483 	// should really be able to do whatever you want with your indices
2484 	// without having to remove all files first :)
2485 	if (!inode->IsIndex() && !force) {
2486 		// if it's not of the correct type, don't delete it!
2487 		if (inode->IsContainer() != isDirectory)
2488 			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
2489 
2490 		// only delete empty directories
2491 		if (isDirectory && !inode->IsEmpty())
2492 			return B_DIRECTORY_NOT_EMPTY;
2493 	}
2494 
2495 	// remove_vnode() allows the inode to be accessed until the last put_vnode()
2496 	status = remove_vnode(fVolume->FSVolume(), id);
2497 	if (status != B_OK)
2498 		return status;
2499 
2500 	if (fTree->Remove(transaction, name, id) != B_OK && !force) {
2501 		unremove_vnode(fVolume->FSVolume(), id);
2502 		RETURN_ERROR(B_ERROR);
2503 	}
2504 
2505 #ifdef DEBUG
2506 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) == B_OK) {
2507 		DIE(("deleted entry still there"));
2508 	}
2509 #endif
2510 
2511 	ContainerContentsChanged(transaction);
2512 
2513 	// update the inode, so that no one will ever doubt it's deleted :-)
2514 	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
2515 	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
2516 
2517 	// In balance to the Inode::Create() method, the main indices
2518 	// are updated here (name, size, & last_modified)
2519 
2520 	Index index(fVolume);
2521 	if (inode->InNameIndex()) {
2522 		index.RemoveName(transaction, name, inode);
2523 			// If removing from the index fails, it is not regarded as a
2524 			// fatal error and will not be reported back!
2525 			// Deleted inodes won't be visible in queries anyway.
2526 	}
2527 
2528 	if (inode->InSizeIndex())
2529 		index.RemoveSize(transaction, inode);
2530 	if (inode->InLastModifiedIndex())
2531 		index.RemoveLastModified(transaction, inode);
2532 
2533 	return inode->WriteBack(transaction);
2534 }
2535 
2536 
2537 /*!	Creates the inode with the specified \a parent directory, and automatically
2538 	adds the created inode to that parent directory. If an attribute directory
2539 	is created, it will also automatically  be added to the \a parent inode as
2540 	such. However, the indices root node, and the regular root node won't be
2541 	added to the superblock.
2542 	It will also create the initial B+tree for the inode if it's a directory
2543 	of any kind.
2544 	\a name may be \c NULL, but only if no \a parent is given.
2545 	If the "_id" or "_inode" variable is given and non-NULL to store the
2546 	inode's ID, the inode stays locked - you have to call put_vnode() if you
2547 	don't use it anymore.
2548 	If the node already exists, this method will fail if \c O_EXCL is set, or
2549 	it's a directory or a symlink. Otherwise, it will just be returned.
2550 	If \c O_TRUNC has been specified, the file will also be truncated.
2551 */
2552 status_t
2553 Inode::Create(Transaction& transaction, Inode* parent, const char* name,
2554 	int32 mode, int openMode, uint32 type, bool* _created, ino_t* _id,
2555 	Inode** _inode, fs_vnode_ops* vnodeOps, uint32 publishFlags)
2556 {
2557 	FUNCTION_START(("name = %s, mode = %ld\n", name, mode));
2558 
2559 	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
2560 	Volume* volume = transaction.GetVolume();
2561 	BPlusTree* tree = NULL;
2562 
2563 	if (parent != NULL && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
2564 		// check if the file already exists in the directory
2565 		tree = parent->Tree();
2566 	}
2567 
2568 	if (parent != NULL) {
2569 		// the parent directory is locked during the whole inode creation
2570 		parent->WriteLockInTransaction(transaction);
2571 	}
2572 
2573 	if (parent != NULL && !volume->IsInitializing() && parent->IsContainer()) {
2574 		// don't create anything in removed directories
2575 		bool removed;
2576 		if (get_vnode_removed(volume->FSVolume(), parent->ID(), &removed)
2577 				== B_OK && removed) {
2578 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
2579 		}
2580 	}
2581 
2582 	if (tree != NULL) {
2583 		// Does the file already exist?
2584 		off_t offset;
2585 		if (tree->Find((uint8*)name, (uint16)strlen(name), &offset) == B_OK) {
2586 			// Return if the file should be a directory/link or opened in
2587 			// exclusive mode
2588 			if (S_ISDIR(mode) || S_ISLNK(mode) || (openMode & O_EXCL) != 0)
2589 				return B_FILE_EXISTS;
2590 
2591 			Vnode vnode(volume, offset);
2592 			Inode* inode;
2593 			status_t status = vnode.Get(&inode);
2594 			if (status != B_OK) {
2595 				REPORT_ERROR(status);
2596 				return B_ENTRY_NOT_FOUND;
2597 			}
2598 
2599 			if (inode->IsDirectory() && (openMode & O_RWMASK) != O_RDONLY)
2600 				return B_IS_A_DIRECTORY;
2601 			if ((openMode & O_DIRECTORY) != 0 && !inode->IsDirectory())
2602 				return B_NOT_A_DIRECTORY;
2603 
2604 			// we want to open the file, so we should have the rights to do so
2605 			if (inode->CheckPermissions(open_mode_to_access(openMode)
2606 					| ((openMode & O_TRUNC) != 0 ? W_OK : 0)) != B_OK)
2607 				return B_NOT_ALLOWED;
2608 
2609 			if ((openMode & O_TRUNC) != 0) {
2610 				// truncate the existing file
2611 				inode->WriteLockInTransaction(transaction);
2612 
2613 				status_t status = inode->SetFileSize(transaction, 0);
2614 				if (status == B_OK)
2615 					status = inode->WriteBack(transaction);
2616 
2617 				if (status != B_OK)
2618 					return status;
2619 			}
2620 
2621 			if (_created)
2622 				*_created = false;
2623 			if (_id)
2624 				*_id = inode->ID();
2625 			if (_inode)
2626 				*_inode = inode;
2627 
2628 			// Only keep the vnode in memory if the _id or _inode pointer is
2629 			// provided
2630 			if (_id != NULL || _inode != NULL)
2631 				vnode.Keep();
2632 
2633 			return B_OK;
2634 		}
2635 	} else if (parent != NULL && (mode & S_ATTR_DIR) == 0) {
2636 		return B_BAD_VALUE;
2637 	} else if ((openMode & O_DIRECTORY) != 0) {
2638 		// TODO: we might need to return B_NOT_A_DIRECTORY here
2639 		return B_ENTRY_NOT_FOUND;
2640 	}
2641 
2642 	status_t status;
2643 
2644 	// do we have the power to create new files at all?
2645 	if (parent != NULL && (status = parent->CheckPermissions(W_OK)) != B_OK)
2646 		RETURN_ERROR(status);
2647 
2648 	// allocate space for the new inode
2649 	InodeAllocator allocator(transaction);
2650 	block_run run;
2651 	Inode* inode;
2652 	status = allocator.New(&parentRun, mode, run, vnodeOps, &inode);
2653 	if (status < B_OK)
2654 		return status;
2655 
2656 	T(Create(inode, parent, name, mode, openMode, type));
2657 
2658 	// Initialize the parts of the bfs_inode structure that
2659 	// InodeAllocator::New() hasn't touched yet
2660 
2661 	bfs_inode* node = &inode->Node();
2662 
2663 	if (parent == NULL) {
2664 		// we set the parent to itself in this case
2665 		// (only happens for the root and indices node)
2666 		node->parent = run;
2667 	} else
2668 		node->parent = parentRun;
2669 
2670 	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
2671 	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent
2672 		? parent->Node().GroupID() : getegid());
2673 		// the group ID is inherited from the parent, if available
2674 
2675 	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
2676 
2677 	inode->WriteBack(transaction);
2678 		// make sure the initialized node is available to others
2679 
2680 	// only add the name to regular files, directories, or symlinks
2681 	// don't add it to attributes, or indices
2682 	if (tree && inode->IsRegularNode()
2683 		&& inode->SetName(transaction, name) != B_OK)
2684 		return B_ERROR;
2685 
2686 	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2687 	// a standard directory for files - not for attributes or indices)
2688 	if (inode->IsContainer()) {
2689 		status = allocator.CreateTree();
2690 		if (status != B_OK)
2691 			return status;
2692 	}
2693 
2694 	// Add a link to the inode from the parent, depending on its type
2695 	// (the vnode is not published yet, so it is safe to make the inode
2696 	// accessable to the file system here)
2697 	if (tree != NULL) {
2698 		status = tree->Insert(transaction, name, inode->ID());
2699 	} else if (parent != NULL && (mode & S_ATTR_DIR) != 0) {
2700 		parent->Attributes() = run;
2701 		status = parent->WriteBack(transaction);
2702 	}
2703 
2704 	// Note, we only care if the inode could be made accessable for the
2705 	// two cases above; the root node or the indices root node must
2706 	// handle this case on their own (or other cases where "parent" is
2707 	// NULL)
2708 	if (status != B_OK)
2709 		RETURN_ERROR(status);
2710 
2711 	// Update the main indices (name, size & last_modified)
2712 	// (live queries might want to access us after this)
2713 
2714 	Index index(volume);
2715 	if (inode->InNameIndex() && name != NULL) {
2716 		// the name index only contains regular files
2717 		// (but not the root node where name == NULL)
2718 		status = index.InsertName(transaction, name, inode);
2719 		if (status != B_OK && status != B_BAD_INDEX) {
2720 			// We have to remove the node from the parent at this point,
2721 			// because the InodeAllocator destructor can't handle this
2722 			// case (and if it fails, we can't do anything about it...)
2723 			if (tree)
2724 				tree->Remove(transaction, name, inode->ID());
2725 			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2726 				parent->Node().attributes.SetTo(0, 0, 0);
2727 
2728 			RETURN_ERROR(status);
2729 		}
2730 	}
2731 
2732 	if (parent != NULL && parent->IsContainer())
2733 		parent->ContainerContentsChanged(transaction);
2734 
2735 	inode->UpdateOldLastModified();
2736 
2737 	// The "size" & "last_modified" indices don't contain directories.
2738 	// If adding to these indices fails, the inode creation will not be
2739 	// harmed; they are considered less important than the "name" index.
2740 	if (inode->InSizeIndex())
2741 		index.InsertSize(transaction, inode);
2742 	if (inode->InLastModifiedIndex())
2743 		index.InsertLastModified(transaction, inode);
2744 
2745 	if (inode->NeedsFileCache()) {
2746 		inode->SetFileCache(file_cache_create(volume->ID(), inode->ID(),
2747 			inode->Size()));
2748 		inode->SetMap(file_map_create(volume->ID(), inode->ID(),
2749 			inode->Size()));
2750 
2751 		if (inode->FileCache() == NULL || inode->Map() == NULL)
2752 			return B_NO_MEMORY;
2753 	}
2754 
2755 	// Everything worked well until this point, we have a fully
2756 	// initialized inode, and we want to keep it
2757 	allocator.Keep(vnodeOps, publishFlags);
2758 
2759 	if (_created)
2760 		*_created = true;
2761 	if (_id != NULL)
2762 		*_id = inode->ID();
2763 	if (_inode != NULL)
2764 		*_inode = inode;
2765 
2766 	// if either _id or _inode is passed, we will keep the inode locked
2767 	if (_id == NULL && _inode == NULL)
2768 		put_vnode(volume->FSVolume(), inode->ID());
2769 
2770 	return B_OK;
2771 }
2772 
2773 
2774 //	#pragma mark - AttributeIterator
2775 
2776 
2777 AttributeIterator::AttributeIterator(Inode* inode)
2778 	:
2779 	fCurrentSmallData(0),
2780 	fInode(inode),
2781 	fAttributes(NULL),
2782 	fIterator(NULL),
2783 	fBuffer(NULL)
2784 {
2785 	inode->_AddIterator(this);
2786 }
2787 
2788 
2789 AttributeIterator::~AttributeIterator()
2790 {
2791 	if (fAttributes)
2792 		put_vnode(fAttributes->GetVolume()->FSVolume(), fAttributes->ID());
2793 
2794 	delete fIterator;
2795 	fInode->_RemoveIterator(this);
2796 }
2797 
2798 
2799 status_t
2800 AttributeIterator::Rewind()
2801 {
2802 	fCurrentSmallData = 0;
2803 
2804 	if (fIterator != NULL)
2805 		fIterator->Rewind();
2806 
2807 	return B_OK;
2808 }
2809 
2810 
2811 status_t
2812 AttributeIterator::GetNext(char* name, size_t* _length, uint32* _type,
2813 	ino_t* _id)
2814 {
2815 	// read attributes out of the small data section
2816 
2817 	if (fCurrentSmallData >= 0) {
2818 		NodeGetter nodeGetter(fInode->GetVolume(), fInode);
2819 		const bfs_inode* node = nodeGetter.Node();
2820 		const small_data* item = ((bfs_inode*)node)->SmallDataStart();
2821 
2822 		RecursiveLocker _(&fInode->SmallDataLock());
2823 
2824 		int32 index = 0;
2825 		for (; !item->IsLast(node); item = item->Next(), index++) {
2826 			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2827 				&& *item->Name() == FILE_NAME_NAME)
2828 				continue;
2829 
2830 			if (index >= fCurrentSmallData)
2831 				break;
2832 		}
2833 
2834 		if (!item->IsLast(node)) {
2835 			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2836 			*_type = item->Type();
2837 			*_length = item->NameSize();
2838 			*_id = (ino_t)index;
2839 
2840 			fCurrentSmallData = index + 1;
2841 			return B_OK;
2842 		}
2843 
2844 		// stop traversing the small_data section
2845 		fCurrentSmallData = -1;
2846 	}
2847 
2848 	// read attributes out of the attribute directory
2849 
2850 	if (fInode->Attributes().IsZero())
2851 		return B_ENTRY_NOT_FOUND;
2852 
2853 	Volume* volume = fInode->GetVolume();
2854 
2855 	// if you haven't yet access to the attributes directory, get it
2856 	if (fAttributes == NULL) {
2857 		if (get_vnode(volume->FSVolume(), volume->ToVnode(fInode->Attributes()),
2858 				(void**)&fAttributes) != B_OK) {
2859 			FATAL(("get_vnode() failed in AttributeIterator::GetNext(ino_t"
2860 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2861 			return B_ENTRY_NOT_FOUND;
2862 		}
2863 
2864 		BPlusTree* tree = fAttributes->Tree();
2865 		if (tree == NULL
2866 			|| (fIterator = new(std::nothrow) TreeIterator(tree)) == NULL) {
2867 			FATAL(("could not get tree in AttributeIterator::GetNext(ino_t"
2868 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2869 			return B_ENTRY_NOT_FOUND;
2870 		}
2871 	}
2872 
2873 	uint16 length;
2874 	ino_t id;
2875 	status_t status = fIterator->GetNextEntry(name, &length,
2876 		B_FILE_NAME_LENGTH, &id);
2877 	if (status != B_OK)
2878 		return status;
2879 
2880 	Vnode vnode(volume, id);
2881 	Inode* attribute;
2882 	if ((status = vnode.Get(&attribute)) == B_OK) {
2883 		*_type = attribute->Type();
2884 		*_length = length;
2885 		*_id = id;
2886 	}
2887 
2888 	return status;
2889 }
2890 
2891 
2892 void
2893 AttributeIterator::Update(uint16 index, int8 change)
2894 {
2895 	// fCurrentSmallData points already to the next item
2896 	if (index < fCurrentSmallData)
2897 		fCurrentSmallData += change;
2898 }
2899 
2900