xref: /haiku/src/add-ons/kernel/file_systems/bfs/Inode.cpp (revision 759ee24c4c1722813609c3b7c77fabef275b02bf)
1 /*
2  * Copyright 2001-2017, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Inode access functions
8 
9 
10 #include "Debug.h"
11 #include "Inode.h"
12 #include "BPlusTree.h"
13 #include "Index.h"
14 
15 
16 #if BFS_TRACING && !defined(FS_SHELL) && !defined(_BOOT_MODE)
17 namespace BFSInodeTracing {
18 
19 class Create : public AbstractTraceEntry {
20 public:
21 	Create(Inode* inode, Inode* parent, const char* name, int32 mode,
22 			int openMode, uint32 type)
23 		:
24 		fInode(inode),
25 		fID(inode->ID()),
26 		fParent(parent),
27 		fParentID(parent != NULL ? parent->ID() : 0),
28 		fMode(mode),
29 		fOpenMode(openMode),
30 		fType(type)
31 	{
32 		if (name != NULL)
33 			strlcpy(fName, name, sizeof(fName));
34 		else
35 			fName[0] = '\0';
36 
37 		Initialized();
38 	}
39 
40 	virtual void AddDump(TraceOutput& out)
41 	{
42 		out.Print("bfs:Create %Ld (%p), parent %Ld (%p), \"%s\", "
43 			"mode %lx, omode %x, type %lx", fID, fInode, fParentID,
44 			fParent, fName, fMode, fOpenMode, fType);
45 	}
46 
47 private:
48 	Inode*	fInode;
49 	ino_t	fID;
50 	Inode*	fParent;
51 	ino_t	fParentID;
52 	char	fName[32];
53 	int32	fMode;
54 	int		fOpenMode;
55 	uint32	fType;
56 };
57 
58 class Remove : public AbstractTraceEntry {
59 public:
60 	Remove(Inode* inode, const char* name)
61 		:
62 		fInode(inode),
63 		fID(inode->ID())
64 	{
65 		strlcpy(fName, name, sizeof(fName));
66 		Initialized();
67 	}
68 
69 	virtual void AddDump(TraceOutput& out)
70 	{
71 		out.Print("bfs:Remove %Ld (%p), \"%s\"", fID, fInode, fName);
72 	}
73 
74 private:
75 	Inode*	fInode;
76 	ino_t	fID;
77 	char	fName[32];
78 };
79 
80 class Action : public AbstractTraceEntry {
81 public:
82 	Action(const char* action, Inode* inode)
83 		:
84 		fInode(inode),
85 		fID(inode->ID())
86 	{
87 		strlcpy(fAction, action, sizeof(fAction));
88 		Initialized();
89 	}
90 
91 	virtual void AddDump(TraceOutput& out)
92 	{
93 		out.Print("bfs:%s %Ld (%p)\n", fAction, fID, fInode);
94 	}
95 
96 private:
97 	Inode*	fInode;
98 	ino_t	fID;
99 	char	fAction[16];
100 };
101 
102 class Resize : public AbstractTraceEntry {
103 public:
104 	Resize(Inode* inode, off_t oldSize, off_t newSize, bool trim)
105 		:
106 		fInode(inode),
107 		fID(inode->ID()),
108 		fOldSize(oldSize),
109 		fNewSize(newSize),
110 		fTrim(trim)
111 	{
112 		Initialized();
113 	}
114 
115 	virtual void AddDump(TraceOutput& out)
116 	{
117 		out.Print("bfs:%s %Ld (%p), %Ld -> %Ld", fTrim ? "Trim" : "Resize",
118 			fID, fInode, fOldSize, fNewSize);
119 	}
120 
121 private:
122 	Inode*	fInode;
123 	ino_t	fID;
124 	off_t	fOldSize;
125 	off_t	fNewSize;
126 	bool	fTrim;
127 };
128 
129 }	// namespace BFSInodeTracing
130 
131 #	define T(x) new(std::nothrow) BFSInodeTracing::x;
132 #else
133 #	define T(x) ;
134 #endif
135 
136 
137 /*!	A helper class used by Inode::Create() to keep track of the belongings
138 	of an inode creation in progress.
139 	This class will make sure everything is cleaned up properly.
140 */
141 class InodeAllocator {
142 public:
143 							InodeAllocator(Transaction& transaction);
144 							~InodeAllocator();
145 
146 			status_t		New(block_run* parentRun, mode_t mode, uint32 flags,
147 								block_run& run, fs_vnode_ops* vnodeOps,
148 								Inode** _inode);
149 			status_t		CreateTree();
150 			status_t		Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags);
151 
152 private:
153 	static	void			_TransactionListener(int32 id, int32 event,
154 								void* _inode);
155 
156 			Transaction*	fTransaction;
157 			block_run		fRun;
158 			Inode*			fInode;
159 };
160 
161 
162 InodeAllocator::InodeAllocator(Transaction& transaction)
163 	:
164 	fTransaction(&transaction),
165 	fInode(NULL)
166 {
167 }
168 
169 
170 InodeAllocator::~InodeAllocator()
171 {
172 	if (fTransaction != NULL) {
173 		Volume* volume = fTransaction->GetVolume();
174 
175 		if (fInode != NULL) {
176 			fInode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
177 				// this unblocks any pending bfs_read_vnode() calls
178 			fInode->Free(*fTransaction);
179 
180 			if (fInode->fTree != NULL)
181 				fTransaction->RemoveListener(fInode->fTree);
182 			fTransaction->RemoveListener(fInode);
183 
184 			remove_vnode(volume->FSVolume(), fInode->ID());
185 		} else
186 			volume->Free(*fTransaction, fRun);
187 	}
188 
189 	delete fInode;
190 }
191 
192 
193 status_t
194 InodeAllocator::New(block_run* parentRun, mode_t mode, uint32 publishFlags,
195 	block_run& run, fs_vnode_ops* vnodeOps, Inode** _inode)
196 {
197 	Volume* volume = fTransaction->GetVolume();
198 
199 	status_t status = volume->AllocateForInode(*fTransaction, parentRun, mode,
200 		fRun);
201 	if (status < B_OK) {
202 		// don't free the space in the destructor, because
203 		// the allocation failed
204 		fTransaction = NULL;
205 		RETURN_ERROR(status);
206 	}
207 
208 	run = fRun;
209 	fInode = new(std::nothrow) Inode(volume, *fTransaction,
210 		volume->ToVnode(run), mode, run);
211 	if (fInode == NULL)
212 		RETURN_ERROR(B_NO_MEMORY);
213 
214 	if (!volume->IsInitializing()
215 		&& (publishFlags & BFS_DO_NOT_PUBLISH_VNODE) == 0) {
216 		status = new_vnode(volume->FSVolume(), fInode->ID(), fInode,
217 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps);
218 		if (status < B_OK) {
219 			delete fInode;
220 			fInode = NULL;
221 			RETURN_ERROR(status);
222 		}
223 	}
224 
225 	fInode->WriteLockInTransaction(*fTransaction);
226 	*_inode = fInode;
227 	return B_OK;
228 }
229 
230 
231 status_t
232 InodeAllocator::CreateTree()
233 {
234 	Volume* volume = fTransaction->GetVolume();
235 
236 	// force S_STR_INDEX to be set, if no type is set
237 	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
238 		fInode->Node().mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
239 
240 	BPlusTree* tree = new(std::nothrow) BPlusTree(*fTransaction, fInode);
241 	if (tree == NULL)
242 		return B_ERROR;
243 
244 	status_t status = tree->InitCheck();
245 	if (status != B_OK) {
246 		delete tree;
247 		return status;
248 	}
249 
250 	fInode->fTree = tree;
251 
252 	if (fInode->IsRegularNode()) {
253 		if (tree->Insert(*fTransaction, ".", fInode->ID()) < B_OK
254 			|| tree->Insert(*fTransaction, "..",
255 					volume->ToVnode(fInode->Parent())) < B_OK)
256 			return B_ERROR;
257 	}
258 	return B_OK;
259 }
260 
261 
262 status_t
263 InodeAllocator::Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags)
264 {
265 	ASSERT(fInode != NULL && fTransaction != NULL);
266 	Volume* volume = fTransaction->GetVolume();
267 
268 	status_t status = fInode->WriteBack(*fTransaction);
269 	if (status < B_OK) {
270 		FATAL(("writing new inode %" B_PRIdINO " failed!\n", fInode->ID()));
271 		return status;
272 	}
273 
274 	// Symbolic links are not published -- the caller needs to do this once
275 	// the contents have been written.
276 	if (!fInode->IsSymLink() && !volume->IsInitializing()
277 		&& (publishFlags & BFS_DO_NOT_PUBLISH_VNODE) == 0) {
278 		status = publish_vnode(volume->FSVolume(), fInode->ID(), fInode,
279 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps, fInode->Mode(),
280 			publishFlags);
281 	}
282 
283 	if (status == B_OK) {
284 		cache_add_transaction_listener(volume->BlockCache(), fTransaction->ID(),
285 			TRANSACTION_ABORTED, &_TransactionListener, fInode);
286 	}
287 
288 	fTransaction = NULL;
289 	fInode = NULL;
290 
291 	return status;
292 }
293 
294 
295 /*static*/ void
296 InodeAllocator::_TransactionListener(int32 id, int32 event, void* _inode)
297 {
298 	Inode* inode = (Inode*)_inode;
299 
300 	if (event == TRANSACTION_ABORTED)
301 		panic("transaction %d aborted, inode %p still around!\n", (int)id, inode);
302 }
303 
304 
305 //	#pragma mark -
306 
307 
308 status_t
309 bfs_inode::InitCheck(Volume* volume) const
310 {
311 	if (Magic1() != INODE_MAGIC1
312 		|| !(Flags() & INODE_IN_USE)
313 		|| inode_num.Length() != 1
314 		// matches inode size?
315 		|| (uint32)InodeSize() != volume->InodeSize()
316 		// parent resides on disk?
317 		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
318 		|| parent.AllocationGroup() < 0
319 		|| parent.Start() > (1L << volume->AllocationGroupShift())
320 		|| parent.Length() != 1
321 		// attributes, too?
322 		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
323 		|| attributes.AllocationGroup() < 0
324 		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
325 		RETURN_ERROR(B_BAD_DATA);
326 
327 	if (Flags() & INODE_DELETED)
328 		return B_NOT_ALLOWED;
329 
330 	// TODO: Add some tests to check the integrity of the other stuff here,
331 	// especially for the data_stream!
332 
333 	return B_OK;
334 }
335 
336 
337 //	#pragma mark - Inode
338 
339 
340 Inode::Inode(Volume* volume, ino_t id)
341 	:
342 	fVolume(volume),
343 	fID(id),
344 	fTree(NULL),
345 	fAttributes(NULL),
346 	fCache(NULL),
347 	fMap(NULL)
348 {
349 	PRINT(("Inode::Inode(volume = %p, id = %Ld) @ %p\n", volume, id, this));
350 
351 	rw_lock_init(&fLock, "bfs inode");
352 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
353 
354 	if (UpdateNodeFromDisk() != B_OK) {
355 		// TODO: the error code gets eaten
356 		return;
357 	}
358 
359 	// these two will help to maintain the indices
360 	fOldSize = Size();
361 	fOldLastModified = LastModified();
362 
363 	if (IsContainer())
364 		fTree = new(std::nothrow) BPlusTree(this);
365 	if (NeedsFileCache()) {
366 		SetFileCache(file_cache_create(fVolume->ID(), ID(), Size()));
367 		SetMap(file_map_create(volume->ID(), ID(), Size()));
368 	}
369 }
370 
371 
372 Inode::Inode(Volume* volume, Transaction& transaction, ino_t id, mode_t mode,
373 		block_run& run)
374 	:
375 	fVolume(volume),
376 	fID(id),
377 	fTree(NULL),
378 	fAttributes(NULL),
379 	fCache(NULL),
380 	fMap(NULL)
381 {
382 	PRINT(("Inode::Inode(volume = %p, transaction = %p, id = %Ld) @ %p\n",
383 		volume, &transaction, id, this));
384 
385 	rw_lock_init(&fLock, "bfs inode");
386 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
387 
388 	NodeGetter node(volume, transaction, this, true);
389 	if (node.Node() == NULL) {
390 		FATAL(("Could not read inode block %" B_PRId64 "!\n", BlockNumber()));
391 		return;
392 	}
393 
394 	memset(&fNode, 0, sizeof(bfs_inode));
395 
396 	// Initialize the bfs_inode structure -- it's not written back to disk
397 	// here, because it will be done so already when the inode could be
398 	// created completely.
399 
400 	Node().magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
401 	Node().inode_num = run;
402 	Node().mode = HOST_ENDIAN_TO_BFS_INT32(mode);
403 	Node().flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
404 
405 	Node().create_time = Node().last_modified_time = Node().status_change_time
406 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
407 
408 	Node().inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
409 
410 	// these two will help to maintain the indices
411 	fOldSize = Size();
412 	fOldLastModified = LastModified();
413 }
414 
415 
416 Inode::~Inode()
417 {
418 	PRINT(("Inode::~Inode() @ %p\n", this));
419 
420 	file_cache_delete(FileCache());
421 	file_map_delete(Map());
422 	delete fTree;
423 
424 	rw_lock_destroy(&fLock);
425 	recursive_lock_destroy(&fSmallDataLock);
426 }
427 
428 
429 status_t
430 Inode::InitCheck(bool checkNode) const
431 {
432 	// test inode magic and flags
433 	if (checkNode) {
434 		status_t status = Node().InitCheck(fVolume);
435 		if (status == B_BUSY)
436 			return B_BUSY;
437 
438 		if (status != B_OK) {
439 			FATAL(("inode at block %" B_PRIdOFF " corrupt!\n", BlockNumber()));
440 			RETURN_ERROR(B_BAD_DATA);
441 		}
442 	}
443 
444 	if (IsContainer()) {
445 		// inodes that have a B+tree
446 		if (fTree == NULL)
447 			RETURN_ERROR(B_NO_MEMORY);
448 
449 		status_t status = fTree->InitCheck();
450 		if (status != B_OK) {
451 			FATAL(("inode tree at block %" B_PRIdOFF " corrupt!\n",
452 				BlockNumber()));
453 			RETURN_ERROR(B_BAD_DATA);
454 		}
455 	}
456 
457 	if (NeedsFileCache() && (fCache == NULL || fMap == NULL))
458 		return B_NO_MEMORY;
459 
460 	return B_OK;
461 }
462 
463 
464 /*!	Adds this inode to the specified transaction. This means that the inode will
465 	be write locked until the transaction ended.
466 	To ensure that the inode will stay valid until that point, an extra reference
467 	is acquired to it as long as this transaction stays active.
468 */
469 void
470 Inode::WriteLockInTransaction(Transaction& transaction)
471 {
472 	// These flags can only change while holding the transaction lock
473 	if ((Flags() & INODE_IN_TRANSACTION) != 0)
474 		return;
475 
476 	// We share the same list link with the removed list, so we have to remove
477 	// the inode from that list here (and add it back when we no longer need it)
478 	if ((Flags() & INODE_DELETED) != 0)
479 		fVolume->RemovedInodes().Remove(this);
480 
481 	if (!fVolume->IsInitializing())
482 		acquire_vnode(fVolume->FSVolume(), ID());
483 
484 	rw_lock_write_lock(&Lock());
485 	Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
486 
487 	transaction.AddListener(this);
488 }
489 
490 
491 status_t
492 Inode::WriteBack(Transaction& transaction)
493 {
494 	NodeGetter node(fVolume, transaction, this);
495 	if (node.WritableNode() == NULL)
496 		return B_IO_ERROR;
497 
498 	memcpy(node.WritableNode(), &Node(), sizeof(bfs_inode));
499 	return B_OK;
500 }
501 
502 
503 status_t
504 Inode::UpdateNodeFromDisk()
505 {
506 	NodeGetter node(fVolume, this);
507 	if (node.Node() == NULL) {
508 		FATAL(("Failed to read block %" B_PRId64 " from disk!\n",
509 			BlockNumber()));
510 		return B_IO_ERROR;
511 	}
512 
513 	memcpy(&fNode, node.Node(), sizeof(bfs_inode));
514 	fNode.flags &= HOST_ENDIAN_TO_BFS_INT32(INODE_PERMANENT_FLAGS);
515 	return B_OK;
516 }
517 
518 
519 status_t
520 Inode::CheckPermissions(int accessMode) const
521 {
522 	// you never have write access to a read-only volume
523 	if ((accessMode & W_OK) != 0 && fVolume->IsReadOnly())
524 		return B_READ_ONLY_DEVICE;
525 
526 	return check_access_permissions(accessMode, Mode(), (gid_t)fNode.GroupID(),
527 		(uid_t)fNode.UserID());
528 }
529 
530 
531 //	#pragma mark - attributes
532 
533 
534 void
535 Inode::_AddIterator(AttributeIterator* iterator)
536 {
537 	RecursiveLocker _(fSmallDataLock);
538 	fIterators.Add(iterator);
539 }
540 
541 
542 void
543 Inode::_RemoveIterator(AttributeIterator* iterator)
544 {
545 	RecursiveLocker _(fSmallDataLock);
546 	fIterators.Remove(iterator);
547 }
548 
549 
550 /*!	Tries to free up "bytes" space in the small_data section by moving
551 	attributes to real files. Used for system attributes like the name.
552 	You need to hold the fSmallDataLock when you call this method
553 */
554 status_t
555 Inode::_MakeSpaceForSmallData(Transaction& transaction, bfs_inode* node,
556 	const char* name, int32 bytes)
557 {
558 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
559 
560 	while (bytes > 0) {
561 		small_data* item = node->SmallDataStart();
562 		small_data* max = NULL;
563 		int32 index = 0, maxIndex = 0;
564 		for (; !item->IsLast(node); item = item->Next(), index++) {
565 			// should not remove those
566 			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
567 				continue;
568 
569 			if (max == NULL || max->Size() < item->Size()) {
570 				maxIndex = index;
571 				max = item;
572 			}
573 
574 			// Remove the first one large enough to free the needed amount of
575 			// bytes
576 			if (bytes < (int32)item->Size())
577 				break;
578 		}
579 
580 		if (item->IsLast(node) || (int32)item->Size() < bytes)
581 			return B_ERROR;
582 
583 		bytes -= max->Size();
584 
585 		// Move the attribute to a real attribute file
586 		// Luckily, this doesn't cause any index updates
587 
588 		Inode* attribute;
589 		status_t status = CreateAttribute(transaction, item->Name(),
590 			item->Type(), &attribute);
591 		if (status != B_OK)
592 			RETURN_ERROR(status);
593 
594 		size_t length = item->DataSize();
595 		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
596 
597 		ReleaseAttribute(attribute);
598 
599 		if (status != B_OK) {
600 			Vnode vnode(fVolume, Attributes());
601 			Inode* attributes;
602 			if (vnode.Get(&attributes) < B_OK
603 				|| attributes->Remove(transaction, name) < B_OK) {
604 				FATAL(("Could not remove newly created attribute!\n"));
605 			}
606 
607 			RETURN_ERROR(status);
608 		}
609 
610 		_RemoveSmallData(node, max, maxIndex);
611 	}
612 	return B_OK;
613 }
614 
615 
616 /*!	Private function which removes the given attribute from the small_data
617 	section.
618 	You need to hold the fSmallDataLock when you call this method
619 */
620 status_t
621 Inode::_RemoveSmallData(bfs_inode* node, small_data* item, int32 index)
622 {
623 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
624 
625 	small_data* next = item->Next();
626 	if (!next->IsLast(node)) {
627 		// find the last attribute
628 		small_data* last = next;
629 		while (!last->IsLast(node))
630 			last = last->Next();
631 
632 		int32 size = (uint8*)last - (uint8*)next;
633 		if (size < 0
634 			|| size > (uint8*)node + fVolume->BlockSize() - (uint8*)next)
635 			return B_BAD_DATA;
636 
637 		memmove(item, next, size);
638 
639 		// Move the "last" one to its new location and
640 		// correctly terminate the small_data section
641 		last = (small_data*)((uint8*)last - ((uint8*)next - (uint8*)item));
642 		memset(last, 0, (uint8*)node + fVolume->BlockSize() - (uint8*)last);
643 	} else
644 		memset(item, 0, item->Size());
645 
646 	// update all current iterators
647 	SinglyLinkedList<AttributeIterator>::Iterator iterator
648 		= fIterators.GetIterator();
649 	while (iterator.HasNext()) {
650 		iterator.Next()->Update(index, -1);
651 	}
652 
653 	return B_OK;
654 }
655 
656 
657 //!	Removes the given attribute from the small_data section.
658 status_t
659 Inode::_RemoveSmallData(Transaction& transaction, NodeGetter& nodeGetter,
660 	const char* name)
661 {
662 	if (name == NULL)
663 		return B_BAD_VALUE;
664 
665 	bfs_inode* node = nodeGetter.WritableNode();
666 	RecursiveLocker locker(fSmallDataLock);
667 
668 	// search for the small_data item
669 
670 	small_data* item = node->SmallDataStart();
671 	int32 index = 0;
672 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
673 		item = item->Next();
674 		index++;
675 	}
676 
677 	if (item->IsLast(node))
678 		return B_ENTRY_NOT_FOUND;
679 
680 	nodeGetter.MakeWritable(transaction);
681 
682 	status_t status = _RemoveSmallData(node, item, index);
683 	if (status == B_OK) {
684 		Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
685 			bfs_inode::ToInode(real_time_clock_usecs()));
686 
687 		status = WriteBack(transaction);
688 	}
689 
690 	return status;
691 }
692 
693 
694 /*!	Try to place the given attribute in the small_data section - if the
695 	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
696 	In that case, the attribute should be written to a real attribute file;
697 	it's the caller's responsibility to remove any existing attributes in the
698 	small data section if that's the case.
699 
700 	Note that you need to write back the inode yourself after having called that
701 	method - it's a bad API decision that it needs a transaction but enforces
702 	you to write back the inode all by yourself, but it's just more efficient
703 	in most cases...
704 */
705 status_t
706 Inode::_AddSmallData(Transaction& transaction, NodeGetter& nodeGetter,
707 	const char* name, uint32 type, off_t pos, const uint8* data, size_t length,
708 	bool force)
709 {
710 	bfs_inode* node = nodeGetter.WritableNode();
711 
712 	if (node == NULL || name == NULL || data == NULL)
713 		return B_BAD_VALUE;
714 
715 	// reject any requests that can't fit into the small_data section
716 	uint32 nameLength = strlen(name);
717 	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + pos + length + 1;
718 	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
719 		return B_DEVICE_FULL;
720 
721 	nodeGetter.MakeWritable(transaction);
722 	RecursiveLocker locker(fSmallDataLock);
723 
724 	// Find the last item or one with the same name we have to add
725 	small_data* item = node->SmallDataStart();
726 	int32 index = 0;
727 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
728 		item = item->Next();
729 		index++;
730 	}
731 
732 	// is the attribute already in the small_data section?
733 	// then just replace the data part of that one
734 	if (!item->IsLast(node)) {
735 		// find last attribute
736 		small_data* last = item;
737 		while (!last->IsLast(node))
738 			last = last->Next();
739 
740 		// try to change the attributes value
741 		if (item->data_size > pos + length
742 			|| force
743 			|| ((uint8*)last + pos + length - item->DataSize())
744 					<= ((uint8*)node + fVolume->InodeSize())) {
745 			// Make room for the new attribute if needed (and we are forced
746 			// to do so)
747 			if (force && ((uint8*)last + pos + length - item->DataSize())
748 					> ((uint8*)node + fVolume->InodeSize())) {
749 				// We also take the free space at the end of the small_data
750 				// section into account, and request only what's really needed
751 				uint32 needed = pos + length - item->DataSize() -
752 					(uint32)((uint8*)node + fVolume->InodeSize()
753 						- (uint8*)last);
754 
755 				if (_MakeSpaceForSmallData(transaction, node, name, needed)
756 						!= B_OK)
757 					return B_ERROR;
758 
759 				// reset our pointers
760 				item = node->SmallDataStart();
761 				index = 0;
762 				while (!item->IsLast(node) && strcmp(item->Name(), name)) {
763 					item = item->Next();
764 					index++;
765 				}
766 
767 				last = item;
768 				while (!last->IsLast(node))
769 					last = last->Next();
770 			}
771 
772 			size_t oldDataSize = item->DataSize();
773 
774 			// Normally, we can just overwrite the attribute data as the size
775 			// is specified by the type and does not change that often
776 			if (pos + length != item->DataSize()) {
777 				// move the attributes after the current one
778 				small_data* next = item->Next();
779 				if (!next->IsLast(node)) {
780 					memmove((uint8*)item + spaceNeeded, next,
781 						(uint8*)last - (uint8*)next);
782 				}
783 
784 				// Move the "last" one to its new location and
785 				// correctly terminate the small_data section
786 				last = (small_data*)((uint8*)last
787 					- ((uint8*)next - ((uint8*)item + spaceNeeded)));
788 				if ((uint8*)last < (uint8*)node + fVolume->BlockSize()) {
789 					memset(last, 0, (uint8*)node + fVolume->BlockSize()
790 						- (uint8*)last);
791 				}
792 
793 				item->data_size = HOST_ENDIAN_TO_BFS_INT16(pos + length);
794 			}
795 
796 			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
797 
798 			if ((uint64)oldDataSize < (uint64)pos) {
799 				// Fill gap with zeros
800 				memset(item->Data() + oldDataSize, 0, pos - oldDataSize);
801 			}
802 			if (user_memcpy(item->Data() + pos, data, length) < B_OK)
803 				return B_BAD_ADDRESS;
804 			item->Data()[pos + length] = '\0';
805 
806 			return B_OK;
807 		}
808 
809 		return B_DEVICE_FULL;
810 	}
811 
812 	// try to add the new attribute!
813 
814 	if ((uint8*)item + spaceNeeded > (uint8*)node + fVolume->InodeSize()) {
815 		// there is not enough space for it!
816 		if (!force)
817 			return B_DEVICE_FULL;
818 
819 		// make room for the new attribute
820 		if (_MakeSpaceForSmallData(transaction, node, name, spaceNeeded) < B_OK)
821 			return B_ERROR;
822 
823 		// get new last item!
824 		item = node->SmallDataStart();
825 		index = 0;
826 		while (!item->IsLast(node)) {
827 			item = item->Next();
828 			index++;
829 		}
830 	}
831 
832 	memset(item, 0, spaceNeeded);
833 	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
834 	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
835 	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
836 	strcpy(item->Name(), name);
837 	if (user_memcpy(item->Data() + pos, data, length) < B_OK)
838 		return B_BAD_ADDRESS;
839 
840 	// correctly terminate the small_data section
841 	item = item->Next();
842 	if (!item->IsLast(node))
843 		memset(item, 0, (uint8*)node + fVolume->InodeSize() - (uint8*)item);
844 
845 	// update all current iterators
846 	SinglyLinkedList<AttributeIterator>::Iterator iterator
847 		= fIterators.GetIterator();
848 	while (iterator.HasNext()) {
849 		iterator.Next()->Update(index, 1);
850 	}
851 
852 	return B_OK;
853 }
854 
855 
856 /*!	Iterates through the small_data section of an inode.
857 	To start at the beginning of this section, you let smallData
858 	point to NULL, like:
859 		small_data* data = NULL;
860 		while (inode->GetNextSmallData(&data) { ... }
861 
862 	This function is reentrant and doesn't allocate any memory;
863 	you can safely stop calling it at any point (you don't need
864 	to iterate through the whole list).
865 	You need to hold the fSmallDataLock when you call this method
866 */
867 status_t
868 Inode::_GetNextSmallData(bfs_inode* node, small_data** _smallData) const
869 {
870 	if (node == NULL)
871 		RETURN_ERROR(B_BAD_VALUE);
872 
873 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
874 
875 	small_data* data = *_smallData;
876 
877 	// begin from the start?
878 	if (data == NULL)
879 		data = node->SmallDataStart();
880 	else
881 		data = data->Next();
882 
883 	// is already last item?
884 	if (data->IsLast(node))
885 		return B_ENTRY_NOT_FOUND;
886 
887 	*_smallData = data;
888 
889 	return B_OK;
890 }
891 
892 
893 /*!	Finds the attribute "name" in the small data section, and
894 	returns a pointer to it (or NULL if it doesn't exist).
895 	You need to hold the fSmallDataLock when you call this method
896 */
897 small_data*
898 Inode::FindSmallData(const bfs_inode* node, const char* name) const
899 {
900 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
901 
902 	small_data* smallData = NULL;
903 	while (_GetNextSmallData(const_cast<bfs_inode*>(node), &smallData)
904 			== B_OK) {
905 		if (!strcmp(smallData->Name(), name))
906 			return smallData;
907 	}
908 	return NULL;
909 }
910 
911 
912 /*!	Returns a pointer to the node's name if present in the small data
913 	section, NULL otherwise.
914 	You need to hold the fSmallDataLock when you call this method
915 */
916 const char*
917 Inode::Name(const bfs_inode* node) const
918 {
919 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
920 
921 	small_data* smallData = NULL;
922 	while (_GetNextSmallData((bfs_inode*)node, &smallData) == B_OK) {
923 		if (*smallData->Name() == FILE_NAME_NAME
924 			&& smallData->NameSize() == FILE_NAME_NAME_LENGTH)
925 			return (const char*)smallData->Data();
926 	}
927 	return NULL;
928 }
929 
930 
931 /*!	Copies the node's name into the provided buffer.
932 	The buffer should be B_FILE_NAME_LENGTH bytes large.
933 */
934 status_t
935 Inode::GetName(char* buffer, size_t size) const
936 {
937 	NodeGetter node(fVolume, this);
938 	if (node.Node() == NULL)
939 		return B_IO_ERROR;
940 
941 	RecursiveLocker locker(fSmallDataLock);
942 
943 	const char* name = Name(node.Node());
944 	if (name == NULL)
945 		return B_ENTRY_NOT_FOUND;
946 
947 	strlcpy(buffer, name, size);
948 	return B_OK;
949 }
950 
951 
952 /*!	Changes or set the name of a file: in the inode small_data section only, it
953 	doesn't change it in the parent directory's b+tree.
954 	Note that you need to write back the inode yourself after having called
955 	that method. It suffers from the same API decision as AddSmallData() does
956 	(and for the same reason).
957 */
958 status_t
959 Inode::SetName(Transaction& transaction, const char* name)
960 {
961 	if (name == NULL || *name == '\0')
962 		return B_BAD_VALUE;
963 
964 	NodeGetter node(fVolume, transaction, this);
965 	if (node.Node() == NULL)
966 		return B_IO_ERROR;
967 
968 	const char nameTag[2] = {FILE_NAME_NAME, 0};
969 
970 	return _AddSmallData(transaction, node, nameTag, FILE_NAME_TYPE, 0,
971 		(uint8*)name, strlen(name), true);
972 }
973 
974 
975 status_t
976 Inode::_RemoveAttribute(Transaction& transaction, const char* name,
977 	bool hasIndex, Index* index)
978 {
979 	// remove the attribute file if it exists
980 	Vnode vnode(fVolume, Attributes());
981 	Inode* attributes;
982 	status_t status = vnode.Get(&attributes);
983 	if (status < B_OK)
984 		return status;
985 
986 	// update index
987 	if (index != NULL) {
988 		Inode* attribute;
989 		if ((hasIndex || fVolume->CheckForLiveQuery(name))
990 			&& GetAttribute(name, &attribute) == B_OK) {
991 			uint8 data[MAX_INDEX_KEY_LENGTH];
992 			size_t length = MAX_INDEX_KEY_LENGTH;
993 			if (attribute->ReadAt(0, data, &length) == B_OK) {
994 				index->Update(transaction, name, attribute->Type(), data,
995 					length, NULL, 0, this);
996 			}
997 
998 			ReleaseAttribute(attribute);
999 		}
1000 	}
1001 
1002 	if ((status = attributes->Remove(transaction, name)) < B_OK)
1003 		return status;
1004 
1005 	if (attributes->IsEmpty()) {
1006 		attributes->WriteLockInTransaction(transaction);
1007 
1008 		// remove attribute directory (don't fail if that can't be done)
1009 		if (remove_vnode(fVolume->FSVolume(), attributes->ID()) == B_OK) {
1010 			// update the inode, so that no one will ever doubt it's deleted :-)
1011 			attributes->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1012 			if (attributes->WriteBack(transaction) == B_OK) {
1013 				Attributes().SetTo(0, 0, 0);
1014 				WriteBack(transaction);
1015 			} else {
1016 				unremove_vnode(fVolume->FSVolume(), attributes->ID());
1017 				attributes->Node().flags
1018 					&= ~HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1019 			}
1020 		}
1021 	}
1022 
1023 	return status;
1024 }
1025 
1026 
1027 /*!	Reads data from the specified attribute.
1028 	This is a high-level attribute function that understands attributes
1029 	in the small_data section as well as real attribute files.
1030 */
1031 status_t
1032 Inode::ReadAttribute(const char* name, int32 type, off_t pos, uint8* buffer,
1033 	size_t* _length)
1034 {
1035 	if (pos < 0)
1036 		pos = 0;
1037 
1038 	// search in the small_data section (which has to be locked first)
1039 	{
1040 		NodeGetter node(fVolume, this);
1041 		if (node.Node() == NULL)
1042 			return B_IO_ERROR;
1043 
1044 		RecursiveLocker locker(fSmallDataLock);
1045 
1046 		small_data* smallData = FindSmallData(node.Node(), name);
1047 		if (smallData != NULL) {
1048 			size_t length = *_length;
1049 			if (pos >= smallData->data_size) {
1050 				*_length = 0;
1051 				return B_OK;
1052 			}
1053 			if (length + pos > smallData->DataSize())
1054 				length = smallData->DataSize() - pos;
1055 
1056 			status_t error = user_memcpy(buffer, smallData->Data() + pos,
1057 				length);
1058 			*_length = length;
1059 			return error;
1060 		}
1061 	}
1062 
1063 	// search in the attribute directory
1064 	Inode* attribute;
1065 	status_t status = GetAttribute(name, &attribute);
1066 	if (status == B_OK) {
1067 		status = attribute->ReadAt(pos, (uint8*)buffer, _length);
1068 
1069 		ReleaseAttribute(attribute);
1070 	}
1071 
1072 	RETURN_ERROR(status);
1073 }
1074 
1075 
1076 /*!	Writes data to the specified attribute.
1077 	This is a high-level attribute function that understands attributes
1078 	in the small_data section as well as real attribute files.
1079 */
1080 status_t
1081 Inode::WriteAttribute(Transaction& transaction, const char* name, int32 type,
1082 	off_t pos, const uint8* buffer, size_t* _length, bool* _created)
1083 {
1084 	if (pos < 0)
1085 		return B_BAD_VALUE;
1086 
1087 	// needed to maintain the index
1088 	uint8 oldBuffer[MAX_INDEX_KEY_LENGTH];
1089 	uint8* oldData = NULL;
1090 	size_t oldLength = 0;
1091 	bool created = false;
1092 
1093 	// TODO: we actually depend on that the contents of "buffer" are constant.
1094 	// If they get changed during the write (hey, user programs), we may mess
1095 	// up our index trees!
1096 	// TODO: for attribute files, we need to log the first
1097 	// MAX_INDEX_KEY_LENGTH bytes of the data stream, or the same as above
1098 	// might happen.
1099 
1100 	Index index(fVolume);
1101 	bool hasIndex = index.SetTo(name) == B_OK;
1102 
1103 	Inode* attribute = NULL;
1104 	status_t status = B_OK;
1105 
1106 	if (GetAttribute(name, &attribute) != B_OK) {
1107 		// No attribute inode exists yet
1108 
1109 		// save the old attribute data
1110 		NodeGetter node(fVolume, transaction, this);
1111 		if (node.Node() == NULL)
1112 			return B_IO_ERROR;
1113 
1114 		recursive_lock_lock(&fSmallDataLock);
1115 
1116 		small_data* smallData = FindSmallData(node.Node(), name);
1117 		if (smallData != NULL) {
1118 			oldLength = smallData->DataSize();
1119 			if (oldLength > 0) {
1120 				if (oldLength > MAX_INDEX_KEY_LENGTH)
1121 					oldLength = MAX_INDEX_KEY_LENGTH;
1122 				memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
1123 			}
1124 		} else
1125 			created = true;
1126 
1127 		recursive_lock_unlock(&fSmallDataLock);
1128 
1129 		// if the attribute doesn't exist yet (as a file), try to put it in the
1130 		// small_data section first - if that fails (due to insufficent space),
1131 		// create a real attribute file
1132 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1133 			*_length);
1134 		if (status == B_DEVICE_FULL) {
1135 			if (smallData != NULL) {
1136 				// remove the old attribute from the small data section - there
1137 				// is no space left for the new data
1138 				status = _RemoveSmallData(transaction, node, name);
1139 			} else
1140 				status = B_OK;
1141 
1142 			if (status == B_OK)
1143 				status = CreateAttribute(transaction, name, type, &attribute);
1144 			if (status != B_OK)
1145 				RETURN_ERROR(status);
1146 
1147 			created = true;
1148 		} else if (status == B_OK) {
1149 			// Update status time on attribute write
1150 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1151 				bfs_inode::ToInode(real_time_clock_usecs()));
1152 
1153 			status = WriteBack(transaction);
1154 		}
1155 	}
1156 
1157 	if (attribute != NULL) {
1158 		WriteLocker writeLocker(attribute->fLock);
1159 
1160 		if (hasIndex || fVolume->CheckForLiveQuery(name)) {
1161 			// Save the old attribute data (if this fails, oldLength will
1162 			// reflect it)
1163 			while (attribute->Size() > 0) {
1164 				bigtime_t oldModified = attribute->LastModified();
1165 				writeLocker.Unlock();
1166 
1167 				oldLength = MAX_INDEX_KEY_LENGTH;
1168 				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
1169 					oldData = oldBuffer;
1170 
1171 				writeLocker.Lock();
1172 
1173 				// Read until the data hasn't changed in between
1174 				if (oldModified == attribute->LastModified())
1175 					break;
1176 
1177 				oldLength = 0;
1178 			}
1179 		}
1180 
1181 		// check if the data fits into the small_data section again
1182 		NodeGetter node(fVolume, transaction, this);
1183 		if (node.Node() == NULL)
1184 			return B_IO_ERROR;
1185 
1186 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1187 			*_length);
1188 
1189 		if (status == B_OK) {
1190 			// it does - remove its file
1191 			writeLocker.Unlock();
1192 			status = _RemoveAttribute(transaction, name, false, NULL);
1193 		} else {
1194 			// The attribute type might have been changed - we need to
1195 			// adopt the new one
1196 			attribute->Node().type = HOST_ENDIAN_TO_BFS_INT32(type);
1197 			status = attribute->WriteBack(transaction);
1198 			writeLocker.Unlock();
1199 
1200 			if (status == B_OK) {
1201 				status = attribute->WriteAt(transaction, pos, buffer,
1202 					_length);
1203 			}
1204 		}
1205 
1206 		if (status == B_OK) {
1207 			// Update status time on attribute write
1208 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1209 				bfs_inode::ToInode(real_time_clock_usecs()));
1210 
1211 			status = WriteBack(transaction);
1212 		}
1213 
1214 		attribute->WriteLockInTransaction(transaction);
1215 		ReleaseAttribute(attribute);
1216 	}
1217 
1218 	// TODO: find a better way than this "pos" thing (the begin of the old key
1219 	//	must be copied to the start of the new one for a comparison)
1220 	if (status == B_OK && pos == 0) {
1221 		// Index only the first MAX_INDEX_KEY_LENGTH bytes
1222 		uint16 length = *_length;
1223 		if (length > MAX_INDEX_KEY_LENGTH)
1224 			length = MAX_INDEX_KEY_LENGTH;
1225 
1226 		uint8 indexBuffer[MAX_INDEX_KEY_LENGTH];
1227 		// _AddSmallData() already read the buffer
1228 		user_memcpy(indexBuffer, buffer, length);
1229 
1230 		// Update index. Note, Index::Update() may be called even if
1231 		// initializing the index failed - it will just update the live
1232 		// queries in this case
1233 		if (pos < length || (uint64)pos < (uint64)oldLength) {
1234 			index.Update(transaction, name, type, oldData, oldLength,
1235 				indexBuffer, length, this);
1236 		}
1237 	}
1238 
1239 	if (_created != NULL)
1240 		*_created = created;
1241 
1242 	return status;
1243 }
1244 
1245 
1246 /*!	Removes the specified attribute from the inode.
1247 	This is a high-level attribute function that understands attributes
1248 	in the small_data section as well as real attribute files.
1249 */
1250 status_t
1251 Inode::RemoveAttribute(Transaction& transaction, const char* name)
1252 {
1253 	Index index(fVolume);
1254 	bool hasIndex = index.SetTo(name) == B_OK;
1255 	NodeGetter node(fVolume, this);
1256 	if (node.Node() == NULL)
1257 		return B_IO_ERROR;
1258 
1259 	// update index for attributes in the small_data section
1260 	{
1261 		RecursiveLocker _(fSmallDataLock);
1262 
1263 		small_data* smallData = FindSmallData(node.Node(), name);
1264 		if (smallData != NULL) {
1265 			uint32 length = smallData->DataSize();
1266 			if (length > MAX_INDEX_KEY_LENGTH)
1267 				length = MAX_INDEX_KEY_LENGTH;
1268 			index.Update(transaction, name, smallData->Type(),
1269 				smallData->Data(), length, NULL, 0, this);
1270 		}
1271 	}
1272 
1273 	status_t status = _RemoveSmallData(transaction, node, name);
1274 	if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
1275 		// remove the attribute file if it exists
1276 		status = _RemoveAttribute(transaction, name, hasIndex, &index);
1277 		if (status == B_OK) {
1278 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1279 				bfs_inode::ToInode(real_time_clock_usecs()));
1280 			WriteBack(transaction);
1281 		}
1282 	}
1283 
1284 	return status;
1285 }
1286 
1287 
1288 /*!	Returns the attribute inode with the specified \a name, in case it exists.
1289 	This method can only return real attribute files; the attributes in the
1290 	small data section are ignored.
1291 */
1292 status_t
1293 Inode::GetAttribute(const char* name, Inode** _attribute)
1294 {
1295 	// does this inode even have attributes?
1296 	if (Attributes().IsZero())
1297 		return B_ENTRY_NOT_FOUND;
1298 
1299 	Vnode vnode(fVolume, Attributes());
1300 	Inode* attributes;
1301 	if (vnode.Get(&attributes) < B_OK) {
1302 		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n",
1303 			name));
1304 		return B_ERROR;
1305 	}
1306 
1307 	BPlusTree* tree = attributes->Tree();
1308 	if (tree == NULL)
1309 		return B_BAD_VALUE;
1310 
1311 	InodeReadLocker locker(attributes);
1312 
1313 	ino_t id;
1314 	status_t status = tree->Find((uint8*)name, (uint16)strlen(name), &id);
1315 	if (status == B_OK) {
1316 		Vnode vnode(fVolume, id);
1317 		Inode* inode;
1318 		// Check if the attribute is really an attribute
1319 		if (vnode.Get(&inode) != B_OK || !inode->IsAttribute())
1320 			return B_ERROR;
1321 
1322 		*_attribute = inode;
1323 		vnode.Keep();
1324 		return B_OK;
1325 	}
1326 
1327 	return status;
1328 }
1329 
1330 
1331 void
1332 Inode::ReleaseAttribute(Inode* attribute)
1333 {
1334 	if (attribute == NULL)
1335 		return;
1336 
1337 	put_vnode(fVolume->FSVolume(), attribute->ID());
1338 }
1339 
1340 
1341 status_t
1342 Inode::CreateAttribute(Transaction& transaction, const char* name, uint32 type,
1343 	Inode** attribute)
1344 {
1345 	// do we need to create the attribute directory first?
1346 	if (Attributes().IsZero()) {
1347 		status_t status = Inode::Create(transaction, this, NULL,
1348 			S_ATTR_DIR | S_DIRECTORY | 0666, 0, 0, NULL);
1349 		if (status < B_OK)
1350 			RETURN_ERROR(status);
1351 	}
1352 	Vnode vnode(fVolume, Attributes());
1353 	Inode* attributes;
1354 	if (vnode.Get(&attributes) < B_OK)
1355 		return B_ERROR;
1356 
1357 	// Inode::Create() locks the inode for us
1358 	return Inode::Create(transaction, attributes, name,
1359 		S_ATTR | S_FILE | 0666, 0, type, NULL, NULL, attribute);
1360 }
1361 
1362 
1363 //	#pragma mark - directory tree
1364 
1365 
1366 bool
1367 Inode::IsEmpty()
1368 {
1369 	TreeIterator iterator(fTree);
1370 
1371 	uint32 count = 0;
1372 	char name[MAX_INDEX_KEY_LENGTH + 1];
1373 	uint16 length;
1374 	ino_t id;
1375 	while (iterator.GetNextEntry(name, &length, MAX_INDEX_KEY_LENGTH + 1,
1376 			&id) == B_OK) {
1377 		if ((Mode() & (S_ATTR_DIR | S_INDEX_DIR)) != 0)
1378 			return false;
1379 
1380 		// Unlike index and attribute directories, directories
1381 		// for standard files always contain ".", and "..", so
1382 		// we need to ignore those two
1383 		if (++count > 2 || (strcmp(".", name) != 0 && strcmp("..", name) != 0))
1384 			return false;
1385 	}
1386 	return true;
1387 }
1388 
1389 
1390 status_t
1391 Inode::ContainerContentsChanged(Transaction& transaction)
1392 {
1393 	ASSERT(!InLastModifiedIndex());
1394 
1395 	Node().last_modified_time = Node().status_change_time
1396 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1397 
1398 	return WriteBack(transaction);
1399 }
1400 
1401 
1402 //	#pragma mark - data stream
1403 
1404 
1405 /*!	Computes the number of bytes this inode occupies in the file system.
1406 	This includes the file meta blocks used for maintaining its data stream.
1407 
1408 	TODO: However, the attributes in extra files are not really accounted for;
1409 	depending on the speed penalty, this should be changed, though (the value
1410 	could be cached in the inode structure or Inode object, though).
1411 */
1412 off_t
1413 Inode::AllocatedSize() const
1414 {
1415 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0) {
1416 		// This symlink does not have a data stream
1417 		return Node().InodeSize();
1418 	}
1419 
1420 	const data_stream& data = Node().data;
1421 	uint32 blockSize = fVolume->BlockSize();
1422 	off_t size = blockSize;
1423 
1424 	if (data.MaxDoubleIndirectRange() != 0) {
1425 		off_t doubleIndirectSize = data.MaxDoubleIndirectRange()
1426 			- data.MaxIndirectRange();
1427 		int32 indirectSize = double_indirect_max_indirect_size(
1428 			data.double_indirect.Length(), fVolume->BlockSize());
1429 
1430 		size += (2 * data.double_indirect.Length()
1431 				+ doubleIndirectSize / indirectSize)
1432 			* blockSize + data.MaxDoubleIndirectRange();
1433 	} else if (data.MaxIndirectRange() != 0)
1434 		size += data.indirect.Length() + data.MaxIndirectRange();
1435 	else
1436 		size += data.MaxDirectRange();
1437 
1438 	if (!Node().attributes.IsZero()) {
1439 		// TODO: to make this exact, we'd had to count all attributes
1440 		size += 2 * blockSize;
1441 			// 2 blocks, one for the attributes inode, one for its B+tree
1442 	}
1443 
1444 	return size;
1445 }
1446 
1447 
1448 /*!	Finds the block_run where "pos" is located in the data_stream of
1449 	the inode.
1450 	If successful, "offset" will then be set to the file offset
1451 	of the block_run returned; so "pos - offset" is for the block_run
1452 	what "pos" is for the whole stream.
1453 	The caller has to make sure that "pos" is inside the stream.
1454 */
1455 status_t
1456 Inode::FindBlockRun(off_t pos, block_run& run, off_t& offset)
1457 {
1458 	data_stream* data = &Node().data;
1459 
1460 	// find matching block run
1461 
1462 	if (data->MaxIndirectRange() > 0 && pos >= data->MaxDirectRange()) {
1463 		if (data->MaxDoubleIndirectRange() > 0
1464 			&& pos >= data->MaxIndirectRange()) {
1465 			// access to double indirect blocks
1466 
1467 			CachedBlock cached(fVolume);
1468 
1469 			int32 runsPerBlock;
1470 			int32 directSize;
1471 			int32 indirectSize;
1472 			get_double_indirect_sizes(data->double_indirect.Length(),
1473 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1474 			if (directSize <= 0 || indirectSize <= 0)
1475 				RETURN_ERROR(B_BAD_DATA);
1476 
1477 			off_t start = pos - data->MaxIndirectRange();
1478 			int32 index = start / indirectSize;
1479 
1480 			block_run* indirect = (block_run*)cached.SetTo(
1481 				fVolume->ToBlock(data->double_indirect) + index / runsPerBlock);
1482 			if (indirect == NULL)
1483 				RETURN_ERROR(B_ERROR);
1484 
1485 			int32 current = (start % indirectSize) / directSize;
1486 
1487 			indirect = (block_run*)cached.SetTo(
1488 				fVolume->ToBlock(indirect[index % runsPerBlock])
1489 				+ current / runsPerBlock);
1490 			if (indirect == NULL)
1491 				RETURN_ERROR(B_ERROR);
1492 
1493 			run = indirect[current % runsPerBlock];
1494 			if (run.Length() != data->double_indirect.Length())
1495 				RETURN_ERROR(B_BAD_DATA);
1496 
1497 			offset = data->MaxIndirectRange() + (index * indirectSize)
1498 				+ (current * directSize);
1499 		} else {
1500 			// access to indirect blocks
1501 
1502 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1503 			off_t runBlockEnd = data->MaxDirectRange();
1504 
1505 			CachedBlock cached(fVolume);
1506 			off_t block = fVolume->ToBlock(data->indirect);
1507 
1508 			for (int32 i = 0; i < data->indirect.Length(); i++) {
1509 				block_run* indirect = (block_run*)cached.SetTo(block + i);
1510 				if (indirect == NULL)
1511 					RETURN_ERROR(B_IO_ERROR);
1512 
1513 				int32 current = -1;
1514 				while (++current < runsPerBlock) {
1515 					if (indirect[current].IsZero())
1516 						break;
1517 
1518 					runBlockEnd += (uint32)indirect[current].Length()
1519 						<< cached.BlockShift();
1520 					if (runBlockEnd > pos) {
1521 						run = indirect[current];
1522 						offset = runBlockEnd - ((uint32)run.Length()
1523 							<< cached.BlockShift());
1524 						return fVolume->ValidateBlockRun(run);
1525 					}
1526 				}
1527 			}
1528 			RETURN_ERROR(B_ERROR);
1529 		}
1530 	} else {
1531 		// access from direct blocks
1532 
1533 		off_t runBlockEnd = 0LL;
1534 		int32 current = -1;
1535 
1536 		while (++current < NUM_DIRECT_BLOCKS) {
1537 			if (data->direct[current].IsZero())
1538 				break;
1539 
1540 			runBlockEnd += (uint32)data->direct[current].Length()
1541 				<< fVolume->BlockShift();
1542 			if (runBlockEnd > pos) {
1543 				run = data->direct[current];
1544 				offset = runBlockEnd
1545 					- ((uint32)run.Length() << fVolume->BlockShift());
1546 				return fVolume->ValidateBlockRun(run);
1547 			}
1548 		}
1549 
1550 		return B_ENTRY_NOT_FOUND;
1551 	}
1552 	return fVolume->ValidateBlockRun(run);
1553 }
1554 
1555 
1556 status_t
1557 Inode::ReadAt(off_t pos, uint8* buffer, size_t* _length)
1558 {
1559 	return file_cache_read(FileCache(), NULL, pos, buffer, _length);
1560 }
1561 
1562 
1563 status_t
1564 Inode::WriteAt(Transaction& transaction, off_t pos, const uint8* buffer,
1565 	size_t* _length)
1566 {
1567 	InodeReadLocker locker(this);
1568 
1569 	// update the last modification time in memory, it will be written
1570 	// back to the inode, and the index when the file is closed
1571 	// TODO: should update the internal last modified time only at this point!
1572 	Node().last_modified_time = Node().status_change_time
1573 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1574 
1575 	// TODO: support INODE_LOGGED!
1576 
1577 	size_t length = *_length;
1578 	bool changeSize = (uint64)pos + (uint64)length > (uint64)Size();
1579 
1580 	// set/check boundaries for pos/length
1581 	if (pos < 0)
1582 		return B_BAD_VALUE;
1583 
1584 	locker.Unlock();
1585 
1586 	// the transaction doesn't have to be started already
1587 	if (changeSize && !transaction.IsStarted())
1588 		transaction.Start(fVolume, BlockNumber());
1589 
1590 	WriteLocker writeLocker(fLock);
1591 
1592 	// Work around possible race condition: Someone might have shrunken the file
1593 	// while we had no lock.
1594 	if (!transaction.IsStarted()
1595 		&& (uint64)pos + (uint64)length > (uint64)Size()) {
1596 		writeLocker.Unlock();
1597 		transaction.Start(fVolume, BlockNumber());
1598 		writeLocker.Lock();
1599 	}
1600 
1601 	off_t oldSize = Size();
1602 
1603 	if ((uint64)pos + (uint64)length > (uint64)oldSize) {
1604 		// let's grow the data stream to the size needed
1605 		status_t status = SetFileSize(transaction, pos + length);
1606 		if (status != B_OK) {
1607 			*_length = 0;
1608 			WriteLockInTransaction(transaction);
1609 			RETURN_ERROR(status);
1610 		}
1611 		// TODO: In theory we would need to update the file size
1612 		// index here as part of the current transaction - this might just
1613 		// be a bit too expensive, but worth a try.
1614 
1615 		// we need to write back the inode here because it has to
1616 		// go into this transaction (we cannot wait until the file
1617 		// is closed)
1618 		status = WriteBack(transaction);
1619 		if (status != B_OK) {
1620 			WriteLockInTransaction(transaction);
1621 			return status;
1622 		}
1623 	}
1624 
1625 	writeLocker.Unlock();
1626 
1627 	if (oldSize < pos)
1628 		FillGapWithZeros(oldSize, pos);
1629 
1630 	// If we don't want to write anything, we can now return (we may
1631 	// just have changed the file size using the position parameter)
1632 	if (length == 0)
1633 		return B_OK;
1634 
1635 	status_t status = file_cache_write(FileCache(), NULL, pos, buffer, _length);
1636 
1637 	if (transaction.IsStarted())
1638 		WriteLockInTransaction(transaction);
1639 
1640 	return status;
1641 }
1642 
1643 
1644 /*!	Fills the gap between the old file size and the new file size
1645 	with zeros.
1646 	It's more or less a copy of Inode::WriteAt() but it can handle
1647 	length differences of more than just 4 GB, and it never uses
1648 	the log, even if the INODE_LOGGED flag is set.
1649 */
1650 status_t
1651 Inode::FillGapWithZeros(off_t pos, off_t newSize)
1652 {
1653 	while (pos < newSize) {
1654 		size_t size;
1655 		if (newSize > pos + 1024 * 1024 * 1024)
1656 			size = 1024 * 1024 * 1024;
1657 		else
1658 			size = newSize - pos;
1659 
1660 		status_t status = file_cache_write(FileCache(), NULL, pos, NULL, &size);
1661 		if (status < B_OK)
1662 			return status;
1663 
1664 		pos += size;
1665 	}
1666 
1667 	return B_OK;
1668 }
1669 
1670 
1671 /*!	Allocates \a length blocks, and clears their contents. Growing
1672 	the indirect and double indirect range uses this method.
1673 	The allocated block_run is saved in "run"
1674 */
1675 status_t
1676 Inode::_AllocateBlockArray(Transaction& transaction, block_run& run,
1677 	size_t length, bool variableSize)
1678 {
1679 	if (!run.IsZero())
1680 		return B_BAD_VALUE;
1681 
1682 	status_t status = fVolume->Allocate(transaction, this, length, run,
1683 		variableSize ? 1 : length);
1684 	if (status != B_OK)
1685 		return status;
1686 
1687 	// make sure those blocks are empty
1688 	CachedBlock cached(fVolume);
1689 	off_t block = fVolume->ToBlock(run);
1690 
1691 	for (int32 i = 0; i < run.Length(); i++) {
1692 		block_run* runs = (block_run*)cached.SetToWritable(transaction,
1693 			block + i, true);
1694 		if (runs == NULL)
1695 			return B_IO_ERROR;
1696 	}
1697 	return B_OK;
1698 }
1699 
1700 
1701 /*!	Grows the stream to \a size, and fills the direct/indirect/double indirect
1702 	ranges with the runs.
1703 	This method will also determine the size of the preallocation, if any.
1704 */
1705 status_t
1706 Inode::_GrowStream(Transaction& transaction, off_t size)
1707 {
1708 	data_stream* data = &Node().data;
1709 
1710 	// is the data stream already large enough to hold the new size?
1711 	// (can be the case with preallocated blocks)
1712 	if (size < data->MaxDirectRange()
1713 		|| size < data->MaxIndirectRange()
1714 		|| size < data->MaxDoubleIndirectRange()) {
1715 		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1716 		return B_OK;
1717 	}
1718 
1719 	// how many bytes are still needed? (unused ranges are always zero)
1720 	uint16 minimum = 1;
1721 	off_t bytes;
1722 	if (data->Size() < data->MaxDoubleIndirectRange()) {
1723 		bytes = size - data->MaxDoubleIndirectRange();
1724 		// The double indirect range can only handle multiples of
1725 		// its base length
1726 		minimum = data->double_indirect.Length();
1727 	} else if (data->Size() < data->MaxIndirectRange())
1728 		bytes = size - data->MaxIndirectRange();
1729 	else if (data->Size() < data->MaxDirectRange())
1730 		bytes = size - data->MaxDirectRange();
1731 	else {
1732 		// no preallocation left to be used
1733 		bytes = size - data->Size();
1734 		if (data->MaxDoubleIndirectRange() > 0)
1735 			minimum = data->double_indirect.Length();
1736 	}
1737 
1738 	// do we have enough free blocks on the disk?
1739 	off_t blocksNeeded = (bytes + fVolume->BlockSize() - 1)
1740 		>> fVolume->BlockShift();
1741 	if (blocksNeeded > fVolume->FreeBlocks())
1742 		return B_DEVICE_FULL;
1743 
1744 	off_t blocksRequested = blocksNeeded;
1745 		// because of preallocations and partial allocations, the number of
1746 		// blocks we need to allocate may be different from the one we request
1747 		// from the block allocator
1748 
1749 	// Should we preallocate some blocks?
1750 	// Attributes, attribute directories, and long symlinks usually won't get
1751 	// that big, and should stay close to the inode - preallocating could be
1752 	// counterproductive.
1753 	// Also, if free disk space is tight, don't preallocate.
1754 	if (!IsAttribute() && !IsAttributeDirectory() && !IsSymLink()
1755 		&& fVolume->FreeBlocks() > 128) {
1756 		off_t roundTo = 0;
1757 		if (IsFile()) {
1758 			// Request preallocated blocks depending on the file size and growth
1759 			if (size < 1 * 1024 * 1024 && bytes < 512 * 1024) {
1760 				// Preallocate 64 KB for file sizes <1 MB and grow rates <512 KB
1761 				roundTo = 65536 >> fVolume->BlockShift();
1762 			} else if (size < 32 * 1024 * 1024 && bytes <= 1 * 1024 * 1024) {
1763 				// Preallocate 512 KB for file sizes between 1 MB and 32 MB, and
1764 				// grow rates smaller than 1 MB
1765 				roundTo = (512 * 1024) >> fVolume->BlockShift();
1766 			} else {
1767 				// Preallocate 1/16 of the file size (ie. 4 MB for 64 MB,
1768 				// 64 MB for 1 GB)
1769 				roundTo = size >> (fVolume->BlockShift() + 4);
1770 			}
1771 		} else if (IsIndex()) {
1772 			// Always preallocate 64 KB for index directories
1773 			roundTo = 65536 >> fVolume->BlockShift();
1774 		} else {
1775 			// Preallocate only 4 KB - directories only get trimmed when their
1776 			// vnode is flushed, which might not happen very often.
1777 			roundTo = 4096 >> fVolume->BlockShift();
1778 		}
1779 		if (roundTo > 1) {
1780 			// Round to next "roundTo" block count
1781 			blocksRequested = ((blocksNeeded + roundTo) / roundTo) * roundTo;
1782 		}
1783 	}
1784 
1785 	while (blocksNeeded > 0) {
1786 		// the requested blocks do not need to be returned with a
1787 		// single allocation, so we need to iterate until we have
1788 		// enough blocks allocated
1789 		if (minimum > 1) {
1790 			// make sure that "blocks" is a multiple of minimum
1791 			blocksRequested = round_up(blocksRequested, minimum);
1792 		}
1793 
1794 		block_run run;
1795 		status_t status = fVolume->Allocate(transaction, this, blocksRequested,
1796 			run, minimum);
1797 		if (status != B_OK)
1798 			return status;
1799 
1800 		// okay, we have the needed blocks, so just distribute them to the
1801 		// different ranges of the stream (direct, indirect & double indirect)
1802 
1803 		blocksNeeded -= run.Length();
1804 		// don't preallocate if the first allocation was already too small
1805 		blocksRequested = blocksNeeded;
1806 
1807 		// Direct block range
1808 
1809 		if (data->Size() <= data->MaxDirectRange()) {
1810 			// let's try to put them into the direct block range
1811 			int32 free = 0;
1812 			for (; free < NUM_DIRECT_BLOCKS; free++) {
1813 				if (data->direct[free].IsZero())
1814 					break;
1815 			}
1816 
1817 			if (free < NUM_DIRECT_BLOCKS) {
1818 				// can we merge the last allocated run with the new one?
1819 				int32 last = free - 1;
1820 				if (free > 0 && data->direct[last].MergeableWith(run)) {
1821 					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(
1822 						data->direct[last].Length() + run.Length());
1823 				} else
1824 					data->direct[free] = run;
1825 
1826 				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(
1827 					data->MaxDirectRange()
1828 					+ run.Length() * fVolume->BlockSize());
1829 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1830 					? data->max_direct_range : size);
1831 				continue;
1832 			}
1833 		}
1834 
1835 		// Indirect block range
1836 
1837 		if (data->Size() <= data->MaxIndirectRange()
1838 			|| !data->MaxIndirectRange()) {
1839 			CachedBlock cached(fVolume);
1840 			block_run* runs = NULL;
1841 			uint32 free = 0;
1842 			off_t block;
1843 
1844 			// if there is no indirect block yet, create one
1845 			if (data->indirect.IsZero()) {
1846 				status = _AllocateBlockArray(transaction, data->indirect,
1847 					NUM_ARRAY_BLOCKS, true);
1848 				if (status != B_OK)
1849 					return status;
1850 
1851 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1852 					data->MaxDirectRange());
1853 				// insert the block_run in the first block
1854 				runs = (block_run*)cached.SetTo(data->indirect);
1855 			} else {
1856 				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1857 				block = fVolume->ToBlock(data->indirect);
1858 
1859 				// search first empty entry
1860 				int32 i = 0;
1861 				for (; i < data->indirect.Length(); i++) {
1862 					if ((runs = (block_run*)cached.SetTo(block + i)) == NULL)
1863 						return B_IO_ERROR;
1864 
1865 					for (free = 0; free < numberOfRuns; free++)
1866 						if (runs[free].IsZero())
1867 							break;
1868 
1869 					if (free < numberOfRuns)
1870 						break;
1871 				}
1872 				if (i == data->indirect.Length())
1873 					runs = NULL;
1874 			}
1875 
1876 			if (runs != NULL) {
1877 				// try to insert the run to the last one - note that this
1878 				// doesn't take block borders into account, so it could be
1879 				// further optimized
1880 				cached.MakeWritable(transaction);
1881 
1882 				int32 last = free - 1;
1883 				if (free > 0 && runs[last].MergeableWith(run)) {
1884 					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(
1885 						runs[last].Length() + run.Length());
1886 				} else
1887 					runs[free] = run;
1888 
1889 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1890 					data->MaxIndirectRange()
1891 					+ ((uint32)run.Length() << fVolume->BlockShift()));
1892 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1893 					? data->MaxIndirectRange() : size);
1894 				continue;
1895 			}
1896 		}
1897 
1898 		// Double indirect block range
1899 
1900 		if (data->Size() <= data->MaxDoubleIndirectRange()
1901 			|| !data->max_double_indirect_range) {
1902 			// We make sure here that we have this minimum allocated, so if
1903 			// the allocation succeeds, we don't run into an endless loop.
1904 			if (!data->max_double_indirect_range)
1905 				minimum = _DoubleIndirectBlockLength();
1906 			else
1907 				minimum = data->double_indirect.Length();
1908 
1909 			if ((run.Length() % minimum) != 0) {
1910 				// The number of allocated blocks isn't a multiple of 'minimum',
1911 				// so we have to change this. This can happen the first time the
1912 				// stream grows into the double indirect range.
1913 				// First, free the remaining blocks that don't fit into this
1914 				// multiple.
1915 				int32 rest = run.Length() % minimum;
1916 				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1917 
1918 				status = fVolume->Free(transaction,
1919 					block_run::Run(run.AllocationGroup(),
1920 					run.Start() + run.Length(), rest));
1921 				if (status != B_OK)
1922 					return status;
1923 
1924 				blocksNeeded += rest;
1925 				blocksRequested = round_up(blocksNeeded, minimum);
1926 
1927 				// Are there any blocks left in the run? If not, allocate
1928 				// a new one
1929 				if (run.length == 0)
1930 					continue;
1931 			}
1932 
1933 			// if there is no double indirect block yet, create one
1934 			if (data->double_indirect.IsZero()) {
1935 				status = _AllocateBlockArray(transaction,
1936 					data->double_indirect, _DoubleIndirectBlockLength());
1937 				if (status != B_OK)
1938 					return status;
1939 
1940 				data->max_double_indirect_range = data->max_indirect_range;
1941 			}
1942 
1943 			// calculate the index where to insert the new blocks
1944 
1945 			int32 runsPerBlock;
1946 			int32 directSize;
1947 			int32 indirectSize;
1948 			get_double_indirect_sizes(data->double_indirect.Length(),
1949 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1950 			if (directSize <= 0 || indirectSize <= 0)
1951 				return B_BAD_DATA;
1952 
1953 			off_t start = data->MaxDoubleIndirectRange()
1954 				- data->MaxIndirectRange();
1955 			int32 indirectIndex = start / indirectSize;
1956 			int32 index = (start % indirectSize) / directSize;
1957 			int32 runsPerArray = runsPerBlock * minimum;
1958 
1959 			// distribute the blocks to the array and allocate
1960 			// new array blocks when needed
1961 
1962 			CachedBlock cached(fVolume);
1963 			CachedBlock cachedDirect(fVolume);
1964 			block_run* array = NULL;
1965 			uint32 runLength = run.Length();
1966 
1967 			while (run.length != 0) {
1968 				// get the indirect array block
1969 				if (array == NULL) {
1970 					uint32 block = indirectIndex / runsPerBlock;
1971 					if (block >= minimum)
1972 						return EFBIG;
1973 
1974 					array = (block_run*)cached.SetTo(fVolume->ToBlock(
1975 						data->double_indirect) + block);
1976 					if (array == NULL)
1977 						return B_IO_ERROR;
1978 				}
1979 
1980 				do {
1981 					// do we need a new array block?
1982 					if (array[indirectIndex % runsPerBlock].IsZero()) {
1983 						cached.MakeWritable(transaction);
1984 
1985 						status = _AllocateBlockArray(transaction,
1986 							array[indirectIndex % runsPerBlock],
1987 							data->double_indirect.Length());
1988 						if (status != B_OK)
1989 							return status;
1990 					}
1991 
1992 					block_run* runs = (block_run*)cachedDirect.SetToWritable(
1993 						transaction, fVolume->ToBlock(array[indirectIndex
1994 							% runsPerBlock]) + index / runsPerBlock);
1995 					if (runs == NULL)
1996 						return B_IO_ERROR;
1997 
1998 					do {
1999 						// insert the block_run into the array
2000 						runs[index % runsPerBlock] = run;
2001 						runs[index % runsPerBlock].length
2002 							= HOST_ENDIAN_TO_BFS_INT16(minimum);
2003 
2004 						// alter the remaining block_run
2005 						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2006 							+ minimum);
2007 						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2008 							- minimum);
2009 					} while ((++index % runsPerBlock) != 0 && run.length);
2010 				} while ((index % runsPerArray) != 0 && run.length);
2011 
2012 				if (index == runsPerArray)
2013 					index = 0;
2014 				if (++indirectIndex % runsPerBlock == 0) {
2015 					array = NULL;
2016 					index = 0;
2017 				}
2018 			}
2019 
2020 			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
2021 				data->MaxDoubleIndirectRange()
2022 				+ (runLength << fVolume->BlockShift()));
2023 			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(
2024 				data->max_double_indirect_range) : size;
2025 
2026 			continue;
2027 		}
2028 
2029 		RETURN_ERROR(EFBIG);
2030 	}
2031 	// update the size of the data stream
2032 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2033 
2034 	return B_OK;
2035 }
2036 
2037 
2038 size_t
2039 Inode::_DoubleIndirectBlockLength() const
2040 {
2041 	if (fVolume->BlockSize() > DOUBLE_INDIRECT_ARRAY_SIZE)
2042 		return 1;
2043 
2044 	return DOUBLE_INDIRECT_ARRAY_SIZE / fVolume->BlockSize();
2045 }
2046 
2047 
2048 /*!	Frees the statically sized array of the double indirect part of a data
2049 	stream.
2050 */
2051 status_t
2052 Inode::_FreeStaticStreamArray(Transaction& transaction, int32 level,
2053 	block_run run, off_t size, off_t offset, off_t& max)
2054 {
2055 	int32 indirectSize;
2056 	if (level == 0) {
2057 		indirectSize = double_indirect_max_indirect_size(run.Length(),
2058 			fVolume->BlockSize());
2059 	} else {
2060 		indirectSize = double_indirect_max_direct_size(run.Length(),
2061 			fVolume->BlockSize());
2062 	}
2063 	if (indirectSize <= 0)
2064 		return B_BAD_DATA;
2065 
2066 	off_t start;
2067 	if (size > offset)
2068 		start = size - offset;
2069 	else
2070 		start = 0;
2071 
2072 	int32 index = start / indirectSize;
2073 	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
2074 
2075 	CachedBlock cached(fVolume);
2076 	off_t blockNumber = fVolume->ToBlock(run);
2077 
2078 	// set the file offset to the current block run
2079 	offset += (off_t)index * indirectSize;
2080 
2081 	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
2082 		block_run* array = (block_run*)cached.SetToWritable(transaction,
2083 			blockNumber + i);
2084 		if (array == NULL)
2085 			RETURN_ERROR(B_ERROR);
2086 
2087 		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
2088 			if (array[index].IsZero()) {
2089 				// we also want to break out of the outer loop
2090 				i = run.Length();
2091 				break;
2092 			}
2093 
2094 			status_t status = B_OK;
2095 			if (level == 0) {
2096 				status = _FreeStaticStreamArray(transaction, 1, array[index],
2097 					size, offset, max);
2098 			} else if (offset >= size)
2099 				status = fVolume->Free(transaction, array[index]);
2100 			else
2101 				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
2102 
2103 			if (status < B_OK)
2104 				RETURN_ERROR(status);
2105 
2106 			if (offset >= size)
2107 				array[index].SetTo(0, 0, 0);
2108 
2109 			offset += indirectSize;
2110 		}
2111 		index = 0;
2112 	}
2113 	return B_OK;
2114 }
2115 
2116 
2117 /*!	Frees all block_runs in the array which come after the specified size.
2118 	It also trims the last block_run that contain the size.
2119 	"offset" and "max" are maintained until the last block_run that doesn't
2120 	have to be freed - after this, the values won't be correct anymore, but
2121 	will still assure correct function for all subsequent calls.
2122 	"max" is considered to be in file system byte order.
2123 */
2124 status_t
2125 Inode::_FreeStreamArray(Transaction& transaction, block_run* array,
2126 	uint32 arrayLength, off_t size, off_t& offset, off_t& max)
2127 {
2128 	PRINT(("FreeStreamArray: arrayLength %lu, size %Ld, offset %Ld, max %Ld\n",
2129 		arrayLength, size, offset, max));
2130 
2131 	off_t newOffset = offset;
2132 	uint32 i = 0;
2133 	for (; i < arrayLength; i++, offset = newOffset) {
2134 		if (array[i].IsZero())
2135 			break;
2136 
2137 		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
2138 		if (newOffset <= size)
2139 			continue;
2140 
2141 		block_run run = array[i];
2142 
2143 		// determine the block_run to be freed
2144 		if (newOffset > size && offset < size) {
2145 			// free partial block_run (and update the original block_run)
2146 			run.start = HOST_ENDIAN_TO_BFS_INT16(array[i].Start()
2147 				+ ((size + fVolume->BlockSize() - 1 - offset)
2148 					>> fVolume->BlockShift()));
2149 				// round to next block
2150 			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2151 				- array[i].Start());
2152 			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2153 				- array[i].Length());
2154 
2155 			if (run.length == 0)
2156 				continue;
2157 
2158 			// update maximum range
2159 			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length()
2160 				<< fVolume->BlockShift()));
2161 		} else {
2162 			// free the whole block_run
2163 			array[i].SetTo(0, 0, 0);
2164 
2165 			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
2166 				max = HOST_ENDIAN_TO_BFS_INT64(offset);
2167 		}
2168 
2169 		if (fVolume->Free(transaction, run) < B_OK)
2170 			return B_IO_ERROR;
2171 	}
2172 	return B_OK;
2173 }
2174 
2175 
2176 status_t
2177 Inode::_ShrinkStream(Transaction& transaction, off_t size)
2178 {
2179 	data_stream* data = &Node().data;
2180 	status_t status;
2181 
2182 	if (data->MaxDoubleIndirectRange() > size) {
2183 		off_t* maxDoubleIndirect = &data->max_double_indirect_range;
2184 			// gcc 4 work-around: "error: cannot bind packed field
2185 			// 'data->data_stream::max_double_indirect_range' to 'off_t&'"
2186 		status = _FreeStaticStreamArray(transaction, 0, data->double_indirect,
2187 			size, data->MaxIndirectRange(), *maxDoubleIndirect);
2188 		if (status != B_OK)
2189 			return status;
2190 
2191 		if (size <= data->MaxIndirectRange()) {
2192 			fVolume->Free(transaction, data->double_indirect);
2193 			data->double_indirect.SetTo(0, 0, 0);
2194 			data->max_double_indirect_range = 0;
2195 		}
2196 	}
2197 
2198 	if (data->MaxIndirectRange() > size) {
2199 		CachedBlock cached(fVolume);
2200 		off_t block = fVolume->ToBlock(data->indirect);
2201 		off_t offset = data->MaxDirectRange();
2202 
2203 		for (int32 i = 0; i < data->indirect.Length(); i++) {
2204 			block_run* array = (block_run*)cached.SetToWritable(transaction,
2205 				block + i);
2206 			if (array == NULL)
2207 				break;
2208 
2209 			off_t* maxIndirect = &data->max_indirect_range;
2210 				// gcc 4 work-around: "error: cannot bind packed field
2211 				// 'data->data_stream::max_indirect_range' to 'off_t&'"
2212 			if (_FreeStreamArray(transaction, array, fVolume->BlockSize()
2213 					/ sizeof(block_run), size, offset, *maxIndirect) != B_OK)
2214 				return B_IO_ERROR;
2215 		}
2216 		if (data->max_direct_range == data->max_indirect_range) {
2217 			fVolume->Free(transaction, data->indirect);
2218 			data->indirect.SetTo(0, 0, 0);
2219 			data->max_indirect_range = 0;
2220 		}
2221 	}
2222 
2223 	if (data->MaxDirectRange() > size) {
2224 		off_t offset = 0;
2225 		off_t *maxDirect = &data->max_direct_range;
2226 			// gcc 4 work-around: "error: cannot bind packed field
2227 			// 'data->data_stream::max_direct_range' to 'off_t&'"
2228 		status = _FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS,
2229 			size, offset, *maxDirect);
2230 		if (status < B_OK)
2231 			return status;
2232 	}
2233 
2234 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2235 	return B_OK;
2236 }
2237 
2238 
2239 status_t
2240 Inode::SetFileSize(Transaction& transaction, off_t size)
2241 {
2242 	if (size < 0)
2243 		return B_BAD_VALUE;
2244 
2245 	off_t oldSize = Size();
2246 
2247 	if (size == oldSize)
2248 		return B_OK;
2249 
2250 	T(Resize(this, oldSize, size, false));
2251 
2252 	// should the data stream grow or shrink?
2253 	status_t status;
2254 	if (size > oldSize) {
2255 		status = _GrowStream(transaction, size);
2256 		if (status < B_OK) {
2257 			// if the growing of the stream fails, the whole operation
2258 			// fails, so we should shrink the stream to its former size
2259 			_ShrinkStream(transaction, oldSize);
2260 		}
2261 	} else
2262 		status = _ShrinkStream(transaction, size);
2263 
2264 	if (status < B_OK)
2265 		return status;
2266 
2267 	file_cache_set_size(FileCache(), size);
2268 	file_map_set_size(Map(), size);
2269 
2270 	return WriteBack(transaction);
2271 }
2272 
2273 
2274 status_t
2275 Inode::Append(Transaction& transaction, off_t bytes)
2276 {
2277 	return SetFileSize(transaction, Size() + bytes);
2278 }
2279 
2280 
2281 /*!	Checks whether or not this inode's data stream needs to be trimmed
2282 	because of an earlier preallocation.
2283 	Returns true if there are any blocks to be trimmed.
2284 */
2285 bool
2286 Inode::NeedsTrimming() const
2287 {
2288 	// We never trim preallocated index blocks to make them grow as smooth as
2289 	// possible. There are only few indices anyway, so this doesn't hurt.
2290 	// Also, if an inode is already in deleted state, we don't bother trimming
2291 	// it.
2292 	if (IsIndex() || IsDeleted()
2293 		|| (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0))
2294 		return false;
2295 
2296 	off_t roundedSize = round_up(Size(), fVolume->BlockSize());
2297 
2298 	return Node().data.MaxDirectRange() > roundedSize
2299 		|| Node().data.MaxIndirectRange() > roundedSize
2300 		|| Node().data.MaxDoubleIndirectRange() > roundedSize;
2301 }
2302 
2303 
2304 status_t
2305 Inode::TrimPreallocation(Transaction& transaction)
2306 {
2307 	T(Resize(this, max_c(Node().data.MaxDirectRange(),
2308 		Node().data.MaxIndirectRange()), Size(), true));
2309 
2310 	status_t status = _ShrinkStream(transaction, Size());
2311 	if (status < B_OK)
2312 		return status;
2313 
2314 	return WriteBack(transaction);
2315 }
2316 
2317 
2318 //!	Frees the file's data stream and removes all attributes
2319 status_t
2320 Inode::Free(Transaction& transaction)
2321 {
2322 	FUNCTION();
2323 
2324 	// Perhaps there should be an implementation of Inode::ShrinkStream() that
2325 	// just frees the data_stream, but doesn't change the inode (since it is
2326 	// freed anyway) - that would make an undelete command possible
2327 	if (!IsSymLink() || (Flags() & INODE_LONG_SYMLINK) != 0) {
2328 		status_t status = SetFileSize(transaction, 0);
2329 		if (status < B_OK)
2330 			return status;
2331 	}
2332 
2333 	// Free all attributes, and remove their indices
2334 	{
2335 		// We have to limit the scope of AttributeIterator, so that its
2336 		// destructor is not called after the inode is deleted
2337 		AttributeIterator iterator(this);
2338 
2339 		char name[B_FILE_NAME_LENGTH];
2340 		uint32 type;
2341 		size_t length;
2342 		ino_t id;
2343 		while (iterator.GetNext(name, &length, &type, &id) == B_OK) {
2344 			RemoveAttribute(transaction, name);
2345 		}
2346 	}
2347 
2348 	if (WriteBack(transaction) < B_OK)
2349 		return B_IO_ERROR;
2350 
2351 	return fVolume->Free(transaction, BlockRun());
2352 }
2353 
2354 
2355 //!	Synchronizes (writes back to disk) the file stream of the inode.
2356 status_t
2357 Inode::Sync()
2358 {
2359 	if (FileCache())
2360 		return file_cache_sync(FileCache());
2361 
2362 	// We may also want to flush the attribute's data stream to
2363 	// disk here... (do we?)
2364 
2365 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0)
2366 		return B_OK;
2367 
2368 	InodeReadLocker locker(this);
2369 
2370 	data_stream* data = &Node().data;
2371 	status_t status = B_OK;
2372 
2373 	// flush direct range
2374 
2375 	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
2376 		if (data->direct[i].IsZero())
2377 			return B_OK;
2378 
2379 		status = block_cache_sync_etc(fVolume->BlockCache(),
2380 			fVolume->ToBlock(data->direct[i]), data->direct[i].Length());
2381 		if (status != B_OK)
2382 			return status;
2383 	}
2384 
2385 	// flush indirect range
2386 
2387 	if (data->max_indirect_range == 0)
2388 		return B_OK;
2389 
2390 	CachedBlock cached(fVolume);
2391 	off_t block = fVolume->ToBlock(data->indirect);
2392 	int32 count = fVolume->BlockSize() / sizeof(block_run);
2393 
2394 	for (int32 j = 0; j < data->indirect.Length(); j++) {
2395 		block_run* runs = (block_run*)cached.SetTo(block + j);
2396 		if (runs == NULL)
2397 			break;
2398 
2399 		for (int32 i = 0; i < count; i++) {
2400 			if (runs[i].IsZero())
2401 				return B_OK;
2402 
2403 			status = block_cache_sync_etc(fVolume->BlockCache(),
2404 				fVolume->ToBlock(runs[i]), runs[i].Length());
2405 			if (status != B_OK)
2406 				return status;
2407 		}
2408 	}
2409 
2410 	// flush double indirect range
2411 
2412 	if (data->max_double_indirect_range == 0)
2413 		return B_OK;
2414 
2415 	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
2416 
2417 	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
2418 		block_run* indirectRuns = (block_run*)cached.SetTo(indirectBlock + l);
2419 		if (indirectRuns == NULL)
2420 			return B_FILE_ERROR;
2421 
2422 		CachedBlock directCached(fVolume);
2423 
2424 		for (int32 k = 0; k < count; k++) {
2425 			if (indirectRuns[k].IsZero())
2426 				return B_OK;
2427 
2428 			block = fVolume->ToBlock(indirectRuns[k]);
2429 			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
2430 				block_run* runs = (block_run*)directCached.SetTo(block + j);
2431 				if (runs == NULL)
2432 					return B_FILE_ERROR;
2433 
2434 				for (int32 i = 0; i < count; i++) {
2435 					if (runs[i].IsZero())
2436 						return B_OK;
2437 
2438 					// TODO: combine single block_runs to bigger ones when
2439 					// they are adjacent
2440 					status = block_cache_sync_etc(fVolume->BlockCache(),
2441 						fVolume->ToBlock(runs[i]), runs[i].Length());
2442 					if (status != B_OK)
2443 						return status;
2444 				}
2445 			}
2446 		}
2447 	}
2448 	return B_OK;
2449 }
2450 
2451 
2452 //	#pragma mark - TransactionListener implementation
2453 
2454 
2455 void
2456 Inode::TransactionDone(bool success)
2457 {
2458 	if (!success) {
2459 		// Revert any changes made to the cached bfs_inode
2460 		// TODO: return code gets eaten
2461 		UpdateNodeFromDisk();
2462 	}
2463 }
2464 
2465 
2466 void
2467 Inode::RemovedFromTransaction()
2468 {
2469 	Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
2470 
2471 	// See AddInode() why we do this here
2472 	if ((Flags() & INODE_DELETED) != 0)
2473 		fVolume->RemovedInodes().Add(this);
2474 
2475 	rw_lock_write_unlock(&Lock());
2476 
2477 	if (!fVolume->IsInitializing())
2478 		put_vnode(fVolume->FSVolume(), ID());
2479 }
2480 
2481 
2482 //	#pragma mark - creation/deletion
2483 
2484 
2485 status_t
2486 Inode::Remove(Transaction& transaction, const char* name, ino_t* _id,
2487 	bool isDirectory, bool force)
2488 {
2489 	if (fTree == NULL)
2490 		RETURN_ERROR(B_BAD_VALUE);
2491 
2492 	WriteLockInTransaction(transaction);
2493 
2494 	// does the file even exist?
2495 	off_t id;
2496 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) < B_OK)
2497 		return B_ENTRY_NOT_FOUND;
2498 
2499 	if (_id)
2500 		*_id = id;
2501 
2502 	Vnode vnode(fVolume, id);
2503 	Inode* inode;
2504 	status_t status = vnode.Get(&inode);
2505 	if (status < B_OK) {
2506 		REPORT_ERROR(status);
2507 		return B_ENTRY_NOT_FOUND;
2508 	}
2509 
2510 	T(Remove(inode, name));
2511 	inode->WriteLockInTransaction(transaction);
2512 
2513 	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
2514 	// bit is set for indices in BFS, not for attribute directories) - but you
2515 	// should really be able to do whatever you want with your indices
2516 	// without having to remove all files first :)
2517 	if (!inode->IsIndex() && !force) {
2518 		// if it's not of the correct type, don't delete it!
2519 		if (inode->IsContainer() != isDirectory)
2520 			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
2521 
2522 		// only delete empty directories
2523 		if (isDirectory && !inode->IsEmpty())
2524 			return B_DIRECTORY_NOT_EMPTY;
2525 	}
2526 
2527 	// remove_vnode() allows the inode to be accessed until the last put_vnode()
2528 	status = remove_vnode(fVolume->FSVolume(), id);
2529 	if (status != B_OK)
2530 		return status;
2531 
2532 	if (fTree->Remove(transaction, name, id) != B_OK && !force) {
2533 		unremove_vnode(fVolume->FSVolume(), id);
2534 		RETURN_ERROR(B_ERROR);
2535 	}
2536 
2537 #ifdef DEBUG
2538 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) == B_OK) {
2539 		DIE(("deleted entry still there"));
2540 	}
2541 #endif
2542 
2543 	ContainerContentsChanged(transaction);
2544 
2545 	// update the inode, so that no one will ever doubt it's deleted :-)
2546 	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
2547 	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
2548 
2549 	// In balance to the Inode::Create() method, the main indices
2550 	// are updated here (name, size, & last_modified)
2551 
2552 	Index index(fVolume);
2553 	if (inode->InNameIndex()) {
2554 		index.RemoveName(transaction, name, inode);
2555 			// If removing from the index fails, it is not regarded as a
2556 			// fatal error and will not be reported back!
2557 			// Deleted inodes won't be visible in queries anyway.
2558 	}
2559 
2560 	if (inode->InSizeIndex())
2561 		index.RemoveSize(transaction, inode);
2562 	if (inode->InLastModifiedIndex())
2563 		index.RemoveLastModified(transaction, inode);
2564 
2565 	return inode->WriteBack(transaction);
2566 }
2567 
2568 
2569 /*!	Creates the inode with the specified \a parent directory, and automatically
2570 	adds the created inode to that parent directory. If an attribute directory
2571 	is created, it will also automatically  be added to the \a parent inode as
2572 	such. However, the indices root node, and the regular root node won't be
2573 	added to the superblock.
2574 	It will also create the initial B+tree for the inode if it's a directory
2575 	of any kind.
2576 	\a name may be \c NULL, but only if no \a parent is given.
2577 	If the "_id" or "_inode" variable is given and non-NULL to store the
2578 	inode's ID, the inode stays locked - you have to call put_vnode() if you
2579 	don't use it anymore.
2580 	If the node already exists, this method will fail if \c O_EXCL is set, or
2581 	it's a directory or a symlink. Otherwise, it will just be returned.
2582 	If \c O_TRUNC has been specified, the file will also be truncated.
2583 */
2584 status_t
2585 Inode::Create(Transaction& transaction, Inode* parent, const char* name,
2586 	int32 mode, int openMode, uint32 type, bool* _created, ino_t* _id,
2587 	Inode** _inode, fs_vnode_ops* vnodeOps, uint32 publishFlags)
2588 {
2589 	FUNCTION_START(("name = %s, mode = %ld\n", name, mode));
2590 
2591 	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
2592 	Volume* volume = transaction.GetVolume();
2593 	BPlusTree* tree = NULL;
2594 
2595 	if (parent != NULL && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
2596 		// check if the file already exists in the directory
2597 		tree = parent->Tree();
2598 	}
2599 
2600 	if (parent != NULL) {
2601 		// the parent directory is locked during the whole inode creation
2602 		parent->WriteLockInTransaction(transaction);
2603 	}
2604 
2605 	if (parent != NULL && !volume->IsInitializing() && parent->IsContainer()) {
2606 		// don't create anything in removed directories
2607 		bool removed;
2608 		if (get_vnode_removed(volume->FSVolume(), parent->ID(), &removed)
2609 				== B_OK && removed) {
2610 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
2611 		}
2612 	}
2613 
2614 	if (tree != NULL) {
2615 		// Does the file already exist?
2616 		off_t offset;
2617 		if (tree->Find((uint8*)name, (uint16)strlen(name), &offset) == B_OK) {
2618 			// Return if the file should be a directory/link or opened in
2619 			// exclusive mode
2620 			if (S_ISDIR(mode) || S_ISLNK(mode) || (openMode & O_EXCL) != 0)
2621 				return B_FILE_EXISTS;
2622 
2623 			Vnode vnode(volume, offset);
2624 			Inode* inode;
2625 			status_t status = vnode.Get(&inode);
2626 			if (status != B_OK) {
2627 				REPORT_ERROR(status);
2628 				return B_ENTRY_NOT_FOUND;
2629 			}
2630 
2631 			if (inode->IsDirectory() && (openMode & O_RWMASK) != O_RDONLY)
2632 				return B_IS_A_DIRECTORY;
2633 			if ((openMode & O_DIRECTORY) != 0 && !inode->IsDirectory())
2634 				return B_NOT_A_DIRECTORY;
2635 
2636 			// we want to open the file, so we should have the rights to do so
2637 			if (inode->CheckPermissions(open_mode_to_access(openMode)
2638 					| ((openMode & O_TRUNC) != 0 ? W_OK : 0)) != B_OK)
2639 				return B_NOT_ALLOWED;
2640 
2641 			if ((openMode & O_TRUNC) != 0) {
2642 				// truncate the existing file
2643 				inode->WriteLockInTransaction(transaction);
2644 
2645 				status_t status = inode->SetFileSize(transaction, 0);
2646 				if (status == B_OK)
2647 					status = inode->WriteBack(transaction);
2648 
2649 				if (status != B_OK)
2650 					return status;
2651 			}
2652 
2653 			if (_created)
2654 				*_created = false;
2655 			if (_id)
2656 				*_id = inode->ID();
2657 			if (_inode)
2658 				*_inode = inode;
2659 
2660 			// Only keep the vnode in memory if the _id or _inode pointer is
2661 			// provided
2662 			if (_id != NULL || _inode != NULL)
2663 				vnode.Keep();
2664 
2665 			return B_OK;
2666 		}
2667 	} else if (parent != NULL && (mode & S_ATTR_DIR) == 0) {
2668 		return B_BAD_VALUE;
2669 	} else if ((openMode & O_DIRECTORY) != 0) {
2670 		// TODO: we might need to return B_NOT_A_DIRECTORY here
2671 		return B_ENTRY_NOT_FOUND;
2672 	}
2673 
2674 	status_t status;
2675 
2676 	// do we have the power to create new files at all?
2677 	if (parent != NULL && (status = parent->CheckPermissions(W_OK)) != B_OK)
2678 		RETURN_ERROR(status);
2679 
2680 	// allocate space for the new inode
2681 	InodeAllocator allocator(transaction);
2682 	block_run run;
2683 	Inode* inode;
2684 	status = allocator.New(&parentRun, mode, publishFlags, run, vnodeOps,
2685 		&inode);
2686 	if (status < B_OK)
2687 		return status;
2688 
2689 	T(Create(inode, parent, name, mode, openMode, type));
2690 
2691 	// Initialize the parts of the bfs_inode structure that
2692 	// InodeAllocator::New() hasn't touched yet
2693 
2694 	bfs_inode* node = &inode->Node();
2695 
2696 	if (parent == NULL) {
2697 		// we set the parent to itself in this case
2698 		// (only happens for the root and indices node)
2699 		node->parent = run;
2700 	} else
2701 		node->parent = parentRun;
2702 
2703 	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
2704 	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent
2705 		? parent->Node().GroupID() : getegid());
2706 		// the group ID is inherited from the parent, if available
2707 
2708 	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
2709 
2710 	inode->WriteBack(transaction);
2711 		// make sure the initialized node is available to others
2712 
2713 	// only add the name to regular files, directories, or symlinks
2714 	// don't add it to attributes, or indices
2715 	if (tree && inode->IsRegularNode()
2716 		&& inode->SetName(transaction, name) != B_OK)
2717 		return B_ERROR;
2718 
2719 	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2720 	// a standard directory for files - not for attributes or indices)
2721 	if (inode->IsContainer()) {
2722 		status = allocator.CreateTree();
2723 		if (status != B_OK)
2724 			return status;
2725 	}
2726 
2727 	// Add a link to the inode from the parent, depending on its type
2728 	// (the vnode is not published yet, so it is safe to make the inode
2729 	// accessable to the file system here)
2730 	if (tree != NULL) {
2731 		status = tree->Insert(transaction, name, inode->ID());
2732 	} else if (parent != NULL && (mode & S_ATTR_DIR) != 0) {
2733 		parent->Attributes() = run;
2734 		status = parent->WriteBack(transaction);
2735 	}
2736 
2737 	// Note, we only care if the inode could be made accessable for the
2738 	// two cases above; the root node or the indices root node must
2739 	// handle this case on their own (or other cases where "parent" is
2740 	// NULL)
2741 	if (status != B_OK)
2742 		RETURN_ERROR(status);
2743 
2744 	// Update the main indices (name, size & last_modified)
2745 	// (live queries might want to access us after this)
2746 
2747 	Index index(volume);
2748 	if (inode->InNameIndex() && name != NULL) {
2749 		// the name index only contains regular files
2750 		// (but not the root node where name == NULL)
2751 		status = index.InsertName(transaction, name, inode);
2752 		if (status != B_OK && status != B_BAD_INDEX) {
2753 			// We have to remove the node from the parent at this point,
2754 			// because the InodeAllocator destructor can't handle this
2755 			// case (and if it fails, we can't do anything about it...)
2756 			if (tree)
2757 				tree->Remove(transaction, name, inode->ID());
2758 			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2759 				parent->Node().attributes.SetTo(0, 0, 0);
2760 
2761 			RETURN_ERROR(status);
2762 		}
2763 	}
2764 
2765 	if (parent != NULL && parent->IsContainer())
2766 		parent->ContainerContentsChanged(transaction);
2767 
2768 	inode->UpdateOldLastModified();
2769 
2770 	// The "size" & "last_modified" indices don't contain directories.
2771 	// If adding to these indices fails, the inode creation will not be
2772 	// harmed; they are considered less important than the "name" index.
2773 	if (inode->InSizeIndex())
2774 		index.InsertSize(transaction, inode);
2775 	if (inode->InLastModifiedIndex())
2776 		index.InsertLastModified(transaction, inode);
2777 
2778 	if (inode->NeedsFileCache()) {
2779 		inode->SetFileCache(file_cache_create(volume->ID(), inode->ID(),
2780 			inode->Size()));
2781 		inode->SetMap(file_map_create(volume->ID(), inode->ID(),
2782 			inode->Size()));
2783 
2784 		if (inode->FileCache() == NULL || inode->Map() == NULL)
2785 			return B_NO_MEMORY;
2786 	}
2787 
2788 	// Everything worked well until this point, we have a fully
2789 	// initialized inode, and we want to keep it
2790 	allocator.Keep(vnodeOps, publishFlags);
2791 
2792 	if (_created)
2793 		*_created = true;
2794 	if (_id != NULL)
2795 		*_id = inode->ID();
2796 	if (_inode != NULL)
2797 		*_inode = inode;
2798 
2799 	// if either _id or _inode is passed, we will keep the inode locked
2800 	if (_id == NULL && _inode == NULL)
2801 		put_vnode(volume->FSVolume(), inode->ID());
2802 
2803 	return B_OK;
2804 }
2805 
2806 
2807 //	#pragma mark - AttributeIterator
2808 
2809 
2810 AttributeIterator::AttributeIterator(Inode* inode)
2811 	:
2812 	fCurrentSmallData(0),
2813 	fInode(inode),
2814 	fAttributes(NULL),
2815 	fIterator(NULL),
2816 	fBuffer(NULL)
2817 {
2818 	inode->_AddIterator(this);
2819 }
2820 
2821 
2822 AttributeIterator::~AttributeIterator()
2823 {
2824 	if (fAttributes)
2825 		put_vnode(fAttributes->GetVolume()->FSVolume(), fAttributes->ID());
2826 
2827 	delete fIterator;
2828 	fInode->_RemoveIterator(this);
2829 }
2830 
2831 
2832 status_t
2833 AttributeIterator::Rewind()
2834 {
2835 	fCurrentSmallData = 0;
2836 
2837 	if (fIterator != NULL)
2838 		fIterator->Rewind();
2839 
2840 	return B_OK;
2841 }
2842 
2843 
2844 status_t
2845 AttributeIterator::GetNext(char* name, size_t* _length, uint32* _type,
2846 	ino_t* _id)
2847 {
2848 	// read attributes out of the small data section
2849 
2850 	if (fCurrentSmallData >= 0) {
2851 		NodeGetter nodeGetter(fInode->GetVolume(), fInode);
2852 		if (nodeGetter.Node() == NULL)
2853 			return B_IO_ERROR;
2854 
2855 		const bfs_inode* node = nodeGetter.Node();
2856 		const small_data* item = ((bfs_inode*)node)->SmallDataStart();
2857 
2858 		RecursiveLocker _(&fInode->SmallDataLock());
2859 
2860 		int32 index = 0;
2861 		for (; !item->IsLast(node); item = item->Next(), index++) {
2862 			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2863 				&& *item->Name() == FILE_NAME_NAME)
2864 				continue;
2865 
2866 			if (index >= fCurrentSmallData)
2867 				break;
2868 		}
2869 
2870 		if (!item->IsLast(node)) {
2871 			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2872 			*_type = item->Type();
2873 			*_length = item->NameSize();
2874 			*_id = (ino_t)index;
2875 
2876 			fCurrentSmallData = index + 1;
2877 			return B_OK;
2878 		}
2879 
2880 		// stop traversing the small_data section
2881 		fCurrentSmallData = -1;
2882 	}
2883 
2884 	// read attributes out of the attribute directory
2885 
2886 	if (fInode->Attributes().IsZero())
2887 		return B_ENTRY_NOT_FOUND;
2888 
2889 	Volume* volume = fInode->GetVolume();
2890 
2891 	// if you haven't yet access to the attributes directory, get it
2892 	if (fAttributes == NULL) {
2893 		if (get_vnode(volume->FSVolume(), volume->ToVnode(fInode->Attributes()),
2894 				(void**)&fAttributes) != B_OK) {
2895 			FATAL(("get_vnode() failed in AttributeIterator::GetNext(ino_t"
2896 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2897 			return B_ENTRY_NOT_FOUND;
2898 		}
2899 
2900 		BPlusTree* tree = fAttributes->Tree();
2901 		if (tree == NULL
2902 			|| (fIterator = new(std::nothrow) TreeIterator(tree)) == NULL) {
2903 			FATAL(("could not get tree in AttributeIterator::GetNext(ino_t"
2904 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2905 			return B_ENTRY_NOT_FOUND;
2906 		}
2907 	}
2908 
2909 	uint16 length;
2910 	ino_t id;
2911 	status_t status = fIterator->GetNextEntry(name, &length,
2912 		B_FILE_NAME_LENGTH, &id);
2913 	if (status != B_OK)
2914 		return status;
2915 
2916 	Vnode vnode(volume, id);
2917 	Inode* attribute;
2918 	if ((status = vnode.Get(&attribute)) == B_OK) {
2919 		*_type = attribute->Type();
2920 		*_length = length;
2921 		*_id = id;
2922 	}
2923 
2924 	return status;
2925 }
2926 
2927 
2928 void
2929 AttributeIterator::Update(uint16 index, int8 change)
2930 {
2931 	// fCurrentSmallData points already to the next item
2932 	if (index < fCurrentSmallData)
2933 		fCurrentSmallData += change;
2934 }
2935 
2936