xref: /haiku/src/add-ons/kernel/file_systems/bfs/Inode.cpp (revision 5e7964b0a929555415798dea3373db9ac4611caa)
1 /*
2  * Copyright 2001-2017, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Inode access functions
8 
9 
10 #include "Debug.h"
11 #include "Inode.h"
12 #include "BPlusTree.h"
13 #include "Index.h"
14 
15 
16 #if BFS_TRACING && !defined(FS_SHELL) && !defined(_BOOT_MODE)
17 namespace BFSInodeTracing {
18 
19 class Create : public AbstractTraceEntry {
20 public:
21 	Create(Inode* inode, Inode* parent, const char* name, int32 mode,
22 			int openMode, uint32 type)
23 		:
24 		fInode(inode),
25 		fID(inode->ID()),
26 		fParent(parent),
27 		fParentID(parent != NULL ? parent->ID() : 0),
28 		fMode(mode),
29 		fOpenMode(openMode),
30 		fType(type)
31 	{
32 		if (name != NULL)
33 			strlcpy(fName, name, sizeof(fName));
34 		else
35 			fName[0] = '\0';
36 
37 		Initialized();
38 	}
39 
40 	virtual void AddDump(TraceOutput& out)
41 	{
42 		out.Print("bfs:Create %Ld (%p), parent %Ld (%p), \"%s\", "
43 			"mode %lx, omode %x, type %lx", fID, fInode, fParentID,
44 			fParent, fName, fMode, fOpenMode, fType);
45 	}
46 
47 private:
48 	Inode*	fInode;
49 	ino_t	fID;
50 	Inode*	fParent;
51 	ino_t	fParentID;
52 	char	fName[32];
53 	int32	fMode;
54 	int		fOpenMode;
55 	uint32	fType;
56 };
57 
58 class Remove : public AbstractTraceEntry {
59 public:
60 	Remove(Inode* inode, const char* name)
61 		:
62 		fInode(inode),
63 		fID(inode->ID())
64 	{
65 		strlcpy(fName, name, sizeof(fName));
66 		Initialized();
67 	}
68 
69 	virtual void AddDump(TraceOutput& out)
70 	{
71 		out.Print("bfs:Remove %Ld (%p), \"%s\"", fID, fInode, fName);
72 	}
73 
74 private:
75 	Inode*	fInode;
76 	ino_t	fID;
77 	char	fName[32];
78 };
79 
80 class Action : public AbstractTraceEntry {
81 public:
82 	Action(const char* action, Inode* inode)
83 		:
84 		fInode(inode),
85 		fID(inode->ID())
86 	{
87 		strlcpy(fAction, action, sizeof(fAction));
88 		Initialized();
89 	}
90 
91 	virtual void AddDump(TraceOutput& out)
92 	{
93 		out.Print("bfs:%s %Ld (%p)\n", fAction, fID, fInode);
94 	}
95 
96 private:
97 	Inode*	fInode;
98 	ino_t	fID;
99 	char	fAction[16];
100 };
101 
102 class Resize : public AbstractTraceEntry {
103 public:
104 	Resize(Inode* inode, off_t oldSize, off_t newSize, bool trim)
105 		:
106 		fInode(inode),
107 		fID(inode->ID()),
108 		fOldSize(oldSize),
109 		fNewSize(newSize),
110 		fTrim(trim)
111 	{
112 		Initialized();
113 	}
114 
115 	virtual void AddDump(TraceOutput& out)
116 	{
117 		out.Print("bfs:%s %Ld (%p), %Ld -> %Ld", fTrim ? "Trim" : "Resize",
118 			fID, fInode, fOldSize, fNewSize);
119 	}
120 
121 private:
122 	Inode*	fInode;
123 	ino_t	fID;
124 	off_t	fOldSize;
125 	off_t	fNewSize;
126 	bool	fTrim;
127 };
128 
129 }	// namespace BFSInodeTracing
130 
131 #	define T(x) new(std::nothrow) BFSInodeTracing::x;
132 #else
133 #	define T(x) ;
134 #endif
135 
136 
137 /*!	A helper class used by Inode::Create() to keep track of the belongings
138 	of an inode creation in progress.
139 	This class will make sure everything is cleaned up properly.
140 */
141 class InodeAllocator {
142 public:
143 							InodeAllocator(Transaction& transaction);
144 							~InodeAllocator();
145 
146 			status_t		New(block_run* parentRun, mode_t mode, uint32 flags,
147 								block_run& run, fs_vnode_ops* vnodeOps,
148 								Inode** _inode);
149 			status_t		CreateTree();
150 			status_t		Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags);
151 
152 private:
153 	static	void			_TransactionListener(int32 id, int32 event,
154 								void* _inode);
155 
156 			Transaction*	fTransaction;
157 			block_run		fRun;
158 			Inode*			fInode;
159 };
160 
161 
162 InodeAllocator::InodeAllocator(Transaction& transaction)
163 	:
164 	fTransaction(&transaction),
165 	fInode(NULL)
166 {
167 }
168 
169 
170 InodeAllocator::~InodeAllocator()
171 {
172 	if (fTransaction != NULL) {
173 		Volume* volume = fTransaction->GetVolume();
174 
175 		if (fInode != NULL) {
176 			fInode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
177 				// this unblocks any pending bfs_read_vnode() calls
178 			fInode->Free(*fTransaction);
179 
180 			if (fInode->fTree != NULL)
181 				fTransaction->RemoveListener(fInode->fTree);
182 			fTransaction->RemoveListener(fInode);
183 
184 			remove_vnode(volume->FSVolume(), fInode->ID());
185 		} else
186 			volume->Free(*fTransaction, fRun);
187 	}
188 
189 	delete fInode;
190 }
191 
192 
193 status_t
194 InodeAllocator::New(block_run* parentRun, mode_t mode, uint32 publishFlags,
195 	block_run& run, fs_vnode_ops* vnodeOps, Inode** _inode)
196 {
197 	Volume* volume = fTransaction->GetVolume();
198 
199 	status_t status = volume->AllocateForInode(*fTransaction, parentRun, mode,
200 		fRun);
201 	if (status < B_OK) {
202 		// don't free the space in the destructor, because
203 		// the allocation failed
204 		fTransaction = NULL;
205 		RETURN_ERROR(status);
206 	}
207 
208 	run = fRun;
209 	fInode = new(std::nothrow) Inode(volume, *fTransaction,
210 		volume->ToVnode(run), mode, run);
211 	if (fInode == NULL)
212 		RETURN_ERROR(B_NO_MEMORY);
213 
214 	if (!volume->IsInitializing()
215 		&& (publishFlags & BFS_DO_NOT_PUBLISH_VNODE) == 0) {
216 		status = new_vnode(volume->FSVolume(), fInode->ID(), fInode,
217 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps);
218 		if (status < B_OK) {
219 			delete fInode;
220 			fInode = NULL;
221 			RETURN_ERROR(status);
222 		}
223 	}
224 
225 	fInode->WriteLockInTransaction(*fTransaction);
226 	*_inode = fInode;
227 	return B_OK;
228 }
229 
230 
231 status_t
232 InodeAllocator::CreateTree()
233 {
234 	Volume* volume = fTransaction->GetVolume();
235 
236 	// force S_STR_INDEX to be set, if no type is set
237 	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
238 		fInode->Node().mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
239 
240 	BPlusTree* tree = new(std::nothrow) BPlusTree(*fTransaction, fInode);
241 	if (tree == NULL)
242 		return B_ERROR;
243 
244 	status_t status = tree->InitCheck();
245 	if (status != B_OK) {
246 		delete tree;
247 		return status;
248 	}
249 
250 	fInode->fTree = tree;
251 
252 	if (fInode->IsRegularNode()) {
253 		if (tree->Insert(*fTransaction, ".", fInode->ID()) < B_OK
254 			|| tree->Insert(*fTransaction, "..",
255 					volume->ToVnode(fInode->Parent())) < B_OK)
256 			return B_ERROR;
257 	}
258 	return B_OK;
259 }
260 
261 
262 status_t
263 InodeAllocator::Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags)
264 {
265 	ASSERT(fInode != NULL && fTransaction != NULL);
266 	Volume* volume = fTransaction->GetVolume();
267 
268 	status_t status = fInode->WriteBack(*fTransaction);
269 	if (status < B_OK) {
270 		FATAL(("writing new inode %" B_PRIdINO " failed!\n", fInode->ID()));
271 		return status;
272 	}
273 
274 	// Symbolic links are not published -- the caller needs to do this once
275 	// the contents have been written.
276 	if (!fInode->IsSymLink() && !volume->IsInitializing()
277 		&& (publishFlags & BFS_DO_NOT_PUBLISH_VNODE) == 0) {
278 		status = publish_vnode(volume->FSVolume(), fInode->ID(), fInode,
279 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps, fInode->Mode(),
280 			publishFlags);
281 	}
282 
283 	if (status == B_OK) {
284 		cache_add_transaction_listener(volume->BlockCache(), fTransaction->ID(),
285 			TRANSACTION_ABORTED, &_TransactionListener, fInode);
286 	}
287 
288 	fTransaction = NULL;
289 	fInode = NULL;
290 
291 	return status;
292 }
293 
294 
295 /*static*/ void
296 InodeAllocator::_TransactionListener(int32 id, int32 event, void* _inode)
297 {
298 	Inode* inode = (Inode*)_inode;
299 
300 	if (event == TRANSACTION_ABORTED)
301 		panic("transaction %d aborted, inode %p still around!\n", (int)id, inode);
302 }
303 
304 
305 //	#pragma mark -
306 
307 
308 status_t
309 bfs_inode::InitCheck(Volume* volume) const
310 {
311 	if (Magic1() != INODE_MAGIC1
312 		|| !(Flags() & INODE_IN_USE)
313 		|| inode_num.Length() != 1
314 		// matches inode size?
315 		|| (uint32)InodeSize() != volume->InodeSize()
316 		// parent resides on disk?
317 		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
318 		|| parent.AllocationGroup() < 0
319 		|| parent.Start() > (1L << volume->AllocationGroupShift())
320 		|| parent.Length() != 1
321 		// attributes, too?
322 		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
323 		|| attributes.AllocationGroup() < 0
324 		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
325 		RETURN_ERROR(B_BAD_DATA);
326 
327 	if (Flags() & INODE_DELETED)
328 		return B_NOT_ALLOWED;
329 
330 	// TODO: Add some tests to check the integrity of the other stuff here,
331 	// especially for the data_stream!
332 
333 	return B_OK;
334 }
335 
336 
337 //	#pragma mark - Inode
338 
339 
340 Inode::Inode(Volume* volume, ino_t id)
341 	:
342 	fVolume(volume),
343 	fID(id),
344 	fTree(NULL),
345 	fAttributes(NULL),
346 	fCache(NULL),
347 	fMap(NULL)
348 {
349 	PRINT(("Inode::Inode(volume = %p, id = %Ld) @ %p\n", volume, id, this));
350 
351 	rw_lock_init(&fLock, "bfs inode");
352 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
353 
354 	if (UpdateNodeFromDisk() != B_OK) {
355 		// TODO: the error code gets eaten
356 		return;
357 	}
358 
359 	// these two will help to maintain the indices
360 	fOldSize = Size();
361 	fOldLastModified = LastModified();
362 
363 	if (IsContainer())
364 		fTree = new(std::nothrow) BPlusTree(this);
365 	if (NeedsFileCache()) {
366 		SetFileCache(file_cache_create(fVolume->ID(), ID(), Size()));
367 		SetMap(file_map_create(volume->ID(), ID(), Size()));
368 	}
369 }
370 
371 
372 Inode::Inode(Volume* volume, Transaction& transaction, ino_t id, mode_t mode,
373 		block_run& run)
374 	:
375 	fVolume(volume),
376 	fID(id),
377 	fTree(NULL),
378 	fAttributes(NULL),
379 	fCache(NULL),
380 	fMap(NULL)
381 {
382 	PRINT(("Inode::Inode(volume = %p, transaction = %p, id = %Ld) @ %p\n",
383 		volume, &transaction, id, this));
384 
385 	rw_lock_init(&fLock, "bfs inode");
386 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
387 
388 	NodeGetter node(volume, transaction, this, true);
389 	if (node.Node() == NULL) {
390 		FATAL(("Could not read inode block %" B_PRId64 "!\n", BlockNumber()));
391 		return;
392 	}
393 
394 	memset(&fNode, 0, sizeof(bfs_inode));
395 
396 	// Initialize the bfs_inode structure -- it's not written back to disk
397 	// here, because it will be done so already when the inode could be
398 	// created completely.
399 
400 	Node().magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
401 	Node().inode_num = run;
402 	Node().mode = HOST_ENDIAN_TO_BFS_INT32(mode);
403 	Node().flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
404 
405 	Node().create_time = Node().last_modified_time = Node().status_change_time
406 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
407 
408 	Node().inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
409 
410 	// these two will help to maintain the indices
411 	fOldSize = Size();
412 	fOldLastModified = LastModified();
413 }
414 
415 
416 Inode::~Inode()
417 {
418 	PRINT(("Inode::~Inode() @ %p\n", this));
419 
420 	file_cache_delete(FileCache());
421 	file_map_delete(Map());
422 	delete fTree;
423 
424 	rw_lock_destroy(&fLock);
425 	recursive_lock_destroy(&fSmallDataLock);
426 }
427 
428 
429 status_t
430 Inode::InitCheck(bool checkNode) const
431 {
432 	// test inode magic and flags
433 	if (checkNode) {
434 		status_t status = Node().InitCheck(fVolume);
435 		if (status == B_BUSY)
436 			return B_BUSY;
437 
438 		if (status != B_OK) {
439 			FATAL(("inode at block %" B_PRIdOFF " corrupt!\n", BlockNumber()));
440 			RETURN_ERROR(B_BAD_DATA);
441 		}
442 	}
443 
444 	if (IsContainer()) {
445 		// inodes that have a B+tree
446 		if (fTree == NULL)
447 			RETURN_ERROR(B_NO_MEMORY);
448 
449 		status_t status = fTree->InitCheck();
450 		if (status != B_OK) {
451 			FATAL(("inode tree at block %" B_PRIdOFF " corrupt!\n",
452 				BlockNumber()));
453 			RETURN_ERROR(B_BAD_DATA);
454 		}
455 	}
456 
457 	if (NeedsFileCache() && (fCache == NULL || fMap == NULL))
458 		return B_NO_MEMORY;
459 
460 	return B_OK;
461 }
462 
463 
464 /*!	Adds this inode to the specified transaction. This means that the inode will
465 	be write locked until the transaction ended.
466 	To ensure that the inode will stay valid until that point, an extra reference
467 	is acquired to it as long as this transaction stays active.
468 */
469 void
470 Inode::WriteLockInTransaction(Transaction& transaction)
471 {
472 	// These flags can only change while holding the transaction lock
473 	if ((Flags() & INODE_IN_TRANSACTION) != 0)
474 		return;
475 
476 	// We share the same list link with the removed list, so we have to remove
477 	// the inode from that list here (and add it back when we no longer need it)
478 	if ((Flags() & INODE_DELETED) != 0)
479 		fVolume->RemovedInodes().Remove(this);
480 
481 	if (!fVolume->IsInitializing())
482 		acquire_vnode(fVolume->FSVolume(), ID());
483 
484 	rw_lock_write_lock(&Lock());
485 	Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
486 
487 	transaction.AddListener(this);
488 }
489 
490 
491 status_t
492 Inode::WriteBack(Transaction& transaction)
493 {
494 	NodeGetter node(fVolume, transaction, this);
495 	if (node.WritableNode() == NULL)
496 		return B_IO_ERROR;
497 
498 	memcpy(node.WritableNode(), &Node(), sizeof(bfs_inode));
499 	return B_OK;
500 }
501 
502 
503 status_t
504 Inode::UpdateNodeFromDisk()
505 {
506 	NodeGetter node(fVolume, this);
507 	if (node.Node() == NULL) {
508 		FATAL(("Failed to read block %" B_PRId64 " from disk!\n",
509 			BlockNumber()));
510 		return B_IO_ERROR;
511 	}
512 
513 	memcpy(&fNode, node.Node(), sizeof(bfs_inode));
514 	fNode.flags &= HOST_ENDIAN_TO_BFS_INT32(INODE_PERMANENT_FLAGS);
515 	return B_OK;
516 }
517 
518 
519 status_t
520 Inode::CheckPermissions(int accessMode) const
521 {
522 	// you never have write access to a read-only volume
523 	if ((accessMode & W_OK) != 0 && fVolume->IsReadOnly())
524 		return B_READ_ONLY_DEVICE;
525 
526 	return check_access_permissions(accessMode, Mode(), (gid_t)fNode.GroupID(),
527 		(uid_t)fNode.UserID());
528 }
529 
530 
531 //	#pragma mark - attributes
532 
533 
534 void
535 Inode::_AddIterator(AttributeIterator* iterator)
536 {
537 	RecursiveLocker _(fSmallDataLock);
538 	fIterators.Add(iterator);
539 }
540 
541 
542 void
543 Inode::_RemoveIterator(AttributeIterator* iterator)
544 {
545 	RecursiveLocker _(fSmallDataLock);
546 	fIterators.Remove(iterator);
547 }
548 
549 
550 /*!	Tries to free up "bytes" space in the small_data section by moving
551 	attributes to real files. Used for system attributes like the name.
552 	You need to hold the fSmallDataLock when you call this method
553 */
554 status_t
555 Inode::_MakeSpaceForSmallData(Transaction& transaction, bfs_inode* node,
556 	const char* name, int32 bytes)
557 {
558 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
559 
560 	while (bytes > 0) {
561 		small_data* item = node->SmallDataStart();
562 		small_data* max = NULL;
563 		int32 index = 0, maxIndex = 0;
564 		for (; !item->IsLast(node); item = item->Next(), index++) {
565 			// should not remove those
566 			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
567 				continue;
568 
569 			if (max == NULL || max->Size() < item->Size()) {
570 				maxIndex = index;
571 				max = item;
572 			}
573 
574 			// Remove the first one large enough to free the needed amount of
575 			// bytes
576 			if (bytes < (int32)item->Size())
577 				break;
578 		}
579 
580 		if (item->IsLast(node) || (int32)item->Size() < bytes)
581 			return B_ERROR;
582 
583 		bytes -= max->Size();
584 
585 		// Move the attribute to a real attribute file
586 		// Luckily, this doesn't cause any index updates
587 
588 		Inode* attribute;
589 		status_t status = CreateAttribute(transaction, item->Name(),
590 			item->Type(), &attribute);
591 		if (status != B_OK)
592 			RETURN_ERROR(status);
593 
594 		size_t length = item->DataSize();
595 		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
596 
597 		ReleaseAttribute(attribute);
598 
599 		if (status != B_OK) {
600 			Vnode vnode(fVolume, Attributes());
601 			Inode* attributes;
602 			if (vnode.Get(&attributes) < B_OK
603 				|| attributes->Remove(transaction, name) < B_OK) {
604 				FATAL(("Could not remove newly created attribute!\n"));
605 			}
606 
607 			RETURN_ERROR(status);
608 		}
609 
610 		_RemoveSmallData(node, max, maxIndex);
611 	}
612 	return B_OK;
613 }
614 
615 
616 /*!	Private function which removes the given attribute from the small_data
617 	section.
618 	You need to hold the fSmallDataLock when you call this method
619 */
620 status_t
621 Inode::_RemoveSmallData(bfs_inode* node, small_data* item, int32 index)
622 {
623 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
624 
625 	small_data* next = item->Next();
626 	if (!next->IsLast(node)) {
627 		// find the last attribute
628 		small_data* last = next;
629 		while (!last->IsLast(node))
630 			last = last->Next();
631 
632 		int32 size = (uint8*)last - (uint8*)next;
633 		if (size < 0
634 			|| size > (uint8*)node + fVolume->BlockSize() - (uint8*)next)
635 			return B_BAD_DATA;
636 
637 		memmove(item, next, size);
638 
639 		// Move the "last" one to its new location and
640 		// correctly terminate the small_data section
641 		last = (small_data*)((uint8*)last - ((uint8*)next - (uint8*)item));
642 		memset(last, 0, (uint8*)node + fVolume->BlockSize() - (uint8*)last);
643 	} else
644 		memset(item, 0, item->Size());
645 
646 	// update all current iterators
647 	SinglyLinkedList<AttributeIterator>::Iterator iterator
648 		= fIterators.GetIterator();
649 	while (iterator.HasNext()) {
650 		iterator.Next()->Update(index, -1);
651 	}
652 
653 	return B_OK;
654 }
655 
656 
657 //!	Removes the given attribute from the small_data section.
658 status_t
659 Inode::_RemoveSmallData(Transaction& transaction, NodeGetter& nodeGetter,
660 	const char* name)
661 {
662 	if (name == NULL)
663 		return B_BAD_VALUE;
664 
665 	bfs_inode* node = nodeGetter.WritableNode();
666 	RecursiveLocker locker(fSmallDataLock);
667 
668 	// search for the small_data item
669 
670 	small_data* item = node->SmallDataStart();
671 	int32 index = 0;
672 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
673 		item = item->Next();
674 		index++;
675 	}
676 
677 	if (item->IsLast(node))
678 		return B_ENTRY_NOT_FOUND;
679 
680 	nodeGetter.MakeWritable(transaction);
681 
682 	status_t status = _RemoveSmallData(node, item, index);
683 	if (status == B_OK) {
684 		Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
685 			bfs_inode::ToInode(real_time_clock_usecs()));
686 
687 		status = WriteBack(transaction);
688 	}
689 
690 	return status;
691 }
692 
693 
694 /*!	Try to place the given attribute in the small_data section - if the
695 	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
696 	In that case, the attribute should be written to a real attribute file;
697 	it's the caller's responsibility to remove any existing attributes in the
698 	small data section if that's the case.
699 
700 	Note that you need to write back the inode yourself after having called that
701 	method - it's a bad API decision that it needs a transaction but enforces
702 	you to write back the inode all by yourself, but it's just more efficient
703 	in most cases...
704 */
705 status_t
706 Inode::_AddSmallData(Transaction& transaction, NodeGetter& nodeGetter,
707 	const char* name, uint32 type, off_t pos, const uint8* data, size_t length,
708 	bool force)
709 {
710 	bfs_inode* node = nodeGetter.WritableNode();
711 
712 	if (node == NULL || name == NULL || data == NULL)
713 		return B_BAD_VALUE;
714 
715 	// reject any requests that can't fit into the small_data section
716 	uint32 nameLength = strlen(name);
717 	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + pos + length + 1;
718 	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
719 		return B_DEVICE_FULL;
720 
721 	nodeGetter.MakeWritable(transaction);
722 	RecursiveLocker locker(fSmallDataLock);
723 
724 	// Find the last item or one with the same name we have to add
725 	small_data* item = node->SmallDataStart();
726 	int32 index = 0;
727 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
728 		item = item->Next();
729 		index++;
730 	}
731 
732 	// is the attribute already in the small_data section?
733 	// then just replace the data part of that one
734 	if (!item->IsLast(node)) {
735 		// find last attribute
736 		small_data* last = item;
737 		while (!last->IsLast(node))
738 			last = last->Next();
739 
740 		// try to change the attributes value
741 		if (item->data_size > pos + length
742 			|| force
743 			|| ((uint8*)last + pos + length - item->DataSize())
744 					<= ((uint8*)node + fVolume->InodeSize())) {
745 			// Make room for the new attribute if needed (and we are forced
746 			// to do so)
747 			if (force && ((uint8*)last + pos + length - item->DataSize())
748 					> ((uint8*)node + fVolume->InodeSize())) {
749 				// We also take the free space at the end of the small_data
750 				// section into account, and request only what's really needed
751 				uint32 needed = pos + length - item->DataSize() -
752 					(uint32)((uint8*)node + fVolume->InodeSize()
753 						- (uint8*)last);
754 
755 				if (_MakeSpaceForSmallData(transaction, node, name, needed)
756 						!= B_OK)
757 					return B_ERROR;
758 
759 				// reset our pointers
760 				item = node->SmallDataStart();
761 				index = 0;
762 				while (!item->IsLast(node) && strcmp(item->Name(), name)) {
763 					item = item->Next();
764 					index++;
765 				}
766 
767 				last = item;
768 				while (!last->IsLast(node))
769 					last = last->Next();
770 			}
771 
772 			size_t oldDataSize = item->DataSize();
773 
774 			// Normally, we can just overwrite the attribute data as the size
775 			// is specified by the type and does not change that often
776 			if (pos + length != item->DataSize()) {
777 				// move the attributes after the current one
778 				small_data* next = item->Next();
779 				if (!next->IsLast(node)) {
780 					memmove((uint8*)item + spaceNeeded, next,
781 						(uint8*)last - (uint8*)next);
782 				}
783 
784 				// Move the "last" one to its new location and
785 				// correctly terminate the small_data section
786 				last = (small_data*)((uint8*)last
787 					- ((uint8*)next - ((uint8*)item + spaceNeeded)));
788 				if ((uint8*)last < (uint8*)node + fVolume->BlockSize()) {
789 					memset(last, 0, (uint8*)node + fVolume->BlockSize()
790 						- (uint8*)last);
791 				}
792 
793 				item->data_size = HOST_ENDIAN_TO_BFS_INT16(pos + length);
794 			}
795 
796 			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
797 
798 			if ((uint64)oldDataSize < (uint64)pos) {
799 				// Fill gap with zeros
800 				memset(item->Data() + oldDataSize, 0, pos - oldDataSize);
801 			}
802 			memcpy(item->Data() + pos, data, length);
803 			item->Data()[pos + length] = '\0';
804 
805 			return B_OK;
806 		}
807 
808 		return B_DEVICE_FULL;
809 	}
810 
811 	// try to add the new attribute!
812 
813 	if ((uint8*)item + spaceNeeded > (uint8*)node + fVolume->InodeSize()) {
814 		// there is not enough space for it!
815 		if (!force)
816 			return B_DEVICE_FULL;
817 
818 		// make room for the new attribute
819 		if (_MakeSpaceForSmallData(transaction, node, name, spaceNeeded) < B_OK)
820 			return B_ERROR;
821 
822 		// get new last item!
823 		item = node->SmallDataStart();
824 		index = 0;
825 		while (!item->IsLast(node)) {
826 			item = item->Next();
827 			index++;
828 		}
829 	}
830 
831 	memset(item, 0, spaceNeeded);
832 	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
833 	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
834 	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
835 	strcpy(item->Name(), name);
836 	memcpy(item->Data() + pos, data, length);
837 
838 	// correctly terminate the small_data section
839 	item = item->Next();
840 	if (!item->IsLast(node))
841 		memset(item, 0, (uint8*)node + fVolume->InodeSize() - (uint8*)item);
842 
843 	// update all current iterators
844 	SinglyLinkedList<AttributeIterator>::Iterator iterator
845 		= fIterators.GetIterator();
846 	while (iterator.HasNext()) {
847 		iterator.Next()->Update(index, 1);
848 	}
849 
850 	return B_OK;
851 }
852 
853 
854 /*!	Iterates through the small_data section of an inode.
855 	To start at the beginning of this section, you let smallData
856 	point to NULL, like:
857 		small_data* data = NULL;
858 		while (inode->GetNextSmallData(&data) { ... }
859 
860 	This function is reentrant and doesn't allocate any memory;
861 	you can safely stop calling it at any point (you don't need
862 	to iterate through the whole list).
863 	You need to hold the fSmallDataLock when you call this method
864 */
865 status_t
866 Inode::_GetNextSmallData(bfs_inode* node, small_data** _smallData) const
867 {
868 	if (node == NULL)
869 		RETURN_ERROR(B_BAD_VALUE);
870 
871 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
872 
873 	small_data* data = *_smallData;
874 
875 	// begin from the start?
876 	if (data == NULL)
877 		data = node->SmallDataStart();
878 	else
879 		data = data->Next();
880 
881 	// is already last item?
882 	if (data->IsLast(node))
883 		return B_ENTRY_NOT_FOUND;
884 
885 	*_smallData = data;
886 
887 	return B_OK;
888 }
889 
890 
891 /*!	Finds the attribute "name" in the small data section, and
892 	returns a pointer to it (or NULL if it doesn't exist).
893 	You need to hold the fSmallDataLock when you call this method
894 */
895 small_data*
896 Inode::FindSmallData(const bfs_inode* node, const char* name) const
897 {
898 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
899 
900 	small_data* smallData = NULL;
901 	while (_GetNextSmallData(const_cast<bfs_inode*>(node), &smallData)
902 			== B_OK) {
903 		if (!strcmp(smallData->Name(), name))
904 			return smallData;
905 	}
906 	return NULL;
907 }
908 
909 
910 /*!	Returns a pointer to the node's name if present in the small data
911 	section, NULL otherwise.
912 	You need to hold the fSmallDataLock when you call this method
913 */
914 const char*
915 Inode::Name(const bfs_inode* node) const
916 {
917 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
918 
919 	small_data* smallData = NULL;
920 	while (_GetNextSmallData((bfs_inode*)node, &smallData) == B_OK) {
921 		if (*smallData->Name() == FILE_NAME_NAME
922 			&& smallData->NameSize() == FILE_NAME_NAME_LENGTH)
923 			return (const char*)smallData->Data();
924 	}
925 	return NULL;
926 }
927 
928 
929 /*!	Copies the node's name into the provided buffer.
930 	The buffer should be B_FILE_NAME_LENGTH bytes large.
931 */
932 status_t
933 Inode::GetName(char* buffer, size_t size) const
934 {
935 	NodeGetter node(fVolume, this);
936 	if (node.Node() == NULL)
937 		return B_IO_ERROR;
938 
939 	RecursiveLocker locker(fSmallDataLock);
940 
941 	const char* name = Name(node.Node());
942 	if (name == NULL)
943 		return B_ENTRY_NOT_FOUND;
944 
945 	strlcpy(buffer, name, size);
946 	return B_OK;
947 }
948 
949 
950 /*!	Changes or set the name of a file: in the inode small_data section only, it
951 	doesn't change it in the parent directory's b+tree.
952 	Note that you need to write back the inode yourself after having called
953 	that method. It suffers from the same API decision as AddSmallData() does
954 	(and for the same reason).
955 */
956 status_t
957 Inode::SetName(Transaction& transaction, const char* name)
958 {
959 	if (name == NULL || *name == '\0')
960 		return B_BAD_VALUE;
961 
962 	NodeGetter node(fVolume, transaction, this);
963 	if (node.Node() == NULL)
964 		return B_IO_ERROR;
965 
966 	const char nameTag[2] = {FILE_NAME_NAME, 0};
967 
968 	return _AddSmallData(transaction, node, nameTag, FILE_NAME_TYPE, 0,
969 		(uint8*)name, strlen(name), true);
970 }
971 
972 
973 status_t
974 Inode::_RemoveAttribute(Transaction& transaction, const char* name,
975 	bool hasIndex, Index* index)
976 {
977 	// remove the attribute file if it exists
978 	Vnode vnode(fVolume, Attributes());
979 	Inode* attributes;
980 	status_t status = vnode.Get(&attributes);
981 	if (status < B_OK)
982 		return status;
983 
984 	// update index
985 	if (index != NULL) {
986 		Inode* attribute;
987 		if ((hasIndex || fVolume->CheckForLiveQuery(name))
988 			&& GetAttribute(name, &attribute) == B_OK) {
989 			uint8 data[MAX_INDEX_KEY_LENGTH];
990 			size_t length = MAX_INDEX_KEY_LENGTH;
991 			if (attribute->ReadAt(0, data, &length) == B_OK) {
992 				index->Update(transaction, name, attribute->Type(), data,
993 					length, NULL, 0, this);
994 			}
995 
996 			ReleaseAttribute(attribute);
997 		}
998 	}
999 
1000 	if ((status = attributes->Remove(transaction, name)) < B_OK)
1001 		return status;
1002 
1003 	if (attributes->IsEmpty()) {
1004 		attributes->WriteLockInTransaction(transaction);
1005 
1006 		// remove attribute directory (don't fail if that can't be done)
1007 		if (remove_vnode(fVolume->FSVolume(), attributes->ID()) == B_OK) {
1008 			// update the inode, so that no one will ever doubt it's deleted :-)
1009 			attributes->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1010 			if (attributes->WriteBack(transaction) == B_OK) {
1011 				Attributes().SetTo(0, 0, 0);
1012 				WriteBack(transaction);
1013 			} else {
1014 				unremove_vnode(fVolume->FSVolume(), attributes->ID());
1015 				attributes->Node().flags
1016 					&= ~HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1017 			}
1018 		}
1019 	}
1020 
1021 	return status;
1022 }
1023 
1024 
1025 /*!	Reads data from the specified attribute.
1026 	This is a high-level attribute function that understands attributes
1027 	in the small_data section as well as real attribute files.
1028 */
1029 status_t
1030 Inode::ReadAttribute(const char* name, int32 type, off_t pos, uint8* buffer,
1031 	size_t* _length)
1032 {
1033 	if (pos < 0)
1034 		pos = 0;
1035 
1036 	// search in the small_data section (which has to be locked first)
1037 	{
1038 		NodeGetter node(fVolume, this);
1039 		if (node.Node() == NULL)
1040 			return B_IO_ERROR;
1041 
1042 		RecursiveLocker locker(fSmallDataLock);
1043 
1044 		small_data* smallData = FindSmallData(node.Node(), name);
1045 		if (smallData != NULL) {
1046 			size_t length = *_length;
1047 			if (pos >= smallData->data_size) {
1048 				*_length = 0;
1049 				return B_OK;
1050 			}
1051 			if (length + pos > smallData->DataSize())
1052 				length = smallData->DataSize() - pos;
1053 
1054 			memcpy(buffer, smallData->Data() + pos, length);
1055 			*_length = length;
1056 			return B_OK;
1057 		}
1058 	}
1059 
1060 	// search in the attribute directory
1061 	Inode* attribute;
1062 	status_t status = GetAttribute(name, &attribute);
1063 	if (status == B_OK) {
1064 		status = attribute->ReadAt(pos, (uint8*)buffer, _length);
1065 
1066 		ReleaseAttribute(attribute);
1067 	}
1068 
1069 	RETURN_ERROR(status);
1070 }
1071 
1072 
1073 /*!	Writes data to the specified attribute.
1074 	This is a high-level attribute function that understands attributes
1075 	in the small_data section as well as real attribute files.
1076 */
1077 status_t
1078 Inode::WriteAttribute(Transaction& transaction, const char* name, int32 type,
1079 	off_t pos, const uint8* buffer, size_t* _length, bool* _created)
1080 {
1081 	if (pos < 0)
1082 		return B_BAD_VALUE;
1083 
1084 	// needed to maintain the index
1085 	uint8 oldBuffer[MAX_INDEX_KEY_LENGTH];
1086 	uint8* oldData = NULL;
1087 	size_t oldLength = 0;
1088 	bool created = false;
1089 
1090 	// TODO: we actually depend on that the contents of "buffer" are constant.
1091 	// If they get changed during the write (hey, user programs), we may mess
1092 	// up our index trees!
1093 	// TODO: for attribute files, we need to log the first
1094 	// MAX_INDEX_KEY_LENGTH bytes of the data stream, or the same as above
1095 	// might happen.
1096 
1097 	Index index(fVolume);
1098 	bool hasIndex = index.SetTo(name) == B_OK;
1099 
1100 	Inode* attribute = NULL;
1101 	status_t status = B_OK;
1102 
1103 	if (GetAttribute(name, &attribute) != B_OK) {
1104 		// No attribute inode exists yet
1105 
1106 		// save the old attribute data
1107 		NodeGetter node(fVolume, transaction, this);
1108 		if (node.Node() == NULL)
1109 			return B_IO_ERROR;
1110 
1111 		recursive_lock_lock(&fSmallDataLock);
1112 
1113 		small_data* smallData = FindSmallData(node.Node(), name);
1114 		if (smallData != NULL) {
1115 			oldLength = smallData->DataSize();
1116 			if (oldLength > 0) {
1117 				if (oldLength > MAX_INDEX_KEY_LENGTH)
1118 					oldLength = MAX_INDEX_KEY_LENGTH;
1119 				memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
1120 			}
1121 		} else
1122 			created = true;
1123 
1124 		recursive_lock_unlock(&fSmallDataLock);
1125 
1126 		// if the attribute doesn't exist yet (as a file), try to put it in the
1127 		// small_data section first - if that fails (due to insufficent space),
1128 		// create a real attribute file
1129 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1130 			*_length);
1131 		if (status == B_DEVICE_FULL) {
1132 			if (smallData != NULL) {
1133 				// remove the old attribute from the small data section - there
1134 				// is no space left for the new data
1135 				status = _RemoveSmallData(transaction, node, name);
1136 			} else
1137 				status = B_OK;
1138 
1139 			if (status == B_OK)
1140 				status = CreateAttribute(transaction, name, type, &attribute);
1141 			if (status != B_OK)
1142 				RETURN_ERROR(status);
1143 
1144 			created = true;
1145 		} else if (status == B_OK) {
1146 			// Update status time on attribute write
1147 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1148 				bfs_inode::ToInode(real_time_clock_usecs()));
1149 
1150 			status = WriteBack(transaction);
1151 		}
1152 	}
1153 
1154 	if (attribute != NULL) {
1155 		WriteLocker writeLocker(attribute->fLock);
1156 
1157 		if (hasIndex || fVolume->CheckForLiveQuery(name)) {
1158 			// Save the old attribute data (if this fails, oldLength will
1159 			// reflect it)
1160 			while (attribute->Size() > 0) {
1161 				bigtime_t oldModified = attribute->LastModified();
1162 				writeLocker.Unlock();
1163 
1164 				oldLength = MAX_INDEX_KEY_LENGTH;
1165 				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
1166 					oldData = oldBuffer;
1167 
1168 				writeLocker.Lock();
1169 
1170 				// Read until the data hasn't changed in between
1171 				if (oldModified == attribute->LastModified())
1172 					break;
1173 
1174 				oldLength = 0;
1175 			}
1176 		}
1177 
1178 		// check if the data fits into the small_data section again
1179 		NodeGetter node(fVolume, transaction, this);
1180 		if (node.Node() == NULL)
1181 			return B_IO_ERROR;
1182 
1183 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1184 			*_length);
1185 
1186 		if (status == B_OK) {
1187 			// it does - remove its file
1188 			writeLocker.Unlock();
1189 			status = _RemoveAttribute(transaction, name, false, NULL);
1190 		} else {
1191 			// The attribute type might have been changed - we need to
1192 			// adopt the new one
1193 			attribute->Node().type = HOST_ENDIAN_TO_BFS_INT32(type);
1194 			status = attribute->WriteBack(transaction);
1195 			writeLocker.Unlock();
1196 
1197 			if (status == B_OK) {
1198 				status = attribute->WriteAt(transaction, pos, buffer,
1199 					_length);
1200 			}
1201 		}
1202 
1203 		if (status == B_OK) {
1204 			// Update status time on attribute write
1205 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1206 				bfs_inode::ToInode(real_time_clock_usecs()));
1207 
1208 			status = WriteBack(transaction);
1209 		}
1210 
1211 		attribute->WriteLockInTransaction(transaction);
1212 		ReleaseAttribute(attribute);
1213 	}
1214 
1215 	// TODO: find a better way than this "pos" thing (the begin of the old key
1216 	//	must be copied to the start of the new one for a comparison)
1217 	if (status == B_OK && pos == 0) {
1218 		// Index only the first MAX_INDEX_KEY_LENGTH bytes
1219 		uint16 length = *_length;
1220 		if (length > MAX_INDEX_KEY_LENGTH)
1221 			length = MAX_INDEX_KEY_LENGTH;
1222 
1223 		// Update index. Note, Index::Update() may be called even if
1224 		// initializing the index failed - it will just update the live
1225 		// queries in this case
1226 		if (pos < length || (uint64)pos < (uint64)oldLength) {
1227 			index.Update(transaction, name, type, oldData, oldLength, buffer,
1228 				length, this);
1229 		}
1230 	}
1231 
1232 	if (_created != NULL)
1233 		*_created = created;
1234 
1235 	return status;
1236 }
1237 
1238 
1239 /*!	Removes the specified attribute from the inode.
1240 	This is a high-level attribute function that understands attributes
1241 	in the small_data section as well as real attribute files.
1242 */
1243 status_t
1244 Inode::RemoveAttribute(Transaction& transaction, const char* name)
1245 {
1246 	Index index(fVolume);
1247 	bool hasIndex = index.SetTo(name) == B_OK;
1248 	NodeGetter node(fVolume, this);
1249 	if (node.Node() == NULL)
1250 		return B_IO_ERROR;
1251 
1252 	// update index for attributes in the small_data section
1253 	{
1254 		RecursiveLocker _(fSmallDataLock);
1255 
1256 		small_data* smallData = FindSmallData(node.Node(), name);
1257 		if (smallData != NULL) {
1258 			uint32 length = smallData->DataSize();
1259 			if (length > MAX_INDEX_KEY_LENGTH)
1260 				length = MAX_INDEX_KEY_LENGTH;
1261 			index.Update(transaction, name, smallData->Type(),
1262 				smallData->Data(), length, NULL, 0, this);
1263 		}
1264 	}
1265 
1266 	status_t status = _RemoveSmallData(transaction, node, name);
1267 	if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
1268 		// remove the attribute file if it exists
1269 		status = _RemoveAttribute(transaction, name, hasIndex, &index);
1270 		if (status == B_OK) {
1271 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1272 				bfs_inode::ToInode(real_time_clock_usecs()));
1273 			WriteBack(transaction);
1274 		}
1275 	}
1276 
1277 	return status;
1278 }
1279 
1280 
1281 /*!	Returns the attribute inode with the specified \a name, in case it exists.
1282 	This method can only return real attribute files; the attributes in the
1283 	small data section are ignored.
1284 */
1285 status_t
1286 Inode::GetAttribute(const char* name, Inode** _attribute)
1287 {
1288 	// does this inode even have attributes?
1289 	if (Attributes().IsZero())
1290 		return B_ENTRY_NOT_FOUND;
1291 
1292 	Vnode vnode(fVolume, Attributes());
1293 	Inode* attributes;
1294 	if (vnode.Get(&attributes) < B_OK) {
1295 		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n",
1296 			name));
1297 		return B_ERROR;
1298 	}
1299 
1300 	BPlusTree* tree = attributes->Tree();
1301 	if (tree == NULL)
1302 		return B_BAD_VALUE;
1303 
1304 	InodeReadLocker locker(attributes);
1305 
1306 	ino_t id;
1307 	status_t status = tree->Find((uint8*)name, (uint16)strlen(name), &id);
1308 	if (status == B_OK) {
1309 		Vnode vnode(fVolume, id);
1310 		Inode* inode;
1311 		// Check if the attribute is really an attribute
1312 		if (vnode.Get(&inode) != B_OK || !inode->IsAttribute())
1313 			return B_ERROR;
1314 
1315 		*_attribute = inode;
1316 		vnode.Keep();
1317 		return B_OK;
1318 	}
1319 
1320 	return status;
1321 }
1322 
1323 
1324 void
1325 Inode::ReleaseAttribute(Inode* attribute)
1326 {
1327 	if (attribute == NULL)
1328 		return;
1329 
1330 	put_vnode(fVolume->FSVolume(), attribute->ID());
1331 }
1332 
1333 
1334 status_t
1335 Inode::CreateAttribute(Transaction& transaction, const char* name, uint32 type,
1336 	Inode** attribute)
1337 {
1338 	// do we need to create the attribute directory first?
1339 	if (Attributes().IsZero()) {
1340 		status_t status = Inode::Create(transaction, this, NULL,
1341 			S_ATTR_DIR | S_DIRECTORY | 0666, 0, 0, NULL);
1342 		if (status < B_OK)
1343 			RETURN_ERROR(status);
1344 	}
1345 	Vnode vnode(fVolume, Attributes());
1346 	Inode* attributes;
1347 	if (vnode.Get(&attributes) < B_OK)
1348 		return B_ERROR;
1349 
1350 	// Inode::Create() locks the inode for us
1351 	return Inode::Create(transaction, attributes, name,
1352 		S_ATTR | S_FILE | 0666, 0, type, NULL, NULL, attribute);
1353 }
1354 
1355 
1356 //	#pragma mark - directory tree
1357 
1358 
1359 bool
1360 Inode::IsEmpty()
1361 {
1362 	TreeIterator iterator(fTree);
1363 
1364 	uint32 count = 0;
1365 	char name[MAX_INDEX_KEY_LENGTH + 1];
1366 	uint16 length;
1367 	ino_t id;
1368 	while (iterator.GetNextEntry(name, &length, MAX_INDEX_KEY_LENGTH + 1,
1369 			&id) == B_OK) {
1370 		if ((Mode() & (S_ATTR_DIR | S_INDEX_DIR)) != 0)
1371 			return false;
1372 
1373 		// Unlike index and attribute directories, directories
1374 		// for standard files always contain ".", and "..", so
1375 		// we need to ignore those two
1376 		if (++count > 2 || (strcmp(".", name) != 0 && strcmp("..", name) != 0))
1377 			return false;
1378 	}
1379 	return true;
1380 }
1381 
1382 
1383 status_t
1384 Inode::ContainerContentsChanged(Transaction& transaction)
1385 {
1386 	ASSERT(!InLastModifiedIndex());
1387 
1388 	Node().last_modified_time = Node().status_change_time
1389 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1390 
1391 	return WriteBack(transaction);
1392 }
1393 
1394 
1395 //	#pragma mark - data stream
1396 
1397 
1398 /*!	Computes the number of bytes this inode occupies in the file system.
1399 	This includes the file meta blocks used for maintaining its data stream.
1400 
1401 	TODO: However, the attributes in extra files are not really accounted for;
1402 	depending on the speed penalty, this should be changed, though (the value
1403 	could be cached in the inode structure or Inode object, though).
1404 */
1405 off_t
1406 Inode::AllocatedSize() const
1407 {
1408 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0) {
1409 		// This symlink does not have a data stream
1410 		return Node().InodeSize();
1411 	}
1412 
1413 	const data_stream& data = Node().data;
1414 	uint32 blockSize = fVolume->BlockSize();
1415 	off_t size = blockSize;
1416 
1417 	if (data.MaxDoubleIndirectRange() != 0) {
1418 		off_t doubleIndirectSize = data.MaxDoubleIndirectRange()
1419 			- data.MaxIndirectRange();
1420 		int32 indirectSize = double_indirect_max_indirect_size(
1421 			data.double_indirect.Length(), fVolume->BlockSize());
1422 
1423 		size += (2 * data.double_indirect.Length()
1424 				+ doubleIndirectSize / indirectSize)
1425 			* blockSize + data.MaxDoubleIndirectRange();
1426 	} else if (data.MaxIndirectRange() != 0)
1427 		size += data.indirect.Length() + data.MaxIndirectRange();
1428 	else
1429 		size += data.MaxDirectRange();
1430 
1431 	if (!Node().attributes.IsZero()) {
1432 		// TODO: to make this exact, we'd had to count all attributes
1433 		size += 2 * blockSize;
1434 			// 2 blocks, one for the attributes inode, one for its B+tree
1435 	}
1436 
1437 	return size;
1438 }
1439 
1440 
1441 /*!	Finds the block_run where "pos" is located in the data_stream of
1442 	the inode.
1443 	If successful, "offset" will then be set to the file offset
1444 	of the block_run returned; so "pos - offset" is for the block_run
1445 	what "pos" is for the whole stream.
1446 	The caller has to make sure that "pos" is inside the stream.
1447 */
1448 status_t
1449 Inode::FindBlockRun(off_t pos, block_run& run, off_t& offset)
1450 {
1451 	data_stream* data = &Node().data;
1452 
1453 	// find matching block run
1454 
1455 	if (data->MaxIndirectRange() > 0 && pos >= data->MaxDirectRange()) {
1456 		if (data->MaxDoubleIndirectRange() > 0
1457 			&& pos >= data->MaxIndirectRange()) {
1458 			// access to double indirect blocks
1459 
1460 			CachedBlock cached(fVolume);
1461 
1462 			int32 runsPerBlock;
1463 			int32 directSize;
1464 			int32 indirectSize;
1465 			get_double_indirect_sizes(data->double_indirect.Length(),
1466 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1467 			if (directSize <= 0 || indirectSize <= 0)
1468 				RETURN_ERROR(B_BAD_DATA);
1469 
1470 			off_t start = pos - data->MaxIndirectRange();
1471 			int32 index = start / indirectSize;
1472 
1473 			block_run* indirect = (block_run*)cached.SetTo(
1474 				fVolume->ToBlock(data->double_indirect) + index / runsPerBlock);
1475 			if (indirect == NULL)
1476 				RETURN_ERROR(B_ERROR);
1477 
1478 			int32 current = (start % indirectSize) / directSize;
1479 
1480 			indirect = (block_run*)cached.SetTo(
1481 				fVolume->ToBlock(indirect[index % runsPerBlock])
1482 				+ current / runsPerBlock);
1483 			if (indirect == NULL)
1484 				RETURN_ERROR(B_ERROR);
1485 
1486 			run = indirect[current % runsPerBlock];
1487 			if (run.Length() != data->double_indirect.Length())
1488 				RETURN_ERROR(B_BAD_DATA);
1489 
1490 			offset = data->MaxIndirectRange() + (index * indirectSize)
1491 				+ (current * directSize);
1492 		} else {
1493 			// access to indirect blocks
1494 
1495 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1496 			off_t runBlockEnd = data->MaxDirectRange();
1497 
1498 			CachedBlock cached(fVolume);
1499 			off_t block = fVolume->ToBlock(data->indirect);
1500 
1501 			for (int32 i = 0; i < data->indirect.Length(); i++) {
1502 				block_run* indirect = (block_run*)cached.SetTo(block + i);
1503 				if (indirect == NULL)
1504 					RETURN_ERROR(B_IO_ERROR);
1505 
1506 				int32 current = -1;
1507 				while (++current < runsPerBlock) {
1508 					if (indirect[current].IsZero())
1509 						break;
1510 
1511 					runBlockEnd += (uint32)indirect[current].Length()
1512 						<< cached.BlockShift();
1513 					if (runBlockEnd > pos) {
1514 						run = indirect[current];
1515 						offset = runBlockEnd - ((uint32)run.Length()
1516 							<< cached.BlockShift());
1517 						return fVolume->ValidateBlockRun(run);
1518 					}
1519 				}
1520 			}
1521 			RETURN_ERROR(B_ERROR);
1522 		}
1523 	} else {
1524 		// access from direct blocks
1525 
1526 		off_t runBlockEnd = 0LL;
1527 		int32 current = -1;
1528 
1529 		while (++current < NUM_DIRECT_BLOCKS) {
1530 			if (data->direct[current].IsZero())
1531 				break;
1532 
1533 			runBlockEnd += (uint32)data->direct[current].Length()
1534 				<< fVolume->BlockShift();
1535 			if (runBlockEnd > pos) {
1536 				run = data->direct[current];
1537 				offset = runBlockEnd
1538 					- ((uint32)run.Length() << fVolume->BlockShift());
1539 				return fVolume->ValidateBlockRun(run);
1540 			}
1541 		}
1542 
1543 		return B_ENTRY_NOT_FOUND;
1544 	}
1545 	return fVolume->ValidateBlockRun(run);
1546 }
1547 
1548 
1549 status_t
1550 Inode::ReadAt(off_t pos, uint8* buffer, size_t* _length)
1551 {
1552 	size_t length = *_length;
1553 
1554 	// set/check boundaries for pos/length
1555 	if (pos < 0)
1556 		return B_BAD_VALUE;
1557 
1558 	InodeReadLocker locker(this);
1559 
1560 	if (pos >= Size() || length == 0) {
1561 		*_length = 0;
1562 		return B_NO_ERROR;
1563 	}
1564 
1565 	locker.Unlock();
1566 
1567 	return file_cache_read(FileCache(), NULL, pos, buffer, _length);
1568 }
1569 
1570 
1571 status_t
1572 Inode::WriteAt(Transaction& transaction, off_t pos, const uint8* buffer,
1573 	size_t* _length)
1574 {
1575 	InodeReadLocker locker(this);
1576 
1577 	// update the last modification time in memory, it will be written
1578 	// back to the inode, and the index when the file is closed
1579 	// TODO: should update the internal last modified time only at this point!
1580 	Node().last_modified_time = Node().status_change_time
1581 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1582 
1583 	// TODO: support INODE_LOGGED!
1584 
1585 	size_t length = *_length;
1586 	bool changeSize = (uint64)pos + (uint64)length > (uint64)Size();
1587 
1588 	// set/check boundaries for pos/length
1589 	if (pos < 0)
1590 		return B_BAD_VALUE;
1591 
1592 	locker.Unlock();
1593 
1594 	// the transaction doesn't have to be started already
1595 	if (changeSize && !transaction.IsStarted())
1596 		transaction.Start(fVolume, BlockNumber());
1597 
1598 	WriteLocker writeLocker(fLock);
1599 
1600 	// Work around possible race condition: Someone might have shrunken the file
1601 	// while we had no lock.
1602 	if (!transaction.IsStarted()
1603 		&& (uint64)pos + (uint64)length > (uint64)Size()) {
1604 		writeLocker.Unlock();
1605 		transaction.Start(fVolume, BlockNumber());
1606 		writeLocker.Lock();
1607 	}
1608 
1609 	off_t oldSize = Size();
1610 
1611 	if ((uint64)pos + (uint64)length > (uint64)oldSize) {
1612 		// let's grow the data stream to the size needed
1613 		status_t status = SetFileSize(transaction, pos + length);
1614 		if (status != B_OK) {
1615 			*_length = 0;
1616 			WriteLockInTransaction(transaction);
1617 			RETURN_ERROR(status);
1618 		}
1619 		// TODO: In theory we would need to update the file size
1620 		// index here as part of the current transaction - this might just
1621 		// be a bit too expensive, but worth a try.
1622 
1623 		// we need to write back the inode here because it has to
1624 		// go into this transaction (we cannot wait until the file
1625 		// is closed)
1626 		status = WriteBack(transaction);
1627 		if (status != B_OK) {
1628 			WriteLockInTransaction(transaction);
1629 			return status;
1630 		}
1631 	}
1632 
1633 	writeLocker.Unlock();
1634 
1635 	if (oldSize < pos)
1636 		FillGapWithZeros(oldSize, pos);
1637 
1638 	// If we don't want to write anything, we can now return (we may
1639 	// just have changed the file size using the position parameter)
1640 	if (length == 0)
1641 		return B_OK;
1642 
1643 	status_t status = file_cache_write(FileCache(), NULL, pos, buffer, _length);
1644 
1645 	if (transaction.IsStarted())
1646 		WriteLockInTransaction(transaction);
1647 
1648 	return status;
1649 }
1650 
1651 
1652 /*!	Fills the gap between the old file size and the new file size
1653 	with zeros.
1654 	It's more or less a copy of Inode::WriteAt() but it can handle
1655 	length differences of more than just 4 GB, and it never uses
1656 	the log, even if the INODE_LOGGED flag is set.
1657 */
1658 status_t
1659 Inode::FillGapWithZeros(off_t pos, off_t newSize)
1660 {
1661 	while (pos < newSize) {
1662 		size_t size;
1663 		if (newSize > pos + 1024 * 1024 * 1024)
1664 			size = 1024 * 1024 * 1024;
1665 		else
1666 			size = newSize - pos;
1667 
1668 		status_t status = file_cache_write(FileCache(), NULL, pos, NULL, &size);
1669 		if (status < B_OK)
1670 			return status;
1671 
1672 		pos += size;
1673 	}
1674 
1675 	return B_OK;
1676 }
1677 
1678 
1679 /*!	Allocates \a length blocks, and clears their contents. Growing
1680 	the indirect and double indirect range uses this method.
1681 	The allocated block_run is saved in "run"
1682 */
1683 status_t
1684 Inode::_AllocateBlockArray(Transaction& transaction, block_run& run,
1685 	size_t length, bool variableSize)
1686 {
1687 	if (!run.IsZero())
1688 		return B_BAD_VALUE;
1689 
1690 	status_t status = fVolume->Allocate(transaction, this, length, run,
1691 		variableSize ? 1 : length);
1692 	if (status != B_OK)
1693 		return status;
1694 
1695 	// make sure those blocks are empty
1696 	CachedBlock cached(fVolume);
1697 	off_t block = fVolume->ToBlock(run);
1698 
1699 	for (int32 i = 0; i < run.Length(); i++) {
1700 		block_run* runs = (block_run*)cached.SetToWritable(transaction,
1701 			block + i, true);
1702 		if (runs == NULL)
1703 			return B_IO_ERROR;
1704 	}
1705 	return B_OK;
1706 }
1707 
1708 
1709 /*!	Grows the stream to \a size, and fills the direct/indirect/double indirect
1710 	ranges with the runs.
1711 	This method will also determine the size of the preallocation, if any.
1712 */
1713 status_t
1714 Inode::_GrowStream(Transaction& transaction, off_t size)
1715 {
1716 	data_stream* data = &Node().data;
1717 
1718 	// is the data stream already large enough to hold the new size?
1719 	// (can be the case with preallocated blocks)
1720 	if (size < data->MaxDirectRange()
1721 		|| size < data->MaxIndirectRange()
1722 		|| size < data->MaxDoubleIndirectRange()) {
1723 		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1724 		return B_OK;
1725 	}
1726 
1727 	// how many bytes are still needed? (unused ranges are always zero)
1728 	uint16 minimum = 1;
1729 	off_t bytes;
1730 	if (data->Size() < data->MaxDoubleIndirectRange()) {
1731 		bytes = size - data->MaxDoubleIndirectRange();
1732 		// The double indirect range can only handle multiples of
1733 		// its base length
1734 		minimum = data->double_indirect.Length();
1735 	} else if (data->Size() < data->MaxIndirectRange())
1736 		bytes = size - data->MaxIndirectRange();
1737 	else if (data->Size() < data->MaxDirectRange())
1738 		bytes = size - data->MaxDirectRange();
1739 	else {
1740 		// no preallocation left to be used
1741 		bytes = size - data->Size();
1742 		if (data->MaxDoubleIndirectRange() > 0)
1743 			minimum = data->double_indirect.Length();
1744 	}
1745 
1746 	// do we have enough free blocks on the disk?
1747 	off_t blocksNeeded = (bytes + fVolume->BlockSize() - 1)
1748 		>> fVolume->BlockShift();
1749 	if (blocksNeeded > fVolume->FreeBlocks())
1750 		return B_DEVICE_FULL;
1751 
1752 	off_t blocksRequested = blocksNeeded;
1753 		// because of preallocations and partial allocations, the number of
1754 		// blocks we need to allocate may be different from the one we request
1755 		// from the block allocator
1756 
1757 	// Should we preallocate some blocks?
1758 	// Attributes, attribute directories, and long symlinks usually won't get
1759 	// that big, and should stay close to the inode - preallocating could be
1760 	// counterproductive.
1761 	// Also, if free disk space is tight, don't preallocate.
1762 	if (!IsAttribute() && !IsAttributeDirectory() && !IsSymLink()
1763 		&& fVolume->FreeBlocks() > 128) {
1764 		off_t roundTo = 0;
1765 		if (IsFile()) {
1766 			// Request preallocated blocks depending on the file size and growth
1767 			if (size < 1 * 1024 * 1024 && bytes < 512 * 1024) {
1768 				// Preallocate 64 KB for file sizes <1 MB and grow rates <512 KB
1769 				roundTo = 65536 >> fVolume->BlockShift();
1770 			} else if (size < 32 * 1024 * 1024 && bytes <= 1 * 1024 * 1024) {
1771 				// Preallocate 512 KB for file sizes between 1 MB and 32 MB, and
1772 				// grow rates smaller than 1 MB
1773 				roundTo = (512 * 1024) >> fVolume->BlockShift();
1774 			} else {
1775 				// Preallocate 1/16 of the file size (ie. 4 MB for 64 MB,
1776 				// 64 MB for 1 GB)
1777 				roundTo = size >> (fVolume->BlockShift() + 4);
1778 			}
1779 		} else if (IsIndex()) {
1780 			// Always preallocate 64 KB for index directories
1781 			roundTo = 65536 >> fVolume->BlockShift();
1782 		} else {
1783 			// Preallocate only 4 KB - directories only get trimmed when their
1784 			// vnode is flushed, which might not happen very often.
1785 			roundTo = 4096 >> fVolume->BlockShift();
1786 		}
1787 		if (roundTo > 1) {
1788 			// Round to next "roundTo" block count
1789 			blocksRequested = ((blocksNeeded + roundTo) / roundTo) * roundTo;
1790 		}
1791 	}
1792 
1793 	while (blocksNeeded > 0) {
1794 		// the requested blocks do not need to be returned with a
1795 		// single allocation, so we need to iterate until we have
1796 		// enough blocks allocated
1797 		if (minimum > 1) {
1798 			// make sure that "blocks" is a multiple of minimum
1799 			blocksRequested = round_up(blocksRequested, minimum);
1800 		}
1801 
1802 		block_run run;
1803 		status_t status = fVolume->Allocate(transaction, this, blocksRequested,
1804 			run, minimum);
1805 		if (status != B_OK)
1806 			return status;
1807 
1808 		// okay, we have the needed blocks, so just distribute them to the
1809 		// different ranges of the stream (direct, indirect & double indirect)
1810 
1811 		blocksNeeded -= run.Length();
1812 		// don't preallocate if the first allocation was already too small
1813 		blocksRequested = blocksNeeded;
1814 
1815 		// Direct block range
1816 
1817 		if (data->Size() <= data->MaxDirectRange()) {
1818 			// let's try to put them into the direct block range
1819 			int32 free = 0;
1820 			for (; free < NUM_DIRECT_BLOCKS; free++) {
1821 				if (data->direct[free].IsZero())
1822 					break;
1823 			}
1824 
1825 			if (free < NUM_DIRECT_BLOCKS) {
1826 				// can we merge the last allocated run with the new one?
1827 				int32 last = free - 1;
1828 				if (free > 0 && data->direct[last].MergeableWith(run)) {
1829 					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(
1830 						data->direct[last].Length() + run.Length());
1831 				} else
1832 					data->direct[free] = run;
1833 
1834 				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(
1835 					data->MaxDirectRange()
1836 					+ run.Length() * fVolume->BlockSize());
1837 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1838 					? data->max_direct_range : size);
1839 				continue;
1840 			}
1841 		}
1842 
1843 		// Indirect block range
1844 
1845 		if (data->Size() <= data->MaxIndirectRange()
1846 			|| !data->MaxIndirectRange()) {
1847 			CachedBlock cached(fVolume);
1848 			block_run* runs = NULL;
1849 			uint32 free = 0;
1850 			off_t block;
1851 
1852 			// if there is no indirect block yet, create one
1853 			if (data->indirect.IsZero()) {
1854 				status = _AllocateBlockArray(transaction, data->indirect,
1855 					NUM_ARRAY_BLOCKS, true);
1856 				if (status != B_OK)
1857 					return status;
1858 
1859 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1860 					data->MaxDirectRange());
1861 				// insert the block_run in the first block
1862 				runs = (block_run*)cached.SetTo(data->indirect);
1863 			} else {
1864 				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1865 				block = fVolume->ToBlock(data->indirect);
1866 
1867 				// search first empty entry
1868 				int32 i = 0;
1869 				for (; i < data->indirect.Length(); i++) {
1870 					if ((runs = (block_run*)cached.SetTo(block + i)) == NULL)
1871 						return B_IO_ERROR;
1872 
1873 					for (free = 0; free < numberOfRuns; free++)
1874 						if (runs[free].IsZero())
1875 							break;
1876 
1877 					if (free < numberOfRuns)
1878 						break;
1879 				}
1880 				if (i == data->indirect.Length())
1881 					runs = NULL;
1882 			}
1883 
1884 			if (runs != NULL) {
1885 				// try to insert the run to the last one - note that this
1886 				// doesn't take block borders into account, so it could be
1887 				// further optimized
1888 				cached.MakeWritable(transaction);
1889 
1890 				int32 last = free - 1;
1891 				if (free > 0 && runs[last].MergeableWith(run)) {
1892 					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(
1893 						runs[last].Length() + run.Length());
1894 				} else
1895 					runs[free] = run;
1896 
1897 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1898 					data->MaxIndirectRange()
1899 					+ ((uint32)run.Length() << fVolume->BlockShift()));
1900 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1901 					? data->MaxIndirectRange() : size);
1902 				continue;
1903 			}
1904 		}
1905 
1906 		// Double indirect block range
1907 
1908 		if (data->Size() <= data->MaxDoubleIndirectRange()
1909 			|| !data->max_double_indirect_range) {
1910 			// We make sure here that we have this minimum allocated, so if
1911 			// the allocation succeeds, we don't run into an endless loop.
1912 			if (!data->max_double_indirect_range)
1913 				minimum = _DoubleIndirectBlockLength();
1914 			else
1915 				minimum = data->double_indirect.Length();
1916 
1917 			if ((run.Length() % minimum) != 0) {
1918 				// The number of allocated blocks isn't a multiple of 'minimum',
1919 				// so we have to change this. This can happen the first time the
1920 				// stream grows into the double indirect range.
1921 				// First, free the remaining blocks that don't fit into this
1922 				// multiple.
1923 				int32 rest = run.Length() % minimum;
1924 				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1925 
1926 				status = fVolume->Free(transaction,
1927 					block_run::Run(run.AllocationGroup(),
1928 					run.Start() + run.Length(), rest));
1929 				if (status != B_OK)
1930 					return status;
1931 
1932 				blocksNeeded += rest;
1933 				blocksRequested = round_up(blocksNeeded, minimum);
1934 
1935 				// Are there any blocks left in the run? If not, allocate
1936 				// a new one
1937 				if (run.length == 0)
1938 					continue;
1939 			}
1940 
1941 			// if there is no double indirect block yet, create one
1942 			if (data->double_indirect.IsZero()) {
1943 				status = _AllocateBlockArray(transaction,
1944 					data->double_indirect, _DoubleIndirectBlockLength());
1945 				if (status != B_OK)
1946 					return status;
1947 
1948 				data->max_double_indirect_range = data->max_indirect_range;
1949 			}
1950 
1951 			// calculate the index where to insert the new blocks
1952 
1953 			int32 runsPerBlock;
1954 			int32 directSize;
1955 			int32 indirectSize;
1956 			get_double_indirect_sizes(data->double_indirect.Length(),
1957 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1958 			if (directSize <= 0 || indirectSize <= 0)
1959 				return B_BAD_DATA;
1960 
1961 			off_t start = data->MaxDoubleIndirectRange()
1962 				- data->MaxIndirectRange();
1963 			int32 indirectIndex = start / indirectSize;
1964 			int32 index = (start % indirectSize) / directSize;
1965 			int32 runsPerArray = runsPerBlock * minimum;
1966 
1967 			// distribute the blocks to the array and allocate
1968 			// new array blocks when needed
1969 
1970 			CachedBlock cached(fVolume);
1971 			CachedBlock cachedDirect(fVolume);
1972 			block_run* array = NULL;
1973 			uint32 runLength = run.Length();
1974 
1975 			while (run.length != 0) {
1976 				// get the indirect array block
1977 				if (array == NULL) {
1978 					uint32 block = indirectIndex / runsPerBlock;
1979 					if (block >= minimum)
1980 						return EFBIG;
1981 
1982 					array = (block_run*)cached.SetTo(fVolume->ToBlock(
1983 						data->double_indirect) + block);
1984 					if (array == NULL)
1985 						return B_IO_ERROR;
1986 				}
1987 
1988 				do {
1989 					// do we need a new array block?
1990 					if (array[indirectIndex % runsPerBlock].IsZero()) {
1991 						cached.MakeWritable(transaction);
1992 
1993 						status = _AllocateBlockArray(transaction,
1994 							array[indirectIndex % runsPerBlock],
1995 							data->double_indirect.Length());
1996 						if (status != B_OK)
1997 							return status;
1998 					}
1999 
2000 					block_run* runs = (block_run*)cachedDirect.SetToWritable(
2001 						transaction, fVolume->ToBlock(array[indirectIndex
2002 							% runsPerBlock]) + index / runsPerBlock);
2003 					if (runs == NULL)
2004 						return B_IO_ERROR;
2005 
2006 					do {
2007 						// insert the block_run into the array
2008 						runs[index % runsPerBlock] = run;
2009 						runs[index % runsPerBlock].length
2010 							= HOST_ENDIAN_TO_BFS_INT16(minimum);
2011 
2012 						// alter the remaining block_run
2013 						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2014 							+ minimum);
2015 						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2016 							- minimum);
2017 					} while ((++index % runsPerBlock) != 0 && run.length);
2018 				} while ((index % runsPerArray) != 0 && run.length);
2019 
2020 				if (index == runsPerArray)
2021 					index = 0;
2022 				if (++indirectIndex % runsPerBlock == 0) {
2023 					array = NULL;
2024 					index = 0;
2025 				}
2026 			}
2027 
2028 			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
2029 				data->MaxDoubleIndirectRange()
2030 				+ (runLength << fVolume->BlockShift()));
2031 			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(
2032 				data->max_double_indirect_range) : size;
2033 
2034 			continue;
2035 		}
2036 
2037 		RETURN_ERROR(EFBIG);
2038 	}
2039 	// update the size of the data stream
2040 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2041 
2042 	return B_OK;
2043 }
2044 
2045 
2046 size_t
2047 Inode::_DoubleIndirectBlockLength() const
2048 {
2049 	if (fVolume->BlockSize() > DOUBLE_INDIRECT_ARRAY_SIZE)
2050 		return 1;
2051 
2052 	return DOUBLE_INDIRECT_ARRAY_SIZE / fVolume->BlockSize();
2053 }
2054 
2055 
2056 /*!	Frees the statically sized array of the double indirect part of a data
2057 	stream.
2058 */
2059 status_t
2060 Inode::_FreeStaticStreamArray(Transaction& transaction, int32 level,
2061 	block_run run, off_t size, off_t offset, off_t& max)
2062 {
2063 	int32 indirectSize;
2064 	if (level == 0) {
2065 		indirectSize = double_indirect_max_indirect_size(run.Length(),
2066 			fVolume->BlockSize());
2067 	} else {
2068 		indirectSize = double_indirect_max_direct_size(run.Length(),
2069 			fVolume->BlockSize());
2070 	}
2071 	if (indirectSize <= 0)
2072 		return B_BAD_DATA;
2073 
2074 	off_t start;
2075 	if (size > offset)
2076 		start = size - offset;
2077 	else
2078 		start = 0;
2079 
2080 	int32 index = start / indirectSize;
2081 	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
2082 
2083 	CachedBlock cached(fVolume);
2084 	off_t blockNumber = fVolume->ToBlock(run);
2085 
2086 	// set the file offset to the current block run
2087 	offset += (off_t)index * indirectSize;
2088 
2089 	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
2090 		block_run* array = (block_run*)cached.SetToWritable(transaction,
2091 			blockNumber + i);
2092 		if (array == NULL)
2093 			RETURN_ERROR(B_ERROR);
2094 
2095 		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
2096 			if (array[index].IsZero()) {
2097 				// we also want to break out of the outer loop
2098 				i = run.Length();
2099 				break;
2100 			}
2101 
2102 			status_t status = B_OK;
2103 			if (level == 0) {
2104 				status = _FreeStaticStreamArray(transaction, 1, array[index],
2105 					size, offset, max);
2106 			} else if (offset >= size)
2107 				status = fVolume->Free(transaction, array[index]);
2108 			else
2109 				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
2110 
2111 			if (status < B_OK)
2112 				RETURN_ERROR(status);
2113 
2114 			if (offset >= size)
2115 				array[index].SetTo(0, 0, 0);
2116 
2117 			offset += indirectSize;
2118 		}
2119 		index = 0;
2120 	}
2121 	return B_OK;
2122 }
2123 
2124 
2125 /*!	Frees all block_runs in the array which come after the specified size.
2126 	It also trims the last block_run that contain the size.
2127 	"offset" and "max" are maintained until the last block_run that doesn't
2128 	have to be freed - after this, the values won't be correct anymore, but
2129 	will still assure correct function for all subsequent calls.
2130 	"max" is considered to be in file system byte order.
2131 */
2132 status_t
2133 Inode::_FreeStreamArray(Transaction& transaction, block_run* array,
2134 	uint32 arrayLength, off_t size, off_t& offset, off_t& max)
2135 {
2136 	PRINT(("FreeStreamArray: arrayLength %lu, size %Ld, offset %Ld, max %Ld\n",
2137 		arrayLength, size, offset, max));
2138 
2139 	off_t newOffset = offset;
2140 	uint32 i = 0;
2141 	for (; i < arrayLength; i++, offset = newOffset) {
2142 		if (array[i].IsZero())
2143 			break;
2144 
2145 		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
2146 		if (newOffset <= size)
2147 			continue;
2148 
2149 		block_run run = array[i];
2150 
2151 		// determine the block_run to be freed
2152 		if (newOffset > size && offset < size) {
2153 			// free partial block_run (and update the original block_run)
2154 			run.start = HOST_ENDIAN_TO_BFS_INT16(array[i].Start()
2155 				+ ((size + fVolume->BlockSize() - 1 - offset)
2156 					>> fVolume->BlockShift()));
2157 				// round to next block
2158 			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2159 				- array[i].Start());
2160 			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2161 				- array[i].Length());
2162 
2163 			if (run.length == 0)
2164 				continue;
2165 
2166 			// update maximum range
2167 			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length()
2168 				<< fVolume->BlockShift()));
2169 		} else {
2170 			// free the whole block_run
2171 			array[i].SetTo(0, 0, 0);
2172 
2173 			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
2174 				max = HOST_ENDIAN_TO_BFS_INT64(offset);
2175 		}
2176 
2177 		if (fVolume->Free(transaction, run) < B_OK)
2178 			return B_IO_ERROR;
2179 	}
2180 	return B_OK;
2181 }
2182 
2183 
2184 status_t
2185 Inode::_ShrinkStream(Transaction& transaction, off_t size)
2186 {
2187 	data_stream* data = &Node().data;
2188 	status_t status;
2189 
2190 	if (data->MaxDoubleIndirectRange() > size) {
2191 		off_t* maxDoubleIndirect = &data->max_double_indirect_range;
2192 			// gcc 4 work-around: "error: cannot bind packed field
2193 			// 'data->data_stream::max_double_indirect_range' to 'off_t&'"
2194 		status = _FreeStaticStreamArray(transaction, 0, data->double_indirect,
2195 			size, data->MaxIndirectRange(), *maxDoubleIndirect);
2196 		if (status != B_OK)
2197 			return status;
2198 
2199 		if (size <= data->MaxIndirectRange()) {
2200 			fVolume->Free(transaction, data->double_indirect);
2201 			data->double_indirect.SetTo(0, 0, 0);
2202 			data->max_double_indirect_range = 0;
2203 		}
2204 	}
2205 
2206 	if (data->MaxIndirectRange() > size) {
2207 		CachedBlock cached(fVolume);
2208 		off_t block = fVolume->ToBlock(data->indirect);
2209 		off_t offset = data->MaxDirectRange();
2210 
2211 		for (int32 i = 0; i < data->indirect.Length(); i++) {
2212 			block_run* array = (block_run*)cached.SetToWritable(transaction,
2213 				block + i);
2214 			if (array == NULL)
2215 				break;
2216 
2217 			off_t* maxIndirect = &data->max_indirect_range;
2218 				// gcc 4 work-around: "error: cannot bind packed field
2219 				// 'data->data_stream::max_indirect_range' to 'off_t&'"
2220 			if (_FreeStreamArray(transaction, array, fVolume->BlockSize()
2221 					/ sizeof(block_run), size, offset, *maxIndirect) != B_OK)
2222 				return B_IO_ERROR;
2223 		}
2224 		if (data->max_direct_range == data->max_indirect_range) {
2225 			fVolume->Free(transaction, data->indirect);
2226 			data->indirect.SetTo(0, 0, 0);
2227 			data->max_indirect_range = 0;
2228 		}
2229 	}
2230 
2231 	if (data->MaxDirectRange() > size) {
2232 		off_t offset = 0;
2233 		off_t *maxDirect = &data->max_direct_range;
2234 			// gcc 4 work-around: "error: cannot bind packed field
2235 			// 'data->data_stream::max_direct_range' to 'off_t&'"
2236 		status = _FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS,
2237 			size, offset, *maxDirect);
2238 		if (status < B_OK)
2239 			return status;
2240 	}
2241 
2242 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2243 	return B_OK;
2244 }
2245 
2246 
2247 status_t
2248 Inode::SetFileSize(Transaction& transaction, off_t size)
2249 {
2250 	if (size < 0)
2251 		return B_BAD_VALUE;
2252 
2253 	off_t oldSize = Size();
2254 
2255 	if (size == oldSize)
2256 		return B_OK;
2257 
2258 	T(Resize(this, oldSize, size, false));
2259 
2260 	// should the data stream grow or shrink?
2261 	status_t status;
2262 	if (size > oldSize) {
2263 		status = _GrowStream(transaction, size);
2264 		if (status < B_OK) {
2265 			// if the growing of the stream fails, the whole operation
2266 			// fails, so we should shrink the stream to its former size
2267 			_ShrinkStream(transaction, oldSize);
2268 		}
2269 	} else
2270 		status = _ShrinkStream(transaction, size);
2271 
2272 	if (status < B_OK)
2273 		return status;
2274 
2275 	file_cache_set_size(FileCache(), size);
2276 	file_map_set_size(Map(), size);
2277 
2278 	return WriteBack(transaction);
2279 }
2280 
2281 
2282 status_t
2283 Inode::Append(Transaction& transaction, off_t bytes)
2284 {
2285 	return SetFileSize(transaction, Size() + bytes);
2286 }
2287 
2288 
2289 /*!	Checks whether or not this inode's data stream needs to be trimmed
2290 	because of an earlier preallocation.
2291 	Returns true if there are any blocks to be trimmed.
2292 */
2293 bool
2294 Inode::NeedsTrimming() const
2295 {
2296 	// We never trim preallocated index blocks to make them grow as smooth as
2297 	// possible. There are only few indices anyway, so this doesn't hurt.
2298 	// Also, if an inode is already in deleted state, we don't bother trimming
2299 	// it.
2300 	if (IsIndex() || IsDeleted()
2301 		|| (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0))
2302 		return false;
2303 
2304 	off_t roundedSize = round_up(Size(), fVolume->BlockSize());
2305 
2306 	return Node().data.MaxDirectRange() > roundedSize
2307 		|| Node().data.MaxIndirectRange() > roundedSize
2308 		|| Node().data.MaxDoubleIndirectRange() > roundedSize;
2309 }
2310 
2311 
2312 status_t
2313 Inode::TrimPreallocation(Transaction& transaction)
2314 {
2315 	T(Resize(this, max_c(Node().data.MaxDirectRange(),
2316 		Node().data.MaxIndirectRange()), Size(), true));
2317 
2318 	status_t status = _ShrinkStream(transaction, Size());
2319 	if (status < B_OK)
2320 		return status;
2321 
2322 	return WriteBack(transaction);
2323 }
2324 
2325 
2326 //!	Frees the file's data stream and removes all attributes
2327 status_t
2328 Inode::Free(Transaction& transaction)
2329 {
2330 	FUNCTION();
2331 
2332 	// Perhaps there should be an implementation of Inode::ShrinkStream() that
2333 	// just frees the data_stream, but doesn't change the inode (since it is
2334 	// freed anyway) - that would make an undelete command possible
2335 	if (!IsSymLink() || (Flags() & INODE_LONG_SYMLINK) != 0) {
2336 		status_t status = SetFileSize(transaction, 0);
2337 		if (status < B_OK)
2338 			return status;
2339 	}
2340 
2341 	// Free all attributes, and remove their indices
2342 	{
2343 		// We have to limit the scope of AttributeIterator, so that its
2344 		// destructor is not called after the inode is deleted
2345 		AttributeIterator iterator(this);
2346 
2347 		char name[B_FILE_NAME_LENGTH];
2348 		uint32 type;
2349 		size_t length;
2350 		ino_t id;
2351 		while (iterator.GetNext(name, &length, &type, &id) == B_OK) {
2352 			RemoveAttribute(transaction, name);
2353 		}
2354 	}
2355 
2356 	if (WriteBack(transaction) < B_OK)
2357 		return B_IO_ERROR;
2358 
2359 	return fVolume->Free(transaction, BlockRun());
2360 }
2361 
2362 
2363 //!	Synchronizes (writes back to disk) the file stream of the inode.
2364 status_t
2365 Inode::Sync()
2366 {
2367 	if (FileCache())
2368 		return file_cache_sync(FileCache());
2369 
2370 	// We may also want to flush the attribute's data stream to
2371 	// disk here... (do we?)
2372 
2373 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0)
2374 		return B_OK;
2375 
2376 	InodeReadLocker locker(this);
2377 
2378 	data_stream* data = &Node().data;
2379 	status_t status = B_OK;
2380 
2381 	// flush direct range
2382 
2383 	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
2384 		if (data->direct[i].IsZero())
2385 			return B_OK;
2386 
2387 		status = block_cache_sync_etc(fVolume->BlockCache(),
2388 			fVolume->ToBlock(data->direct[i]), data->direct[i].Length());
2389 		if (status != B_OK)
2390 			return status;
2391 	}
2392 
2393 	// flush indirect range
2394 
2395 	if (data->max_indirect_range == 0)
2396 		return B_OK;
2397 
2398 	CachedBlock cached(fVolume);
2399 	off_t block = fVolume->ToBlock(data->indirect);
2400 	int32 count = fVolume->BlockSize() / sizeof(block_run);
2401 
2402 	for (int32 j = 0; j < data->indirect.Length(); j++) {
2403 		block_run* runs = (block_run*)cached.SetTo(block + j);
2404 		if (runs == NULL)
2405 			break;
2406 
2407 		for (int32 i = 0; i < count; i++) {
2408 			if (runs[i].IsZero())
2409 				return B_OK;
2410 
2411 			status = block_cache_sync_etc(fVolume->BlockCache(),
2412 				fVolume->ToBlock(runs[i]), runs[i].Length());
2413 			if (status != B_OK)
2414 				return status;
2415 		}
2416 	}
2417 
2418 	// flush double indirect range
2419 
2420 	if (data->max_double_indirect_range == 0)
2421 		return B_OK;
2422 
2423 	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
2424 
2425 	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
2426 		block_run* indirectRuns = (block_run*)cached.SetTo(indirectBlock + l);
2427 		if (indirectRuns == NULL)
2428 			return B_FILE_ERROR;
2429 
2430 		CachedBlock directCached(fVolume);
2431 
2432 		for (int32 k = 0; k < count; k++) {
2433 			if (indirectRuns[k].IsZero())
2434 				return B_OK;
2435 
2436 			block = fVolume->ToBlock(indirectRuns[k]);
2437 			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
2438 				block_run* runs = (block_run*)directCached.SetTo(block + j);
2439 				if (runs == NULL)
2440 					return B_FILE_ERROR;
2441 
2442 				for (int32 i = 0; i < count; i++) {
2443 					if (runs[i].IsZero())
2444 						return B_OK;
2445 
2446 					// TODO: combine single block_runs to bigger ones when
2447 					// they are adjacent
2448 					status = block_cache_sync_etc(fVolume->BlockCache(),
2449 						fVolume->ToBlock(runs[i]), runs[i].Length());
2450 					if (status != B_OK)
2451 						return status;
2452 				}
2453 			}
2454 		}
2455 	}
2456 	return B_OK;
2457 }
2458 
2459 
2460 //	#pragma mark - TransactionListener implementation
2461 
2462 
2463 void
2464 Inode::TransactionDone(bool success)
2465 {
2466 	if (!success) {
2467 		// Revert any changes made to the cached bfs_inode
2468 		// TODO: return code gets eaten
2469 		UpdateNodeFromDisk();
2470 	}
2471 }
2472 
2473 
2474 void
2475 Inode::RemovedFromTransaction()
2476 {
2477 	Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
2478 
2479 	// See AddInode() why we do this here
2480 	if ((Flags() & INODE_DELETED) != 0)
2481 		fVolume->RemovedInodes().Add(this);
2482 
2483 	rw_lock_write_unlock(&Lock());
2484 
2485 	if (!fVolume->IsInitializing())
2486 		put_vnode(fVolume->FSVolume(), ID());
2487 }
2488 
2489 
2490 //	#pragma mark - creation/deletion
2491 
2492 
2493 status_t
2494 Inode::Remove(Transaction& transaction, const char* name, ino_t* _id,
2495 	bool isDirectory, bool force)
2496 {
2497 	if (fTree == NULL)
2498 		RETURN_ERROR(B_BAD_VALUE);
2499 
2500 	WriteLockInTransaction(transaction);
2501 
2502 	// does the file even exist?
2503 	off_t id;
2504 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) < B_OK)
2505 		return B_ENTRY_NOT_FOUND;
2506 
2507 	if (_id)
2508 		*_id = id;
2509 
2510 	Vnode vnode(fVolume, id);
2511 	Inode* inode;
2512 	status_t status = vnode.Get(&inode);
2513 	if (status < B_OK) {
2514 		REPORT_ERROR(status);
2515 		return B_ENTRY_NOT_FOUND;
2516 	}
2517 
2518 	T(Remove(inode, name));
2519 	inode->WriteLockInTransaction(transaction);
2520 
2521 	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
2522 	// bit is set for indices in BFS, not for attribute directories) - but you
2523 	// should really be able to do whatever you want with your indices
2524 	// without having to remove all files first :)
2525 	if (!inode->IsIndex() && !force) {
2526 		// if it's not of the correct type, don't delete it!
2527 		if (inode->IsContainer() != isDirectory)
2528 			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
2529 
2530 		// only delete empty directories
2531 		if (isDirectory && !inode->IsEmpty())
2532 			return B_DIRECTORY_NOT_EMPTY;
2533 	}
2534 
2535 	// remove_vnode() allows the inode to be accessed until the last put_vnode()
2536 	status = remove_vnode(fVolume->FSVolume(), id);
2537 	if (status != B_OK)
2538 		return status;
2539 
2540 	if (fTree->Remove(transaction, name, id) != B_OK && !force) {
2541 		unremove_vnode(fVolume->FSVolume(), id);
2542 		RETURN_ERROR(B_ERROR);
2543 	}
2544 
2545 #ifdef DEBUG
2546 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) == B_OK) {
2547 		DIE(("deleted entry still there"));
2548 	}
2549 #endif
2550 
2551 	ContainerContentsChanged(transaction);
2552 
2553 	// update the inode, so that no one will ever doubt it's deleted :-)
2554 	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
2555 	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
2556 
2557 	// In balance to the Inode::Create() method, the main indices
2558 	// are updated here (name, size, & last_modified)
2559 
2560 	Index index(fVolume);
2561 	if (inode->InNameIndex()) {
2562 		index.RemoveName(transaction, name, inode);
2563 			// If removing from the index fails, it is not regarded as a
2564 			// fatal error and will not be reported back!
2565 			// Deleted inodes won't be visible in queries anyway.
2566 	}
2567 
2568 	if (inode->InSizeIndex())
2569 		index.RemoveSize(transaction, inode);
2570 	if (inode->InLastModifiedIndex())
2571 		index.RemoveLastModified(transaction, inode);
2572 
2573 	return inode->WriteBack(transaction);
2574 }
2575 
2576 
2577 /*!	Creates the inode with the specified \a parent directory, and automatically
2578 	adds the created inode to that parent directory. If an attribute directory
2579 	is created, it will also automatically  be added to the \a parent inode as
2580 	such. However, the indices root node, and the regular root node won't be
2581 	added to the superblock.
2582 	It will also create the initial B+tree for the inode if it's a directory
2583 	of any kind.
2584 	\a name may be \c NULL, but only if no \a parent is given.
2585 	If the "_id" or "_inode" variable is given and non-NULL to store the
2586 	inode's ID, the inode stays locked - you have to call put_vnode() if you
2587 	don't use it anymore.
2588 	If the node already exists, this method will fail if \c O_EXCL is set, or
2589 	it's a directory or a symlink. Otherwise, it will just be returned.
2590 	If \c O_TRUNC has been specified, the file will also be truncated.
2591 */
2592 status_t
2593 Inode::Create(Transaction& transaction, Inode* parent, const char* name,
2594 	int32 mode, int openMode, uint32 type, bool* _created, ino_t* _id,
2595 	Inode** _inode, fs_vnode_ops* vnodeOps, uint32 publishFlags)
2596 {
2597 	FUNCTION_START(("name = %s, mode = %ld\n", name, mode));
2598 
2599 	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
2600 	Volume* volume = transaction.GetVolume();
2601 	BPlusTree* tree = NULL;
2602 
2603 	if (parent != NULL && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
2604 		// check if the file already exists in the directory
2605 		tree = parent->Tree();
2606 	}
2607 
2608 	if (parent != NULL) {
2609 		// the parent directory is locked during the whole inode creation
2610 		parent->WriteLockInTransaction(transaction);
2611 	}
2612 
2613 	if (parent != NULL && !volume->IsInitializing() && parent->IsContainer()) {
2614 		// don't create anything in removed directories
2615 		bool removed;
2616 		if (get_vnode_removed(volume->FSVolume(), parent->ID(), &removed)
2617 				== B_OK && removed) {
2618 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
2619 		}
2620 	}
2621 
2622 	if (tree != NULL) {
2623 		// Does the file already exist?
2624 		off_t offset;
2625 		if (tree->Find((uint8*)name, (uint16)strlen(name), &offset) == B_OK) {
2626 			// Return if the file should be a directory/link or opened in
2627 			// exclusive mode
2628 			if (S_ISDIR(mode) || S_ISLNK(mode) || (openMode & O_EXCL) != 0)
2629 				return B_FILE_EXISTS;
2630 
2631 			Vnode vnode(volume, offset);
2632 			Inode* inode;
2633 			status_t status = vnode.Get(&inode);
2634 			if (status != B_OK) {
2635 				REPORT_ERROR(status);
2636 				return B_ENTRY_NOT_FOUND;
2637 			}
2638 
2639 			if (inode->IsDirectory() && (openMode & O_RWMASK) != O_RDONLY)
2640 				return B_IS_A_DIRECTORY;
2641 			if ((openMode & O_DIRECTORY) != 0 && !inode->IsDirectory())
2642 				return B_NOT_A_DIRECTORY;
2643 
2644 			// we want to open the file, so we should have the rights to do so
2645 			if (inode->CheckPermissions(open_mode_to_access(openMode)
2646 					| ((openMode & O_TRUNC) != 0 ? W_OK : 0)) != B_OK)
2647 				return B_NOT_ALLOWED;
2648 
2649 			if ((openMode & O_TRUNC) != 0) {
2650 				// truncate the existing file
2651 				inode->WriteLockInTransaction(transaction);
2652 
2653 				status_t status = inode->SetFileSize(transaction, 0);
2654 				if (status == B_OK)
2655 					status = inode->WriteBack(transaction);
2656 
2657 				if (status != B_OK)
2658 					return status;
2659 			}
2660 
2661 			if (_created)
2662 				*_created = false;
2663 			if (_id)
2664 				*_id = inode->ID();
2665 			if (_inode)
2666 				*_inode = inode;
2667 
2668 			// Only keep the vnode in memory if the _id or _inode pointer is
2669 			// provided
2670 			if (_id != NULL || _inode != NULL)
2671 				vnode.Keep();
2672 
2673 			return B_OK;
2674 		}
2675 	} else if (parent != NULL && (mode & S_ATTR_DIR) == 0) {
2676 		return B_BAD_VALUE;
2677 	} else if ((openMode & O_DIRECTORY) != 0) {
2678 		// TODO: we might need to return B_NOT_A_DIRECTORY here
2679 		return B_ENTRY_NOT_FOUND;
2680 	}
2681 
2682 	status_t status;
2683 
2684 	// do we have the power to create new files at all?
2685 	if (parent != NULL && (status = parent->CheckPermissions(W_OK)) != B_OK)
2686 		RETURN_ERROR(status);
2687 
2688 	// allocate space for the new inode
2689 	InodeAllocator allocator(transaction);
2690 	block_run run;
2691 	Inode* inode;
2692 	status = allocator.New(&parentRun, mode, publishFlags, run, vnodeOps,
2693 		&inode);
2694 	if (status < B_OK)
2695 		return status;
2696 
2697 	T(Create(inode, parent, name, mode, openMode, type));
2698 
2699 	// Initialize the parts of the bfs_inode structure that
2700 	// InodeAllocator::New() hasn't touched yet
2701 
2702 	bfs_inode* node = &inode->Node();
2703 
2704 	if (parent == NULL) {
2705 		// we set the parent to itself in this case
2706 		// (only happens for the root and indices node)
2707 		node->parent = run;
2708 	} else
2709 		node->parent = parentRun;
2710 
2711 	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
2712 	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent
2713 		? parent->Node().GroupID() : getegid());
2714 		// the group ID is inherited from the parent, if available
2715 
2716 	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
2717 
2718 	inode->WriteBack(transaction);
2719 		// make sure the initialized node is available to others
2720 
2721 	// only add the name to regular files, directories, or symlinks
2722 	// don't add it to attributes, or indices
2723 	if (tree && inode->IsRegularNode()
2724 		&& inode->SetName(transaction, name) != B_OK)
2725 		return B_ERROR;
2726 
2727 	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2728 	// a standard directory for files - not for attributes or indices)
2729 	if (inode->IsContainer()) {
2730 		status = allocator.CreateTree();
2731 		if (status != B_OK)
2732 			return status;
2733 	}
2734 
2735 	// Add a link to the inode from the parent, depending on its type
2736 	// (the vnode is not published yet, so it is safe to make the inode
2737 	// accessable to the file system here)
2738 	if (tree != NULL) {
2739 		status = tree->Insert(transaction, name, inode->ID());
2740 	} else if (parent != NULL && (mode & S_ATTR_DIR) != 0) {
2741 		parent->Attributes() = run;
2742 		status = parent->WriteBack(transaction);
2743 	}
2744 
2745 	// Note, we only care if the inode could be made accessable for the
2746 	// two cases above; the root node or the indices root node must
2747 	// handle this case on their own (or other cases where "parent" is
2748 	// NULL)
2749 	if (status != B_OK)
2750 		RETURN_ERROR(status);
2751 
2752 	// Update the main indices (name, size & last_modified)
2753 	// (live queries might want to access us after this)
2754 
2755 	Index index(volume);
2756 	if (inode->InNameIndex() && name != NULL) {
2757 		// the name index only contains regular files
2758 		// (but not the root node where name == NULL)
2759 		status = index.InsertName(transaction, name, inode);
2760 		if (status != B_OK && status != B_BAD_INDEX) {
2761 			// We have to remove the node from the parent at this point,
2762 			// because the InodeAllocator destructor can't handle this
2763 			// case (and if it fails, we can't do anything about it...)
2764 			if (tree)
2765 				tree->Remove(transaction, name, inode->ID());
2766 			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2767 				parent->Node().attributes.SetTo(0, 0, 0);
2768 
2769 			RETURN_ERROR(status);
2770 		}
2771 	}
2772 
2773 	if (parent != NULL && parent->IsContainer())
2774 		parent->ContainerContentsChanged(transaction);
2775 
2776 	inode->UpdateOldLastModified();
2777 
2778 	// The "size" & "last_modified" indices don't contain directories.
2779 	// If adding to these indices fails, the inode creation will not be
2780 	// harmed; they are considered less important than the "name" index.
2781 	if (inode->InSizeIndex())
2782 		index.InsertSize(transaction, inode);
2783 	if (inode->InLastModifiedIndex())
2784 		index.InsertLastModified(transaction, inode);
2785 
2786 	if (inode->NeedsFileCache()) {
2787 		inode->SetFileCache(file_cache_create(volume->ID(), inode->ID(),
2788 			inode->Size()));
2789 		inode->SetMap(file_map_create(volume->ID(), inode->ID(),
2790 			inode->Size()));
2791 
2792 		if (inode->FileCache() == NULL || inode->Map() == NULL)
2793 			return B_NO_MEMORY;
2794 	}
2795 
2796 	// Everything worked well until this point, we have a fully
2797 	// initialized inode, and we want to keep it
2798 	allocator.Keep(vnodeOps, publishFlags);
2799 
2800 	if (_created)
2801 		*_created = true;
2802 	if (_id != NULL)
2803 		*_id = inode->ID();
2804 	if (_inode != NULL)
2805 		*_inode = inode;
2806 
2807 	// if either _id or _inode is passed, we will keep the inode locked
2808 	if (_id == NULL && _inode == NULL)
2809 		put_vnode(volume->FSVolume(), inode->ID());
2810 
2811 	return B_OK;
2812 }
2813 
2814 
2815 //	#pragma mark - AttributeIterator
2816 
2817 
2818 AttributeIterator::AttributeIterator(Inode* inode)
2819 	:
2820 	fCurrentSmallData(0),
2821 	fInode(inode),
2822 	fAttributes(NULL),
2823 	fIterator(NULL),
2824 	fBuffer(NULL)
2825 {
2826 	inode->_AddIterator(this);
2827 }
2828 
2829 
2830 AttributeIterator::~AttributeIterator()
2831 {
2832 	if (fAttributes)
2833 		put_vnode(fAttributes->GetVolume()->FSVolume(), fAttributes->ID());
2834 
2835 	delete fIterator;
2836 	fInode->_RemoveIterator(this);
2837 }
2838 
2839 
2840 status_t
2841 AttributeIterator::Rewind()
2842 {
2843 	fCurrentSmallData = 0;
2844 
2845 	if (fIterator != NULL)
2846 		fIterator->Rewind();
2847 
2848 	return B_OK;
2849 }
2850 
2851 
2852 status_t
2853 AttributeIterator::GetNext(char* name, size_t* _length, uint32* _type,
2854 	ino_t* _id)
2855 {
2856 	// read attributes out of the small data section
2857 
2858 	if (fCurrentSmallData >= 0) {
2859 		NodeGetter nodeGetter(fInode->GetVolume(), fInode);
2860 		if (nodeGetter.Node() == NULL)
2861 			return B_IO_ERROR;
2862 
2863 		const bfs_inode* node = nodeGetter.Node();
2864 		const small_data* item = ((bfs_inode*)node)->SmallDataStart();
2865 
2866 		RecursiveLocker _(&fInode->SmallDataLock());
2867 
2868 		int32 index = 0;
2869 		for (; !item->IsLast(node); item = item->Next(), index++) {
2870 			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2871 				&& *item->Name() == FILE_NAME_NAME)
2872 				continue;
2873 
2874 			if (index >= fCurrentSmallData)
2875 				break;
2876 		}
2877 
2878 		if (!item->IsLast(node)) {
2879 			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2880 			*_type = item->Type();
2881 			*_length = item->NameSize();
2882 			*_id = (ino_t)index;
2883 
2884 			fCurrentSmallData = index + 1;
2885 			return B_OK;
2886 		}
2887 
2888 		// stop traversing the small_data section
2889 		fCurrentSmallData = -1;
2890 	}
2891 
2892 	// read attributes out of the attribute directory
2893 
2894 	if (fInode->Attributes().IsZero())
2895 		return B_ENTRY_NOT_FOUND;
2896 
2897 	Volume* volume = fInode->GetVolume();
2898 
2899 	// if you haven't yet access to the attributes directory, get it
2900 	if (fAttributes == NULL) {
2901 		if (get_vnode(volume->FSVolume(), volume->ToVnode(fInode->Attributes()),
2902 				(void**)&fAttributes) != B_OK) {
2903 			FATAL(("get_vnode() failed in AttributeIterator::GetNext(ino_t"
2904 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2905 			return B_ENTRY_NOT_FOUND;
2906 		}
2907 
2908 		BPlusTree* tree = fAttributes->Tree();
2909 		if (tree == NULL
2910 			|| (fIterator = new(std::nothrow) TreeIterator(tree)) == NULL) {
2911 			FATAL(("could not get tree in AttributeIterator::GetNext(ino_t"
2912 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2913 			return B_ENTRY_NOT_FOUND;
2914 		}
2915 	}
2916 
2917 	uint16 length;
2918 	ino_t id;
2919 	status_t status = fIterator->GetNextEntry(name, &length,
2920 		B_FILE_NAME_LENGTH, &id);
2921 	if (status != B_OK)
2922 		return status;
2923 
2924 	Vnode vnode(volume, id);
2925 	Inode* attribute;
2926 	if ((status = vnode.Get(&attribute)) == B_OK) {
2927 		*_type = attribute->Type();
2928 		*_length = length;
2929 		*_id = id;
2930 	}
2931 
2932 	return status;
2933 }
2934 
2935 
2936 void
2937 AttributeIterator::Update(uint16 index, int8 change)
2938 {
2939 	// fCurrentSmallData points already to the next item
2940 	if (index < fCurrentSmallData)
2941 		fCurrentSmallData += change;
2942 }
2943 
2944