xref: /haiku/src/add-ons/kernel/file_systems/bfs/Inode.cpp (revision 1c09002cbee8e797a0f8bbfc5678dfadd39ee1a7)
1 /*
2  * Copyright 2001-2010, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Inode access functions
8 
9 
10 #include "Debug.h"
11 #include "Inode.h"
12 #include "BPlusTree.h"
13 #include "Index.h"
14 
15 
16 #if BFS_TRACING && !defined(BFS_SHELL) && !defined(_BOOT_MODE)
17 namespace BFSInodeTracing {
18 
19 class Create : public AbstractTraceEntry {
20 public:
21 	Create(Inode* inode, Inode* parent, const char* name, int32 mode,
22 			int openMode, uint32 type)
23 		:
24 		fInode(inode),
25 		fID(inode->ID()),
26 		fParent(parent),
27 		fParentID(parent != NULL ? parent->ID() : 0),
28 		fMode(mode),
29 		fOpenMode(openMode),
30 		fType(type)
31 	{
32 		if (name != NULL)
33 			strlcpy(fName, name, sizeof(fName));
34 		else
35 			fName[0] = '\0';
36 
37 		Initialized();
38 	}
39 
40 	virtual void AddDump(TraceOutput& out)
41 	{
42 		out.Print("bfs:Create %Ld (%p), parent %Ld (%p), \"%s\", "
43 			"mode %lx, omode %x, type %lx", fID, fInode, fParentID,
44 			fParent, fName, fMode, fOpenMode, fType);
45 	}
46 
47 private:
48 	Inode*	fInode;
49 	ino_t	fID;
50 	Inode*	fParent;
51 	ino_t	fParentID;
52 	char	fName[32];
53 	int32	fMode;
54 	int		fOpenMode;
55 	uint32	fType;
56 };
57 
58 class Remove : public AbstractTraceEntry {
59 public:
60 	Remove(Inode* inode, const char* name)
61 		:
62 		fInode(inode),
63 		fID(inode->ID())
64 	{
65 		strlcpy(fName, name, sizeof(fName));
66 		Initialized();
67 	}
68 
69 	virtual void AddDump(TraceOutput& out)
70 	{
71 		out.Print("bfs:Remove %Ld (%p), \"%s\"", fID, fInode, fName);
72 	}
73 
74 private:
75 	Inode*	fInode;
76 	ino_t	fID;
77 	char	fName[32];
78 };
79 
80 class Action : public AbstractTraceEntry {
81 public:
82 	Action(const char* action, Inode* inode)
83 		:
84 		fInode(inode),
85 		fID(inode->ID())
86 	{
87 		strlcpy(fAction, action, sizeof(fAction));
88 		Initialized();
89 	}
90 
91 	virtual void AddDump(TraceOutput& out)
92 	{
93 		out.Print("bfs:%s %Ld (%p)\n", fAction, fID, fInode);
94 	}
95 
96 private:
97 	Inode*	fInode;
98 	ino_t	fID;
99 	char	fAction[16];
100 };
101 
102 class Resize : public AbstractTraceEntry {
103 public:
104 	Resize(Inode* inode, off_t oldSize, off_t newSize, bool trim)
105 		:
106 		fInode(inode),
107 		fID(inode->ID()),
108 		fOldSize(oldSize),
109 		fNewSize(newSize),
110 		fTrim(trim)
111 	{
112 		Initialized();
113 	}
114 
115 	virtual void AddDump(TraceOutput& out)
116 	{
117 		out.Print("bfs:%s %Ld (%p), %Ld -> %Ld", fTrim ? "Trim" : "Resize",
118 			fID, fInode, fOldSize, fNewSize);
119 	}
120 
121 private:
122 	Inode*	fInode;
123 	ino_t	fID;
124 	off_t	fOldSize;
125 	off_t	fNewSize;
126 	bool	fTrim;
127 };
128 
129 }	// namespace BFSInodeTracing
130 
131 #	define T(x) new(std::nothrow) BFSInodeTracing::x;
132 #else
133 #	define T(x) ;
134 #endif
135 
136 
137 /*!	A helper class used by Inode::Create() to keep track of the belongings
138 	of an inode creation in progress.
139 	This class will make sure everything is cleaned up properly.
140 */
141 class InodeAllocator {
142 public:
143 							InodeAllocator(Transaction& transaction);
144 							~InodeAllocator();
145 
146 			status_t		New(block_run* parentRun, mode_t mode,
147 								block_run& run, fs_vnode_ops* vnodeOps,
148 								Inode** _inode);
149 			status_t		CreateTree();
150 			status_t		Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags);
151 
152 private:
153 	static	void			_TransactionListener(int32 id, int32 event,
154 								void* _inode);
155 
156 			Transaction*	fTransaction;
157 			block_run		fRun;
158 			Inode*			fInode;
159 };
160 
161 
162 InodeAllocator::InodeAllocator(Transaction& transaction)
163 	:
164 	fTransaction(&transaction),
165 	fInode(NULL)
166 {
167 }
168 
169 
170 InodeAllocator::~InodeAllocator()
171 {
172 	if (fTransaction != NULL) {
173 		Volume* volume = fTransaction->GetVolume();
174 
175 		if (fInode != NULL) {
176 			fInode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
177 				// this unblocks any pending bfs_read_vnode() calls
178 			fInode->Free(*fTransaction);
179 			fTransaction->RemoveListener(fInode);
180 
181 			remove_vnode(volume->FSVolume(), fInode->ID());
182 		} else
183 			volume->Free(*fTransaction, fRun);
184 	}
185 
186 	delete fInode;
187 }
188 
189 
190 status_t
191 InodeAllocator::New(block_run* parentRun, mode_t mode, block_run& run,
192 	fs_vnode_ops* vnodeOps, Inode** _inode)
193 {
194 	Volume* volume = fTransaction->GetVolume();
195 
196 	status_t status = volume->AllocateForInode(*fTransaction, parentRun, mode,
197 		fRun);
198 	if (status < B_OK) {
199 		// don't free the space in the destructor, because
200 		// the allocation failed
201 		fTransaction = NULL;
202 		RETURN_ERROR(status);
203 	}
204 
205 	run = fRun;
206 	fInode = new Inode(volume, *fTransaction, volume->ToVnode(run), mode, run);
207 	if (fInode == NULL)
208 		RETURN_ERROR(B_NO_MEMORY);
209 
210 	if (!volume->IsInitializing()) {
211 		status = new_vnode(volume->FSVolume(), fInode->ID(), fInode,
212 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps);
213 		if (status < B_OK) {
214 			delete fInode;
215 			fInode = NULL;
216 			RETURN_ERROR(status);
217 		}
218 	}
219 
220 	fInode->WriteLockInTransaction(*fTransaction);
221 	*_inode = fInode;
222 	return B_OK;
223 }
224 
225 
226 status_t
227 InodeAllocator::CreateTree()
228 {
229 	Volume* volume = fTransaction->GetVolume();
230 
231 	// force S_STR_INDEX to be set, if no type is set
232 	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
233 		fInode->Node().mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
234 
235 	BPlusTree* tree = fInode->fTree = new BPlusTree(*fTransaction, fInode);
236 	if (tree == NULL || tree->InitCheck() < B_OK)
237 		return B_ERROR;
238 
239 	if (fInode->IsRegularNode()) {
240 		if (tree->Insert(*fTransaction, ".", fInode->ID()) < B_OK
241 			|| tree->Insert(*fTransaction, "..",
242 					volume->ToVnode(fInode->Parent())) < B_OK)
243 			return B_ERROR;
244 	}
245 	return B_OK;
246 }
247 
248 
249 status_t
250 InodeAllocator::Keep(fs_vnode_ops* vnodeOps, uint32 publishFlags)
251 {
252 	ASSERT(fInode != NULL && fTransaction != NULL);
253 	Volume* volume = fTransaction->GetVolume();
254 
255 	status_t status = fInode->WriteBack(*fTransaction);
256 	if (status < B_OK) {
257 		FATAL(("writing new inode %" B_PRIdINO " failed!\n", fInode->ID()));
258 		return status;
259 	}
260 
261 	// Symbolic links are not published -- the caller needs to do this once
262 	// the contents have been written.
263 	if (!fInode->IsSymLink() && volume->ID() >= 0) {
264 		status = publish_vnode(volume->FSVolume(), fInode->ID(), fInode,
265 			vnodeOps != NULL ? vnodeOps : &gBFSVnodeOps, fInode->Mode(),
266 			publishFlags);
267 	}
268 
269 	if (status == B_OK) {
270 		cache_add_transaction_listener(volume->BlockCache(), fTransaction->ID(),
271 			TRANSACTION_ABORTED, &_TransactionListener, fInode);
272 	}
273 
274 	fTransaction = NULL;
275 	fInode = NULL;
276 
277 	return status;
278 }
279 
280 
281 /*static*/ void
282 InodeAllocator::_TransactionListener(int32 id, int32 event, void* _inode)
283 {
284 	Inode* inode = (Inode*)_inode;
285 
286 	if (event == TRANSACTION_ABORTED)
287 		panic("transaction %d aborted, inode %p still around!\n", (int)id, inode);
288 }
289 
290 
291 //	#pragma mark -
292 
293 
294 status_t
295 bfs_inode::InitCheck(Volume* volume) const
296 {
297 	if (Magic1() != INODE_MAGIC1
298 		|| !(Flags() & INODE_IN_USE)
299 		|| inode_num.Length() != 1
300 		// matches inode size?
301 		|| (uint32)InodeSize() != volume->InodeSize()
302 		// parent resides on disk?
303 		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
304 		|| parent.AllocationGroup() < 0
305 		|| parent.Start() > (1L << volume->AllocationGroupShift())
306 		|| parent.Length() != 1
307 		// attributes, too?
308 		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
309 		|| attributes.AllocationGroup() < 0
310 		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
311 		RETURN_ERROR(B_BAD_DATA);
312 
313 	if (Flags() & INODE_DELETED)
314 		return B_NOT_ALLOWED;
315 
316 	// TODO: Add some tests to check the integrity of the other stuff here,
317 	// especially for the data_stream!
318 
319 	return B_OK;
320 }
321 
322 
323 //	#pragma mark - Inode
324 
325 
326 Inode::Inode(Volume* volume, ino_t id)
327 	:
328 	fVolume(volume),
329 	fID(id),
330 	fTree(NULL),
331 	fAttributes(NULL),
332 	fCache(NULL),
333 	fMap(NULL)
334 {
335 	PRINT(("Inode::Inode(volume = %p, id = %Ld) @ %p\n", volume, id, this));
336 
337 	UpdateNodeFromDisk();
338 
339 	rw_lock_init(&fLock, "bfs inode");
340 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
341 
342 	// these two will help to maintain the indices
343 	fOldSize = Size();
344 	fOldLastModified = LastModified();
345 
346 	if (IsContainer())
347 		fTree = new BPlusTree(this);
348 	if (NeedsFileCache()) {
349 		SetFileCache(file_cache_create(fVolume->ID(), ID(), Size()));
350 		SetMap(file_map_create(volume->ID(), ID(), Size()));
351 	}
352 }
353 
354 
355 Inode::Inode(Volume* volume, Transaction& transaction, ino_t id, mode_t mode,
356 		block_run& run)
357 	:
358 	fVolume(volume),
359 	fID(id),
360 	fTree(NULL),
361 	fAttributes(NULL),
362 	fCache(NULL),
363 	fMap(NULL)
364 {
365 	PRINT(("Inode::Inode(volume = %p, transaction = %p, id = %Ld) @ %p\n",
366 		volume, &transaction, id, this));
367 
368 	rw_lock_init(&fLock, "bfs inode");
369 	recursive_lock_init(&fSmallDataLock, "bfs inode small data");
370 
371 	NodeGetter node(volume, transaction, this, true);
372 	memset(&fNode, 0, sizeof(bfs_inode));
373 
374 	// Initialize the bfs_inode structure -- it's not written back to disk
375 	// here, because it will be done so already when the inode could be
376 	// created completely.
377 
378 	Node().magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
379 	Node().inode_num = run;
380 	Node().mode = HOST_ENDIAN_TO_BFS_INT32(mode);
381 	Node().flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
382 
383 	Node().create_time = Node().last_modified_time = Node().status_change_time
384 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
385 
386 	Node().inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
387 
388 	// these two will help to maintain the indices
389 	fOldSize = Size();
390 	fOldLastModified = LastModified();
391 }
392 
393 
394 Inode::~Inode()
395 {
396 	PRINT(("Inode::~Inode() @ %p\n", this));
397 
398 	file_cache_delete(FileCache());
399 	file_map_delete(Map());
400 	delete fTree;
401 
402 	rw_lock_destroy(&fLock);
403 }
404 
405 
406 status_t
407 Inode::InitCheck(bool checkNode) const
408 {
409 	// test inode magic and flags
410 	if (checkNode) {
411 		status_t status = Node().InitCheck(fVolume);
412 		if (status == B_BUSY)
413 			return B_BUSY;
414 
415 		if (status != B_OK) {
416 			FATAL(("inode at block %" B_PRIdOFF " corrupt!\n", BlockNumber()));
417 			RETURN_ERROR(B_BAD_DATA);
418 		}
419 	}
420 
421 	if (IsContainer()) {
422 		// inodes that have a B+tree
423 		if (fTree == NULL)
424 			RETURN_ERROR(B_NO_MEMORY);
425 
426 		status_t status = fTree->InitCheck();
427 		if (status != B_OK) {
428 			FATAL(("inode tree at block %" B_PRIdOFF " corrupt!\n",
429 				BlockNumber()));
430 			RETURN_ERROR(B_BAD_DATA);
431 		}
432 	}
433 
434 	if (NeedsFileCache() && (fCache == NULL || fMap == NULL))
435 		return B_NO_MEMORY;
436 
437 	return B_OK;
438 }
439 
440 
441 /*!	Adds this inode to the specified transaction. This means that the inode will
442 	be write locked until the transaction ended.
443 	To ensure that the inode will stay valid until that point, an extra reference
444 	is acquired to it as long as this transaction stays active.
445 */
446 void
447 Inode::WriteLockInTransaction(Transaction& transaction)
448 {
449 	// These flags can only change while holding the transaction lock
450 	if ((Flags() & INODE_IN_TRANSACTION) != 0)
451 		return;
452 
453 	// We share the same list link with the removed list, so we have to remove
454 	// the inode from that list here (and add it back when we no longer need it)
455 	if ((Flags() & INODE_DELETED) != 0)
456 		fVolume->RemovedInodes().Remove(this);
457 
458 	if (!fVolume->IsInitializing())
459 		acquire_vnode(fVolume->FSVolume(), ID());
460 
461 	rw_lock_write_lock(&Lock());
462 	Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
463 
464 	transaction.AddListener(this);
465 }
466 
467 
468 status_t
469 Inode::WriteBack(Transaction& transaction)
470 {
471 	NodeGetter node(fVolume, transaction, this);
472 	if (node.WritableNode() == NULL)
473 		return B_IO_ERROR;
474 
475 	memcpy(node.WritableNode(), &Node(), sizeof(bfs_inode));
476 	return B_OK;
477 }
478 
479 
480 void
481 Inode::UpdateNodeFromDisk()
482 {
483 	NodeGetter node(fVolume, this);
484 	memcpy(&fNode, node.Node(), sizeof(bfs_inode));
485 	fNode.flags &= HOST_ENDIAN_TO_BFS_INT32(INODE_PERMANENT_FLAGS);
486 }
487 
488 
489 status_t
490 Inode::CheckPermissions(int accessMode) const
491 {
492 	// you never have write access to a read-only volume
493 	if ((accessMode & W_OK) != 0 && fVolume->IsReadOnly())
494 		return B_READ_ONLY_DEVICE;
495 
496 	// get node permissions
497 	mode_t mode = Mode();
498 	int userPermissions = (mode & S_IRWXU) >> 6;
499 	int groupPermissions = (mode & S_IRWXG) >> 3;
500 	int otherPermissions = mode & S_IRWXO;
501 
502 	// get the node permissions for this uid/gid
503 	int permissions = 0;
504 	uid_t uid = geteuid();
505 	gid_t gid = getegid();
506 
507 	if (uid == 0) {
508 		// user is root
509 		// root has always read/write permission, but at least one of the
510 		// X bits must be set for execute permission
511 		permissions = userPermissions | groupPermissions | otherPermissions
512 			| R_OK | W_OK;
513 	} else if (uid == (uid_t)fNode.UserID()) {
514 		// user is node owner
515 		permissions = userPermissions;
516 	} else if (gid == (gid_t)fNode.GroupID()) {
517 		// user is in owning group
518 		permissions = groupPermissions;
519 	} else {
520 		// user is one of the others
521 		permissions = otherPermissions;
522 	}
523 
524 	return (accessMode & ~permissions) == 0 ? B_OK : B_NOT_ALLOWED;
525 }
526 
527 
528 //	#pragma mark - attributes
529 
530 
531 void
532 Inode::_AddIterator(AttributeIterator* iterator)
533 {
534 	RecursiveLocker _(fSmallDataLock);
535 	fIterators.Add(iterator);
536 }
537 
538 
539 void
540 Inode::_RemoveIterator(AttributeIterator* iterator)
541 {
542 	RecursiveLocker _(fSmallDataLock);
543 	fIterators.Remove(iterator);
544 }
545 
546 
547 /*!	Tries to free up "bytes" space in the small_data section by moving
548 	attributes to real files. Used for system attributes like the name.
549 	You need to hold the fSmallDataLock when you call this method
550 */
551 status_t
552 Inode::_MakeSpaceForSmallData(Transaction& transaction, bfs_inode* node,
553 	const char* name, int32 bytes)
554 {
555 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
556 
557 	while (bytes > 0) {
558 		small_data* item = node->SmallDataStart();
559 		small_data* max = NULL;
560 		int32 index = 0, maxIndex = 0;
561 		for (; !item->IsLast(node); item = item->Next(), index++) {
562 			// should not remove those
563 			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
564 				continue;
565 
566 			if (max == NULL || max->Size() < item->Size()) {
567 				maxIndex = index;
568 				max = item;
569 			}
570 
571 			// Remove the first one large enough to free the needed amount of
572 			// bytes
573 			if (bytes < (int32)item->Size())
574 				break;
575 		}
576 
577 		if (item->IsLast(node) || (int32)item->Size() < bytes)
578 			return B_ERROR;
579 
580 		bytes -= max->Size();
581 
582 		// Move the attribute to a real attribute file
583 		// Luckily, this doesn't cause any index updates
584 
585 		Inode* attribute;
586 		status_t status = CreateAttribute(transaction, item->Name(),
587 			item->Type(), &attribute);
588 		if (status != B_OK)
589 			RETURN_ERROR(status);
590 
591 		size_t length = item->DataSize();
592 		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
593 
594 		ReleaseAttribute(attribute);
595 
596 		if (status != B_OK) {
597 			Vnode vnode(fVolume, Attributes());
598 			Inode* attributes;
599 			if (vnode.Get(&attributes) < B_OK
600 				|| attributes->Remove(transaction, name) < B_OK) {
601 				FATAL(("Could not remove newly created attribute!\n"));
602 			}
603 
604 			RETURN_ERROR(status);
605 		}
606 
607 		_RemoveSmallData(node, max, maxIndex);
608 	}
609 	return B_OK;
610 }
611 
612 
613 /*!	Private function which removes the given attribute from the small_data
614 	section.
615 	You need to hold the fSmallDataLock when you call this method
616 */
617 status_t
618 Inode::_RemoveSmallData(bfs_inode* node, small_data* item, int32 index)
619 {
620 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
621 
622 	small_data* next = item->Next();
623 	if (!next->IsLast(node)) {
624 		// find the last attribute
625 		small_data* last = next;
626 		while (!last->IsLast(node))
627 			last = last->Next();
628 
629 		int32 size = (uint8*)last - (uint8*)next;
630 		if (size < 0
631 			|| size > (uint8*)node + fVolume->BlockSize() - (uint8*)next)
632 			return B_BAD_DATA;
633 
634 		memmove(item, next, size);
635 
636 		// Move the "last" one to its new location and
637 		// correctly terminate the small_data section
638 		last = (small_data*)((uint8*)last - ((uint8*)next - (uint8*)item));
639 		memset(last, 0, (uint8*)node + fVolume->BlockSize() - (uint8*)last);
640 	} else
641 		memset(item, 0, item->Size());
642 
643 	// update all current iterators
644 	SinglyLinkedList<AttributeIterator>::Iterator iterator
645 		= fIterators.GetIterator();
646 	while (iterator.HasNext()) {
647 		iterator.Next()->Update(index, -1);
648 	}
649 
650 	return B_OK;
651 }
652 
653 
654 //!	Removes the given attribute from the small_data section.
655 status_t
656 Inode::_RemoveSmallData(Transaction& transaction, NodeGetter& nodeGetter,
657 	const char* name)
658 {
659 	if (name == NULL)
660 		return B_BAD_VALUE;
661 
662 	bfs_inode* node = nodeGetter.WritableNode();
663 	RecursiveLocker locker(fSmallDataLock);
664 
665 	// search for the small_data item
666 
667 	small_data* item = node->SmallDataStart();
668 	int32 index = 0;
669 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
670 		item = item->Next();
671 		index++;
672 	}
673 
674 	if (item->IsLast(node))
675 		return B_ENTRY_NOT_FOUND;
676 
677 	nodeGetter.MakeWritable(transaction);
678 
679 	status_t status = _RemoveSmallData(node, item, index);
680 	if (status == B_OK) {
681 		Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
682 			bfs_inode::ToInode(real_time_clock_usecs()));
683 
684 		status = WriteBack(transaction);
685 	}
686 
687 	return status;
688 }
689 
690 
691 /*!	Try to place the given attribute in the small_data section - if the
692 	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
693 	In that case, the attribute should be written to a real attribute file;
694 	it's the caller's responsibility to remove any existing attributes in the
695 	small data section if that's the case.
696 
697 	Note that you need to write back the inode yourself after having called that
698 	method - it's a bad API decision that it needs a transaction but enforces
699 	you to write back the inode all by yourself, but it's just more efficient
700 	in most cases...
701 */
702 status_t
703 Inode::_AddSmallData(Transaction& transaction, NodeGetter& nodeGetter,
704 	const char* name, uint32 type, off_t pos, const uint8* data, size_t length,
705 	bool force)
706 {
707 	bfs_inode* node = nodeGetter.WritableNode();
708 
709 	if (node == NULL || name == NULL || data == NULL)
710 		return B_BAD_VALUE;
711 
712 	// reject any requests that can't fit into the small_data section
713 	uint32 nameLength = strlen(name);
714 	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + pos + length + 1;
715 	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
716 		return B_DEVICE_FULL;
717 
718 	nodeGetter.MakeWritable(transaction);
719 	RecursiveLocker locker(fSmallDataLock);
720 
721 	// Find the last item or one with the same name we have to add
722 	small_data* item = node->SmallDataStart();
723 	int32 index = 0;
724 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
725 		item = item->Next();
726 		index++;
727 	}
728 
729 	// is the attribute already in the small_data section?
730 	// then just replace the data part of that one
731 	if (!item->IsLast(node)) {
732 		// find last attribute
733 		small_data* last = item;
734 		while (!last->IsLast(node))
735 			last = last->Next();
736 
737 		// try to change the attributes value
738 		if (item->data_size > pos + length
739 			|| force
740 			|| ((uint8*)last + pos + length - item->DataSize())
741 					<= ((uint8*)node + fVolume->InodeSize())) {
742 			// Make room for the new attribute if needed (and we are forced
743 			// to do so)
744 			if (force && ((uint8*)last + pos + length - item->DataSize())
745 					> ((uint8*)node + fVolume->InodeSize())) {
746 				// We also take the free space at the end of the small_data
747 				// section into account, and request only what's really needed
748 				uint32 needed = pos + length - item->DataSize() -
749 					(uint32)((uint8*)node + fVolume->InodeSize()
750 						- (uint8*)last);
751 
752 				if (_MakeSpaceForSmallData(transaction, node, name, needed)
753 						!= B_OK)
754 					return B_ERROR;
755 
756 				// reset our pointers
757 				item = node->SmallDataStart();
758 				index = 0;
759 				while (!item->IsLast(node) && strcmp(item->Name(), name)) {
760 					item = item->Next();
761 					index++;
762 				}
763 
764 				last = item;
765 				while (!last->IsLast(node))
766 					last = last->Next();
767 			}
768 
769 			size_t oldDataSize = item->DataSize();
770 
771 			// Normally, we can just overwrite the attribute data as the size
772 			// is specified by the type and does not change that often
773 			if (pos + length != item->DataSize()) {
774 				// move the attributes after the current one
775 				small_data* next = item->Next();
776 				if (!next->IsLast(node)) {
777 					memmove((uint8*)item + spaceNeeded, next,
778 						(uint8*)last - (uint8*)next);
779 				}
780 
781 				// Move the "last" one to its new location and
782 				// correctly terminate the small_data section
783 				last = (small_data*)((uint8*)last
784 					- ((uint8*)next - ((uint8*)item + spaceNeeded)));
785 				if ((uint8*)last < (uint8*)node + fVolume->BlockSize()) {
786 					memset(last, 0, (uint8*)node + fVolume->BlockSize()
787 						- (uint8*)last);
788 				}
789 
790 				item->data_size = HOST_ENDIAN_TO_BFS_INT16(pos + length);
791 			}
792 
793 			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
794 
795 			if ((uint64)oldDataSize < (uint64)pos) {
796 				// Fill gap with zeros
797 				memset(item->Data() + oldDataSize, 0, pos - oldDataSize);
798 			}
799 			memcpy(item->Data() + pos, data, length);
800 			item->Data()[pos + length] = '\0';
801 
802 			return B_OK;
803 		}
804 
805 		return B_DEVICE_FULL;
806 	}
807 
808 	// try to add the new attribute!
809 
810 	if ((uint8*)item + spaceNeeded > (uint8*)node + fVolume->InodeSize()) {
811 		// there is not enough space for it!
812 		if (!force)
813 			return B_DEVICE_FULL;
814 
815 		// make room for the new attribute
816 		if (_MakeSpaceForSmallData(transaction, node, name, spaceNeeded) < B_OK)
817 			return B_ERROR;
818 
819 		// get new last item!
820 		item = node->SmallDataStart();
821 		index = 0;
822 		while (!item->IsLast(node)) {
823 			item = item->Next();
824 			index++;
825 		}
826 	}
827 
828 	memset(item, 0, spaceNeeded);
829 	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
830 	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
831 	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
832 	strcpy(item->Name(), name);
833 	memcpy(item->Data() + pos, data, length);
834 
835 	// correctly terminate the small_data section
836 	item = item->Next();
837 	if (!item->IsLast(node))
838 		memset(item, 0, (uint8*)node + fVolume->InodeSize() - (uint8*)item);
839 
840 	// update all current iterators
841 	SinglyLinkedList<AttributeIterator>::Iterator iterator
842 		= fIterators.GetIterator();
843 	while (iterator.HasNext()) {
844 		iterator.Next()->Update(index, 1);
845 	}
846 
847 	return B_OK;
848 }
849 
850 
851 /*!	Iterates through the small_data section of an inode.
852 	To start at the beginning of this section, you let smallData
853 	point to NULL, like:
854 		small_data* data = NULL;
855 		while (inode->GetNextSmallData(&data) { ... }
856 
857 	This function is reentrant and doesn't allocate any memory;
858 	you can safely stop calling it at any point (you don't need
859 	to iterate through the whole list).
860 	You need to hold the fSmallDataLock when you call this method
861 */
862 status_t
863 Inode::_GetNextSmallData(bfs_inode* node, small_data** _smallData) const
864 {
865 	if (node == NULL)
866 		RETURN_ERROR(B_BAD_VALUE);
867 
868 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
869 
870 	small_data* data = *_smallData;
871 
872 	// begin from the start?
873 	if (data == NULL)
874 		data = node->SmallDataStart();
875 	else
876 		data = data->Next();
877 
878 	// is already last item?
879 	if (data->IsLast(node))
880 		return B_ENTRY_NOT_FOUND;
881 
882 	*_smallData = data;
883 
884 	return B_OK;
885 }
886 
887 
888 /*!	Finds the attribute "name" in the small data section, and
889 	returns a pointer to it (or NULL if it doesn't exist).
890 	You need to hold the fSmallDataLock when you call this method
891 */
892 small_data*
893 Inode::FindSmallData(const bfs_inode* node, const char* name) const
894 {
895 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
896 
897 	small_data* smallData = NULL;
898 	while (_GetNextSmallData(const_cast<bfs_inode*>(node), &smallData)
899 			== B_OK) {
900 		if (!strcmp(smallData->Name(), name))
901 			return smallData;
902 	}
903 	return NULL;
904 }
905 
906 
907 /*!	Returns a pointer to the node's name if present in the small data
908 	section, NULL otherwise.
909 	You need to hold the fSmallDataLock when you call this method
910 */
911 const char*
912 Inode::Name(const bfs_inode* node) const
913 {
914 	ASSERT_LOCKED_RECURSIVE(&fSmallDataLock);
915 
916 	small_data* smallData = NULL;
917 	while (_GetNextSmallData((bfs_inode*)node, &smallData) == B_OK) {
918 		if (*smallData->Name() == FILE_NAME_NAME
919 			&& smallData->NameSize() == FILE_NAME_NAME_LENGTH)
920 			return (const char*)smallData->Data();
921 	}
922 	return NULL;
923 }
924 
925 
926 /*!	Copies the node's name into the provided buffer.
927 	The buffer should be B_FILE_NAME_LENGTH bytes large.
928 */
929 status_t
930 Inode::GetName(char* buffer, size_t size) const
931 {
932 	NodeGetter node(fVolume, this);
933 	RecursiveLocker locker(fSmallDataLock);
934 
935 	const char* name = Name(node.Node());
936 	if (name == NULL)
937 		return B_ENTRY_NOT_FOUND;
938 
939 	strlcpy(buffer, name, size);
940 	return B_OK;
941 }
942 
943 
944 /*!	Changes or set the name of a file: in the inode small_data section only, it
945 	doesn't change it in the parent directory's b+tree.
946 	Note that you need to write back the inode yourself after having called
947 	that method. It suffers from the same API decision as AddSmallData() does
948 	(and for the same reason).
949 */
950 status_t
951 Inode::SetName(Transaction& transaction, const char* name)
952 {
953 	if (name == NULL || *name == '\0')
954 		return B_BAD_VALUE;
955 
956 	NodeGetter node(fVolume, transaction, this);
957 	const char nameTag[2] = {FILE_NAME_NAME, 0};
958 
959 	return _AddSmallData(transaction, node, nameTag, FILE_NAME_TYPE, 0,
960 		(uint8*)name, strlen(name), true);
961 }
962 
963 
964 status_t
965 Inode::_RemoveAttribute(Transaction& transaction, const char* name,
966 	bool hasIndex, Index* index)
967 {
968 	// remove the attribute file if it exists
969 	Vnode vnode(fVolume, Attributes());
970 	Inode* attributes;
971 	status_t status = vnode.Get(&attributes);
972 	if (status < B_OK)
973 		return status;
974 
975 	// update index
976 	if (index != NULL) {
977 		Inode* attribute;
978 		if ((hasIndex || fVolume->CheckForLiveQuery(name))
979 			&& GetAttribute(name, &attribute) == B_OK) {
980 			uint8 data[BPLUSTREE_MAX_KEY_LENGTH];
981 			size_t length = BPLUSTREE_MAX_KEY_LENGTH;
982 			if (attribute->ReadAt(0, data, &length) == B_OK) {
983 				index->Update(transaction, name, attribute->Type(), data,
984 					length, NULL, 0, this);
985 			}
986 
987 			ReleaseAttribute(attribute);
988 		}
989 	}
990 
991 	if ((status = attributes->Remove(transaction, name)) < B_OK)
992 		return status;
993 
994 	if (attributes->IsEmpty()) {
995 		attributes->WriteLockInTransaction(transaction);
996 
997 		// remove attribute directory (don't fail if that can't be done)
998 		if (remove_vnode(fVolume->FSVolume(), attributes->ID()) == B_OK) {
999 			// update the inode, so that no one will ever doubt it's deleted :-)
1000 			attributes->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1001 			if (attributes->WriteBack(transaction) == B_OK) {
1002 				Attributes().SetTo(0, 0, 0);
1003 				WriteBack(transaction);
1004 			} else {
1005 				unremove_vnode(fVolume->FSVolume(), attributes->ID());
1006 				attributes->Node().flags
1007 					&= ~HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1008 			}
1009 		}
1010 	}
1011 
1012 	return status;
1013 }
1014 
1015 
1016 /*!	Reads data from the specified attribute.
1017 	This is a high-level attribute function that understands attributes
1018 	in the small_data section as well as real attribute files.
1019 */
1020 status_t
1021 Inode::ReadAttribute(const char* name, int32 type, off_t pos, uint8* buffer,
1022 	size_t* _length)
1023 {
1024 	if (pos < 0)
1025 		pos = 0;
1026 
1027 	// search in the small_data section (which has to be locked first)
1028 	{
1029 		NodeGetter node(fVolume, this);
1030 		RecursiveLocker locker(fSmallDataLock);
1031 
1032 		small_data* smallData = FindSmallData(node.Node(), name);
1033 		if (smallData != NULL) {
1034 			size_t length = *_length;
1035 			if (pos >= smallData->data_size) {
1036 				*_length = 0;
1037 				return B_OK;
1038 			}
1039 			if (length + pos > smallData->DataSize())
1040 				length = smallData->DataSize() - pos;
1041 
1042 			memcpy(buffer, smallData->Data() + pos, length);
1043 			*_length = length;
1044 			return B_OK;
1045 		}
1046 	}
1047 
1048 	// search in the attribute directory
1049 	Inode* attribute;
1050 	status_t status = GetAttribute(name, &attribute);
1051 	if (status == B_OK) {
1052 		status = attribute->ReadAt(pos, (uint8*)buffer, _length);
1053 
1054 		ReleaseAttribute(attribute);
1055 	}
1056 
1057 	RETURN_ERROR(status);
1058 }
1059 
1060 
1061 /*!	Writes data to the specified attribute.
1062 	This is a high-level attribute function that understands attributes
1063 	in the small_data section as well as real attribute files.
1064 */
1065 status_t
1066 Inode::WriteAttribute(Transaction& transaction, const char* name, int32 type,
1067 	off_t pos, const uint8* buffer, size_t* _length, bool* _created)
1068 {
1069 	if (pos < 0)
1070 		return B_BAD_VALUE;
1071 
1072 	// needed to maintain the index
1073 	uint8 oldBuffer[BPLUSTREE_MAX_KEY_LENGTH];
1074 	uint8* oldData = NULL;
1075 	size_t oldLength = 0;
1076 	bool created = false;
1077 
1078 	// TODO: we actually depend on that the contents of "buffer" are constant.
1079 	// If they get changed during the write (hey, user programs), we may mess
1080 	// up our index trees!
1081 	// TODO: for attribute files, we need to log the first
1082 	// BPLUSTREE_MAX_KEY_LENGTH bytes of the data stream, or the same as above
1083 	// might happen.
1084 
1085 	Index index(fVolume);
1086 	bool hasIndex = index.SetTo(name) == B_OK;
1087 
1088 	Inode* attribute = NULL;
1089 	status_t status = B_OK;
1090 
1091 	if (GetAttribute(name, &attribute) != B_OK) {
1092 		// No attribute inode exists yet
1093 
1094 		// save the old attribute data
1095 		NodeGetter node(fVolume, transaction, this);
1096 		recursive_lock_lock(&fSmallDataLock);
1097 
1098 		small_data* smallData = FindSmallData(node.Node(), name);
1099 		if (smallData != NULL) {
1100 			oldLength = smallData->DataSize();
1101 			if (oldLength > 0) {
1102 				if (oldLength > BPLUSTREE_MAX_KEY_LENGTH)
1103 					oldLength = BPLUSTREE_MAX_KEY_LENGTH;
1104 				memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
1105 			}
1106 		} else
1107 			created = true;
1108 
1109 		recursive_lock_unlock(&fSmallDataLock);
1110 
1111 		// if the attribute doesn't exist yet (as a file), try to put it in the
1112 		// small_data section first - if that fails (due to insufficent space),
1113 		// create a real attribute file
1114 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1115 			*_length);
1116 		if (status == B_DEVICE_FULL) {
1117 			if (smallData != NULL) {
1118 				// remove the old attribute from the small data section - there
1119 				// is no space left for the new data
1120 				status = _RemoveSmallData(transaction, node, name);
1121 			} else
1122 				status = B_OK;
1123 
1124 			if (status == B_OK)
1125 				status = CreateAttribute(transaction, name, type, &attribute);
1126 			if (status != B_OK)
1127 				RETURN_ERROR(status);
1128 
1129 			created = true;
1130 		} else if (status == B_OK) {
1131 			// Update status time on attribute write
1132 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1133 				bfs_inode::ToInode(real_time_clock_usecs()));
1134 
1135 			status = WriteBack(transaction);
1136 		}
1137 	}
1138 
1139 	if (attribute != NULL) {
1140 		WriteLocker writeLocker(attribute->fLock);
1141 
1142 		if (hasIndex || fVolume->CheckForLiveQuery(name)) {
1143 			// Save the old attribute data (if this fails, oldLength will
1144 			// reflect it)
1145 			while (attribute->Size() > 0) {
1146 				bigtime_t oldModified = attribute->LastModified();
1147 				writeLocker.Unlock();
1148 
1149 				oldLength = BPLUSTREE_MAX_KEY_LENGTH;
1150 				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
1151 					oldData = oldBuffer;
1152 
1153 				writeLocker.Lock();
1154 
1155 				// Read until the data hasn't changed in between
1156 				if (oldModified == attribute->LastModified())
1157 					break;
1158 
1159 				oldLength = 0;
1160 			}
1161 		}
1162 
1163 		// check if the data fits into the small_data section again
1164 		NodeGetter node(fVolume, transaction, this);
1165 		status = _AddSmallData(transaction, node, name, type, pos, buffer,
1166 			*_length);
1167 
1168 		if (status == B_OK) {
1169 			// it does - remove its file
1170 			writeLocker.Unlock();
1171 			status = _RemoveAttribute(transaction, name, false, NULL);
1172 		} else {
1173 			// The attribute type might have been changed - we need to
1174 			// adopt the new one
1175 			attribute->Node().type = HOST_ENDIAN_TO_BFS_INT32(type);
1176 			status = attribute->WriteBack(transaction);
1177 			writeLocker.Unlock();
1178 
1179 			if (status == B_OK) {
1180 				status = attribute->WriteAt(transaction, pos, buffer,
1181 					_length);
1182 			}
1183 		}
1184 
1185 		if (status == B_OK) {
1186 			// Update status time on attribute write
1187 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1188 				bfs_inode::ToInode(real_time_clock_usecs()));
1189 
1190 			status = WriteBack(transaction);
1191 		}
1192 
1193 		attribute->WriteLockInTransaction(transaction);
1194 		ReleaseAttribute(attribute);
1195 	}
1196 
1197 	// TODO: find a better way than this "pos" thing (the begin of the old key
1198 	//	must be copied to the start of the new one for a comparison)
1199 	if (status == B_OK && pos == 0) {
1200 		// index only the first BPLUSTREE_MAX_KEY_LENGTH bytes
1201 		uint16 length = *_length;
1202 		if (length > BPLUSTREE_MAX_KEY_LENGTH)
1203 			length = BPLUSTREE_MAX_KEY_LENGTH;
1204 
1205 		// Update index. Note, Index::Update() may be called even if
1206 		// initializing the index failed - it will just update the live
1207 		// queries in this case
1208 		if (pos < length || (uint64)pos < (uint64)oldLength) {
1209 			index.Update(transaction, name, type, oldData, oldLength, buffer,
1210 				length, this);
1211 		}
1212 	}
1213 
1214 	if (_created != NULL)
1215 		*_created = created;
1216 
1217 	return status;
1218 }
1219 
1220 
1221 /*!	Removes the specified attribute from the inode.
1222 	This is a high-level attribute function that understands attributes
1223 	in the small_data section as well as real attribute files.
1224 */
1225 status_t
1226 Inode::RemoveAttribute(Transaction& transaction, const char* name)
1227 {
1228 	Index index(fVolume);
1229 	bool hasIndex = index.SetTo(name) == B_OK;
1230 	NodeGetter node(fVolume, this);
1231 
1232 	// update index for attributes in the small_data section
1233 	{
1234 		RecursiveLocker _(fSmallDataLock);
1235 
1236 		small_data* smallData = FindSmallData(node.Node(), name);
1237 		if (smallData != NULL) {
1238 			uint32 length = smallData->DataSize();
1239 			if (length > BPLUSTREE_MAX_KEY_LENGTH)
1240 				length = BPLUSTREE_MAX_KEY_LENGTH;
1241 			index.Update(transaction, name, smallData->Type(),
1242 				smallData->Data(), length, NULL, 0, this);
1243 		}
1244 	}
1245 
1246 	status_t status = _RemoveSmallData(transaction, node, name);
1247 	if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
1248 		// remove the attribute file if it exists
1249 		status = _RemoveAttribute(transaction, name, hasIndex, &index);
1250 		if (status == B_OK) {
1251 			Node().status_change_time = HOST_ENDIAN_TO_BFS_INT64(
1252 				bfs_inode::ToInode(real_time_clock_usecs()));
1253 			WriteBack(transaction);
1254 		}
1255 	}
1256 
1257 	return status;
1258 }
1259 
1260 
1261 /*!	Returns the attribute inode with the specified \a name, in case it exists.
1262 	This method can only return real attribute files; the attributes in the
1263 	small data section are ignored.
1264 */
1265 status_t
1266 Inode::GetAttribute(const char* name, Inode** _attribute)
1267 {
1268 	// does this inode even have attributes?
1269 	if (Attributes().IsZero())
1270 		return B_ENTRY_NOT_FOUND;
1271 
1272 	Vnode vnode(fVolume, Attributes());
1273 	Inode* attributes;
1274 	if (vnode.Get(&attributes) < B_OK) {
1275 		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n",
1276 			name));
1277 		return B_ERROR;
1278 	}
1279 
1280 	BPlusTree* tree = attributes->Tree();
1281 	if (tree == NULL)
1282 		return B_BAD_VALUE;
1283 
1284 	InodeReadLocker locker(attributes);
1285 
1286 	ino_t id;
1287 	status_t status = tree->Find((uint8*)name, (uint16)strlen(name), &id);
1288 	if (status == B_OK) {
1289 		Vnode vnode(fVolume, id);
1290 		Inode* inode;
1291 		// Check if the attribute is really an attribute
1292 		if (vnode.Get(&inode) != B_OK || !inode->IsAttribute())
1293 			return B_ERROR;
1294 
1295 		*_attribute = inode;
1296 		vnode.Keep();
1297 		return B_OK;
1298 	}
1299 
1300 	return status;
1301 }
1302 
1303 
1304 void
1305 Inode::ReleaseAttribute(Inode* attribute)
1306 {
1307 	if (attribute == NULL)
1308 		return;
1309 
1310 	put_vnode(fVolume->FSVolume(), attribute->ID());
1311 }
1312 
1313 
1314 status_t
1315 Inode::CreateAttribute(Transaction& transaction, const char* name, uint32 type,
1316 	Inode** attribute)
1317 {
1318 	// do we need to create the attribute directory first?
1319 	if (Attributes().IsZero()) {
1320 		status_t status = Inode::Create(transaction, this, NULL,
1321 			S_ATTR_DIR | S_DIRECTORY | 0666, 0, 0, NULL);
1322 		if (status < B_OK)
1323 			RETURN_ERROR(status);
1324 	}
1325 	Vnode vnode(fVolume, Attributes());
1326 	Inode* attributes;
1327 	if (vnode.Get(&attributes) < B_OK)
1328 		return B_ERROR;
1329 
1330 	// Inode::Create() locks the inode for us
1331 	return Inode::Create(transaction, attributes, name,
1332 		S_ATTR | S_FILE | 0666, 0, type, NULL, NULL, attribute);
1333 }
1334 
1335 
1336 //	#pragma mark - directory tree
1337 
1338 
1339 bool
1340 Inode::IsEmpty()
1341 {
1342 	TreeIterator iterator(fTree);
1343 
1344 	// index and attribute directories are really empty when they are
1345 	// empty - directories for standard files always contain ".", and
1346 	// "..", so we need to ignore those two
1347 
1348 	uint32 count = 0;
1349 	char name[BPLUSTREE_MAX_KEY_LENGTH];
1350 	uint16 length;
1351 	ino_t id;
1352 	while (iterator.GetNextEntry(name, &length, B_FILE_NAME_LENGTH,
1353 			&id) == B_OK) {
1354 		if (Mode() & (S_ATTR_DIR | S_INDEX_DIR))
1355 			return false;
1356 
1357 		if (++count > 2 || (strcmp(".", name) && strcmp("..", name)))
1358 			return false;
1359 	}
1360 	return true;
1361 }
1362 
1363 
1364 status_t
1365 Inode::ContainerContentsChanged(Transaction& transaction)
1366 {
1367 	ASSERT(!InLastModifiedIndex());
1368 
1369 	Node().last_modified_time = Node().status_change_time
1370 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1371 
1372 	return WriteBack(transaction);
1373 }
1374 
1375 
1376 //	#pragma mark - data stream
1377 
1378 
1379 /*!	Computes the number of bytes this inode occupies in the file system.
1380 	This includes the file meta blocks used for maintaining its data stream.
1381 
1382 	TODO: However, the attributes in extra files are not really accounted for;
1383 	depending on the speed penalty, this should be changed, though (the value
1384 	could be cached in the inode structure or Inode object, though).
1385 */
1386 off_t
1387 Inode::AllocatedSize() const
1388 {
1389 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0) {
1390 		// This symlink does not have a data stream
1391 		return Node().InodeSize();
1392 	}
1393 
1394 	const data_stream& data = Node().data;
1395 	uint32 blockSize = fVolume->BlockSize();
1396 	off_t size = blockSize;
1397 
1398 	if (data.MaxDoubleIndirectRange() != 0) {
1399 		off_t doubleIndirectSize = data.MaxDoubleIndirectRange()
1400 			- data.MaxIndirectRange();
1401 		int32 indirectSize = double_indirect_max_indirect_size(
1402 			data.double_indirect.Length(), fVolume->BlockSize());
1403 
1404 		size += (2 * data.double_indirect.Length()
1405 				+ doubleIndirectSize / indirectSize)
1406 			* blockSize + data.MaxDoubleIndirectRange();
1407 	} else if (data.MaxIndirectRange() != 0)
1408 		size += data.indirect.Length() + data.MaxIndirectRange();
1409 	else
1410 		size += data.MaxDirectRange();
1411 
1412 	if (!Node().attributes.IsZero()) {
1413 		// TODO: to make this exact, we'd had to count all attributes
1414 		size += 2 * blockSize;
1415 			// 2 blocks, one for the attributes inode, one for its B+tree
1416 	}
1417 
1418 	return size;
1419 }
1420 
1421 
1422 /*!	Finds the block_run where "pos" is located in the data_stream of
1423 	the inode.
1424 	If successful, "offset" will then be set to the file offset
1425 	of the block_run returned; so "pos - offset" is for the block_run
1426 	what "pos" is for the whole stream.
1427 	The caller has to make sure that "pos" is inside the stream.
1428 */
1429 status_t
1430 Inode::FindBlockRun(off_t pos, block_run& run, off_t& offset)
1431 {
1432 	data_stream* data = &Node().data;
1433 
1434 	// find matching block run
1435 
1436 	if (data->MaxIndirectRange() > 0 && pos >= data->MaxDirectRange()) {
1437 		if (data->MaxDoubleIndirectRange() > 0
1438 			&& pos >= data->MaxIndirectRange()) {
1439 			// access to double indirect blocks
1440 
1441 			CachedBlock cached(fVolume);
1442 
1443 			int32 runsPerBlock;
1444 			int32 directSize;
1445 			int32 indirectSize;
1446 			get_double_indirect_sizes(data->double_indirect.Length(),
1447 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1448 
1449 			off_t start = pos - data->MaxIndirectRange();
1450 			int32 index = start / indirectSize;
1451 
1452 			block_run* indirect = (block_run*)cached.SetTo(
1453 				fVolume->ToBlock(data->double_indirect) + index / runsPerBlock);
1454 			if (indirect == NULL)
1455 				RETURN_ERROR(B_ERROR);
1456 
1457 			int32 current = (start % indirectSize) / directSize;
1458 
1459 			indirect = (block_run*)cached.SetTo(
1460 				fVolume->ToBlock(indirect[index % runsPerBlock])
1461 				+ current / runsPerBlock);
1462 			if (indirect == NULL)
1463 				RETURN_ERROR(B_ERROR);
1464 
1465 			run = indirect[current % runsPerBlock];
1466 			if (run.Length() != data->double_indirect.Length())
1467 				RETURN_ERROR(B_BAD_DATA);
1468 
1469 			offset = data->MaxIndirectRange() + (index * indirectSize)
1470 				+ (current * directSize);
1471 		} else {
1472 			// access to indirect blocks
1473 
1474 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1475 			off_t runBlockEnd = data->MaxDirectRange();
1476 
1477 			CachedBlock cached(fVolume);
1478 			off_t block = fVolume->ToBlock(data->indirect);
1479 
1480 			for (int32 i = 0; i < data->indirect.Length(); i++) {
1481 				block_run* indirect = (block_run*)cached.SetTo(block + i);
1482 				if (indirect == NULL)
1483 					RETURN_ERROR(B_IO_ERROR);
1484 
1485 				int32 current = -1;
1486 				while (++current < runsPerBlock) {
1487 					if (indirect[current].IsZero())
1488 						break;
1489 
1490 					runBlockEnd += indirect[current].Length()
1491 						<< cached.BlockShift();
1492 					if (runBlockEnd > pos) {
1493 						run = indirect[current];
1494 						offset = runBlockEnd - (run.Length()
1495 							<< cached.BlockShift());
1496 						return fVolume->ValidateBlockRun(run);
1497 					}
1498 				}
1499 			}
1500 			RETURN_ERROR(B_ERROR);
1501 		}
1502 	} else {
1503 		// access from direct blocks
1504 
1505 		off_t runBlockEnd = 0LL;
1506 		int32 current = -1;
1507 
1508 		while (++current < NUM_DIRECT_BLOCKS) {
1509 			if (data->direct[current].IsZero())
1510 				break;
1511 
1512 			runBlockEnd += data->direct[current].Length()
1513 				<< fVolume->BlockShift();
1514 			if (runBlockEnd > pos) {
1515 				run = data->direct[current];
1516 				offset = runBlockEnd - (run.Length() << fVolume->BlockShift());
1517 				return fVolume->ValidateBlockRun(run);
1518 			}
1519 		}
1520 
1521 		return B_ENTRY_NOT_FOUND;
1522 	}
1523 	return fVolume->ValidateBlockRun(run);
1524 }
1525 
1526 
1527 status_t
1528 Inode::ReadAt(off_t pos, uint8* buffer, size_t* _length)
1529 {
1530 	size_t length = *_length;
1531 
1532 	// set/check boundaries for pos/length
1533 	if (pos < 0)
1534 		return B_BAD_VALUE;
1535 
1536 	InodeReadLocker locker(this);
1537 
1538 	if (pos >= Size() || length == 0) {
1539 		*_length = 0;
1540 		return B_NO_ERROR;
1541 	}
1542 
1543 	locker.Unlock();
1544 
1545 	return file_cache_read(FileCache(), NULL, pos, buffer, _length);
1546 }
1547 
1548 
1549 status_t
1550 Inode::WriteAt(Transaction& transaction, off_t pos, const uint8* buffer,
1551 	size_t* _length)
1552 {
1553 	InodeReadLocker locker(this);
1554 
1555 	// update the last modification time in memory, it will be written
1556 	// back to the inode, and the index when the file is closed
1557 	// TODO: should update the internal last modified time only at this point!
1558 	Node().last_modified_time = Node().status_change_time
1559 		= HOST_ENDIAN_TO_BFS_INT64(bfs_inode::ToInode(real_time_clock_usecs()));
1560 
1561 	// TODO: support INODE_LOGGED!
1562 
1563 	size_t length = *_length;
1564 	bool changeSize = (uint64)pos + (uint64)length > (uint64)Size();
1565 
1566 	// set/check boundaries for pos/length
1567 	if (pos < 0)
1568 		return B_BAD_VALUE;
1569 
1570 	locker.Unlock();
1571 
1572 	// the transaction doesn't have to be started already
1573 	if (changeSize && !transaction.IsStarted())
1574 		transaction.Start(fVolume, BlockNumber());
1575 
1576 	WriteLocker writeLocker(fLock);
1577 
1578 	// Work around possible race condition: Someone might have shrunken the file
1579 	// while we had no lock.
1580 	if (!transaction.IsStarted()
1581 		&& (uint64)pos + (uint64)length > (uint64)Size()) {
1582 		writeLocker.Unlock();
1583 		transaction.Start(fVolume, BlockNumber());
1584 		writeLocker.Lock();
1585 	}
1586 
1587 	off_t oldSize = Size();
1588 
1589 	if ((uint64)pos + (uint64)length > (uint64)oldSize) {
1590 		// let's grow the data stream to the size needed
1591 		status_t status = SetFileSize(transaction, pos + length);
1592 		if (status != B_OK) {
1593 			*_length = 0;
1594 			WriteLockInTransaction(transaction);
1595 			RETURN_ERROR(status);
1596 		}
1597 		// TODO: In theory we would need to update the file size
1598 		// index here as part of the current transaction - this might just
1599 		// be a bit too expensive, but worth a try.
1600 
1601 		// we need to write back the inode here because it has to
1602 		// go into this transaction (we cannot wait until the file
1603 		// is closed)
1604 		status = WriteBack(transaction);
1605 		if (status != B_OK) {
1606 			WriteLockInTransaction(transaction);
1607 			return status;
1608 		}
1609 	}
1610 
1611 	writeLocker.Unlock();
1612 
1613 	if (oldSize < pos)
1614 		FillGapWithZeros(oldSize, pos);
1615 
1616 	// If we don't want to write anything, we can now return (we may
1617 	// just have changed the file size using the position parameter)
1618 	if (length == 0)
1619 		return B_OK;
1620 
1621 	status_t status = file_cache_write(FileCache(), NULL, pos, buffer, _length);
1622 
1623 	if (transaction.IsStarted())
1624 		WriteLockInTransaction(transaction);
1625 
1626 	return status;
1627 }
1628 
1629 
1630 /*!	Fills the gap between the old file size and the new file size
1631 	with zeros.
1632 	It's more or less a copy of Inode::WriteAt() but it can handle
1633 	length differences of more than just 4 GB, and it never uses
1634 	the log, even if the INODE_LOGGED flag is set.
1635 */
1636 status_t
1637 Inode::FillGapWithZeros(off_t pos, off_t newSize)
1638 {
1639 	while (pos < newSize) {
1640 		size_t size;
1641 		if (newSize > pos + 1024 * 1024 * 1024)
1642 			size = 1024 * 1024 * 1024;
1643 		else
1644 			size = newSize - pos;
1645 
1646 		status_t status = file_cache_write(FileCache(), NULL, pos, NULL, &size);
1647 		if (status < B_OK)
1648 			return status;
1649 
1650 		pos += size;
1651 	}
1652 
1653 	return B_OK;
1654 }
1655 
1656 
1657 /*!	Allocates \a length blocks, and clears their contents. Growing
1658 	the indirect and double indirect range uses this method.
1659 	The allocated block_run is saved in "run"
1660 */
1661 status_t
1662 Inode::_AllocateBlockArray(Transaction& transaction, block_run& run,
1663 	size_t length, bool variableSize)
1664 {
1665 	if (!run.IsZero())
1666 		return B_BAD_VALUE;
1667 
1668 	status_t status = fVolume->Allocate(transaction, this, length, run,
1669 		variableSize ? 1 : length);
1670 	if (status != B_OK)
1671 		return status;
1672 
1673 	// make sure those blocks are empty
1674 	CachedBlock cached(fVolume);
1675 	off_t block = fVolume->ToBlock(run);
1676 
1677 	for (int32 i = 0; i < run.Length(); i++) {
1678 		block_run* runs = (block_run*)cached.SetToWritable(transaction,
1679 			block + i, true);
1680 		if (runs == NULL)
1681 			return B_IO_ERROR;
1682 	}
1683 	return B_OK;
1684 }
1685 
1686 
1687 /*!	Grows the stream to \a size, and fills the direct/indirect/double indirect
1688 	ranges with the runs.
1689 	This method will also determine the size of the preallocation, if any.
1690 */
1691 status_t
1692 Inode::_GrowStream(Transaction& transaction, off_t size)
1693 {
1694 	data_stream* data = &Node().data;
1695 
1696 	// is the data stream already large enough to hold the new size?
1697 	// (can be the case with preallocated blocks)
1698 	if (size < data->MaxDirectRange()
1699 		|| size < data->MaxIndirectRange()
1700 		|| size < data->MaxDoubleIndirectRange()) {
1701 		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1702 		return B_OK;
1703 	}
1704 
1705 	// how many bytes are still needed? (unused ranges are always zero)
1706 	uint16 minimum = 1;
1707 	off_t bytes;
1708 	if (data->Size() < data->MaxDoubleIndirectRange()) {
1709 		bytes = size - data->MaxDoubleIndirectRange();
1710 		// The double indirect range can only handle multiples of
1711 		// its base length
1712 		minimum = data->double_indirect.Length();
1713 	} else if (data->Size() < data->MaxIndirectRange())
1714 		bytes = size - data->MaxIndirectRange();
1715 	else if (data->Size() < data->MaxDirectRange())
1716 		bytes = size - data->MaxDirectRange();
1717 	else {
1718 		// no preallocation left to be used
1719 		bytes = size - data->Size();
1720 		if (data->MaxDoubleIndirectRange() > 0)
1721 			minimum = data->double_indirect.Length();
1722 	}
1723 
1724 	// do we have enough free blocks on the disk?
1725 	off_t blocksNeeded = (bytes + fVolume->BlockSize() - 1)
1726 		>> fVolume->BlockShift();
1727 	if (blocksNeeded > fVolume->FreeBlocks())
1728 		return B_DEVICE_FULL;
1729 
1730 	off_t blocksRequested = blocksNeeded;
1731 		// because of preallocations and partial allocations, the number of
1732 		// blocks we need to allocate may be different from the one we request
1733 		// from the block allocator
1734 
1735 	// Should we preallocate some blocks?
1736 	// Attributes, attribute directories, and long symlinks usually won't get
1737 	// that big, and should stay close to the inode - preallocating could be
1738 	// counterproductive.
1739 	// Also, if free disk space is tight, don't preallocate.
1740 	if (!IsAttribute() && !IsAttributeDirectory() && !IsSymLink()
1741 		&& fVolume->FreeBlocks() > 128) {
1742 		off_t roundTo = 0;
1743 		if (IsFile()) {
1744 			// Request preallocated blocks depending on the file size and growth
1745 			if (size < 1 * 1024 * 1024 && bytes < 512 * 1024) {
1746 				// Preallocate 64 KB for file sizes <1 MB and grow rates <512 KB
1747 				roundTo = 65536 >> fVolume->BlockShift();
1748 			} else if (size < 32 * 1024 * 1024 && bytes <= 1 * 1024 * 1024) {
1749 				// Preallocate 512 KB for file sizes between 1 MB and 32 MB, and
1750 				// grow rates smaller than 1 MB
1751 				roundTo = (512 * 1024) >> fVolume->BlockShift();
1752 			} else {
1753 				// Preallocate 1/16 of the file size (ie. 4 MB for 64 MB,
1754 				// 64 MB for 1 GB)
1755 				roundTo = size >> (fVolume->BlockShift() + 4);
1756 			}
1757 		} else if (IsIndex()) {
1758 			// Always preallocate 64 KB for index directories
1759 			roundTo = 65536 >> fVolume->BlockShift();
1760 		} else {
1761 			// Preallocate only 4 KB - directories only get trimmed when their
1762 			// vnode is flushed, which might not happen very often.
1763 			roundTo = 4096 >> fVolume->BlockShift();
1764 		}
1765 		if (roundTo > 1) {
1766 			// Round to next "roundTo" block count
1767 			blocksRequested = ((blocksNeeded + roundTo) / roundTo) * roundTo;
1768 		}
1769 	}
1770 
1771 	while (blocksNeeded > 0) {
1772 		// the requested blocks do not need to be returned with a
1773 		// single allocation, so we need to iterate until we have
1774 		// enough blocks allocated
1775 		if (minimum > 1) {
1776 			// make sure that "blocks" is a multiple of minimum
1777 			blocksRequested = round_up(blocksRequested, minimum);
1778 		}
1779 
1780 		block_run run;
1781 		status_t status = fVolume->Allocate(transaction, this, blocksRequested,
1782 			run, minimum);
1783 		if (status != B_OK)
1784 			return status;
1785 
1786 		// okay, we have the needed blocks, so just distribute them to the
1787 		// different ranges of the stream (direct, indirect & double indirect)
1788 
1789 		blocksNeeded -= run.Length();
1790 		// don't preallocate if the first allocation was already too small
1791 		blocksRequested = blocksNeeded;
1792 
1793 		// Direct block range
1794 
1795 		if (data->Size() <= data->MaxDirectRange()) {
1796 			// let's try to put them into the direct block range
1797 			int32 free = 0;
1798 			for (; free < NUM_DIRECT_BLOCKS; free++) {
1799 				if (data->direct[free].IsZero())
1800 					break;
1801 			}
1802 
1803 			if (free < NUM_DIRECT_BLOCKS) {
1804 				// can we merge the last allocated run with the new one?
1805 				int32 last = free - 1;
1806 				if (free > 0 && data->direct[last].MergeableWith(run)) {
1807 					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(
1808 						data->direct[last].Length() + run.Length());
1809 				} else
1810 					data->direct[free] = run;
1811 
1812 				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(
1813 					data->MaxDirectRange()
1814 					+ run.Length() * fVolume->BlockSize());
1815 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1816 					? data->max_direct_range : size);
1817 				continue;
1818 			}
1819 		}
1820 
1821 		// Indirect block range
1822 
1823 		if (data->Size() <= data->MaxIndirectRange()
1824 			|| !data->MaxIndirectRange()) {
1825 			CachedBlock cached(fVolume);
1826 			block_run* runs = NULL;
1827 			uint32 free = 0;
1828 			off_t block;
1829 
1830 			// if there is no indirect block yet, create one
1831 			if (data->indirect.IsZero()) {
1832 				status = _AllocateBlockArray(transaction, data->indirect,
1833 					NUM_ARRAY_BLOCKS, true);
1834 				if (status != B_OK)
1835 					return status;
1836 
1837 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1838 					data->MaxDirectRange());
1839 				// insert the block_run in the first block
1840 				runs = (block_run*)cached.SetTo(data->indirect);
1841 			} else {
1842 				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1843 				block = fVolume->ToBlock(data->indirect);
1844 
1845 				// search first empty entry
1846 				int32 i = 0;
1847 				for (; i < data->indirect.Length(); i++) {
1848 					if ((runs = (block_run*)cached.SetTo(block + i)) == NULL)
1849 						return B_IO_ERROR;
1850 
1851 					for (free = 0; free < numberOfRuns; free++)
1852 						if (runs[free].IsZero())
1853 							break;
1854 
1855 					if (free < numberOfRuns)
1856 						break;
1857 				}
1858 				if (i == data->indirect.Length())
1859 					runs = NULL;
1860 			}
1861 
1862 			if (runs != NULL) {
1863 				// try to insert the run to the last one - note that this
1864 				// doesn't take block borders into account, so it could be
1865 				// further optimized
1866 				cached.MakeWritable(transaction);
1867 
1868 				int32 last = free - 1;
1869 				if (free > 0 && runs[last].MergeableWith(run)) {
1870 					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(
1871 						runs[last].Length() + run.Length());
1872 				} else
1873 					runs[free] = run;
1874 
1875 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1876 					data->MaxIndirectRange()
1877 					+ (run.Length() << fVolume->BlockShift()));
1878 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1879 					? data->MaxIndirectRange() : size);
1880 				continue;
1881 			}
1882 		}
1883 
1884 		// Double indirect block range
1885 
1886 		if (data->Size() <= data->MaxDoubleIndirectRange()
1887 			|| !data->max_double_indirect_range) {
1888 			// We make sure here that we have this minimum allocated, so if
1889 			// the allocation succeeds, we don't run into an endless loop.
1890 			if (!data->max_double_indirect_range)
1891 				minimum = _DoubleIndirectBlockLength();
1892 			else
1893 				minimum = data->double_indirect.Length();
1894 
1895 			if ((run.Length() % minimum) != 0) {
1896 				// The number of allocated blocks isn't a multiple of 'minimum',
1897 				// so we have to change this. This can happen the first time the
1898 				// stream grows into the double indirect range.
1899 				// First, free the remaining blocks that don't fit into this
1900 				// multiple.
1901 				int32 rest = run.Length() % minimum;
1902 				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1903 
1904 				status = fVolume->Free(transaction,
1905 					block_run::Run(run.AllocationGroup(),
1906 					run.Start() + run.Length(), rest));
1907 				if (status != B_OK)
1908 					return status;
1909 
1910 				blocksNeeded += rest;
1911 				blocksRequested = round_up(blocksNeeded, minimum);
1912 
1913 				// Are there any blocks left in the run? If not, allocate
1914 				// a new one
1915 				if (run.length == 0)
1916 					continue;
1917 			}
1918 
1919 			// if there is no double indirect block yet, create one
1920 			if (data->double_indirect.IsZero()) {
1921 				status = _AllocateBlockArray(transaction,
1922 					data->double_indirect, _DoubleIndirectBlockLength());
1923 				if (status != B_OK)
1924 					return status;
1925 
1926 				data->max_double_indirect_range = data->max_indirect_range;
1927 			}
1928 
1929 			// calculate the index where to insert the new blocks
1930 
1931 			int32 runsPerBlock;
1932 			int32 directSize;
1933 			int32 indirectSize;
1934 			get_double_indirect_sizes(data->double_indirect.Length(),
1935 				fVolume->BlockSize(), runsPerBlock, directSize, indirectSize);
1936 
1937 			off_t start = data->MaxDoubleIndirectRange()
1938 				- data->MaxIndirectRange();
1939 			int32 indirectIndex = start / indirectSize;
1940 			int32 index = (start % indirectSize) / directSize;
1941 			int32 runsPerArray = runsPerBlock * minimum;
1942 
1943 			// distribute the blocks to the array and allocate
1944 			// new array blocks when needed
1945 
1946 			CachedBlock cached(fVolume);
1947 			CachedBlock cachedDirect(fVolume);
1948 			block_run* array = NULL;
1949 			uint32 runLength = run.Length();
1950 
1951 			while (run.length != 0) {
1952 				// get the indirect array block
1953 				if (array == NULL) {
1954 					uint32 block = indirectIndex / runsPerBlock;
1955 					if (block >= minimum)
1956 						return EFBIG;
1957 
1958 					array = (block_run*)cached.SetTo(fVolume->ToBlock(
1959 						data->double_indirect) + block);
1960 					if (array == NULL)
1961 						return B_IO_ERROR;
1962 				}
1963 
1964 				do {
1965 					// do we need a new array block?
1966 					if (array[indirectIndex % runsPerBlock].IsZero()) {
1967 						cached.MakeWritable(transaction);
1968 
1969 						status = _AllocateBlockArray(transaction,
1970 							array[indirectIndex % runsPerBlock],
1971 							data->double_indirect.Length());
1972 						if (status != B_OK)
1973 							return status;
1974 					}
1975 
1976 					block_run* runs = (block_run*)cachedDirect.SetToWritable(
1977 						transaction, fVolume->ToBlock(array[indirectIndex
1978 							% runsPerBlock]) + index / runsPerBlock);
1979 					if (runs == NULL)
1980 						return B_IO_ERROR;
1981 
1982 					do {
1983 						// insert the block_run into the array
1984 						runs[index % runsPerBlock] = run;
1985 						runs[index % runsPerBlock].length
1986 							= HOST_ENDIAN_TO_BFS_INT16(minimum);
1987 
1988 						// alter the remaining block_run
1989 						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start()
1990 							+ minimum);
1991 						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
1992 							- minimum);
1993 					} while ((++index % runsPerBlock) != 0 && run.length);
1994 				} while ((index % runsPerArray) != 0 && run.length);
1995 
1996 				if (index == runsPerArray)
1997 					index = 0;
1998 				if (++indirectIndex % runsPerBlock == 0) {
1999 					array = NULL;
2000 					index = 0;
2001 				}
2002 			}
2003 
2004 			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
2005 				data->MaxDoubleIndirectRange()
2006 				+ (runLength << fVolume->BlockShift()));
2007 			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(
2008 				data->max_double_indirect_range) : size;
2009 
2010 			continue;
2011 		}
2012 
2013 		RETURN_ERROR(EFBIG);
2014 	}
2015 	// update the size of the data stream
2016 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2017 
2018 	return B_OK;
2019 }
2020 
2021 
2022 size_t
2023 Inode::_DoubleIndirectBlockLength() const
2024 {
2025 	if (fVolume->BlockSize() > DOUBLE_INDIRECT_ARRAY_SIZE)
2026 		return 1;
2027 
2028 	return DOUBLE_INDIRECT_ARRAY_SIZE / fVolume->BlockSize();
2029 }
2030 
2031 
2032 /*!	Frees the statically sized array of the double indirect part of a data
2033 	stream.
2034 */
2035 status_t
2036 Inode::_FreeStaticStreamArray(Transaction& transaction, int32 level,
2037 	block_run run, off_t size, off_t offset, off_t& max)
2038 {
2039 	int32 indirectSize;
2040 	if (level == 0) {
2041 		indirectSize = double_indirect_max_indirect_size(run.Length(),
2042 			fVolume->BlockSize());
2043 	} else {
2044 		indirectSize = double_indirect_max_direct_size(run.Length(),
2045 			fVolume->BlockSize());
2046 	}
2047 
2048 	off_t start;
2049 	if (size > offset)
2050 		start = size - offset;
2051 	else
2052 		start = 0;
2053 
2054 	int32 index = start / indirectSize;
2055 	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
2056 
2057 	CachedBlock cached(fVolume);
2058 	off_t blockNumber = fVolume->ToBlock(run);
2059 
2060 	// set the file offset to the current block run
2061 	offset += (off_t)index * indirectSize;
2062 
2063 	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
2064 		block_run* array = (block_run*)cached.SetToWritable(transaction,
2065 			blockNumber + i);
2066 		if (array == NULL)
2067 			RETURN_ERROR(B_ERROR);
2068 
2069 		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
2070 			if (array[index].IsZero()) {
2071 				// we also want to break out of the outer loop
2072 				i = run.Length();
2073 				break;
2074 			}
2075 
2076 			status_t status = B_OK;
2077 			if (level == 0) {
2078 				status = _FreeStaticStreamArray(transaction, 1, array[index],
2079 					size, offset, max);
2080 			} else if (offset >= size)
2081 				status = fVolume->Free(transaction, array[index]);
2082 			else
2083 				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
2084 
2085 			if (status < B_OK)
2086 				RETURN_ERROR(status);
2087 
2088 			if (offset >= size)
2089 				array[index].SetTo(0, 0, 0);
2090 
2091 			offset += indirectSize;
2092 		}
2093 		index = 0;
2094 	}
2095 	return B_OK;
2096 }
2097 
2098 
2099 /*!	Frees all block_runs in the array which come after the specified size.
2100 	It also trims the last block_run that contain the size.
2101 	"offset" and "max" are maintained until the last block_run that doesn't
2102 	have to be freed - after this, the values won't be correct anymore, but
2103 	will still assure correct function for all subsequent calls.
2104 	"max" is considered to be in file system byte order.
2105 */
2106 status_t
2107 Inode::_FreeStreamArray(Transaction& transaction, block_run* array,
2108 	uint32 arrayLength, off_t size, off_t& offset, off_t& max)
2109 {
2110 	PRINT(("FreeStreamArray: arrayLength %lu, size %Ld, offset %Ld, max %Ld\n",
2111 		arrayLength, size, offset, max));
2112 
2113 	off_t newOffset = offset;
2114 	uint32 i = 0;
2115 	for (; i < arrayLength; i++, offset = newOffset) {
2116 		if (array[i].IsZero())
2117 			break;
2118 
2119 		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
2120 		if (newOffset <= size)
2121 			continue;
2122 
2123 		block_run run = array[i];
2124 
2125 		// determine the block_run to be freed
2126 		if (newOffset > size && offset < size) {
2127 			// free partial block_run (and update the original block_run)
2128 			run.start = HOST_ENDIAN_TO_BFS_INT16(array[i].Start()
2129 				+ ((size + fVolume->BlockSize() - 1 - offset)
2130 					>> fVolume->BlockShift()));
2131 				// round to next block
2132 			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start()
2133 				- array[i].Start());
2134 			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
2135 				- array[i].Length());
2136 
2137 			if (run.length == 0)
2138 				continue;
2139 
2140 			// update maximum range
2141 			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length()
2142 				<< fVolume->BlockShift()));
2143 		} else {
2144 			// free the whole block_run
2145 			array[i].SetTo(0, 0, 0);
2146 
2147 			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
2148 				max = HOST_ENDIAN_TO_BFS_INT64(offset);
2149 		}
2150 
2151 		if (fVolume->Free(transaction, run) < B_OK)
2152 			return B_IO_ERROR;
2153 	}
2154 	return B_OK;
2155 }
2156 
2157 
2158 status_t
2159 Inode::_ShrinkStream(Transaction& transaction, off_t size)
2160 {
2161 	data_stream* data = &Node().data;
2162 	status_t status;
2163 
2164 	if (data->MaxDoubleIndirectRange() > size) {
2165 		off_t* maxDoubleIndirect = &data->max_double_indirect_range;
2166 			// gcc 4 work-around: "error: cannot bind packed field
2167 			// 'data->data_stream::max_double_indirect_range' to 'off_t&'"
2168 		status = _FreeStaticStreamArray(transaction, 0, data->double_indirect,
2169 			size, data->MaxIndirectRange(), *maxDoubleIndirect);
2170 		if (status != B_OK)
2171 			return status;
2172 
2173 		if (size <= data->MaxIndirectRange()) {
2174 			fVolume->Free(transaction, data->double_indirect);
2175 			data->double_indirect.SetTo(0, 0, 0);
2176 			data->max_double_indirect_range = 0;
2177 		}
2178 	}
2179 
2180 	if (data->MaxIndirectRange() > size) {
2181 		CachedBlock cached(fVolume);
2182 		off_t block = fVolume->ToBlock(data->indirect);
2183 		off_t offset = data->MaxDirectRange();
2184 
2185 		for (int32 i = 0; i < data->indirect.Length(); i++) {
2186 			block_run* array = (block_run*)cached.SetToWritable(transaction,
2187 				block + i);
2188 			if (array == NULL)
2189 				break;
2190 
2191 			off_t* maxIndirect = &data->max_indirect_range;
2192 				// gcc 4 work-around: "error: cannot bind packed field
2193 				// 'data->data_stream::max_indirect_range' to 'off_t&'"
2194 			if (_FreeStreamArray(transaction, array, fVolume->BlockSize()
2195 					/ sizeof(block_run), size, offset, *maxIndirect) != B_OK)
2196 				return B_IO_ERROR;
2197 		}
2198 		if (data->max_direct_range == data->max_indirect_range) {
2199 			fVolume->Free(transaction, data->indirect);
2200 			data->indirect.SetTo(0, 0, 0);
2201 			data->max_indirect_range = 0;
2202 		}
2203 	}
2204 
2205 	if (data->MaxDirectRange() > size) {
2206 		off_t offset = 0;
2207 		off_t *maxDirect = &data->max_direct_range;
2208 			// gcc 4 work-around: "error: cannot bind packed field
2209 			// 'data->data_stream::max_direct_range' to 'off_t&'"
2210 		status = _FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS,
2211 			size, offset, *maxDirect);
2212 		if (status < B_OK)
2213 			return status;
2214 	}
2215 
2216 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
2217 	return B_OK;
2218 }
2219 
2220 
2221 status_t
2222 Inode::SetFileSize(Transaction& transaction, off_t size)
2223 {
2224 	if (size < 0)
2225 		return B_BAD_VALUE;
2226 
2227 	off_t oldSize = Size();
2228 
2229 	if (size == oldSize)
2230 		return B_OK;
2231 
2232 	T(Resize(this, oldSize, size, false));
2233 
2234 	// should the data stream grow or shrink?
2235 	status_t status;
2236 	if (size > oldSize) {
2237 		status = _GrowStream(transaction, size);
2238 		if (status < B_OK) {
2239 			// if the growing of the stream fails, the whole operation
2240 			// fails, so we should shrink the stream to its former size
2241 			_ShrinkStream(transaction, oldSize);
2242 		}
2243 	} else
2244 		status = _ShrinkStream(transaction, size);
2245 
2246 	if (status < B_OK)
2247 		return status;
2248 
2249 	file_cache_set_size(FileCache(), size);
2250 	file_map_set_size(Map(), size);
2251 
2252 	return WriteBack(transaction);
2253 }
2254 
2255 
2256 status_t
2257 Inode::Append(Transaction& transaction, off_t bytes)
2258 {
2259 	return SetFileSize(transaction, Size() + bytes);
2260 }
2261 
2262 
2263 /*!	Checks whether or not this inode's data stream needs to be trimmed
2264 	because of an earlier preallocation.
2265 	Returns true if there are any blocks to be trimmed.
2266 */
2267 bool
2268 Inode::NeedsTrimming() const
2269 {
2270 	// We never trim preallocated index blocks to make them grow as smooth as
2271 	// possible. There are only few indices anyway, so this doesn't hurt.
2272 	// Also, if an inode is already in deleted state, we don't bother trimming
2273 	// it.
2274 	if (IsIndex() || IsDeleted()
2275 		|| (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0))
2276 		return false;
2277 
2278 	off_t roundedSize = round_up(Size(), fVolume->BlockSize());
2279 
2280 	return Node().data.MaxDirectRange() > roundedSize
2281 		|| Node().data.MaxIndirectRange() > roundedSize
2282 		|| Node().data.MaxDoubleIndirectRange() > roundedSize;
2283 }
2284 
2285 
2286 status_t
2287 Inode::TrimPreallocation(Transaction& transaction)
2288 {
2289 	T(Resize(this, max_c(Node().data.MaxDirectRange(),
2290 		Node().data.MaxIndirectRange()), Size(), true));
2291 
2292 	status_t status = _ShrinkStream(transaction, Size());
2293 	if (status < B_OK)
2294 		return status;
2295 
2296 	return WriteBack(transaction);
2297 }
2298 
2299 
2300 //!	Frees the file's data stream and removes all attributes
2301 status_t
2302 Inode::Free(Transaction& transaction)
2303 {
2304 	FUNCTION();
2305 
2306 	// Perhaps there should be an implementation of Inode::ShrinkStream() that
2307 	// just frees the data_stream, but doesn't change the inode (since it is
2308 	// freed anyway) - that would make an undelete command possible
2309 	if (!IsSymLink() || (Flags() & INODE_LONG_SYMLINK) != 0) {
2310 		status_t status = SetFileSize(transaction, 0);
2311 		if (status < B_OK)
2312 			return status;
2313 	}
2314 
2315 	// Free all attributes, and remove their indices
2316 	{
2317 		// We have to limit the scope of AttributeIterator, so that its
2318 		// destructor is not called after the inode is deleted
2319 		AttributeIterator iterator(this);
2320 
2321 		char name[B_FILE_NAME_LENGTH];
2322 		uint32 type;
2323 		size_t length;
2324 		ino_t id;
2325 		while (iterator.GetNext(name, &length, &type, &id) == B_OK) {
2326 			RemoveAttribute(transaction, name);
2327 		}
2328 	}
2329 
2330 	if (WriteBack(transaction) < B_OK)
2331 		return B_IO_ERROR;
2332 
2333 	return fVolume->Free(transaction, BlockRun());
2334 }
2335 
2336 
2337 //!	Synchronizes (writes back to disk) the file stream of the inode.
2338 status_t
2339 Inode::Sync()
2340 {
2341 	if (FileCache())
2342 		return file_cache_sync(FileCache());
2343 
2344 	// We may also want to flush the attribute's data stream to
2345 	// disk here... (do we?)
2346 
2347 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0)
2348 		return B_OK;
2349 
2350 	InodeReadLocker locker(this);
2351 
2352 	data_stream* data = &Node().data;
2353 	status_t status = B_OK;
2354 
2355 	// flush direct range
2356 
2357 	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
2358 		if (data->direct[i].IsZero())
2359 			return B_OK;
2360 
2361 		status = block_cache_sync_etc(fVolume->BlockCache(),
2362 			fVolume->ToBlock(data->direct[i]), data->direct[i].Length());
2363 		if (status != B_OK)
2364 			return status;
2365 	}
2366 
2367 	// flush indirect range
2368 
2369 	if (data->max_indirect_range == 0)
2370 		return B_OK;
2371 
2372 	CachedBlock cached(fVolume);
2373 	off_t block = fVolume->ToBlock(data->indirect);
2374 	int32 count = fVolume->BlockSize() / sizeof(block_run);
2375 
2376 	for (int32 j = 0; j < data->indirect.Length(); j++) {
2377 		block_run* runs = (block_run*)cached.SetTo(block + j);
2378 		if (runs == NULL)
2379 			break;
2380 
2381 		for (int32 i = 0; i < count; i++) {
2382 			if (runs[i].IsZero())
2383 				return B_OK;
2384 
2385 			status = block_cache_sync_etc(fVolume->BlockCache(),
2386 				fVolume->ToBlock(runs[i]), runs[i].Length());
2387 			if (status != B_OK)
2388 				return status;
2389 		}
2390 	}
2391 
2392 	// flush double indirect range
2393 
2394 	if (data->max_double_indirect_range == 0)
2395 		return B_OK;
2396 
2397 	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
2398 
2399 	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
2400 		block_run* indirectRuns = (block_run*)cached.SetTo(indirectBlock + l);
2401 		if (indirectRuns == NULL)
2402 			return B_FILE_ERROR;
2403 
2404 		CachedBlock directCached(fVolume);
2405 
2406 		for (int32 k = 0; k < count; k++) {
2407 			if (indirectRuns[k].IsZero())
2408 				return B_OK;
2409 
2410 			block = fVolume->ToBlock(indirectRuns[k]);
2411 			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
2412 				block_run* runs = (block_run*)directCached.SetTo(block + j);
2413 				if (runs == NULL)
2414 					return B_FILE_ERROR;
2415 
2416 				for (int32 i = 0; i < count; i++) {
2417 					if (runs[i].IsZero())
2418 						return B_OK;
2419 
2420 					// TODO: combine single block_runs to bigger ones when
2421 					// they are adjacent
2422 					status = block_cache_sync_etc(fVolume->BlockCache(),
2423 						fVolume->ToBlock(runs[i]), runs[i].Length());
2424 					if (status != B_OK)
2425 						return status;
2426 				}
2427 			}
2428 		}
2429 	}
2430 	return B_OK;
2431 }
2432 
2433 
2434 //	#pragma mark - TransactionListener implementation
2435 
2436 
2437 void
2438 Inode::TransactionDone(bool success)
2439 {
2440 	if (!success) {
2441 		// revert any changes made to the cached bfs_inode
2442 		UpdateNodeFromDisk();
2443 	}
2444 }
2445 
2446 
2447 void
2448 Inode::RemovedFromTransaction()
2449 {
2450 	Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
2451 
2452 	// See AddInode() why we do this here
2453 	if ((Flags() & INODE_DELETED) != 0)
2454 		fVolume->RemovedInodes().Add(this);
2455 
2456 	rw_lock_write_unlock(&Lock());
2457 
2458 	if (!fVolume->IsInitializing())
2459 		put_vnode(fVolume->FSVolume(), ID());
2460 }
2461 
2462 
2463 //	#pragma mark - creation/deletion
2464 
2465 
2466 status_t
2467 Inode::Remove(Transaction& transaction, const char* name, ino_t* _id,
2468 	bool isDirectory, bool force)
2469 {
2470 	if (fTree == NULL)
2471 		RETURN_ERROR(B_BAD_VALUE);
2472 
2473 	WriteLockInTransaction(transaction);
2474 
2475 	// does the file even exist?
2476 	off_t id;
2477 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) < B_OK)
2478 		return B_ENTRY_NOT_FOUND;
2479 
2480 	if (_id)
2481 		*_id = id;
2482 
2483 	Vnode vnode(fVolume, id);
2484 	Inode* inode;
2485 	status_t status = vnode.Get(&inode);
2486 	if (status < B_OK) {
2487 		REPORT_ERROR(status);
2488 		return B_ENTRY_NOT_FOUND;
2489 	}
2490 
2491 	T(Remove(inode, name));
2492 	inode->WriteLockInTransaction(transaction);
2493 
2494 	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
2495 	// bit is set for indices in BFS, not for attribute directories) - but you
2496 	// should really be able to do whatever you want with your indices
2497 	// without having to remove all files first :)
2498 	if (!inode->IsIndex() && !force) {
2499 		// if it's not of the correct type, don't delete it!
2500 		if (inode->IsContainer() != isDirectory)
2501 			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
2502 
2503 		// only delete empty directories
2504 		if (isDirectory && !inode->IsEmpty())
2505 			return B_DIRECTORY_NOT_EMPTY;
2506 	}
2507 
2508 	// remove_vnode() allows the inode to be accessed until the last put_vnode()
2509 	status = remove_vnode(fVolume->FSVolume(), id);
2510 	if (status != B_OK)
2511 		return status;
2512 
2513 	if (fTree->Remove(transaction, name, id) != B_OK && !force) {
2514 		unremove_vnode(fVolume->FSVolume(), id);
2515 		RETURN_ERROR(B_ERROR);
2516 	}
2517 
2518 #ifdef DEBUG
2519 	if (fTree->Find((uint8*)name, (uint16)strlen(name), &id) == B_OK) {
2520 		DIE(("deleted entry still there"));
2521 	}
2522 #endif
2523 
2524 	ContainerContentsChanged(transaction);
2525 
2526 	// update the inode, so that no one will ever doubt it's deleted :-)
2527 	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
2528 	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
2529 
2530 	// In balance to the Inode::Create() method, the main indices
2531 	// are updated here (name, size, & last_modified)
2532 
2533 	Index index(fVolume);
2534 	if (inode->InNameIndex()) {
2535 		index.RemoveName(transaction, name, inode);
2536 			// If removing from the index fails, it is not regarded as a
2537 			// fatal error and will not be reported back!
2538 			// Deleted inodes won't be visible in queries anyway.
2539 	}
2540 
2541 	if (inode->InSizeIndex())
2542 		index.RemoveSize(transaction, inode);
2543 	if (inode->InLastModifiedIndex())
2544 		index.RemoveLastModified(transaction, inode);
2545 
2546 	return inode->WriteBack(transaction);
2547 }
2548 
2549 
2550 /*!	Creates the inode with the specified \a parent directory, and automatically
2551 	adds the created inode to that parent directory. If an attribute directory
2552 	is created, it will also automatically  be added to the \a parent inode as
2553 	such. However, the indices root node, and the regular root node won't be
2554 	added to the super block.
2555 	It will also create the initial B+tree for the inode if it's a directory
2556 	of any kind.
2557 	\a name may be \c NULL, but only if no \a parent is given.
2558 	If the "_id" or "_inode" variable is given and non-NULL to store the
2559 	inode's ID, the inode stays locked - you have to call put_vnode() if you
2560 	don't use it anymore.
2561 	If the node already exists, this method will fail if \c O_EXCL is set, or
2562 	it's a directory or a symlink. Otherwise, it will just be returned.
2563 	If \c O_TRUNC has been specified, the file will also be truncated.
2564 */
2565 status_t
2566 Inode::Create(Transaction& transaction, Inode* parent, const char* name,
2567 	int32 mode, int openMode, uint32 type, bool* _created, ino_t* _id,
2568 	Inode** _inode, fs_vnode_ops* vnodeOps, uint32 publishFlags)
2569 {
2570 	FUNCTION_START(("name = %s, mode = %ld\n", name, mode));
2571 
2572 	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
2573 	Volume* volume = transaction.GetVolume();
2574 	BPlusTree* tree = NULL;
2575 
2576 	if (parent != NULL && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
2577 		// check if the file already exists in the directory
2578 		tree = parent->Tree();
2579 	}
2580 
2581 	if (parent != NULL) {
2582 		// the parent directory is locked during the whole inode creation
2583 		parent->WriteLockInTransaction(transaction);
2584 	}
2585 
2586 	if (parent != NULL && !volume->IsInitializing() && parent->IsContainer()) {
2587 		// don't create anything in removed directories
2588 		bool removed;
2589 		if (get_vnode_removed(volume->FSVolume(), parent->ID(), &removed)
2590 				== B_OK && removed) {
2591 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
2592 		}
2593 	}
2594 
2595 	if (tree != NULL) {
2596 		// Does the file already exist?
2597 		off_t offset;
2598 		if (tree->Find((uint8*)name, (uint16)strlen(name), &offset) == B_OK) {
2599 			// Return if the file should be a directory/link or opened in
2600 			// exclusive mode
2601 			if (S_ISDIR(mode) || S_ISLNK(mode) || (openMode & O_EXCL) != 0)
2602 				return B_FILE_EXISTS;
2603 
2604 			Vnode vnode(volume, offset);
2605 			Inode* inode;
2606 			status_t status = vnode.Get(&inode);
2607 			if (status != B_OK) {
2608 				REPORT_ERROR(status);
2609 				return B_ENTRY_NOT_FOUND;
2610 			}
2611 
2612 			if (inode->IsDirectory() && (openMode & O_RWMASK) != O_RDONLY)
2613 				return B_IS_A_DIRECTORY;
2614 			if ((openMode & O_DIRECTORY) != 0 && !inode->IsDirectory())
2615 				return B_NOT_A_DIRECTORY;
2616 
2617 			// we want to open the file, so we should have the rights to do so
2618 			if (inode->CheckPermissions(open_mode_to_access(openMode)
2619 					| ((openMode & O_TRUNC) != 0 ? W_OK : 0)) != B_OK)
2620 				return B_NOT_ALLOWED;
2621 
2622 			if ((openMode & O_TRUNC) != 0) {
2623 				// truncate the existing file
2624 				inode->WriteLockInTransaction(transaction);
2625 
2626 				status_t status = inode->SetFileSize(transaction, 0);
2627 				if (status == B_OK)
2628 					status = inode->WriteBack(transaction);
2629 
2630 				if (status != B_OK)
2631 					return status;
2632 			}
2633 
2634 			if (_created)
2635 				*_created = false;
2636 			if (_id)
2637 				*_id = inode->ID();
2638 			if (_inode)
2639 				*_inode = inode;
2640 
2641 			// Only keep the vnode in memory if the _id or _inode pointer is
2642 			// provided
2643 			if (_id != NULL || _inode != NULL)
2644 				vnode.Keep();
2645 
2646 			return B_OK;
2647 		}
2648 	} else if (parent != NULL && (mode & S_ATTR_DIR) == 0) {
2649 		return B_BAD_VALUE;
2650 	} else if ((openMode & O_DIRECTORY) != 0) {
2651 		// TODO: we might need to return B_NOT_A_DIRECTORY here
2652 		return B_ENTRY_NOT_FOUND;
2653 	}
2654 
2655 	status_t status;
2656 
2657 	// do we have the power to create new files at all?
2658 	if (parent != NULL && (status = parent->CheckPermissions(W_OK)) != B_OK)
2659 		RETURN_ERROR(status);
2660 
2661 	// allocate space for the new inode
2662 	InodeAllocator allocator(transaction);
2663 	block_run run;
2664 	Inode* inode;
2665 	status = allocator.New(&parentRun, mode, run, vnodeOps, &inode);
2666 	if (status < B_OK)
2667 		return status;
2668 
2669 	T(Create(inode, parent, name, mode, openMode, type));
2670 
2671 	// Initialize the parts of the bfs_inode structure that
2672 	// InodeAllocator::New() hasn't touched yet
2673 
2674 	bfs_inode* node = &inode->Node();
2675 
2676 	if (parent == NULL) {
2677 		// we set the parent to itself in this case
2678 		// (only happens for the root and indices node)
2679 		node->parent = run;
2680 	} else
2681 		node->parent = parentRun;
2682 
2683 	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
2684 	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent
2685 		? parent->Node().GroupID() : getegid());
2686 		// the group ID is inherited from the parent, if available
2687 
2688 	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
2689 
2690 	inode->WriteBack(transaction);
2691 		// make sure the initialized node is available to others
2692 
2693 	// only add the name to regular files, directories, or symlinks
2694 	// don't add it to attributes, or indices
2695 	if (tree && inode->IsRegularNode()
2696 		&& inode->SetName(transaction, name) != B_OK)
2697 		return B_ERROR;
2698 
2699 	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2700 	// a standard directory for files - not for attributes or indices)
2701 	if (inode->IsContainer()) {
2702 		status = allocator.CreateTree();
2703 		if (status != B_OK)
2704 			return status;
2705 	}
2706 
2707 	// Add a link to the inode from the parent, depending on its type
2708 	// (the vnode is not published yet, so it is safe to make the inode
2709 	// accessable to the file system here)
2710 	if (tree != NULL) {
2711 		status = tree->Insert(transaction, name, inode->ID());
2712 	} else if (parent != NULL && (mode & S_ATTR_DIR) != 0) {
2713 		parent->Attributes() = run;
2714 		status = parent->WriteBack(transaction);
2715 	}
2716 
2717 	// Note, we only care if the inode could be made accessable for the
2718 	// two cases above; the root node or the indices root node must
2719 	// handle this case on their own (or other cases where "parent" is
2720 	// NULL)
2721 	if (status != B_OK)
2722 		RETURN_ERROR(status);
2723 
2724 	// Update the main indices (name, size & last_modified)
2725 	// (live queries might want to access us after this)
2726 
2727 	Index index(volume);
2728 	if (inode->InNameIndex() && name != NULL) {
2729 		// the name index only contains regular files
2730 		// (but not the root node where name == NULL)
2731 		status = index.InsertName(transaction, name, inode);
2732 		if (status != B_OK && status != B_BAD_INDEX) {
2733 			// We have to remove the node from the parent at this point,
2734 			// because the InodeAllocator destructor can't handle this
2735 			// case (and if it fails, we can't do anything about it...)
2736 			if (tree)
2737 				tree->Remove(transaction, name, inode->ID());
2738 			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2739 				parent->Node().attributes.SetTo(0, 0, 0);
2740 
2741 			RETURN_ERROR(status);
2742 		}
2743 	}
2744 
2745 	if (parent != NULL && parent->IsContainer())
2746 		parent->ContainerContentsChanged(transaction);
2747 
2748 	inode->UpdateOldLastModified();
2749 
2750 	// The "size" & "last_modified" indices don't contain directories.
2751 	// If adding to these indices fails, the inode creation will not be
2752 	// harmed; they are considered less important than the "name" index.
2753 	if (inode->InSizeIndex())
2754 		index.InsertSize(transaction, inode);
2755 	if (inode->InLastModifiedIndex())
2756 		index.InsertLastModified(transaction, inode);
2757 
2758 	if (inode->NeedsFileCache()) {
2759 		inode->SetFileCache(file_cache_create(volume->ID(), inode->ID(),
2760 			inode->Size()));
2761 		inode->SetMap(file_map_create(volume->ID(), inode->ID(),
2762 			inode->Size()));
2763 
2764 		if (inode->FileCache() == NULL || inode->Map() == NULL)
2765 			return B_NO_MEMORY;
2766 	}
2767 
2768 	// Everything worked well until this point, we have a fully
2769 	// initialized inode, and we want to keep it
2770 	allocator.Keep(vnodeOps, publishFlags);
2771 
2772 	if (_created)
2773 		*_created = true;
2774 	if (_id != NULL)
2775 		*_id = inode->ID();
2776 	if (_inode != NULL)
2777 		*_inode = inode;
2778 
2779 	// if either _id or _inode is passed, we will keep the inode locked
2780 	if (_id == NULL && _inode == NULL)
2781 		put_vnode(volume->FSVolume(), inode->ID());
2782 
2783 	return B_OK;
2784 }
2785 
2786 
2787 //	#pragma mark - AttributeIterator
2788 
2789 
2790 AttributeIterator::AttributeIterator(Inode* inode)
2791 	:
2792 	fCurrentSmallData(0),
2793 	fInode(inode),
2794 	fAttributes(NULL),
2795 	fIterator(NULL),
2796 	fBuffer(NULL)
2797 {
2798 	inode->_AddIterator(this);
2799 }
2800 
2801 
2802 AttributeIterator::~AttributeIterator()
2803 {
2804 	if (fAttributes)
2805 		put_vnode(fAttributes->GetVolume()->FSVolume(), fAttributes->ID());
2806 
2807 	delete fIterator;
2808 	fInode->_RemoveIterator(this);
2809 }
2810 
2811 
2812 status_t
2813 AttributeIterator::Rewind()
2814 {
2815 	fCurrentSmallData = 0;
2816 
2817 	if (fIterator != NULL)
2818 		fIterator->Rewind();
2819 
2820 	return B_OK;
2821 }
2822 
2823 
2824 status_t
2825 AttributeIterator::GetNext(char* name, size_t* _length, uint32* _type,
2826 	ino_t* _id)
2827 {
2828 	// read attributes out of the small data section
2829 
2830 	if (fCurrentSmallData >= 0) {
2831 		NodeGetter nodeGetter(fInode->GetVolume(), fInode);
2832 		const bfs_inode* node = nodeGetter.Node();
2833 		const small_data* item = ((bfs_inode*)node)->SmallDataStart();
2834 
2835 		RecursiveLocker _(&fInode->SmallDataLock());
2836 
2837 		int32 index = 0;
2838 		for (; !item->IsLast(node); item = item->Next(), index++) {
2839 			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2840 				&& *item->Name() == FILE_NAME_NAME)
2841 				continue;
2842 
2843 			if (index >= fCurrentSmallData)
2844 				break;
2845 		}
2846 
2847 		if (!item->IsLast(node)) {
2848 			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2849 			*_type = item->Type();
2850 			*_length = item->NameSize();
2851 			*_id = (ino_t)index;
2852 
2853 			fCurrentSmallData = index + 1;
2854 			return B_OK;
2855 		}
2856 
2857 		// stop traversing the small_data section
2858 		fCurrentSmallData = -1;
2859 	}
2860 
2861 	// read attributes out of the attribute directory
2862 
2863 	if (fInode->Attributes().IsZero())
2864 		return B_ENTRY_NOT_FOUND;
2865 
2866 	Volume* volume = fInode->GetVolume();
2867 
2868 	// if you haven't yet access to the attributes directory, get it
2869 	if (fAttributes == NULL) {
2870 		if (get_vnode(volume->FSVolume(), volume->ToVnode(fInode->Attributes()),
2871 				(void**)&fAttributes) != B_OK) {
2872 			FATAL(("get_vnode() failed in AttributeIterator::GetNext(ino_t"
2873 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2874 			return B_ENTRY_NOT_FOUND;
2875 		}
2876 
2877 		BPlusTree* tree = fAttributes->Tree();
2878 		if (tree == NULL
2879 			|| (fIterator = new TreeIterator(tree)) == NULL) {
2880 			FATAL(("could not get tree in AttributeIterator::GetNext(ino_t"
2881 				" = %" B_PRIdINO ",name = \"%s\")\n", fInode->ID(), name));
2882 			return B_ENTRY_NOT_FOUND;
2883 		}
2884 	}
2885 
2886 	uint16 length;
2887 	ino_t id;
2888 	status_t status = fIterator->GetNextEntry(name, &length,
2889 		B_FILE_NAME_LENGTH, &id);
2890 	if (status < B_OK)
2891 		return status;
2892 
2893 	Vnode vnode(volume, id);
2894 	Inode* attribute;
2895 	if ((status = vnode.Get(&attribute)) == B_OK) {
2896 		*_type = attribute->Type();
2897 		*_length = length;
2898 		*_id = id;
2899 	}
2900 
2901 	return status;
2902 }
2903 
2904 
2905 void
2906 AttributeIterator::Update(uint16 index, int8 change)
2907 {
2908 	// fCurrentSmallData points already to the next item
2909 	if (index < fCurrentSmallData)
2910 		fCurrentSmallData += change;
2911 }
2912 
2913