xref: /haiku/src/add-ons/kernel/file_systems/bfs/Inode.cpp (revision cc6e7cb3477cdb34c23be8ce246203d2b7f002de)
1 /*
2  * Copyright 2001-2008, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 //! inode access functions
7 
8 
9 #include "Debug.h"
10 #include "Inode.h"
11 #include "BPlusTree.h"
12 #include "Index.h"
13 
14 
15 #if defined(BFS_TRACING) && !defined(BFS_SHELL) && !defined(_BOOT_MODE)
16 namespace BFSInodeTracing {
17 
18 class Create : public AbstractTraceEntry {
19 	public:
20 		Create(Inode* inode, Inode* parent, const char* name, int32 mode,
21 				int openMode, uint32 type)
22 			:
23 			fInode(inode),
24 			fID(inode->ID()),
25 			fParent(parent),
26 			fParentID(parent != NULL ? parent->ID() : 0),
27 			fMode(mode),
28 			fOpenMode(openMode),
29 			fType(type)
30 		{
31 			if (name != NULL)
32 				strlcpy(fName, name, sizeof(fName));
33 			else
34 				fName[0] = '\0';
35 
36 			Initialized();
37 		}
38 
39 		virtual void AddDump(TraceOutput& out)
40 		{
41 			out.Print("bfs:Create %Ld (%p), parent %Ld (%p), \"%s\", "
42 				"mode %lx, omode %x, type %lx", fID, fInode, fParentID,
43 				fParent, fName, fMode, fOpenMode, fType);
44 		}
45 
46 	private:
47 		Inode*	fInode;
48 		ino_t	fID;
49 		Inode*	fParent;
50 		ino_t	fParentID;
51 		char	fName[32];
52 		int32	fMode;
53 		int		fOpenMode;
54 		uint32	fType;
55 
56 };
57 
58 class Remove : public AbstractTraceEntry {
59 	public:
60 		Remove(Inode* inode, const char* name)
61 			:
62 			fInode(inode),
63 			fID(inode->ID())
64 		{
65 			strlcpy(fName, name, sizeof(fName));
66 			Initialized();
67 		}
68 
69 		virtual void AddDump(TraceOutput& out)
70 		{
71 			out.Print("bfs:Remove %Ld (%p), \"%s\"", fID, fInode, fName);
72 		}
73 
74 	private:
75 		Inode*	fInode;
76 		ino_t	fID;
77 		char	fName[32];
78 };
79 
80 class Action : public AbstractTraceEntry {
81 	public:
82 		Action(const char* action, Inode* inode)
83 			:
84 			fInode(inode),
85 			fID(inode->ID())
86 		{
87 			strlcpy(fAction, action, sizeof(fAction));
88 			Initialized();
89 		}
90 
91 		virtual void AddDump(TraceOutput& out)
92 		{
93 			out.Print("bfs:%s %Ld (%p)\n", fAction, fID, fInode);
94 		}
95 
96 	private:
97 		Inode*	fInode;
98 		ino_t	fID;
99 		char	fAction[16];
100 };
101 
102 class Resize : public AbstractTraceEntry {
103 	public:
104 		Resize(Inode* inode, off_t oldSize, off_t newSize, bool trim)
105 			:
106 			fInode(inode),
107 			fID(inode->ID()),
108 			fOldSize(oldSize),
109 			fNewSize(newSize),
110 			fTrim(trim)
111 		{
112 			Initialized();
113 		}
114 
115 		virtual void AddDump(TraceOutput& out)
116 		{
117 			out.Print("bfs:%s %Ld (%p), %Ld -> %Ld", fTrim ? "Trim" : "Resize",
118 				fID, fInode, fOldSize, fNewSize);
119 		}
120 
121 	private:
122 		Inode*	fInode;
123 		ino_t	fID;
124 		off_t	fOldSize;
125 		off_t	fNewSize;
126 		bool	fTrim;
127 };
128 
129 }	// namespace BFSInodeTracing
130 
131 #	define T(x) new(std::nothrow) BFSInodeTracing::x;
132 #else
133 #	define T(x) ;
134 #endif
135 
136 class InodeAllocator {
137 	public:
138 		InodeAllocator(Transaction &transaction);
139 		~InodeAllocator();
140 
141 		status_t New(block_run *parentRun, mode_t mode, block_run &run,
142 			Inode **_inode);
143 		status_t CreateTree();
144 		status_t Keep();
145 
146 	private:
147 		static void _TransactionListener(int32 id, int32 event, void *_inode);
148 
149 		Transaction *fTransaction;
150 		block_run fRun;
151 		Inode *fInode;
152 };
153 
154 
155 InodeAllocator::InodeAllocator(Transaction &transaction)
156 	:
157 	fTransaction(&transaction),
158 	fInode(NULL)
159 {
160 }
161 
162 
163 InodeAllocator::~InodeAllocator()
164 {
165 	if (fTransaction != NULL) {
166 		Volume *volume = fTransaction->GetVolume();
167 
168 		if (fInode != NULL) {
169 			fInode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
170 				// this unblocks any pending bfs_read_vnode() calls
171 			fInode->Free(*fTransaction);
172 			remove_vnode(volume->FSVolume(), fInode->ID());
173 		} else
174 			volume->Free(*fTransaction, fRun);
175 	}
176 
177 	delete fInode;
178 }
179 
180 
181 status_t
182 InodeAllocator::New(block_run *parentRun, mode_t mode, block_run &run,
183 	Inode **_inode)
184 {
185 	Volume *volume = fTransaction->GetVolume();
186 
187 	status_t status = volume->AllocateForInode(*fTransaction, parentRun, mode,
188 		fRun);
189 	if (status < B_OK) {
190 		// don't free the space in the destructor, because
191 		// the allocation failed
192 		fTransaction = NULL;
193 		RETURN_ERROR(status);
194 	}
195 
196 	run = fRun;
197 	fInode = new Inode(volume, *fTransaction, volume->ToVnode(run), mode, run);
198 	if (fInode == NULL)
199 		RETURN_ERROR(B_NO_MEMORY);
200 
201 	if (volume->ID() >= 0) {
202 		status = new_vnode(volume->FSVolume(), fInode->ID(), fInode,
203 			&gBFSVnodeOps);
204 		if (status < B_OK) {
205 			delete fInode;
206 			fInode = NULL;
207 			RETURN_ERROR(status);
208 		}
209 	}
210 
211 	*_inode = fInode;
212 	return B_OK;
213 }
214 
215 
216 status_t
217 InodeAllocator::CreateTree()
218 {
219 	Volume *volume = fTransaction->GetVolume();
220 
221 	// force S_STR_INDEX to be set, if no type is set
222 	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
223 		fInode->Node().mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
224 
225 	BPlusTree *tree = fInode->fTree = new BPlusTree(*fTransaction, fInode);
226 	if (tree == NULL || tree->InitCheck() < B_OK)
227 		return B_ERROR;
228 
229 	if (fInode->IsRegularNode()) {
230 		if (tree->Insert(*fTransaction, ".", fInode->ID()) < B_OK
231 			|| tree->Insert(*fTransaction, "..",
232 					volume->ToVnode(fInode->Parent())) < B_OK)
233 			return B_ERROR;
234 	}
235 	return B_OK;
236 }
237 
238 
239 status_t
240 InodeAllocator::Keep()
241 {
242 	ASSERT(fInode != NULL && fTransaction != NULL);
243 	Volume *volume = fTransaction->GetVolume();
244 
245 	status_t status = fInode->WriteBack(*fTransaction);
246 	if (status < B_OK) {
247 		FATAL(("writing new inode %Ld failed!\n", fInode->ID()));
248 		return status;
249 	}
250 
251 	if (!fInode->IsSymLink() && volume->ID() >= 0) {
252 		status = publish_vnode(volume->FSVolume(), fInode->ID(), fInode,
253 			&gBFSVnodeOps, fInode->Mode(), 0);
254 	}
255 
256 	if (status == B_OK) {
257 		cache_add_transaction_listener(volume->BlockCache(), fTransaction->ID(),
258 			TRANSACTION_ABORTED, &_TransactionListener, fInode);
259 	}
260 
261 	fTransaction = NULL;
262 	fInode = NULL;
263 
264 	return status;
265 }
266 
267 
268 /*static*/ void
269 InodeAllocator::_TransactionListener(int32 id, int32 event, void *_inode)
270 {
271 	Inode *inode = (Inode *)_inode;
272 
273 	if (event == TRANSACTION_ABORTED)
274 		panic("transaction %d aborted, inode %p still around!\n", (int)id, inode);
275 }
276 
277 
278 //	#pragma mark -
279 
280 
281 status_t
282 bfs_inode::InitCheck(Volume *volume)
283 {
284 	if (Magic1() != INODE_MAGIC1
285 		|| !(Flags() & INODE_IN_USE)
286 		|| inode_num.Length() != 1
287 		// matches inode size?
288 		|| (uint32)InodeSize() != volume->InodeSize()
289 		// parent resides on disk?
290 		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
291 		|| parent.AllocationGroup() < 0
292 		|| parent.Start() > (1L << volume->AllocationGroupShift())
293 		|| parent.Length() != 1
294 		// attributes, too?
295 		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
296 		|| attributes.AllocationGroup() < 0
297 		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
298 		RETURN_ERROR(B_BAD_DATA);
299 
300 	if (Flags() & INODE_DELETED)
301 		return B_NOT_ALLOWED;
302 
303 	// ToDo: Add some tests to check the integrity of the other stuff here,
304 	// especially for the data_stream!
305 
306 	return B_OK;
307 }
308 
309 
310 //	#pragma mark - Inode
311 
312 
313 Inode::Inode(Volume *volume, ino_t id)
314 	:
315 	fVolume(volume),
316 	fID(id),
317 	fTree(NULL),
318 	fAttributes(NULL),
319 	fCache(NULL),
320 	fMap(NULL)
321 {
322 	PRINT(("Inode::Inode(volume = %p, id = %Ld) @ %p\n", volume, id, this));
323 
324 	NodeGetter node(volume, this);
325 	memcpy(&fNode, node.Node(), sizeof(bfs_inode));
326 
327 	char lockName[B_OS_NAME_LENGTH];
328 	snprintf(lockName, sizeof(lockName), "bfs inode %d.%d",
329 		(int)BlockRun().AllocationGroup(), BlockRun().Start());
330 	fLock.Initialize(lockName);
331 
332 	// these two will help to maintain the indices
333 	fOldSize = Size();
334 	fOldLastModified = LastModified();
335 
336 	if (IsContainer())
337 		fTree = new BPlusTree(this);
338 	if (IsFile() || IsAttribute()) {
339 		SetFileCache(file_cache_create(fVolume->ID(), ID(), Size()));
340 		SetMap(file_map_create(volume->ID(), ID(), Size()));
341 	}
342 }
343 
344 
345 Inode::Inode(Volume *volume, Transaction &transaction, ino_t id, mode_t mode,
346 		block_run &run)
347 	:
348 	fVolume(volume),
349 	fID(id),
350 	fTree(NULL),
351 	fAttributes(NULL),
352 	fCache(NULL),
353 	fMap(NULL)
354 {
355 	PRINT(("Inode::Inode(volume = %p, transaction = %p, id = %Ld) @ %p\n",
356 		volume, &transaction, id, this));
357 
358 	char lockName[B_OS_NAME_LENGTH];
359 	snprintf(lockName, sizeof(lockName), "bfs inode+%d.%d",
360 		(int)run.AllocationGroup(), run.Start());
361 	fLock.Initialize(lockName);
362 
363 	NodeGetter node(volume, transaction, this, true);
364 	memset(&fNode, 0, sizeof(bfs_inode));
365 
366 	// Initialize the bfs_inode structure -- it's not written back to disk
367 	// here, because it will be done so already when the inode could be
368 	// created completely.
369 
370 	Node().magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
371 	Node().inode_num = run;
372 	Node().mode = HOST_ENDIAN_TO_BFS_INT32(mode);
373 	Node().flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
374 
375 	Node().create_time = HOST_ENDIAN_TO_BFS_INT64((bigtime_t)time(NULL)
376 		<< INODE_TIME_SHIFT);
377 	Node().last_modified_time = HOST_ENDIAN_TO_BFS_INT64(Node().create_time
378 		| (volume->GetUniqueID() & INODE_TIME_MASK));
379 		// we use Volume::GetUniqueID() to avoid having too many duplicates
380 		// in the last_modified index
381 
382 	Node().inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
383 
384 	// these two will help to maintain the indices
385 	fOldSize = Size();
386 	fOldLastModified = LastModified();
387 }
388 
389 
390 Inode::~Inode()
391 {
392 	PRINT(("Inode::~Inode() @ %p\n", this));
393 
394 	file_cache_delete(FileCache());
395 	file_map_delete(Map());
396 	delete fTree;
397 }
398 
399 
400 status_t
401 Inode::InitCheck(bool checkNode)
402 {
403 	// test inode magic and flags
404 	if (checkNode) {
405 		status_t status = Node().InitCheck(fVolume);
406 		if (status == B_BUSY)
407 			return B_BUSY;
408 
409 		if (status < B_OK) {
410 			FATAL(("inode at block %Ld corrupt!\n", BlockNumber()));
411 			RETURN_ERROR(B_BAD_DATA);
412 		}
413 	}
414 
415 	if (IsContainer()) {
416 		// inodes that have a B+tree
417 		if (fTree == NULL)
418 			RETURN_ERROR(B_NO_MEMORY);
419 
420 		status_t status = fTree->InitCheck();
421 		if (status < B_OK) {
422 			FATAL(("inode tree at block %Ld corrupt!\n", BlockNumber()));
423 			RETURN_ERROR(B_BAD_DATA);
424 		}
425 	}
426 
427 	// it's more important to know that the inode is corrupt
428 	// so we check for the lock not until here
429 	return fLock.InitCheck();
430 }
431 
432 
433 status_t
434 Inode::WriteBack(Transaction &transaction)
435 {
436 	NodeGetter node(fVolume, transaction, this);
437 	if (node.WritableNode() == NULL)
438 		return B_IO_ERROR;
439 
440 	memcpy(node.WritableNode(), &Node(), sizeof(bfs_inode));
441 	return B_OK;
442 }
443 
444 
445 status_t
446 Inode::CheckPermissions(int accessMode) const
447 {
448 	uid_t user = geteuid();
449 	gid_t group = getegid();
450 
451 	// you never have write access to a read-only volume
452 	if (accessMode & W_OK && fVolume->IsReadOnly())
453 		return B_READ_ONLY_DEVICE;
454 
455 	// root users always have full access (but they can't execute files without
456 	// any execute permissions set)
457 	if (user == 0) {
458 		if (!((accessMode & X_OK) != 0 && (Mode() & S_IXUSR) == 0)
459 			|| (Mode() & S_DIRECTORY) != 0) {
460 			return B_OK;
461 		}
462 	}
463 
464 	// shift mode bits, to check directly against accessMode
465 	mode_t mode = Mode();
466 	if (user == (uid_t)fNode.UserID())
467 		mode >>= 6;
468 	else if (group == (gid_t)fNode.GroupID())
469 		mode >>= 3;
470 
471 	if (accessMode & ~(mode & S_IRWXO))
472 		return B_NOT_ALLOWED;
473 
474 	return B_OK;
475 }
476 
477 
478 //	#pragma mark - attributes
479 
480 
481 void
482 Inode::_AddIterator(AttributeIterator *iterator)
483 {
484 	if (fSmallDataLock.Lock() < B_OK)
485 		return;
486 
487 	fIterators.Add(iterator);
488 
489 	fSmallDataLock.Unlock();
490 }
491 
492 
493 void
494 Inode::_RemoveIterator(AttributeIterator *iterator)
495 {
496 	if (fSmallDataLock.Lock() < B_OK)
497 		return;
498 
499 	fIterators.Remove(iterator);
500 
501 	fSmallDataLock.Unlock();
502 }
503 
504 
505 /*!	Tries to free up "bytes" space in the small_data section by moving
506 	attributes to real files. Used for system attributes like the name.
507 	You need to hold the fSmallDataLock when you call this method
508 */
509 status_t
510 Inode::_MakeSpaceForSmallData(Transaction &transaction, bfs_inode *node,
511 	const char *name, int32 bytes)
512 {
513 	ASSERT(fSmallDataLock.IsLocked());
514 
515 	while (bytes > 0) {
516 		small_data *item = node->SmallDataStart(), *max = NULL;
517 		int32 index = 0, maxIndex = 0;
518 		for (; !item->IsLast(node); item = item->Next(), index++) {
519 			// should not remove those
520 			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
521 				continue;
522 
523 			if (max == NULL || max->Size() < item->Size()) {
524 				maxIndex = index;
525 				max = item;
526 			}
527 
528 			// Remove the first one large enough to free the needed amount of
529 			// bytes
530 			if (bytes < (int32)item->Size())
531 				break;
532 		}
533 
534 		if (item->IsLast(node) || (int32)item->Size() < bytes)
535 			return B_ERROR;
536 
537 		bytes -= max->Size();
538 
539 		// Move the attribute to a real attribute file
540 		// Luckily, this doesn't cause any index updates
541 
542 		Inode *attribute;
543 		status_t status = CreateAttribute(transaction, item->Name(),
544 			item->Type(), &attribute);
545 		if (status < B_OK)
546 			RETURN_ERROR(status);
547 
548 		size_t length = item->DataSize();
549 		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
550 
551 		ReleaseAttribute(attribute);
552 
553 		if (status < B_OK) {
554 			Vnode vnode(fVolume, Attributes());
555 			Inode *attributes;
556 			if (vnode.Get(&attributes) < B_OK
557 				|| attributes->Remove(transaction, name) < B_OK) {
558 				FATAL(("Could not remove newly created attribute!\n"));
559 			}
560 
561 			RETURN_ERROR(status);
562 		}
563 
564 		_RemoveSmallData(node, max, maxIndex);
565 	}
566 	return B_OK;
567 }
568 
569 
570 /*!	Private function which removes the given attribute from the small_data
571 	section.
572 	You need to hold the fSmallDataLock when you call this method
573 */
574 status_t
575 Inode::_RemoveSmallData(bfs_inode *node, small_data *item, int32 index)
576 {
577 	ASSERT(fSmallDataLock.IsLocked());
578 
579 	small_data *next = item->Next();
580 	if (!next->IsLast(node)) {
581 		// find the last attribute
582 		small_data *last = next;
583 		while (!last->IsLast(node))
584 			last = last->Next();
585 
586 		int32 size = (uint8 *)last - (uint8 *)next;
587 		if (size < 0
588 			|| size > (uint8 *)node + fVolume->BlockSize() - (uint8 *)next)
589 			return B_BAD_DATA;
590 
591 		memmove(item, next, size);
592 
593 		// Move the "last" one to its new location and
594 		// correctly terminate the small_data section
595 		last = (small_data *)((uint8 *)last - ((uint8 *)next - (uint8 *)item));
596 		memset(last, 0, (uint8 *)node + fVolume->BlockSize() - (uint8 *)last);
597 	} else
598 		memset(item, 0, item->Size());
599 
600 	// update all current iterators
601 	AttributeIterator *iterator = NULL;
602 	while ((iterator = fIterators.Next(iterator)) != NULL) {
603 		iterator->Update(index, -1);
604 	}
605 
606 	return B_OK;
607 }
608 
609 
610 //!	Removes the given attribute from the small_data section.
611 status_t
612 Inode::_RemoveSmallData(Transaction &transaction, NodeGetter &nodeGetter,
613 	const char *name)
614 {
615 	if (name == NULL)
616 		return B_BAD_VALUE;
617 
618 	bfs_inode *node = nodeGetter.WritableNode();
619 	SimpleLocker locker(fSmallDataLock);
620 
621 	// search for the small_data item
622 
623 	small_data *item = node->SmallDataStart();
624 	int32 index = 0;
625 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
626 		item = item->Next();
627 		index++;
628 	}
629 
630 	if (item->IsLast(node))
631 		return B_ENTRY_NOT_FOUND;
632 
633 	nodeGetter.MakeWritable(transaction);
634 	status_t status = _RemoveSmallData(node, item, index);
635 	if (status == B_OK)
636 		status = WriteBack(transaction);
637 
638 	return status;
639 }
640 
641 
642 /*!	Try to place the given attribute in the small_data section - if the
643 	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
644 	In that case, the attribute should be written to a real attribute file;
645 	it's the caller's responsibility to remove any existing attributes in the
646 	small data section if that's the case.
647 
648 	Note that you need to write back the inode yourself after having called that
649 	method - it's a bad API decision that it needs a transaction but enforces
650 	you to write back the inode all by yourself, but it's just more efficient
651 	in most cases...
652 */
653 status_t
654 Inode::_AddSmallData(Transaction &transaction, NodeGetter &nodeGetter,
655 	const char *name, uint32 type, const uint8 *data, size_t length, bool force)
656 {
657 	// TODO: support new write attr semantics and write offset!
658 	bfs_inode *node = nodeGetter.WritableNode();
659 
660 	if (node == NULL || name == NULL || data == NULL)
661 		return B_BAD_VALUE;
662 
663 	// reject any requests that can't fit into the small_data section
664 	uint32 nameLength = strlen(name);
665 	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + length + 1;
666 	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
667 		return B_DEVICE_FULL;
668 
669 	nodeGetter.MakeWritable(transaction);
670 	SimpleLocker locker(fSmallDataLock);
671 
672 	// Find the last item or one with the same name we have to add
673 	small_data *item = node->SmallDataStart();
674 	int32 index = 0;
675 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
676 		item = item->Next();
677 		index++;
678 	}
679 
680 	// is the attribute already in the small_data section?
681 	// then just replace the data part of that one
682 	if (!item->IsLast(node)) {
683 		// find last attribute
684 		small_data *last = item;
685 		while (!last->IsLast(node))
686 			last = last->Next();
687 
688 		// try to change the attributes value
689 		if (item->data_size > length
690 			|| force
691 			|| ((uint8 *)last + length - item->DataSize())
692 					<= ((uint8 *)node + fVolume->InodeSize())) {
693 			// Make room for the new attribute if needed (and we are forced
694 			// to do so)
695 			if (force && ((uint8 *)last + length - item->DataSize())
696 					> ((uint8 *)node + fVolume->InodeSize())) {
697 				// We also take the free space at the end of the small_data
698 				// section into account, and request only what's really needed
699 				uint32 needed = length - item->DataSize() -
700 					(uint32)((uint8 *)node + fVolume->InodeSize()
701 						- (uint8 *)last);
702 
703 				if (_MakeSpaceForSmallData(transaction, node, name, needed)
704 						< B_OK)
705 					return B_ERROR;
706 
707 				// reset our pointers
708 				item = node->SmallDataStart();
709 				index = 0;
710 				while (!item->IsLast(node) && strcmp(item->Name(), name)) {
711 					item = item->Next();
712 					index++;
713 				}
714 
715 				last = item;
716 				while (!last->IsLast(node))
717 					last = last->Next();
718 			}
719 
720 			// Normally, we can just overwrite the attribute data as the size
721 			// is specified by the type and does not change that often
722 			if (length != item->DataSize()) {
723 				// move the attributes after the current one
724 				small_data *next = item->Next();
725 				if (!next->IsLast(node)) {
726 					memmove((uint8 *)item + spaceNeeded, next,
727 						(uint8 *)last - (uint8 *)next);
728 				}
729 
730 				// Move the "last" one to its new location and
731 				// correctly terminate the small_data section
732 				last = (small_data *)((uint8 *)last
733 					- ((uint8 *)next - ((uint8 *)item + spaceNeeded)));
734 				if ((uint8 *)last < (uint8 *)node + fVolume->BlockSize()) {
735 					memset(last, 0, (uint8 *)node + fVolume->BlockSize()
736 						- (uint8 *)last);
737 				}
738 
739 				item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
740 			}
741 
742 			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
743 			memcpy(item->Data(), data, length);
744 			item->Data()[length] = '\0';
745 
746 			return B_OK;
747 		}
748 
749 		return B_DEVICE_FULL;
750 	}
751 
752 	// try to add the new attribute!
753 
754 	if ((uint8 *)item + spaceNeeded > (uint8 *)node + fVolume->InodeSize()) {
755 		// there is not enough space for it!
756 		if (!force)
757 			return B_DEVICE_FULL;
758 
759 		// make room for the new attribute
760 		if (_MakeSpaceForSmallData(transaction, node, name, spaceNeeded) < B_OK)
761 			return B_ERROR;
762 
763 		// get new last item!
764 		item = node->SmallDataStart();
765 		index = 0;
766 		while (!item->IsLast(node)) {
767 			item = item->Next();
768 			index++;
769 		}
770 	}
771 
772 	memset(item, 0, spaceNeeded);
773 	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
774 	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
775 	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
776 	strcpy(item->Name(), name);
777 	memcpy(item->Data(), data, length);
778 
779 	// correctly terminate the small_data section
780 	item = item->Next();
781 	if (!item->IsLast(node))
782 		memset(item, 0, (uint8 *)node + fVolume->InodeSize() - (uint8 *)item);
783 
784 	// update all current iterators
785 	AttributeIterator *iterator = NULL;
786 	while ((iterator = fIterators.Next(iterator)) != NULL) {
787 		iterator->Update(index, 1);
788 	}
789 
790 	return B_OK;
791 }
792 
793 
794 /*!	Iterates through the small_data section of an inode.
795 	To start at the beginning of this section, you let smallData
796 	point to NULL, like:
797 		small_data *data = NULL;
798 		while (inode->GetNextSmallData(&data) { ... }
799 
800 	This function is reentrant and doesn't allocate any memory;
801 	you can safely stop calling it at any point (you don't need
802 	to iterate through the whole list).
803 	You need to hold the fSmallDataLock when you call this method
804 */
805 status_t
806 Inode::_GetNextSmallData(bfs_inode *node, small_data **_smallData) const
807 {
808 	if (node == NULL)
809 		RETURN_ERROR(B_BAD_VALUE);
810 
811 	ASSERT(fSmallDataLock.IsLocked());
812 
813 	small_data *data = *_smallData;
814 
815 	// begin from the start?
816 	if (data == NULL)
817 		data = node->SmallDataStart();
818 	else
819 		data = data->Next();
820 
821 	// is already last item?
822 	if (data->IsLast(node))
823 		return B_ENTRY_NOT_FOUND;
824 
825 	*_smallData = data;
826 
827 	return B_OK;
828 }
829 
830 
831 /*!	Finds the attribute "name" in the small data section, and
832 	returns a pointer to it (or NULL if it doesn't exist).
833 	You need to hold the fSmallDataLock when you call this method
834 */
835 small_data *
836 Inode::FindSmallData(const bfs_inode *node, const char *name) const
837 {
838 	ASSERT(fSmallDataLock.IsLocked());
839 
840 	small_data *smallData = NULL;
841 	while (_GetNextSmallData(const_cast<bfs_inode *>(node), &smallData)
842 			== B_OK) {
843 		if (!strcmp(smallData->Name(), name))
844 			return smallData;
845 	}
846 	return NULL;
847 }
848 
849 
850 /*!	Returns a pointer to the node's name if present in the small data
851 	section, NULL otherwise.
852 	You need to hold the fSmallDataLock when you call this method
853 */
854 const char *
855 Inode::Name(const bfs_inode *node) const
856 {
857 	ASSERT(fSmallDataLock.IsLocked());
858 
859 	small_data *smallData = NULL;
860 	while (_GetNextSmallData((bfs_inode *)node, &smallData) == B_OK) {
861 		if (*smallData->Name() == FILE_NAME_NAME
862 			&& smallData->NameSize() == FILE_NAME_NAME_LENGTH)
863 			return (const char *)smallData->Data();
864 	}
865 	return NULL;
866 }
867 
868 
869 /*!	Copies the node's name into the provided buffer.
870 	The buffer should be B_FILE_NAME_LENGTH bytes large.
871 */
872 status_t
873 Inode::GetName(char *buffer, size_t size) const
874 {
875 	NodeGetter node(fVolume, this);
876 	SimpleLocker locker(fSmallDataLock);
877 
878 	const char *name = Name(node.Node());
879 	if (name == NULL)
880 		return B_ENTRY_NOT_FOUND;
881 
882 	strlcpy(buffer, name, size);
883 	return B_OK;
884 }
885 
886 
887 /*!	Changes or set the name of a file: in the inode small_data section only, it
888 	doesn't change it in the parent directory's b+tree.
889 	Note that you need to write back the inode yourself after having called
890 	that method. It suffers from the same API decision as AddSmallData() does
891 	(and for the same reason).
892 */
893 status_t
894 Inode::SetName(Transaction &transaction, const char *name)
895 {
896 	if (name == NULL || *name == '\0')
897 		return B_BAD_VALUE;
898 
899 	NodeGetter node(fVolume, transaction, this);
900 	const char nameTag[2] = {FILE_NAME_NAME, 0};
901 
902 	return _AddSmallData(transaction, node, nameTag, FILE_NAME_TYPE,
903 		(uint8 *)name, strlen(name), true);
904 }
905 
906 
907 status_t
908 Inode::_RemoveAttribute(Transaction &transaction, const char *name,
909 	bool hasIndex, Index *index)
910 {
911 	// remove the attribute file if it exists
912 	Vnode vnode(fVolume, Attributes());
913 	Inode *attributes;
914 	status_t status = vnode.Get(&attributes);
915 	if (status < B_OK)
916 		return status;
917 
918 	// update index
919 	if (index != NULL) {
920 		Inode *attribute;
921 		if ((hasIndex || fVolume->CheckForLiveQuery(name))
922 			&& GetAttribute(name, &attribute) == B_OK) {
923 			uint8 data[BPLUSTREE_MAX_KEY_LENGTH];
924 			size_t length = BPLUSTREE_MAX_KEY_LENGTH;
925 			if (attribute->ReadAt(0, data, &length) == B_OK) {
926 				index->Update(transaction, name, attribute->Type(), data,
927 					length, NULL, 0, this);
928 			}
929 
930 			ReleaseAttribute(attribute);
931 		}
932 	}
933 
934 	if ((status = attributes->Remove(transaction, name)) < B_OK)
935 		return status;
936 
937 	if (attributes->IsEmpty()) {
938 		// remove attribute directory (don't fail if that can't be done)
939 		if (remove_vnode(fVolume->FSVolume(), attributes->ID()) == B_OK) {
940 			// update the inode, so that no one will ever doubt it's deleted :-)
941 			attributes->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
942 			if (attributes->WriteBack(transaction) == B_OK) {
943 				Attributes().SetTo(0, 0, 0);
944 				WriteBack(transaction);
945 			} else
946 				unremove_vnode(fVolume->FSVolume(), attributes->ID());
947 		}
948 	}
949 
950 	return status;
951 }
952 
953 
954 /*!	Reads data from the specified attribute.
955 	This is a high-level attribute function that understands attributes
956 	in the small_data section as well as real attribute files.
957 */
958 status_t
959 Inode::ReadAttribute(const char *name, int32 type, off_t pos, uint8 *buffer,
960 	size_t *_length)
961 {
962 	if (pos < 0)
963 		pos = 0;
964 
965 	// search in the small_data section (which has to be locked first)
966 	{
967 		NodeGetter node(fVolume, this);
968 		SimpleLocker locker(fSmallDataLock);
969 
970 		small_data *smallData = FindSmallData(node.Node(), name);
971 		if (smallData != NULL) {
972 			size_t length = *_length;
973 			if (pos >= smallData->data_size) {
974 				*_length = 0;
975 				return B_OK;
976 			}
977 			if (length + pos > smallData->DataSize())
978 				length = smallData->DataSize() - pos;
979 
980 			memcpy(buffer, smallData->Data() + pos, length);
981 			*_length = length;
982 			return B_OK;
983 		}
984 	}
985 
986 	// search in the attribute directory
987 	Inode *attribute;
988 	status_t status = GetAttribute(name, &attribute);
989 	if (status == B_OK) {
990 		status = attribute->ReadAt(pos, (uint8 *)buffer, _length);
991 
992 		ReleaseAttribute(attribute);
993 	}
994 
995 	RETURN_ERROR(status);
996 }
997 
998 
999 /*!	Writes data to the specified attribute.
1000 	This is a high-level attribute function that understands attributes
1001 	in the small_data section as well as real attribute files.
1002 */
1003 status_t
1004 Inode::WriteAttribute(Transaction &transaction, const char *name, int32 type,
1005 	off_t pos, const uint8 *buffer, size_t *_length)
1006 {
1007 	// needed to maintain the index
1008 	uint8 oldBuffer[BPLUSTREE_MAX_KEY_LENGTH], *oldData = NULL;
1009 	size_t oldLength = 0;
1010 
1011 	// TODO: we actually depend on that the contents of "buffer" are constant.
1012 	// If they get changed during the write (hey, user programs), we may mess
1013 	// up our index trees!
1014 	// TODO: for attribute files, we need to log the first
1015 	// BPLUSTREE_MAX_KEY_LENGTH bytes of the data stream, or the same as above
1016 	// might happen.
1017 
1018 	Index index(fVolume);
1019 	index.SetTo(name);
1020 
1021 	Inode *attribute = NULL;
1022 	status_t status = B_OK;
1023 
1024 	if (GetAttribute(name, &attribute) < B_OK) {
1025 		// save the old attribute data
1026 		NodeGetter node(fVolume, transaction, this);
1027 		fSmallDataLock.Lock();
1028 
1029 		small_data *smallData = FindSmallData(node.Node(), name);
1030 		if (smallData != NULL) {
1031 			oldLength = smallData->DataSize();
1032 			if (oldLength > 0) {
1033 				if (oldLength > BPLUSTREE_MAX_KEY_LENGTH)
1034 					oldLength = BPLUSTREE_MAX_KEY_LENGTH;
1035 				memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
1036 			}
1037 		}
1038 		fSmallDataLock.Unlock();
1039 
1040 		// if the attribute doesn't exist yet (as a file), try to put it in the
1041 		// small_data section first - if that fails (due to insufficent space),
1042 		// create a real attribute file
1043 		status = _AddSmallData(transaction, node, name, type, buffer, *_length);
1044 		if (status == B_DEVICE_FULL) {
1045 			if (smallData != NULL) {
1046 				// remove the old attribute from the small data section - there
1047 				// is no space left for the new data
1048 				status = _RemoveSmallData(transaction, node, name);
1049 			} else
1050 				status = B_OK;
1051 
1052 			if (status == B_OK)
1053 				status = CreateAttribute(transaction, name, type, &attribute);
1054 			if (status < B_OK)
1055 				RETURN_ERROR(status);
1056 		} else if (status == B_OK)
1057 			status = WriteBack(transaction);
1058 	}
1059 
1060 	if (attribute != NULL) {
1061 		if (attribute->Lock().LockWrite() == B_OK) {
1062 			// Save the old attribute data (if this fails, oldLength will
1063 			// reflect it)
1064 			if (fVolume->CheckForLiveQuery(name) && attribute->Size() > 0) {
1065 				oldLength = BPLUSTREE_MAX_KEY_LENGTH;
1066 				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
1067 					oldData = oldBuffer;
1068 			}
1069 
1070 			// check if the data fits into the small_data section again
1071 			NodeGetter node(fVolume, transaction, this);
1072 			status = _AddSmallData(transaction, node, name, type, buffer,
1073 				*_length);
1074 
1075 			if (status == B_OK) {
1076 				// it does - remove its file
1077 				attribute->Lock().UnlockWrite();
1078 				status = _RemoveAttribute(transaction, name, false, NULL);
1079 			} else {
1080 				// The attribute type might have been changed - we need to
1081 				// adopt the new one
1082 				attribute->Node().type = HOST_ENDIAN_TO_BFS_INT32(type);
1083 				status = attribute->WriteBack(transaction);
1084 				attribute->Lock().UnlockWrite();
1085 
1086 				if (status == B_OK) {
1087 					status = attribute->WriteAt(transaction, pos, buffer,
1088 						_length);
1089 				}
1090 			}
1091 		} else
1092 			status = B_ERROR;
1093 
1094 		ReleaseAttribute(attribute);
1095 	}
1096 
1097 	// TODO: find a better way than this "pos" thing (the begin of the old key
1098 	//	must be copied to the start of the new one for a comparison)
1099 	if (status == B_OK && pos == 0) {
1100 		// index only the first BPLUSTREE_MAX_KEY_LENGTH bytes
1101 		uint16 length = *_length;
1102 		if (length > BPLUSTREE_MAX_KEY_LENGTH)
1103 			length = BPLUSTREE_MAX_KEY_LENGTH;
1104 
1105 		// Update index. Note, Index::Update() may be called even if
1106 		// initializing the index failed - it will just update the live
1107 		// queries in this case
1108 		if (pos < length || pos < oldLength) {
1109 			index.Update(transaction, name, type, oldData, oldLength, buffer,
1110 				length, this);
1111 		}
1112 	}
1113 	return status;
1114 }
1115 
1116 
1117 /*!	Removes the specified attribute from the inode.
1118 	This is a high-level attribute function that understands attributes
1119 	in the small_data section as well as real attribute files.
1120 */
1121 status_t
1122 Inode::RemoveAttribute(Transaction &transaction, const char *name)
1123 {
1124 	Index index(fVolume);
1125 	bool hasIndex = index.SetTo(name) == B_OK;
1126 	NodeGetter node(fVolume, this);
1127 
1128 	// update index for attributes in the small_data section
1129 	{
1130 		fSmallDataLock.Lock();
1131 
1132 		small_data *smallData = FindSmallData(node.Node(), name);
1133 		if (smallData != NULL) {
1134 			uint32 length = smallData->DataSize();
1135 			if (length > BPLUSTREE_MAX_KEY_LENGTH)
1136 				length = BPLUSTREE_MAX_KEY_LENGTH;
1137 			index.Update(transaction, name, smallData->Type(),
1138 				smallData->Data(), length, NULL, 0, this);
1139 		}
1140 		fSmallDataLock.Unlock();
1141 	}
1142 
1143 	status_t status = _RemoveSmallData(transaction, node, name);
1144 	if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
1145 		// remove the attribute file if it exists
1146 		status = _RemoveAttribute(transaction, name, hasIndex, &index);
1147 	}
1148 	return status;
1149 }
1150 
1151 
1152 status_t
1153 Inode::GetAttribute(const char *name, Inode **_attribute)
1154 {
1155 	// does this inode even have attributes?
1156 	if (Attributes().IsZero())
1157 		return B_ENTRY_NOT_FOUND;
1158 
1159 	Vnode vnode(fVolume, Attributes());
1160 	Inode *attributes;
1161 	if (vnode.Get(&attributes) < B_OK) {
1162 		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n",
1163 			name));
1164 		return B_ERROR;
1165 	}
1166 
1167 	BPlusTree *tree;
1168 	status_t status = attributes->GetTree(&tree);
1169 	if (status == B_OK) {
1170 		ino_t id;
1171 		status = tree->Find((uint8 *)name, (uint16)strlen(name), &id);
1172 		if (status == B_OK) {
1173 			Vnode vnode(fVolume, id);
1174 			Inode *inode;
1175 			// Check if the attribute is really an attribute
1176 			if (vnode.Get(&inode) < B_OK || !inode->IsAttribute())
1177 				return B_ERROR;
1178 
1179 			*_attribute = inode;
1180 			vnode.Keep();
1181 			return B_OK;
1182 		}
1183 	}
1184 	return status;
1185 }
1186 
1187 
1188 void
1189 Inode::ReleaseAttribute(Inode *attribute)
1190 {
1191 	if (attribute == NULL)
1192 		return;
1193 
1194 	put_vnode(fVolume->FSVolume(), attribute->ID());
1195 }
1196 
1197 
1198 status_t
1199 Inode::CreateAttribute(Transaction &transaction, const char *name, uint32 type,
1200 	Inode **attribute)
1201 {
1202 	// do we need to create the attribute directory first?
1203 	if (Attributes().IsZero()) {
1204 		status_t status = Inode::Create(transaction, this, NULL,
1205 			S_ATTR_DIR | S_DIRECTORY | 0666, 0, 0, NULL);
1206 		if (status < B_OK)
1207 			RETURN_ERROR(status);
1208 	}
1209 	Vnode vnode(fVolume, Attributes());
1210 	Inode *attributes;
1211 	if (vnode.Get(&attributes) < B_OK)
1212 		return B_ERROR;
1213 
1214 	// Inode::Create() locks the inode for us
1215 	return Inode::Create(transaction, attributes, name,
1216 		S_ATTR | S_FILE | 0666, 0, type, NULL, NULL, attribute);
1217 }
1218 
1219 
1220 //	#pragma mark - directory tree
1221 
1222 
1223 /*!	Gives the caller direct access to the b+tree for a given directory.
1224 	The tree is no longer created on demand, but when the inode is first
1225 	created. That will report any potential errors upfront, saves locking,
1226 	and should work as good (though a bit slower).
1227 */
1228 status_t
1229 Inode::GetTree(BPlusTree **tree)
1230 {
1231 	if (fTree) {
1232 		*tree = fTree;
1233 		return B_OK;
1234 	}
1235 
1236 	RETURN_ERROR(B_BAD_VALUE);
1237 }
1238 
1239 
1240 bool
1241 Inode::IsEmpty()
1242 {
1243 	BPlusTree *tree;
1244 	status_t status = GetTree(&tree);
1245 	if (status < B_OK)
1246 		return status;
1247 
1248 	TreeIterator iterator(tree);
1249 
1250 	// index and attribute directories are really empty when they are
1251 	// empty - directories for standard files always contain ".", and
1252 	// "..", so we need to ignore those two
1253 
1254 	uint32 count = 0;
1255 	char name[BPLUSTREE_MAX_KEY_LENGTH];
1256 	uint16 length;
1257 	ino_t id;
1258 	while (iterator.GetNextEntry(name, &length, B_FILE_NAME_LENGTH,
1259 			&id) == B_OK) {
1260 		if (Mode() & (S_ATTR_DIR | S_INDEX_DIR))
1261 			return false;
1262 
1263 		if (++count > 2 || (strcmp(".", name) && strcmp("..", name)))
1264 			return false;
1265 	}
1266 	return true;
1267 }
1268 
1269 
1270 //	#pragma mark - data stream
1271 
1272 
1273 /*!	Finds the block_run where "pos" is located in the data_stream of
1274 	the inode.
1275 	If successful, "offset" will then be set to the file offset
1276 	of the block_run returned; so "pos - offset" is for the block_run
1277 	what "pos" is for the whole stream.
1278 	The caller has to make sure that "pos" is inside the stream.
1279 */
1280 status_t
1281 Inode::FindBlockRun(off_t pos, block_run &run, off_t &offset)
1282 {
1283 	data_stream *data = &Node().data;
1284 
1285 	// find matching block run
1286 
1287 	if (data->MaxDirectRange() > 0 && pos >= data->MaxDirectRange()) {
1288 		if (data->MaxDoubleIndirectRange() > 0
1289 			&& pos >= data->MaxIndirectRange()) {
1290 			// access to double indirect blocks
1291 
1292 			CachedBlock cached(fVolume);
1293 
1294 			off_t start = pos - data->MaxIndirectRange();
1295 			int32 indirectSize = (1L << (INDIRECT_BLOCKS_SHIFT
1296 					+ cached.BlockShift()))
1297 				* (fVolume->BlockSize() / sizeof(block_run));
1298 			int32 directSize = NUM_ARRAY_BLOCKS << cached.BlockShift();
1299 			int32 index = start / indirectSize;
1300 			int32 runsPerBlock = cached.BlockSize() / sizeof(block_run);
1301 
1302 			block_run *indirect = (block_run *)cached.SetTo(
1303 				fVolume->ToBlock(data->double_indirect) + index / runsPerBlock);
1304 			if (indirect == NULL)
1305 				RETURN_ERROR(B_ERROR);
1306 
1307 			int32 current = (start % indirectSize) / directSize;
1308 
1309 			indirect = (block_run *)cached.SetTo(
1310 				fVolume->ToBlock(indirect[index % runsPerBlock])
1311 				+ current / runsPerBlock);
1312 			if (indirect == NULL)
1313 				RETURN_ERROR(B_ERROR);
1314 
1315 			run = indirect[current % runsPerBlock];
1316 			offset = data->MaxIndirectRange() + (index * indirectSize)
1317 				+ (current * directSize);
1318 		} else {
1319 			// access to indirect blocks
1320 
1321 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1322 			off_t runBlockEnd = data->MaxDirectRange();
1323 
1324 			CachedBlock cached(fVolume);
1325 			off_t block = fVolume->ToBlock(data->indirect);
1326 
1327 			for (int32 i = 0; i < data->indirect.Length(); i++) {
1328 				block_run *indirect = (block_run *)cached.SetTo(block + i);
1329 				if (indirect == NULL)
1330 					RETURN_ERROR(B_IO_ERROR);
1331 
1332 				int32 current = -1;
1333 				while (++current < runsPerBlock) {
1334 					if (indirect[current].IsZero())
1335 						break;
1336 
1337 					runBlockEnd += indirect[current].Length()
1338 						<< cached.BlockShift();
1339 					if (runBlockEnd > pos) {
1340 						run = indirect[current];
1341 						offset = runBlockEnd - (run.Length()
1342 							<< cached.BlockShift());
1343 						return fVolume->ValidateBlockRun(run);
1344 					}
1345 				}
1346 			}
1347 			RETURN_ERROR(B_ERROR);
1348 		}
1349 	} else {
1350 		// access from direct blocks
1351 
1352 		off_t runBlockEnd = 0LL;
1353 		int32 current = -1;
1354 
1355 		while (++current < NUM_DIRECT_BLOCKS) {
1356 			if (data->direct[current].IsZero())
1357 				break;
1358 
1359 			runBlockEnd += data->direct[current].Length()
1360 				<< fVolume->BlockShift();
1361 			if (runBlockEnd > pos) {
1362 				run = data->direct[current];
1363 				offset = runBlockEnd - (run.Length() << fVolume->BlockShift());
1364 				return fVolume->ValidateBlockRun(run);
1365 			}
1366 		}
1367 
1368 		return B_ENTRY_NOT_FOUND;
1369 	}
1370 	return fVolume->ValidateBlockRun(run);
1371 }
1372 
1373 
1374 status_t
1375 Inode::ReadAt(off_t pos, uint8 *buffer, size_t *_length)
1376 {
1377 	size_t length = *_length;
1378 
1379 	// set/check boundaries for pos/length
1380 	if (pos < 0)
1381 		return B_BAD_VALUE;
1382 
1383 	ReadLocked locker(Lock());
1384 
1385 	if (pos >= Size() || length == 0) {
1386 		*_length = 0;
1387 		return B_NO_ERROR;
1388 	}
1389 
1390 	locker.Unlock();
1391 
1392 	return file_cache_read(FileCache(), NULL, pos, buffer, _length);
1393 }
1394 
1395 
1396 status_t
1397 Inode::WriteAt(Transaction &transaction, off_t pos, const uint8 *buffer,
1398 	size_t *_length)
1399 {
1400 	WriteLocked locker(Lock());
1401 	if (locker.IsLocked() < B_OK)
1402 		RETURN_ERROR(B_ERROR);
1403 
1404 	// update the last modification time in memory, it will be written
1405 	// back to the inode, and the index when the file is closed
1406 	// ToDo: should update the internal last modified time only at this point!
1407 	Node().last_modified_time = HOST_ENDIAN_TO_BFS_INT64((bigtime_t)time(NULL)
1408 		<< INODE_TIME_SHIFT);
1409 
1410 	// TODO: support INODE_LOGGED!
1411 
1412 	size_t length = *_length;
1413 	bool changeSize = false;
1414 
1415 	// set/check boundaries for pos/length
1416 	if (pos < 0)
1417 		return B_BAD_VALUE;
1418 
1419 	if (pos + length > Size())
1420 		changeSize = true;
1421 
1422 	locker.Unlock();
1423 
1424 	// the transaction doesn't have to be started already
1425 	if (changeSize && !transaction.IsStarted())
1426 		transaction.Start(fVolume, BlockNumber());
1427 
1428 	locker.Lock();
1429 
1430 	if (pos + length > Size()) {
1431 		// let's grow the data stream to the size needed
1432 		status_t status = SetFileSize(transaction, pos + length);
1433 		if (status < B_OK) {
1434 			*_length = 0;
1435 			RETURN_ERROR(status);
1436 		}
1437 		// TODO: In theory we would need to update the file size
1438 		// index here as part of the current transaction - this might just
1439 		// be a bit too expensive, but worth a try.
1440 
1441 		// we need to write back the inode here because it has to
1442 		// go into this transaction (we cannot wait until the file
1443 		// is closed)
1444 		status = WriteBack(transaction);
1445 		if (status < B_OK)
1446 			return status;
1447 	}
1448 
1449 	// If we don't want to write anything, we can now return (we may
1450 	// just have changed the file size using the position parameter)
1451 	if (length == 0)
1452 		return B_OK;
1453 
1454 	locker.Unlock();
1455 
1456 	return file_cache_write(FileCache(), NULL, pos, buffer, _length);
1457 }
1458 
1459 
1460 /*!	Fills the gap between the old file size and the new file size
1461 	with zeros.
1462 	It's more or less a copy of Inode::WriteAt() but it can handle
1463 	length differences of more than just 4 GB, and it never uses
1464 	the log, even if the INODE_LOGGED flag is set.
1465 */
1466 status_t
1467 Inode::FillGapWithZeros(off_t pos, off_t newSize)
1468 {
1469 	while (pos < newSize) {
1470 		size_t size;
1471 		if (newSize > pos + 1024 * 1024 * 1024)
1472 			size = 1024 * 1024 * 1024;
1473 		else
1474 			size = newSize - pos;
1475 
1476 		status_t status = file_cache_write(FileCache(), NULL, pos, NULL, &size);
1477 		if (status < B_OK)
1478 			return status;
1479 
1480 		pos += size;
1481 	}
1482 
1483 	return B_OK;
1484 }
1485 
1486 
1487 /*!	Allocates NUM_ARRAY_BLOCKS blocks, and clears their contents. Growing
1488 	the indirect and double indirect range uses this method.
1489 	The allocated block_run is saved in "run"
1490 */
1491 status_t
1492 Inode::_AllocateBlockArray(Transaction &transaction, block_run &run)
1493 {
1494 	if (!run.IsZero())
1495 		return B_BAD_VALUE;
1496 
1497 	status_t status = fVolume->Allocate(transaction, this, NUM_ARRAY_BLOCKS,
1498 		run, NUM_ARRAY_BLOCKS);
1499 	if (status < B_OK)
1500 		return status;
1501 
1502 	// make sure those blocks are empty
1503 	CachedBlock cached(fVolume);
1504 	off_t block = fVolume->ToBlock(run);
1505 
1506 	for (int32 i = 0; i < run.Length(); i++) {
1507 		block_run *runs = (block_run *)cached.SetToWritable(transaction,
1508 			block + i, true);
1509 		if (runs == NULL)
1510 			return B_IO_ERROR;
1511 	}
1512 	return B_OK;
1513 }
1514 
1515 
1516 status_t
1517 Inode::_GrowStream(Transaction &transaction, off_t size)
1518 {
1519 	data_stream *data = &Node().data;
1520 
1521 	// is the data stream already large enough to hold the new size?
1522 	// (can be the case with preallocated blocks)
1523 	if (size < data->MaxDirectRange()
1524 		|| size < data->MaxIndirectRange()
1525 		|| size < data->MaxDoubleIndirectRange()) {
1526 		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1527 		return B_OK;
1528 	}
1529 
1530 	// how many bytes are still needed? (unused ranges are always zero)
1531 	uint16 minimum = 1;
1532 	off_t bytes;
1533 	if (data->Size() < data->MaxDoubleIndirectRange()) {
1534 		bytes = size - data->MaxDoubleIndirectRange();
1535 		// The double indirect range can only handle multiple of
1536 		// NUM_ARRAY_BLOCKS
1537 		minimum = NUM_ARRAY_BLOCKS;
1538 	} else if (data->Size() < data->MaxIndirectRange())
1539 		bytes = size - data->MaxIndirectRange();
1540 	else if (data->Size() < data->MaxDirectRange())
1541 		bytes = size - data->MaxDirectRange();
1542 	else
1543 		bytes = size - data->Size();
1544 
1545 	// do we have enough free blocks on the disk?
1546 	off_t blocksRequested = (bytes + fVolume->BlockSize() - 1)
1547 		>> fVolume->BlockShift();
1548 	if (blocksRequested > fVolume->FreeBlocks())
1549 		return B_DEVICE_FULL;
1550 
1551 	off_t blocksNeeded = blocksRequested;
1552 		// because of preallocations and partial allocations, the number of
1553 		// blocks we need to allocate may be different from the one we request
1554 		// from the block allocator
1555 
1556 	// Should we preallocate some blocks (currently, always 64k)?
1557 	// Attributes, attribute directories, and long symlinks usually won't get
1558 	// that big, and should stay close to the inode - preallocating could be
1559 	// counterproductive.
1560 	// Also, if free disk space is tight, we probably don't want to do this as
1561 	// well.
1562 	if (!IsAttribute() && !IsAttributeDirectory() && !IsSymLink()
1563 		&& blocksRequested < (65536 >> fVolume->BlockShift())
1564 		&& fVolume->FreeBlocks() > 128)
1565 		blocksRequested = 65536 >> fVolume->BlockShift();
1566 
1567 	while (blocksNeeded > 0) {
1568 		// the requested blocks do not need to be returned with a
1569 		// single allocation, so we need to iterate until we have
1570 		// enough blocks allocated
1571 		block_run run;
1572 		status_t status = fVolume->Allocate(transaction, this, blocksRequested,
1573 			run, minimum);
1574 		if (status < B_OK)
1575 			return status;
1576 
1577 		// okay, we have the needed blocks, so just distribute them to the
1578 		// different ranges of the stream (direct, indirect & double indirect)
1579 
1580 		// TODO: if anything goes wrong here, we probably want to free the
1581 		// blocks that couldn't be distributed into the stream!
1582 
1583 		blocksNeeded -= run.Length();
1584 		// don't preallocate if the first allocation was already too small
1585 		blocksRequested = blocksNeeded;
1586 		if (minimum > 1) {
1587 			// make sure that "blocks" is a multiple of minimum
1588 			blocksRequested = (blocksRequested + minimum - 1) & ~(minimum - 1);
1589 		}
1590 
1591 		// Direct block range
1592 
1593 		if (data->Size() <= data->MaxDirectRange()) {
1594 			// let's try to put them into the direct block range
1595 			int32 free = 0;
1596 			for (; free < NUM_DIRECT_BLOCKS; free++)
1597 				if (data->direct[free].IsZero())
1598 					break;
1599 
1600 			if (free < NUM_DIRECT_BLOCKS) {
1601 				// can we merge the last allocated run with the new one?
1602 				int32 last = free - 1;
1603 				if (free > 0 && data->direct[last].MergeableWith(run)) {
1604 					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(
1605 						data->direct[last].Length() + run.Length());
1606 				} else
1607 					data->direct[free] = run;
1608 
1609 				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(
1610 					data->MaxDirectRange()
1611 					+ run.Length() * fVolume->BlockSize());
1612 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1613 					? data->max_direct_range : size);
1614 				continue;
1615 			}
1616 		}
1617 
1618 		// Indirect block range
1619 
1620 		if (data->Size() <= data->MaxIndirectRange()
1621 			|| !data->MaxIndirectRange()) {
1622 			CachedBlock cached(fVolume);
1623 			block_run *runs = NULL;
1624 			uint32 free = 0;
1625 			off_t block;
1626 
1627 			// if there is no indirect block yet, create one
1628 			if (data->indirect.IsZero()) {
1629 				status = _AllocateBlockArray(transaction, data->indirect);
1630 				if (status < B_OK)
1631 					return status;
1632 
1633 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1634 					data->MaxDirectRange());
1635 				// insert the block_run in the first block
1636 				runs = (block_run *)cached.SetTo(data->indirect);
1637 			} else {
1638 				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1639 				block = fVolume->ToBlock(data->indirect);
1640 
1641 				// search first empty entry
1642 				int32 i = 0;
1643 				for (; i < data->indirect.Length(); i++) {
1644 					if ((runs = (block_run *)cached.SetTo(block + i)) == NULL)
1645 						return B_IO_ERROR;
1646 
1647 					for (free = 0; free < numberOfRuns; free++)
1648 						if (runs[free].IsZero())
1649 							break;
1650 
1651 					if (free < numberOfRuns)
1652 						break;
1653 				}
1654 				if (i == data->indirect.Length())
1655 					runs = NULL;
1656 			}
1657 
1658 			if (runs != NULL) {
1659 				// try to insert the run to the last one - note that this
1660 				// doesn't take block borders into account, so it could be
1661 				// further optimized
1662 				cached.MakeWritable(transaction);
1663 
1664 				int32 last = free - 1;
1665 				if (free > 0 && runs[last].MergeableWith(run)) {
1666 					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(
1667 						runs[last].Length() + run.Length());
1668 				} else
1669 					runs[free] = run;
1670 
1671 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1672 					data->MaxIndirectRange()
1673 					+ (run.Length() << fVolume->BlockShift()));
1674 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1675 					? data->MaxIndirectRange() : size);
1676 				continue;
1677 			}
1678 		}
1679 
1680 		// Double indirect block range
1681 
1682 		if (data->Size() <= data->MaxDoubleIndirectRange()
1683 			|| !data->max_double_indirect_range) {
1684 			while ((run.Length() % NUM_ARRAY_BLOCKS) != 0) {
1685 				// The number of allocated blocks isn't a multiple of
1686 				// NUM_ARRAY_BLOCKS, so we have to change this. This can happen
1687 				// the first time the stream grows into the double
1688 				// indirect range.
1689 				// First, free the remaining blocks that don't fit into a
1690 				// multiple of NUM_ARRAY_BLOCKS
1691 				int32 rest = run.Length() % NUM_ARRAY_BLOCKS;
1692 				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1693 
1694 				status = fVolume->Free(transaction,
1695 					block_run::Run(run.AllocationGroup(),
1696 					run.Start() + run.Length(), rest));
1697 				if (status < B_OK)
1698 					return status;
1699 
1700 				blocksNeeded += rest;
1701 				blocksRequested = (blocksNeeded + NUM_ARRAY_BLOCKS - 1)
1702 					& ~(NUM_ARRAY_BLOCKS - 1);
1703 				minimum = NUM_ARRAY_BLOCKS;
1704 					// we make sure here that we have at minimum
1705 					// NUM_ARRAY_BLOCKS allocated, so if the allocation
1706 					// succeeds, we don't run into an endless loop
1707 
1708 				// Are there any blocks left in the run? If not, allocate
1709 				// a new one
1710 				if (run.length == 0)
1711 					continue;
1712 			}
1713 
1714 			// if there is no double indirect block yet, create one
1715 			if (data->double_indirect.IsZero()) {
1716 				status = _AllocateBlockArray(transaction,
1717 					data->double_indirect);
1718 				if (status < B_OK)
1719 					return status;
1720 
1721 				data->max_double_indirect_range = data->max_indirect_range;
1722 			}
1723 
1724 			// calculate the index where to insert the new blocks
1725 
1726 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1727 			int32 indirectSize = ((1L << INDIRECT_BLOCKS_SHIFT)
1728 				<< fVolume->BlockShift()) * runsPerBlock;
1729 			int32 directSize = NUM_ARRAY_BLOCKS << fVolume->BlockShift();
1730 			int32 runsPerArray = runsPerBlock << ARRAY_BLOCKS_SHIFT;
1731 
1732 			off_t start = data->MaxDoubleIndirectRange()
1733 				- data->MaxIndirectRange();
1734 			int32 indirectIndex = start / indirectSize;
1735 			int32 index = start / directSize;
1736 
1737 			// distribute the blocks to the array and allocate
1738 			// new array blocks when needed
1739 
1740 			CachedBlock cached(fVolume);
1741 			CachedBlock cachedDirect(fVolume);
1742 			block_run *array = NULL;
1743 			uint32 runLength = run.Length();
1744 
1745 			while (run.length != 0) {
1746 				// get the indirect array block
1747 				if (array == NULL) {
1748 					array = (block_run *)cached.SetTo(fVolume->ToBlock(
1749 						data->double_indirect) + indirectIndex / runsPerBlock);
1750 					if (array == NULL)
1751 						return B_IO_ERROR;
1752 				}
1753 
1754 				do {
1755 					// do we need a new array block?
1756 					if (array[indirectIndex % runsPerBlock].IsZero()) {
1757 						cached.MakeWritable(transaction);
1758 
1759 						status = _AllocateBlockArray(transaction,
1760 							array[indirectIndex % runsPerBlock]);
1761 						if (status < B_OK)
1762 							return status;
1763 					}
1764 
1765 					block_run *runs = (block_run *)cachedDirect.SetToWritable(
1766 						transaction, fVolume->ToBlock(array[indirectIndex
1767 							% runsPerBlock]) + index / runsPerBlock);
1768 					if (runs == NULL)
1769 						return B_IO_ERROR;
1770 
1771 					do {
1772 						// insert the block_run into the array
1773 						runs[index % runsPerBlock] = run;
1774 						runs[index % runsPerBlock].length
1775 							= HOST_ENDIAN_TO_BFS_INT16(NUM_ARRAY_BLOCKS);
1776 
1777 						// alter the remaining block_run
1778 						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start()
1779 							+ NUM_ARRAY_BLOCKS);
1780 						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
1781 							- NUM_ARRAY_BLOCKS);
1782 					} while ((++index % runsPerBlock) != 0 && run.length);
1783 				} while ((index % runsPerArray) != 0 && run.length);
1784 
1785 				if (++indirectIndex % runsPerBlock == 0) {
1786 					array = NULL;
1787 					index = 0;
1788 				}
1789 			}
1790 
1791 			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1792 				data->MaxDoubleIndirectRange()
1793 				+ (runLength << fVolume->BlockShift()));
1794 			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(
1795 				data->max_double_indirect_range) : size;
1796 
1797 			continue;
1798 		}
1799 
1800 		RETURN_ERROR(EFBIG);
1801 	}
1802 	// update the size of the data stream
1803 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1804 
1805 	return B_OK;
1806 }
1807 
1808 
1809 status_t
1810 Inode::_FreeStaticStreamArray(Transaction &transaction, int32 level,
1811 	block_run run, off_t size, off_t offset, off_t &max)
1812 {
1813 	int32 indirectSize = 0;
1814 	if (level == 0)
1815 		indirectSize = (1L << (INDIRECT_BLOCKS_SHIFT + fVolume->BlockShift()))
1816 			* (fVolume->BlockSize() / sizeof(block_run));
1817 	else if (level == 1)
1818 		indirectSize = 4 << fVolume->BlockShift();
1819 
1820 	off_t start;
1821 	if (size > offset)
1822 		start = size - offset;
1823 	else
1824 		start = 0;
1825 
1826 	int32 index = start / indirectSize;
1827 	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1828 
1829 	CachedBlock cached(fVolume);
1830 	off_t blockNumber = fVolume->ToBlock(run);
1831 
1832 	// set the file offset to the current block run
1833 	offset += (off_t)index * indirectSize;
1834 
1835 	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
1836 		block_run *array = (block_run *)cached.SetToWritable(transaction,
1837 			blockNumber + i);
1838 		if (array == NULL)
1839 			RETURN_ERROR(B_ERROR);
1840 
1841 		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
1842 			if (array[index].IsZero()) {
1843 				// we also want to break out of the outer loop
1844 				i = run.Length();
1845 				break;
1846 			}
1847 
1848 			status_t status = B_OK;
1849 			if (level == 0) {
1850 				status = _FreeStaticStreamArray(transaction, 1, array[index],
1851 					size, offset, max);
1852 			} else if (offset >= size)
1853 				status = fVolume->Free(transaction, array[index]);
1854 			else
1855 				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
1856 
1857 			if (status < B_OK)
1858 				RETURN_ERROR(status);
1859 
1860 			if (offset >= size)
1861 				array[index].SetTo(0, 0, 0);
1862 
1863 			offset += indirectSize;
1864 		}
1865 		index = 0;
1866 	}
1867 	return B_OK;
1868 }
1869 
1870 
1871 /*!	Frees all block_runs in the array which come after the specified size.
1872 	It also trims the last block_run that contain the size.
1873 	"offset" and "max" are maintained until the last block_run that doesn't
1874 	have to be freed - after this, the values won't be correct anymore, but
1875 	will still assure correct function for all subsequent calls.
1876 	"max" is considered to be in file system byte order.
1877 */
1878 status_t
1879 Inode::_FreeStreamArray(Transaction &transaction, block_run *array,
1880 	uint32 arrayLength, off_t size, off_t &offset, off_t &max)
1881 {
1882 	PRINT(("FreeStreamArray: arrayLength %lu, size %Ld, offset %Ld, max %Ld\n",
1883 		arrayLength, size, offset, max));
1884 
1885 	off_t newOffset = offset;
1886 	uint32 i = 0;
1887 	for (; i < arrayLength; i++, offset = newOffset) {
1888 		if (array[i].IsZero())
1889 			break;
1890 
1891 		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
1892 		if (newOffset <= size)
1893 			continue;
1894 
1895 		block_run run = array[i];
1896 
1897 		// determine the block_run to be freed
1898 		if (newOffset > size && offset < size) {
1899 			// free partial block_run (and update the original block_run)
1900 			run.start = HOST_ENDIAN_TO_BFS_INT16(array[i].Start()
1901 				+ ((size + fVolume->BlockSize() - 1 - offset)
1902 					>> fVolume->BlockShift()));
1903 				// round to next block
1904 			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start()
1905 				- array[i].Start());
1906 			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
1907 				- array[i].Length());
1908 
1909 			if (run.length == 0)
1910 				continue;
1911 
1912 			// update maximum range
1913 			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length()
1914 				<< fVolume->BlockShift()));
1915 		} else {
1916 			// free the whole block_run
1917 			array[i].SetTo(0, 0, 0);
1918 
1919 			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
1920 				max = HOST_ENDIAN_TO_BFS_INT64(offset);
1921 		}
1922 
1923 		if (fVolume->Free(transaction, run) < B_OK)
1924 			return B_IO_ERROR;
1925 	}
1926 	return B_OK;
1927 }
1928 
1929 
1930 status_t
1931 Inode::_ShrinkStream(Transaction &transaction, off_t size)
1932 {
1933 	data_stream *data = &Node().data;
1934 	status_t status;
1935 
1936 	if (data->MaxDoubleIndirectRange() > size) {
1937 		off_t *maxDoubleIndirect = &data->max_double_indirect_range;
1938 			// gcc 4 work-around: "error: cannot bind packed field
1939 			// 'data->data_stream::max_double_indirect_range' to 'off_t&'"
1940 		status = _FreeStaticStreamArray(transaction, 0, data->double_indirect,
1941 			size, data->MaxIndirectRange(), *maxDoubleIndirect);
1942 		if (status < B_OK)
1943 			return status;
1944 
1945 		if (size <= data->MaxIndirectRange()) {
1946 			fVolume->Free(transaction, data->double_indirect);
1947 			data->double_indirect.SetTo(0, 0, 0);
1948 			data->max_double_indirect_range = 0;
1949 		}
1950 	}
1951 
1952 	if (data->MaxIndirectRange() > size) {
1953 		CachedBlock cached(fVolume);
1954 		off_t block = fVolume->ToBlock(data->indirect);
1955 		off_t offset = data->MaxDirectRange();
1956 
1957 		for (int32 i = 0; i < data->indirect.Length(); i++) {
1958 			block_run *array = (block_run *)cached.SetToWritable(transaction,
1959 				block + i);
1960 			if (array == NULL)
1961 				break;
1962 
1963 			off_t *maxIndirect = &data->max_indirect_range;
1964 				// gcc 4 work-around: "error: cannot bind packed field
1965 				// 'data->data_stream::max_indirect_range' to 'off_t&'"
1966 			if (_FreeStreamArray(transaction, array, fVolume->BlockSize()
1967 					/ sizeof(block_run), size, offset, *maxIndirect) != B_OK)
1968 				return B_IO_ERROR;
1969 		}
1970 		if (data->max_direct_range == data->max_indirect_range) {
1971 			fVolume->Free(transaction, data->indirect);
1972 			data->indirect.SetTo(0, 0, 0);
1973 			data->max_indirect_range = 0;
1974 		}
1975 	}
1976 
1977 	if (data->MaxDirectRange() > size) {
1978 		off_t offset = 0;
1979 		off_t *maxDirect = &data->max_direct_range;
1980 			// gcc 4 work-around: "error: cannot bind packed field
1981 			// 'data->data_stream::max_direct_range' to 'off_t&'"
1982 		status = _FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS,
1983 			size, offset, *maxDirect);
1984 		if (status < B_OK)
1985 			return status;
1986 	}
1987 
1988 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1989 	return B_OK;
1990 }
1991 
1992 
1993 status_t
1994 Inode::SetFileSize(Transaction &transaction, off_t size)
1995 {
1996 	if (size < 0)
1997 		return B_BAD_VALUE;
1998 
1999 	off_t oldSize = Size();
2000 
2001 	if (size == oldSize)
2002 		return B_OK;
2003 
2004 	T(Resize(this, oldSize, size, false));
2005 
2006 	// should the data stream grow or shrink?
2007 	status_t status;
2008 	if (size > oldSize) {
2009 		status = _GrowStream(transaction, size);
2010 		if (status < B_OK) {
2011 			// if the growing of the stream fails, the whole operation
2012 			// fails, so we should shrink the stream to its former size
2013 			_ShrinkStream(transaction, oldSize);
2014 		}
2015 	} else
2016 		status = _ShrinkStream(transaction, size);
2017 
2018 	if (status < B_OK)
2019 		return status;
2020 
2021 	file_cache_set_size(FileCache(), size);
2022 	file_map_set_size(Map(), size);
2023 
2024 	return WriteBack(transaction);
2025 }
2026 
2027 
2028 status_t
2029 Inode::Append(Transaction &transaction, off_t bytes)
2030 {
2031 	return SetFileSize(transaction, Size() + bytes);
2032 }
2033 
2034 
2035 /*!	Checks wether or not this inode's data stream needs to be trimmed
2036 	because of an earlier preallocation.
2037 	Returns true if there are any blocks to be trimmed.
2038 */
2039 bool
2040 Inode::NeedsTrimming()
2041 {
2042 	// We never trim preallocated index blocks to make them grow as smooth as
2043 	// possible. There are only few indices anyway, so this doesn't hurt.
2044 	// Also, if an inode is already in deleted state, we don't bother trimming
2045 	// it.
2046 	if (IsIndex() || IsDeleted())
2047 		return false;
2048 
2049 	off_t roundedSize = (Size() + fVolume->BlockSize() - 1)
2050 		& ~(fVolume->BlockSize() - 1);
2051 
2052 	return Node().data.MaxDirectRange() > roundedSize
2053 		|| Node().data.MaxIndirectRange() > roundedSize
2054 		|| Node().data.MaxDoubleIndirectRange() > roundedSize;
2055 }
2056 
2057 
2058 status_t
2059 Inode::TrimPreallocation(Transaction &transaction)
2060 {
2061 	T(Resize(this, max_c(Node().data.MaxDirectRange(),
2062 		Node().data.MaxIndirectRange()), Size(), true));
2063 
2064 	status_t status = _ShrinkStream(transaction, Size());
2065 	if (status < B_OK)
2066 		return status;
2067 
2068 	return WriteBack(transaction);
2069 }
2070 
2071 
2072 //!	Frees the file's data stream and removes all attributes
2073 status_t
2074 Inode::Free(Transaction &transaction)
2075 {
2076 	FUNCTION();
2077 
2078 	// Perhaps there should be an implementation of Inode::ShrinkStream() that
2079 	// just frees the data_stream, but doesn't change the inode (since it is
2080 	// freed anyway) - that would make an undelete command possible
2081 	status_t status = SetFileSize(transaction, 0);
2082 	if (status < B_OK)
2083 		return status;
2084 
2085 	// Free all attributes, and remove their indices
2086 	{
2087 		// We have to limit the scope of AttributeIterator, so that its
2088 		// destructor is not called after the inode is deleted
2089 		AttributeIterator iterator(this);
2090 
2091 		char name[B_FILE_NAME_LENGTH];
2092 		uint32 type;
2093 		size_t length;
2094 		ino_t id;
2095 		while ((status = iterator.GetNext(name, &length, &type, &id)) == B_OK) {
2096 			RemoveAttribute(transaction, name);
2097 		}
2098 	}
2099 
2100 	if (WriteBack(transaction) < B_OK)
2101 		return B_IO_ERROR;
2102 
2103 	return fVolume->Free(transaction, BlockRun());
2104 }
2105 
2106 
2107 //!	Synchronizes (writes back to disk) the file stream of the inode.
2108 status_t
2109 Inode::Sync()
2110 {
2111 	if (FileCache())
2112 		return file_cache_sync(FileCache());
2113 
2114 	// We may also want to flush the attribute's data stream to
2115 	// disk here... (do we?)
2116 
2117 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0)
2118 		return B_OK;
2119 
2120 	data_stream *data = &Node().data;
2121 	status_t status = B_OK;
2122 
2123 	// flush direct range
2124 
2125 	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
2126 		if (data->direct[i].IsZero())
2127 			return B_OK;
2128 
2129 		status = block_cache_sync_etc(fVolume->BlockCache(),
2130 			fVolume->ToBlock(data->direct[i]), data->direct[i].Length());
2131 		if (status != B_OK)
2132 			return status;
2133 	}
2134 
2135 	// flush indirect range
2136 
2137 	if (data->max_indirect_range == 0)
2138 		return B_OK;
2139 
2140 	CachedBlock cached(fVolume);
2141 	off_t block = fVolume->ToBlock(data->indirect);
2142 	int32 count = fVolume->BlockSize() / sizeof(block_run);
2143 
2144 	for (int32 j = 0; j < data->indirect.Length(); j++) {
2145 		block_run *runs = (block_run *)cached.SetTo(block + j);
2146 		if (runs == NULL)
2147 			break;
2148 
2149 		for (int32 i = 0; i < count; i++) {
2150 			if (runs[i].IsZero())
2151 				return B_OK;
2152 
2153 			status = block_cache_sync_etc(fVolume->BlockCache(),
2154 				fVolume->ToBlock(runs[i]), runs[i].Length());
2155 			if (status != B_OK)
2156 				return status;
2157 		}
2158 	}
2159 
2160 	// flush double indirect range
2161 
2162 	if (data->max_double_indirect_range == 0)
2163 		return B_OK;
2164 
2165 	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
2166 
2167 	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
2168 		block_run *indirectRuns = (block_run *)cached.SetTo(indirectBlock + l);
2169 		if (indirectRuns == NULL)
2170 			return B_FILE_ERROR;
2171 
2172 		CachedBlock directCached(fVolume);
2173 
2174 		for (int32 k = 0; k < count; k++) {
2175 			if (indirectRuns[k].IsZero())
2176 				return B_OK;
2177 
2178 			block = fVolume->ToBlock(indirectRuns[k]);
2179 			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
2180 				block_run *runs = (block_run *)directCached.SetTo(block + j);
2181 				if (runs == NULL)
2182 					return B_FILE_ERROR;
2183 
2184 				for (int32 i = 0; i < count; i++) {
2185 					if (runs[i].IsZero())
2186 						return B_OK;
2187 
2188 					// TODO: combine single block_runs to bigger ones when
2189 					// they are adjacent
2190 					status = block_cache_sync_etc(fVolume->BlockCache(),
2191 						fVolume->ToBlock(runs[i]), runs[i].Length());
2192 					if (status != B_OK)
2193 						return status;
2194 				}
2195 			}
2196 		}
2197 	}
2198 	return B_OK;
2199 }
2200 
2201 
2202 //	#pragma mark - creation/deletion
2203 
2204 
2205 status_t
2206 Inode::Remove(Transaction &transaction, const char *name, ino_t *_id,
2207 	bool isDirectory)
2208 {
2209 	BPlusTree *tree;
2210 	if (GetTree(&tree) != B_OK)
2211 		RETURN_ERROR(B_BAD_VALUE);
2212 
2213 	RecursiveLocker locker(fVolume->Lock());
2214 
2215 	// does the file even exist?
2216 	off_t id;
2217 	if (tree->Find((uint8 *)name, (uint16)strlen(name), &id) < B_OK)
2218 		return B_ENTRY_NOT_FOUND;
2219 
2220 	if (_id)
2221 		*_id = id;
2222 
2223 	Vnode vnode(fVolume, id);
2224 	Inode *inode;
2225 	status_t status = vnode.Get(&inode);
2226 	if (status < B_OK) {
2227 		REPORT_ERROR(status);
2228 		return B_ENTRY_NOT_FOUND;
2229 	}
2230 
2231 	T(Remove(inode, name));
2232 
2233 	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
2234 	// bit is set for indices in BFS, not for attribute directories) - but you
2235 	// should really be able to do whatever you want with your indices
2236 	// without having to remove all files first :)
2237 	if (!inode->IsIndex()) {
2238 		// if it's not of the correct type, don't delete it!
2239 		if (inode->IsContainer() != isDirectory)
2240 			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
2241 
2242 		// only delete empty directories
2243 		if (isDirectory && !inode->IsEmpty())
2244 			return B_DIRECTORY_NOT_EMPTY;
2245 	}
2246 
2247 	// remove_vnode() allows the inode to be accessed until the last put_vnode()
2248 	status = remove_vnode(fVolume->FSVolume(), id);
2249 	if (status != B_OK)
2250 		return status;
2251 
2252 	if (tree->Remove(transaction, name, id) < B_OK) {
2253 		unremove_vnode(fVolume->FSVolume(), id);
2254 		RETURN_ERROR(B_ERROR);
2255 	}
2256 
2257 #ifdef DEBUG
2258 	if (tree->Find((uint8 *)name, (uint16)strlen(name), &id) == B_OK) {
2259 		DIE(("deleted entry still there"));
2260 	}
2261 #endif
2262 
2263 	// update the inode, so that no one will ever doubt it's deleted :-)
2264 	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
2265 	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
2266 
2267 	// In balance to the Inode::Create() method, the main indices
2268 	// are updated here (name, size, & last_modified)
2269 
2270 	Index index(fVolume);
2271 	if (inode->IsRegularNode()) {
2272 		index.RemoveName(transaction, name, inode);
2273 			// If removing from the index fails, it is not regarded as a
2274 			// fatal error and will not be reported back!
2275 			// Deleted inodes won't be visible in queries anyway.
2276 	}
2277 
2278 	if (inode->IsFile() || inode->IsSymLink()) {
2279 		if (inode->IsFile())
2280 			index.RemoveSize(transaction, inode);
2281 		index.RemoveLastModified(transaction, inode);
2282 	}
2283 
2284 	return inode->WriteBack(transaction);
2285 }
2286 
2287 
2288 /*!	Creates the inode with the specified parent directory, and automatically
2289 	adds the created inode to that parent directory. If an attribute directory
2290 	is created, it will also automatically  be added to the parent inode as
2291 	such. However, the indices root node, and the regular root node won't be
2292 	added to the super block.
2293 	It will also create the initial B+tree for the inode if it's a directory
2294 	of any kind.
2295 	If the "_id" or "_inode" variable is given and non-NULL to store the
2296 	inode's ID, the inode stays locked - you have to call put_vnode() if you
2297 	don't use it anymore.
2298 	If the node already exists, this method will fail if O_EXCL is set, or it's
2299 	a directory or a symlink. Otherwise, it will just be returned. If O_TRUNC
2300 	has been specified, the file will also be truncated.
2301 */
2302 status_t
2303 Inode::Create(Transaction &transaction, Inode *parent, const char *name,
2304 	int32 mode, int openMode, uint32 type, bool *_created, ino_t *_id,
2305 	Inode **_inode)
2306 {
2307 	FUNCTION_START(("name = %s, mode = %ld\n", name, mode));
2308 
2309 	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
2310 	Volume *volume = transaction.GetVolume();
2311 	BPlusTree *tree = NULL;
2312 
2313 	if (parent && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
2314 		// check if the file already exists in the directory
2315 		if (parent->GetTree(&tree) != B_OK)
2316 			RETURN_ERROR(B_BAD_VALUE);
2317 	}
2318 
2319 	WriteLocked locker(parent != NULL ? &parent->Lock() : NULL);
2320 		// the parent directory is locked during the whole inode creation
2321 
2322 	if (tree != NULL) {
2323 		// Does the file already exist?
2324 		off_t offset;
2325 		if (tree->Find((uint8 *)name, (uint16)strlen(name), &offset) == B_OK) {
2326 			// Return if the file should be a directory/link or opened in
2327 			// exclusive mode
2328 			if (S_ISDIR(mode) || S_ISLNK(mode) || openMode & O_EXCL)
2329 				return B_FILE_EXISTS;
2330 
2331 			Vnode vnode(volume, offset);
2332 			Inode *inode;
2333 			status_t status = vnode.Get(&inode);
2334 			if (status < B_OK) {
2335 				REPORT_ERROR(status);
2336 				return B_ENTRY_NOT_FOUND;
2337 			}
2338 
2339 			// if it's a directory, bail out!
2340 			if (inode->IsDirectory())
2341 				return B_IS_A_DIRECTORY;
2342 
2343 			// we want to open the file, so we should have the rights to do so
2344 			if (inode->CheckPermissions(openModeToAccess(openMode)) != B_OK)
2345 				return B_NOT_ALLOWED;
2346 
2347 			if (openMode & O_TRUNC) {
2348 				// we need write access in order to truncate the file
2349 				status = inode->CheckPermissions(W_OK);
2350 				if (status != B_OK)
2351 					return status;
2352 
2353 				// truncate the existing file
2354 				WriteLocked locked(inode->Lock());
2355 
2356 				status_t status = inode->SetFileSize(transaction, 0);
2357 				if (status >= B_OK)
2358 					status = inode->WriteBack(transaction);
2359 
2360 				if (status < B_OK)
2361 					return status;
2362 			}
2363 
2364 			if (_created)
2365 				*_created = false;
2366 			if (_id)
2367 				*_id = inode->ID();
2368 			if (_inode)
2369 				*_inode = inode;
2370 
2371 			// Only keep the vnode in memory if the _id or _inode pointer is
2372 			// provided
2373 			if (_id != NULL || _inode != NULL)
2374 				vnode.Keep();
2375 
2376 			return B_OK;
2377 		}
2378 	} else if (parent && (mode & S_ATTR_DIR) == 0)
2379 		return B_BAD_VALUE;
2380 
2381 	status_t status;
2382 
2383 	// do we have the power to create new files at all?
2384 	if (parent != NULL && (status = parent->CheckPermissions(W_OK)) != B_OK)
2385 		RETURN_ERROR(status);
2386 
2387 	// allocate space for the new inode
2388 	InodeAllocator allocator(transaction);
2389 	block_run run;
2390 	Inode *inode;
2391 	status = allocator.New(&parentRun, mode, run, &inode);
2392 	if (status < B_OK)
2393 		return status;
2394 
2395 	T(Create(inode, parent, name, mode, openMode, type));
2396 
2397 	// Initialize the parts of the bfs_inode structure that
2398 	// InodeAllocator::New() hasn't touched yet
2399 
2400 	bfs_inode *node = &inode->Node();
2401 
2402 	if (parent == NULL) {
2403 		// we set the parent to itself in this case
2404 		// (only happens for the root and indices node)
2405 		node->parent = run;
2406 	} else
2407 		node->parent = parentRun;
2408 
2409 	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
2410 	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent
2411 		? parent->Node().GroupID() : getegid());
2412 		// the group ID is inherited from the parent, if available
2413 
2414 	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
2415 
2416 	inode->WriteBack(transaction);
2417 		// make sure the initialized node is available to others
2418 
2419 	// only add the name to regular files, directories, or symlinks
2420 	// don't add it to attributes, or indices
2421 	if (tree && inode->IsRegularNode()
2422 		&& inode->SetName(transaction, name) < B_OK)
2423 		return B_ERROR;
2424 
2425 	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2426 	// a standard directory for files - not for attributes or indices)
2427 	if (inode->IsContainer()) {
2428 		status = allocator.CreateTree();
2429 		if (status < B_OK)
2430 			return status;
2431 	}
2432 
2433 	// Add a link to the inode from the parent, depending on its type
2434 	// (the vnode is not published yet, so it is safe to make the inode
2435 	// accessable to the file system here)
2436 	if (tree) {
2437 		status = tree->Insert(transaction, name, inode->ID());
2438 	} else if (parent && (mode & S_ATTR_DIR) != 0) {
2439 		parent->Attributes() = run;
2440 		status = parent->WriteBack(transaction);
2441 	}
2442 
2443 	// Note, we only care if the inode could be made accessable for the
2444 	// two cases above; the root node or the indices root node must
2445 	// handle this case on their own (or other cases where "parent" is
2446 	// NULL)
2447 	if (status < B_OK)
2448 		RETURN_ERROR(status);
2449 
2450 	// Update the main indices (name, size & last_modified)
2451 	// (live queries might want to access us after this)
2452 
2453 	Index index(volume);
2454 	if (inode->IsRegularNode() && name != NULL) {
2455 		// the name index only contains regular files
2456 		// (but not the root node where name == NULL)
2457 		status = index.InsertName(transaction, name, inode);
2458 		if (status < B_OK && status != B_BAD_INDEX) {
2459 			// We have to remove the node from the parent at this point,
2460 			// because the InodeAllocator destructor can't handle this
2461 			// case (and if it fails, we can't do anything about it...)
2462 			if (tree)
2463 				tree->Remove(transaction, name, inode->ID());
2464 			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2465 				parent->Node().attributes.SetTo(0, 0, 0);
2466 
2467 			RETURN_ERROR(status);
2468 		}
2469 	}
2470 
2471 	inode->UpdateOldLastModified();
2472 
2473 	// The "size" & "last_modified" indices don't contain directories
2474 	if (inode->IsFile() || inode->IsSymLink()) {
2475 		// if adding to these indices fails, the inode creation will not be
2476 		//  harmed; they are considered less important than the "name" index
2477 		if (inode->IsFile())
2478 			index.InsertSize(transaction, inode);
2479 		index.InsertLastModified(transaction, inode);
2480 	}
2481 
2482 	// Everything worked well until this point, we have a fully
2483 	// initialized inode, and we want to keep it
2484 	allocator.Keep();
2485 
2486 	if (inode->IsFile() || inode->IsAttribute()) {
2487 		inode->SetFileCache(file_cache_create(volume->ID(), inode->ID(),
2488 			inode->Size()));
2489 		inode->SetMap(file_map_create(volume->ID(), inode->ID(),
2490 			inode->Size()));
2491 	}
2492 
2493 	if (_created)
2494 		*_created = true;
2495 	if (_id != NULL)
2496 		*_id = inode->ID();
2497 	if (_inode != NULL)
2498 		*_inode = inode;
2499 
2500 	// if either _id or _inode is passed, we will keep the inode locked
2501 	if (_id == NULL && _inode == NULL)
2502 		put_vnode(volume->FSVolume(), inode->ID());
2503 
2504 	return B_OK;
2505 }
2506 
2507 
2508 //	#pragma mark - AttributeIterator
2509 
2510 
2511 AttributeIterator::AttributeIterator(Inode *inode)
2512 	:
2513 	fCurrentSmallData(0),
2514 	fInode(inode),
2515 	fAttributes(NULL),
2516 	fIterator(NULL),
2517 	fBuffer(NULL)
2518 {
2519 	inode->_AddIterator(this);
2520 }
2521 
2522 
2523 AttributeIterator::~AttributeIterator()
2524 {
2525 	if (fAttributes)
2526 		put_vnode(fAttributes->GetVolume()->FSVolume(), fAttributes->ID());
2527 
2528 	delete fIterator;
2529 	fInode->_RemoveIterator(this);
2530 }
2531 
2532 
2533 status_t
2534 AttributeIterator::Rewind()
2535 {
2536 	fCurrentSmallData = 0;
2537 
2538 	if (fIterator != NULL)
2539 		fIterator->Rewind();
2540 
2541 	return B_OK;
2542 }
2543 
2544 
2545 status_t
2546 AttributeIterator::GetNext(char *name, size_t *_length, uint32 *_type,
2547 	ino_t *_id)
2548 {
2549 	// read attributes out of the small data section
2550 
2551 	if (fCurrentSmallData >= 0) {
2552 		NodeGetter nodeGetter(fInode->GetVolume(), fInode);
2553 		const bfs_inode *node = nodeGetter.Node();
2554 		const small_data *item = ((bfs_inode *)node)->SmallDataStart();
2555 
2556 		fInode->SmallDataLock().Lock();
2557 
2558 		int32 i = 0;
2559 		for (;;item = item->Next()) {
2560 			if (item->IsLast(node))
2561 				break;
2562 
2563 			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2564 				&& *item->Name() == FILE_NAME_NAME)
2565 				continue;
2566 
2567 			if (i++ == fCurrentSmallData)
2568 				break;
2569 		}
2570 
2571 		if (!item->IsLast(node)) {
2572 			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2573 			*_type = item->Type();
2574 			*_length = item->NameSize();
2575 			*_id = (ino_t)fCurrentSmallData;
2576 
2577 			fCurrentSmallData = i;
2578 		}
2579 		else {
2580 			// stop traversing the small_data section
2581 			fCurrentSmallData = -1;
2582 		}
2583 
2584 		fInode->SmallDataLock().Unlock();
2585 
2586 		if (fCurrentSmallData != -1)
2587 			return B_OK;
2588 	}
2589 
2590 	// read attributes out of the attribute directory
2591 
2592 	if (fInode->Attributes().IsZero())
2593 		return B_ENTRY_NOT_FOUND;
2594 
2595 	Volume *volume = fInode->GetVolume();
2596 
2597 	// if you haven't yet access to the attributes directory, get it
2598 	if (fAttributes == NULL) {
2599 		if (get_vnode(volume->FSVolume(), volume->ToVnode(fInode->Attributes()),
2600 				(void **)&fAttributes) != B_OK) {
2601 			FATAL(("get_vnode() failed in AttributeIterator::GetNext(ino_t"
2602 				" = %Ld,name = \"%s\")\n",fInode->ID(),name));
2603 			return B_ENTRY_NOT_FOUND;
2604 		}
2605 
2606 		BPlusTree *tree;
2607 		if (fAttributes->GetTree(&tree) < B_OK
2608 			|| (fIterator = new TreeIterator(tree)) == NULL) {
2609 			FATAL(("could not get tree in AttributeIterator::GetNext(ino_t"
2610 				" = %Ld,name = \"%s\")\n",fInode->ID(),name));
2611 			return B_ENTRY_NOT_FOUND;
2612 		}
2613 	}
2614 
2615 	uint16 length;
2616 	ino_t id;
2617 	status_t status = fIterator->GetNextEntry(name, &length,
2618 		B_FILE_NAME_LENGTH, &id);
2619 	if (status < B_OK)
2620 		return status;
2621 
2622 	Vnode vnode(volume,id);
2623 	Inode *attribute;
2624 	if ((status = vnode.Get(&attribute)) == B_OK) {
2625 		*_type = attribute->Type();
2626 		*_length = length;
2627 		*_id = id;
2628 	}
2629 
2630 	return status;
2631 }
2632 
2633 
2634 void
2635 AttributeIterator::Update(uint16 index, int8 change)
2636 {
2637 	// fCurrentSmallData points already to the next item
2638 	if (index < fCurrentSmallData)
2639 		fCurrentSmallData += change;
2640 }
2641 
2642