xref: /haiku/src/add-ons/kernel/file_systems/bfs/Inode.cpp (revision 80d75f15dfa48ebea421c6b2c19a5296cc63d7eb)
1 /*
2  * Copyright 2001-2007, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 //! inode access functions
7 
8 
9 #include "Debug.h"
10 #include "Inode.h"
11 #include "BPlusTree.h"
12 #include "Index.h"
13 
14 
15 class InodeAllocator {
16 	public:
17 		InodeAllocator(Transaction &transaction);
18 		~InodeAllocator();
19 
20 		status_t New(block_run *parentRun, mode_t mode, block_run &run, Inode **_inode);
21 		status_t CreateTree();
22 		status_t Keep();
23 
24 	private:
25 		Transaction *fTransaction;
26 		block_run fRun;
27 		Inode *fInode;
28 };
29 
30 
31 InodeAllocator::InodeAllocator(Transaction &transaction)
32 	:
33 	fTransaction(&transaction),
34 	fInode(NULL)
35 {
36 }
37 
38 
39 InodeAllocator::~InodeAllocator()
40 {
41 	if (fTransaction != NULL) {
42 		Volume *volume = fTransaction->GetVolume();
43 
44 		if (fInode != NULL) {
45 			fInode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
46 				// this unblocks any pending bfs_read_vnode() calls
47 			fInode->Free(*fTransaction);
48 			remove_vnode(volume->ID(), fInode->ID());
49 		} else
50 			volume->Free(*fTransaction, fRun);
51 	}
52 
53 	delete fInode;
54 }
55 
56 
57 status_t
58 InodeAllocator::New(block_run *parentRun, mode_t mode, block_run &run, Inode **_inode)
59 {
60 	Volume *volume = fTransaction->GetVolume();
61 
62 	status_t status = volume->AllocateForInode(*fTransaction, parentRun, mode, fRun);
63 	if (status < B_OK) {
64 		// don't free the space in the destructor, because
65 		// the allocation failed
66 		fTransaction = NULL;
67 		RETURN_ERROR(status);
68 	}
69 
70 	run = fRun;
71 	fInode = new Inode(volume, *fTransaction, volume->ToVnode(run), mode, run);
72 	if (fInode == NULL)
73 		RETURN_ERROR(B_NO_MEMORY);
74 
75 	if (volume->ID() >= 0) {
76 		status = new_vnode(volume->ID(), fInode->ID(), fInode);
77 		if (status < B_OK) {
78 			delete fInode;
79 			fInode = NULL;
80 			RETURN_ERROR(status);
81 		}
82 	}
83 
84 	*_inode = fInode;
85 	return B_OK;
86 }
87 
88 
89 status_t
90 InodeAllocator::CreateTree()
91 {
92 	Volume *volume = fTransaction->GetVolume();
93 
94 	// force S_STR_INDEX to be set, if no type is set
95 	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
96 		fInode->Node().mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
97 
98 	BPlusTree *tree = fInode->fTree = new BPlusTree(*fTransaction, fInode);
99 	if (tree == NULL || tree->InitCheck() < B_OK)
100 		return B_ERROR;
101 
102 	if (fInode->IsRegularNode()) {
103 		if (tree->Insert(*fTransaction, ".", fInode->ID()) < B_OK
104 			|| tree->Insert(*fTransaction, "..", volume->ToVnode(fInode->Parent())) < B_OK)
105 			return B_ERROR;
106 	}
107 	return B_OK;
108 }
109 
110 
111 status_t
112 InodeAllocator::Keep()
113 {
114 	ASSERT(fInode != NULL && fTransaction != NULL);
115 	Volume *volume = fTransaction->GetVolume();
116 
117 	status_t status = fInode->WriteBack(*fTransaction);
118 	if (status < B_OK) {
119 		FATAL(("writing new inode %Ld failed!\n", fInode->ID()));
120 		return status;
121 	}
122 
123 	if (!fInode->IsSymLink() && volume->ID() >= 0)
124 		status = publish_vnode(volume->ID(), fInode->ID(), fInode);
125 
126 	fTransaction = NULL;
127 	fInode = NULL;
128 
129 	return status;
130 }
131 
132 
133 //	#pragma mark -
134 
135 
136 status_t
137 bfs_inode::InitCheck(Volume *volume)
138 {
139 	if (Magic1() != INODE_MAGIC1
140 		|| !(Flags() & INODE_IN_USE)
141 		|| inode_num.Length() != 1
142 		// matches inode size?
143 		|| (uint32)InodeSize() != volume->InodeSize()
144 		// parent resides on disk?
145 		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
146 		|| parent.AllocationGroup() < 0
147 		|| parent.Start() > (1L << volume->AllocationGroupShift())
148 		|| parent.Length() != 1
149 		// attributes, too?
150 		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
151 		|| attributes.AllocationGroup() < 0
152 		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
153 		RETURN_ERROR(B_BAD_DATA);
154 
155 	if (Flags() & INODE_DELETED)
156 		return B_NOT_ALLOWED;
157 
158 	// ToDo: Add some tests to check the integrity of the other stuff here,
159 	// especially for the data_stream!
160 
161 	return B_OK;
162 }
163 
164 
165 //	#pragma mark - Inode
166 
167 
168 Inode::Inode(Volume *volume, ino_t id)
169 	:
170 	fVolume(volume),
171 	fID(id),
172 	fTree(NULL),
173 	fAttributes(NULL),
174 	fCache(NULL)
175 {
176 	PRINT(("Inode::Inode(volume = %p, id = %Ld) @ %p\n", volume, id, this));
177 
178 	NodeGetter node(volume, this);
179 	memcpy(&fNode, node.Node(), sizeof(bfs_inode));
180 
181 	char lockName[B_OS_NAME_LENGTH];
182 	snprintf(lockName, sizeof(lockName), "bfs inode %d.%d",
183 		(int)BlockRun().AllocationGroup(), BlockRun().Start());
184 	fLock.Initialize(lockName);
185 
186 	// these two will help to maintain the indices
187 	fOldSize = Size();
188 	fOldLastModified = LastModified();
189 
190 	if (IsContainer())
191 		fTree = new BPlusTree(this);
192 	if (IsFile() || IsAttribute())
193 		SetFileCache(file_cache_create(fVolume->ID(), ID(), Size(), fVolume->Device()));
194 }
195 
196 
197 Inode::Inode(Volume *volume, Transaction &transaction, ino_t id, mode_t mode,
198 		block_run &run)
199 	:
200 	fVolume(volume),
201 	fID(id),
202 	fTree(NULL),
203 	fAttributes(NULL),
204 	fCache(NULL)
205 {
206 	PRINT(("Inode::Inode(volume = %p, transaction = %p, id = %Ld) @ %p\n",
207 		volume, &transaction, id, this));
208 
209 	char lockName[B_OS_NAME_LENGTH];
210 	snprintf(lockName, sizeof(lockName), "bfs inode+%d.%d",
211 		(int)run.AllocationGroup(), run.Start());
212 	fLock.Initialize(lockName);
213 
214 	NodeGetter node(volume, transaction, this, true);
215 	memset(&fNode, 0, sizeof(bfs_inode));
216 
217 	// Initialize the bfs_inode structure -- it's not written back to disk
218 	// here, because it will be done so already when the inode could be
219 	// created completely.
220 
221 	Node().magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
222 	Node().inode_num = run;
223 	Node().mode = HOST_ENDIAN_TO_BFS_INT32(mode);
224 	Node().flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
225 
226 	Node().create_time = HOST_ENDIAN_TO_BFS_INT64((bigtime_t)time(NULL) << INODE_TIME_SHIFT);
227 	Node().last_modified_time = HOST_ENDIAN_TO_BFS_INT64(Node().create_time
228 		| (volume->GetUniqueID() & INODE_TIME_MASK));
229 		// we use Volume::GetUniqueID() to avoid having too many duplicates in the
230 		// last_modified index
231 
232 	Node().inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
233 
234 	// these two will help to maintain the indices
235 	fOldSize = Size();
236 	fOldLastModified = LastModified();
237 }
238 
239 
240 Inode::~Inode()
241 {
242 	PRINT(("Inode::~Inode() @ %p\n", this));
243 
244 	file_cache_delete(FileCache());
245 	delete fTree;
246 }
247 
248 
249 status_t
250 Inode::InitCheck(bool checkNode)
251 {
252 	// test inode magic and flags
253 	if (checkNode) {
254 		status_t status = Node().InitCheck(fVolume);
255 		if (status == B_BUSY)
256 			return B_BUSY;
257 
258 		if (status < B_OK) {
259 			FATAL(("inode at block %Ld corrupt!\n", BlockNumber()));
260 			RETURN_ERROR(B_BAD_DATA);
261 		}
262 	}
263 
264 	if (IsContainer()) {
265 		// inodes that have a
266 		if (fTree == NULL)
267 			RETURN_ERROR(B_NO_MEMORY);
268 
269 		status_t status = fTree->InitCheck();
270 		if (status < B_OK) {
271 			FATAL(("inode tree at block %Ld corrupt!\n", BlockNumber()));
272 			RETURN_ERROR(B_BAD_DATA);
273 		}
274 	}
275 
276 	// it's more important to know that the inode is corrupt
277 	// so we check for the lock not until here
278 	return fLock.InitCheck();
279 }
280 
281 
282 status_t
283 Inode::WriteBack(Transaction &transaction)
284 {
285 	NodeGetter node(fVolume, transaction, this);
286 	if (node.WritableNode() == NULL)
287 		return B_IO_ERROR;
288 
289 	memcpy(node.WritableNode(), &Node(), sizeof(bfs_inode));
290 	return B_OK;
291 }
292 
293 
294 status_t
295 Inode::CheckPermissions(int accessMode) const
296 {
297 	uid_t user = geteuid();
298 	gid_t group = getegid();
299 
300 	// you never have write access to a read-only volume
301 	if (accessMode & W_OK && fVolume->IsReadOnly())
302 		return B_READ_ONLY_DEVICE;
303 
304 	// root users always have full access (but they can't execute anything)
305 	if (user == 0 && !((accessMode & X_OK) && (Mode() & S_IXUSR) == 0))
306 		return B_OK;
307 
308 	// shift mode bits, to check directly against accessMode
309 	mode_t mode = Mode();
310 	if (user == (uid_t)fNode.UserID())
311 		mode >>= 6;
312 	else if (group == (gid_t)fNode.GroupID())
313 		mode >>= 3;
314 
315 	if (accessMode & ~(mode & S_IRWXO))
316 		return B_NOT_ALLOWED;
317 
318 	return B_OK;
319 }
320 
321 
322 //	#pragma mark - attributes
323 
324 
325 void
326 Inode::_AddIterator(AttributeIterator *iterator)
327 {
328 	if (fSmallDataLock.Lock() < B_OK)
329 		return;
330 
331 	fIterators.Add(iterator);
332 
333 	fSmallDataLock.Unlock();
334 }
335 
336 
337 void
338 Inode::_RemoveIterator(AttributeIterator *iterator)
339 {
340 	if (fSmallDataLock.Lock() < B_OK)
341 		return;
342 
343 	fIterators.Remove(iterator);
344 
345 	fSmallDataLock.Unlock();
346 }
347 
348 
349 /*!
350 	Tries to free up "bytes" space in the small_data section by moving
351 	attributes to real files. Used for system attributes like the name.
352 	You need to hold the fSmallDataLock when you call this method
353 */
354 status_t
355 Inode::_MakeSpaceForSmallData(Transaction &transaction, bfs_inode *node,
356 	const char *name, int32 bytes)
357 {
358 	ASSERT(fSmallDataLock.IsLocked());
359 
360 	while (bytes > 0) {
361 		small_data *item = node->SmallDataStart(), *max = NULL;
362 		int32 index = 0, maxIndex = 0;
363 		for (; !item->IsLast(node); item = item->Next(), index++) {
364 			// should not remove those
365 			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
366 				continue;
367 
368 			if (max == NULL || max->Size() < item->Size()) {
369 				maxIndex = index;
370 				max = item;
371 			}
372 
373 			// remove the first one large enough to free the needed amount of bytes
374 			if (bytes < (int32)item->Size())
375 				break;
376 		}
377 
378 		if (item->IsLast(node) || (int32)item->Size() < bytes)
379 			return B_ERROR;
380 
381 		bytes -= max->Size();
382 
383 		// Move the attribute to a real attribute file
384 		// Luckily, this doesn't cause any index updates
385 
386 		Inode *attribute;
387 		status_t status = CreateAttribute(transaction, item->Name(), item->Type(),
388 			&attribute);
389 		if (status < B_OK)
390 			RETURN_ERROR(status);
391 
392 		size_t length = item->DataSize();
393 		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
394 
395 		ReleaseAttribute(attribute);
396 
397 		if (status < B_OK) {
398 			Vnode vnode(fVolume, Attributes());
399 			Inode *attributes;
400 			if (vnode.Get(&attributes) < B_OK
401 				|| attributes->Remove(transaction, name) < B_OK) {
402 				FATAL(("Could not remove newly created attribute!\n"));
403 			}
404 
405 			RETURN_ERROR(status);
406 		}
407 
408 		_RemoveSmallData(node, max, maxIndex);
409 	}
410 	return B_OK;
411 }
412 
413 
414 /*!
415 	Private function which removes the given attribute from the small_data
416 	section.
417 	You need to hold the fSmallDataLock when you call this method
418 */
419 status_t
420 Inode::_RemoveSmallData(bfs_inode *node, small_data *item, int32 index)
421 {
422 	ASSERT(fSmallDataLock.IsLocked());
423 
424 	small_data *next = item->Next();
425 	if (!next->IsLast(node)) {
426 		// find the last attribute
427 		small_data *last = next;
428 		while (!last->IsLast(node))
429 			last = last->Next();
430 
431 		int32 size = (uint8 *)last - (uint8 *)next;
432 		if (size < 0 || size > (uint8 *)node + fVolume->BlockSize() - (uint8 *)next)
433 			return B_BAD_DATA;
434 
435 		memmove(item, next, size);
436 
437 		// Move the "last" one to its new location and
438 		// correctly terminate the small_data section
439 		last = (small_data *)((uint8 *)last - ((uint8 *)next - (uint8 *)item));
440 		memset(last, 0, (uint8 *)node + fVolume->BlockSize() - (uint8 *)last);
441 	} else
442 		memset(item, 0, item->Size());
443 
444 	// update all current iterators
445 	AttributeIterator *iterator = NULL;
446 	while ((iterator = fIterators.Next(iterator)) != NULL) {
447 		iterator->Update(index, -1);
448 	}
449 
450 	return B_OK;
451 }
452 
453 
454 //!	Removes the given attribute from the small_data section.
455 status_t
456 Inode::_RemoveSmallData(Transaction &transaction, NodeGetter &nodeGetter,
457 	const char *name)
458 {
459 	if (name == NULL)
460 		return B_BAD_VALUE;
461 
462 	bfs_inode *node = nodeGetter.WritableNode();
463 	SimpleLocker locker(fSmallDataLock);
464 
465 	// search for the small_data item
466 
467 	small_data *item = node->SmallDataStart();
468 	int32 index = 0;
469 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
470 		item = item->Next();
471 		index++;
472 	}
473 
474 	if (item->IsLast(node))
475 		return B_ENTRY_NOT_FOUND;
476 
477 	nodeGetter.MakeWritable(transaction);
478 	status_t status = _RemoveSmallData(node, item, index);
479 	if (status == B_OK)
480 		status = WriteBack(transaction);
481 
482 	return status;
483 }
484 
485 
486 /*!
487 	Try to place the given attribute in the small_data section - if the
488 	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
489 	In that case, the attribute should be written to a real attribute file;
490 	it's the caller's responsibility to remove any existing attributes in the small
491 	data section if that's the case.
492 
493 	Note that you need to write back the inode yourself after having called that
494 	method - it's a bad API decision that it needs a transaction but enforces you
495 	to write back the inode all by yourself, but it's just more efficient in most
496 	cases...
497 */
498 status_t
499 Inode::_AddSmallData(Transaction &transaction, NodeGetter &nodeGetter,
500 	const char *name, uint32 type, const uint8 *data, size_t length, bool force)
501 {
502 	// TODO: support new write attr semantics and write offset!
503 	bfs_inode *node = nodeGetter.WritableNode();
504 
505 	if (node == NULL || name == NULL || data == NULL)
506 		return B_BAD_VALUE;
507 
508 	// reject any requests that can't fit into the small_data section
509 	uint32 nameLength = strlen(name);
510 	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + length + 1;
511 	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
512 		return B_DEVICE_FULL;
513 
514 	nodeGetter.MakeWritable(transaction);
515 	SimpleLocker locker(fSmallDataLock);
516 
517 	// Find the last item or one with the same name we have to add
518 	small_data *item = node->SmallDataStart();
519 	int32 index = 0;
520 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
521 		item = item->Next();
522 		index++;
523 	}
524 
525 	// is the attribute already in the small_data section?
526 	// then just replace the data part of that one
527 	if (!item->IsLast(node)) {
528 		// find last attribute
529 		small_data *last = item;
530 		while (!last->IsLast(node))
531 			last = last->Next();
532 
533 		// try to change the attributes value
534 		if (item->data_size > length
535 			|| force
536 			|| ((uint8 *)last + length - item->DataSize()) <= ((uint8 *)node + fVolume->InodeSize())) {
537 			// make room for the new attribute if needed (and we are forced to do so)
538 			if (force
539 				&& ((uint8 *)last + length - item->DataSize()) > ((uint8 *)node + fVolume->InodeSize())) {
540 				// We also take the free space at the end of the small_data section
541 				// into account, and request only what's really needed
542 				uint32 needed = length - item->DataSize() -
543 						(uint32)((uint8 *)node + fVolume->InodeSize() - (uint8 *)last);
544 
545 				if (_MakeSpaceForSmallData(transaction, node, name, needed) < B_OK)
546 					return B_ERROR;
547 
548 				// reset our pointers
549 				item = node->SmallDataStart();
550 				index = 0;
551 				while (!item->IsLast(node) && strcmp(item->Name(), name)) {
552 					item = item->Next();
553 					index++;
554 				}
555 
556 				last = item;
557 				while (!last->IsLast(node))
558 					last = last->Next();
559 			}
560 
561 			// Normally, we can just overwrite the attribute data as the size
562 			// is specified by the type and does not change that often
563 			if (length != item->DataSize()) {
564 				// move the attributes after the current one
565 				small_data *next = item->Next();
566 				if (!next->IsLast(node))
567 					memmove((uint8 *)item + spaceNeeded, next, (uint8 *)last - (uint8 *)next);
568 
569 				// Move the "last" one to its new location and
570 				// correctly terminate the small_data section
571 				last = (small_data *)((uint8 *)last - ((uint8 *)next - ((uint8 *)item + spaceNeeded)));
572 				if ((uint8 *)last < (uint8 *)node + fVolume->BlockSize())
573 					memset(last, 0, (uint8 *)node + fVolume->BlockSize() - (uint8 *)last);
574 
575 				item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
576 			}
577 
578 			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
579 			memcpy(item->Data(), data, length);
580 			item->Data()[length] = '\0';
581 
582 			return B_OK;
583 		}
584 
585 		return B_DEVICE_FULL;
586 	}
587 
588 	// try to add the new attribute!
589 
590 	if ((uint8 *)item + spaceNeeded > (uint8 *)node + fVolume->InodeSize()) {
591 		// there is not enough space for it!
592 		if (!force)
593 			return B_DEVICE_FULL;
594 
595 		// make room for the new attribute
596 		if (_MakeSpaceForSmallData(transaction, node, name, spaceNeeded) < B_OK)
597 			return B_ERROR;
598 
599 		// get new last item!
600 		item = node->SmallDataStart();
601 		index = 0;
602 		while (!item->IsLast(node)) {
603 			item = item->Next();
604 			index++;
605 		}
606 	}
607 
608 	memset(item, 0, spaceNeeded);
609 	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
610 	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
611 	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
612 	strcpy(item->Name(), name);
613 	memcpy(item->Data(), data, length);
614 
615 	// correctly terminate the small_data section
616 	item = item->Next();
617 	if (!item->IsLast(node))
618 		memset(item, 0, (uint8 *)node + fVolume->InodeSize() - (uint8 *)item);
619 
620 	// update all current iterators
621 	AttributeIterator *iterator = NULL;
622 	while ((iterator = fIterators.Next(iterator)) != NULL) {
623 		iterator->Update(index, 1);
624 	}
625 
626 	return B_OK;
627 }
628 
629 
630 /*!
631 	Iterates through the small_data section of an inode.
632 	To start at the beginning of this section, you let smallData
633 	point to NULL, like:
634 		small_data *data = NULL;
635 		while (inode->GetNextSmallData(&data) { ... }
636 
637 	This function is reentrant and doesn't allocate any memory;
638 	you can safely stop calling it at any point (you don't need
639 	to iterate through the whole list).
640 	You need to hold the fSmallDataLock when you call this method
641 */
642 status_t
643 Inode::_GetNextSmallData(bfs_inode *node, small_data **_smallData) const
644 {
645 	if (node == NULL)
646 		RETURN_ERROR(B_BAD_VALUE);
647 
648 	ASSERT(fSmallDataLock.IsLocked());
649 
650 	small_data *data = *_smallData;
651 
652 	// begin from the start?
653 	if (data == NULL)
654 		data = node->SmallDataStart();
655 	else
656 		data = data->Next();
657 
658 	// is already last item?
659 	if (data->IsLast(node))
660 		return B_ENTRY_NOT_FOUND;
661 
662 	*_smallData = data;
663 
664 	return B_OK;
665 }
666 
667 
668 /*!
669 	Finds the attribute "name" in the small data section, and
670 	returns a pointer to it (or NULL if it doesn't exist).
671 	You need to hold the fSmallDataLock when you call this method
672 */
673 small_data *
674 Inode::FindSmallData(const bfs_inode *node, const char *name) const
675 {
676 	ASSERT(fSmallDataLock.IsLocked());
677 
678 	small_data *smallData = NULL;
679 	while (_GetNextSmallData(const_cast<bfs_inode *>(node), &smallData) == B_OK) {
680 		if (!strcmp(smallData->Name(), name))
681 			return smallData;
682 	}
683 	return NULL;
684 }
685 
686 
687 /*!
688 	Returns a pointer to the node's name if present in the small data
689 	section, NULL otherwise.
690 	You need to hold the fSmallDataLock when you call this method
691 */
692 const char *
693 Inode::Name(const bfs_inode *node) const
694 {
695 	ASSERT(fSmallDataLock.IsLocked());
696 
697 	small_data *smallData = NULL;
698 	while (_GetNextSmallData((bfs_inode *)node, &smallData) == B_OK) {
699 		if (*smallData->Name() == FILE_NAME_NAME
700 			&& smallData->NameSize() == FILE_NAME_NAME_LENGTH)
701 			return (const char *)smallData->Data();
702 	}
703 	return NULL;
704 }
705 
706 
707 /*!
708 	Copies the node's name into the provided buffer.
709 	The buffer should be B_FILE_NAME_LENGTH bytes large.
710 */
711 status_t
712 Inode::GetName(char *buffer, size_t size) const
713 {
714 	NodeGetter node(fVolume, this);
715 	SimpleLocker locker(fSmallDataLock);
716 
717 	const char *name = Name(node.Node());
718 	if (name == NULL)
719 		return B_ENTRY_NOT_FOUND;
720 
721 	strlcpy(buffer, name, size);
722 	return B_OK;
723 }
724 
725 
726 /*!
727 	Changes or set the name of a file: in the inode small_data section only, it
728 	doesn't change it in the parent directory's b+tree.
729 	Note that you need to write back the inode yourself after having called
730 	that method. It suffers from the same API decision as AddSmallData() does
731 	(and for the same reason).
732 */
733 status_t
734 Inode::SetName(Transaction &transaction, const char *name)
735 {
736 	if (name == NULL || *name == '\0')
737 		return B_BAD_VALUE;
738 
739 	NodeGetter node(fVolume, transaction, this);
740 	const char nameTag[2] = {FILE_NAME_NAME, 0};
741 
742 	return _AddSmallData(transaction, node, nameTag, FILE_NAME_TYPE,
743 		(uint8 *)name, strlen(name), true);
744 }
745 
746 
747 status_t
748 Inode::_RemoveAttribute(Transaction &transaction, const char *name,
749 	bool hasIndex, Index *index)
750 {
751 	// remove the attribute file if it exists
752 	Vnode vnode(fVolume, Attributes());
753 	Inode *attributes;
754 	status_t status = vnode.Get(&attributes);
755 	if (status < B_OK)
756 		return status;
757 
758 	// update index
759 	if (index != NULL) {
760 		Inode *attribute;
761 		if ((hasIndex || fVolume->CheckForLiveQuery(name))
762 			&& GetAttribute(name, &attribute) == B_OK) {
763 			uint8 data[BPLUSTREE_MAX_KEY_LENGTH];
764 			size_t length = BPLUSTREE_MAX_KEY_LENGTH;
765 			if (attribute->ReadAt(0, data, &length) == B_OK) {
766 				index->Update(transaction, name, attribute->Type(), data,
767 					length, NULL, 0, this);
768 			}
769 
770 			ReleaseAttribute(attribute);
771 		}
772 	}
773 
774 	if ((status = attributes->Remove(transaction, name)) < B_OK)
775 		return status;
776 
777 	if (attributes->IsEmpty()) {
778 		// remove attribute directory (don't fail if that can't be done)
779 		if (remove_vnode(fVolume->ID(), attributes->ID()) == B_OK) {
780 			// update the inode, so that no one will ever doubt it's deleted :-)
781 			attributes->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
782 			if (attributes->WriteBack(transaction) == B_OK) {
783 				Attributes().SetTo(0, 0, 0);
784 				WriteBack(transaction);
785 			} else
786 				unremove_vnode(fVolume->ID(), attributes->ID());
787 		}
788 	}
789 
790 	return status;
791 }
792 
793 
794 /*!
795 	Reads data from the specified attribute.
796 	This is a high-level attribute function that understands attributes
797 	in the small_data section as well as real attribute files.
798 */
799 status_t
800 Inode::ReadAttribute(const char *name, int32 type, off_t pos, uint8 *buffer,
801 	size_t *_length)
802 {
803 	if (pos < 0)
804 		pos = 0;
805 
806 	// search in the small_data section (which has to be locked first)
807 	{
808 		NodeGetter node(fVolume, this);
809 		SimpleLocker locker(fSmallDataLock);
810 
811 		small_data *smallData = FindSmallData(node.Node(), name);
812 		if (smallData != NULL) {
813 			size_t length = *_length;
814 			if (pos >= smallData->data_size) {
815 				*_length = 0;
816 				return B_OK;
817 			}
818 			if (length + pos > smallData->DataSize())
819 				length = smallData->DataSize() - pos;
820 
821 			memcpy(buffer, smallData->Data() + pos, length);
822 			*_length = length;
823 			return B_OK;
824 		}
825 	}
826 
827 	// search in the attribute directory
828 	Inode *attribute;
829 	status_t status = GetAttribute(name, &attribute);
830 	if (status == B_OK) {
831 		if (attribute->Lock().Lock() == B_OK) {
832 			status = attribute->ReadAt(pos, (uint8 *)buffer, _length);
833 			attribute->Lock().Unlock();
834 		} else
835 			status = B_ERROR;
836 
837 		ReleaseAttribute(attribute);
838 	}
839 
840 	RETURN_ERROR(status);
841 }
842 
843 
844 /*!
845 	Writes data to the specified attribute.
846 	This is a high-level attribute function that understands attributes
847 	in the small_data section as well as real attribute files.
848 */
849 status_t
850 Inode::WriteAttribute(Transaction &transaction, const char *name, int32 type,
851 	off_t pos, const uint8 *buffer, size_t *_length)
852 {
853 	// needed to maintain the index
854 	uint8 oldBuffer[BPLUSTREE_MAX_KEY_LENGTH], *oldData = NULL;
855 	size_t oldLength = 0;
856 
857 	// TODO: we actually depend on that the contents of "buffer" are constant.
858 	// If they get changed during the write (hey, user programs), we may mess
859 	// up our index trees!
860 
861 	Index index(fVolume);
862 	index.SetTo(name);
863 
864 	Inode *attribute = NULL;
865 	status_t status = B_OK;
866 
867 	if (GetAttribute(name, &attribute) < B_OK) {
868 		// save the old attribute data
869 		NodeGetter node(fVolume, transaction, this);
870 		fSmallDataLock.Lock();
871 
872 		small_data *smallData = FindSmallData(node.Node(), name);
873 		if (smallData != NULL) {
874 			oldLength = smallData->DataSize();
875 			if (oldLength > 0) {
876 				if (oldLength > BPLUSTREE_MAX_KEY_LENGTH)
877 					oldLength = BPLUSTREE_MAX_KEY_LENGTH;
878 				memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
879 			}
880 		}
881 		fSmallDataLock.Unlock();
882 
883 		// if the attribute doesn't exist yet (as a file), try to put it in the
884 		// small_data section first - if that fails (due to insufficent space),
885 		// create a real attribute file
886 		status = _AddSmallData(transaction, node, name, type, buffer, *_length);
887 		if (status == B_DEVICE_FULL) {
888 			if (smallData != NULL) {
889 				// remove the old attribute from the small data section - there
890 				// is no space left for the new data
891 				status = _RemoveSmallData(transaction, node, name);
892 			} else
893 				status = B_OK;
894 
895 			if (status == B_OK)
896 				status = CreateAttribute(transaction, name, type, &attribute);
897 			if (status < B_OK)
898 				RETURN_ERROR(status);
899 		} else if (status == B_OK)
900 			status = WriteBack(transaction);
901 	}
902 
903 	if (attribute != NULL) {
904 		if (attribute->Lock().LockWrite() == B_OK) {
905 			// save the old attribute data (if this fails, oldLength will reflect it)
906 			if (fVolume->CheckForLiveQuery(name) && attribute->Size() > 0) {
907 				oldLength = BPLUSTREE_MAX_KEY_LENGTH;
908 				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
909 					oldData = oldBuffer;
910 			}
911 
912 			// check if the data fits into the small_data section again
913 			NodeGetter node(fVolume, transaction, this);
914 			status = _AddSmallData(transaction, node, name, type, buffer, *_length);
915 			if (status == B_OK) {
916 				// it does - remove its file
917 				status = _RemoveAttribute(transaction, name, false, NULL);
918 			} else {
919 				status = attribute->WriteAt(transaction, pos, buffer, _length);
920 				if (status == B_OK) {
921 					// The attribute type might have been changed - we need to adopt
922 					// the new one - the attribute's inode will be written back on close
923 					attribute->Node().type = HOST_ENDIAN_TO_BFS_INT32(type);
924 				}
925 
926 				attribute->Lock().UnlockWrite();
927 			}
928 		} else
929 			status = B_ERROR;
930 
931 		ReleaseAttribute(attribute);
932 	}
933 
934 	// TODO: find a better way than this "pos" thing (the begin of the old key
935 	//	must be copied to the start of the new one for a comparison)
936 	if (status == B_OK && pos == 0) {
937 		// index only the first BPLUSTREE_MAX_KEY_LENGTH bytes
938 		uint16 length = *_length;
939 		if (length > BPLUSTREE_MAX_KEY_LENGTH)
940 			length = BPLUSTREE_MAX_KEY_LENGTH;
941 
942 		// Update index. Note, Index::Update() may be called even if initializing
943 		// the index failed - it will just update the live queries in this case
944 		if (pos < length || pos < oldLength) {
945 			index.Update(transaction, name, type, oldData, oldLength, buffer,
946 				length, this);
947 		}
948 	}
949 	return status;
950 }
951 
952 
953 /*!
954 	Removes the specified attribute from the inode.
955 	This is a high-level attribute function that understands attributes
956 	in the small_data section as well as real attribute files.
957 */
958 status_t
959 Inode::RemoveAttribute(Transaction &transaction, const char *name)
960 {
961 	Index index(fVolume);
962 	bool hasIndex = index.SetTo(name) == B_OK;
963 	NodeGetter node(fVolume, this);
964 
965 	// update index for attributes in the small_data section
966 	{
967 		fSmallDataLock.Lock();
968 
969 		small_data *smallData = FindSmallData(node.Node(), name);
970 		if (smallData != NULL) {
971 			uint32 length = smallData->DataSize();
972 			if (length > BPLUSTREE_MAX_KEY_LENGTH)
973 				length = BPLUSTREE_MAX_KEY_LENGTH;
974 			index.Update(transaction, name, smallData->Type(), smallData->Data(),
975 				length, NULL, 0, this);
976 		}
977 		fSmallDataLock.Unlock();
978 	}
979 
980 	status_t status = _RemoveSmallData(transaction, node, name);
981 	if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
982 		// remove the attribute file if it exists
983 		status = _RemoveAttribute(transaction, name, hasIndex, &index);
984 	}
985 	return status;
986 }
987 
988 
989 status_t
990 Inode::GetAttribute(const char *name, Inode **_attribute)
991 {
992 	// does this inode even have attributes?
993 	if (Attributes().IsZero())
994 		return B_ENTRY_NOT_FOUND;
995 
996 	Vnode vnode(fVolume, Attributes());
997 	Inode *attributes;
998 	if (vnode.Get(&attributes) < B_OK) {
999 		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n", name));
1000 		return B_ERROR;
1001 	}
1002 
1003 	BPlusTree *tree;
1004 	status_t status = attributes->GetTree(&tree);
1005 	if (status == B_OK) {
1006 		ino_t id;
1007 		status = tree->Find((uint8 *)name, (uint16)strlen(name), &id);
1008 		if (status == B_OK) {
1009 			Vnode vnode(fVolume, id);
1010 			Inode *inode;
1011 			// Check if the attribute is really an attribute
1012 			if (vnode.Get(&inode) < B_OK || !inode->IsAttribute())
1013 				return B_ERROR;
1014 
1015 			*_attribute = inode;
1016 			vnode.Keep();
1017 			return B_OK;
1018 		}
1019 	}
1020 	return status;
1021 }
1022 
1023 
1024 void
1025 Inode::ReleaseAttribute(Inode *attribute)
1026 {
1027 	if (attribute == NULL)
1028 		return;
1029 
1030 	put_vnode(fVolume->ID(), attribute->ID());
1031 }
1032 
1033 
1034 status_t
1035 Inode::CreateAttribute(Transaction &transaction, const char *name, uint32 type,
1036 	Inode **attribute)
1037 {
1038 	// do we need to create the attribute directory first?
1039 	if (Attributes().IsZero()) {
1040 		status_t status = Inode::Create(transaction, this, NULL,
1041 			S_ATTR_DIR | S_DIRECTORY | 0666, 0, 0, NULL);
1042 		if (status < B_OK)
1043 			RETURN_ERROR(status);
1044 	}
1045 	Vnode vnode(fVolume, Attributes());
1046 	Inode *attributes;
1047 	if (vnode.Get(&attributes) < B_OK)
1048 		return B_ERROR;
1049 
1050 	// Inode::Create() locks the inode for us
1051 	return Inode::Create(transaction, attributes, name,
1052 		S_ATTR | S_FILE | 0666, 0, type, NULL, NULL, attribute);
1053 }
1054 
1055 
1056 //	#pragma mark - directory tree
1057 
1058 
1059 /*!
1060 	Gives the caller direct access to the b+tree for a given directory.
1061 	The tree is no longer created on demand, but when the inode is first
1062 	created. That will report any potential errors upfront, saves locking,
1063 	and should work as good (though a bit slower).
1064 */
1065 status_t
1066 Inode::GetTree(BPlusTree **tree)
1067 {
1068 	if (fTree) {
1069 		*tree = fTree;
1070 		return B_OK;
1071 	}
1072 
1073 	RETURN_ERROR(B_BAD_VALUE);
1074 }
1075 
1076 
1077 bool
1078 Inode::IsEmpty()
1079 {
1080 	BPlusTree *tree;
1081 	status_t status = GetTree(&tree);
1082 	if (status < B_OK)
1083 		return status;
1084 
1085 	TreeIterator iterator(tree);
1086 
1087 	// index and attribute directories are really empty when they are
1088 	// empty - directories for standard files always contain ".", and
1089 	// "..", so we need to ignore those two
1090 
1091 	uint32 count = 0;
1092 	char name[BPLUSTREE_MAX_KEY_LENGTH];
1093 	uint16 length;
1094 	ino_t id;
1095 	while (iterator.GetNextEntry(name, &length, B_FILE_NAME_LENGTH,
1096 			&id) == B_OK) {
1097 		if (Mode() & (S_ATTR_DIR | S_INDEX_DIR))
1098 			return false;
1099 
1100 		if (++count > 2 || strcmp(".", name) && strcmp("..", name))
1101 			return false;
1102 	}
1103 	return true;
1104 }
1105 
1106 
1107 //	#pragma mark - data stream
1108 
1109 
1110 /*!
1111 	Finds the block_run where "pos" is located in the data_stream of
1112 	the inode.
1113 	If successful, "offset" will then be set to the file offset
1114 	of the block_run returned; so "pos - offset" is for the block_run
1115 	what "pos" is for the whole stream.
1116 	The caller has to make sure that "pos" is inside the stream.
1117 */
1118 status_t
1119 Inode::FindBlockRun(off_t pos, block_run &run, off_t &offset)
1120 {
1121 	data_stream *data = &Node().data;
1122 
1123 	// find matching block run
1124 
1125 	if (data->MaxDirectRange() > 0 && pos >= data->MaxDirectRange()) {
1126 		if (data->MaxDoubleIndirectRange() > 0 && pos >= data->MaxIndirectRange()) {
1127 			// access to double indirect blocks
1128 
1129 			CachedBlock cached(fVolume);
1130 
1131 			off_t start = pos - data->MaxIndirectRange();
1132 			int32 indirectSize = (1L << (INDIRECT_BLOCKS_SHIFT
1133 					+ cached.BlockShift()))
1134 				* (fVolume->BlockSize() / sizeof(block_run));
1135 			int32 directSize = NUM_ARRAY_BLOCKS << cached.BlockShift();
1136 			int32 index = start / indirectSize;
1137 			int32 runsPerBlock = cached.BlockSize() / sizeof(block_run);
1138 
1139 			block_run *indirect = (block_run *)cached.SetTo(
1140 					fVolume->ToBlock(data->double_indirect) + index / runsPerBlock);
1141 			if (indirect == NULL)
1142 				RETURN_ERROR(B_ERROR);
1143 
1144 			//printf("\tstart = %Ld, indirectSize = %ld, directSize = %ld, index = %ld\n",start,indirectSize,directSize,index);
1145 			//printf("\tlook for indirect block at %ld,%d\n",indirect[index].allocation_group,indirect[index].start);
1146 
1147 			int32 current = (start % indirectSize) / directSize;
1148 
1149 			indirect = (block_run *)cached.SetTo(
1150 				fVolume->ToBlock(indirect[index % runsPerBlock])
1151 				+ current / runsPerBlock);
1152 			if (indirect == NULL)
1153 				RETURN_ERROR(B_ERROR);
1154 
1155 			run = indirect[current % runsPerBlock];
1156 			offset = data->MaxIndirectRange() + (index * indirectSize)
1157 				+ (current * directSize);
1158 			//printf("\tfCurrent = %ld, fRunFileOffset = %Ld, fRunBlockEnd = %Ld, fRun = %ld,%d\n",fCurrent,fRunFileOffset,fRunBlockEnd,fRun.allocation_group,fRun.start);
1159 		} else {
1160 			// access to indirect blocks
1161 
1162 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1163 			off_t runBlockEnd = data->MaxDirectRange();
1164 
1165 			CachedBlock cached(fVolume);
1166 			off_t block = fVolume->ToBlock(data->indirect);
1167 
1168 			for (int32 i = 0; i < data->indirect.Length(); i++) {
1169 				block_run *indirect = (block_run *)cached.SetTo(block + i);
1170 				if (indirect == NULL)
1171 					RETURN_ERROR(B_IO_ERROR);
1172 
1173 				int32 current = -1;
1174 				while (++current < runsPerBlock) {
1175 					if (indirect[current].IsZero())
1176 						break;
1177 
1178 					runBlockEnd += indirect[current].Length()
1179 						<< cached.BlockShift();
1180 					if (runBlockEnd > pos) {
1181 						run = indirect[current];
1182 						offset = runBlockEnd - (run.Length()
1183 							<< cached.BlockShift());
1184 						//printf("reading from indirect block: %ld,%d\n",fRun.allocation_group,fRun.start);
1185 						//printf("### indirect-run[%ld] = (%ld,%d,%d), offset = %Ld\n",fCurrent,fRun.allocation_group,fRun.start,fRun.Length(),fRunFileOffset);
1186 						return fVolume->ValidateBlockRun(run);
1187 					}
1188 				}
1189 			}
1190 			RETURN_ERROR(B_ERROR);
1191 		}
1192 	} else {
1193 		// access from direct blocks
1194 
1195 		off_t runBlockEnd = 0LL;
1196 		int32 current = -1;
1197 
1198 		while (++current < NUM_DIRECT_BLOCKS) {
1199 			if (data->direct[current].IsZero())
1200 				break;
1201 
1202 			runBlockEnd += data->direct[current].Length()
1203 				<< fVolume->BlockShift();
1204 			if (runBlockEnd > pos) {
1205 				run = data->direct[current];
1206 				offset = runBlockEnd - (run.Length() << fVolume->BlockShift());
1207 				//printf("### run[%ld] = (%ld,%d,%d), offset = %Ld\n",fCurrent,fRun.allocation_group,fRun.start,fRun.Length(),fRunFileOffset);
1208 				return fVolume->ValidateBlockRun(run);
1209 			}
1210 		}
1211 
1212 		//PRINT(("FindBlockRun() failed in direct range: size = %Ld, pos = %Ld\n",data->size,pos));
1213 		return B_ENTRY_NOT_FOUND;
1214 	}
1215 	return fVolume->ValidateBlockRun(run);
1216 }
1217 
1218 
1219 status_t
1220 Inode::ReadAt(off_t pos, uint8 *buffer, size_t *_length)
1221 {
1222 #if 0
1223 	// call the right ReadAt() method, depending on the inode flags
1224 
1225 	if (Flags() & INODE_LOGGED)
1226 		return ((Stream<Access::Logged> *)this)->ReadAt(pos, buffer, _length);
1227 
1228 	return ((Stream<Access::Cached> *)this)->ReadAt(pos, buffer, _length);
1229 #endif
1230 	size_t length = *_length;
1231 
1232 	// set/check boundaries for pos/length
1233 	if (pos < 0)
1234 		return B_BAD_VALUE;
1235 
1236 	if (pos >= Size() || length == 0) {
1237 		*_length = 0;
1238 		return B_NO_ERROR;
1239 	}
1240 
1241 	return file_cache_read(FileCache(), pos, buffer, _length);
1242 }
1243 
1244 
1245 status_t
1246 Inode::WriteAt(Transaction &transaction, off_t pos, const uint8 *buffer,
1247 	size_t *_length)
1248 {
1249 	// update the last modification time in memory, it will be written
1250 	// back to the inode, and the index when the file is closed
1251 	// ToDo: should update the internal last modified time only at this point!
1252 	Node().last_modified_time = HOST_ENDIAN_TO_BFS_INT64((bigtime_t)time(NULL)
1253 		<< INODE_TIME_SHIFT);
1254 
1255 	// TODO: support INODE_LOGGED!
1256 #if 0
1257 	if (Flags() & INODE_LOGGED)
1258 		return ((Stream<Access::Logged> *)this)->WriteAt(transaction, pos, buffer, _length);
1259 
1260 	return ((Stream<Access::Cached> *)this)->WriteAt(transaction, pos, buffer, _length);
1261 #endif
1262 	size_t length = *_length;
1263 
1264 	// set/check boundaries for pos/length
1265 	if (pos < 0)
1266 		return B_BAD_VALUE;
1267 
1268 	if (pos + length > Size()) {
1269 		off_t oldSize = Size();
1270 
1271 		// the transaction doesn't have to be started already
1272 		// ToDo: what's that INODE_NO_TRANSACTION flag good for again?
1273 		if ((Flags() & INODE_NO_TRANSACTION) == 0
1274 			&& !transaction.IsStarted())
1275 			transaction.Start(fVolume, BlockNumber());
1276 
1277 		// let's grow the data stream to the size needed
1278 		status_t status = SetFileSize(transaction, pos + length);
1279 		if (status < B_OK) {
1280 			*_length = 0;
1281 			RETURN_ERROR(status);
1282 		}
1283 		// If the position of the write was beyond the file size, we
1284 		// have to fill the gap between that position and the old file
1285 		// size with zeros.
1286 		FillGapWithZeros(oldSize, pos);
1287 
1288 		// we need to write back the inode here because it has to
1289 		// go into this transaction (we cannot wait until the file
1290 		// is closed)
1291 		status = WriteBack(transaction);
1292 		if (status < B_OK)
1293 			return status;
1294 	}
1295 
1296 	// If we don't want to write anything, we can now return (we may
1297 	// just have changed the file size using the position parameter)
1298 	if (length == 0)
1299 		return B_OK;
1300 
1301 	return file_cache_write(FileCache(), pos, buffer, _length);
1302 }
1303 
1304 
1305 /*!
1306 	Fills the gap between the old file size and the new file size
1307 	with zeros.
1308 	It's more or less a copy of Inode::WriteAt() but it can handle
1309 	length differences of more than just 4 GB, and it never uses
1310 	the log, even if the INODE_LOGGED flag is set.
1311 */
1312 status_t
1313 Inode::FillGapWithZeros(off_t pos, off_t newSize)
1314 {
1315 	// ToDo: we currently do anything here, same as original BFS!
1316 	//if (pos >= newSize)
1317 	return B_OK;
1318 #if 0
1319 	block_run run;
1320 	off_t offset;
1321 	if (FindBlockRun(pos, run, offset) < B_OK)
1322 		RETURN_ERROR(B_BAD_VALUE);
1323 
1324 	off_t length = newSize - pos;
1325 	uint32 bytesWritten = 0;
1326 	uint32 blockSize = fVolume->BlockSize();
1327 	uint32 blockShift = fVolume->BlockShift();
1328 	uint8 *block;
1329 
1330 	// the first block_run we write could not be aligned to the block_size boundary
1331 	// (write partial block at the beginning)
1332 
1333 	// pos % block_size == (pos - offset) % block_size, offset % block_size == 0
1334 	if (pos % blockSize != 0) {
1335 		run.start += (pos - offset) / blockSize;
1336 		run.length -= (pos - offset) / blockSize;
1337 
1338 		CachedBlock cached(fVolume,run);
1339 		if ((block = cached.Block()) == NULL)
1340 			RETURN_ERROR(B_BAD_VALUE);
1341 
1342 		bytesWritten = blockSize - (pos % blockSize);
1343 		if (length < bytesWritten)
1344 			bytesWritten = length;
1345 
1346 		memset(block + (pos % blockSize), 0, bytesWritten);
1347 		if (fVolume->WriteBlocks(cached.BlockNumber(), block, 1) < B_OK)
1348 			RETURN_ERROR(B_IO_ERROR);
1349 
1350 		pos += bytesWritten;
1351 
1352 		length -= bytesWritten;
1353 		if (length == 0)
1354 			return B_OK;
1355 
1356 		if (FindBlockRun(pos, run, offset) < B_OK)
1357 			RETURN_ERROR(B_BAD_VALUE);
1358 	}
1359 
1360 	while (length > 0) {
1361 		// offset is the offset to the current pos in the block_run
1362 		run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start() + ((pos - offset) >> blockShift));
1363 		run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - ((pos - offset) >> blockShift));
1364 
1365 		CachedBlock cached(fVolume);
1366 		off_t blockNumber = fVolume->ToBlock(run);
1367 		for (int32 i = 0; i < run.Length(); i++) {
1368 			if ((block = cached.SetTo(blockNumber + i, true)) == NULL)
1369 				RETURN_ERROR(B_IO_ERROR);
1370 
1371 			if (fVolume->WriteBlocks(cached.BlockNumber(), block, 1) < B_OK)
1372 				RETURN_ERROR(B_IO_ERROR);
1373 		}
1374 
1375 		int32 bytes = run.Length() << blockShift;
1376 		length -= bytes;
1377 		bytesWritten += bytes;
1378 
1379 		// since we don't respect a last partial block, length can be lower
1380 		if (length <= 0)
1381 			break;
1382 
1383 		pos += bytes;
1384 
1385 		if (FindBlockRun(pos, run, offset) < B_OK)
1386 			RETURN_ERROR(B_BAD_VALUE);
1387 	}
1388 	return B_OK;
1389 #endif
1390 }
1391 
1392 
1393 /*!
1394 	Allocates NUM_ARRAY_BLOCKS blocks, and clears their contents. Growing
1395 	the indirect and double indirect range uses this method.
1396 	The allocated block_run is saved in "run"
1397 */
1398 status_t
1399 Inode::_AllocateBlockArray(Transaction &transaction, block_run &run)
1400 {
1401 	if (!run.IsZero())
1402 		return B_BAD_VALUE;
1403 
1404 	status_t status = fVolume->Allocate(transaction, this, NUM_ARRAY_BLOCKS,
1405 		run, NUM_ARRAY_BLOCKS);
1406 	if (status < B_OK)
1407 		return status;
1408 
1409 	// make sure those blocks are empty
1410 	CachedBlock cached(fVolume);
1411 	off_t block = fVolume->ToBlock(run);
1412 
1413 	for (int32 i = 0; i < run.Length(); i++) {
1414 		block_run *runs = (block_run *)cached.SetToWritable(transaction,
1415 			block + i, true);
1416 		if (runs == NULL)
1417 			return B_IO_ERROR;
1418 	}
1419 	return B_OK;
1420 }
1421 
1422 
1423 status_t
1424 Inode::_GrowStream(Transaction &transaction, off_t size)
1425 {
1426 	data_stream *data = &Node().data;
1427 
1428 	// is the data stream already large enough to hold the new size?
1429 	// (can be the case with preallocated blocks)
1430 	if (size < data->MaxDirectRange()
1431 		|| size < data->MaxIndirectRange()
1432 		|| size < data->MaxDoubleIndirectRange()) {
1433 		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1434 		return B_OK;
1435 	}
1436 
1437 	// how many bytes are still needed? (unused ranges are always zero)
1438 	uint16 minimum = 1;
1439 	off_t bytes;
1440 	if (data->Size() < data->MaxDoubleIndirectRange()) {
1441 		bytes = size - data->MaxDoubleIndirectRange();
1442 		// the double indirect range can only handle multiple of NUM_ARRAY_BLOCKS
1443 		minimum = NUM_ARRAY_BLOCKS;
1444 	} else if (data->Size() < data->MaxIndirectRange())
1445 		bytes = size - data->MaxIndirectRange();
1446 	else if (data->Size() < data->MaxDirectRange())
1447 		bytes = size - data->MaxDirectRange();
1448 	else
1449 		bytes = size - data->Size();
1450 
1451 	// do we have enough free blocks on the disk?
1452 	off_t blocksRequested = (bytes + fVolume->BlockSize() - 1)
1453 		>> fVolume->BlockShift();
1454 	if (blocksRequested > fVolume->FreeBlocks())
1455 		return B_DEVICE_FULL;
1456 
1457 	off_t blocksNeeded = blocksRequested;
1458 		// because of preallocations and partial allocations, the number of
1459 		// blocks we need to allocate may be different from the one we request
1460 		// from the block allocator
1461 
1462 	// Should we preallocate some blocks (currently, always 64k)?
1463 	// Attributes, attribute directories, and long symlinks usually won't get
1464 	// that big, and should stay close to the inode - preallocating could be
1465 	// counterproductive.
1466 	// Also, if free disk space is tight, we probably don't want to do this as well.
1467 	if (!IsAttribute() && !IsAttributeDirectory() && !IsSymLink()
1468 		&& blocksRequested < (65536 >> fVolume->BlockShift())
1469 		&& fVolume->FreeBlocks() > 128)
1470 		blocksRequested = 65536 >> fVolume->BlockShift();
1471 
1472 	while (blocksNeeded > 0) {
1473 		// the requested blocks do not need to be returned with a
1474 		// single allocation, so we need to iterate until we have
1475 		// enough blocks allocated
1476 		block_run run;
1477 		status_t status = fVolume->Allocate(transaction, this, blocksRequested,
1478 			run, minimum);
1479 		if (status < B_OK)
1480 			return status;
1481 
1482 		// okay, we have the needed blocks, so just distribute them to the
1483 		// different ranges of the stream (direct, indirect & double indirect)
1484 
1485 		// TODO: if anything goes wrong here, we probably want to free the
1486 		// blocks that couldn't be distributed into the stream!
1487 
1488 		blocksNeeded -= run.Length();
1489 		// don't preallocate if the first allocation was already too small
1490 		blocksRequested = blocksNeeded;
1491 		if (minimum > 1) {
1492 			// make sure that "blocks" is a multiple of minimum
1493 			blocksRequested = (blocksRequested + minimum - 1) & ~(minimum - 1);
1494 		}
1495 
1496 		// Direct block range
1497 
1498 		if (data->Size() <= data->MaxDirectRange()) {
1499 			// let's try to put them into the direct block range
1500 			int32 free = 0;
1501 			for (; free < NUM_DIRECT_BLOCKS; free++)
1502 				if (data->direct[free].IsZero())
1503 					break;
1504 
1505 			if (free < NUM_DIRECT_BLOCKS) {
1506 				// can we merge the last allocated run with the new one?
1507 				int32 last = free - 1;
1508 				if (free > 0 && data->direct[last].MergeableWith(run)) {
1509 					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(
1510 						data->direct[last].Length() + run.Length());
1511 				} else
1512 					data->direct[free] = run;
1513 
1514 				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(
1515 					data->MaxDirectRange() + run.Length() * fVolume->BlockSize());
1516 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1517 					? data->max_direct_range : size);
1518 				continue;
1519 			}
1520 		}
1521 
1522 		// Indirect block range
1523 
1524 		if (data->Size() <= data->MaxIndirectRange()
1525 			|| !data->MaxIndirectRange()) {
1526 			CachedBlock cached(fVolume);
1527 			block_run *runs = NULL;
1528 			uint32 free = 0;
1529 			off_t block;
1530 
1531 			// if there is no indirect block yet, create one
1532 			if (data->indirect.IsZero()) {
1533 				status = _AllocateBlockArray(transaction, data->indirect);
1534 				if (status < B_OK)
1535 					return status;
1536 
1537 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1538 					data->MaxDirectRange());
1539 				// insert the block_run in the first block
1540 				runs = (block_run *)cached.SetTo(data->indirect);
1541 			} else {
1542 				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1543 				block = fVolume->ToBlock(data->indirect);
1544 
1545 				// search first empty entry
1546 				int32 i = 0;
1547 				for (; i < data->indirect.Length(); i++) {
1548 					if ((runs = (block_run *)cached.SetTo(block + i)) == NULL)
1549 						return B_IO_ERROR;
1550 
1551 					for (free = 0; free < numberOfRuns; free++)
1552 						if (runs[free].IsZero())
1553 							break;
1554 
1555 					if (free < numberOfRuns)
1556 						break;
1557 				}
1558 				if (i == data->indirect.Length())
1559 					runs = NULL;
1560 			}
1561 
1562 			if (runs != NULL) {
1563 				// try to insert the run to the last one - note that this
1564 				// doesn't take block borders into account, so it could be
1565 				// further optimized
1566 				cached.MakeWritable(transaction);
1567 
1568 				int32 last = free - 1;
1569 				if (free > 0 && runs[last].MergeableWith(run)) {
1570 					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(
1571 						runs[last].Length() + run.Length());
1572 				} else
1573 					runs[free] = run;
1574 
1575 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1576 					data->MaxIndirectRange()
1577 					+ (run.Length() << fVolume->BlockShift()));
1578 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1579 					? data->MaxIndirectRange() : size);
1580 				continue;
1581 			}
1582 		}
1583 
1584 		// Double indirect block range
1585 
1586 		if (data->Size() <= data->MaxDoubleIndirectRange()
1587 			|| !data->max_double_indirect_range) {
1588 			while ((run.Length() % NUM_ARRAY_BLOCKS) != 0) {
1589 				// The number of allocated blocks isn't a multiple of
1590 				// NUM_ARRAY_BLOCKS, so we have to change this. This can happen
1591 				// the first time the stream grows into the double
1592 				// indirect range.
1593 				// First, free the remaining blocks that don't fit into a multiple
1594 				// of NUM_ARRAY_BLOCKS
1595 				int32 rest = run.Length() % NUM_ARRAY_BLOCKS;
1596 				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1597 
1598 				status = fVolume->Free(transaction,
1599 					block_run::Run(run.AllocationGroup(),
1600 					run.Start() + run.Length(), rest));
1601 				if (status < B_OK)
1602 					return status;
1603 
1604 				blocksNeeded += rest;
1605 				blocksRequested = (blocksNeeded + NUM_ARRAY_BLOCKS - 1)
1606 					& ~(NUM_ARRAY_BLOCKS - 1);
1607 				minimum = NUM_ARRAY_BLOCKS;
1608 					// we make sure here that we have at minimum
1609 					// NUM_ARRAY_BLOCKS allocated, so if the allocation
1610 					// succeeds, we don't run into an endless loop
1611 
1612 				// Are there any blocks left in the run? If not, allocate a new one
1613 				if (run.length == 0)
1614 					continue;
1615 			}
1616 
1617 			// if there is no double indirect block yet, create one
1618 			if (data->double_indirect.IsZero()) {
1619 				status = _AllocateBlockArray(transaction, data->double_indirect);
1620 				if (status < B_OK)
1621 					return status;
1622 
1623 				data->max_double_indirect_range = data->max_indirect_range;
1624 			}
1625 
1626 			// calculate the index where to insert the new blocks
1627 
1628 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1629 			int32 indirectSize = ((1L << INDIRECT_BLOCKS_SHIFT) << fVolume->BlockShift())
1630 				* runsPerBlock;
1631 			int32 directSize = NUM_ARRAY_BLOCKS << fVolume->BlockShift();
1632 			int32 runsPerArray = runsPerBlock << ARRAY_BLOCKS_SHIFT;
1633 
1634 			off_t start = data->MaxDoubleIndirectRange() - data->MaxIndirectRange();
1635 			int32 indirectIndex = start / indirectSize;
1636 			int32 index = start / directSize;
1637 
1638 			// distribute the blocks to the array and allocate
1639 			// new array blocks when needed
1640 
1641 			CachedBlock cached(fVolume);
1642 			CachedBlock cachedDirect(fVolume);
1643 			block_run *array = NULL;
1644 			uint32 runLength = run.Length();
1645 
1646 			// TODO: the following code is commented - it could be used to
1647 			// preallocate all needed block arrays to see in advance if the
1648 			// allocation will succeed.
1649 			// I will probably remove it later, because it's no perfect solution
1650 			// either: if the allocation was broken up before (blocksNeeded != 0),
1651 			// it doesn't guarantee anything.
1652 			// And since failing in this case is not that common, it doesn't have
1653 			// to be optimized in that way.
1654 			// Anyway, I wanted to have it in CVS - all those lines, and they will
1655 			// be removed soon :-)
1656 /*
1657 			// allocate new block arrays if needed
1658 
1659 			off_t block = -1;
1660 
1661 			for (int32 i = 0;i < needed;i++) {
1662 				// get the block to insert the run into
1663 				block = fVolume->ToBlock(data->double_indirect) + i + indirectIndex / runsPerBlock;
1664 				if (cached.BlockNumber() != block)
1665 					array = (block_run *)cached.SetTo(block);
1666 
1667 				if (array == NULL)
1668 					return B_ERROR;
1669 
1670 				status = AllocateBlockArray(transaction, array[i + indirectIndex % runsPerBlock]);
1671 				if (status < B_OK)
1672 					return status;
1673 			}
1674 */
1675 
1676 			while (run.length != 0) {
1677 				// get the indirect array block
1678 				if (array == NULL) {
1679 					array = (block_run *)cached.SetTo(fVolume->ToBlock(
1680 						data->double_indirect) + indirectIndex / runsPerBlock);
1681 					if (array == NULL)
1682 						return B_IO_ERROR;
1683 				}
1684 
1685 				do {
1686 					// do we need a new array block?
1687 					if (array[indirectIndex % runsPerBlock].IsZero()) {
1688 						status = _AllocateBlockArray(transaction,
1689 							array[indirectIndex % runsPerBlock]);
1690 						if (status < B_OK)
1691 							return status;
1692 					}
1693 
1694 					block_run *runs = (block_run *)cachedDirect.SetToWritable(
1695 						transaction, fVolume->ToBlock(array[indirectIndex
1696 							% runsPerBlock]) + index / runsPerBlock);
1697 					if (runs == NULL)
1698 						return B_IO_ERROR;
1699 
1700 					do {
1701 						// insert the block_run into the array
1702 						runs[index % runsPerBlock] = run;
1703 						runs[index % runsPerBlock].length
1704 							= HOST_ENDIAN_TO_BFS_INT16(NUM_ARRAY_BLOCKS);
1705 
1706 						// alter the remaining block_run
1707 						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start()
1708 							+ NUM_ARRAY_BLOCKS);
1709 						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
1710 							- NUM_ARRAY_BLOCKS);
1711 					} while ((++index % runsPerBlock) != 0 && run.length);
1712 				} while ((index % runsPerArray) != 0 && run.length);
1713 
1714 				if (++indirectIndex % runsPerBlock == 0) {
1715 					array = NULL;
1716 					index = 0;
1717 				}
1718 			}
1719 
1720 			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1721 				data->MaxDoubleIndirectRange()
1722 				+ (runLength << fVolume->BlockShift()));
1723 			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(
1724 				data->max_double_indirect_range) : size;
1725 
1726 			continue;
1727 		}
1728 
1729 		RETURN_ERROR(EFBIG);
1730 	}
1731 	// update the size of the data stream
1732 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1733 
1734 	return B_OK;
1735 }
1736 
1737 
1738 status_t
1739 Inode::_FreeStaticStreamArray(Transaction &transaction, int32 level,
1740 	block_run run, off_t size, off_t offset, off_t &max)
1741 {
1742 	int32 indirectSize = 0;
1743 	if (level == 0)
1744 		indirectSize = (1L << (INDIRECT_BLOCKS_SHIFT + fVolume->BlockShift()))
1745 			* (fVolume->BlockSize() / sizeof(block_run));
1746 	else if (level == 1)
1747 		indirectSize = 4 << fVolume->BlockShift();
1748 
1749 	off_t start;
1750 	if (size > offset)
1751 		start = size - offset;
1752 	else
1753 		start = 0;
1754 
1755 	int32 index = start / indirectSize;
1756 	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1757 
1758 	CachedBlock cached(fVolume);
1759 	off_t blockNumber = fVolume->ToBlock(run);
1760 
1761 	// set the file offset to the current block run
1762 	offset += (off_t)index * indirectSize;
1763 
1764 	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
1765 		block_run *array = (block_run *)cached.SetToWritable(transaction,
1766 			blockNumber + i);
1767 		if (array == NULL)
1768 			RETURN_ERROR(B_ERROR);
1769 
1770 		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
1771 			if (array[index].IsZero()) {
1772 				// we also want to break out of the outer loop
1773 				i = run.Length();
1774 				break;
1775 			}
1776 
1777 			status_t status = B_OK;
1778 			if (level == 0) {
1779 				status = _FreeStaticStreamArray(transaction, 1, array[index],
1780 					size, offset, max);
1781 			} else if (offset >= size)
1782 				status = fVolume->Free(transaction, array[index]);
1783 			else
1784 				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
1785 
1786 			if (status < B_OK)
1787 				RETURN_ERROR(status);
1788 
1789 			if (offset >= size)
1790 				array[index].SetTo(0, 0, 0);
1791 
1792 			offset += indirectSize;
1793 		}
1794 		index = 0;
1795 	}
1796 	return B_OK;
1797 }
1798 
1799 
1800 /*!
1801 	Frees all block_runs in the array which come after the specified size.
1802 	It also trims the last block_run that contain the size.
1803 	"offset" and "max" are maintained until the last block_run that doesn't
1804 	have to be freed - after this, the values won't be correct anymore, but
1805 	will still assure correct function for all subsequent calls.
1806 	"max" is considered to be in file system byte order.
1807 */
1808 status_t
1809 Inode::_FreeStreamArray(Transaction &transaction, block_run *array,
1810 	uint32 arrayLength, off_t size, off_t &offset, off_t &max)
1811 {
1812 	PRINT(("FreeStreamArray: arrayLength %lu, size %Ld, offset %Ld, max %Ld\n",
1813 		arrayLength, size, offset, max));
1814 
1815 	off_t newOffset = offset;
1816 	uint32 i = 0;
1817 	for (; i < arrayLength; i++, offset = newOffset) {
1818 		if (array[i].IsZero())
1819 			break;
1820 
1821 		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
1822 		if (newOffset <= size)
1823 			continue;
1824 
1825 		block_run run = array[i];
1826 
1827 		// determine the block_run to be freed
1828 		if (newOffset > size && offset < size) {
1829 			// free partial block_run (and update the original block_run)
1830 			run.start = array[i].start
1831 				+ ((size - offset) >> fVolume->BlockShift()) + 1;
1832 			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start()
1833 				- array[i].Start());
1834 			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
1835 				- array[i].Length());
1836 
1837 			if (run.length == 0)
1838 				continue;
1839 
1840 			// update maximum range
1841 			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length()
1842 				<< fVolume->BlockShift()));
1843 		} else {
1844 			// free the whole block_run
1845 			array[i].SetTo(0, 0, 0);
1846 
1847 			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
1848 				max = HOST_ENDIAN_TO_BFS_INT64(offset);
1849 		}
1850 
1851 		if (fVolume->Free(transaction, run) < B_OK)
1852 			return B_IO_ERROR;
1853 	}
1854 	return B_OK;
1855 }
1856 
1857 
1858 status_t
1859 Inode::_ShrinkStream(Transaction &transaction, off_t size)
1860 {
1861 	data_stream *data = &Node().data;
1862 	status_t status;
1863 
1864 	if (data->MaxDoubleIndirectRange() > size) {
1865 		off_t *maxDoubleIndirect = &data->max_double_indirect_range;
1866 			// gcc 4 work-around: "error: cannot bind packed field
1867 			// 'data->data_stream::max_double_indirect_range' to 'off_t&'"
1868 		status = _FreeStaticStreamArray(transaction, 0, data->double_indirect,
1869 			size, data->MaxIndirectRange(), *maxDoubleIndirect);
1870 		if (status < B_OK)
1871 			return status;
1872 
1873 		if (size <= data->MaxIndirectRange()) {
1874 			fVolume->Free(transaction, data->double_indirect);
1875 			data->double_indirect.SetTo(0, 0, 0);
1876 			data->max_double_indirect_range = 0;
1877 		}
1878 	}
1879 
1880 	if (data->MaxIndirectRange() > size) {
1881 		CachedBlock cached(fVolume);
1882 		off_t block = fVolume->ToBlock(data->indirect);
1883 		off_t offset = data->MaxDirectRange();
1884 
1885 		for (int32 i = 0; i < data->indirect.Length(); i++) {
1886 			block_run *array = (block_run *)cached.SetToWritable(transaction,
1887 				block + i);
1888 			if (array == NULL)
1889 				break;
1890 
1891 			off_t *maxIndirect = &data->max_indirect_range;
1892 				// gcc 4 work-around: "error: cannot bind packed field
1893 				// 'data->data_stream::max_indirect_range' to 'off_t&'"
1894 			if (_FreeStreamArray(transaction, array, fVolume->BlockSize()
1895 					/ sizeof(block_run), size, offset, *maxIndirect) != B_OK)
1896 				return B_IO_ERROR;
1897 		}
1898 		if (data->max_direct_range == data->max_indirect_range) {
1899 			fVolume->Free(transaction, data->indirect);
1900 			data->indirect.SetTo(0, 0, 0);
1901 			data->max_indirect_range = 0;
1902 		}
1903 	}
1904 
1905 	if (data->MaxDirectRange() > size) {
1906 		off_t offset = 0;
1907 		off_t *maxDirect = &data->max_direct_range;
1908 			// gcc 4 work-around: "error: cannot bind packed field
1909 			// 'data->data_stream::max_direct_range' to 'off_t&'"
1910 		status = _FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS,
1911 			size, offset, *maxDirect);
1912 		if (status < B_OK)
1913 			return status;
1914 	}
1915 
1916 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1917 	return B_OK;
1918 }
1919 
1920 
1921 status_t
1922 Inode::SetFileSize(Transaction &transaction, off_t size)
1923 {
1924 	if (size < 0)
1925 		return B_BAD_VALUE;
1926 
1927 	off_t oldSize = Size();
1928 
1929 	if (size == oldSize)
1930 		return B_OK;
1931 
1932 	// should the data stream grow or shrink?
1933 	status_t status;
1934 	if (size > oldSize) {
1935 		status = _GrowStream(transaction, size);
1936 		if (status < B_OK) {
1937 			// if the growing of the stream fails, the whole operation
1938 			// fails, so we should shrink the stream to its former size
1939 			_ShrinkStream(transaction, oldSize);
1940 		}
1941 	} else
1942 		status = _ShrinkStream(transaction, size);
1943 
1944 	if (status < B_OK)
1945 		return status;
1946 
1947 	file_cache_set_size(FileCache(), size);
1948 	return WriteBack(transaction);
1949 }
1950 
1951 
1952 status_t
1953 Inode::Append(Transaction &transaction, off_t bytes)
1954 {
1955 	return SetFileSize(transaction, Size() + bytes);
1956 }
1957 
1958 
1959 /*!
1960 	Checks wether or not this inode's data stream needs to be trimmed
1961 	because of an earlier preallocation.
1962 	Returns true if there are any blocks to be trimmed.
1963 */
1964 bool
1965 Inode::NeedsTrimming()
1966 {
1967 	// We never trim preallocated index blocks to make them grow as smooth as possible.
1968 	// There are only few indices anyway, so this doesn't hurt
1969 	// Also, if an inode is already in deleted state, we don't bother trimming it
1970 	if (IsIndex() || IsDeleted())
1971 		return false;
1972 
1973 	off_t roundedSize = (Size() + fVolume->BlockSize() - 1)
1974 		& ~(fVolume->BlockSize() - 1);
1975 
1976 	return Node().data.MaxDirectRange() > roundedSize
1977 		|| Node().data.MaxIndirectRange() > roundedSize
1978 		|| Node().data.MaxDoubleIndirectRange() > roundedSize;
1979 }
1980 
1981 
1982 status_t
1983 Inode::TrimPreallocation(Transaction &transaction)
1984 {
1985 	status_t status = _ShrinkStream(transaction, Size());
1986 	if (status < B_OK)
1987 		return status;
1988 
1989 	return WriteBack(transaction);
1990 }
1991 
1992 
1993 //!	Frees the file's data stream and removes all attributes
1994 status_t
1995 Inode::Free(Transaction &transaction)
1996 {
1997 	FUNCTION();
1998 
1999 	// Perhaps there should be an implementation of Inode::ShrinkStream() that
2000 	// just frees the data_stream, but doesn't change the inode (since it is
2001 	// freed anyway) - that would make an undelete command possible
2002 	status_t status = SetFileSize(transaction, 0);
2003 	if (status < B_OK)
2004 		return status;
2005 
2006 	// Free all attributes, and remove their indices
2007 	{
2008 		// We have to limit the scope of AttributeIterator, so that its
2009 		// destructor is not called after the inode is deleted
2010 		AttributeIterator iterator(this);
2011 
2012 		char name[B_FILE_NAME_LENGTH];
2013 		uint32 type;
2014 		size_t length;
2015 		ino_t id;
2016 		while ((status = iterator.GetNext(name, &length, &type, &id)) == B_OK) {
2017 			RemoveAttribute(transaction, name);
2018 		}
2019 	}
2020 
2021 	if (WriteBack(transaction) < B_OK)
2022 		return B_IO_ERROR;
2023 
2024 	return fVolume->Free(transaction, BlockRun());
2025 }
2026 
2027 
2028 //!	Synchronizes (writes back to disk) the file stream of the inode.
2029 status_t
2030 Inode::Sync()
2031 {
2032 	if (FileCache())
2033 		return file_cache_sync(FileCache());
2034 
2035 	// We may also want to flush the attribute's data stream to
2036 	// disk here... (do we?)
2037 
2038 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0)
2039 		return B_OK;
2040 
2041 	data_stream *data = &Node().data;
2042 	status_t status = B_OK;
2043 
2044 	// flush direct range
2045 
2046 	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
2047 		if (data->direct[i].IsZero())
2048 			return B_OK;
2049 
2050 		status = block_cache_sync_etc(fVolume->BlockCache(),
2051 			fVolume->ToBlock(data->direct[i]), data->direct[i].Length());
2052 		if (status != B_OK)
2053 			return status;
2054 	}
2055 
2056 	// flush indirect range
2057 
2058 	if (data->max_indirect_range == 0)
2059 		return B_OK;
2060 
2061 	CachedBlock cached(fVolume);
2062 	off_t block = fVolume->ToBlock(data->indirect);
2063 	int32 count = fVolume->BlockSize() / sizeof(block_run);
2064 
2065 	for (int32 j = 0; j < data->indirect.Length(); j++) {
2066 		block_run *runs = (block_run *)cached.SetTo(block + j);
2067 		if (runs == NULL)
2068 			break;
2069 
2070 		for (int32 i = 0; i < count; i++) {
2071 			if (runs[i].IsZero())
2072 				return B_OK;
2073 
2074 			status = block_cache_sync_etc(fVolume->BlockCache(),
2075 				fVolume->ToBlock(runs[i]), runs[i].Length());
2076 			if (status != B_OK)
2077 				return status;
2078 		}
2079 	}
2080 
2081 	// flush double indirect range
2082 
2083 	if (data->max_double_indirect_range == 0)
2084 		return B_OK;
2085 
2086 	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
2087 
2088 	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
2089 		block_run *indirectRuns = (block_run *)cached.SetTo(indirectBlock + l);
2090 		if (indirectRuns == NULL)
2091 			return B_FILE_ERROR;
2092 
2093 		CachedBlock directCached(fVolume);
2094 
2095 		for (int32 k = 0; k < count; k++) {
2096 			if (indirectRuns[k].IsZero())
2097 				return B_OK;
2098 
2099 			block = fVolume->ToBlock(indirectRuns[k]);
2100 			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
2101 				block_run *runs = (block_run *)directCached.SetTo(block + j);
2102 				if (runs == NULL)
2103 					return B_FILE_ERROR;
2104 
2105 				for (int32 i = 0; i < count; i++) {
2106 					if (runs[i].IsZero())
2107 						return B_OK;
2108 
2109 					// TODO: combine single block_runs to bigger ones when
2110 					// they are adjacent
2111 					status = block_cache_sync_etc(fVolume->BlockCache(),
2112 						fVolume->ToBlock(runs[i]), runs[i].Length());
2113 					if (status != B_OK)
2114 						return status;
2115 				}
2116 			}
2117 		}
2118 	}
2119 	return B_OK;
2120 }
2121 
2122 
2123 //	#pragma mark - creation/deletion
2124 
2125 
2126 status_t
2127 Inode::Remove(Transaction &transaction, const char *name, ino_t *_id,
2128 	bool isDirectory)
2129 {
2130 	BPlusTree *tree;
2131 	if (GetTree(&tree) != B_OK)
2132 		RETURN_ERROR(B_BAD_VALUE);
2133 
2134 	RecursiveLocker locker(fVolume->Lock());
2135 
2136 	// does the file even exist?
2137 	off_t id;
2138 	if (tree->Find((uint8 *)name, (uint16)strlen(name), &id) < B_OK)
2139 		return B_ENTRY_NOT_FOUND;
2140 
2141 	if (_id)
2142 		*_id = id;
2143 
2144 	Vnode vnode(fVolume, id);
2145 	Inode *inode;
2146 	status_t status = vnode.Get(&inode);
2147 	if (status < B_OK) {
2148 		REPORT_ERROR(status);
2149 		return B_ENTRY_NOT_FOUND;
2150 	}
2151 
2152 	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
2153 	// bit is set for indices in BFS, not for attribute directories) - but you
2154 	// should really be able to do whatever you want with your indices
2155 	// without having to remove all files first :)
2156 	if (!inode->IsIndex()) {
2157 		// if it's not of the correct type, don't delete it!
2158 		if (inode->IsContainer() != isDirectory)
2159 			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
2160 
2161 		// only delete empty directories
2162 		if (isDirectory && !inode->IsEmpty())
2163 			return B_DIRECTORY_NOT_EMPTY;
2164 	}
2165 
2166 	// remove_vnode() allows the inode to be accessed until the last put_vnode()
2167 	status = remove_vnode(fVolume->ID(), id);
2168 	if (status != B_OK)
2169 		return status;
2170 
2171 	if (tree->Remove(transaction, name, id) < B_OK) {
2172 		unremove_vnode(fVolume->ID(), id);
2173 		RETURN_ERROR(B_ERROR);
2174 	}
2175 
2176 #ifdef DEBUG
2177 	if (tree->Find((uint8 *)name, (uint16)strlen(name), &id) == B_OK) {
2178 		DIE(("deleted entry still there"));
2179 	}
2180 #endif
2181 
2182 	// update the inode, so that no one will ever doubt it's deleted :-)
2183 	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
2184 	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
2185 
2186 	// In balance to the Inode::Create() method, the main indices
2187 	// are updated here (name, size, & last_modified)
2188 
2189 	Index index(fVolume);
2190 	if (inode->IsRegularNode()) {
2191 		index.RemoveName(transaction, name, inode);
2192 			// If removing from the index fails, it is not regarded as a
2193 			// fatal error and will not be reported back!
2194 			// Deleted inodes won't be visible in queries anyway.
2195 	}
2196 
2197 	if (inode->IsFile() || inode->IsSymLink()) {
2198 		if (inode->IsFile())
2199 			index.RemoveSize(transaction, inode);
2200 		index.RemoveLastModified(transaction, inode);
2201 	}
2202 
2203 	return inode->WriteBack(transaction);
2204 }
2205 
2206 
2207 /*!
2208 	Creates the inode with the specified parent directory, and automatically
2209 	adds the created inode to that parent directory. If an attribute directory
2210 	is created, it will also automatically  be added to the parent inode as
2211 	such. However, the indices root node, and the regular root node won't be
2212 	added to the super block.
2213 	It will also create the initial B+tree for the inode if it's a directory
2214 	of any kind.
2215 	If the "_id" or "_inode" variable is given and non-NULL to store the
2216 	inode's ID, the inode stays locked - you have to call put_vnode() if you
2217 	don't use it anymore.
2218 	If the node already exists, this method will fail if O_EXCL is set, or it's
2219 	a directory or a symlink. Otherwise, it will just be returned. If O_TRUNC
2220 	has been specified, the file will also be truncated.
2221 */
2222 status_t
2223 Inode::Create(Transaction &transaction, Inode *parent, const char *name,
2224 	int32 mode, int openMode, uint32 type, bool *_created, ino_t *_id,
2225 	Inode **_inode)
2226 {
2227 	FUNCTION_START(("name = %s, mode = %ld\n", name, mode));
2228 
2229 	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
2230 	Volume *volume = transaction.GetVolume();
2231 	BPlusTree *tree = NULL;
2232 
2233 	if (parent && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
2234 		// check if the file already exists in the directory
2235 		if (parent->GetTree(&tree) != B_OK)
2236 			RETURN_ERROR(B_BAD_VALUE);
2237 	}
2238 
2239 	WriteLocked locker(parent != NULL ? &parent->Lock() : NULL);
2240 		// the parent directory is locked during the whole inode creation
2241 
2242 	if (tree != NULL) {
2243 		// does the file already exist?
2244 		off_t offset;
2245 		if (tree->Find((uint8 *)name, (uint16)strlen(name), &offset) == B_OK) {
2246 			// return if the file should be a directory/link or opened in exclusive mode
2247 			if (S_ISDIR(mode) || S_ISLNK(mode) || openMode & O_EXCL)
2248 				return B_FILE_EXISTS;
2249 
2250 			Vnode vnode(volume, offset);
2251 			Inode *inode;
2252 			status_t status = vnode.Get(&inode);
2253 			if (status < B_OK) {
2254 				REPORT_ERROR(status);
2255 				return B_ENTRY_NOT_FOUND;
2256 			}
2257 
2258 			// if it's a directory, bail out!
2259 			if (inode->IsDirectory())
2260 				return B_IS_A_DIRECTORY;
2261 
2262 			// we want to open the file, so we should have the rights to do so
2263 			if (inode->CheckPermissions(openModeToAccess(openMode)) != B_OK)
2264 				return B_NOT_ALLOWED;
2265 
2266 			if (openMode & O_TRUNC) {
2267 				// we need write access in order to truncate the file
2268 				status = inode->CheckPermissions(W_OK);
2269 				if (status != B_OK)
2270 					return status;
2271 
2272 				// truncate the existing file
2273 				WriteLocked locked(inode->Lock());
2274 
2275 				status_t status = inode->SetFileSize(transaction, 0);
2276 				if (status >= B_OK)
2277 					status = inode->WriteBack(transaction);
2278 
2279 				if (status < B_OK)
2280 					return status;
2281 			}
2282 
2283 			if (_created)
2284 				*_created = false;
2285 			if (_id)
2286 				*_id = inode->ID();
2287 			if (_inode)
2288 				*_inode = inode;
2289 
2290 			// only keep the vnode in memory if the _id or _inode pointer is provided
2291 			if (_id != NULL || _inode != NULL)
2292 				vnode.Keep();
2293 
2294 			return B_OK;
2295 		}
2296 	} else if (parent && (mode & S_ATTR_DIR) == 0)
2297 		return B_BAD_VALUE;
2298 
2299 	status_t status;
2300 
2301 	// do we have the power to create new files at all?
2302 	if (parent != NULL && (status = parent->CheckPermissions(W_OK)) != B_OK)
2303 		RETURN_ERROR(status);
2304 
2305 	// allocate space for the new inode
2306 	InodeAllocator allocator(transaction);
2307 	block_run run;
2308 	Inode *inode;
2309 	status = allocator.New(&parentRun, mode, run, &inode);
2310 	if (status < B_OK)
2311 		return status;
2312 
2313 	// Initialize the parts of the bfs_inode structure that
2314 	// InodeAllocator::New() hasn't touched yet
2315 
2316 	bfs_inode *node = &inode->Node();
2317 
2318 	if (parent == NULL) {
2319 		// we set the parent to itself in this case
2320 		// (only happens for the root and indices node)
2321 		node->parent = run;
2322 	} else
2323 		node->parent = parentRun;
2324 
2325 	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
2326 	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent
2327 		? parent->Node().GroupID() : getegid());
2328 		// the group ID is inherited from the parent, if available
2329 
2330 	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
2331 
2332 	inode->WriteBack(transaction);
2333 		// make sure the initialized node is available to others
2334 
2335 	// only add the name to regular files, directories, or symlinks
2336 	// don't add it to attributes, or indices
2337 	if (tree && inode->IsRegularNode() && inode->SetName(transaction, name) < B_OK)
2338 		return B_ERROR;
2339 
2340 	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2341 	// a standard directory for files - not for attributes or indices)
2342 	if (inode->IsContainer()) {
2343 		status = allocator.CreateTree();
2344 		if (status < B_OK)
2345 			return status;
2346 	}
2347 
2348 	// Add a link to the inode from the parent, depending on its type
2349 	// (the vnode is not published yet, so it is safe to make the inode
2350 	// accessable to the file system here)
2351 	if (tree) {
2352 		status = tree->Insert(transaction, name, inode->ID());
2353 	} else if (parent && (mode & S_ATTR_DIR) != 0) {
2354 		parent->Attributes() = run;
2355 		status = parent->WriteBack(transaction);
2356 	}
2357 
2358 	// Note, we only care if the inode could be made accessable for the
2359 	// two cases above; the root node or the indices root node must
2360 	// handle this case on their own (or other cases where "parent" is
2361 	// NULL)
2362 	if (status < B_OK)
2363 		RETURN_ERROR(status);
2364 
2365 	// Update the main indices (name, size & last_modified)
2366 	// (live queries might want to access us after this)
2367 
2368 	Index index(volume);
2369 	if (inode->IsRegularNode() && name != NULL) {
2370 		// the name index only contains regular files
2371 		// (but not the root node where name == NULL)
2372 		status = index.InsertName(transaction, name, inode);
2373 		if (status < B_OK && status != B_BAD_INDEX) {
2374 			// We have to remove the node from the parent at this point,
2375 			// because the InodeAllocator destructor can't handle this
2376 			// case (and if it fails, we can't do anything about it...)
2377 			if (tree)
2378 				tree->Remove(transaction, name, inode->ID());
2379 			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2380 				parent->Node().attributes.SetTo(0, 0, 0);
2381 
2382 			RETURN_ERROR(status);
2383 		}
2384 	}
2385 
2386 	inode->UpdateOldLastModified();
2387 
2388 	// The "size" & "last_modified" indices don't contain directories
2389 	if (inode->IsFile() || inode->IsSymLink()) {
2390 		// if adding to these indices fails, the inode creation will not be harmed;
2391 		// they are considered less important than the "name" index
2392 		if (inode->IsFile())
2393 			index.InsertSize(transaction, inode);
2394 		index.InsertLastModified(transaction, inode);
2395 	}
2396 
2397 	// Everything worked well until this point, we have a fully
2398 	// initialized inode, and we want to keep it
2399 	allocator.Keep();
2400 
2401 	if (inode->IsFile() || inode->IsAttribute()) {
2402 		inode->SetFileCache(file_cache_create(volume->ID(), inode->ID(),
2403 			inode->Size(), volume->Device()));
2404 	}
2405 
2406 	if (_created)
2407 		*_created = true;
2408 	if (_id != NULL)
2409 		*_id = inode->ID();
2410 	if (_inode != NULL)
2411 		*_inode = inode;
2412 
2413 	// if either _id or _inode is passed, we will keep the inode locked
2414 	if (_id == NULL && _inode == NULL)
2415 		put_vnode(volume->ID(), inode->ID());
2416 
2417 	return B_OK;
2418 }
2419 
2420 
2421 //	#pragma mark - AttributeIterator
2422 
2423 
2424 AttributeIterator::AttributeIterator(Inode *inode)
2425 	:
2426 	fCurrentSmallData(0),
2427 	fInode(inode),
2428 	fAttributes(NULL),
2429 	fIterator(NULL),
2430 	fBuffer(NULL)
2431 {
2432 	inode->_AddIterator(this);
2433 }
2434 
2435 
2436 AttributeIterator::~AttributeIterator()
2437 {
2438 	if (fAttributes)
2439 		put_vnode(fAttributes->GetVolume()->ID(), fAttributes->ID());
2440 
2441 	delete fIterator;
2442 	fInode->_RemoveIterator(this);
2443 }
2444 
2445 
2446 status_t
2447 AttributeIterator::Rewind()
2448 {
2449 	fCurrentSmallData = 0;
2450 
2451 	if (fIterator != NULL)
2452 		fIterator->Rewind();
2453 
2454 	return B_OK;
2455 }
2456 
2457 
2458 status_t
2459 AttributeIterator::GetNext(char *name, size_t *_length, uint32 *_type,
2460 	ino_t *_id)
2461 {
2462 	// read attributes out of the small data section
2463 
2464 	if (fCurrentSmallData >= 0) {
2465 		NodeGetter nodeGetter(fInode->GetVolume(), fInode);
2466 		const bfs_inode *node = nodeGetter.Node();
2467 		const small_data *item = ((bfs_inode *)node)->SmallDataStart();
2468 
2469 		fInode->SmallDataLock().Lock();
2470 
2471 		int32 i = 0;
2472 		for (;;item = item->Next()) {
2473 			if (item->IsLast(node))
2474 				break;
2475 
2476 			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2477 				&& *item->Name() == FILE_NAME_NAME)
2478 				continue;
2479 
2480 			if (i++ == fCurrentSmallData)
2481 				break;
2482 		}
2483 
2484 		if (!item->IsLast(node)) {
2485 			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2486 			*_type = item->Type();
2487 			*_length = item->NameSize();
2488 			*_id = (ino_t)fCurrentSmallData;
2489 
2490 			fCurrentSmallData = i;
2491 		}
2492 		else {
2493 			// stop traversing the small_data section
2494 			fCurrentSmallData = -1;
2495 		}
2496 
2497 		fInode->SmallDataLock().Unlock();
2498 
2499 		if (fCurrentSmallData != -1)
2500 			return B_OK;
2501 	}
2502 
2503 	// read attributes out of the attribute directory
2504 
2505 	if (fInode->Attributes().IsZero())
2506 		return B_ENTRY_NOT_FOUND;
2507 
2508 	Volume *volume = fInode->GetVolume();
2509 
2510 	// if you haven't yet access to the attributes directory, get it
2511 	if (fAttributes == NULL) {
2512 		if (get_vnode(volume->ID(), volume->ToVnode(fInode->Attributes()),
2513 				(void **)&fAttributes) != B_OK) {
2514 			FATAL(("get_vnode() failed in AttributeIterator::GetNext(ino_t"
2515 				" = %Ld,name = \"%s\")\n",fInode->ID(),name));
2516 			return B_ENTRY_NOT_FOUND;
2517 		}
2518 
2519 		BPlusTree *tree;
2520 		if (fAttributes->GetTree(&tree) < B_OK
2521 			|| (fIterator = new TreeIterator(tree)) == NULL) {
2522 			FATAL(("could not get tree in AttributeIterator::GetNext(ino_t"
2523 				" = %Ld,name = \"%s\")\n",fInode->ID(),name));
2524 			return B_ENTRY_NOT_FOUND;
2525 		}
2526 	}
2527 
2528 	uint16 length;
2529 	ino_t id;
2530 	status_t status = fIterator->GetNextEntry(name, &length,
2531 		B_FILE_NAME_LENGTH, &id);
2532 	if (status < B_OK)
2533 		return status;
2534 
2535 	Vnode vnode(volume,id);
2536 	Inode *attribute;
2537 	if ((status = vnode.Get(&attribute)) == B_OK) {
2538 		*_type = attribute->Type();
2539 		*_length = attribute->Size();
2540 		*_id = id;
2541 	}
2542 
2543 	return status;
2544 }
2545 
2546 
2547 void
2548 AttributeIterator::Update(uint16 index, int8 change)
2549 {
2550 	// fCurrentSmallData points already to the next item
2551 	if (index < fCurrentSmallData)
2552 		fCurrentSmallData += change;
2553 }
2554 
2555