xref: /haiku/src/add-ons/kernel/file_systems/bfs/Inode.cpp (revision 2f470aec1c92ce6917b8a903e343795dc77af41f)
1 /*
2  * Copyright 2001-2007, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 //! inode access functions
7 
8 
9 #include "Debug.h"
10 #include "Inode.h"
11 #include "BPlusTree.h"
12 #include "Index.h"
13 
14 
15 class InodeAllocator {
16 	public:
17 		InodeAllocator(Transaction &transaction);
18 		~InodeAllocator();
19 
20 		status_t New(block_run *parentRun, mode_t mode, block_run &run, Inode **_inode);
21 		status_t CreateTree();
22 		status_t Keep();
23 
24 	private:
25 		Transaction *fTransaction;
26 		block_run fRun;
27 		Inode *fInode;
28 };
29 
30 
31 InodeAllocator::InodeAllocator(Transaction &transaction)
32 	:
33 	fTransaction(&transaction),
34 	fInode(NULL)
35 {
36 }
37 
38 
39 InodeAllocator::~InodeAllocator()
40 {
41 	if (fTransaction != NULL) {
42 		Volume *volume = fTransaction->GetVolume();
43 
44 		if (fInode != NULL) {
45 			fInode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
46 				// this unblocks any pending bfs_read_vnode() calls
47 			fInode->Free(*fTransaction);
48 			remove_vnode(volume->ID(), fInode->ID());
49 		} else
50 			volume->Free(*fTransaction, fRun);
51 	}
52 
53 	delete fInode;
54 }
55 
56 
57 status_t
58 InodeAllocator::New(block_run *parentRun, mode_t mode, block_run &run, Inode **_inode)
59 {
60 	Volume *volume = fTransaction->GetVolume();
61 
62 	status_t status = volume->AllocateForInode(*fTransaction, parentRun, mode, fRun);
63 	if (status < B_OK) {
64 		// don't free the space in the destructor, because
65 		// the allocation failed
66 		fTransaction = NULL;
67 		RETURN_ERROR(status);
68 	}
69 
70 	run = fRun;
71 	fInode = new Inode(volume, *fTransaction, volume->ToVnode(run), mode, run);
72 	if (fInode == NULL)
73 		RETURN_ERROR(B_NO_MEMORY);
74 
75 	if (volume->ID() >= 0) {
76 		status = new_vnode(volume->ID(), fInode->ID(), fInode);
77 		if (status < B_OK) {
78 			delete fInode;
79 			fInode = NULL;
80 			RETURN_ERROR(status);
81 		}
82 	}
83 
84 	*_inode = fInode;
85 	return B_OK;
86 }
87 
88 
89 status_t
90 InodeAllocator::CreateTree()
91 {
92 	Volume *volume = fTransaction->GetVolume();
93 
94 	// force S_STR_INDEX to be set, if no type is set
95 	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
96 		fInode->Node().mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
97 
98 	BPlusTree *tree = fInode->fTree = new BPlusTree(*fTransaction, fInode);
99 	if (tree == NULL || tree->InitCheck() < B_OK)
100 		return B_ERROR;
101 
102 	if (fInode->IsRegularNode()) {
103 		if (tree->Insert(*fTransaction, ".", fInode->ID()) < B_OK
104 			|| tree->Insert(*fTransaction, "..", volume->ToVnode(fInode->Parent())) < B_OK)
105 			return B_ERROR;
106 	}
107 	return B_OK;
108 }
109 
110 
111 status_t
112 InodeAllocator::Keep()
113 {
114 	ASSERT(fInode != NULL && fTransaction != NULL);
115 	Volume *volume = fTransaction->GetVolume();
116 
117 	status_t status = fInode->WriteBack(*fTransaction);
118 	if (status < B_OK) {
119 		FATAL(("writing new inode %Ld failed!\n", fInode->ID()));
120 		return status;
121 	}
122 
123 	if (!fInode->IsSymLink() && volume->ID() >= 0)
124 		status = publish_vnode(volume->ID(), fInode->ID(), fInode);
125 
126 	fTransaction = NULL;
127 	fInode = NULL;
128 
129 	return status;
130 }
131 
132 
133 //	#pragma mark -
134 
135 
136 status_t
137 bfs_inode::InitCheck(Volume *volume)
138 {
139 	if (Magic1() != INODE_MAGIC1
140 		|| !(Flags() & INODE_IN_USE)
141 		|| inode_num.Length() != 1
142 		// matches inode size?
143 		|| (uint32)InodeSize() != volume->InodeSize()
144 		// parent resides on disk?
145 		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
146 		|| parent.AllocationGroup() < 0
147 		|| parent.Start() > (1L << volume->AllocationGroupShift())
148 		|| parent.Length() != 1
149 		// attributes, too?
150 		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
151 		|| attributes.AllocationGroup() < 0
152 		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
153 		RETURN_ERROR(B_BAD_DATA);
154 
155 	if (Flags() & INODE_DELETED)
156 		return B_NOT_ALLOWED;
157 
158 	// ToDo: Add some tests to check the integrity of the other stuff here,
159 	// especially for the data_stream!
160 
161 	return B_OK;
162 }
163 
164 
165 //	#pragma mark - Inode
166 
167 
168 Inode::Inode(Volume *volume, vnode_id id)
169 	:
170 	fVolume(volume),
171 	fID(id),
172 	fTree(NULL),
173 	fAttributes(NULL),
174 	fCache(NULL)
175 {
176 	PRINT(("Inode::Inode(volume = %p, id = %Ld) @ %p\n", volume, id, this));
177 
178 	NodeGetter node(volume, this);
179 	memcpy(&fNode, node.Node(), sizeof(bfs_inode));
180 
181 	char lockName[B_OS_NAME_LENGTH];
182 	snprintf(lockName, sizeof(lockName), "bfs inode %d.%d",
183 		(int)BlockRun().AllocationGroup(), BlockRun().Start());
184 	fLock.Initialize(lockName);
185 
186 	// these two will help to maintain the indices
187 	fOldSize = Size();
188 	fOldLastModified = LastModified();
189 
190 	if (IsContainer())
191 		fTree = new BPlusTree(this);
192 	if (IsFile() || IsAttribute())
193 		SetFileCache(file_cache_create(fVolume->ID(), ID(), Size(), fVolume->Device()));
194 }
195 
196 
197 Inode::Inode(Volume *volume, Transaction &transaction, vnode_id id, mode_t mode, block_run &run)
198 	:
199 	fVolume(volume),
200 	fID(id),
201 	fTree(NULL),
202 	fAttributes(NULL),
203 	fCache(NULL)
204 {
205 	PRINT(("Inode::Inode(volume = %p, transaction = %p, id = %Ld) @ %p\n",
206 		volume, &transaction, id, this));
207 
208 	char lockName[B_OS_NAME_LENGTH];
209 	snprintf(lockName, sizeof(lockName), "bfs inode+%d.%d",
210 		(int)run.AllocationGroup(), run.Start());
211 	fLock.Initialize(lockName);
212 
213 	NodeGetter node(volume, transaction, this, true);
214 	memset(&fNode, 0, sizeof(bfs_inode));
215 
216 	// Initialize the bfs_inode structure -- it's not written back to disk
217 	// here, because it will be done so already when the inode could be
218 	// created completely.
219 
220 	Node().magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
221 	Node().inode_num = run;
222 	Node().mode = HOST_ENDIAN_TO_BFS_INT32(mode);
223 	Node().flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
224 
225 	Node().create_time = HOST_ENDIAN_TO_BFS_INT64((bigtime_t)time(NULL) << INODE_TIME_SHIFT);
226 	Node().last_modified_time = HOST_ENDIAN_TO_BFS_INT64(Node().create_time
227 		| (volume->GetUniqueID() & INODE_TIME_MASK));
228 		// we use Volume::GetUniqueID() to avoid having too many duplicates in the
229 		// last_modified index
230 
231 	Node().inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
232 
233 	// these two will help to maintain the indices
234 	fOldSize = Size();
235 	fOldLastModified = LastModified();
236 }
237 
238 
239 Inode::~Inode()
240 {
241 	PRINT(("Inode::~Inode() @ %p\n", this));
242 
243 	file_cache_delete(FileCache());
244 	delete fTree;
245 }
246 
247 
248 status_t
249 Inode::InitCheck(bool checkNode)
250 {
251 	// test inode magic and flags
252 	if (checkNode) {
253 		status_t status = Node().InitCheck(fVolume);
254 		if (status == B_BUSY)
255 			return B_BUSY;
256 
257 		if (status < B_OK) {
258 			FATAL(("inode at block %Ld corrupt!\n", BlockNumber()));
259 			RETURN_ERROR(B_BAD_DATA);
260 		}
261 	}
262 
263 	if (IsContainer()) {
264 		// inodes that have a
265 		if (fTree == NULL)
266 			RETURN_ERROR(B_NO_MEMORY);
267 
268 		status_t status = fTree->InitCheck();
269 		if (status < B_OK) {
270 			FATAL(("inode tree at block %Ld corrupt!\n", BlockNumber()));
271 			RETURN_ERROR(B_BAD_DATA);
272 		}
273 	}
274 
275 	// it's more important to know that the inode is corrupt
276 	// so we check for the lock not until here
277 	return fLock.InitCheck();
278 }
279 
280 
281 status_t
282 Inode::WriteBack(Transaction &transaction)
283 {
284 	NodeGetter node(fVolume, transaction, this);
285 	if (node.WritableNode() == NULL)
286 		return B_IO_ERROR;
287 
288 	memcpy(node.WritableNode(), &Node(), sizeof(bfs_inode));
289 	return B_OK;
290 }
291 
292 
293 status_t
294 Inode::CheckPermissions(int accessMode) const
295 {
296 	uid_t user = geteuid();
297 	gid_t group = getegid();
298 
299 	// you never have write access to a read-only volume
300 	if (accessMode & W_OK && fVolume->IsReadOnly())
301 		return B_READ_ONLY_DEVICE;
302 
303 	// root users always have full access (but they can't execute anything)
304 	if (user == 0 && !((accessMode & X_OK) && (Mode() & S_IXUSR) == 0))
305 		return B_OK;
306 
307 	// shift mode bits, to check directly against accessMode
308 	mode_t mode = Mode();
309 	if (user == (uid_t)fNode.UserID())
310 		mode >>= 6;
311 	else if (group == (gid_t)fNode.GroupID())
312 		mode >>= 3;
313 
314 	if (accessMode & ~(mode & S_IRWXO))
315 		return B_NOT_ALLOWED;
316 
317 	return B_OK;
318 }
319 
320 
321 //	#pragma mark - attributes
322 
323 
324 void
325 Inode::_AddIterator(AttributeIterator *iterator)
326 {
327 	if (fSmallDataLock.Lock() < B_OK)
328 		return;
329 
330 	fIterators.Add(iterator);
331 
332 	fSmallDataLock.Unlock();
333 }
334 
335 
336 void
337 Inode::_RemoveIterator(AttributeIterator *iterator)
338 {
339 	if (fSmallDataLock.Lock() < B_OK)
340 		return;
341 
342 	fIterators.Remove(iterator);
343 
344 	fSmallDataLock.Unlock();
345 }
346 
347 
348 /*!
349 	Tries to free up "bytes" space in the small_data section by moving
350 	attributes to real files. Used for system attributes like the name.
351 	You need to hold the fSmallDataLock when you call this method
352 */
353 status_t
354 Inode::_MakeSpaceForSmallData(Transaction &transaction, bfs_inode *node,
355 	const char *name, int32 bytes)
356 {
357 	ASSERT(fSmallDataLock.IsLocked());
358 
359 	while (bytes > 0) {
360 		small_data *item = node->SmallDataStart(), *max = NULL;
361 		int32 index = 0, maxIndex = 0;
362 		for (; !item->IsLast(node); item = item->Next(), index++) {
363 			// should not remove those
364 			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
365 				continue;
366 
367 			if (max == NULL || max->Size() < item->Size()) {
368 				maxIndex = index;
369 				max = item;
370 			}
371 
372 			// remove the first one large enough to free the needed amount of bytes
373 			if (bytes < (int32)item->Size())
374 				break;
375 		}
376 
377 		if (item->IsLast(node) || (int32)item->Size() < bytes)
378 			return B_ERROR;
379 
380 		bytes -= max->Size();
381 
382 		// Move the attribute to a real attribute file
383 		// Luckily, this doesn't cause any index updates
384 
385 		Inode *attribute;
386 		status_t status = CreateAttribute(transaction, item->Name(), item->Type(),
387 			&attribute);
388 		if (status < B_OK)
389 			RETURN_ERROR(status);
390 
391 		size_t length = item->DataSize();
392 		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
393 
394 		ReleaseAttribute(attribute);
395 
396 		if (status < B_OK) {
397 			Vnode vnode(fVolume, Attributes());
398 			Inode *attributes;
399 			if (vnode.Get(&attributes) < B_OK
400 				|| attributes->Remove(transaction, name) < B_OK) {
401 				FATAL(("Could not remove newly created attribute!\n"));
402 			}
403 
404 			RETURN_ERROR(status);
405 		}
406 
407 		_RemoveSmallData(node, max, maxIndex);
408 	}
409 	return B_OK;
410 }
411 
412 
413 /*!
414 	Private function which removes the given attribute from the small_data
415 	section.
416 	You need to hold the fSmallDataLock when you call this method
417 */
418 status_t
419 Inode::_RemoveSmallData(bfs_inode *node, small_data *item, int32 index)
420 {
421 	ASSERT(fSmallDataLock.IsLocked());
422 
423 	small_data *next = item->Next();
424 	if (!next->IsLast(node)) {
425 		// find the last attribute
426 		small_data *last = next;
427 		while (!last->IsLast(node))
428 			last = last->Next();
429 
430 		int32 size = (uint8 *)last - (uint8 *)next;
431 		if (size < 0 || size > (uint8 *)node + fVolume->BlockSize() - (uint8 *)next)
432 			return B_BAD_DATA;
433 
434 		memmove(item, next, size);
435 
436 		// Move the "last" one to its new location and
437 		// correctly terminate the small_data section
438 		last = (small_data *)((uint8 *)last - ((uint8 *)next - (uint8 *)item));
439 		memset(last, 0, (uint8 *)node + fVolume->BlockSize() - (uint8 *)last);
440 	} else
441 		memset(item, 0, item->Size());
442 
443 	// update all current iterators
444 	AttributeIterator *iterator = NULL;
445 	while ((iterator = fIterators.Next(iterator)) != NULL) {
446 		iterator->Update(index, -1);
447 	}
448 
449 	return B_OK;
450 }
451 
452 
453 //!	Removes the given attribute from the small_data section.
454 status_t
455 Inode::_RemoveSmallData(Transaction &transaction, NodeGetter &nodeGetter,
456 	const char *name)
457 {
458 	if (name == NULL)
459 		return B_BAD_VALUE;
460 
461 	bfs_inode *node = nodeGetter.WritableNode();
462 	SimpleLocker locker(fSmallDataLock);
463 
464 	// search for the small_data item
465 
466 	small_data *item = node->SmallDataStart();
467 	int32 index = 0;
468 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
469 		item = item->Next();
470 		index++;
471 	}
472 
473 	if (item->IsLast(node))
474 		return B_ENTRY_NOT_FOUND;
475 
476 	nodeGetter.MakeWritable(transaction);
477 	status_t status = _RemoveSmallData(node, item, index);
478 	if (status == B_OK)
479 		status = WriteBack(transaction);
480 
481 	return status;
482 }
483 
484 
485 /*!
486 	Try to place the given attribute in the small_data section - if the
487 	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
488 	In that case, the attribute should be written to a real attribute file;
489 	it's the caller's responsibility to remove any existing attributes in the small
490 	data section if that's the case.
491 
492 	Note that you need to write back the inode yourself after having called that
493 	method - it's a bad API decision that it needs a transaction but enforces you
494 	to write back the inode all by yourself, but it's just more efficient in most
495 	cases...
496 */
497 status_t
498 Inode::_AddSmallData(Transaction &transaction, NodeGetter &nodeGetter,
499 	const char *name, uint32 type, const uint8 *data, size_t length, bool force)
500 {
501 	bfs_inode *node = nodeGetter.WritableNode();
502 
503 	if (node == NULL || name == NULL || data == NULL)
504 		return B_BAD_VALUE;
505 
506 	// reject any requests that can't fit into the small_data section
507 	uint32 nameLength = strlen(name);
508 	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + length + 1;
509 	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
510 		return B_DEVICE_FULL;
511 
512 	nodeGetter.MakeWritable(transaction);
513 	SimpleLocker locker(fSmallDataLock);
514 
515 	// Find the last item or one with the same name we have to add
516 	small_data *item = node->SmallDataStart();
517 	int32 index = 0;
518 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
519 		item = item->Next();
520 		index++;
521 	}
522 
523 	// is the attribute already in the small_data section?
524 	// then just replace the data part of that one
525 	if (!item->IsLast(node)) {
526 		// find last attribute
527 		small_data *last = item;
528 		while (!last->IsLast(node))
529 			last = last->Next();
530 
531 		// try to change the attributes value
532 		if (item->data_size > length
533 			|| force
534 			|| ((uint8 *)last + length - item->DataSize()) <= ((uint8 *)node + fVolume->InodeSize())) {
535 			// make room for the new attribute if needed (and we are forced to do so)
536 			if (force
537 				&& ((uint8 *)last + length - item->DataSize()) > ((uint8 *)node + fVolume->InodeSize())) {
538 				// We also take the free space at the end of the small_data section
539 				// into account, and request only what's really needed
540 				uint32 needed = length - item->DataSize() -
541 						(uint32)((uint8 *)node + fVolume->InodeSize() - (uint8 *)last);
542 
543 				if (_MakeSpaceForSmallData(transaction, node, name, needed) < B_OK)
544 					return B_ERROR;
545 
546 				// reset our pointers
547 				item = node->SmallDataStart();
548 				index = 0;
549 				while (!item->IsLast(node) && strcmp(item->Name(), name)) {
550 					item = item->Next();
551 					index++;
552 				}
553 
554 				last = item;
555 				while (!last->IsLast(node))
556 					last = last->Next();
557 			}
558 
559 			// Normally, we can just overwrite the attribute data as the size
560 			// is specified by the type and does not change that often
561 			if (length != item->DataSize()) {
562 				// move the attributes after the current one
563 				small_data *next = item->Next();
564 				if (!next->IsLast(node))
565 					memmove((uint8 *)item + spaceNeeded, next, (uint8 *)last - (uint8 *)next);
566 
567 				// Move the "last" one to its new location and
568 				// correctly terminate the small_data section
569 				last = (small_data *)((uint8 *)last - ((uint8 *)next - ((uint8 *)item + spaceNeeded)));
570 				if ((uint8 *)last < (uint8 *)node + fVolume->BlockSize())
571 					memset(last, 0, (uint8 *)node + fVolume->BlockSize() - (uint8 *)last);
572 
573 				item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
574 			}
575 
576 			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
577 			memcpy(item->Data(), data, length);
578 			item->Data()[length] = '\0';
579 
580 			return B_OK;
581 		}
582 
583 		return B_DEVICE_FULL;
584 	}
585 
586 	// try to add the new attribute!
587 
588 	if ((uint8 *)item + spaceNeeded > (uint8 *)node + fVolume->InodeSize()) {
589 		// there is not enough space for it!
590 		if (!force)
591 			return B_DEVICE_FULL;
592 
593 		// make room for the new attribute
594 		if (_MakeSpaceForSmallData(transaction, node, name, spaceNeeded) < B_OK)
595 			return B_ERROR;
596 
597 		// get new last item!
598 		item = node->SmallDataStart();
599 		index = 0;
600 		while (!item->IsLast(node)) {
601 			item = item->Next();
602 			index++;
603 		}
604 	}
605 
606 	memset(item, 0, spaceNeeded);
607 	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
608 	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
609 	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
610 	strcpy(item->Name(), name);
611 	memcpy(item->Data(), data, length);
612 
613 	// correctly terminate the small_data section
614 	item = item->Next();
615 	if (!item->IsLast(node))
616 		memset(item, 0, (uint8 *)node + fVolume->InodeSize() - (uint8 *)item);
617 
618 	// update all current iterators
619 	AttributeIterator *iterator = NULL;
620 	while ((iterator = fIterators.Next(iterator)) != NULL) {
621 		iterator->Update(index, 1);
622 	}
623 
624 	return B_OK;
625 }
626 
627 
628 /*!
629 	Iterates through the small_data section of an inode.
630 	To start at the beginning of this section, you let smallData
631 	point to NULL, like:
632 		small_data *data = NULL;
633 		while (inode->GetNextSmallData(&data) { ... }
634 
635 	This function is reentrant and doesn't allocate any memory;
636 	you can safely stop calling it at any point (you don't need
637 	to iterate through the whole list).
638 	You need to hold the fSmallDataLock when you call this method
639 */
640 status_t
641 Inode::_GetNextSmallData(bfs_inode *node, small_data **_smallData) const
642 {
643 	if (node == NULL)
644 		RETURN_ERROR(B_BAD_VALUE);
645 
646 	ASSERT(fSmallDataLock.IsLocked());
647 
648 	small_data *data = *_smallData;
649 
650 	// begin from the start?
651 	if (data == NULL)
652 		data = node->SmallDataStart();
653 	else
654 		data = data->Next();
655 
656 	// is already last item?
657 	if (data->IsLast(node))
658 		return B_ENTRY_NOT_FOUND;
659 
660 	*_smallData = data;
661 
662 	return B_OK;
663 }
664 
665 
666 /*!
667 	Finds the attribute "name" in the small data section, and
668 	returns a pointer to it (or NULL if it doesn't exist).
669 	You need to hold the fSmallDataLock when you call this method
670 */
671 small_data *
672 Inode::FindSmallData(const bfs_inode *node, const char *name) const
673 {
674 	ASSERT(fSmallDataLock.IsLocked());
675 
676 	small_data *smallData = NULL;
677 	while (_GetNextSmallData(const_cast<bfs_inode *>(node), &smallData) == B_OK) {
678 		if (!strcmp(smallData->Name(), name))
679 			return smallData;
680 	}
681 	return NULL;
682 }
683 
684 
685 /*!
686 	Returns a pointer to the node's name if present in the small data
687 	section, NULL otherwise.
688 	You need to hold the fSmallDataLock when you call this method
689 */
690 const char *
691 Inode::Name(const bfs_inode *node) const
692 {
693 	ASSERT(fSmallDataLock.IsLocked());
694 
695 	small_data *smallData = NULL;
696 	while (_GetNextSmallData((bfs_inode *)node, &smallData) == B_OK) {
697 		if (*smallData->Name() == FILE_NAME_NAME
698 			&& smallData->NameSize() == FILE_NAME_NAME_LENGTH)
699 			return (const char *)smallData->Data();
700 	}
701 	return NULL;
702 }
703 
704 
705 /*!
706 	Copies the node's name into the provided buffer.
707 	The buffer should be B_FILE_NAME_LENGTH bytes large.
708 */
709 status_t
710 Inode::GetName(char *buffer, size_t size) const
711 {
712 	NodeGetter node(fVolume, this);
713 	SimpleLocker locker(fSmallDataLock);
714 
715 	const char *name = Name(node.Node());
716 	if (name == NULL)
717 		return B_ENTRY_NOT_FOUND;
718 
719 	strlcpy(buffer, name, size);
720 	return B_OK;
721 }
722 
723 
724 /*!
725 	Changes or set the name of a file: in the inode small_data section only, it
726 	doesn't change it in the parent directory's b+tree.
727 	Note that you need to write back the inode yourself after having called
728 	that method. It suffers from the same API decision as AddSmallData() does
729 	(and for the same reason).
730 */
731 status_t
732 Inode::SetName(Transaction &transaction, const char *name)
733 {
734 	if (name == NULL || *name == '\0')
735 		return B_BAD_VALUE;
736 
737 	NodeGetter node(fVolume, transaction, this);
738 	const char nameTag[2] = {FILE_NAME_NAME, 0};
739 
740 	return _AddSmallData(transaction, node, nameTag, FILE_NAME_TYPE,
741 		(uint8 *)name, strlen(name), true);
742 }
743 
744 
745 status_t
746 Inode::_RemoveAttribute(Transaction &transaction, const char *name,
747 	bool hasIndex, Index *index)
748 {
749 	// remove the attribute file if it exists
750 	Vnode vnode(fVolume, Attributes());
751 	Inode *attributes;
752 	status_t status = vnode.Get(&attributes);
753 	if (status < B_OK)
754 		return status;
755 
756 	// update index
757 	if (index != NULL) {
758 		Inode *attribute;
759 		if ((hasIndex || fVolume->CheckForLiveQuery(name))
760 			&& GetAttribute(name, &attribute) == B_OK) {
761 			uint8 data[BPLUSTREE_MAX_KEY_LENGTH];
762 			size_t length = BPLUSTREE_MAX_KEY_LENGTH;
763 			if (attribute->ReadAt(0, data, &length) == B_OK) {
764 				index->Update(transaction, name, attribute->Type(), data,
765 					length, NULL, 0, this);
766 			}
767 
768 			ReleaseAttribute(attribute);
769 		}
770 	}
771 
772 	if ((status = attributes->Remove(transaction, name)) < B_OK)
773 		return status;
774 
775 	if (attributes->IsEmpty()) {
776 		// remove attribute directory (don't fail if that can't be done)
777 		if (remove_vnode(fVolume->ID(), attributes->ID()) == B_OK) {
778 			// update the inode, so that no one will ever doubt it's deleted :-)
779 			attributes->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
780 			if (attributes->WriteBack(transaction) == B_OK) {
781 				Attributes().SetTo(0, 0, 0);
782 				WriteBack(transaction);
783 			} else
784 				unremove_vnode(fVolume->ID(), attributes->ID());
785 		}
786 	}
787 
788 	return status;
789 }
790 
791 
792 /*!
793 	Reads data from the specified attribute.
794 	This is a high-level attribute function that understands attributes
795 	in the small_data section as well as real attribute files.
796 */
797 status_t
798 Inode::ReadAttribute(const char *name, int32 type, off_t pos, uint8 *buffer,
799 	size_t *_length)
800 {
801 	if (pos < 0)
802 		pos = 0;
803 
804 	// search in the small_data section (which has to be locked first)
805 	{
806 		NodeGetter node(fVolume, this);
807 		SimpleLocker locker(fSmallDataLock);
808 
809 		small_data *smallData = FindSmallData(node.Node(), name);
810 		if (smallData != NULL) {
811 			size_t length = *_length;
812 			if (pos >= smallData->data_size) {
813 				*_length = 0;
814 				return B_OK;
815 			}
816 			if (length + pos > smallData->DataSize())
817 				length = smallData->DataSize() - pos;
818 
819 			memcpy(buffer, smallData->Data() + pos, length);
820 			*_length = length;
821 			return B_OK;
822 		}
823 	}
824 
825 	// search in the attribute directory
826 	Inode *attribute;
827 	status_t status = GetAttribute(name, &attribute);
828 	if (status == B_OK) {
829 		if (attribute->Lock().Lock() == B_OK) {
830 			status = attribute->ReadAt(pos, (uint8 *)buffer, _length);
831 			attribute->Lock().Unlock();
832 		} else
833 			status = B_ERROR;
834 
835 		ReleaseAttribute(attribute);
836 	}
837 
838 	RETURN_ERROR(status);
839 }
840 
841 
842 /*!
843 	Writes data to the specified attribute.
844 	This is a high-level attribute function that understands attributes
845 	in the small_data section as well as real attribute files.
846 */
847 status_t
848 Inode::WriteAttribute(Transaction &transaction, const char *name, int32 type,
849 	off_t pos, const uint8 *buffer, size_t *_length)
850 {
851 	// needed to maintain the index
852 	uint8 oldBuffer[BPLUSTREE_MAX_KEY_LENGTH], *oldData = NULL;
853 	size_t oldLength = 0;
854 
855 	// TODO: we actually depend on that the contents of "buffer" are constant.
856 	// If they get changed during the write (hey, user programs), we may mess
857 	// up our index trees!
858 
859 	Index index(fVolume);
860 	index.SetTo(name);
861 
862 	Inode *attribute = NULL;
863 	status_t status = B_OK;
864 
865 	if (GetAttribute(name, &attribute) < B_OK) {
866 		// save the old attribute data
867 		NodeGetter node(fVolume, transaction, this);
868 		fSmallDataLock.Lock();
869 
870 		small_data *smallData = FindSmallData(node.Node(), name);
871 		if (smallData != NULL) {
872 			oldLength = smallData->DataSize();
873 			if (oldLength > 0) {
874 				if (oldLength > BPLUSTREE_MAX_KEY_LENGTH)
875 					oldLength = BPLUSTREE_MAX_KEY_LENGTH;
876 				memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
877 			}
878 		}
879 		fSmallDataLock.Unlock();
880 
881 		// if the attribute doesn't exist yet (as a file), try to put it in the
882 		// small_data section first - if that fails (due to insufficent space),
883 		// create a real attribute file
884 		status = _AddSmallData(transaction, node, name, type, buffer, *_length);
885 		if (status == B_DEVICE_FULL) {
886 			if (smallData != NULL) {
887 				// remove the old attribute from the small data section - there
888 				// is no space left for the new data
889 				status = _RemoveSmallData(transaction, node, name);
890 			} else
891 				status = B_OK;
892 
893 			if (status == B_OK)
894 				status = CreateAttribute(transaction, name, type, &attribute);
895 			if (status < B_OK)
896 				RETURN_ERROR(status);
897 		} else if (status == B_OK)
898 			status = WriteBack(transaction);
899 	}
900 
901 	if (attribute != NULL) {
902 		if (attribute->Lock().LockWrite() == B_OK) {
903 			// save the old attribute data (if this fails, oldLength will reflect it)
904 			if (fVolume->CheckForLiveQuery(name) && attribute->Size() > 0) {
905 				oldLength = BPLUSTREE_MAX_KEY_LENGTH;
906 				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
907 					oldData = oldBuffer;
908 			}
909 
910 			// check if the data fits into the small_data section again
911 			NodeGetter node(fVolume, transaction, this);
912 			status = _AddSmallData(transaction, node, name, type, buffer, *_length);
913 			if (status == B_OK) {
914 				// it does - remove its file
915 				status = _RemoveAttribute(transaction, name, false, NULL);
916 			} else {
917 				status = attribute->WriteAt(transaction, pos, buffer, _length);
918 				if (status == B_OK) {
919 					// The attribute type might have been changed - we need to adopt
920 					// the new one - the attribute's inode will be written back on close
921 					attribute->Node().type = HOST_ENDIAN_TO_BFS_INT32(type);
922 				}
923 
924 				attribute->Lock().UnlockWrite();
925 			}
926 		} else
927 			status = B_ERROR;
928 
929 		ReleaseAttribute(attribute);
930 	}
931 
932 	// TODO: find a better way than this "pos" thing (the begin of the old key
933 	//	must be copied to the start of the new one for a comparison)
934 	if (status == B_OK && pos == 0) {
935 		// index only the first BPLUSTREE_MAX_KEY_LENGTH bytes
936 		uint16 length = *_length;
937 		if (length > BPLUSTREE_MAX_KEY_LENGTH)
938 			length = BPLUSTREE_MAX_KEY_LENGTH;
939 
940 		// Update index. Note, Index::Update() may be called even if initializing
941 		// the index failed - it will just update the live queries in this case
942 		if (pos < length || pos < oldLength) {
943 			index.Update(transaction, name, type, oldData, oldLength, buffer,
944 				length, this);
945 		}
946 	}
947 	return status;
948 }
949 
950 
951 /*!
952 	Removes the specified attribute from the inode.
953 	This is a high-level attribute function that understands attributes
954 	in the small_data section as well as real attribute files.
955 */
956 status_t
957 Inode::RemoveAttribute(Transaction &transaction, const char *name)
958 {
959 	Index index(fVolume);
960 	bool hasIndex = index.SetTo(name) == B_OK;
961 	NodeGetter node(fVolume, this);
962 
963 	// update index for attributes in the small_data section
964 	{
965 		fSmallDataLock.Lock();
966 
967 		small_data *smallData = FindSmallData(node.Node(), name);
968 		if (smallData != NULL) {
969 			uint32 length = smallData->DataSize();
970 			if (length > BPLUSTREE_MAX_KEY_LENGTH)
971 				length = BPLUSTREE_MAX_KEY_LENGTH;
972 			index.Update(transaction, name, smallData->Type(), smallData->Data(),
973 				length, NULL, 0, this);
974 		}
975 		fSmallDataLock.Unlock();
976 	}
977 
978 	status_t status = _RemoveSmallData(transaction, node, name);
979 	if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
980 		// remove the attribute file if it exists
981 		status = _RemoveAttribute(transaction, name, hasIndex, &index);
982 	}
983 	return status;
984 }
985 
986 
987 status_t
988 Inode::GetAttribute(const char *name, Inode **_attribute)
989 {
990 	// does this inode even have attributes?
991 	if (Attributes().IsZero())
992 		return B_ENTRY_NOT_FOUND;
993 
994 	Vnode vnode(fVolume, Attributes());
995 	Inode *attributes;
996 	if (vnode.Get(&attributes) < B_OK) {
997 		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n", name));
998 		return B_ERROR;
999 	}
1000 
1001 	BPlusTree *tree;
1002 	status_t status = attributes->GetTree(&tree);
1003 	if (status == B_OK) {
1004 		vnode_id id;
1005 		status = tree->Find((uint8 *)name, (uint16)strlen(name), &id);
1006 		if (status == B_OK) {
1007 			Vnode vnode(fVolume, id);
1008 			Inode *inode;
1009 			// Check if the attribute is really an attribute
1010 			if (vnode.Get(&inode) < B_OK || !inode->IsAttribute())
1011 				return B_ERROR;
1012 
1013 			*_attribute = inode;
1014 			vnode.Keep();
1015 			return B_OK;
1016 		}
1017 	}
1018 	return status;
1019 }
1020 
1021 
1022 void
1023 Inode::ReleaseAttribute(Inode *attribute)
1024 {
1025 	if (attribute == NULL)
1026 		return;
1027 
1028 	put_vnode(fVolume->ID(), attribute->ID());
1029 }
1030 
1031 
1032 status_t
1033 Inode::CreateAttribute(Transaction &transaction, const char *name, uint32 type,
1034 	Inode **attribute)
1035 {
1036 	// do we need to create the attribute directory first?
1037 	if (Attributes().IsZero()) {
1038 		status_t status = Inode::Create(transaction, this, NULL,
1039 			S_ATTR_DIR | S_DIRECTORY | 0666, 0, 0, NULL);
1040 		if (status < B_OK)
1041 			RETURN_ERROR(status);
1042 	}
1043 	Vnode vnode(fVolume, Attributes());
1044 	Inode *attributes;
1045 	if (vnode.Get(&attributes) < B_OK)
1046 		return B_ERROR;
1047 
1048 	// Inode::Create() locks the inode for us
1049 	return Inode::Create(transaction, attributes, name,
1050 		S_ATTR | S_FILE | 0666, 0, type, NULL, attribute);
1051 }
1052 
1053 
1054 //	#pragma mark - directory tree
1055 
1056 
1057 /*!
1058 	Gives the caller direct access to the b+tree for a given directory.
1059 	The tree is no longer created on demand, but when the inode is first
1060 	created. That will report any potential errors upfront, saves locking,
1061 	and should work as good (though a bit slower).
1062 */
1063 status_t
1064 Inode::GetTree(BPlusTree **tree)
1065 {
1066 	if (fTree) {
1067 		*tree = fTree;
1068 		return B_OK;
1069 	}
1070 
1071 	RETURN_ERROR(B_BAD_VALUE);
1072 }
1073 
1074 
1075 bool
1076 Inode::IsEmpty()
1077 {
1078 	BPlusTree *tree;
1079 	status_t status = GetTree(&tree);
1080 	if (status < B_OK)
1081 		return status;
1082 
1083 	TreeIterator iterator(tree);
1084 
1085 	// index and attribute directories are really empty when they are
1086 	// empty - directories for standard files always contain ".", and
1087 	// "..", so we need to ignore those two
1088 
1089 	uint32 count = 0;
1090 	char name[BPLUSTREE_MAX_KEY_LENGTH];
1091 	uint16 length;
1092 	vnode_id id;
1093 	while (iterator.GetNextEntry(name, &length, B_FILE_NAME_LENGTH,
1094 			&id) == B_OK) {
1095 		if (Mode() & (S_ATTR_DIR | S_INDEX_DIR))
1096 			return false;
1097 
1098 		if (++count > 2 || strcmp(".", name) && strcmp("..", name))
1099 			return false;
1100 	}
1101 	return true;
1102 }
1103 
1104 
1105 //	#pragma mark - data stream
1106 
1107 
1108 /*!
1109 	Finds the block_run where "pos" is located in the data_stream of
1110 	the inode.
1111 	If successful, "offset" will then be set to the file offset
1112 	of the block_run returned; so "pos - offset" is for the block_run
1113 	what "pos" is for the whole stream.
1114 	The caller has to make sure that "pos" is inside the stream.
1115 */
1116 status_t
1117 Inode::FindBlockRun(off_t pos, block_run &run, off_t &offset)
1118 {
1119 	data_stream *data = &Node().data;
1120 
1121 	// find matching block run
1122 
1123 	if (data->MaxDirectRange() > 0 && pos >= data->MaxDirectRange()) {
1124 		if (data->MaxDoubleIndirectRange() > 0 && pos >= data->MaxIndirectRange()) {
1125 			// access to double indirect blocks
1126 
1127 			CachedBlock cached(fVolume);
1128 
1129 			off_t start = pos - data->MaxIndirectRange();
1130 			int32 indirectSize = (1L << (INDIRECT_BLOCKS_SHIFT
1131 					+ cached.BlockShift()))
1132 				* (fVolume->BlockSize() / sizeof(block_run));
1133 			int32 directSize = NUM_ARRAY_BLOCKS << cached.BlockShift();
1134 			int32 index = start / indirectSize;
1135 			int32 runsPerBlock = cached.BlockSize() / sizeof(block_run);
1136 
1137 			block_run *indirect = (block_run *)cached.SetTo(
1138 					fVolume->ToBlock(data->double_indirect) + index / runsPerBlock);
1139 			if (indirect == NULL)
1140 				RETURN_ERROR(B_ERROR);
1141 
1142 			//printf("\tstart = %Ld, indirectSize = %ld, directSize = %ld, index = %ld\n",start,indirectSize,directSize,index);
1143 			//printf("\tlook for indirect block at %ld,%d\n",indirect[index].allocation_group,indirect[index].start);
1144 
1145 			int32 current = (start % indirectSize) / directSize;
1146 
1147 			indirect = (block_run *)cached.SetTo(
1148 				fVolume->ToBlock(indirect[index % runsPerBlock])
1149 				+ current / runsPerBlock);
1150 			if (indirect == NULL)
1151 				RETURN_ERROR(B_ERROR);
1152 
1153 			run = indirect[current % runsPerBlock];
1154 			offset = data->MaxIndirectRange() + (index * indirectSize)
1155 				+ (current * directSize);
1156 			//printf("\tfCurrent = %ld, fRunFileOffset = %Ld, fRunBlockEnd = %Ld, fRun = %ld,%d\n",fCurrent,fRunFileOffset,fRunBlockEnd,fRun.allocation_group,fRun.start);
1157 		} else {
1158 			// access to indirect blocks
1159 
1160 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1161 			off_t runBlockEnd = data->MaxDirectRange();
1162 
1163 			CachedBlock cached(fVolume);
1164 			off_t block = fVolume->ToBlock(data->indirect);
1165 
1166 			for (int32 i = 0; i < data->indirect.Length(); i++) {
1167 				block_run *indirect = (block_run *)cached.SetTo(block + i);
1168 				if (indirect == NULL)
1169 					RETURN_ERROR(B_IO_ERROR);
1170 
1171 				int32 current = -1;
1172 				while (++current < runsPerBlock) {
1173 					if (indirect[current].IsZero())
1174 						break;
1175 
1176 					runBlockEnd += indirect[current].Length()
1177 						<< cached.BlockShift();
1178 					if (runBlockEnd > pos) {
1179 						run = indirect[current];
1180 						offset = runBlockEnd - (run.Length()
1181 							<< cached.BlockShift());
1182 						//printf("reading from indirect block: %ld,%d\n",fRun.allocation_group,fRun.start);
1183 						//printf("### indirect-run[%ld] = (%ld,%d,%d), offset = %Ld\n",fCurrent,fRun.allocation_group,fRun.start,fRun.Length(),fRunFileOffset);
1184 						return fVolume->ValidateBlockRun(run);
1185 					}
1186 				}
1187 			}
1188 			RETURN_ERROR(B_ERROR);
1189 		}
1190 	} else {
1191 		// access from direct blocks
1192 
1193 		off_t runBlockEnd = 0LL;
1194 		int32 current = -1;
1195 
1196 		while (++current < NUM_DIRECT_BLOCKS) {
1197 			if (data->direct[current].IsZero())
1198 				break;
1199 
1200 			runBlockEnd += data->direct[current].Length()
1201 				<< fVolume->BlockShift();
1202 			if (runBlockEnd > pos) {
1203 				run = data->direct[current];
1204 				offset = runBlockEnd - (run.Length() << fVolume->BlockShift());
1205 				//printf("### run[%ld] = (%ld,%d,%d), offset = %Ld\n",fCurrent,fRun.allocation_group,fRun.start,fRun.Length(),fRunFileOffset);
1206 				return fVolume->ValidateBlockRun(run);
1207 			}
1208 		}
1209 
1210 		//PRINT(("FindBlockRun() failed in direct range: size = %Ld, pos = %Ld\n",data->size,pos));
1211 		return B_ENTRY_NOT_FOUND;
1212 	}
1213 	return fVolume->ValidateBlockRun(run);
1214 }
1215 
1216 
1217 status_t
1218 Inode::ReadAt(off_t pos, uint8 *buffer, size_t *_length)
1219 {
1220 #if 0
1221 	// call the right ReadAt() method, depending on the inode flags
1222 
1223 	if (Flags() & INODE_LOGGED)
1224 		return ((Stream<Access::Logged> *)this)->ReadAt(pos, buffer, _length);
1225 
1226 	return ((Stream<Access::Cached> *)this)->ReadAt(pos, buffer, _length);
1227 #endif
1228 	size_t length = *_length;
1229 
1230 	// set/check boundaries for pos/length
1231 	if (pos < 0)
1232 		return B_BAD_VALUE;
1233 
1234 	if (pos >= Size() || length == 0) {
1235 		*_length = 0;
1236 		return B_NO_ERROR;
1237 	}
1238 
1239 	return file_cache_read(FileCache(), pos, buffer, _length);
1240 }
1241 
1242 
1243 status_t
1244 Inode::WriteAt(Transaction &transaction, off_t pos, const uint8 *buffer,
1245 	size_t *_length)
1246 {
1247 	// update the last modification time in memory, it will be written
1248 	// back to the inode, and the index when the file is closed
1249 	// ToDo: should update the internal last modified time only at this point!
1250 	Node().last_modified_time = HOST_ENDIAN_TO_BFS_INT64((bigtime_t)time(NULL)
1251 		<< INODE_TIME_SHIFT);
1252 
1253 	// TODO: support INODE_LOGGED!
1254 #if 0
1255 	if (Flags() & INODE_LOGGED)
1256 		return ((Stream<Access::Logged> *)this)->WriteAt(transaction, pos, buffer, _length);
1257 
1258 	return ((Stream<Access::Cached> *)this)->WriteAt(transaction, pos, buffer, _length);
1259 #endif
1260 	size_t length = *_length;
1261 
1262 	// set/check boundaries for pos/length
1263 	if (pos < 0)
1264 		return B_BAD_VALUE;
1265 
1266 	if (pos + length > Size()) {
1267 		off_t oldSize = Size();
1268 
1269 		// the transaction doesn't have to be started already
1270 		// ToDo: what's that INODE_NO_TRANSACTION flag good for again?
1271 		if ((Flags() & INODE_NO_TRANSACTION) == 0
1272 			&& !transaction.IsStarted())
1273 			transaction.Start(fVolume, BlockNumber());
1274 
1275 		// let's grow the data stream to the size needed
1276 		status_t status = SetFileSize(transaction, pos + length);
1277 		if (status < B_OK) {
1278 			*_length = 0;
1279 			RETURN_ERROR(status);
1280 		}
1281 		// If the position of the write was beyond the file size, we
1282 		// have to fill the gap between that position and the old file
1283 		// size with zeros.
1284 		FillGapWithZeros(oldSize, pos);
1285 
1286 		// we need to write back the inode here because it has to
1287 		// go into this transaction (we cannot wait until the file
1288 		// is closed)
1289 		status = WriteBack(transaction);
1290 		if (status < B_OK)
1291 			return status;
1292 	}
1293 
1294 	// If we don't want to write anything, we can now return (we may
1295 	// just have changed the file size using the position parameter)
1296 	if (length == 0)
1297 		return B_OK;
1298 
1299 	return file_cache_write(FileCache(), pos, buffer, _length);
1300 }
1301 
1302 
1303 /*!
1304 	Fills the gap between the old file size and the new file size
1305 	with zeros.
1306 	It's more or less a copy of Inode::WriteAt() but it can handle
1307 	length differences of more than just 4 GB, and it never uses
1308 	the log, even if the INODE_LOGGED flag is set.
1309 */
1310 status_t
1311 Inode::FillGapWithZeros(off_t pos, off_t newSize)
1312 {
1313 	// ToDo: we currently do anything here, same as original BFS!
1314 	//if (pos >= newSize)
1315 	return B_OK;
1316 #if 0
1317 	block_run run;
1318 	off_t offset;
1319 	if (FindBlockRun(pos, run, offset) < B_OK)
1320 		RETURN_ERROR(B_BAD_VALUE);
1321 
1322 	off_t length = newSize - pos;
1323 	uint32 bytesWritten = 0;
1324 	uint32 blockSize = fVolume->BlockSize();
1325 	uint32 blockShift = fVolume->BlockShift();
1326 	uint8 *block;
1327 
1328 	// the first block_run we write could not be aligned to the block_size boundary
1329 	// (write partial block at the beginning)
1330 
1331 	// pos % block_size == (pos - offset) % block_size, offset % block_size == 0
1332 	if (pos % blockSize != 0) {
1333 		run.start += (pos - offset) / blockSize;
1334 		run.length -= (pos - offset) / blockSize;
1335 
1336 		CachedBlock cached(fVolume,run);
1337 		if ((block = cached.Block()) == NULL)
1338 			RETURN_ERROR(B_BAD_VALUE);
1339 
1340 		bytesWritten = blockSize - (pos % blockSize);
1341 		if (length < bytesWritten)
1342 			bytesWritten = length;
1343 
1344 		memset(block + (pos % blockSize), 0, bytesWritten);
1345 		if (fVolume->WriteBlocks(cached.BlockNumber(), block, 1) < B_OK)
1346 			RETURN_ERROR(B_IO_ERROR);
1347 
1348 		pos += bytesWritten;
1349 
1350 		length -= bytesWritten;
1351 		if (length == 0)
1352 			return B_OK;
1353 
1354 		if (FindBlockRun(pos, run, offset) < B_OK)
1355 			RETURN_ERROR(B_BAD_VALUE);
1356 	}
1357 
1358 	while (length > 0) {
1359 		// offset is the offset to the current pos in the block_run
1360 		run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start() + ((pos - offset) >> blockShift));
1361 		run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - ((pos - offset) >> blockShift));
1362 
1363 		CachedBlock cached(fVolume);
1364 		off_t blockNumber = fVolume->ToBlock(run);
1365 		for (int32 i = 0; i < run.Length(); i++) {
1366 			if ((block = cached.SetTo(blockNumber + i, true)) == NULL)
1367 				RETURN_ERROR(B_IO_ERROR);
1368 
1369 			if (fVolume->WriteBlocks(cached.BlockNumber(), block, 1) < B_OK)
1370 				RETURN_ERROR(B_IO_ERROR);
1371 		}
1372 
1373 		int32 bytes = run.Length() << blockShift;
1374 		length -= bytes;
1375 		bytesWritten += bytes;
1376 
1377 		// since we don't respect a last partial block, length can be lower
1378 		if (length <= 0)
1379 			break;
1380 
1381 		pos += bytes;
1382 
1383 		if (FindBlockRun(pos, run, offset) < B_OK)
1384 			RETURN_ERROR(B_BAD_VALUE);
1385 	}
1386 	return B_OK;
1387 #endif
1388 }
1389 
1390 
1391 /*!
1392 	Allocates NUM_ARRAY_BLOCKS blocks, and clears their contents. Growing
1393 	the indirect and double indirect range uses this method.
1394 	The allocated block_run is saved in "run"
1395 */
1396 status_t
1397 Inode::_AllocateBlockArray(Transaction &transaction, block_run &run)
1398 {
1399 	if (!run.IsZero())
1400 		return B_BAD_VALUE;
1401 
1402 	status_t status = fVolume->Allocate(transaction, this, NUM_ARRAY_BLOCKS,
1403 		run, NUM_ARRAY_BLOCKS);
1404 	if (status < B_OK)
1405 		return status;
1406 
1407 	// make sure those blocks are empty
1408 	CachedBlock cached(fVolume);
1409 	off_t block = fVolume->ToBlock(run);
1410 
1411 	for (int32 i = 0; i < run.Length(); i++) {
1412 		block_run *runs = (block_run *)cached.SetToWritable(transaction,
1413 			block + i, true);
1414 		if (runs == NULL)
1415 			return B_IO_ERROR;
1416 	}
1417 	return B_OK;
1418 }
1419 
1420 
1421 status_t
1422 Inode::_GrowStream(Transaction &transaction, off_t size)
1423 {
1424 	data_stream *data = &Node().data;
1425 
1426 	// is the data stream already large enough to hold the new size?
1427 	// (can be the case with preallocated blocks)
1428 	if (size < data->MaxDirectRange()
1429 		|| size < data->MaxIndirectRange()
1430 		|| size < data->MaxDoubleIndirectRange()) {
1431 		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1432 		return B_OK;
1433 	}
1434 
1435 	// how many bytes are still needed? (unused ranges are always zero)
1436 	uint16 minimum = 1;
1437 	off_t bytes;
1438 	if (data->Size() < data->MaxDoubleIndirectRange()) {
1439 		bytes = size - data->MaxDoubleIndirectRange();
1440 		// the double indirect range can only handle multiple of NUM_ARRAY_BLOCKS
1441 		minimum = NUM_ARRAY_BLOCKS;
1442 	} else if (data->Size() < data->MaxIndirectRange())
1443 		bytes = size - data->MaxIndirectRange();
1444 	else if (data->Size() < data->MaxDirectRange())
1445 		bytes = size - data->MaxDirectRange();
1446 	else
1447 		bytes = size - data->Size();
1448 
1449 	// do we have enough free blocks on the disk?
1450 	off_t blocksRequested = (bytes + fVolume->BlockSize() - 1)
1451 		>> fVolume->BlockShift();
1452 	if (blocksRequested > fVolume->FreeBlocks())
1453 		return B_DEVICE_FULL;
1454 
1455 	off_t blocksNeeded = blocksRequested;
1456 		// because of preallocations and partial allocations, the number of
1457 		// blocks we need to allocate may be different from the one we request
1458 		// from the block allocator
1459 
1460 	// Should we preallocate some blocks (currently, always 64k)?
1461 	// Attributes, attribute directories, and long symlinks usually won't get
1462 	// that big, and should stay close to the inode - preallocating could be
1463 	// counterproductive.
1464 	// Also, if free disk space is tight, we probably don't want to do this as well.
1465 	if (!IsAttribute() && !IsAttributeDirectory() && !IsSymLink()
1466 		&& blocksRequested < (65536 >> fVolume->BlockShift())
1467 		&& fVolume->FreeBlocks() > 128)
1468 		blocksRequested = 65536 >> fVolume->BlockShift();
1469 
1470 	while (blocksNeeded > 0) {
1471 		// the requested blocks do not need to be returned with a
1472 		// single allocation, so we need to iterate until we have
1473 		// enough blocks allocated
1474 		block_run run;
1475 		status_t status = fVolume->Allocate(transaction, this, blocksRequested,
1476 			run, minimum);
1477 		if (status < B_OK)
1478 			return status;
1479 
1480 		// okay, we have the needed blocks, so just distribute them to the
1481 		// different ranges of the stream (direct, indirect & double indirect)
1482 
1483 		// TODO: if anything goes wrong here, we probably want to free the
1484 		// blocks that couldn't be distributed into the stream!
1485 
1486 		blocksNeeded -= run.Length();
1487 		// don't preallocate if the first allocation was already too small
1488 		blocksRequested = blocksNeeded;
1489 		if (minimum > 1) {
1490 			// make sure that "blocks" is a multiple of minimum
1491 			blocksRequested = (blocksRequested + minimum - 1) & ~(minimum - 1);
1492 		}
1493 
1494 		// Direct block range
1495 
1496 		if (data->Size() <= data->MaxDirectRange()) {
1497 			// let's try to put them into the direct block range
1498 			int32 free = 0;
1499 			for (; free < NUM_DIRECT_BLOCKS; free++)
1500 				if (data->direct[free].IsZero())
1501 					break;
1502 
1503 			if (free < NUM_DIRECT_BLOCKS) {
1504 				// can we merge the last allocated run with the new one?
1505 				int32 last = free - 1;
1506 				if (free > 0 && data->direct[last].MergeableWith(run)) {
1507 					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(
1508 						data->direct[last].Length() + run.Length());
1509 				} else
1510 					data->direct[free] = run;
1511 
1512 				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(
1513 					data->MaxDirectRange() + run.Length() * fVolume->BlockSize());
1514 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1515 					? data->max_direct_range : size);
1516 				continue;
1517 			}
1518 		}
1519 
1520 		// Indirect block range
1521 
1522 		if (data->Size() <= data->MaxIndirectRange()
1523 			|| !data->MaxIndirectRange()) {
1524 			CachedBlock cached(fVolume);
1525 			block_run *runs = NULL;
1526 			uint32 free = 0;
1527 			off_t block;
1528 
1529 			// if there is no indirect block yet, create one
1530 			if (data->indirect.IsZero()) {
1531 				status = _AllocateBlockArray(transaction, data->indirect);
1532 				if (status < B_OK)
1533 					return status;
1534 
1535 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1536 					data->MaxDirectRange());
1537 				// insert the block_run in the first block
1538 				runs = (block_run *)cached.SetTo(data->indirect);
1539 			} else {
1540 				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1541 				block = fVolume->ToBlock(data->indirect);
1542 
1543 				// search first empty entry
1544 				int32 i = 0;
1545 				for (; i < data->indirect.Length(); i++) {
1546 					if ((runs = (block_run *)cached.SetTo(block + i)) == NULL)
1547 						return B_IO_ERROR;
1548 
1549 					for (free = 0; free < numberOfRuns; free++)
1550 						if (runs[free].IsZero())
1551 							break;
1552 
1553 					if (free < numberOfRuns)
1554 						break;
1555 				}
1556 				if (i == data->indirect.Length())
1557 					runs = NULL;
1558 			}
1559 
1560 			if (runs != NULL) {
1561 				// try to insert the run to the last one - note that this
1562 				// doesn't take block borders into account, so it could be
1563 				// further optimized
1564 				cached.MakeWritable(transaction);
1565 
1566 				int32 last = free - 1;
1567 				if (free > 0 && runs[last].MergeableWith(run)) {
1568 					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(
1569 						runs[last].Length() + run.Length());
1570 				} else
1571 					runs[free] = run;
1572 
1573 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1574 					data->MaxIndirectRange()
1575 					+ (run.Length() << fVolume->BlockShift()));
1576 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0
1577 					? data->MaxIndirectRange() : size);
1578 				continue;
1579 			}
1580 		}
1581 
1582 		// Double indirect block range
1583 
1584 		if (data->Size() <= data->MaxDoubleIndirectRange()
1585 			|| !data->max_double_indirect_range) {
1586 			while ((run.Length() % NUM_ARRAY_BLOCKS) != 0) {
1587 				// The number of allocated blocks isn't a multiple of
1588 				// NUM_ARRAY_BLOCKS, so we have to change this. This can happen
1589 				// the first time the stream grows into the double
1590 				// indirect range.
1591 				// First, free the remaining blocks that don't fit into a multiple
1592 				// of NUM_ARRAY_BLOCKS
1593 				int32 rest = run.Length() % NUM_ARRAY_BLOCKS;
1594 				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1595 
1596 				status = fVolume->Free(transaction,
1597 					block_run::Run(run.AllocationGroup(),
1598 					run.Start() + run.Length(), rest));
1599 				if (status < B_OK)
1600 					return status;
1601 
1602 				blocksNeeded += rest;
1603 				blocksRequested = (blocksNeeded + NUM_ARRAY_BLOCKS - 1)
1604 					& ~(NUM_ARRAY_BLOCKS - 1);
1605 				minimum = NUM_ARRAY_BLOCKS;
1606 					// we make sure here that we have at minimum
1607 					// NUM_ARRAY_BLOCKS allocated, so if the allocation
1608 					// succeeds, we don't run into an endless loop
1609 
1610 				// Are there any blocks left in the run? If not, allocate a new one
1611 				if (run.length == 0)
1612 					continue;
1613 			}
1614 
1615 			// if there is no double indirect block yet, create one
1616 			if (data->double_indirect.IsZero()) {
1617 				status = _AllocateBlockArray(transaction, data->double_indirect);
1618 				if (status < B_OK)
1619 					return status;
1620 
1621 				data->max_double_indirect_range = data->max_indirect_range;
1622 			}
1623 
1624 			// calculate the index where to insert the new blocks
1625 
1626 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1627 			int32 indirectSize = ((1L << INDIRECT_BLOCKS_SHIFT) << fVolume->BlockShift())
1628 				* runsPerBlock;
1629 			int32 directSize = NUM_ARRAY_BLOCKS << fVolume->BlockShift();
1630 			int32 runsPerArray = runsPerBlock << ARRAY_BLOCKS_SHIFT;
1631 
1632 			off_t start = data->MaxDoubleIndirectRange() - data->MaxIndirectRange();
1633 			int32 indirectIndex = start / indirectSize;
1634 			int32 index = start / directSize;
1635 
1636 			// distribute the blocks to the array and allocate
1637 			// new array blocks when needed
1638 
1639 			CachedBlock cached(fVolume);
1640 			CachedBlock cachedDirect(fVolume);
1641 			block_run *array = NULL;
1642 			uint32 runLength = run.Length();
1643 
1644 			// TODO: the following code is commented - it could be used to
1645 			// preallocate all needed block arrays to see in advance if the
1646 			// allocation will succeed.
1647 			// I will probably remove it later, because it's no perfect solution
1648 			// either: if the allocation was broken up before (blocksNeeded != 0),
1649 			// it doesn't guarantee anything.
1650 			// And since failing in this case is not that common, it doesn't have
1651 			// to be optimized in that way.
1652 			// Anyway, I wanted to have it in CVS - all those lines, and they will
1653 			// be removed soon :-)
1654 /*
1655 			// allocate new block arrays if needed
1656 
1657 			off_t block = -1;
1658 
1659 			for (int32 i = 0;i < needed;i++) {
1660 				// get the block to insert the run into
1661 				block = fVolume->ToBlock(data->double_indirect) + i + indirectIndex / runsPerBlock;
1662 				if (cached.BlockNumber() != block)
1663 					array = (block_run *)cached.SetTo(block);
1664 
1665 				if (array == NULL)
1666 					return B_ERROR;
1667 
1668 				status = AllocateBlockArray(transaction, array[i + indirectIndex % runsPerBlock]);
1669 				if (status < B_OK)
1670 					return status;
1671 			}
1672 */
1673 
1674 			while (run.length != 0) {
1675 				// get the indirect array block
1676 				if (array == NULL) {
1677 					array = (block_run *)cached.SetTo(fVolume->ToBlock(
1678 						data->double_indirect) + indirectIndex / runsPerBlock);
1679 					if (array == NULL)
1680 						return B_IO_ERROR;
1681 				}
1682 
1683 				do {
1684 					// do we need a new array block?
1685 					if (array[indirectIndex % runsPerBlock].IsZero()) {
1686 						status = _AllocateBlockArray(transaction,
1687 							array[indirectIndex % runsPerBlock]);
1688 						if (status < B_OK)
1689 							return status;
1690 					}
1691 
1692 					block_run *runs = (block_run *)cachedDirect.SetToWritable(
1693 						transaction, fVolume->ToBlock(array[indirectIndex
1694 							% runsPerBlock]) + index / runsPerBlock);
1695 					if (runs == NULL)
1696 						return B_IO_ERROR;
1697 
1698 					do {
1699 						// insert the block_run into the array
1700 						runs[index % runsPerBlock] = run;
1701 						runs[index % runsPerBlock].length
1702 							= HOST_ENDIAN_TO_BFS_INT16(NUM_ARRAY_BLOCKS);
1703 
1704 						// alter the remaining block_run
1705 						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start()
1706 							+ NUM_ARRAY_BLOCKS);
1707 						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
1708 							- NUM_ARRAY_BLOCKS);
1709 					} while ((++index % runsPerBlock) != 0 && run.length);
1710 				} while ((index % runsPerArray) != 0 && run.length);
1711 
1712 				if (++indirectIndex % runsPerBlock == 0) {
1713 					array = NULL;
1714 					index = 0;
1715 				}
1716 			}
1717 
1718 			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(
1719 				data->MaxDoubleIndirectRange()
1720 				+ (runLength << fVolume->BlockShift()));
1721 			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(
1722 				data->max_double_indirect_range) : size;
1723 
1724 			continue;
1725 		}
1726 
1727 		RETURN_ERROR(EFBIG);
1728 	}
1729 	// update the size of the data stream
1730 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1731 
1732 	return B_OK;
1733 }
1734 
1735 
1736 status_t
1737 Inode::_FreeStaticStreamArray(Transaction &transaction, int32 level,
1738 	block_run run, off_t size, off_t offset, off_t &max)
1739 {
1740 	int32 indirectSize = 0;
1741 	if (level == 0)
1742 		indirectSize = (1L << (INDIRECT_BLOCKS_SHIFT + fVolume->BlockShift()))
1743 			* (fVolume->BlockSize() / sizeof(block_run));
1744 	else if (level == 1)
1745 		indirectSize = 4 << fVolume->BlockShift();
1746 
1747 	off_t start;
1748 	if (size > offset)
1749 		start = size - offset;
1750 	else
1751 		start = 0;
1752 
1753 	int32 index = start / indirectSize;
1754 	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1755 
1756 	CachedBlock cached(fVolume);
1757 	off_t blockNumber = fVolume->ToBlock(run);
1758 
1759 	// set the file offset to the current block run
1760 	offset += (off_t)index * indirectSize;
1761 
1762 	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
1763 		block_run *array = (block_run *)cached.SetToWritable(transaction,
1764 			blockNumber + i);
1765 		if (array == NULL)
1766 			RETURN_ERROR(B_ERROR);
1767 
1768 		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
1769 			if (array[index].IsZero()) {
1770 				// we also want to break out of the outer loop
1771 				i = run.Length();
1772 				break;
1773 			}
1774 
1775 			status_t status = B_OK;
1776 			if (level == 0) {
1777 				status = _FreeStaticStreamArray(transaction, 1, array[index],
1778 					size, offset, max);
1779 			} else if (offset >= size)
1780 				status = fVolume->Free(transaction, array[index]);
1781 			else
1782 				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
1783 
1784 			if (status < B_OK)
1785 				RETURN_ERROR(status);
1786 
1787 			if (offset >= size)
1788 				array[index].SetTo(0, 0, 0);
1789 
1790 			offset += indirectSize;
1791 		}
1792 		index = 0;
1793 	}
1794 	return B_OK;
1795 }
1796 
1797 
1798 /*!
1799 	Frees all block_runs in the array which come after the specified size.
1800 	It also trims the last block_run that contain the size.
1801 	"offset" and "max" are maintained until the last block_run that doesn't
1802 	have to be freed - after this, the values won't be correct anymore, but
1803 	will still assure correct function for all subsequent calls.
1804 	"max" is considered to be in file system byte order.
1805 */
1806 status_t
1807 Inode::_FreeStreamArray(Transaction &transaction, block_run *array,
1808 	uint32 arrayLength, off_t size, off_t &offset, off_t &max)
1809 {
1810 	PRINT(("FreeStreamArray: arrayLength %lu, size %Ld, offset %Ld, max %Ld\n",
1811 		arrayLength, size, offset, max));
1812 
1813 	off_t newOffset = offset;
1814 	uint32 i = 0;
1815 	for (; i < arrayLength; i++, offset = newOffset) {
1816 		if (array[i].IsZero())
1817 			break;
1818 
1819 		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
1820 		if (newOffset <= size)
1821 			continue;
1822 
1823 		block_run run = array[i];
1824 
1825 		// determine the block_run to be freed
1826 		if (newOffset > size && offset < size) {
1827 			// free partial block_run (and update the original block_run)
1828 			run.start = array[i].start
1829 				+ ((size - offset) >> fVolume->BlockShift()) + 1;
1830 			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start()
1831 				- array[i].Start());
1832 			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length()
1833 				- array[i].Length());
1834 
1835 			if (run.length == 0)
1836 				continue;
1837 
1838 			// update maximum range
1839 			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length()
1840 				<< fVolume->BlockShift()));
1841 		} else {
1842 			// free the whole block_run
1843 			array[i].SetTo(0, 0, 0);
1844 
1845 			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
1846 				max = HOST_ENDIAN_TO_BFS_INT64(offset);
1847 		}
1848 
1849 		if (fVolume->Free(transaction, run) < B_OK)
1850 			return B_IO_ERROR;
1851 	}
1852 	return B_OK;
1853 }
1854 
1855 
1856 status_t
1857 Inode::_ShrinkStream(Transaction &transaction, off_t size)
1858 {
1859 	data_stream *data = &Node().data;
1860 	status_t status;
1861 
1862 	if (data->MaxDoubleIndirectRange() > size) {
1863 		off_t *maxDoubleIndirect = &data->max_double_indirect_range;
1864 			// gcc 4 work-around: "error: cannot bind packed field
1865 			// 'data->data_stream::max_double_indirect_range' to 'off_t&'"
1866 		status = _FreeStaticStreamArray(transaction, 0, data->double_indirect,
1867 			size, data->MaxIndirectRange(), *maxDoubleIndirect);
1868 		if (status < B_OK)
1869 			return status;
1870 
1871 		if (size <= data->MaxIndirectRange()) {
1872 			fVolume->Free(transaction, data->double_indirect);
1873 			data->double_indirect.SetTo(0, 0, 0);
1874 			data->max_double_indirect_range = 0;
1875 		}
1876 	}
1877 
1878 	if (data->MaxIndirectRange() > size) {
1879 		CachedBlock cached(fVolume);
1880 		off_t block = fVolume->ToBlock(data->indirect);
1881 		off_t offset = data->MaxDirectRange();
1882 
1883 		for (int32 i = 0; i < data->indirect.Length(); i++) {
1884 			block_run *array = (block_run *)cached.SetToWritable(transaction,
1885 				block + i);
1886 			if (array == NULL)
1887 				break;
1888 
1889 			off_t *maxIndirect = &data->max_indirect_range;
1890 				// gcc 4 work-around: "error: cannot bind packed field
1891 				// 'data->data_stream::max_indirect_range' to 'off_t&'"
1892 			if (_FreeStreamArray(transaction, array, fVolume->BlockSize()
1893 					/ sizeof(block_run), size, offset, *maxIndirect) != B_OK)
1894 				return B_IO_ERROR;
1895 		}
1896 		if (data->max_direct_range == data->max_indirect_range) {
1897 			fVolume->Free(transaction, data->indirect);
1898 			data->indirect.SetTo(0, 0, 0);
1899 			data->max_indirect_range = 0;
1900 		}
1901 	}
1902 
1903 	if (data->MaxDirectRange() > size) {
1904 		off_t offset = 0;
1905 		off_t *maxDirect = &data->max_direct_range;
1906 			// gcc 4 work-around: "error: cannot bind packed field
1907 			// 'data->data_stream::max_direct_range' to 'off_t&'"
1908 		status = _FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS,
1909 			size, offset, *maxDirect);
1910 		if (status < B_OK)
1911 			return status;
1912 	}
1913 
1914 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1915 	return B_OK;
1916 }
1917 
1918 
1919 status_t
1920 Inode::SetFileSize(Transaction &transaction, off_t size)
1921 {
1922 	if (size < 0)
1923 		return B_BAD_VALUE;
1924 
1925 	off_t oldSize = Size();
1926 
1927 	if (size == oldSize)
1928 		return B_OK;
1929 
1930 	// should the data stream grow or shrink?
1931 	status_t status;
1932 	if (size > oldSize) {
1933 		status = _GrowStream(transaction, size);
1934 		if (status < B_OK) {
1935 			// if the growing of the stream fails, the whole operation
1936 			// fails, so we should shrink the stream to its former size
1937 			_ShrinkStream(transaction, oldSize);
1938 		}
1939 	} else
1940 		status = _ShrinkStream(transaction, size);
1941 
1942 	if (status < B_OK)
1943 		return status;
1944 
1945 	file_cache_set_size(FileCache(), size);
1946 	return WriteBack(transaction);
1947 }
1948 
1949 
1950 status_t
1951 Inode::Append(Transaction &transaction, off_t bytes)
1952 {
1953 	return SetFileSize(transaction, Size() + bytes);
1954 }
1955 
1956 
1957 /*!
1958 	Checks wether or not this inode's data stream needs to be trimmed
1959 	because of an earlier preallocation.
1960 	Returns true if there are any blocks to be trimmed.
1961 */
1962 bool
1963 Inode::NeedsTrimming()
1964 {
1965 	// We never trim preallocated index blocks to make them grow as smooth as possible.
1966 	// There are only few indices anyway, so this doesn't hurt
1967 	// Also, if an inode is already in deleted state, we don't bother trimming it
1968 	if (IsIndex() || IsDeleted())
1969 		return false;
1970 
1971 	off_t roundedSize = (Size() + fVolume->BlockSize() - 1)
1972 		& ~(fVolume->BlockSize() - 1);
1973 
1974 	return Node().data.MaxDirectRange() > roundedSize
1975 		|| Node().data.MaxIndirectRange() > roundedSize
1976 		|| Node().data.MaxDoubleIndirectRange() > roundedSize;
1977 }
1978 
1979 
1980 status_t
1981 Inode::TrimPreallocation(Transaction &transaction)
1982 {
1983 	status_t status = _ShrinkStream(transaction, Size());
1984 	if (status < B_OK)
1985 		return status;
1986 
1987 	return WriteBack(transaction);
1988 }
1989 
1990 
1991 //!	Frees the file's data stream and removes all attributes
1992 status_t
1993 Inode::Free(Transaction &transaction)
1994 {
1995 	FUNCTION();
1996 
1997 	// Perhaps there should be an implementation of Inode::ShrinkStream() that
1998 	// just frees the data_stream, but doesn't change the inode (since it is
1999 	// freed anyway) - that would make an undelete command possible
2000 	status_t status = SetFileSize(transaction, 0);
2001 	if (status < B_OK)
2002 		return status;
2003 
2004 	// Free all attributes, and remove their indices
2005 	{
2006 		// We have to limit the scope of AttributeIterator, so that its
2007 		// destructor is not called after the inode is deleted
2008 		AttributeIterator iterator(this);
2009 
2010 		char name[B_FILE_NAME_LENGTH];
2011 		uint32 type;
2012 		size_t length;
2013 		vnode_id id;
2014 		while ((status = iterator.GetNext(name, &length, &type, &id)) == B_OK) {
2015 			RemoveAttribute(transaction, name);
2016 		}
2017 	}
2018 
2019 	if (WriteBack(transaction) < B_OK)
2020 		return B_IO_ERROR;
2021 
2022 	return fVolume->Free(transaction, BlockRun());
2023 }
2024 
2025 
2026 //!	Synchronizes (writes back to disk) the file stream of the inode.
2027 status_t
2028 Inode::Sync()
2029 {
2030 	if (FileCache())
2031 		return file_cache_sync(FileCache());
2032 
2033 	// We may also want to flush the attribute's data stream to
2034 	// disk here... (do we?)
2035 
2036 	if (IsSymLink() && (Flags() & INODE_LONG_SYMLINK) == 0)
2037 		return B_OK;
2038 
2039 	data_stream *data = &Node().data;
2040 	status_t status = B_OK;
2041 
2042 	// flush direct range
2043 
2044 	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
2045 		if (data->direct[i].IsZero())
2046 			return B_OK;
2047 
2048 		status = block_cache_sync_etc(fVolume->BlockCache(),
2049 			fVolume->ToBlock(data->direct[i]), data->direct[i].Length());
2050 		if (status != B_OK)
2051 			return status;
2052 	}
2053 
2054 	// flush indirect range
2055 
2056 	if (data->max_indirect_range == 0)
2057 		return B_OK;
2058 
2059 	CachedBlock cached(fVolume);
2060 	off_t block = fVolume->ToBlock(data->indirect);
2061 	int32 count = fVolume->BlockSize() / sizeof(block_run);
2062 
2063 	for (int32 j = 0; j < data->indirect.Length(); j++) {
2064 		block_run *runs = (block_run *)cached.SetTo(block + j);
2065 		if (runs == NULL)
2066 			break;
2067 
2068 		for (int32 i = 0; i < count; i++) {
2069 			if (runs[i].IsZero())
2070 				return B_OK;
2071 
2072 			status = block_cache_sync_etc(fVolume->BlockCache(),
2073 				fVolume->ToBlock(runs[i]), runs[i].Length());
2074 			if (status != B_OK)
2075 				return status;
2076 		}
2077 	}
2078 
2079 	// flush double indirect range
2080 
2081 	if (data->max_double_indirect_range == 0)
2082 		return B_OK;
2083 
2084 	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
2085 
2086 	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
2087 		block_run *indirectRuns = (block_run *)cached.SetTo(indirectBlock + l);
2088 		if (indirectRuns == NULL)
2089 			return B_FILE_ERROR;
2090 
2091 		CachedBlock directCached(fVolume);
2092 
2093 		for (int32 k = 0; k < count; k++) {
2094 			if (indirectRuns[k].IsZero())
2095 				return B_OK;
2096 
2097 			block = fVolume->ToBlock(indirectRuns[k]);
2098 			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
2099 				block_run *runs = (block_run *)directCached.SetTo(block + j);
2100 				if (runs == NULL)
2101 					return B_FILE_ERROR;
2102 
2103 				for (int32 i = 0; i < count; i++) {
2104 					if (runs[i].IsZero())
2105 						return B_OK;
2106 
2107 					// TODO: combine single block_runs to bigger ones when
2108 					// they are adjacent
2109 					status = block_cache_sync_etc(fVolume->BlockCache(),
2110 						fVolume->ToBlock(runs[i]), runs[i].Length());
2111 					if (status != B_OK)
2112 						return status;
2113 				}
2114 			}
2115 		}
2116 	}
2117 	return B_OK;
2118 }
2119 
2120 
2121 //	#pragma mark - creation/deletion
2122 
2123 
2124 status_t
2125 Inode::Remove(Transaction &transaction, const char *name, off_t *_id,
2126 	bool isDirectory)
2127 {
2128 	BPlusTree *tree;
2129 	if (GetTree(&tree) != B_OK)
2130 		RETURN_ERROR(B_BAD_VALUE);
2131 
2132 	RecursiveLocker locker(fVolume->Lock());
2133 
2134 	// does the file even exist?
2135 	off_t id;
2136 	if (tree->Find((uint8 *)name, (uint16)strlen(name), &id) < B_OK)
2137 		return B_ENTRY_NOT_FOUND;
2138 
2139 	if (_id)
2140 		*_id = id;
2141 
2142 	Vnode vnode(fVolume, id);
2143 	Inode *inode;
2144 	status_t status = vnode.Get(&inode);
2145 	if (status < B_OK) {
2146 		REPORT_ERROR(status);
2147 		return B_ENTRY_NOT_FOUND;
2148 	}
2149 
2150 	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
2151 	// bit is set for indices in BFS, not for attribute directories) - but you
2152 	// should really be able to do whatever you want with your indices
2153 	// without having to remove all files first :)
2154 	if (!inode->IsIndex()) {
2155 		// if it's not of the correct type, don't delete it!
2156 		if (inode->IsContainer() != isDirectory)
2157 			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
2158 
2159 		// only delete empty directories
2160 		if (isDirectory && !inode->IsEmpty())
2161 			return B_DIRECTORY_NOT_EMPTY;
2162 	}
2163 
2164 	// remove_vnode() allows the inode to be accessed until the last put_vnode()
2165 	status = remove_vnode(fVolume->ID(), id);
2166 	if (status != B_OK)
2167 		return status;
2168 
2169 	if (tree->Remove(transaction, name, id) < B_OK) {
2170 		unremove_vnode(fVolume->ID(), id);
2171 		RETURN_ERROR(B_ERROR);
2172 	}
2173 
2174 #ifdef DEBUG
2175 	if (tree->Find((uint8 *)name, (uint16)strlen(name), &id) == B_OK) {
2176 		DIE(("deleted entry still there"));
2177 	}
2178 #endif
2179 
2180 	// update the inode, so that no one will ever doubt it's deleted :-)
2181 	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
2182 	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
2183 
2184 	// In balance to the Inode::Create() method, the main indices
2185 	// are updated here (name, size, & last_modified)
2186 
2187 	Index index(fVolume);
2188 	if (inode->IsRegularNode()) {
2189 		index.RemoveName(transaction, name, inode);
2190 			// If removing from the index fails, it is not regarded as a
2191 			// fatal error and will not be reported back!
2192 			// Deleted inodes won't be visible in queries anyway.
2193 	}
2194 
2195 	if (inode->IsFile() || inode->IsSymLink()) {
2196 		if (inode->IsFile())
2197 			index.RemoveSize(transaction, inode);
2198 		index.RemoveLastModified(transaction, inode);
2199 	}
2200 
2201 	return inode->WriteBack(transaction);
2202 }
2203 
2204 
2205 /*!
2206 	Creates the inode with the specified parent directory, and automatically
2207 	adds the created inode to that parent directory. If an attribute directory
2208 	is created, it will also automatically added to the parent inode as such.
2209 	However, the indices root node, and the regular root node won't be added
2210 	to the super block.
2211 	It will also create the initial B+tree for the inode if it's a directory
2212 	of any kind.
2213 	If the "_id" or "_inode" variable is given and non-NULL to store the inode's
2214 	ID, the inode stays locked - you have to call put_vnode() if you don't use it
2215 	anymore.
2216 */
2217 status_t
2218 Inode::Create(Transaction &transaction, Inode *parent, const char *name,
2219 	int32 mode, int openMode, uint32 type, off_t *_id, Inode **_inode)
2220 {
2221 	FUNCTION_START(("name = %s, mode = %ld\n", name, mode));
2222 
2223 	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
2224 	Volume *volume = transaction.GetVolume();
2225 	BPlusTree *tree = NULL;
2226 
2227 	if (parent && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
2228 		// check if the file already exists in the directory
2229 		if (parent->GetTree(&tree) != B_OK)
2230 			RETURN_ERROR(B_BAD_VALUE);
2231 	}
2232 
2233 	WriteLocked locker(parent != NULL ? &parent->Lock() : NULL);
2234 		// the parent directory is locked during the whole inode creation
2235 
2236 	if (tree != NULL) {
2237 		// does the file already exist?
2238 		off_t offset;
2239 		if (tree->Find((uint8 *)name, (uint16)strlen(name), &offset) == B_OK) {
2240 			// return if the file should be a directory/link or opened in exclusive mode
2241 			if (S_ISDIR(mode) || S_ISLNK(mode) || openMode & O_EXCL)
2242 				return B_FILE_EXISTS;
2243 
2244 			Vnode vnode(volume, offset);
2245 			Inode *inode;
2246 			status_t status = vnode.Get(&inode);
2247 			if (status < B_OK) {
2248 				REPORT_ERROR(status);
2249 				return B_ENTRY_NOT_FOUND;
2250 			}
2251 
2252 			// if it's a directory, bail out!
2253 			if (inode->IsDirectory())
2254 				return B_IS_A_DIRECTORY;
2255 
2256 			// we want to open the file, so we should have the rights to do so
2257 			if (inode->CheckPermissions(openModeToAccess(openMode)) != B_OK)
2258 				return B_NOT_ALLOWED;
2259 
2260 			if (openMode & O_TRUNC) {
2261 				// we need write access in order to truncate the file
2262 				status = inode->CheckPermissions(W_OK);
2263 				if (status != B_OK)
2264 					return status;
2265 
2266 				// truncate the existing file
2267 				WriteLocked locked(inode->Lock());
2268 
2269 				status_t status = inode->SetFileSize(transaction, 0);
2270 				if (status >= B_OK)
2271 					status = inode->WriteBack(transaction);
2272 
2273 				if (status < B_OK)
2274 					return status;
2275 			}
2276 
2277 			if (_id)
2278 				*_id = offset;
2279 			if (_inode)
2280 				*_inode = inode;
2281 
2282 			// only keep the vnode in memory if the _id or _inode pointer is provided
2283 			if (_id != NULL || _inode != NULL)
2284 				vnode.Keep();
2285 
2286 			return B_OK;
2287 		}
2288 	} else if (parent && (mode & S_ATTR_DIR) == 0)
2289 		return B_BAD_VALUE;
2290 
2291 	status_t status;
2292 
2293 	// do we have the power to create new files at all?
2294 	if (parent != NULL && (status = parent->CheckPermissions(W_OK)) != B_OK)
2295 		RETURN_ERROR(status);
2296 
2297 	// allocate space for the new inode
2298 	InodeAllocator allocator(transaction);
2299 	block_run run;
2300 	Inode *inode;
2301 	status = allocator.New(&parentRun, mode, run, &inode);
2302 	if (status < B_OK)
2303 		return status;
2304 
2305 	// Initialize the parts of the bfs_inode structure that
2306 	// InodeAllocator::New() hasn't touched yet
2307 
2308 	bfs_inode *node = &inode->Node();
2309 
2310 	if (parent == NULL) {
2311 		// we set the parent to itself in this case
2312 		// (only happens for the root and indices node)
2313 		node->parent = run;
2314 	} else
2315 		node->parent = parentRun;
2316 
2317 	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
2318 	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent
2319 		? parent->Node().GroupID() : getegid());
2320 		// the group ID is inherited from the parent, if available
2321 
2322 	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
2323 
2324 	inode->WriteBack(transaction);
2325 		// make sure the initialized node is available to others
2326 
2327 	// only add the name to regular files, directories, or symlinks
2328 	// don't add it to attributes, or indices
2329 	if (tree && inode->IsRegularNode() && inode->SetName(transaction, name) < B_OK)
2330 		return B_ERROR;
2331 
2332 	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2333 	// a standard directory for files - not for attributes or indices)
2334 	if (inode->IsContainer()) {
2335 		status = allocator.CreateTree();
2336 		if (status < B_OK)
2337 			return status;
2338 	}
2339 
2340 	// Add a link to the inode from the parent, depending on its type
2341 	// (the vnode is not published yet, so it is safe to make the inode
2342 	// accessable to the file system here)
2343 	if (tree) {
2344 		status = tree->Insert(transaction, name, inode->ID());
2345 	} else if (parent && (mode & S_ATTR_DIR) != 0) {
2346 		parent->Attributes() = run;
2347 		status = parent->WriteBack(transaction);
2348 	}
2349 
2350 	// Note, we only care if the inode could be made accessable for the
2351 	// two cases above; the root node or the indices root node must
2352 	// handle this case on their own (or other cases where "parent" is
2353 	// NULL)
2354 	if (status < B_OK)
2355 		RETURN_ERROR(status);
2356 
2357 	// Update the main indices (name, size & last_modified)
2358 	// (live queries might want to access us after this)
2359 
2360 	Index index(volume);
2361 	if (inode->IsRegularNode() && name != NULL) {
2362 		// the name index only contains regular files
2363 		// (but not the root node where name == NULL)
2364 		status = index.InsertName(transaction, name, inode);
2365 		if (status < B_OK && status != B_BAD_INDEX) {
2366 			// We have to remove the node from the parent at this point,
2367 			// because the InodeAllocator destructor can't handle this
2368 			// case (and if it fails, we can't do anything about it...)
2369 			if (tree)
2370 				tree->Remove(transaction, name, inode->ID());
2371 			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2372 				parent->Node().attributes.SetTo(0, 0, 0);
2373 
2374 			RETURN_ERROR(status);
2375 		}
2376 	}
2377 
2378 	inode->UpdateOldLastModified();
2379 
2380 	// The "size" & "last_modified" indices don't contain directories
2381 	if (inode->IsFile() || inode->IsSymLink()) {
2382 		// if adding to these indices fails, the inode creation will not be harmed;
2383 		// they are considered less important than the "name" index
2384 		if (inode->IsFile())
2385 			index.InsertSize(transaction, inode);
2386 		index.InsertLastModified(transaction, inode);
2387 	}
2388 
2389 	// Everything worked well until this point, we have a fully
2390 	// initialized inode, and we want to keep it
2391 	allocator.Keep();
2392 
2393 	if (inode->IsFile() || inode->IsAttribute()) {
2394 		inode->SetFileCache(file_cache_create(volume->ID(), inode->ID(),
2395 			inode->Size(), volume->Device()));
2396 	}
2397 
2398 	if (_id != NULL)
2399 		*_id = inode->ID();
2400 	if (_inode != NULL)
2401 		*_inode = inode;
2402 
2403 	// if either _id or _inode is passed, we will keep the inode locked
2404 	if (_id == NULL && _inode == NULL)
2405 		put_vnode(volume->ID(), inode->ID());
2406 
2407 	return B_OK;
2408 }
2409 
2410 
2411 //	#pragma mark - AttributeIterator
2412 
2413 
2414 AttributeIterator::AttributeIterator(Inode *inode)
2415 	:
2416 	fCurrentSmallData(0),
2417 	fInode(inode),
2418 	fAttributes(NULL),
2419 	fIterator(NULL),
2420 	fBuffer(NULL)
2421 {
2422 	inode->_AddIterator(this);
2423 }
2424 
2425 
2426 AttributeIterator::~AttributeIterator()
2427 {
2428 	if (fAttributes)
2429 		put_vnode(fAttributes->GetVolume()->ID(), fAttributes->ID());
2430 
2431 	delete fIterator;
2432 	fInode->_RemoveIterator(this);
2433 }
2434 
2435 
2436 status_t
2437 AttributeIterator::Rewind()
2438 {
2439 	fCurrentSmallData = 0;
2440 
2441 	if (fIterator != NULL)
2442 		fIterator->Rewind();
2443 
2444 	return B_OK;
2445 }
2446 
2447 
2448 status_t
2449 AttributeIterator::GetNext(char *name, size_t *_length, uint32 *_type,
2450 	vnode_id *_id)
2451 {
2452 	// read attributes out of the small data section
2453 
2454 	if (fCurrentSmallData >= 0) {
2455 		NodeGetter nodeGetter(fInode->GetVolume(), fInode);
2456 		const bfs_inode *node = nodeGetter.Node();
2457 		const small_data *item = ((bfs_inode *)node)->SmallDataStart();
2458 
2459 		fInode->SmallDataLock().Lock();
2460 
2461 		int32 i = 0;
2462 		for (;;item = item->Next()) {
2463 			if (item->IsLast(node))
2464 				break;
2465 
2466 			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2467 				&& *item->Name() == FILE_NAME_NAME)
2468 				continue;
2469 
2470 			if (i++ == fCurrentSmallData)
2471 				break;
2472 		}
2473 
2474 		if (!item->IsLast(node)) {
2475 			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2476 			*_type = item->Type();
2477 			*_length = item->NameSize();
2478 			*_id = (vnode_id)fCurrentSmallData;
2479 
2480 			fCurrentSmallData = i;
2481 		}
2482 		else {
2483 			// stop traversing the small_data section
2484 			fCurrentSmallData = -1;
2485 		}
2486 
2487 		fInode->SmallDataLock().Unlock();
2488 
2489 		if (fCurrentSmallData != -1)
2490 			return B_OK;
2491 	}
2492 
2493 	// read attributes out of the attribute directory
2494 
2495 	if (fInode->Attributes().IsZero())
2496 		return B_ENTRY_NOT_FOUND;
2497 
2498 	Volume *volume = fInode->GetVolume();
2499 
2500 	// if you haven't yet access to the attributes directory, get it
2501 	if (fAttributes == NULL) {
2502 		if (get_vnode(volume->ID(), volume->ToVnode(fInode->Attributes()),
2503 				(void **)&fAttributes) != B_OK) {
2504 			FATAL(("get_vnode() failed in AttributeIterator::GetNext(vnode_id"
2505 				" = %Ld,name = \"%s\")\n",fInode->ID(),name));
2506 			return B_ENTRY_NOT_FOUND;
2507 		}
2508 
2509 		BPlusTree *tree;
2510 		if (fAttributes->GetTree(&tree) < B_OK
2511 			|| (fIterator = new TreeIterator(tree)) == NULL) {
2512 			FATAL(("could not get tree in AttributeIterator::GetNext(vnode_id"
2513 				" = %Ld,name = \"%s\")\n",fInode->ID(),name));
2514 			return B_ENTRY_NOT_FOUND;
2515 		}
2516 	}
2517 
2518 	uint16 length;
2519 	vnode_id id;
2520 	status_t status = fIterator->GetNextEntry(name, &length,
2521 		B_FILE_NAME_LENGTH, &id);
2522 	if (status < B_OK)
2523 		return status;
2524 
2525 	Vnode vnode(volume,id);
2526 	Inode *attribute;
2527 	if ((status = vnode.Get(&attribute)) == B_OK) {
2528 		*_type = attribute->Type();
2529 		*_length = attribute->Size();
2530 		*_id = id;
2531 	}
2532 
2533 	return status;
2534 }
2535 
2536 
2537 void
2538 AttributeIterator::Update(uint16 index, int8 change)
2539 {
2540 	// fCurrentSmallData points already to the next item
2541 	if (index < fCurrentSmallData)
2542 		fCurrentSmallData += change;
2543 }
2544 
2545