xref: /haiku/src/add-ons/kernel/file_systems/bfs/Inode.cpp (revision 1d9d47fc72028bb71b5f232a877231e59cfe2438)
1 /* Inode - inode access functions
2  *
3  * Copyright 2001-2006, Axel Dörfler, axeld@pinc-software.de.
4  * This file may be used under the terms of the MIT License.
5  */
6 
7 
8 #include "Debug.h"
9 #include "Inode.h"
10 #include "BPlusTree.h"
11 #include "Index.h"
12 
13 #include <fs_cache.h>
14 #include <util/kernel_cpp.h>
15 
16 #include <string.h>
17 #include <stdio.h>
18 
19 
20 class InodeAllocator {
21 	public:
22 		InodeAllocator(Transaction &transaction);
23 		~InodeAllocator();
24 
25 		status_t New(block_run *parentRun, mode_t mode, block_run &run, Inode **_inode);
26 		status_t CreateTree();
27 		status_t Keep();
28 
29 	private:
30 		Transaction *fTransaction;
31 		block_run fRun;
32 		Inode *fInode;
33 };
34 
35 
36 InodeAllocator::InodeAllocator(Transaction &transaction)
37 	:
38 	fTransaction(&transaction),
39 	fInode(NULL)
40 {
41 }
42 
43 
44 InodeAllocator::~InodeAllocator()
45 {
46 	if (fTransaction != NULL) {
47 		Volume *volume = fTransaction->GetVolume();
48 
49 		if (fInode != NULL) {
50 			fInode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
51 				// this unblocks any pending bfs_read_vnode() calls
52 			fInode->Free(*fTransaction);
53 			remove_vnode(volume->ID(), fInode->ID());
54 		} else
55 			volume->Free(*fTransaction, fRun);
56 	}
57 
58 	delete fInode;
59 }
60 
61 
62 status_t
63 InodeAllocator::New(block_run *parentRun, mode_t mode, block_run &run, Inode **_inode)
64 {
65 	Volume *volume = fTransaction->GetVolume();
66 
67 	status_t status = volume->AllocateForInode(*fTransaction, parentRun, mode, fRun);
68 	if (status < B_OK) {
69 		// don't free the space in the destructor, because
70 		// the allocation failed
71 		fTransaction = NULL;
72 		RETURN_ERROR(status);
73 	}
74 
75 	run = fRun;
76 	fInode = new Inode(volume, *fTransaction, volume->ToVnode(run), mode, run);
77 	if (fInode == NULL)
78 		RETURN_ERROR(B_NO_MEMORY);
79 
80 	status = new_vnode(volume->ID(), fInode->ID(), fInode);
81 	if (status < B_OK) {
82 		delete fInode;
83 		fInode = NULL;
84 		RETURN_ERROR(status);
85 	}
86 
87 	*_inode = fInode;
88 	return B_OK;
89 }
90 
91 
92 status_t
93 InodeAllocator::CreateTree()
94 {
95 	Volume *volume = fTransaction->GetVolume();
96 
97 	// force S_STR_INDEX to be set, if no type is set
98 	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
99 		fInode->Node().mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
100 
101 	BPlusTree *tree = fInode->fTree = new BPlusTree(*fTransaction, fInode);
102 	if (tree == NULL || tree->InitCheck() < B_OK)
103 		return B_ERROR;
104 
105 	if (fInode->IsRegularNode()) {
106 		if (tree->Insert(*fTransaction, ".", fInode->ID()) < B_OK
107 			|| tree->Insert(*fTransaction, "..", volume->ToVnode(fInode->Parent())) < B_OK)
108 			return B_ERROR;
109 	}
110 	return B_OK;
111 }
112 
113 
114 status_t
115 InodeAllocator::Keep()
116 {
117 	ASSERT(fInode != NULL && fTransaction != NULL);
118 	Volume *volume = fTransaction->GetVolume();
119 
120 	status_t status = fInode->WriteBack(*fTransaction);
121 	if (status < B_OK) {
122 		FATAL(("writing new inode %Ld failed!\n", fInode->ID()));
123 		return status;
124 	}
125 
126 	if (!fInode->IsSymLink())
127 		status = publish_vnode(volume->ID(), fInode->ID(), fInode);
128 
129 	fTransaction = NULL;
130 	fInode = NULL;
131 
132 	return status;
133 }
134 
135 
136 //	#pragma mark -
137 
138 
139 status_t
140 bfs_inode::InitCheck(Volume *volume)
141 {
142 	if (Magic1() != INODE_MAGIC1
143 		|| !(Flags() & INODE_IN_USE)
144 		|| inode_num.Length() != 1
145 		// matches inode size?
146 		|| (uint32)InodeSize() != volume->InodeSize()
147 		// parent resides on disk?
148 		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
149 		|| parent.AllocationGroup() < 0
150 		|| parent.Start() > (1L << volume->AllocationGroupShift())
151 		|| parent.Length() != 1
152 		// attributes, too?
153 		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
154 		|| attributes.AllocationGroup() < 0
155 		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
156 		RETURN_ERROR(B_BAD_DATA);
157 
158 	if (Flags() & INODE_DELETED)
159 		return B_NOT_ALLOWED;
160 
161 	// ToDo: Add some tests to check the integrity of the other stuff here,
162 	// especially for the data_stream!
163 
164 	return B_OK;
165 }
166 
167 
168 //	#pragma mark -
169 
170 
171 Inode::Inode(Volume *volume, vnode_id id)
172 	:
173 	fVolume(volume),
174 	fID(id),
175 	fTree(NULL),
176 	fAttributes(NULL),
177 	fCache(NULL)
178 {
179 	PRINT(("Inode::Inode(volume = %p, id = %Ld) @ %p\n", volume, id, this));
180 
181 	NodeGetter node(volume, this);
182 	memcpy(&fNode, node.Node(), sizeof(bfs_inode));
183 
184 	char lockName[B_OS_NAME_LENGTH];
185 	snprintf(lockName, sizeof(lockName), "bfs inode %ld.%d",
186 		BlockRun().AllocationGroup(), BlockRun().Start());
187 	fLock.Initialize(lockName);
188 
189 	// these two will help to maintain the indices
190 	fOldSize = Size();
191 	fOldLastModified = LastModified();
192 
193 	if (IsContainer())
194 		fTree = new BPlusTree(this);
195 	if (IsFile() || IsAttribute())
196 		SetFileCache(file_cache_create(fVolume->ID(), ID(), Size(), fVolume->Device()));
197 }
198 
199 
200 Inode::Inode(Volume *volume, Transaction &transaction, vnode_id id, mode_t mode, block_run &run)
201 	:
202 	fVolume(volume),
203 	fID(id),
204 	fTree(NULL),
205 	fAttributes(NULL),
206 	fCache(NULL)
207 {
208 	PRINT(("Inode::Inode(volume = %p, transaction = %p, id = %Ld) @ %p\n",
209 		volume, &transaction, id, this));
210 
211 	char lockName[B_OS_NAME_LENGTH];
212 	snprintf(lockName, sizeof(lockName), "bfs inode+%ld.%d",
213 		run.AllocationGroup(), run.Start());
214 	fLock.Initialize(lockName);
215 
216 	NodeGetter node(volume, transaction, this, true);
217 	memset(&fNode, 0, sizeof(bfs_inode));
218 
219 	// Initialize the bfs_inode structure -- it's not written back to disk
220 	// here, because it will be done so already when the inode could be
221 	// created completely.
222 
223 	Node().magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
224 	Node().inode_num = run;
225 	Node().mode = HOST_ENDIAN_TO_BFS_INT32(mode);
226 	Node().flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
227 
228 	Node().create_time = HOST_ENDIAN_TO_BFS_INT64((bigtime_t)time(NULL) << INODE_TIME_SHIFT);
229 	Node().last_modified_time = HOST_ENDIAN_TO_BFS_INT64(Node().create_time
230 		| (volume->GetUniqueID() & INODE_TIME_MASK));
231 		// we use Volume::GetUniqueID() to avoid having too many duplicates in the
232 		// last_modified index
233 
234 	Node().inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
235 
236 	// these two will help to maintain the indices
237 	fOldSize = Size();
238 	fOldLastModified = LastModified();
239 }
240 
241 
242 Inode::~Inode()
243 {
244 	PRINT(("Inode::~Inode() @ %p\n", this));
245 
246 	file_cache_delete(FileCache());
247 	delete fTree;
248 }
249 
250 
251 status_t
252 Inode::InitCheck(bool checkNode)
253 {
254 	// test inode magic and flags
255 	if (checkNode) {
256 		status_t status = Node().InitCheck(fVolume);
257 		if (status == B_BUSY)
258 			return B_BUSY;
259 
260 		if (status < B_OK) {
261 			FATAL(("inode at block %Ld corrupt!\n", BlockNumber()));
262 			RETURN_ERROR(B_BAD_DATA);
263 		}
264 	}
265 
266 	if (IsContainer()) {
267 		// inodes that have a
268 		if (fTree == NULL)
269 			RETURN_ERROR(B_NO_MEMORY);
270 
271 		status_t status = fTree->InitCheck();
272 		if (status < B_OK) {
273 			FATAL(("inode tree at block %Ld corrupt!\n", BlockNumber()));
274 			RETURN_ERROR(B_BAD_DATA);
275 		}
276 	}
277 
278 	// it's more important to know that the inode is corrupt
279 	// so we check for the lock not until here
280 	return fLock.InitCheck();
281 }
282 
283 
284 status_t
285 Inode::WriteBack(Transaction &transaction)
286 {
287 	NodeGetter node(fVolume, transaction, this);
288 	if (node.WritableNode() == NULL)
289 		return B_IO_ERROR;
290 
291 	memcpy(node.WritableNode(), &Node(), sizeof(bfs_inode));
292 	return B_OK;
293 }
294 
295 
296 status_t
297 Inode::CheckPermissions(int accessMode) const
298 {
299 	uid_t user = geteuid();
300 	gid_t group = getegid();
301 
302 	// you never have write access to a read-only volume
303 	if (accessMode & W_OK && fVolume->IsReadOnly())
304 		return B_READ_ONLY_DEVICE;
305 
306 	// root users always have full access (but they can't execute anything)
307 	if (user == 0 && !((accessMode & X_OK) && (Mode() & S_IXUSR) == 0))
308 		return B_OK;
309 
310 	// shift mode bits, to check directly against accessMode
311 	mode_t mode = Mode();
312 	if (user == (uid_t)fNode.UserID())
313 		mode >>= 6;
314 	else if (group == (gid_t)fNode.GroupID())
315 		mode >>= 3;
316 
317 	if (accessMode & ~(mode & S_IRWXO))
318 		return B_NOT_ALLOWED;
319 
320 	return B_OK;
321 }
322 
323 
324 //	#pragma mark -
325 
326 
327 void
328 Inode::_AddIterator(AttributeIterator *iterator)
329 {
330 	if (fSmallDataLock.Lock() < B_OK)
331 		return;
332 
333 	fIterators.Add(iterator);
334 
335 	fSmallDataLock.Unlock();
336 }
337 
338 
339 void
340 Inode::_RemoveIterator(AttributeIterator *iterator)
341 {
342 	if (fSmallDataLock.Lock() < B_OK)
343 		return;
344 
345 	fIterators.Remove(iterator);
346 
347 	fSmallDataLock.Unlock();
348 }
349 
350 
351 /**	Tries to free up "bytes" space in the small_data section by moving
352  *	attributes to real files. Used for system attributes like the name.
353  *	You need to hold the fSmallDataLock when you call this method
354  */
355 
356 status_t
357 Inode::MakeSpaceForSmallData(Transaction &transaction, bfs_inode *node, const char *name,
358 	int32 bytes)
359 {
360 	ASSERT(fSmallDataLock.IsLocked());
361 
362 	while (bytes > 0) {
363 		small_data *item = node->SmallDataStart(), *max = NULL;
364 		int32 index = 0, maxIndex = 0;
365 		for (; !item->IsLast(node); item = item->Next(), index++) {
366 			// should not remove those
367 			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
368 				continue;
369 
370 			if (max == NULL || max->Size() < item->Size()) {
371 				maxIndex = index;
372 				max = item;
373 			}
374 
375 			// remove the first one large enough to free the needed amount of bytes
376 			if (bytes < (int32)item->Size())
377 				break;
378 		}
379 
380 		if (item->IsLast(node) || (int32)item->Size() < bytes)
381 			return B_ERROR;
382 
383 		bytes -= max->Size();
384 
385 		// Move the attribute to a real attribute file
386 		// Luckily, this doesn't cause any index updates
387 
388 		Inode *attribute;
389 		status_t status = CreateAttribute(transaction, item->Name(), item->Type(),
390 			&attribute);
391 		if (status < B_OK)
392 			RETURN_ERROR(status);
393 
394 		size_t length = item->DataSize();
395 		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
396 
397 		ReleaseAttribute(attribute);
398 
399 		if (status < B_OK) {
400 			Vnode vnode(fVolume, Attributes());
401 			Inode *attributes;
402 			if (vnode.Get(&attributes) < B_OK
403 				|| attributes->Remove(transaction, name) < B_OK) {
404 				FATAL(("Could not remove newly created attribute!\n"));
405 			}
406 
407 			RETURN_ERROR(status);
408 		}
409 
410 		_RemoveSmallData(node, max, maxIndex);
411 	}
412 	return B_OK;
413 }
414 
415 
416 /**	Private function which removes the given attribute from the small_data
417  *	section.
418  *	You need to hold the fSmallDataLock when you call this method
419  */
420 
421 status_t
422 Inode::_RemoveSmallData(bfs_inode *node, small_data *item, int32 index)
423 {
424 	ASSERT(fSmallDataLock.IsLocked());
425 
426 	small_data *next = item->Next();
427 	if (!next->IsLast(node)) {
428 		// find the last attribute
429 		small_data *last = next;
430 		while (!last->IsLast(node))
431 			last = last->Next();
432 
433 		int32 size = (uint8 *)last - (uint8 *)next;
434 		if (size < 0 || size > (uint8 *)node + fVolume->BlockSize() - (uint8 *)next)
435 			return B_BAD_DATA;
436 
437 		memmove(item, next, size);
438 
439 		// Move the "last" one to its new location and
440 		// correctly terminate the small_data section
441 		last = (small_data *)((uint8 *)last - ((uint8 *)next - (uint8 *)item));
442 		memset(last, 0, (uint8 *)node + fVolume->BlockSize() - (uint8 *)last);
443 	} else
444 		memset(item, 0, item->Size());
445 
446 	// update all current iterators
447 	AttributeIterator *iterator = NULL;
448 	while ((iterator = fIterators.Next(iterator)) != NULL)
449 		iterator->Update(index, -1);
450 
451 	return B_OK;
452 }
453 
454 
455 /**	Removes the given attribute from the small_data section.
456  *	Note that you need to write back the inode yourself after having called
457  *	that method.
458  */
459 
460 status_t
461 Inode::RemoveSmallData(Transaction &transaction, NodeGetter &nodeGetter, const char *name)
462 {
463 	if (name == NULL)
464 		return B_BAD_VALUE;
465 
466 	bfs_inode *node = nodeGetter.WritableNode();
467 	SimpleLocker locker(fSmallDataLock);
468 
469 	// search for the small_data item
470 
471 	small_data *item = node->SmallDataStart();
472 	int32 index = 0;
473 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
474 		item = item->Next();
475 		index++;
476 	}
477 
478 	if (item->IsLast(node))
479 		return B_ENTRY_NOT_FOUND;
480 
481 	nodeGetter.MakeWritable(transaction);
482 	return _RemoveSmallData(node, item, index);
483 }
484 
485 
486 /**	Try to place the given attribute in the small_data section - if the
487  *	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
488  *	In that case, the attribute should be written to a real attribute file;
489  *	if the attribute was already part of the small_data section, but the new
490  *	one wouldn't fit, the old one is automatically removed from the small_data
491  *	section.
492  *	Note that you need to write back the inode yourself after having called that
493  *	method - it's a bad API decision that it needs a transaction but enforces you
494  *	to write back the inode all by yourself, but it's just more efficient in most
495  *	cases...
496  */
497 
498 status_t
499 Inode::AddSmallData(Transaction &transaction, NodeGetter &nodeGetter, const char *name,
500 	uint32 type, const uint8 *data, size_t length, bool force)
501 {
502 	bfs_inode *node = nodeGetter.WritableNode();
503 
504 	if (node == NULL || name == NULL || data == NULL)
505 		return B_BAD_VALUE;
506 
507 	// reject any requests that can't fit into the small_data section
508 	uint32 nameLength = strlen(name);
509 	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + length + 1;
510 	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
511 		return B_DEVICE_FULL;
512 
513 	nodeGetter.MakeWritable(transaction);
514 	SimpleLocker locker(fSmallDataLock);
515 
516 	// Find the last item or one with the same name we have to add
517 	small_data *item = node->SmallDataStart();
518 	int32 index = 0;
519 	while (!item->IsLast(node) && strcmp(item->Name(), name)) {
520 		item = item->Next();
521 		index++;
522 	}
523 
524 	// is the attribute already in the small_data section?
525 	// then just replace the data part of that one
526 	if (!item->IsLast(node)) {
527 		// find last attribute
528 		small_data *last = item;
529 		while (!last->IsLast(node))
530 			last = last->Next();
531 
532 		// try to change the attributes value
533 		if (item->data_size > length
534 			|| force
535 			|| ((uint8 *)last + length - item->DataSize()) <= ((uint8 *)node + fVolume->InodeSize())) {
536 			// make room for the new attribute if needed (and we are forced to do so)
537 			if (force
538 				&& ((uint8 *)last + length - item->DataSize()) > ((uint8 *)node + fVolume->InodeSize())) {
539 				// We also take the free space at the end of the small_data section
540 				// into account, and request only what's really needed
541 				uint32 needed = length - item->DataSize() -
542 						(uint32)((uint8 *)node + fVolume->InodeSize() - (uint8 *)last);
543 
544 				if (MakeSpaceForSmallData(transaction, node, name, needed) < B_OK)
545 					return B_ERROR;
546 
547 				// reset our pointers
548 				item = node->SmallDataStart();
549 				index = 0;
550 				while (!item->IsLast(node) && strcmp(item->Name(), name)) {
551 					item = item->Next();
552 					index++;
553 				}
554 
555 				last = item;
556 				while (!last->IsLast(node))
557 					last = last->Next();
558 			}
559 
560 			// Normally, we can just overwrite the attribute data as the size
561 			// is specified by the type and does not change that often
562 			if (length != item->DataSize()) {
563 				// move the attributes after the current one
564 				small_data *next = item->Next();
565 				if (!next->IsLast(node))
566 					memmove((uint8 *)item + spaceNeeded, next, (uint8 *)last - (uint8 *)next);
567 
568 				// Move the "last" one to its new location and
569 				// correctly terminate the small_data section
570 				last = (small_data *)((uint8 *)last - ((uint8 *)next - ((uint8 *)item + spaceNeeded)));
571 				if ((uint8 *)last < (uint8 *)node + fVolume->BlockSize())
572 					memset(last, 0, (uint8 *)node + fVolume->BlockSize() - (uint8 *)last);
573 
574 				item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
575 			}
576 
577 			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
578 			memcpy(item->Data(), data, length);
579 			item->Data()[length] = '\0';
580 
581 			return B_OK;
582 		}
583 
584 		// Could not replace the old attribute, so remove it to let
585 		// let the calling function create an attribute file for it
586 		if (_RemoveSmallData(node, item, index) < B_OK)
587 			return B_ERROR;
588 
589 		return B_DEVICE_FULL;
590 	}
591 
592 	// try to add the new attribute!
593 
594 	if ((uint8 *)item + spaceNeeded > (uint8 *)node + fVolume->InodeSize()) {
595 		// there is not enough space for it!
596 		if (!force)
597 			return B_DEVICE_FULL;
598 
599 		// make room for the new attribute
600 		if (MakeSpaceForSmallData(transaction, node, name, spaceNeeded) < B_OK)
601 			return B_ERROR;
602 
603 		// get new last item!
604 		item = node->SmallDataStart();
605 		index = 0;
606 		while (!item->IsLast(node)) {
607 			item = item->Next();
608 			index++;
609 		}
610 	}
611 
612 	memset(item, 0, spaceNeeded);
613 	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
614 	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
615 	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
616 	strcpy(item->Name(), name);
617 	memcpy(item->Data(), data, length);
618 
619 	// correctly terminate the small_data section
620 	item = item->Next();
621 	if (!item->IsLast(node))
622 		memset(item, 0, (uint8 *)node + fVolume->InodeSize() - (uint8 *)item);
623 
624 	// update all current iterators
625 	AttributeIterator *iterator = NULL;
626 	while ((iterator = fIterators.Next(iterator)) != NULL)
627 		iterator->Update(index, 1);
628 
629 	return B_OK;
630 }
631 
632 
633 /**	Iterates through the small_data section of an inode.
634  *	To start at the beginning of this section, you let smallData
635  *	point to NULL, like:
636  *		small_data *data = NULL;
637  *		while (inode->GetNextSmallData(&data) { ... }
638  *
639  *	This function is reentrant and doesn't allocate any memory;
640  *	you can safely stop calling it at any point (you don't need
641  *	to iterate through the whole list).
642  *	You need to hold the fSmallDataLock when you call this method
643  */
644 
645 status_t
646 Inode::GetNextSmallData(bfs_inode *node, small_data **_smallData) const
647 {
648 	if (node == NULL)
649 		RETURN_ERROR(B_BAD_VALUE);
650 
651 	ASSERT(fSmallDataLock.IsLocked());
652 
653 	small_data *data = *_smallData;
654 
655 	// begin from the start?
656 	if (data == NULL)
657 		data = node->SmallDataStart();
658 	else
659 		data = data->Next();
660 
661 	// is already last item?
662 	if (data->IsLast(node))
663 		return B_ENTRY_NOT_FOUND;
664 
665 	*_smallData = data;
666 
667 	return B_OK;
668 }
669 
670 
671 /**	Finds the attribute "name" in the small data section, and
672  *	returns a pointer to it (or NULL if it doesn't exist).
673  *	You need to hold the fSmallDataLock when you call this method
674  */
675 
676 small_data *
677 Inode::FindSmallData(const bfs_inode *node, const char *name) const
678 {
679 	ASSERT(fSmallDataLock.IsLocked());
680 
681 	small_data *smallData = NULL;
682 	while (GetNextSmallData(const_cast<bfs_inode *>(node), &smallData) == B_OK) {
683 		if (!strcmp(smallData->Name(), name))
684 			return smallData;
685 	}
686 	return NULL;
687 }
688 
689 
690 /** Returns a pointer to the node's name if present in the small data
691  *	section, NULL otherwise.
692  *	You need to hold the fSmallDataLock when you call this method
693  */
694 
695 const char *
696 Inode::Name(const bfs_inode *node) const
697 {
698 	ASSERT(fSmallDataLock.IsLocked());
699 
700 	small_data *smallData = NULL;
701 	while (GetNextSmallData((bfs_inode *)node, &smallData) == B_OK) {
702 		if (*smallData->Name() == FILE_NAME_NAME && smallData->NameSize() == FILE_NAME_NAME_LENGTH)
703 			return (const char *)smallData->Data();
704 	}
705 	return NULL;
706 }
707 
708 
709 /** Copies the node's name into the provided buffer.
710  *	The buffer should be B_FILE_NAME_LENGTH bytes large.
711  */
712 
713 status_t
714 Inode::GetName(char *buffer, size_t size) const
715 {
716 	NodeGetter node(fVolume, this);
717 	SimpleLocker locker(fSmallDataLock);
718 
719 	const char *name = Name(node.Node());
720 	if (name == NULL)
721 		return B_ENTRY_NOT_FOUND;
722 
723 	strlcpy(buffer, name, size);
724 	return B_OK;
725 }
726 
727 
728 /**	Changes or set the name of a file: in the inode small_data section only, it
729  *	doesn't change it in the parent directory's b+tree.
730  *	Note that you need to write back the inode yourself after having called
731  *	that method. It suffers from the same API decision as AddSmallData() does
732  *	(and for the same reason).
733  */
734 
735 status_t
736 Inode::SetName(Transaction &transaction, const char *name)
737 {
738 	if (name == NULL || *name == '\0')
739 		return B_BAD_VALUE;
740 
741 	NodeGetter node(fVolume, transaction, this);
742 	const char nameTag[2] = {FILE_NAME_NAME, 0};
743 
744 	return AddSmallData(transaction, node, nameTag, FILE_NAME_TYPE,
745 		(uint8 *)name, strlen(name), true);
746 }
747 
748 
749 /**	Reads data from the specified attribute.
750  *	This is a high-level attribute function that understands attributes
751  *	in the small_data section as well as real attribute files.
752  */
753 
754 status_t
755 Inode::ReadAttribute(const char *name, int32 type, off_t pos, uint8 *buffer, size_t *_length)
756 {
757 	if (pos < 0)
758 		pos = 0;
759 
760 	// search in the small_data section (which has to be locked first)
761 	{
762 		NodeGetter node(fVolume, this);
763 		SimpleLocker locker(fSmallDataLock);
764 
765 		small_data *smallData = FindSmallData(node.Node(), name);
766 		if (smallData != NULL) {
767 			size_t length = *_length;
768 			if (pos >= smallData->data_size) {
769 				*_length = 0;
770 				return B_OK;
771 			}
772 			if (length + pos > smallData->DataSize())
773 				length = smallData->DataSize() - pos;
774 
775 			memcpy(buffer, smallData->Data() + pos, length);
776 			*_length = length;
777 			return B_OK;
778 		}
779 	}
780 
781 	// search in the attribute directory
782 	Inode *attribute;
783 	status_t status = GetAttribute(name, &attribute);
784 	if (status == B_OK) {
785 		if (attribute->Lock().Lock() == B_OK) {
786 			status = attribute->ReadAt(pos, (uint8 *)buffer, _length);
787 			attribute->Lock().Unlock();
788 		} else
789 			status = B_ERROR;
790 
791 		ReleaseAttribute(attribute);
792 	}
793 
794 	RETURN_ERROR(status);
795 }
796 
797 
798 /**	Writes data to the specified attribute.
799  *	This is a high-level attribute function that understands attributes
800  *	in the small_data section as well as real attribute files.
801  */
802 
803 status_t
804 Inode::WriteAttribute(Transaction &transaction, const char *name, int32 type, off_t pos,
805 	const uint8 *buffer, size_t *_length)
806 {
807 	// needed to maintain the index
808 	uint8 oldBuffer[BPLUSTREE_MAX_KEY_LENGTH], *oldData = NULL;
809 	size_t oldLength = 0;
810 
811 	// ToDo: we actually depend on that the contents of "buffer" are constant.
812 	// If they get changed during the write (hey, user programs), we may mess
813 	// up our index trees!
814 
815 	Index index(fVolume);
816 	index.SetTo(name);
817 
818 	Inode *attribute = NULL;
819 	status_t status = B_OK;
820 
821 	if (GetAttribute(name, &attribute) < B_OK) {
822 		// save the old attribute data
823 		NodeGetter node(fVolume, transaction, this);
824 		fSmallDataLock.Lock();
825 
826 		small_data *smallData = FindSmallData(node.Node(), name);
827 		if (smallData != NULL) {
828 			oldLength = smallData->DataSize();
829 			if (oldLength > BPLUSTREE_MAX_KEY_LENGTH)
830 				oldLength = BPLUSTREE_MAX_KEY_LENGTH;
831 			memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
832 		}
833 		fSmallDataLock.Unlock();
834 
835 		// if the attribute doesn't exist yet (as a file), try to put it in the
836 		// small_data section first - if that fails (due to insufficent space),
837 		// create a real attribute file
838 		status = AddSmallData(transaction, node, name, type, buffer, *_length);
839 		if (status == B_DEVICE_FULL) {
840 			status = CreateAttribute(transaction, name, type, &attribute);
841 			if (status < B_OK)
842 				RETURN_ERROR(status);
843 		} else if (status == B_OK)
844 			status = WriteBack(transaction);
845 	}
846 
847 	if (attribute != NULL) {
848 		if (attribute->Lock().LockWrite() == B_OK) {
849 			// save the old attribute data (if this fails, oldLength will reflect it)
850 			if (fVolume->CheckForLiveQuery(name)) {
851 				oldLength = BPLUSTREE_MAX_KEY_LENGTH;
852 				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
853 					oldData = oldBuffer;
854 			}
855 			// ToDo: check if the data fits in the inode now and delete the attribute file if so
856 			status = attribute->WriteAt(transaction, pos, buffer, _length);
857 
858 			attribute->Lock().UnlockWrite();
859 		} else
860 			status = B_ERROR;
861 
862 		ReleaseAttribute(attribute);
863 	}
864 
865 	// ToDo: find a better way than this "pos" thing (the begin of the old key
866 	//	must be copied to the start of the new one for a comparison)
867 	if (status == B_OK && pos == 0) {
868 		// index only the first BPLUSTREE_MAX_KEY_LENGTH bytes
869 		uint16 length = *_length;
870 		if (length > BPLUSTREE_MAX_KEY_LENGTH)
871 			length = BPLUSTREE_MAX_KEY_LENGTH;
872 
873 		// Update index. Note, Index::Update() may be called even if initializing
874 		// the index failed - it will just update the live queries in this case
875 		if (pos < length || pos < oldLength)
876 			index.Update(transaction, name, type, oldData, oldLength, buffer, length, this);
877 	}
878 	return status;
879 }
880 
881 
882 /**	Removes the specified attribute from the inode.
883  *	This is a high-level attribute function that understands attributes
884  *	in the small_data section as well as real attribute files.
885  */
886 
887 status_t
888 Inode::RemoveAttribute(Transaction &transaction, const char *name)
889 {
890 	Index index(fVolume);
891 	bool hasIndex = index.SetTo(name) == B_OK;
892 	NodeGetter node(fVolume, this);
893 
894 	// update index for attributes in the small_data section
895 	{
896 		fSmallDataLock.Lock();
897 
898 		small_data *smallData = FindSmallData(node.Node(), name);
899 		if (smallData != NULL) {
900 			uint32 length = smallData->DataSize();
901 			if (length > BPLUSTREE_MAX_KEY_LENGTH)
902 				length = BPLUSTREE_MAX_KEY_LENGTH;
903 			index.Update(transaction, name, smallData->Type(), smallData->Data(),
904 				length, NULL, 0, this);
905 		}
906 		fSmallDataLock.Unlock();
907 	}
908 
909 	status_t status = RemoveSmallData(transaction, node, name);
910 	if (status == B_OK) {
911 		WriteBack(transaction);
912 	} else if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
913 		// remove the attribute file if it exists
914 		Vnode vnode(fVolume, Attributes());
915 		Inode *attributes;
916 		if ((status = vnode.Get(&attributes)) < B_OK)
917 			return status;
918 
919 		// update index
920 		Inode *attribute;
921 		if ((hasIndex || fVolume->CheckForLiveQuery(name))
922 			&& GetAttribute(name, &attribute) == B_OK) {
923 			uint8 data[BPLUSTREE_MAX_KEY_LENGTH];
924 			size_t length = BPLUSTREE_MAX_KEY_LENGTH;
925 			if (attribute->ReadAt(0, data, &length) == B_OK)
926 				index.Update(transaction, name, attribute->Type(), data, length, NULL, 0, this);
927 
928 			ReleaseAttribute(attribute);
929 		}
930 
931 		if ((status = attributes->Remove(transaction, name)) < B_OK)
932 			return status;
933 
934 		if (attributes->IsEmpty()) {
935 			// remove attribute directory (don't fail if that can't be done)
936 			if (remove_vnode(fVolume->ID(), attributes->ID()) == B_OK) {
937 				// update the inode, so that no one will ever doubt it's deleted :-)
938 				attributes->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
939 				if (attributes->WriteBack(transaction) == B_OK) {
940 					Attributes().SetTo(0, 0, 0);
941 					WriteBack(transaction);
942 				} else
943 					unremove_vnode(fVolume->ID(), attributes->ID());
944 			}
945 		}
946 	}
947 	return status;
948 }
949 
950 
951 status_t
952 Inode::GetAttribute(const char *name, Inode **attribute)
953 {
954 	// does this inode even have attributes?
955 	if (Attributes().IsZero())
956 		return B_ENTRY_NOT_FOUND;
957 
958 	Vnode vnode(fVolume, Attributes());
959 	Inode *attributes;
960 	if (vnode.Get(&attributes) < B_OK) {
961 		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n", name));
962 		return B_ERROR;
963 	}
964 
965 	BPlusTree *tree;
966 	status_t status = attributes->GetTree(&tree);
967 	if (status == B_OK) {
968 		vnode_id id;
969 		if ((status = tree->Find((uint8 *)name, (uint16)strlen(name), &id)) == B_OK) {
970 			Vnode vnode(fVolume, id);
971 			// Check if the attribute is really an attribute
972 			if (vnode.Get(attribute) < B_OK
973 				|| !(*attribute)->IsAttribute())
974 				return B_ERROR;
975 
976 			vnode.Keep();
977 			return B_OK;
978 		}
979 	}
980 	return status;
981 }
982 
983 
984 void
985 Inode::ReleaseAttribute(Inode *attribute)
986 {
987 	if (attribute == NULL)
988 		return;
989 
990 	put_vnode(fVolume->ID(), attribute->ID());
991 }
992 
993 
994 status_t
995 Inode::CreateAttribute(Transaction &transaction, const char *name, uint32 type,
996 	Inode **attribute)
997 {
998 	// do we need to create the attribute directory first?
999 	if (Attributes().IsZero()) {
1000 		status_t status = Inode::Create(transaction, this, NULL,
1001 			S_ATTR_DIR | S_DIRECTORY | 0666, 0, 0, NULL);
1002 		if (status < B_OK)
1003 			RETURN_ERROR(status);
1004 	}
1005 	Vnode vnode(fVolume, Attributes());
1006 	Inode *attributes;
1007 	if (vnode.Get(&attributes) < B_OK)
1008 		return B_ERROR;
1009 
1010 	// Inode::Create() locks the inode for us
1011 	return Inode::Create(transaction, attributes, name,
1012 		S_ATTR | S_FILE | 0666, 0, type, NULL, attribute);
1013 }
1014 
1015 
1016 //	#pragma mark -
1017 
1018 
1019 /**	Gives the caller direct access to the b+tree for a given directory.
1020  *	The tree is no longer created on demand, but when the inode is first
1021  *	created. That will report any potential errors upfront, saves locking,
1022  *	and should work as good (though a bit slower).
1023  */
1024 
1025 status_t
1026 Inode::GetTree(BPlusTree **tree)
1027 {
1028 	if (fTree) {
1029 		*tree = fTree;
1030 		return B_OK;
1031 	}
1032 
1033 	RETURN_ERROR(B_BAD_VALUE);
1034 }
1035 
1036 
1037 bool
1038 Inode::IsEmpty()
1039 {
1040 	BPlusTree *tree;
1041 	status_t status = GetTree(&tree);
1042 	if (status < B_OK)
1043 		return status;
1044 
1045 	TreeIterator iterator(tree);
1046 
1047 	// index and attribute directories are really empty when they are
1048 	// empty - directories for standard files always contain ".", and
1049 	// "..", so we need to ignore those two
1050 
1051 	uint32 count = 0;
1052 	char name[BPLUSTREE_MAX_KEY_LENGTH];
1053 	uint16 length;
1054 	vnode_id id;
1055 	while (iterator.GetNextEntry(name, &length, B_FILE_NAME_LENGTH, &id) == B_OK) {
1056 		if (Mode() & (S_ATTR_DIR | S_INDEX_DIR))
1057 			return false;
1058 
1059 		if (++count > 2 || strcmp(".", name) && strcmp("..", name))
1060 			return false;
1061 	}
1062 	return true;
1063 }
1064 
1065 
1066 /** Finds the block_run where "pos" is located in the data_stream of
1067  *	the inode.
1068  *	If successful, "offset" will then be set to the file offset
1069  *	of the block_run returned; so "pos - offset" is for the block_run
1070  *	what "pos" is for the whole stream.
1071  *	The caller has to make sure that "pos" is inside the stream.
1072  */
1073 
1074 status_t
1075 Inode::FindBlockRun(off_t pos, block_run &run, off_t &offset)
1076 {
1077 	data_stream *data = &Node().data;
1078 
1079 	// find matching block run
1080 
1081 	if (data->MaxDirectRange() > 0 && pos >= data->MaxDirectRange()) {
1082 		if (data->MaxDoubleIndirectRange() > 0 && pos >= data->MaxIndirectRange()) {
1083 			// access to double indirect blocks
1084 
1085 			CachedBlock cached(fVolume);
1086 
1087 			off_t start = pos - data->MaxIndirectRange();
1088 			int32 indirectSize = (1L << (INDIRECT_BLOCKS_SHIFT + cached.BlockShift()))
1089 				* (fVolume->BlockSize() / sizeof(block_run));
1090 			int32 directSize = NUM_ARRAY_BLOCKS << cached.BlockShift();
1091 			int32 index = start / indirectSize;
1092 			int32 runsPerBlock = cached.BlockSize() / sizeof(block_run);
1093 
1094 			block_run *indirect = (block_run *)cached.SetTo(
1095 					fVolume->ToBlock(data->double_indirect) + index / runsPerBlock);
1096 			if (indirect == NULL)
1097 				RETURN_ERROR(B_ERROR);
1098 
1099 			//printf("\tstart = %Ld, indirectSize = %ld, directSize = %ld, index = %ld\n",start,indirectSize,directSize,index);
1100 			//printf("\tlook for indirect block at %ld,%d\n",indirect[index].allocation_group,indirect[index].start);
1101 
1102 			int32 current = (start % indirectSize) / directSize;
1103 
1104 			indirect = (block_run *)cached.SetTo(
1105 					fVolume->ToBlock(indirect[index % runsPerBlock]) + current / runsPerBlock);
1106 			if (indirect == NULL)
1107 				RETURN_ERROR(B_ERROR);
1108 
1109 			run = indirect[current % runsPerBlock];
1110 			offset = data->MaxIndirectRange() + (index * indirectSize) + (current * directSize);
1111 			//printf("\tfCurrent = %ld, fRunFileOffset = %Ld, fRunBlockEnd = %Ld, fRun = %ld,%d\n",fCurrent,fRunFileOffset,fRunBlockEnd,fRun.allocation_group,fRun.start);
1112 		} else {
1113 			// access to indirect blocks
1114 
1115 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1116 			off_t runBlockEnd = data->MaxDirectRange();
1117 
1118 			CachedBlock cached(fVolume);
1119 			off_t block = fVolume->ToBlock(data->indirect);
1120 
1121 			for (int32 i = 0; i < data->indirect.Length(); i++) {
1122 				block_run *indirect = (block_run *)cached.SetTo(block + i);
1123 				if (indirect == NULL)
1124 					RETURN_ERROR(B_IO_ERROR);
1125 
1126 				int32 current = -1;
1127 				while (++current < runsPerBlock) {
1128 					if (indirect[current].IsZero())
1129 						break;
1130 
1131 					runBlockEnd += indirect[current].Length() << cached.BlockShift();
1132 					if (runBlockEnd > pos) {
1133 						run = indirect[current];
1134 						offset = runBlockEnd - (run.Length() << cached.BlockShift());
1135 						//printf("reading from indirect block: %ld,%d\n",fRun.allocation_group,fRun.start);
1136 						//printf("### indirect-run[%ld] = (%ld,%d,%d), offset = %Ld\n",fCurrent,fRun.allocation_group,fRun.start,fRun.Length(),fRunFileOffset);
1137 						return fVolume->ValidateBlockRun(run);
1138 					}
1139 				}
1140 			}
1141 			RETURN_ERROR(B_ERROR);
1142 		}
1143 	} else {
1144 		// access from direct blocks
1145 
1146 		off_t runBlockEnd = 0LL;
1147 		int32 current = -1;
1148 
1149 		while (++current < NUM_DIRECT_BLOCKS) {
1150 			if (data->direct[current].IsZero())
1151 				break;
1152 
1153 			runBlockEnd += data->direct[current].Length() << fVolume->BlockShift();
1154 			if (runBlockEnd > pos) {
1155 				run = data->direct[current];
1156 				offset = runBlockEnd - (run.Length() << fVolume->BlockShift());
1157 				//printf("### run[%ld] = (%ld,%d,%d), offset = %Ld\n",fCurrent,fRun.allocation_group,fRun.start,fRun.Length(),fRunFileOffset);
1158 				return fVolume->ValidateBlockRun(run);
1159 			}
1160 		}
1161 
1162 		//PRINT(("FindBlockRun() failed in direct range: size = %Ld, pos = %Ld\n",data->size,pos));
1163 		return B_ENTRY_NOT_FOUND;
1164 	}
1165 	return fVolume->ValidateBlockRun(run);
1166 }
1167 
1168 
1169 status_t
1170 Inode::ReadAt(off_t pos, uint8 *buffer, size_t *_length)
1171 {
1172 #if 0
1173 	// call the right ReadAt() method, depending on the inode flags
1174 
1175 	if (Flags() & INODE_LOGGED)
1176 		return ((Stream<Access::Logged> *)this)->ReadAt(pos, buffer, _length);
1177 
1178 	return ((Stream<Access::Cached> *)this)->ReadAt(pos, buffer, _length);
1179 #endif
1180 	size_t length = *_length;
1181 
1182 	// set/check boundaries for pos/length
1183 	if (pos < 0)
1184 		return B_BAD_VALUE;
1185 
1186 	if (pos >= Size() || length == 0) {
1187 		*_length = 0;
1188 		return B_NO_ERROR;
1189 	}
1190 
1191 	return file_cache_read(FileCache(), pos, buffer, _length);
1192 }
1193 
1194 
1195 status_t
1196 Inode::WriteAt(Transaction &transaction, off_t pos, const uint8 *buffer, size_t *_length)
1197 {
1198 	// update the last modification time in memory, it will be written
1199 	// back to the inode, and the index when the file is closed
1200 	// ToDo: should update the internal last modified time only at this point!
1201 	Node().last_modified_time = HOST_ENDIAN_TO_BFS_INT64((bigtime_t)time(NULL) << INODE_TIME_SHIFT);
1202 
1203 	// ToDo: support INODE_LOGGED!
1204 #if 0
1205 	if (Flags() & INODE_LOGGED)
1206 		return ((Stream<Access::Logged> *)this)->WriteAt(transaction, pos, buffer, _length);
1207 
1208 	return ((Stream<Access::Cached> *)this)->WriteAt(transaction, pos, buffer, _length);
1209 #endif
1210 	size_t length = *_length;
1211 
1212 	// set/check boundaries for pos/length
1213 	if (pos < 0)
1214 		return B_BAD_VALUE;
1215 
1216 	if (pos + length > Size()) {
1217 		off_t oldSize = Size();
1218 
1219 		// the transaction doesn't have to be started already
1220 		// ToDo: what's that INODE_NO_TRANSACTION flag good for again?
1221 		if ((Flags() & INODE_NO_TRANSACTION) == 0
1222 			&& !transaction.IsStarted())
1223 			transaction.Start(fVolume, BlockNumber());
1224 
1225 		// let's grow the data stream to the size needed
1226 		status_t status = SetFileSize(transaction, pos + length);
1227 		if (status < B_OK) {
1228 			*_length = 0;
1229 			RETURN_ERROR(status);
1230 		}
1231 		// If the position of the write was beyond the file size, we
1232 		// have to fill the gap between that position and the old file
1233 		// size with zeros.
1234 		FillGapWithZeros(oldSize, pos);
1235 
1236 		// we need to write back the inode here because it has to
1237 		// go into this transaction (we cannot wait until the file
1238 		// is closed)
1239 		status = WriteBack(transaction);
1240 		if (status < B_OK)
1241 			return status;
1242 	}
1243 
1244 	// If we don't want to write anything, we can now return (we may
1245 	// just have changed the file size using the position parameter)
1246 	if (length == 0)
1247 		return B_OK;
1248 
1249 	return file_cache_write(FileCache(), pos, buffer, _length);
1250 }
1251 
1252 
1253 /**	Fills the gap between the old file size and the new file size
1254  *	with zeros.
1255  *	It's more or less a copy of Inode::WriteAt() but it can handle
1256  *	length differences of more than just 4 GB, and it never uses
1257  *	the log, even if the INODE_LOGGED flag is set.
1258  */
1259 
1260 status_t
1261 Inode::FillGapWithZeros(off_t pos, off_t newSize)
1262 {
1263 	// ToDo: we currently do anything here, same as original BFS!
1264 	//if (pos >= newSize)
1265 	return B_OK;
1266 #if 0
1267 	block_run run;
1268 	off_t offset;
1269 	if (FindBlockRun(pos, run, offset) < B_OK)
1270 		RETURN_ERROR(B_BAD_VALUE);
1271 
1272 	off_t length = newSize - pos;
1273 	uint32 bytesWritten = 0;
1274 	uint32 blockSize = fVolume->BlockSize();
1275 	uint32 blockShift = fVolume->BlockShift();
1276 	uint8 *block;
1277 
1278 	// the first block_run we write could not be aligned to the block_size boundary
1279 	// (write partial block at the beginning)
1280 
1281 	// pos % block_size == (pos - offset) % block_size, offset % block_size == 0
1282 	if (pos % blockSize != 0) {
1283 		run.start += (pos - offset) / blockSize;
1284 		run.length -= (pos - offset) / blockSize;
1285 
1286 		CachedBlock cached(fVolume,run);
1287 		if ((block = cached.Block()) == NULL)
1288 			RETURN_ERROR(B_BAD_VALUE);
1289 
1290 		bytesWritten = blockSize - (pos % blockSize);
1291 		if (length < bytesWritten)
1292 			bytesWritten = length;
1293 
1294 		memset(block + (pos % blockSize), 0, bytesWritten);
1295 		if (fVolume->WriteBlocks(cached.BlockNumber(), block, 1) < B_OK)
1296 			RETURN_ERROR(B_IO_ERROR);
1297 
1298 		pos += bytesWritten;
1299 
1300 		length -= bytesWritten;
1301 		if (length == 0)
1302 			return B_OK;
1303 
1304 		if (FindBlockRun(pos, run, offset) < B_OK)
1305 			RETURN_ERROR(B_BAD_VALUE);
1306 	}
1307 
1308 	while (length > 0) {
1309 		// offset is the offset to the current pos in the block_run
1310 		run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start() + ((pos - offset) >> blockShift));
1311 		run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - ((pos - offset) >> blockShift));
1312 
1313 		CachedBlock cached(fVolume);
1314 		off_t blockNumber = fVolume->ToBlock(run);
1315 		for (int32 i = 0; i < run.Length(); i++) {
1316 			if ((block = cached.SetTo(blockNumber + i, true)) == NULL)
1317 				RETURN_ERROR(B_IO_ERROR);
1318 
1319 			if (fVolume->WriteBlocks(cached.BlockNumber(), block, 1) < B_OK)
1320 				RETURN_ERROR(B_IO_ERROR);
1321 		}
1322 
1323 		int32 bytes = run.Length() << blockShift;
1324 		length -= bytes;
1325 		bytesWritten += bytes;
1326 
1327 		// since we don't respect a last partial block, length can be lower
1328 		if (length <= 0)
1329 			break;
1330 
1331 		pos += bytes;
1332 
1333 		if (FindBlockRun(pos, run, offset) < B_OK)
1334 			RETURN_ERROR(B_BAD_VALUE);
1335 	}
1336 	return B_OK;
1337 #endif
1338 }
1339 
1340 
1341 /** Allocates NUM_ARRAY_BLOCKS blocks, and clears their contents. Growing
1342  *	the indirect and double indirect range uses this method.
1343  *	The allocated block_run is saved in "run"
1344  */
1345 
1346 status_t
1347 Inode::_AllocateBlockArray(Transaction &transaction, block_run &run)
1348 {
1349 	if (!run.IsZero())
1350 		return B_BAD_VALUE;
1351 
1352 	status_t status = fVolume->Allocate(transaction, this, NUM_ARRAY_BLOCKS, run,
1353 		NUM_ARRAY_BLOCKS);
1354 	if (status < B_OK)
1355 		return status;
1356 
1357 	// make sure those blocks are empty
1358 	CachedBlock cached(fVolume);
1359 	off_t block = fVolume->ToBlock(run);
1360 
1361 	for (int32 i = 0; i < run.Length(); i++) {
1362 		block_run *runs = (block_run *)cached.SetToWritable(transaction, block + i, true);
1363 		if (runs == NULL)
1364 			return B_IO_ERROR;
1365 	}
1366 	return B_OK;
1367 }
1368 
1369 
1370 status_t
1371 Inode::_GrowStream(Transaction &transaction, off_t size)
1372 {
1373 	data_stream *data = &Node().data;
1374 
1375 	// is the data stream already large enough to hold the new size?
1376 	// (can be the case with preallocated blocks)
1377 	if (size < data->MaxDirectRange()
1378 		|| size < data->MaxIndirectRange()
1379 		|| size < data->MaxDoubleIndirectRange()) {
1380 		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1381 		return B_OK;
1382 	}
1383 
1384 	// how many bytes are still needed? (unused ranges are always zero)
1385 	uint16 minimum = 1;
1386 	off_t bytes;
1387 	if (data->Size() < data->MaxDoubleIndirectRange()) {
1388 		bytes = size - data->MaxDoubleIndirectRange();
1389 		// the double indirect range can only handle multiple of NUM_ARRAY_BLOCKS
1390 		minimum = NUM_ARRAY_BLOCKS;
1391 	} else if (data->Size() < data->MaxIndirectRange())
1392 		bytes = size - data->MaxIndirectRange();
1393 	else if (data->Size() < data->MaxDirectRange())
1394 		bytes = size - data->MaxDirectRange();
1395 	else
1396 		bytes = size - data->Size();
1397 
1398 	// do we have enough free blocks on the disk?
1399 	off_t blocksRequested = (bytes + fVolume->BlockSize() - 1) >> fVolume->BlockShift();
1400 	if (blocksRequested > fVolume->FreeBlocks())
1401 		return B_DEVICE_FULL;
1402 
1403 	off_t blocksNeeded = blocksRequested;
1404 		// because of preallocations and partial allocations, the number of
1405 		// blocks we need to allocate may be different from the one we request
1406 		// from the block allocator
1407 
1408 	// Should we preallocate some blocks (currently, always 64k)?
1409 	// Attributes, attribute directories, and long symlinks usually won't get that big,
1410 	// and should stay close to the inode - preallocating could be counterproductive.
1411 	// Also, if free disk space is tight, we probably don't want to do this as well.
1412 	if (!IsAttribute() && !IsAttributeDirectory() && !IsSymLink()
1413 		&& blocksRequested < (65536 >> fVolume->BlockShift())
1414 		&& fVolume->FreeBlocks() > 128)
1415 		blocksRequested = 65536 >> fVolume->BlockShift();
1416 
1417 	while (blocksNeeded > 0) {
1418 		// the requested blocks do not need to be returned with a
1419 		// single allocation, so we need to iterate until we have
1420 		// enough blocks allocated
1421 		block_run run;
1422 		status_t status = fVolume->Allocate(transaction, this, blocksRequested, run, minimum);
1423 		if (status < B_OK)
1424 			return status;
1425 
1426 		// okay, we have the needed blocks, so just distribute them to the
1427 		// different ranges of the stream (direct, indirect & double indirect)
1428 
1429 		// ToDo: if anything goes wrong here, we probably want to free the
1430 		// blocks that couldn't be distributed into the stream!
1431 
1432 		blocksNeeded -= run.Length();
1433 		// don't preallocate if the first allocation was already too small
1434 		blocksRequested = blocksNeeded;
1435 		if (minimum > 1) {
1436 			// make sure that "blocks" is a multiple of minimum
1437 			blocksRequested = (blocksRequested + minimum - 1) & ~(minimum - 1);
1438 		}
1439 
1440 		// Direct block range
1441 
1442 		if (data->Size() <= data->MaxDirectRange()) {
1443 			// let's try to put them into the direct block range
1444 			int32 free = 0;
1445 			for (; free < NUM_DIRECT_BLOCKS; free++)
1446 				if (data->direct[free].IsZero())
1447 					break;
1448 
1449 			if (free < NUM_DIRECT_BLOCKS) {
1450 				// can we merge the last allocated run with the new one?
1451 				int32 last = free - 1;
1452 				if (free > 0 && data->direct[last].MergeableWith(run))
1453 					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(data->direct[last].Length() + run.Length());
1454 				else
1455 					data->direct[free] = run;
1456 
1457 				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(data->MaxDirectRange() + run.Length() * fVolume->BlockSize());
1458 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0 ? data->max_direct_range : size);
1459 				continue;
1460 			}
1461 		}
1462 
1463 		// Indirect block range
1464 
1465 		if (data->Size() <= data->MaxIndirectRange() || !data->MaxIndirectRange()) {
1466 			CachedBlock cached(fVolume);
1467 			block_run *runs = NULL;
1468 			uint32 free = 0;
1469 			off_t block;
1470 
1471 			// if there is no indirect block yet, create one
1472 			if (data->indirect.IsZero()) {
1473 				status = _AllocateBlockArray(transaction, data->indirect);
1474 				if (status < B_OK)
1475 					return status;
1476 
1477 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(data->MaxDirectRange());
1478 				// insert the block_run in the first block
1479 				runs = (block_run *)cached.SetTo(data->indirect);
1480 			} else {
1481 				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1482 				block = fVolume->ToBlock(data->indirect);
1483 
1484 				// search first empty entry
1485 				int32 i = 0;
1486 				for (; i < data->indirect.Length(); i++) {
1487 					if ((runs = (block_run *)cached.SetTo(block + i)) == NULL)
1488 						return B_IO_ERROR;
1489 
1490 					for (free = 0; free < numberOfRuns; free++)
1491 						if (runs[free].IsZero())
1492 							break;
1493 
1494 					if (free < numberOfRuns)
1495 						break;
1496 				}
1497 				if (i == data->indirect.Length())
1498 					runs = NULL;
1499 			}
1500 
1501 			if (runs != NULL) {
1502 				// try to insert the run to the last one - note that this doesn't
1503 				// take block borders into account, so it could be further optimized
1504 				cached.MakeWritable(transaction);
1505 
1506 				int32 last = free - 1;
1507 				if (free > 0 && runs[last].MergeableWith(run))
1508 					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(runs[last].Length() + run.Length());
1509 				else
1510 					runs[free] = run;
1511 
1512 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(data->MaxIndirectRange() + (run.Length() << fVolume->BlockShift()));
1513 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0 ? data->MaxIndirectRange() : size);
1514 				continue;
1515 			}
1516 		}
1517 
1518 		// Double indirect block range
1519 
1520 		if (data->Size() <= data->MaxDoubleIndirectRange() || !data->max_double_indirect_range) {
1521 			while ((run.Length() % NUM_ARRAY_BLOCKS) != 0) {
1522 				// The number of allocated blocks isn't a multiple of NUM_ARRAY_BLOCKS,
1523 				// so we have to change this. This can happen the first time the stream
1524 				// grows into the double indirect range.
1525 				// First, free the remaining blocks that don't fit into a multiple
1526 				// of NUM_ARRAY_BLOCKS
1527 				int32 rest = run.Length() % NUM_ARRAY_BLOCKS;
1528 				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1529 
1530 				status = fVolume->Free(transaction, block_run::Run(run.AllocationGroup(),
1531 					run.Start() + run.Length(), rest));
1532 				if (status < B_OK)
1533 					return status;
1534 
1535 				blocksNeeded += rest;
1536 				blocksRequested = (blocksNeeded + NUM_ARRAY_BLOCKS - 1) & ~(NUM_ARRAY_BLOCKS - 1);
1537 				minimum = NUM_ARRAY_BLOCKS;
1538 					// we make sure here that we have at minimum NUM_ARRAY_BLOCKS allocated,
1539 					// so if the allocation succeeds, we don't run into an endless loop
1540 
1541 				// Are there any blocks left in the run? If not, allocate a new one
1542 				if (run.length == 0)
1543 					continue;
1544 			}
1545 
1546 			// if there is no double indirect block yet, create one
1547 			if (data->double_indirect.IsZero()) {
1548 				status = _AllocateBlockArray(transaction, data->double_indirect);
1549 				if (status < B_OK)
1550 					return status;
1551 
1552 				data->max_double_indirect_range = data->max_indirect_range;
1553 			}
1554 
1555 			// calculate the index where to insert the new blocks
1556 
1557 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1558 			int32 indirectSize = ((1L << INDIRECT_BLOCKS_SHIFT) << fVolume->BlockShift())
1559 				* runsPerBlock;
1560 			int32 directSize = NUM_ARRAY_BLOCKS << fVolume->BlockShift();
1561 			int32 runsPerArray = runsPerBlock << ARRAY_BLOCKS_SHIFT;
1562 
1563 			off_t start = data->MaxDoubleIndirectRange() - data->MaxIndirectRange();
1564 			int32 indirectIndex = start / indirectSize;
1565 			int32 index = start / directSize;
1566 
1567 			// distribute the blocks to the array and allocate
1568 			// new array blocks when needed
1569 
1570 			CachedBlock cached(fVolume);
1571 			CachedBlock cachedDirect(fVolume);
1572 			block_run *array = NULL;
1573 			uint32 runLength = run.Length();
1574 
1575 			// ToDo: the following code is commented - it could be used to
1576 			// preallocate all needed block arrays to see in advance if the
1577 			// allocation will succeed.
1578 			// I will probably remove it later, because it's no perfect solution
1579 			// either: if the allocation was broken up before (blocksNeeded != 0),
1580 			// it doesn't guarantee anything.
1581 			// And since failing in this case is not that common, it doesn't have
1582 			// to be optimized in that way.
1583 			// Anyway, I wanted to have it in CVS - all those lines, and they will
1584 			// be removed soon :-)
1585 /*
1586 			// allocate new block arrays if needed
1587 
1588 			off_t block = -1;
1589 
1590 			for (int32 i = 0;i < needed;i++) {
1591 				// get the block to insert the run into
1592 				block = fVolume->ToBlock(data->double_indirect) + i + indirectIndex / runsPerBlock;
1593 				if (cached.BlockNumber() != block)
1594 					array = (block_run *)cached.SetTo(block);
1595 
1596 				if (array == NULL)
1597 					return B_ERROR;
1598 
1599 				status = AllocateBlockArray(transaction, array[i + indirectIndex % runsPerBlock]);
1600 				if (status < B_OK)
1601 					return status;
1602 			}
1603 */
1604 
1605 			while (run.length != 0) {
1606 				// get the indirect array block
1607 				if (array == NULL) {
1608 					array = (block_run *)cached.SetTo(fVolume->ToBlock(data->double_indirect)
1609 						+ indirectIndex / runsPerBlock);
1610 					if (array == NULL)
1611 						return B_IO_ERROR;
1612 				}
1613 
1614 				do {
1615 					// do we need a new array block?
1616 					if (array[indirectIndex % runsPerBlock].IsZero()) {
1617 						status = _AllocateBlockArray(transaction,
1618 							array[indirectIndex % runsPerBlock]);
1619 						if (status < B_OK)
1620 							return status;
1621 					}
1622 
1623 					block_run *runs = (block_run *)cachedDirect.SetToWritable(transaction,
1624 						fVolume->ToBlock(array[indirectIndex % runsPerBlock])
1625 						+ index / runsPerBlock);
1626 					if (runs == NULL)
1627 						return B_IO_ERROR;
1628 
1629 					do {
1630 						// insert the block_run into the array
1631 						runs[index % runsPerBlock] = run;
1632 						runs[index % runsPerBlock].length = HOST_ENDIAN_TO_BFS_INT16(NUM_ARRAY_BLOCKS);
1633 
1634 						// alter the remaining block_run
1635 						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start() + NUM_ARRAY_BLOCKS);
1636 						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - NUM_ARRAY_BLOCKS);
1637 					} while ((++index % runsPerBlock) != 0 && run.length);
1638 				} while ((index % runsPerArray) != 0 && run.length);
1639 
1640 				if (++indirectIndex % runsPerBlock == 0) {
1641 					array = NULL;
1642 					index = 0;
1643 				}
1644 			}
1645 
1646 			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(data->MaxDoubleIndirectRange() + (runLength << fVolume->BlockShift()));
1647 			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(data->max_double_indirect_range) : size;
1648 
1649 			continue;
1650 		}
1651 
1652 		RETURN_ERROR(EFBIG);
1653 	}
1654 	// update the size of the data stream
1655 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1656 
1657 	return B_OK;
1658 }
1659 
1660 
1661 status_t
1662 Inode::_FreeStaticStreamArray(Transaction &transaction, int32 level, block_run run,
1663 	off_t size, off_t offset, off_t &max)
1664 {
1665 	int32 indirectSize = 0;
1666 	if (level == 0)
1667 		indirectSize = (1L << (INDIRECT_BLOCKS_SHIFT + fVolume->BlockShift()))
1668 			* (fVolume->BlockSize() / sizeof(block_run));
1669 	else if (level == 1)
1670 		indirectSize = 4 << fVolume->BlockShift();
1671 
1672 	off_t start;
1673 	if (size > offset)
1674 		start = size - offset;
1675 	else
1676 		start = 0;
1677 
1678 	int32 index = start / indirectSize;
1679 	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1680 
1681 	CachedBlock cached(fVolume);
1682 	off_t blockNumber = fVolume->ToBlock(run);
1683 
1684 	// set the file offset to the current block run
1685 	offset += (off_t)index * indirectSize;
1686 
1687 	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
1688 		block_run *array = (block_run *)cached.SetToWritable(transaction, blockNumber + i);
1689 		if (array == NULL)
1690 			RETURN_ERROR(B_ERROR);
1691 
1692 		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
1693 			if (array[index].IsZero()) {
1694 				// we also want to break out of the outer loop
1695 				i = run.Length();
1696 				break;
1697 			}
1698 
1699 			status_t status = B_OK;
1700 			if (level == 0) {
1701 				status = _FreeStaticStreamArray(transaction, 1, array[index], size,
1702 					offset, max);
1703 			} else if (offset >= size)
1704 				status = fVolume->Free(transaction, array[index]);
1705 			else
1706 				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
1707 
1708 			if (status < B_OK)
1709 				RETURN_ERROR(status);
1710 
1711 			if (offset >= size)
1712 				array[index].SetTo(0, 0, 0);
1713 
1714 			offset += indirectSize;
1715 		}
1716 		index = 0;
1717 	}
1718 	return B_OK;
1719 }
1720 
1721 
1722 /** Frees all block_runs in the array which come after the specified size.
1723  *	It also trims the last block_run that contain the size.
1724  *	"offset" and "max" are maintained until the last block_run that doesn't
1725  *	have to be freed - after this, the values won't be correct anymore, but
1726  *	will still assure correct function for all subsequent calls.
1727  *	"max" is considered to be in file system byte order.
1728  */
1729 
1730 status_t
1731 Inode::_FreeStreamArray(Transaction &transaction, block_run *array, uint32 arrayLength,
1732 	off_t size, off_t &offset, off_t &max)
1733 {
1734 	PRINT(("FreeStreamArray: arrayLength %lu, size %Ld, offset %Ld, max %Ld\n",
1735 		arrayLength, size, offset, max));
1736 
1737 	off_t newOffset = offset;
1738 	uint32 i = 0;
1739 	for (; i < arrayLength; i++, offset = newOffset) {
1740 		if (array[i].IsZero())
1741 			break;
1742 
1743 		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
1744 		if (newOffset <= size)
1745 			continue;
1746 
1747 		block_run run = array[i];
1748 
1749 		// determine the block_run to be freed
1750 		if (newOffset > size && offset < size) {
1751 			// free partial block_run (and update the original block_run)
1752 			run.start = array[i].start + ((size - offset) >> fVolume->BlockShift()) + 1;
1753 			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start() - array[i].Start());
1754 			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - array[i].Length());
1755 
1756 			if (run.length == 0)
1757 				continue;
1758 
1759 			// update maximum range
1760 			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length() << fVolume->BlockShift()));
1761 		} else {
1762 			// free the whole block_run
1763 			array[i].SetTo(0, 0, 0);
1764 
1765 			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
1766 				max = HOST_ENDIAN_TO_BFS_INT64(offset);
1767 		}
1768 
1769 		if (fVolume->Free(transaction, run) < B_OK)
1770 			return B_IO_ERROR;
1771 	}
1772 	return B_OK;
1773 }
1774 
1775 
1776 status_t
1777 Inode::_ShrinkStream(Transaction &transaction, off_t size)
1778 {
1779 	data_stream *data = &Node().data;
1780 	status_t status;
1781 
1782 	if (data->MaxDoubleIndirectRange() > size) {
1783 		off_t *maxDoubleIndirect = &data->max_double_indirect_range;
1784 			// gcc 4 work-around: "error: cannot bind packed field
1785 			// 'data->data_stream::max_double_indirect_range' to 'off_t&'"
1786 		status = _FreeStaticStreamArray(transaction, 0, data->double_indirect, size,
1787 			data->MaxIndirectRange(), *maxDoubleIndirect);
1788 		if (status < B_OK)
1789 			return status;
1790 
1791 		if (size <= data->MaxIndirectRange()) {
1792 			fVolume->Free(transaction, data->double_indirect);
1793 			data->double_indirect.SetTo(0, 0, 0);
1794 			data->max_double_indirect_range = 0;
1795 		}
1796 	}
1797 
1798 	if (data->MaxIndirectRange() > size) {
1799 		CachedBlock cached(fVolume);
1800 		off_t block = fVolume->ToBlock(data->indirect);
1801 		off_t offset = data->MaxDirectRange();
1802 
1803 		for (int32 i = 0; i < data->indirect.Length(); i++) {
1804 			block_run *array = (block_run *)cached.SetToWritable(transaction, block + i);
1805 			if (array == NULL)
1806 				break;
1807 
1808 			off_t *maxIndirect = &data->max_indirect_range;
1809 				// gcc 4 work-around: "error: cannot bind packed field
1810 				// 'data->data_stream::max_indirect_range' to 'off_t&'"
1811 			if (_FreeStreamArray(transaction, array, fVolume->BlockSize() / sizeof(block_run),
1812 					size, offset, *maxIndirect) != B_OK)
1813 				return B_IO_ERROR;
1814 		}
1815 		if (data->max_direct_range == data->max_indirect_range) {
1816 			fVolume->Free(transaction, data->indirect);
1817 			data->indirect.SetTo(0, 0, 0);
1818 			data->max_indirect_range = 0;
1819 		}
1820 	}
1821 
1822 	if (data->MaxDirectRange() > size) {
1823 		off_t offset = 0;
1824 		off_t *maxDirect = &data->max_direct_range;
1825 			// gcc 4 work-around: "error: cannot bind packed field
1826 			// 'data->data_stream::max_direct_range' to 'off_t&'"
1827 		status = _FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS,
1828 			size, offset, *maxDirect);
1829 		if (status < B_OK)
1830 			return status;
1831 	}
1832 
1833 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1834 	return B_OK;
1835 }
1836 
1837 
1838 status_t
1839 Inode::SetFileSize(Transaction &transaction, off_t size)
1840 {
1841 	if (size < 0)
1842 		return B_BAD_VALUE;
1843 
1844 	off_t oldSize = Size();
1845 
1846 	if (size == oldSize)
1847 		return B_OK;
1848 
1849 	// should the data stream grow or shrink?
1850 	status_t status;
1851 	if (size > oldSize) {
1852 		status = _GrowStream(transaction, size);
1853 		if (status < B_OK) {
1854 			// if the growing of the stream fails, the whole operation
1855 			// fails, so we should shrink the stream to its former size
1856 			_ShrinkStream(transaction, oldSize);
1857 		}
1858 	} else
1859 		status = _ShrinkStream(transaction, size);
1860 
1861 	if (status < B_OK)
1862 		return status;
1863 
1864 	file_cache_set_size(FileCache(), size);
1865 	return WriteBack(transaction);
1866 }
1867 
1868 
1869 status_t
1870 Inode::Append(Transaction &transaction, off_t bytes)
1871 {
1872 	return SetFileSize(transaction, Size() + bytes);
1873 }
1874 
1875 
1876 /**	Checks wether or not this inode's data stream needs to be trimmed
1877  *	because of an earlier preallocation.
1878  *	Returns true if there are any blocks to be trimmed.
1879  */
1880 
1881 bool
1882 Inode::NeedsTrimming()
1883 {
1884 	// We never trim preallocated index blocks to make them grow as smooth as possible.
1885 	// There are only few indices anyway, so this doesn't hurt
1886 	// Also, if an inode is already in deleted state, we don't bother trimming it
1887 	if (IsIndex() || IsDeleted())
1888 		return false;
1889 
1890 	off_t roundedSize = (Size() + fVolume->BlockSize() - 1) & ~(fVolume->BlockSize() - 1);
1891 
1892 	return Node().data.MaxDirectRange() > roundedSize
1893 		|| Node().data.MaxIndirectRange() > roundedSize
1894 		|| Node().data.MaxDoubleIndirectRange() > roundedSize;
1895 }
1896 
1897 
1898 status_t
1899 Inode::TrimPreallocation(Transaction &transaction)
1900 {
1901 	status_t status = _ShrinkStream(transaction, Size());
1902 	if (status < B_OK)
1903 		return status;
1904 
1905 	return WriteBack(transaction);
1906 }
1907 
1908 
1909 status_t
1910 Inode::Free(Transaction &transaction)
1911 {
1912 	FUNCTION();
1913 
1914 	// Perhaps there should be an implementation of Inode::ShrinkStream() that
1915 	// just frees the data_stream, but doesn't change the inode (since it is
1916 	// freed anyway) - that would make an undelete command possible
1917 	status_t status = SetFileSize(transaction, 0);
1918 	if (status < B_OK)
1919 		return status;
1920 
1921 	// Free all attributes, and remove their indices
1922 	{
1923 		// We have to limit the scope of AttributeIterator, so that its
1924 		// destructor is not called after the inode is deleted
1925 		AttributeIterator iterator(this);
1926 
1927 		char name[B_FILE_NAME_LENGTH];
1928 		uint32 type;
1929 		size_t length;
1930 		vnode_id id;
1931 		while ((status = iterator.GetNext(name, &length, &type, &id)) == B_OK)
1932 			RemoveAttribute(transaction, name);
1933 	}
1934 
1935 	if (WriteBack(transaction) < B_OK)
1936 		return B_IO_ERROR;
1937 
1938 	return fVolume->Free(transaction, BlockRun());
1939 }
1940 
1941 
1942 status_t
1943 Inode::Sync()
1944 {
1945 	if (FileCache())
1946 		return file_cache_sync(FileCache());
1947 
1948 	// We may also want to flush the attribute's data stream to
1949 	// disk here... (do we?)
1950 
1951 	data_stream *data = &Node().data;
1952 	status_t status = B_OK;
1953 
1954 	// flush direct range
1955 
1956 	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
1957 		if (data->direct[i].IsZero())
1958 			return B_OK;
1959 
1960 //		status = flush_blocks(fVolume->Device(), fVolume->ToBlock(data->direct[i]),
1961 //			data->direct[i].Length());
1962 		if (status != B_OK)
1963 			return status;
1964 	}
1965 
1966 	// flush indirect range
1967 
1968 	if (data->max_indirect_range == 0)
1969 		return B_OK;
1970 
1971 	CachedBlock cached(fVolume);
1972 	off_t block = fVolume->ToBlock(data->indirect);
1973 	int32 count = fVolume->BlockSize() / sizeof(block_run);
1974 
1975 	for (int32 j = 0; j < data->indirect.Length(); j++) {
1976 		block_run *runs = (block_run *)cached.SetTo(block + j);
1977 		if (runs == NULL)
1978 			break;
1979 
1980 		for (int32 i = 0; i < count; i++) {
1981 			if (runs[i].IsZero())
1982 				return B_OK;
1983 
1984 //			status = flush_blocks(fVolume->Device(), fVolume->ToBlock(runs[i]), runs[i].Length());
1985 			if (status != B_OK)
1986 				return status;
1987 		}
1988 	}
1989 
1990 	// flush double indirect range
1991 
1992 	if (data->max_double_indirect_range == 0)
1993 		return B_OK;
1994 
1995 	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
1996 
1997 	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
1998 		block_run *indirectRuns = (block_run *)cached.SetTo(indirectBlock + l);
1999 		if (indirectRuns == NULL)
2000 			return B_FILE_ERROR;
2001 
2002 		CachedBlock directCached(fVolume);
2003 
2004 		for (int32 k = 0; k < count; k++) {
2005 			if (indirectRuns[k].IsZero())
2006 				return B_OK;
2007 
2008 			block = fVolume->ToBlock(indirectRuns[k]);
2009 			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
2010 				block_run *runs = (block_run *)directCached.SetTo(block + j);
2011 				if (runs == NULL)
2012 					return B_FILE_ERROR;
2013 
2014 				for (int32 i = 0; i < count; i++) {
2015 					if (runs[i].IsZero())
2016 						return B_OK;
2017 
2018 					// ToDo: combine single block_runs to bigger ones when
2019 					// they are adjacent
2020 //					status = flush_blocks(fVolume->Device(), fVolume->ToBlock(runs[i]),
2021 //						runs[i].Length());
2022 					if (status != B_OK)
2023 						return status;
2024 				}
2025 			}
2026 		}
2027 	}
2028 	return B_OK;
2029 }
2030 
2031 
2032 status_t
2033 Inode::Remove(Transaction &transaction, const char *name, off_t *_id, bool isDirectory)
2034 {
2035 	BPlusTree *tree;
2036 	if (GetTree(&tree) != B_OK)
2037 		RETURN_ERROR(B_BAD_VALUE);
2038 
2039 	RecursiveLocker locker(fVolume->Lock());
2040 
2041 	// does the file even exist?
2042 	off_t id;
2043 	if (tree->Find((uint8 *)name, (uint16)strlen(name), &id) < B_OK)
2044 		return B_ENTRY_NOT_FOUND;
2045 
2046 	if (_id)
2047 		*_id = id;
2048 
2049 	Vnode vnode(fVolume, id);
2050 	Inode *inode;
2051 	status_t status = vnode.Get(&inode);
2052 	if (status < B_OK) {
2053 		REPORT_ERROR(status);
2054 		return B_ENTRY_NOT_FOUND;
2055 	}
2056 
2057 	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
2058 	// bit is set for indices in BFS, not for attribute directories) - but you
2059 	// should really be able to do whatever you want with your indices
2060 	// without having to remove all files first :)
2061 	if (!inode->IsIndex()) {
2062 		// if it's not of the correct type, don't delete it!
2063 		if (inode->IsContainer() != isDirectory)
2064 			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
2065 
2066 		// only delete empty directories
2067 		if (isDirectory && !inode->IsEmpty())
2068 			return B_DIRECTORY_NOT_EMPTY;
2069 	}
2070 
2071 	// remove_vnode() allows the inode to be accessed until the last put_vnode()
2072 	status = remove_vnode(fVolume->ID(), id);
2073 	if (status != B_OK)
2074 		return status;
2075 
2076 	if (tree->Remove(transaction, name, id) < B_OK) {
2077 		unremove_vnode(fVolume->ID(), id);
2078 		RETURN_ERROR(B_ERROR);
2079 	}
2080 
2081 #ifdef DEBUG
2082 	if (tree->Find((uint8 *)name, (uint16)strlen(name), &id) == B_OK) {
2083 		DIE(("deleted entry still there"));
2084 	}
2085 #endif
2086 
2087 	// update the inode, so that no one will ever doubt it's deleted :-)
2088 	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
2089 	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
2090 
2091 	// In balance to the Inode::Create() method, the main indices
2092 	// are updated here (name, size, & last_modified)
2093 
2094 	Index index(fVolume);
2095 	if (inode->IsRegularNode()) {
2096 		index.RemoveName(transaction, name, inode);
2097 			// If removing from the index fails, it is not regarded as a
2098 			// fatal error and will not be reported back!
2099 			// Deleted inodes won't be visible in queries anyway.
2100 	}
2101 
2102 	if (inode->IsFile() || inode->IsSymLink()) {
2103 		if (inode->IsFile())
2104 			index.RemoveSize(transaction, inode);
2105 		index.RemoveLastModified(transaction, inode);
2106 	}
2107 
2108 	return inode->WriteBack(transaction);
2109 }
2110 
2111 
2112 /**	Creates the inode with the specified parent directory, and automatically
2113  *	adds the created inode to that parent directory. If an attribute directory
2114  *	is created, it will also automatically added to the parent inode as such.
2115  *	However, the indices root node, and the regular root node won't be added
2116  *	to the super block.
2117  *	It will also create the initial B+tree for the inode if it's a directory
2118  *	of any kind.
2119  *	If the "_id" or "_inode" variable is given and non-NULL to store the inode's
2120  *	ID, the inode stays locked - you have to call put_vnode() if you don't use it
2121  *	anymore.
2122  */
2123 
2124 status_t
2125 Inode::Create(Transaction &transaction, Inode *parent, const char *name, int32 mode,
2126 	int openMode, uint32 type, off_t *_id, Inode **_inode)
2127 {
2128 	FUNCTION_START(("name = %s, mode = %ld\n", name, mode));
2129 
2130 	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
2131 	Volume *volume = transaction.GetVolume();
2132 	BPlusTree *tree = NULL;
2133 
2134 	if (parent && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
2135 		// check if the file already exists in the directory
2136 		if (parent->GetTree(&tree) != B_OK)
2137 			RETURN_ERROR(B_BAD_VALUE);
2138 	}
2139 
2140 	WriteLocked locker(parent != NULL ? &parent->Lock() : NULL);
2141 		// the parent directory is locked during the whole inode creation
2142 
2143 	if (tree != NULL) {
2144 		// does the file already exist?
2145 		off_t offset;
2146 		if (tree->Find((uint8 *)name, (uint16)strlen(name), &offset) == B_OK) {
2147 			// return if the file should be a directory/link or opened in exclusive mode
2148 			if (S_ISDIR(mode) || S_ISLNK(mode) || openMode & O_EXCL)
2149 				return B_FILE_EXISTS;
2150 
2151 			Vnode vnode(volume, offset);
2152 			Inode *inode;
2153 			status_t status = vnode.Get(&inode);
2154 			if (status < B_OK) {
2155 				REPORT_ERROR(status);
2156 				return B_ENTRY_NOT_FOUND;
2157 			}
2158 
2159 			// if it's a directory, bail out!
2160 			if (inode->IsDirectory())
2161 				return B_IS_A_DIRECTORY;
2162 
2163 			// we want to open the file, so we should have the rights to do so
2164 			if (inode->CheckPermissions(openModeToAccess(openMode)) != B_OK)
2165 				return B_NOT_ALLOWED;
2166 
2167 			if (openMode & O_TRUNC) {
2168 				// we need write access in order to truncate the file
2169 				status = inode->CheckPermissions(W_OK);
2170 				if (status != B_OK)
2171 					return status;
2172 
2173 				// truncate the existing file
2174 				WriteLocked locked(inode->Lock());
2175 
2176 				status_t status = inode->SetFileSize(transaction, 0);
2177 				if (status >= B_OK)
2178 					status = inode->WriteBack(transaction);
2179 
2180 				if (status < B_OK)
2181 					return status;
2182 			}
2183 
2184 			if (_id)
2185 				*_id = offset;
2186 			if (_inode)
2187 				*_inode = inode;
2188 
2189 			// only keep the vnode in memory if the _id or _inode pointer is provided
2190 			if (_id != NULL || _inode != NULL)
2191 				vnode.Keep();
2192 
2193 			return B_OK;
2194 		}
2195 	} else if (parent && (mode & S_ATTR_DIR) == 0)
2196 		return B_BAD_VALUE;
2197 
2198 	status_t status;
2199 
2200 	// do we have the power to create new files at all?
2201 	if (parent != NULL && (status = parent->CheckPermissions(W_OK)) != B_OK)
2202 		RETURN_ERROR(status);
2203 
2204 	// allocate space for the new inode
2205 	InodeAllocator allocator(transaction);
2206 	block_run run;
2207 	Inode *inode;
2208 	status = allocator.New(&parentRun, mode, run, &inode);
2209 	if (status < B_OK)
2210 		return status;
2211 
2212 	// Initialize the parts of the bfs_inode structure that
2213 	// InodeAllocator::New() hasn't touched yet
2214 
2215 	bfs_inode *node = &inode->Node();
2216 
2217 	if (parent == NULL) {
2218 		// we set the parent to itself in this case
2219 		// (only happens for the root and indices node)
2220 		node->parent = run;
2221 	} else
2222 		node->parent = parentRun;
2223 
2224 	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
2225 	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent ? parent->Node().GroupID() : getegid());
2226 		// the group ID is inherited from the parent, if available
2227 
2228 	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
2229 
2230 	// only add the name to regular files, directories, or symlinks
2231 	// don't add it to attributes, or indices
2232 	if (tree && inode->IsRegularNode() && inode->SetName(transaction, name) < B_OK)
2233 		return B_ERROR;
2234 
2235 	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2236 	// a standard directory for files - not for attributes or indices)
2237 	if (inode->IsContainer()) {
2238 		status = allocator.CreateTree();
2239 		if (status < B_OK)
2240 			return status;
2241 	}
2242 
2243 	// Add a link to the inode from the parent, depending on its type
2244 	// (the vnode is not published yet, so it is safe to make the inode
2245 	// accessable to the file system here)
2246 	if (tree) {
2247 		status = tree->Insert(transaction, name, inode->ID());
2248 	} else if (parent && (mode & S_ATTR_DIR) != 0) {
2249 		parent->Attributes() = run;
2250 		status = parent->WriteBack(transaction);
2251 	}
2252 
2253 	// Note, we only care if the inode could be made accessable for the
2254 	// two cases above; the root node or the indices root node must
2255 	// handle this case on their own (or other cases where "parent" is
2256 	// NULL)
2257 	if (status < B_OK)
2258 		RETURN_ERROR(status);
2259 
2260 	// Update the main indices (name, size & last_modified)
2261 	// (live queries might want to access us after this)
2262 
2263 	Index index(volume);
2264 	if (inode->IsRegularNode() && name != NULL) {
2265 		// the name index only contains regular files
2266 		// (but not the root node where name == NULL)
2267 		status = index.InsertName(transaction, name, inode);
2268 		if (status < B_OK && status != B_BAD_INDEX) {
2269 			// We have to remove the node from the parent at this point,
2270 			// because the InodeAllocator destructor can't handle this
2271 			// case (and if it fails, we can't do anything about it...)
2272 			if (tree)
2273 				tree->Remove(transaction, name, inode->ID());
2274 			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2275 				parent->Node().attributes.SetTo(0, 0, 0);
2276 
2277 			RETURN_ERROR(status);
2278 		}
2279 	}
2280 
2281 	inode->UpdateOldLastModified();
2282 
2283 	// The "size" & "last_modified" indices don't contain directories
2284 	if (inode->IsFile() || inode->IsSymLink()) {
2285 		// if adding to these indices fails, the inode creation will not be harmed;
2286 		// they are considered less important than the "name" index
2287 		if (inode->IsFile())
2288 			index.InsertSize(transaction, inode);
2289 		index.InsertLastModified(transaction, inode);
2290 	}
2291 
2292 	// Everything worked well until this point, we have a fully
2293 	// initialized inode, and we want to keep it
2294 	allocator.Keep();
2295 
2296 	if (inode->IsFile() || inode->IsAttribute())
2297 		inode->SetFileCache(file_cache_create(volume->ID(), inode->ID(), inode->Size(), volume->Device()));
2298 
2299 	if (_id != NULL)
2300 		*_id = inode->ID();
2301 	if (_inode != NULL)
2302 		*_inode = inode;
2303 
2304 	// if either _id or _inode is passed, we will keep the inode locked
2305 	if (_id == NULL && _inode == NULL)
2306 		put_vnode(volume->ID(), inode->ID());
2307 
2308 	return B_OK;
2309 }
2310 
2311 
2312 //	#pragma mark -
2313 
2314 
2315 AttributeIterator::AttributeIterator(Inode *inode)
2316 	:
2317 	fCurrentSmallData(0),
2318 	fInode(inode),
2319 	fAttributes(NULL),
2320 	fIterator(NULL),
2321 	fBuffer(NULL)
2322 {
2323 	inode->_AddIterator(this);
2324 }
2325 
2326 
2327 AttributeIterator::~AttributeIterator()
2328 {
2329 	if (fAttributes)
2330 		put_vnode(fAttributes->GetVolume()->ID(), fAttributes->ID());
2331 
2332 	delete fIterator;
2333 	fInode->_RemoveIterator(this);
2334 }
2335 
2336 
2337 status_t
2338 AttributeIterator::Rewind()
2339 {
2340 	fCurrentSmallData = 0;
2341 
2342 	if (fIterator != NULL)
2343 		fIterator->Rewind();
2344 
2345 	return B_OK;
2346 }
2347 
2348 
2349 status_t
2350 AttributeIterator::GetNext(char *name, size_t *_length, uint32 *_type, vnode_id *_id)
2351 {
2352 	// read attributes out of the small data section
2353 
2354 	if (fCurrentSmallData >= 0) {
2355 		NodeGetter nodeGetter(fInode->GetVolume(), fInode);
2356 		const bfs_inode *node = nodeGetter.Node();
2357 		const small_data *item = ((bfs_inode *)node)->SmallDataStart();
2358 
2359 		fInode->SmallDataLock().Lock();
2360 
2361 		int32 i = 0;
2362 		for (;;item = item->Next()) {
2363 			if (item->IsLast(node))
2364 				break;
2365 
2366 			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2367 				&& *item->Name() == FILE_NAME_NAME)
2368 				continue;
2369 
2370 			if (i++ == fCurrentSmallData)
2371 				break;
2372 		}
2373 
2374 		if (!item->IsLast(node)) {
2375 			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2376 			*_type = item->Type();
2377 			*_length = item->NameSize();
2378 			*_id = (vnode_id)fCurrentSmallData;
2379 
2380 			fCurrentSmallData = i;
2381 		}
2382 		else {
2383 			// stop traversing the small_data section
2384 			fCurrentSmallData = -1;
2385 		}
2386 
2387 		fInode->SmallDataLock().Unlock();
2388 
2389 		if (fCurrentSmallData != -1)
2390 			return B_OK;
2391 	}
2392 
2393 	// read attributes out of the attribute directory
2394 
2395 	if (fInode->Attributes().IsZero())
2396 		return B_ENTRY_NOT_FOUND;
2397 
2398 	Volume *volume = fInode->GetVolume();
2399 
2400 	// if you haven't yet access to the attributes directory, get it
2401 	if (fAttributes == NULL) {
2402 		if (get_vnode(volume->ID(), volume->ToVnode(fInode->Attributes()),
2403 				(void **)&fAttributes) != B_OK) {
2404 			FATAL(("get_vnode() failed in AttributeIterator::GetNext(vnode_id = %Ld,name = \"%s\")\n",fInode->ID(),name));
2405 			return B_ENTRY_NOT_FOUND;
2406 		}
2407 
2408 		BPlusTree *tree;
2409 		if (fAttributes->GetTree(&tree) < B_OK
2410 			|| (fIterator = new TreeIterator(tree)) == NULL) {
2411 			FATAL(("could not get tree in AttributeIterator::GetNext(vnode_id = %Ld,name = \"%s\")\n",fInode->ID(),name));
2412 			return B_ENTRY_NOT_FOUND;
2413 		}
2414 	}
2415 
2416 	uint16 length;
2417 	vnode_id id;
2418 	status_t status = fIterator->GetNextEntry(name, &length, B_FILE_NAME_LENGTH, &id);
2419 	if (status < B_OK)
2420 		return status;
2421 
2422 	Vnode vnode(volume,id);
2423 	Inode *attribute;
2424 	if ((status = vnode.Get(&attribute)) == B_OK) {
2425 		*_type = attribute->Type();
2426 		*_length = attribute->Size();
2427 		*_id = id;
2428 	}
2429 
2430 	return status;
2431 }
2432 
2433 
2434 void
2435 AttributeIterator::Update(uint16 index, int8 change)
2436 {
2437 	// fCurrentSmallData points already to the next item
2438 	if (index < fCurrentSmallData)
2439 		fCurrentSmallData += change;
2440 }
2441 
2442