xref: /haiku/src/add-ons/kernel/file_systems/bfs/Inode.cpp (revision 81f5654c124bf46fba0fd251f208e2d88d81e1ce)
1 /* Inode - inode access functions
2 **
3 ** Initial version by Axel Dörfler, axeld@pinc-software.de
4 ** This file may be used under the terms of the OpenBeOS License.
5 */
6 
7 
8 #include "Debug.h"
9 #include "Inode.h"
10 #include "BPlusTree.h"
11 #include "Stream.h"
12 #include "Index.h"
13 
14 #include <util/kernel_cpp.h>
15 
16 #include <string.h>
17 #include <stdio.h>
18 
19 
20 class InodeAllocator {
21 	public:
22 		InodeAllocator(Transaction *transaction);
23 		~InodeAllocator();
24 
25 		status_t New(block_run *parentRun, mode_t mode, block_run &run, Inode **_inode);
26 		status_t CreateTree();
27 		status_t Keep();
28 
29 	private:
30 		Transaction *fTransaction;
31 		block_run fRun;
32 		Inode *fInode;
33 };
34 
35 
36 InodeAllocator::InodeAllocator(Transaction *transaction)
37 	:
38 	fTransaction(transaction),
39 	fInode(NULL)
40 {
41 }
42 
43 
44 InodeAllocator::~InodeAllocator()
45 {
46 	if (fTransaction != NULL) {
47 		if (fInode != NULL) {
48 			fInode->Node()->flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE | INODE_NOT_READY);
49 				// this unblocks any pending bfs_read_vnode() calls
50 			fInode->Free(fTransaction);
51 		} else
52 			fTransaction->GetVolume()->Free(fTransaction, fRun);
53 	}
54 
55 	delete fInode;
56 }
57 
58 
59 status_t
60 InodeAllocator::New(block_run *parentRun, mode_t mode, block_run &run, Inode **_inode)
61 {
62 	Volume *volume = fTransaction->GetVolume();
63 
64 	status_t status = volume->AllocateForInode(fTransaction, parentRun, mode, fRun);
65 	if (status < B_OK) {
66 		// don't free the space in the destructor, because
67 		// the allocation failed
68 		fTransaction = NULL;
69 		RETURN_ERROR(status);
70 	}
71 
72 	run = fRun;
73 	fInode = new Inode(volume, volume->ToVnode(run), true);
74 	if (fInode == NULL)
75 		RETURN_ERROR(B_NO_MEMORY);
76 
77 	// initialize the on-disk bfs_inode structure
78 
79 	bfs_inode *node = fInode->Node();
80 
81 	node->magic1 = HOST_ENDIAN_TO_BFS_INT32(INODE_MAGIC1);
82 	node->inode_num = run;
83 	node->mode = HOST_ENDIAN_TO_BFS_INT32(mode);
84 	node->flags = HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE | INODE_NOT_READY);
85 		// INODE_NOT_READY prevents the inode from being opened - it is
86 		// cleared in InodeAllocator::Keep()
87 	node->etc = (uint32)fInode;
88 		// this is temporarily set along INODE_NOT_READY and lets bfs_read_vnode()
89 		// find the associated Inode object
90 
91 	node->create_time = HOST_ENDIAN_TO_BFS_INT64((bigtime_t)time(NULL) << INODE_TIME_SHIFT);
92 	node->last_modified_time = HOST_ENDIAN_TO_BFS_INT64(node->create_time
93 		| (volume->GetUniqueID() & INODE_TIME_MASK));
94 		// we use Volume::GetUniqueID() to avoid having too many duplicates in the
95 		// last_modified index
96 
97 	node->inode_size = HOST_ENDIAN_TO_BFS_INT32(volume->InodeSize());
98 
99 	*_inode = fInode;
100 	return B_OK;
101 }
102 
103 
104 status_t
105 InodeAllocator::CreateTree()
106 {
107 	Volume *volume = fTransaction->GetVolume();
108 
109 	// force S_STR_INDEX to be set, if no type is set
110 	if ((fInode->Mode() & S_INDEX_TYPES) == 0)
111 		fInode->Node()->mode |= HOST_ENDIAN_TO_BFS_INT32(S_STR_INDEX);
112 
113 	BPlusTree *tree = fInode->fTree = new BPlusTree(fTransaction, fInode);
114 	if (tree == NULL || tree->InitCheck() < B_OK)
115 		return B_ERROR;
116 
117 	if (fInode->IsRegularNode()) {
118 		if (tree->Insert(fTransaction, ".", fInode->ID()) < B_OK
119 			|| tree->Insert(fTransaction, "..", volume->ToVnode(fInode->Parent())) < B_OK)
120 			return B_ERROR;
121 	}
122 	return B_OK;
123 }
124 
125 
126 status_t
127 InodeAllocator::Keep()
128 {
129 	ASSERT(fInode != NULL && fTransaction != NULL);
130 
131 	fInode->Node()->flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_NOT_READY);
132 	status_t status = fInode->WriteBack(fTransaction);
133 
134 	fTransaction = NULL;
135 	fInode = NULL;
136 
137 	return status;
138 }
139 
140 
141 //	#pragma mark -
142 
143 
144 status_t
145 bfs_inode::InitCheck(Volume *volume)
146 {
147 	if (Flags() & INODE_NOT_READY) {
148 		// the other fields may not yet contain valid values
149 		return B_BUSY;
150 	}
151 	if (Flags() & INODE_DELETED)
152 		return B_NOT_ALLOWED;
153 
154 	if (Magic1() != INODE_MAGIC1
155 		|| !(Flags() & INODE_IN_USE)
156 		|| inode_num.Length() != 1
157 		// matches inode size?
158 		|| (uint32)InodeSize() != volume->InodeSize()
159 		// parent resides on disk?
160 		|| parent.AllocationGroup() > int32(volume->AllocationGroups())
161 		|| parent.AllocationGroup() < 0
162 		|| parent.Start() > (1L << volume->AllocationGroupShift())
163 		|| parent.Length() != 1
164 		// attributes, too?
165 		|| attributes.AllocationGroup() > int32(volume->AllocationGroups())
166 		|| attributes.AllocationGroup() < 0
167 		|| attributes.Start() > (1L << volume->AllocationGroupShift()))
168 		RETURN_ERROR(B_BAD_DATA);
169 
170 	// ToDo: Add some tests to check the integrity of the other stuff here,
171 	// especially for the data_stream!
172 
173 	return B_OK;
174 }
175 
176 
177 //	#pragma mark -
178 
179 
180 Inode::Inode(Volume *volume, vnode_id id, bool empty, uint8 reenter)
181 	: CachedBlock(volume, volume->VnodeToBlock(id), empty),
182 	fTree(NULL),
183 	fLock()
184 {
185 	PRINT(("Inode::Inode(%p, %Ld, %s, %s) @ %p\n",
186 		volume, id, empty ? "empty" : "not-empty", reenter ? "reenter":"not-reenter", this));
187 
188 	Initialize();
189 }
190 
191 
192 Inode::Inode(CachedBlock *cached)
193 	: CachedBlock(cached),
194 	fTree(NULL),
195 	fLock()
196 {
197 	PRINT(("Inode::Inode(%p) @ %p\n", cached, this));
198 
199 	Initialize();
200 }
201 
202 
203 Inode::~Inode()
204 {
205 	PRINT(("Inode::~Inode() @ %p\n", this));
206 
207 	delete fTree;
208 }
209 
210 
211 void
212 Inode::Initialize()
213 {
214 	char lockName[32];
215 	sprintf(lockName, "bfs inode %ld.%d", BlockRun().AllocationGroup(), BlockRun().Start());
216 	fLock.Initialize(lockName);
217 
218 	Node()->flags &= HOST_ENDIAN_TO_BFS_INT32(INODE_PERMANENT_FLAGS);
219 
220 	// these two will help to maintain the indices
221 	fOldSize = Size();
222 	fOldLastModified = LastModified();
223 }
224 
225 
226 status_t
227 Inode::InitCheck(bool checkNode)
228 {
229 	if (!Node())
230 		RETURN_ERROR(B_IO_ERROR);
231 
232 	// test inode magic and flags
233 	if (checkNode) {
234 		status_t status = Node()->InitCheck(fVolume);
235 		if (status == B_BUSY)
236 			return B_BUSY;
237 
238 		if (status < B_OK) {
239 			FATAL(("inode at block %Ld corrupt!\n", fBlockNumber));
240 			RETURN_ERROR(B_BAD_DATA);
241 		}
242 	}
243 
244 	// it's more important to know that the inode is corrupt
245 	// so we check for the lock not until here
246 	return fLock.InitCheck();
247 }
248 
249 
250 status_t
251 Inode::CheckPermissions(int accessMode) const
252 {
253 	uid_t user = geteuid();
254 	gid_t group = getegid();
255 
256 	// you never have write access to a read-only volume
257 	if (accessMode & W_OK && fVolume->IsReadOnly())
258 		return B_READ_ONLY_DEVICE;
259 
260 	// root users always have full access (but they can't execute anything)
261 	if (user == 0 && !((accessMode & X_OK) && (Mode() & S_IXUSR) == 0))
262 		return B_OK;
263 
264 	// shift mode bits, to check directly against accessMode
265 	mode_t mode = Mode();
266 	if (user == (uid_t)Node()->UserID())
267 		mode >>= 6;
268 	else if (group == (gid_t)Node()->GroupID())
269 		mode >>= 3;
270 
271 	if (accessMode & ~(mode & S_IRWXO))
272 		return B_NOT_ALLOWED;
273 
274 	return B_OK;
275 }
276 
277 
278 //	#pragma mark -
279 
280 
281 void
282 Inode::AddIterator(AttributeIterator *iterator)
283 {
284 	if (fSmallDataLock.Lock() < B_OK)
285 		return;
286 
287 	fIterators.Add(iterator);
288 
289 	fSmallDataLock.Unlock();
290 }
291 
292 
293 void
294 Inode::RemoveIterator(AttributeIterator *iterator)
295 {
296 	if (fSmallDataLock.Lock() < B_OK)
297 		return;
298 
299 	fIterators.Remove(iterator);
300 
301 	fSmallDataLock.Unlock();
302 }
303 
304 
305 /**	Tries to free up "bytes" space in the small_data section by moving
306  *	attributes to real files. Used for system attributes like the name.
307  *	You need to hold the fSmallDataLock when you call this method
308  */
309 
310 status_t
311 Inode::MakeSpaceForSmallData(Transaction *transaction, const char *name, int32 bytes)
312 {
313 	ASSERT(fSmallDataLock.IsLocked());
314 
315 	while (bytes > 0) {
316 		small_data *item = Node()->SmallDataStart(), *max = NULL;
317 		int32 index = 0, maxIndex = 0;
318 		for (; !item->IsLast(Node()); item = item->Next(), index++) {
319 			// should not remove those
320 			if (*item->Name() == FILE_NAME_NAME || !strcmp(name, item->Name()))
321 				continue;
322 
323 			if (max == NULL || max->Size() < item->Size()) {
324 				maxIndex = index;
325 				max = item;
326 			}
327 
328 			// remove the first one large enough to free the needed amount of bytes
329 			if (bytes < (int32)item->Size())
330 				break;
331 		}
332 
333 		if (item->IsLast(Node()) || (int32)item->Size() < bytes)
334 			return B_ERROR;
335 
336 		bytes -= max->Size();
337 
338 		// Move the attribute to a real attribute file
339 		// Luckily, this doesn't cause any index updates
340 
341 		Inode *attribute;
342 		status_t status = CreateAttribute(transaction, item->Name(), item->Type(), &attribute);
343 		if (status < B_OK)
344 			RETURN_ERROR(status);
345 
346 		size_t length = item->DataSize();
347 		status = attribute->WriteAt(transaction, 0, item->Data(), &length);
348 
349 		ReleaseAttribute(attribute);
350 
351 		if (status < B_OK) {
352 			Vnode vnode(fVolume,Attributes());
353 			Inode *attributes;
354 			if (vnode.Get(&attributes) < B_OK
355 				|| attributes->Remove(transaction, name) < B_OK) {
356 				FATAL(("Could not remove newly created attribute!\n"));
357 			}
358 
359 			RETURN_ERROR(status);
360 		}
361 
362 		RemoveSmallData(max, maxIndex);
363 	}
364 	return B_OK;
365 }
366 
367 
368 /**	Private function which removes the given attribute from the small_data
369  *	section.
370  *	You need to hold the fSmallDataLock when you call this method
371  */
372 
373 status_t
374 Inode::RemoveSmallData(small_data *item, int32 index)
375 {
376 	ASSERT(fSmallDataLock.IsLocked());
377 
378 	small_data *next = item->Next();
379 	if (!next->IsLast(Node())) {
380 		// find the last attribute
381 		small_data *last = next;
382 		while (!last->IsLast(Node()))
383 			last = last->Next();
384 
385 		int32 size = (uint8 *)last - (uint8 *)next;
386 		if (size < 0 || size > (uint8 *)Node() + fVolume->BlockSize() - (uint8 *)next)
387 			return B_BAD_DATA;
388 
389 		memmove(item, next, size);
390 
391 		// Move the "last" one to its new location and
392 		// correctly terminate the small_data section
393 		last = (small_data *)((uint8 *)last - ((uint8 *)next - (uint8 *)item));
394 		memset(last, 0, (uint8 *)Node() + fVolume->BlockSize() - (uint8 *)last);
395 	} else
396 		memset(item, 0, item->Size());
397 
398 	// update all current iterators
399 	AttributeIterator *iterator = NULL;
400 	while ((iterator = fIterators.Next(iterator)) != NULL)
401 		iterator->Update(index, -1);
402 
403 	return B_OK;
404 }
405 
406 
407 /**	Removes the given attribute from the small_data section.
408  *	Note that you need to write back the inode yourself after having called
409  *	that method.
410  */
411 
412 status_t
413 Inode::RemoveSmallData(Transaction *transaction, const char *name)
414 {
415 	if (name == NULL)
416 		return B_BAD_VALUE;
417 
418 	SimpleLocker locker(fSmallDataLock);
419 
420 	// search for the small_data item
421 
422 	small_data *item = Node()->SmallDataStart();
423 	int32 index = 0;
424 	while (!item->IsLast(Node()) && strcmp(item->Name(), name)) {
425 		item = item->Next();
426 		index++;
427 	}
428 
429 	if (item->IsLast(Node()))
430 		return B_ENTRY_NOT_FOUND;
431 
432 	return RemoveSmallData(item, index);
433 }
434 
435 
436 /**	Try to place the given attribute in the small_data section - if the
437  *	new attribute is too big to fit in that section, it returns B_DEVICE_FULL.
438  *	In that case, the attribute should be written to a real attribute file;
439  *	if the attribute was already part of the small_data section, but the new
440  *	one wouldn't fit, the old one is automatically removed from the small_data
441  *	section.
442  *	Note that you need to write back the inode yourself after having called that
443  *	method - it's a bad API decision that it needs a transaction but enforces you
444  *	to write back the inode all by yourself, but it's just more efficient in most
445  *	cases...
446  */
447 
448 status_t
449 Inode::AddSmallData(Transaction *transaction, const char *name, uint32 type,
450 	const uint8 *data, size_t length, bool force)
451 {
452 	if (name == NULL || data == NULL || type == 0)
453 		return B_BAD_VALUE;
454 
455 	// reject any requests that can't fit into the small_data section
456 	uint32 nameLength = strlen(name);
457 	uint32 spaceNeeded = sizeof(small_data) + nameLength + 3 + length + 1;
458 	if (spaceNeeded > fVolume->InodeSize() - sizeof(bfs_inode))
459 		return B_DEVICE_FULL;
460 
461 	SimpleLocker locker(fSmallDataLock);
462 
463 	small_data *item = Node()->SmallDataStart();
464 	int32 index = 0;
465 	while (!item->IsLast(Node()) && strcmp(item->Name(), name)) {
466 		item = item->Next();
467 		index++;
468 	}
469 
470 	// is the attribute already in the small_data section?
471 	// then just replace the data part of that one
472 	if (!item->IsLast(Node())) {
473 		// find last attribute
474 		small_data *last = item;
475 		while (!last->IsLast(Node()))
476 			last = last->Next();
477 
478 		// try to change the attributes value
479 		if (item->data_size > length
480 			|| force
481 			|| ((uint8 *)last + length - item->DataSize()) <= ((uint8 *)Node() + fVolume->InodeSize())) {
482 			// make room for the new attribute if needed (and we are forced to do so)
483 			if (force
484 				&& ((uint8 *)last + length - item->DataSize()) > ((uint8 *)Node() + fVolume->InodeSize())) {
485 				// We also take the free space at the end of the small_data section
486 				// into account, and request only what's really needed
487 				uint32 needed = length - item->DataSize() -
488 						(uint32)((uint8 *)Node() + fVolume->InodeSize() - (uint8 *)last);
489 
490 				if (MakeSpaceForSmallData(transaction, name, needed) < B_OK)
491 					return B_ERROR;
492 
493 				// reset our pointers
494 				item = Node()->SmallDataStart();
495 				index = 0;
496 				while (!item->IsLast(Node()) && strcmp(item->Name(), name)) {
497 					item = item->Next();
498 					index++;
499 				}
500 
501 				last = item;
502 				while (!last->IsLast(Node()))
503 					last = last->Next();
504 			}
505 
506 			// move the attributes after the current one
507 			small_data *next = item->Next();
508 			if (!next->IsLast(Node()))
509 				memmove((uint8 *)item + spaceNeeded, next, (uint8 *)last - (uint8 *)next);
510 
511 			// Move the "last" one to its new location and
512 			// correctly terminate the small_data section
513 			last = (small_data *)((uint8 *)last - ((uint8 *)next - ((uint8 *)item + spaceNeeded)));
514 			if ((uint8 *)last < (uint8 *)Node() + fVolume->BlockSize())
515 				memset(last, 0, (uint8 *)Node() + fVolume->BlockSize() - (uint8 *)last);
516 
517 			item->type = HOST_ENDIAN_TO_BFS_INT32(type);
518 			item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
519 			memcpy(item->Data(), data, length);
520 			item->Data()[length] = '\0';
521 
522 			return B_OK;
523 		}
524 
525 		// Could not replace the old attribute, so remove it to let
526 		// let the calling function create an attribute file for it
527 		if (RemoveSmallData(item, index) < B_OK)
528 			return B_ERROR;
529 
530 		return B_DEVICE_FULL;
531 	}
532 
533 	// try to add the new attribute!
534 
535 	if ((uint8 *)item + spaceNeeded > (uint8 *)Node() + fVolume->InodeSize()) {
536 		// there is not enough space for it!
537 		if (!force)
538 			return B_DEVICE_FULL;
539 
540 		// make room for the new attribute
541 		if (MakeSpaceForSmallData(transaction, name, spaceNeeded) < B_OK)
542 			return B_ERROR;
543 
544 		// get new last item!
545 		item = Node()->SmallDataStart();
546 		index = 0;
547 		while (!item->IsLast(Node())) {
548 			item = item->Next();
549 			index++;
550 		}
551 	}
552 
553 	memset(item, 0, spaceNeeded);
554 	item->type = HOST_ENDIAN_TO_BFS_INT32(type);
555 	item->name_size = HOST_ENDIAN_TO_BFS_INT16(nameLength);
556 	item->data_size = HOST_ENDIAN_TO_BFS_INT16(length);
557 	strcpy(item->Name(), name);
558 	memcpy(item->Data(), data, length);
559 
560 	// correctly terminate the small_data section
561 	item = item->Next();
562 	if (!item->IsLast(Node()))
563 		memset(item, 0, (uint8 *)Node() + fVolume->InodeSize() - (uint8 *)item);
564 
565 	// update all current iterators
566 	AttributeIterator *iterator = NULL;
567 	while ((iterator = fIterators.Next(iterator)) != NULL)
568 		iterator->Update(index, 1);
569 
570 	return B_OK;
571 }
572 
573 
574 /**	Iterates through the small_data section of an inode.
575  *	To start at the beginning of this section, you let smallData
576  *	point to NULL, like:
577  *		small_data *data = NULL;
578  *		while (inode->GetNextSmallData(&data) { ... }
579  *
580  *	This function is reentrant and doesn't allocate any memory;
581  *	you can safely stop calling it at any point (you don't need
582  *	to iterate through the whole list).
583  *	You need to hold the fSmallDataLock when you call this method
584  */
585 
586 status_t
587 Inode::GetNextSmallData(small_data **_smallData) const
588 {
589 	if (!Node())
590 		RETURN_ERROR(B_ERROR);
591 
592 	ASSERT(fSmallDataLock.IsLocked());
593 
594 	small_data *data = *_smallData;
595 
596 	// begin from the start?
597 	if (data == NULL)
598 		data = Node()->SmallDataStart();
599 	else
600 		data = data->Next();
601 
602 	// is already last item?
603 	if (data->IsLast(Node()))
604 		return B_ENTRY_NOT_FOUND;
605 
606 	*_smallData = data;
607 
608 	return B_OK;
609 }
610 
611 
612 /**	Finds the attribute "name" in the small data section, and
613  *	returns a pointer to it (or NULL if it doesn't exist).
614  *	You need to hold the fSmallDataLock when you call this method
615  */
616 
617 small_data *
618 Inode::FindSmallData(const char *name) const
619 {
620 	ASSERT(fSmallDataLock.IsLocked());
621 
622 	small_data *smallData = NULL;
623 	while (GetNextSmallData(&smallData) == B_OK) {
624 		if (!strcmp(smallData->Name(), name))
625 			return smallData;
626 	}
627 	return NULL;
628 }
629 
630 
631 /** Returns a pointer to the node's name if present in the small data
632  *	section, NULL otherwise.
633  *	You need to hold the fSmallDataLock when you call this method
634  */
635 
636 const char *
637 Inode::Name() const
638 {
639 	ASSERT(fSmallDataLock.IsLocked());
640 
641 	small_data *smallData = NULL;
642 	while (GetNextSmallData(&smallData) == B_OK) {
643 		if (*smallData->Name() == FILE_NAME_NAME && smallData->NameSize() == FILE_NAME_NAME_LENGTH)
644 			return (const char *)smallData->Data();
645 	}
646 	return NULL;
647 }
648 
649 
650 /** Copies the node's name into the provided buffer.
651  *	The buffer must be B_FILE_NAME_LENGTH bytes large.
652  */
653 
654 status_t
655 Inode::GetName(char *buffer) const
656 {
657 	SimpleLocker locker(fSmallDataLock);
658 
659 	const char *name = Name();
660 	if (name == NULL)
661 		return B_ENTRY_NOT_FOUND;
662 
663 	strlcpy(buffer, name, B_FILE_NAME_LENGTH);
664 	return B_OK;
665 }
666 
667 
668 /**	Changes or set the name of a file: in the inode small_data section only, it
669  *	doesn't change it in the parent directory's b+tree.
670  *	Note that you need to write back the inode yourself after having called
671  *	that method. It suffers from the same API decision as AddSmallData() does
672  *	(and for the same reason).
673  */
674 
675 status_t
676 Inode::SetName(Transaction *transaction, const char *name)
677 {
678 	if (name == NULL || *name == '\0')
679 		return B_BAD_VALUE;
680 
681 	const char nameTag[2] = {FILE_NAME_NAME, 0};
682 
683 	return AddSmallData(transaction, nameTag, FILE_NAME_TYPE, (uint8 *)name, strlen(name), true);
684 }
685 
686 
687 /**	Reads data from the specified attribute.
688  *	This is a high-level attribute function that understands attributes
689  *	in the small_data section as well as real attribute files.
690  */
691 
692 status_t
693 Inode::ReadAttribute(const char *name, int32 type, off_t pos, uint8 *buffer, size_t *_length)
694 {
695 	if (pos < 0)
696 		pos = 0;
697 
698 	// search in the small_data section (which has to be locked first)
699 	{
700 		SimpleLocker locker(fSmallDataLock);
701 
702 		small_data *smallData = FindSmallData(name);
703 		if (smallData != NULL) {
704 			size_t length = *_length;
705 			if (pos >= smallData->data_size) {
706 				*_length = 0;
707 				return B_OK;
708 			}
709 			if (length + pos > smallData->DataSize())
710 				length = smallData->DataSize() - pos;
711 
712 			memcpy(buffer, smallData->Data() + pos, length);
713 			*_length = length;
714 			return B_OK;
715 		}
716 	}
717 
718 	// search in the attribute directory
719 	Inode *attribute;
720 	status_t status = GetAttribute(name, &attribute);
721 	if (status == B_OK) {
722 		if (attribute->Lock().Lock() == B_OK) {
723 			status = attribute->ReadAt(pos, (uint8 *)buffer, _length);
724 			attribute->Lock().Unlock();
725 		} else
726 			status = B_ERROR;
727 
728 		ReleaseAttribute(attribute);
729 	}
730 
731 	RETURN_ERROR(status);
732 }
733 
734 
735 /**	Writes data to the specified attribute.
736  *	This is a high-level attribute function that understands attributes
737  *	in the small_data section as well as real attribute files.
738  */
739 
740 status_t
741 Inode::WriteAttribute(Transaction *transaction, const char *name, int32 type, off_t pos,
742 	const uint8 *buffer, size_t *_length)
743 {
744 	// needed to maintain the index
745 	uint8 oldBuffer[BPLUSTREE_MAX_KEY_LENGTH], *oldData = NULL;
746 	size_t oldLength = 0;
747 
748 	// ToDo: we actually depend on that the contents of "buffer" are constant.
749 	// If they get changed during the write (hey, user programs), we may mess
750 	// up our index trees!
751 
752 	Index index(fVolume);
753 	index.SetTo(name);
754 
755 	Inode *attribute = NULL;
756 	status_t status = B_OK;
757 
758 	if (GetAttribute(name, &attribute) < B_OK) {
759 		// save the old attribute data
760 		fSmallDataLock.Lock();
761 
762 		small_data *smallData = FindSmallData(name);
763 		if (smallData != NULL) {
764 			oldLength = smallData->DataSize();
765 			if (oldLength > BPLUSTREE_MAX_KEY_LENGTH)
766 				oldLength = BPLUSTREE_MAX_KEY_LENGTH;
767 			memcpy(oldData = oldBuffer, smallData->Data(), oldLength);
768 		}
769 		fSmallDataLock.Unlock();
770 
771 		// if the attribute doesn't exist yet (as a file), try to put it in the
772 		// small_data section first - if that fails (due to insufficent space),
773 		// create a real attribute file
774 		status = AddSmallData(transaction, name, type, buffer, *_length);
775 		if (status == B_DEVICE_FULL) {
776 			status = CreateAttribute(transaction, name, type, &attribute);
777 			if (status < B_OK)
778 				RETURN_ERROR(status);
779 		} else if (status == B_OK)
780 			status = WriteBack(transaction);
781 	}
782 
783 	if (attribute != NULL) {
784 		if (attribute->Lock().LockWrite() == B_OK) {
785 			// save the old attribute data (if this fails, oldLength will reflect it)
786 			if (fVolume->CheckForLiveQuery(name)) {
787 				oldLength = BPLUSTREE_MAX_KEY_LENGTH;
788 				if (attribute->ReadAt(0, oldBuffer, &oldLength) == B_OK)
789 					oldData = oldBuffer;
790 			}
791 			// ToDo: check if the data fits in the inode now and delete the attribute file if so
792 			status = attribute->WriteAt(transaction, pos, buffer, _length);
793 
794 			attribute->Lock().UnlockWrite();
795 		} else
796 			status = B_ERROR;
797 
798 		ReleaseAttribute(attribute);
799 	}
800 
801 	// ToDo: find a better way than this "pos" thing (the begin of the old key
802 	//	must be copied to the start of the new one for a comparison)
803 	if (status == B_OK && pos == 0) {
804 		// index only the first BPLUSTREE_MAX_KEY_LENGTH bytes
805 		uint16 length = *_length;
806 		if (length > BPLUSTREE_MAX_KEY_LENGTH)
807 			length = BPLUSTREE_MAX_KEY_LENGTH;
808 
809 		// Update index. Note, Index::Update() may be called even if initializing
810 		// the index failed - it will just update the live queries in this case
811 		if (pos < length || pos < oldLength)
812 			index.Update(transaction, name, type, oldData, oldLength, buffer, length, this);
813 	}
814 	return status;
815 }
816 
817 
818 /**	Removes the specified attribute from the inode.
819  *	This is a high-level attribute function that understands attributes
820  *	in the small_data section as well as real attribute files.
821  */
822 
823 status_t
824 Inode::RemoveAttribute(Transaction *transaction, const char *name)
825 {
826 	Index index(fVolume);
827 	bool hasIndex = index.SetTo(name) == B_OK;
828 
829 	// update index for attributes in the small_data section
830 	{
831 		fSmallDataLock.Lock();
832 
833 		small_data *smallData = FindSmallData(name);
834 		if (smallData != NULL) {
835 			uint32 length = smallData->DataSize();
836 			if (length > BPLUSTREE_MAX_KEY_LENGTH)
837 				length = BPLUSTREE_MAX_KEY_LENGTH;
838 			index.Update(transaction, name, smallData->Type(), smallData->Data(), length, NULL, 0, this);
839 		}
840 		fSmallDataLock.Unlock();
841 	}
842 
843 	status_t status = RemoveSmallData(transaction, name);
844 	if (status == B_OK) {
845 		status = WriteBack(transaction);
846 	} else if (status == B_ENTRY_NOT_FOUND && !Attributes().IsZero()) {
847 		// remove the attribute file if it exists
848 		Vnode vnode(fVolume, Attributes());
849 		Inode *attributes;
850 		if ((status = vnode.Get(&attributes)) < B_OK)
851 			return status;
852 
853 		// update index
854 		Inode *attribute;
855 		if ((hasIndex || fVolume->CheckForLiveQuery(name))
856 			&& GetAttribute(name, &attribute) == B_OK) {
857 			uint8 data[BPLUSTREE_MAX_KEY_LENGTH];
858 			size_t length = BPLUSTREE_MAX_KEY_LENGTH;
859 			if (attribute->ReadAt(0, data, &length) == B_OK)
860 				index.Update(transaction, name, attribute->Type(), data, length, NULL, 0, this);
861 
862 			ReleaseAttribute(attribute);
863 		}
864 
865 		if ((status = attributes->Remove(transaction, name)) < B_OK)
866 			return status;
867 
868 		if (attributes->IsEmpty()) {
869 			// remove attribute directory (don't fail if that can't be done)
870 			if (remove_vnode(fVolume->ID(), attributes->ID()) == B_OK) {
871 				// update the inode, so that no one will ever doubt it's deleted :-)
872 				attributes->Node()->flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
873 				if (attributes->WriteBack(transaction) == B_OK) {
874 					Attributes().SetTo(0, 0, 0);
875 					WriteBack(transaction);
876 				} else
877 					unremove_vnode(fVolume->ID(), attributes->ID());
878 			}
879 		}
880 	}
881 	return status;
882 }
883 
884 
885 status_t
886 Inode::GetAttribute(const char *name, Inode **attribute)
887 {
888 	// does this inode even have attributes?
889 	if (Attributes().IsZero())
890 		return B_ENTRY_NOT_FOUND;
891 
892 	Vnode vnode(fVolume, Attributes());
893 	Inode *attributes;
894 	if (vnode.Get(&attributes) < B_OK) {
895 		FATAL(("get_vnode() failed in Inode::GetAttribute(name = \"%s\")\n", name));
896 		return B_ERROR;
897 	}
898 
899 	BPlusTree *tree;
900 	status_t status = attributes->GetTree(&tree);
901 	if (status == B_OK) {
902 		vnode_id id;
903 		if ((status = tree->Find((uint8 *)name, (uint16)strlen(name), &id)) == B_OK) {
904 			Vnode vnode(fVolume, id);
905 			// Check if the attribute is really an attribute
906 			if (vnode.Get(attribute) < B_OK
907 				|| !(*attribute)->IsAttribute())
908 				return B_ERROR;
909 
910 			vnode.Keep();
911 			return B_OK;
912 		}
913 	}
914 	return status;
915 }
916 
917 
918 void
919 Inode::ReleaseAttribute(Inode *attribute)
920 {
921 	if (attribute == NULL)
922 		return;
923 
924 	put_vnode(fVolume->ID(), attribute->ID());
925 }
926 
927 
928 status_t
929 Inode::CreateAttribute(Transaction *transaction, const char *name, uint32 type, Inode **attribute)
930 {
931 	// do we need to create the attribute directory first?
932 	if (Attributes().IsZero()) {
933 		status_t status = Inode::Create(transaction, this, NULL, S_ATTR_DIR | 0666, 0, 0, NULL);
934 		if (status < B_OK)
935 			RETURN_ERROR(status);
936 	}
937 	Vnode vnode(fVolume, Attributes());
938 	Inode *attributes;
939 	if (vnode.Get(&attributes) < B_OK)
940 		return B_ERROR;
941 
942 	// Inode::Create() locks the inode for us
943 	return Inode::Create(transaction, attributes, name, S_ATTR | 0666, 0, type, NULL, attribute);
944 }
945 
946 
947 //	#pragma mark -
948 
949 
950 /**	Gives the caller direct access to the b+tree for a given directory.
951  *	The tree is created on demand, but lasts until the inode is
952  *	deleted.
953  */
954 
955 status_t
956 Inode::GetTree(BPlusTree **tree)
957 {
958 	if (fTree) {
959 		*tree = fTree;
960 		return B_OK;
961 	}
962 
963 	if (IsContainer()) {
964 		fTree = new BPlusTree(this);
965 		if (!fTree)
966 			RETURN_ERROR(B_NO_MEMORY);
967 
968 		*tree = fTree;
969 		status_t status = fTree->InitCheck();
970 		if (status < B_OK) {
971 			delete fTree;
972 			fTree = NULL;
973 		}
974 		RETURN_ERROR(status);
975 	}
976 	RETURN_ERROR(B_BAD_VALUE);
977 }
978 
979 
980 bool
981 Inode::IsEmpty()
982 {
983 	BPlusTree *tree;
984 	status_t status = GetTree(&tree);
985 	if (status < B_OK)
986 		return status;
987 
988 	TreeIterator iterator(tree);
989 
990 	// index and attribute directories are really empty when they are
991 	// empty - directories for standard files always contain ".", and
992 	// "..", so we need to ignore those two
993 
994 	uint32 count = 0;
995 	char name[BPLUSTREE_MAX_KEY_LENGTH];
996 	uint16 length;
997 	vnode_id id;
998 	while (iterator.GetNextEntry(name, &length, B_FILE_NAME_LENGTH, &id) == B_OK) {
999 		if (Mode() & (S_ATTR_DIR | S_INDEX_DIR))
1000 			return false;
1001 
1002 		if (++count > 2 || strcmp(".", name) && strcmp("..", name))
1003 			return false;
1004 	}
1005 	return true;
1006 }
1007 
1008 
1009 /** Finds the block_run where "pos" is located in the data_stream of
1010  *	the inode.
1011  *	If successful, "offset" will then be set to the file offset
1012  *	of the block_run returned; so "pos - offset" is for the block_run
1013  *	what "pos" is for the whole stream.
1014  *	The caller has to make sure that "pos" is inside the stream.
1015  */
1016 
1017 status_t
1018 Inode::FindBlockRun(off_t pos, block_run &run, off_t &offset)
1019 {
1020 	// The BPlusTree class will call this function, we'll provide
1021 	// standard cached access only from here
1022 	return ((Stream<Access::Cached> *)this)->FindBlockRun(pos, run, offset);
1023 }
1024 
1025 
1026 status_t
1027 Inode::ReadAt(off_t pos, uint8 *buffer, size_t *_length)
1028 {
1029 	// call the right ReadAt() method, depending on the inode flags
1030 
1031 	if (Flags() & INODE_NO_CACHE)
1032 		return ((Stream<Access::Uncached> *)this)->ReadAt(pos, buffer, _length);
1033 
1034 	if (Flags() & INODE_LOGGED)
1035 		return ((Stream<Access::Logged> *)this)->ReadAt(pos, buffer, _length);
1036 
1037 	return ((Stream<Access::Cached> *)this)->ReadAt(pos, buffer, _length);
1038 }
1039 
1040 
1041 status_t
1042 Inode::WriteAt(Transaction *transaction, off_t pos, const uint8 *buffer, size_t *_length)
1043 {
1044 	// call the right WriteAt() method, depending on the inode flags
1045 
1046 	// update the last modification time in memory, it will be written
1047 	// back to the inode, and the index when the file is closed
1048 	// ToDo: should update the internal last modified time only at this point!
1049 	Node()->last_modified_time = (bigtime_t)time(NULL) << INODE_TIME_SHIFT;
1050 
1051 	if (Flags() & INODE_NO_CACHE)
1052 		return ((Stream<Access::Uncached> *)this)->WriteAt(transaction, pos, buffer, _length);
1053 
1054 	if (Flags() & INODE_LOGGED)
1055 		return ((Stream<Access::Logged> *)this)->WriteAt(transaction, pos, buffer, _length);
1056 
1057 	return ((Stream<Access::Cached> *)this)->WriteAt(transaction, pos, buffer, _length);
1058 }
1059 
1060 
1061 /**	Fills the gap between the old file size and the new file size
1062  *	with zeros.
1063  *	It's more or less a copy of Inode::WriteAt() but it can handle
1064  *	length differences of more than just 4 GB, and it never uses
1065  *	the log, even if the INODE_LOGGED flag is set.
1066  */
1067 
1068 status_t
1069 Inode::FillGapWithZeros(off_t pos, off_t newSize)
1070 {
1071 	// ToDo: we currently do anything here, same as original BFS!
1072 	//if (pos >= newSize)
1073 		return B_OK;
1074 
1075 	block_run run;
1076 	off_t offset;
1077 	if (FindBlockRun(pos, run, offset) < B_OK)
1078 		RETURN_ERROR(B_BAD_VALUE);
1079 
1080 	off_t length = newSize - pos;
1081 	uint32 bytesWritten = 0;
1082 	uint32 blockSize = fVolume->BlockSize();
1083 	uint32 blockShift = fVolume->BlockShift();
1084 	uint8 *block;
1085 
1086 	// the first block_run we write could not be aligned to the block_size boundary
1087 	// (write partial block at the beginning)
1088 
1089 	// pos % block_size == (pos - offset) % block_size, offset % block_size == 0
1090 	if (pos % blockSize != 0) {
1091 		run.start += (pos - offset) / blockSize;
1092 		run.length -= (pos - offset) / blockSize;
1093 
1094 		CachedBlock cached(fVolume,run);
1095 		if ((block = cached.Block()) == NULL)
1096 			RETURN_ERROR(B_BAD_VALUE);
1097 
1098 		bytesWritten = blockSize - (pos % blockSize);
1099 		if (length < bytesWritten)
1100 			bytesWritten = length;
1101 
1102 		memset(block + (pos % blockSize), 0, bytesWritten);
1103 		if (fVolume->WriteBlocks(cached.BlockNumber(), block, 1) < B_OK)
1104 			RETURN_ERROR(B_IO_ERROR);
1105 
1106 		pos += bytesWritten;
1107 
1108 		length -= bytesWritten;
1109 		if (length == 0)
1110 			return B_OK;
1111 
1112 		if (FindBlockRun(pos, run, offset) < B_OK)
1113 			RETURN_ERROR(B_BAD_VALUE);
1114 	}
1115 
1116 	while (length > 0) {
1117 		// offset is the offset to the current pos in the block_run
1118 		run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start() + ((pos - offset) >> blockShift));
1119 		run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - ((pos - offset) >> blockShift));
1120 
1121 		CachedBlock cached(fVolume);
1122 		off_t blockNumber = fVolume->ToBlock(run);
1123 		for (int32 i = 0; i < run.Length(); i++) {
1124 			if ((block = cached.SetTo(blockNumber + i, true)) == NULL)
1125 				RETURN_ERROR(B_IO_ERROR);
1126 
1127 			if (fVolume->WriteBlocks(cached.BlockNumber(), block, 1) < B_OK)
1128 				RETURN_ERROR(B_IO_ERROR);
1129 		}
1130 
1131 		int32 bytes = run.Length() << blockShift;
1132 		length -= bytes;
1133 		bytesWritten += bytes;
1134 
1135 		// since we don't respect a last partial block, length can be lower
1136 		if (length <= 0)
1137 			break;
1138 
1139 		pos += bytes;
1140 
1141 		if (FindBlockRun(pos, run, offset) < B_OK)
1142 			RETURN_ERROR(B_BAD_VALUE);
1143 	}
1144 	return B_OK;
1145 }
1146 
1147 
1148 /** Allocates NUM_ARRAY_BLOCKS blocks, and clears their contents. Growing
1149  *	the indirect and double indirect range uses this method.
1150  *	The allocated block_run is saved in "run"
1151  */
1152 
1153 status_t
1154 Inode::AllocateBlockArray(Transaction *transaction, block_run &run)
1155 {
1156 	if (!run.IsZero())
1157 		return B_BAD_VALUE;
1158 
1159 	status_t status = fVolume->Allocate(transaction, this, NUM_ARRAY_BLOCKS, run, NUM_ARRAY_BLOCKS);
1160 	if (status < B_OK)
1161 		return status;
1162 
1163 	// make sure those blocks are empty
1164 	CachedBlock cached(fVolume);
1165 	off_t block = fVolume->ToBlock(run);
1166 
1167 	for (int32 i = 0; i < run.Length(); i++) {
1168 		block_run *runs = (block_run *)cached.SetTo(block + i, true);
1169 		if (runs == NULL)
1170 			return B_IO_ERROR;
1171 
1172 		if (cached.WriteBack(transaction) < B_OK)
1173 			return B_IO_ERROR;
1174 	}
1175 	return B_OK;
1176 }
1177 
1178 
1179 status_t
1180 Inode::GrowStream(Transaction *transaction, off_t size)
1181 {
1182 	data_stream *data = &Node()->data;
1183 
1184 	// is the data stream already large enough to hold the new size?
1185 	// (can be the case with preallocated blocks)
1186 	if (size < data->MaxDirectRange()
1187 		|| size < data->MaxIndirectRange()
1188 		|| size < data->MaxDoubleIndirectRange()) {
1189 		data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1190 		return B_OK;
1191 	}
1192 
1193 	// how many bytes are still needed? (unused ranges are always zero)
1194 	uint16 minimum = 1;
1195 	off_t bytes;
1196 	if (data->Size() < data->MaxDoubleIndirectRange()) {
1197 		bytes = size - data->MaxDoubleIndirectRange();
1198 		// the double indirect range can only handle multiple of NUM_ARRAY_BLOCKS
1199 		minimum = NUM_ARRAY_BLOCKS;
1200 	} else if (data->Size() < data->MaxIndirectRange())
1201 		bytes = size - data->MaxIndirectRange();
1202 	else if (data->Size() < data->MaxDirectRange())
1203 		bytes = size - data->MaxDirectRange();
1204 	else
1205 		bytes = size - data->Size();
1206 
1207 	// do we have enough free blocks on the disk?
1208 	off_t blocksRequested = (bytes + fVolume->BlockSize() - 1) >> fVolume->BlockShift();
1209 	if (blocksRequested > fVolume->FreeBlocks())
1210 		return B_DEVICE_FULL;
1211 
1212 	off_t blocksNeeded = blocksRequested;
1213 		// because of preallocations and partial allocations, the number of
1214 		// blocks we need to allocate may be different from the one we request
1215 		// from the block allocator
1216 
1217 	// should we preallocate some blocks (currently, always 64k)?
1218 	if (blocksRequested < (65536 >> fVolume->BlockShift()) && fVolume->FreeBlocks() > 128)
1219 		blocksRequested = 65536 >> fVolume->BlockShift();
1220 
1221 	while (blocksNeeded > 0) {
1222 		// the requested blocks do not need to be returned with a
1223 		// single allocation, so we need to iterate until we have
1224 		// enough blocks allocated
1225 		block_run run;
1226 		status_t status = fVolume->Allocate(transaction, this, blocksRequested, run, minimum);
1227 		if (status < B_OK)
1228 			return status;
1229 
1230 		// okay, we have the needed blocks, so just distribute them to the
1231 		// different ranges of the stream (direct, indirect & double indirect)
1232 
1233 		// ToDo: if anything goes wrong here, we probably want to free the
1234 		// blocks that couldn't be distributed into the stream!
1235 
1236 		blocksNeeded -= run.Length();
1237 		// don't preallocate if the first allocation was already too small
1238 		blocksRequested = blocksNeeded;
1239 		if (minimum > 1) {
1240 			// make sure that "blocks" is a multiple of minimum
1241 			blocksRequested = (blocksRequested + minimum - 1) & ~(minimum - 1);
1242 		}
1243 
1244 		// Direct block range
1245 
1246 		if (data->Size() <= data->MaxDirectRange()) {
1247 			// let's try to put them into the direct block range
1248 			int32 free = 0;
1249 			for (; free < NUM_DIRECT_BLOCKS; free++)
1250 				if (data->direct[free].IsZero())
1251 					break;
1252 
1253 			if (free < NUM_DIRECT_BLOCKS) {
1254 				// can we merge the last allocated run with the new one?
1255 				int32 last = free - 1;
1256 				if (free > 0 && data->direct[last].MergeableWith(run))
1257 					data->direct[last].length = HOST_ENDIAN_TO_BFS_INT16(data->direct[last].Length() + run.Length());
1258 				else
1259 					data->direct[free] = run;
1260 
1261 				data->max_direct_range = HOST_ENDIAN_TO_BFS_INT64(data->MaxDirectRange() + run.Length() * fVolume->BlockSize());
1262 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0 ? data->max_direct_range : size);
1263 				continue;
1264 			}
1265 		}
1266 
1267 		// Indirect block range
1268 
1269 		if (data->Size() <= data->MaxIndirectRange() || !data->MaxIndirectRange()) {
1270 			CachedBlock cached(fVolume);
1271 			block_run *runs = NULL;
1272 			uint32 free = 0;
1273 			off_t block;
1274 
1275 			// if there is no indirect block yet, create one
1276 			if (data->indirect.IsZero()) {
1277 				status = AllocateBlockArray(transaction, data->indirect);
1278 				if (status < B_OK)
1279 					return status;
1280 
1281 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(data->MaxDirectRange());
1282 				// insert the block_run in the first block
1283 				runs = (block_run *)cached.SetTo(data->indirect);
1284 			} else {
1285 				uint32 numberOfRuns = fVolume->BlockSize() / sizeof(block_run);
1286 				block = fVolume->ToBlock(data->indirect);
1287 
1288 				// search first empty entry
1289 				int32 i = 0;
1290 				for (; i < data->indirect.Length(); i++) {
1291 					if ((runs = (block_run *)cached.SetTo(block + i)) == NULL)
1292 						return B_IO_ERROR;
1293 
1294 					for (free = 0; free < numberOfRuns; free++)
1295 						if (runs[free].IsZero())
1296 							break;
1297 
1298 					if (free < numberOfRuns)
1299 						break;
1300 				}
1301 				if (i == data->indirect.Length())
1302 					runs = NULL;
1303 			}
1304 
1305 			if (runs != NULL) {
1306 				// try to insert the run to the last one - note that this doesn't
1307 				// take block borders into account, so it could be further optimized
1308 				int32 last = free - 1;
1309 				if (free > 0 && runs[last].MergeableWith(run))
1310 					runs[last].length = HOST_ENDIAN_TO_BFS_INT16(runs[last].Length() + run.Length());
1311 				else
1312 					runs[free] = run;
1313 
1314 				data->max_indirect_range = HOST_ENDIAN_TO_BFS_INT64(data->MaxIndirectRange() + (run.Length() << fVolume->BlockShift()));
1315 				data->size = HOST_ENDIAN_TO_BFS_INT64(blocksNeeded > 0 ? data->MaxIndirectRange() : size);
1316 
1317 				cached.WriteBack(transaction);
1318 				continue;
1319 			}
1320 		}
1321 
1322 		// Double indirect block range
1323 
1324 		if (data->Size() <= data->MaxDoubleIndirectRange() || !data->max_double_indirect_range) {
1325 			while ((run.Length() % NUM_ARRAY_BLOCKS) != 0) {
1326 				// The number of allocated blocks isn't a multiple of NUM_ARRAY_BLOCKS,
1327 				// so we have to change this. This can happen the first time the stream
1328 				// grows into the double indirect range.
1329 				// First, free the remaining blocks that don't fit into a multiple
1330 				// of NUM_ARRAY_BLOCKS
1331 				int32 rest = run.Length() % NUM_ARRAY_BLOCKS;
1332 				run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - rest);
1333 
1334 				status = fVolume->Free(transaction, block_run::Run(run.AllocationGroup(),
1335 					run.Start() + run.Length(), rest));
1336 				if (status < B_OK)
1337 					return status;
1338 
1339 				blocksNeeded += rest;
1340 				blocksRequested = (blocksNeeded + NUM_ARRAY_BLOCKS - 1) & ~(NUM_ARRAY_BLOCKS - 1);
1341 				minimum = NUM_ARRAY_BLOCKS;
1342 					// we make sure here that we have at minimum NUM_ARRAY_BLOCKS allocated,
1343 					// so if the allocation succeeds, we don't run into an endless loop
1344 
1345 				// Are there any blocks left in the run? If not, allocate a new one
1346 				if (run.length == 0)
1347 					continue;
1348 			}
1349 
1350 			// if there is no double indirect block yet, create one
1351 			if (data->double_indirect.IsZero()) {
1352 				status = AllocateBlockArray(transaction, data->double_indirect);
1353 				if (status < B_OK)
1354 					return status;
1355 
1356 				data->max_double_indirect_range = data->max_indirect_range;
1357 			}
1358 
1359 			// calculate the index where to insert the new blocks
1360 
1361 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1362 			int32 indirectSize = ((1L << INDIRECT_BLOCKS_SHIFT) << fVolume->BlockShift())
1363 				* runsPerBlock;
1364 			int32 directSize = NUM_ARRAY_BLOCKS << fVolume->BlockShift();
1365 			int32 runsPerArray = runsPerBlock << ARRAY_BLOCKS_SHIFT;
1366 
1367 			off_t start = data->MaxDoubleIndirectRange() - data->MaxIndirectRange();
1368 			int32 indirectIndex = start / indirectSize;
1369 			int32 index = start / directSize;
1370 
1371 			// distribute the blocks to the array and allocate
1372 			// new array blocks when needed
1373 
1374 			CachedBlock cached(fVolume);
1375 			CachedBlock cachedDirect(fVolume);
1376 			block_run *array = NULL;
1377 			uint32 runLength = run.Length();
1378 
1379 			// ToDo: the following code is commented - it could be used to
1380 			// preallocate all needed block arrays to see in advance if the
1381 			// allocation will succeed.
1382 			// I will probably remove it later, because it's no perfect solution
1383 			// either: if the allocation was broken up before (blocksNeeded != 0),
1384 			// it doesn't guarantee anything.
1385 			// And since failing in this case is not that common, it doesn't have
1386 			// to be optimized in that way.
1387 			// Anyway, I wanted to have it in CVS - all those lines, and they will
1388 			// be removed soon :-)
1389 /*
1390 			// allocate new block arrays if needed
1391 
1392 			off_t block = -1;
1393 
1394 			for (int32 i = 0;i < needed;i++) {
1395 				// get the block to insert the run into
1396 				block = fVolume->ToBlock(data->double_indirect) + i + indirectIndex / runsPerBlock;
1397 				if (cached.BlockNumber() != block)
1398 					array = (block_run *)cached.SetTo(block);
1399 
1400 				if (array == NULL)
1401 					return B_ERROR;
1402 
1403 				status = AllocateBlockArray(transaction, array[i + indirectIndex % runsPerBlock]);
1404 				if (status < B_OK)
1405 					return status;
1406 			}
1407 */
1408 
1409 			while (run.length != 0) {
1410 				// get the indirect array block
1411 				if (array == NULL) {
1412 					if (cached.Block() != NULL
1413 						&& cached.WriteBack(transaction) < B_OK)
1414 						return B_IO_ERROR;
1415 
1416 					array = (block_run *)cached.SetTo(fVolume->ToBlock(data->double_indirect)
1417 						+ indirectIndex / runsPerBlock);
1418 					if (array == NULL)
1419 						return B_IO_ERROR;
1420 				}
1421 
1422 				do {
1423 					// do we need a new array block?
1424 					if (array[indirectIndex % runsPerBlock].IsZero()) {
1425 						status = AllocateBlockArray(transaction, array[indirectIndex % runsPerBlock]);
1426 						if (status < B_OK)
1427 							return status;
1428 					}
1429 
1430 					block_run *runs = (block_run *)cachedDirect.SetTo(
1431 						fVolume->ToBlock(array[indirectIndex % runsPerBlock])
1432 						+ index / runsPerBlock);
1433 					if (runs == NULL)
1434 						return B_IO_ERROR;
1435 
1436 					do {
1437 						// insert the block_run into the array
1438 						runs[index % runsPerBlock] = run;
1439 						runs[index % runsPerBlock].length = HOST_ENDIAN_TO_BFS_INT16(NUM_ARRAY_BLOCKS);
1440 
1441 						// alter the remaining block_run
1442 						run.start = HOST_ENDIAN_TO_BFS_INT16(run.Start() + NUM_ARRAY_BLOCKS);
1443 						run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - NUM_ARRAY_BLOCKS);
1444 					} while ((++index % runsPerBlock) != 0 && run.length);
1445 
1446 					if (cachedDirect.WriteBack(transaction) < B_OK)
1447 						return B_IO_ERROR;
1448 				} while ((index % runsPerArray) != 0 && run.length);
1449 
1450 				if (++indirectIndex % runsPerBlock == 0) {
1451 					array = NULL;
1452 					index = 0;
1453 				}
1454 			}
1455 
1456 			data->max_double_indirect_range = HOST_ENDIAN_TO_BFS_INT64(data->MaxDoubleIndirectRange() + (runLength << fVolume->BlockShift()));
1457 			data->size = blocksNeeded > 0 ? HOST_ENDIAN_TO_BFS_INT64(data->max_double_indirect_range) : size;
1458 
1459 			continue;
1460 		}
1461 
1462 		RETURN_ERROR(EFBIG);
1463 	}
1464 	// update the size of the data stream
1465 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1466 
1467 	return B_OK;
1468 }
1469 
1470 
1471 status_t
1472 Inode::FreeStaticStreamArray(Transaction *transaction, int32 level, block_run run,
1473 	off_t size, off_t offset, off_t &max)
1474 {
1475 	int32 indirectSize = 0;
1476 	if (level == 0)
1477 		indirectSize = (1L << (INDIRECT_BLOCKS_SHIFT + fVolume->BlockShift()))
1478 			* (fVolume->BlockSize() / sizeof(block_run));
1479 	else if (level == 1)
1480 		indirectSize = 4 << fVolume->BlockShift();
1481 
1482 	off_t start;
1483 	if (size > offset)
1484 		start = size - offset;
1485 	else
1486 		start = 0;
1487 
1488 	int32 index = start / indirectSize;
1489 	int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1490 
1491 	CachedBlock cached(fVolume);
1492 	off_t blockNumber = fVolume->ToBlock(run);
1493 
1494 	// set the file offset to the current block run
1495 	offset += (off_t)index * indirectSize;
1496 
1497 	for (int32 i = index / runsPerBlock; i < run.Length(); i++) {
1498 		block_run *array = (block_run *)cached.SetTo(blockNumber + i);
1499 		if (array == NULL)
1500 			RETURN_ERROR(B_ERROR);
1501 
1502 		for (index = index % runsPerBlock; index < runsPerBlock; index++) {
1503 			if (array[index].IsZero()) {
1504 				// we also want to break out of the outer loop
1505 				i = run.Length();
1506 				break;
1507 			}
1508 
1509 			status_t status = B_OK;
1510 			if (level == 0)
1511 				status = FreeStaticStreamArray(transaction, 1, array[index], size, offset, max);
1512 			else if (offset >= size)
1513 				status = fVolume->Free(transaction, array[index]);
1514 			else
1515 				max = HOST_ENDIAN_TO_BFS_INT64(offset + indirectSize);
1516 
1517 			if (status < B_OK)
1518 				RETURN_ERROR(status);
1519 
1520 			if (offset >= size)
1521 				array[index].SetTo(0, 0, 0);
1522 
1523 			offset += indirectSize;
1524 		}
1525 		index = 0;
1526 
1527 		cached.WriteBack(transaction);
1528 	}
1529 	return B_OK;
1530 }
1531 
1532 
1533 /** Frees all block_runs in the array which come after the specified size.
1534  *	It also trims the last block_run that contain the size.
1535  *	"offset" and "max" are maintained until the last block_run that doesn't
1536  *	have to be freed - after this, the values won't be correct anymore, but
1537  *	will still assure correct function for all subsequent calls.
1538  *	"max" is considered to be in file system byte order.
1539  */
1540 
1541 status_t
1542 Inode::FreeStreamArray(Transaction *transaction, block_run *array, uint32 arrayLength,
1543 	off_t size, off_t &offset, off_t &max)
1544 {
1545 	off_t newOffset = offset;
1546 	uint32 i = 0;
1547 	for (; i < arrayLength; i++, offset = newOffset) {
1548 		if (array[i].IsZero())
1549 			break;
1550 
1551 		newOffset += (off_t)array[i].Length() << fVolume->BlockShift();
1552 		if (newOffset <= size)
1553 			continue;
1554 
1555 		block_run run = array[i];
1556 
1557 		// determine the block_run to be freed
1558 		if (newOffset > size && offset < size) {
1559 			// free partial block_run (and update the original block_run)
1560 			run.start = array[i].start + ((size - offset) >> fVolume->BlockShift()) + 1;
1561 			array[i].length = HOST_ENDIAN_TO_BFS_INT16(run.Start() - array[i].Start());
1562 			run.length = HOST_ENDIAN_TO_BFS_INT16(run.Length() - array[i].Length());
1563 
1564 			if (run.length == 0)
1565 				continue;
1566 
1567 			// update maximum range
1568 			max = HOST_ENDIAN_TO_BFS_INT64(offset + ((off_t)array[i].Length() << fVolume->BlockShift()));
1569 		} else {
1570 			// free the whole block_run
1571 			array[i].SetTo(0, 0, 0);
1572 
1573 			if ((off_t)BFS_ENDIAN_TO_HOST_INT64(max) > offset)
1574 				max = HOST_ENDIAN_TO_BFS_INT64(offset);
1575 		}
1576 
1577 		if (fVolume->Free(transaction, run) < B_OK)
1578 			return B_IO_ERROR;
1579 	}
1580 	return B_OK;
1581 }
1582 
1583 
1584 status_t
1585 Inode::ShrinkStream(Transaction *transaction, off_t size)
1586 {
1587 	data_stream *data = &Node()->data;
1588 
1589 	if (data->MaxDoubleIndirectRange() > size) {
1590 		FreeStaticStreamArray(transaction, 0, data->double_indirect, size,
1591 			data->MaxIndirectRange(), data->max_double_indirect_range);
1592 
1593 		if (size <= data->MaxIndirectRange()) {
1594 			fVolume->Free(transaction, data->double_indirect);
1595 			data->double_indirect.SetTo(0, 0, 0);
1596 			data->max_double_indirect_range = 0;
1597 		}
1598 	}
1599 	if (data->MaxIndirectRange() > size) {
1600 		CachedBlock cached(fVolume);
1601 		off_t block = fVolume->ToBlock(data->indirect);
1602 		off_t offset = data->MaxDirectRange();
1603 
1604 		for (int32 i = 0; i < data->indirect.Length(); i++) {
1605 			block_run *array = (block_run *)cached.SetTo(block + i);
1606 			if (array == NULL)
1607 				break;
1608 
1609 			if (FreeStreamArray(transaction, array, fVolume->BlockSize() / sizeof(block_run),
1610 					size, offset, data->max_indirect_range) == B_OK)
1611 				cached.WriteBack(transaction);
1612 		}
1613 		if (data->max_direct_range == data->max_indirect_range) {
1614 			fVolume->Free(transaction, data->indirect);
1615 			data->indirect.SetTo(0, 0, 0);
1616 			data->max_indirect_range = 0;
1617 		}
1618 	}
1619 	if (data->MaxDirectRange() > size) {
1620 		off_t offset = 0;
1621 		FreeStreamArray(transaction, data->direct, NUM_DIRECT_BLOCKS, size, offset,
1622 			data->max_direct_range);
1623 	}
1624 
1625 	data->size = HOST_ENDIAN_TO_BFS_INT64(size);
1626 	return B_OK;
1627 }
1628 
1629 
1630 status_t
1631 Inode::SetFileSize(Transaction *transaction, off_t size)
1632 {
1633 	if (size < 0
1634 		// uncached files can't be resized (Stream<Cache>::WriteAt() specifically
1635 		// denies growing uncached files because of efficiency, so it had to be
1636 		// adapted if this ever changes [which will probably happen in OpenBeOS]).
1637 		|| Flags() & INODE_NO_CACHE)
1638 		return B_BAD_VALUE;
1639 
1640 	off_t oldSize = Size();
1641 
1642 	if (size == oldSize)
1643 		return B_OK;
1644 
1645 	// should the data stream grow or shrink?
1646 	status_t status;
1647 	if (size > oldSize) {
1648 		status = GrowStream(transaction, size);
1649 		if (status < B_OK) {
1650 			// if the growing of the stream fails, the whole operation
1651 			// fails, so we should shrink the stream to its former size
1652 			ShrinkStream(transaction, oldSize);
1653 		}
1654 	}
1655 	else
1656 		status = ShrinkStream(transaction, size);
1657 
1658 	if (status < B_OK)
1659 		return status;
1660 
1661 	return WriteBack(transaction);
1662 }
1663 
1664 
1665 status_t
1666 Inode::Append(Transaction *transaction, off_t bytes)
1667 {
1668 	return SetFileSize(transaction, Size() + bytes);
1669 }
1670 
1671 
1672 status_t
1673 Inode::Trim(Transaction *transaction)
1674 {
1675 	status_t status = ShrinkStream(transaction, Size());
1676 	if (status < B_OK)
1677 		return status;
1678 
1679 	return WriteBack(transaction);
1680 }
1681 
1682 
1683 status_t
1684 Inode::Free(Transaction *transaction)
1685 {
1686 	FUNCTION();
1687 
1688 	// Perhaps there should be an implementation of Inode::ShrinkStream() that
1689 	// just frees the data_stream, but doesn't change the inode (since it is
1690 	// freed anyway) - that would make an undelete command possible
1691 	status_t status = SetFileSize(transaction, 0);
1692 	if (status < B_OK)
1693 		return status;
1694 
1695 	// Free all attributes, and remove their indices
1696 	{
1697 		// We have to limit the scope of AttributeIterator, so that its
1698 		// destructor is not called after the inode is deleted
1699 		AttributeIterator iterator(this);
1700 
1701 		char name[B_FILE_NAME_LENGTH];
1702 		uint32 type;
1703 		size_t length;
1704 		vnode_id id;
1705 		while ((status = iterator.GetNext(name, &length, &type, &id)) == B_OK)
1706 			RemoveAttribute(transaction, name);
1707 	}
1708 
1709 	if (WriteBack(transaction) < B_OK)
1710 		return B_ERROR;
1711 
1712 	return fVolume->Free(transaction, BlockRun());
1713 }
1714 
1715 
1716 status_t
1717 Inode::Sync()
1718 {
1719 	// We may also want to flush the attribute's data stream to
1720 	// disk here... (do we?)
1721 
1722 	data_stream *data = &Node()->data;
1723 	status_t status;
1724 
1725 	// flush direct range
1726 
1727 	for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
1728 		if (data->direct[i].IsZero())
1729 			return B_OK;
1730 
1731 		status = flush_blocks(fVolume->Device(), fVolume->ToBlock(data->direct[i]),
1732 			data->direct[i].Length());
1733 		if (status != B_OK)
1734 			return status;
1735 	}
1736 
1737 	// flush indirect range
1738 
1739 	if (data->max_indirect_range == 0)
1740 		return B_OK;
1741 
1742 	CachedBlock cached(fVolume);
1743 	off_t block = fVolume->ToBlock(data->indirect);
1744 	int32 count = fVolume->BlockSize() / sizeof(block_run);
1745 
1746 	for (int32 j = 0; j < data->indirect.Length(); j++) {
1747 		block_run *runs = (block_run *)cached.SetTo(block + j);
1748 		if (runs == NULL)
1749 			break;
1750 
1751 		for (int32 i = 0; i < count; i++) {
1752 			if (runs[i].IsZero())
1753 				return B_OK;
1754 
1755 			status = flush_blocks(fVolume->Device(), fVolume->ToBlock(runs[i]), runs[i].Length());
1756 			if (status != B_OK)
1757 				return status;
1758 		}
1759 	}
1760 
1761 	// flush double indirect range
1762 
1763 	if (data->max_double_indirect_range == 0)
1764 		return B_OK;
1765 
1766 	off_t indirectBlock = fVolume->ToBlock(data->double_indirect);
1767 
1768 	for (int32 l = 0; l < data->double_indirect.Length(); l++) {
1769 		block_run *indirectRuns = (block_run *)cached.SetTo(indirectBlock + l);
1770 		if (indirectRuns == NULL)
1771 			return B_FILE_ERROR;
1772 
1773 		CachedBlock directCached(fVolume);
1774 
1775 		for (int32 k = 0; k < count; k++) {
1776 			if (indirectRuns[k].IsZero())
1777 				return B_OK;
1778 
1779 			block = fVolume->ToBlock(indirectRuns[k]);
1780 			for (int32 j = 0; j < indirectRuns[k].Length(); j++) {
1781 				block_run *runs = (block_run *)directCached.SetTo(block + j);
1782 				if (runs == NULL)
1783 					return B_FILE_ERROR;
1784 
1785 				for (int32 i = 0; i < count; i++) {
1786 					if (runs[i].IsZero())
1787 						return B_OK;
1788 
1789 					// ToDo: combine single block_runs to bigger ones when
1790 					// they are adjacent
1791 					status = flush_blocks(fVolume->Device(), fVolume->ToBlock(runs[i]),
1792 						runs[i].Length());
1793 					if (status != B_OK)
1794 						return status;
1795 				}
1796 			}
1797 		}
1798 	}
1799 	return B_OK;
1800 }
1801 
1802 
1803 status_t
1804 Inode::Remove(Transaction *transaction, const char *name, off_t *_id, bool isDirectory)
1805 {
1806 	BPlusTree *tree;
1807 	if (GetTree(&tree) != B_OK)
1808 		RETURN_ERROR(B_BAD_VALUE);
1809 
1810 	RecursiveLocker locker(fVolume->Lock());
1811 
1812 	// does the file even exist?
1813 	off_t id;
1814 	if (tree->Find((uint8 *)name, (uint16)strlen(name), &id) < B_OK)
1815 		return B_ENTRY_NOT_FOUND;
1816 
1817 	if (_id)
1818 		*_id = id;
1819 
1820 	Vnode vnode(fVolume, id);
1821 	Inode *inode;
1822 	status_t status = vnode.Get(&inode);
1823 	if (status < B_OK) {
1824 		REPORT_ERROR(status);
1825 		return B_ENTRY_NOT_FOUND;
1826 	}
1827 
1828 	// You can't unlink a mounted image or the VM file while it is being used - while
1829 	// this is not really necessary, it copies the behaviour of the original BFS
1830 	// and let you and me feel a little bit safer
1831 	if (inode->Flags() & INODE_NO_CACHE)
1832 		return B_NOT_ALLOWED;
1833 
1834 	// Inode::IsContainer() is true also for indices (furthermore, the S_IFDIR
1835 	// bit is set for indices in BFS, not for attribute directories) - but you
1836 	// should really be able to do whatever you want with your indices
1837 	// without having to remove all files first :)
1838 	if (!inode->IsIndex()) {
1839 		// if it's not of the correct type, don't delete it!
1840 		if (inode->IsContainer() != isDirectory)
1841 			return isDirectory ? B_NOT_A_DIRECTORY : B_IS_A_DIRECTORY;
1842 
1843 		// only delete empty directories
1844 		if (isDirectory && !inode->IsEmpty())
1845 			return B_DIRECTORY_NOT_EMPTY;
1846 	}
1847 
1848 	// remove_vnode() allows the inode to be accessed until the last put_vnode()
1849 	if (remove_vnode(fVolume->ID(), id) != B_OK)
1850 		return B_ERROR;
1851 
1852 	if (tree->Remove(transaction, name, id) < B_OK) {
1853 		unremove_vnode(fVolume->ID(), id);
1854 		RETURN_ERROR(B_ERROR);
1855 	}
1856 
1857 #ifdef DEBUG
1858 	if (tree->Find((uint8 *)name, (uint16)strlen(name), &id) == B_OK) {
1859 		DIE(("deleted entry still there"));
1860 	}
1861 #endif
1862 
1863 	// update the inode, so that no one will ever doubt it's deleted :-)
1864 	inode->Node()->flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DELETED);
1865 	inode->Node()->flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_USE);
1866 
1867 	// In balance to the Inode::Create() method, the main indices
1868 	// are updated here (name, size, & last_modified)
1869 
1870 	Index index(fVolume);
1871 	if (inode->IsRegularNode()) {
1872 		index.RemoveName(transaction, name, inode);
1873 			// If removing from the index fails, it is not regarded as a
1874 			// fatal error and will not be reported back!
1875 			// Deleted inodes won't be visible in queries anyway.
1876 	}
1877 
1878 	if ((inode->Mode() & (S_FILE | S_SYMLINK)) != 0) {
1879 		if (inode->IsFile())
1880 			index.RemoveSize(transaction, inode);
1881 		index.RemoveLastModified(transaction, inode);
1882 	}
1883 
1884 	if (inode->WriteBack(transaction) < B_OK)
1885 		return B_ERROR;
1886 
1887 	return B_OK;
1888 }
1889 
1890 
1891 /**	Creates the inode with the specified parent directory, and automatically
1892  *	adds the created inode to that parent directory. If an attribute directory
1893  *	is created, it will also automatically added to the parent inode as such.
1894  *	However, the indices root node, and the regular root node won't be added
1895  *	to the super block.
1896  *	It will also create the initial B+tree for the inode if it's a directory
1897  *	of any kind.
1898  *	If the "_id" or "_inode" variable is given and non-NULL to store the inode's
1899  *	ID, the inode stays locked - you have to call put_vnode() if you don't use it
1900  *	anymore.
1901  */
1902 
1903 status_t
1904 Inode::Create(Transaction *transaction, Inode *parent, const char *name, int32 mode,
1905 	int omode, uint32 type, off_t *_id, Inode **_inode)
1906 {
1907 	FUNCTION();
1908 
1909 	block_run parentRun = parent ? parent->BlockRun() : block_run::Run(0, 0, 0);
1910 	Volume *volume = transaction->GetVolume();
1911 	BPlusTree *tree = NULL;
1912 
1913 	RecursiveLocker locker(volume->Lock());
1914 		// ToDo: it would be nicer to only lock the parent directory, if possible
1915 		//	(but that lock will already be held during any B+tree action)
1916 
1917 	if (parent && (mode & S_ATTR_DIR) == 0 && parent->IsContainer()) {
1918 		// check if the file already exists in the directory
1919 		if (parent->GetTree(&tree) != B_OK)
1920 			RETURN_ERROR(B_BAD_VALUE);
1921 
1922 		// does the file already exist?
1923 		off_t offset;
1924 		if (tree->Find((uint8 *)name, (uint16)strlen(name), &offset) == B_OK) {
1925 			// return if the file should be a directory or opened in exclusive mode
1926 			if (mode & S_DIRECTORY || omode & O_EXCL)
1927 				return B_FILE_EXISTS;
1928 
1929 			Vnode vnode(volume, offset);
1930 			Inode *inode;
1931 			status_t status = vnode.Get(&inode);
1932 			if (status < B_OK) {
1933 				REPORT_ERROR(status);
1934 				return B_ENTRY_NOT_FOUND;
1935 			}
1936 
1937 			// if it's a directory, bail out!
1938 			if (inode->IsDirectory())
1939 				return B_IS_A_DIRECTORY;
1940 
1941 			// if it is a mounted device or the VM file, we don't allow to delete it
1942 			// while it is open and in use
1943 			if (inode->Flags() & INODE_NO_CACHE)
1944 				return B_NOT_ALLOWED;
1945 
1946 			// if omode & O_TRUNC, truncate the existing file
1947 			if (omode & O_TRUNC) {
1948 				WriteLocked locked(inode->Lock());
1949 
1950 				status_t status = inode->SetFileSize(transaction, 0);
1951 				if (status < B_OK)
1952 					return status;
1953 			}
1954 
1955 			if (_id)
1956 				*_id = offset;
1957 			if (_inode)
1958 				*_inode = inode;
1959 
1960 			// only keep the vnode in memory if the _id or _inode pointer is provided
1961 			if (_id != NULL || _inode != NULL)
1962 				vnode.Keep();
1963 
1964 			return B_OK;
1965 		}
1966 	} else if (parent && (mode & S_ATTR_DIR) == 0)
1967 		return B_BAD_VALUE;
1968 
1969 	// allocate space for the new inode
1970 	InodeAllocator allocator(transaction);
1971 	block_run run;
1972 	Inode *inode;
1973 	status_t status = allocator.New(&parentRun, mode, run, &inode);
1974 	if (status < B_OK)
1975 		return status;
1976 
1977 	// Initialize the parts of the bfs_inode structure that
1978 	// InodeAllocator::New() hasn't touched yet
1979 
1980 	bfs_inode *node = inode->Node();
1981 
1982 	if (parent == NULL) {
1983 		// we set the parent to itself in this case
1984 		// (only happens for the root and indices node)
1985 		node->parent = run;
1986 	} else
1987 		node->parent = parentRun;
1988 
1989 	node->uid = HOST_ENDIAN_TO_BFS_INT32(geteuid());
1990 	node->gid = HOST_ENDIAN_TO_BFS_INT32(parent ? parent->Node()->gid : getegid());
1991 		// the group ID is inherited from the parent, if available
1992 
1993 	node->type = HOST_ENDIAN_TO_BFS_INT32(type);
1994 
1995 	// only add the name to regular files, directories, or symlinks
1996 	// don't add it to attributes, or indices
1997 	if (tree && inode->IsRegularNode() && inode->SetName(transaction, name) < B_OK)
1998 		return B_ERROR;
1999 
2000 	// Initialize b+tree if it's a directory (and add "." & ".." if it's
2001 	// a standard directory for files - not for attributes or indices)
2002 	if (inode->IsContainer()) {
2003 		status = allocator.CreateTree();
2004 		if (status < B_OK)
2005 			return status;
2006 	}
2007 
2008 	// Add a link to the inode from the parent, depending on its type
2009 	// (the INODE_NOT_READY flag is set, so it is safe to make the inode
2010 	// accessable to the file system here)
2011 	if (tree) {
2012 		status = tree->Insert(transaction, name, inode->ID());
2013 	} else if (parent && (mode & S_ATTR_DIR) != 0) {
2014 		parent->Attributes() = run;
2015 		status = parent->WriteBack(transaction);
2016 	}
2017 
2018 	// Note, we only care if the inode could be made accessable for the
2019 	// two cases above; the root node or the indices root node must
2020 	// handle this case on their own (or other cases where "parent" is
2021 	// NULL)
2022 	if (status < B_OK)
2023 		RETURN_ERROR(status);
2024 
2025 	// Update the main indices (name, size & last_modified)
2026 	// (live queries might want to access us after this)
2027 
2028 	Index index(volume);
2029 	if (inode->IsRegularNode() && name != NULL) {
2030 		// the name index only contains regular files
2031 		// (but not the root node where name == NULL)
2032 		status = index.InsertName(transaction, name, inode);
2033 		if (status < B_OK && status != B_BAD_INDEX) {
2034 			// We have to remove the node from the parent at this point,
2035 			// because the InodeAllocator destructor can't handle this
2036 			// case (and if it fails, we can't do anything about it...)
2037 			if (tree)
2038 				tree->Remove(transaction, name, inode->ID());
2039 			else if (parent != NULL && (mode & S_ATTR_DIR) != 0)
2040 				parent->Node()->attributes.SetTo(0, 0, 0);
2041 
2042 			return status;
2043 		}
2044 	}
2045 
2046 	inode->UpdateOldLastModified();
2047 
2048 	// The "size" & "last_modified" indices don't contain directories
2049 	if ((mode & (S_FILE | S_SYMLINK)) != 0) {
2050 		// if adding to these indices fails, the inode creation will not be harmed;
2051 		// they are considered less important than the "name" index
2052 		if (inode->IsFile())
2053 			index.InsertSize(transaction, inode);
2054 		index.InsertLastModified(transaction, inode);
2055 	}
2056 
2057 	// Everything worked well until this point, we have a fully
2058 	// initialized inode, and we want to keep it
2059 	allocator.Keep();
2060 
2061 	// We hold the volume lock to make sure that bfs_read_vnode()
2062 	// won't succeed in the meantime (between the call right
2063 	// above and below)!
2064 
2065 	if ((status = new_vnode(volume->ID(), inode->ID(), inode)) != B_OK) {
2066 		// this is a really fatal error, and we can't recover from that
2067 		FATAL(("new_vnode() failed with: %s\n", strerror(status)));
2068 		DIE(("new_vnode() failed for inode!"));
2069 	}
2070 
2071 	if (_id != NULL)
2072 		*_id = inode->ID();
2073 	if (_inode != NULL)
2074 		*_inode = inode;
2075 
2076 	// if either _id or _inode is passed, we will keep the inode locked
2077 	if (_id == NULL && _inode == NULL)
2078 		put_vnode(volume->ID(), inode->ID());
2079 
2080 	return B_OK;
2081 }
2082 
2083 
2084 //	#pragma mark -
2085 
2086 
2087 AttributeIterator::AttributeIterator(Inode *inode)
2088 	:
2089 	fCurrentSmallData(0),
2090 	fInode(inode),
2091 	fAttributes(NULL),
2092 	fIterator(NULL),
2093 	fBuffer(NULL)
2094 {
2095 	inode->AddIterator(this);
2096 }
2097 
2098 
2099 AttributeIterator::~AttributeIterator()
2100 {
2101 	if (fAttributes)
2102 		put_vnode(fAttributes->GetVolume()->ID(), fAttributes->ID());
2103 
2104 	delete fIterator;
2105 	fInode->RemoveIterator(this);
2106 }
2107 
2108 
2109 status_t
2110 AttributeIterator::Rewind()
2111 {
2112 	fCurrentSmallData = 0;
2113 
2114 	if (fIterator != NULL)
2115 		fIterator->Rewind();
2116 
2117 	return B_OK;
2118 }
2119 
2120 
2121 status_t
2122 AttributeIterator::GetNext(char *name, size_t *_length, uint32 *_type, vnode_id *_id)
2123 {
2124 	// read attributes out of the small data section
2125 
2126 	if (fCurrentSmallData >= 0) {
2127 		small_data *item = fInode->Node()->SmallDataStart();
2128 
2129 		fInode->SmallDataLock().Lock();
2130 
2131 		int32 i = 0;
2132 		for (;;item = item->Next()) {
2133 			if (item->IsLast(fInode->Node()))
2134 				break;
2135 
2136 			if (item->NameSize() == FILE_NAME_NAME_LENGTH
2137 				&& *item->Name() == FILE_NAME_NAME)
2138 				continue;
2139 
2140 			if (i++ == fCurrentSmallData)
2141 				break;
2142 		}
2143 
2144 		if (!item->IsLast(fInode->Node())) {
2145 			strncpy(name, item->Name(), B_FILE_NAME_LENGTH);
2146 			*_type = item->Type();
2147 			*_length = item->NameSize();
2148 			*_id = (vnode_id)fCurrentSmallData;
2149 
2150 			fCurrentSmallData = i;
2151 		}
2152 		else {
2153 			// stop traversing the small_data section
2154 			fCurrentSmallData = -1;
2155 		}
2156 
2157 		fInode->SmallDataLock().Unlock();
2158 
2159 		if (fCurrentSmallData != -1)
2160 			return B_OK;
2161 	}
2162 
2163 	// read attributes out of the attribute directory
2164 
2165 	if (fInode->Attributes().IsZero())
2166 		return B_ENTRY_NOT_FOUND;
2167 
2168 	Volume *volume = fInode->GetVolume();
2169 
2170 	// if you haven't yet access to the attributes directory, get it
2171 	if (fAttributes == NULL) {
2172 #ifdef UNSAFE_GET_VNODE
2173 		RecursiveLocker locker(volume->Lock());
2174 #endif
2175 		if (get_vnode(volume->ID(), volume->ToVnode(fInode->Attributes()),
2176 				(void **)&fAttributes) != 0
2177 			|| fAttributes == NULL) {
2178 			FATAL(("get_vnode() failed in AttributeIterator::GetNext(vnode_id = %Ld,name = \"%s\")\n",fInode->ID(),name));
2179 			return B_ENTRY_NOT_FOUND;
2180 		}
2181 
2182 		BPlusTree *tree;
2183 		if (fAttributes->GetTree(&tree) < B_OK
2184 			|| (fIterator = new TreeIterator(tree)) == NULL) {
2185 			FATAL(("could not get tree in AttributeIterator::GetNext(vnode_id = %Ld,name = \"%s\")\n",fInode->ID(),name));
2186 			return B_ENTRY_NOT_FOUND;
2187 		}
2188 	}
2189 
2190 	uint16 length;
2191 	vnode_id id;
2192 	status_t status = fIterator->GetNextEntry(name, &length, B_FILE_NAME_LENGTH, &id);
2193 	if (status < B_OK)
2194 		return status;
2195 
2196 	Vnode vnode(volume,id);
2197 	Inode *attribute;
2198 	if ((status = vnode.Get(&attribute)) == B_OK) {
2199 		*_type = attribute->Type();
2200 		*_length = attribute->Size();
2201 		*_id = id;
2202 	}
2203 
2204 	return status;
2205 }
2206 
2207 
2208 void
2209 AttributeIterator::Update(uint16 index, int8 change)
2210 {
2211 	// fCurrentSmallData points already to the next item
2212 	if (index < fCurrentSmallData)
2213 		fCurrentSmallData += change;
2214 }
2215 
2216