xref: /haiku/src/add-ons/kernel/file_systems/userlandfs/server/beos/fs_cache.c (revision e1c4049fed1047bdb957b0529e1921e97ef94770)
1 /*
2    This file contains the global device cache for the BeOS.  All
3    file system I/O comes through here.  The cache can handle blocks
4    of different sizes for multiple different underlying physical
5    devices.
6 
7    The cache is organized as a hash table (for lookups by device
8    and block number) and two doubly-linked lists.  The normal
9    list is for "normal" blocks which are either clean or dirty.
10    The locked list is for blocks that are locked in the cache by
11    BFS.  The lists are LRU ordered.
12 
13    Most of the work happens in the function cache_block_io() which
14    is quite lengthy.  The other functions of interest are get_ents()
15    which picks victims to be kicked out of the cache; flush_ents()
16    which does the work of flushing them; and set_blocks_info() which
17    handles cloning blocks and setting callbacks so that the BFS
18    journal will work properly.  If you want to modify this code it
19    will take some study but it's not too bad.  Do not think about
20    separating the list of clean and dirty blocks into two lists as
21    I did that already and it's slower.
22 
23    Originally this cache code was written while listening to the album
24    "Ride the Lightning" by Metallica.  The current version was written
25    while listening to "Welcome to SkyValley" by Kyuss as well as an
26    ambient collection on the German label FAX.  Music helps a lot when
27    writing code.
28 
29    THIS CODE COPYRIGHT DOMINIC GIAMPAOLO.  NO WARRANTY IS EXPRESSED
30    OR IMPLIED.  YOU MAY USE THIS CODE AND FREELY DISTRIBUTE IT FOR
31    NON-COMMERCIAL USE AS LONG AS THIS NOTICE REMAINS ATTACHED.
32 
33    FOR COMMERCIAL USE, CONTACT DOMINIC GIAMPAOLO (dbg@be.com).
34 
35    Dominic Giampaolo
36    dbg@be.com
37 */
38 
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <memory.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>
48 
49 
50 #include <OS.h>
51 #include <KernelExport.h>
52 
53 #include "fs_cache.h"
54 #include "fs_cache_priv.h"
55 #include "lock.h"
56 
57 
58 
59 #ifndef USER
60 #define printf dprintf
61 #endif
62 #ifdef USER
63 #define kprintf printf
64 #endif
65 
66 
67 typedef off_t fs_off_t;
68 
69 /* forward prototypes */
70 static int   flush_ents(cache_ent **ents, int n_ents);
71 
72 //static int   do_dump(int argc, char **argv);
73 //static int   do_find_block(int argc, char **argv);
74 //static int   do_find_data(int argc, char **argv);
75 //static void  cache_flusher(void *arg, int phase);
76 
77 
78 int chatty_io = 0;
79 
80 #define CHUNK (512 * 1024)   /* a hack to work around scsi driver bugs */
81 
/*
  Print a formatted error message to stderr and halt.  Never returns;
  the cache is in an unrecoverable state when this is called.
*/
static void
beos_panic(const char *format, ...)
{
    va_list args;

    va_start(args, format);
    vfprintf(stderr, format, args);
    va_end(args);

    /* spin forever so the damage can't spread */
    for (;;)
        ;
}
94 
/*
  Read num_blocks blocks of bsize bytes each from device fd, starting at
  block bnum, into data.  Requests of CHUNK bytes or more are split into
  CHUNK-sized pieces (see the CHUNK comment above: a workaround for scsi
  driver bugs).  Returns 0 on success or EBADF on a short/failed read.
*/
size_t
beos_read_phys_blocks(int fd, fs_off_t bnum, void *data, uint num_blocks, int bsize)
{
    size_t ret = 0;
    size_t sum;

    if (chatty_io)
        printf("R: %8" B_PRIdOFF " : %3d\n", bnum, num_blocks);

    if (num_blocks * bsize < CHUNK)
        ret = read_pos(fd, bnum * bsize, data, num_blocks * bsize);
    else {
        /* issue full CHUNK-sized reads while a whole chunk still fits */
        for(sum=0; (sum + CHUNK) <= (num_blocks * bsize); sum += CHUNK) {
            ret = read_pos(fd, (bnum * bsize) + sum, data, CHUNK);
            if (ret != CHUNK)
                break;

            data = (void *)((char *)data + CHUNK);
        }

        /* read the partial tail (only if the chunk loop didn't fail) */
        if (ret == CHUNK && ((num_blocks * bsize) - sum) > 0) {
            ret = read_pos(fd, (bnum * bsize) + sum, data,
                           (num_blocks * bsize) - sum);

            if (ret == (num_blocks * bsize) - sum)
                ret = num_blocks * bsize;
        } else if (ret == CHUNK) {
            ret = num_blocks * bsize;
        }
    }

    /* NOTE(review): read_pos() presumably returns a negative value on
       error; since ret is size_t, that compares unequal below and falls
       through to EBADF, which is the intended outcome -- but only by
       virtue of unsigned wraparound.  Verify against the read_pos docs. */
    if (ret == num_blocks * bsize)
        return 0;
    else
        return EBADF;
}
131 
/*
  Write num_blocks blocks of bsize bytes each from data to device fd,
  starting at block bnum.  Mirrors beos_read_phys_blocks(): requests of
  CHUNK bytes or more are split into CHUNK-sized pieces to work around
  scsi driver bugs.  Returns 0 on success or EBADF on a short write.
*/
size_t
beos_write_phys_blocks(int fd, fs_off_t bnum, void *data, uint num_blocks, int bsize)
{
    size_t ret = 0;
    size_t sum;

    if (chatty_io)
        printf("W: %8" B_PRIdOFF " : %3d\n", bnum, num_blocks);

    if (num_blocks * bsize < CHUNK)
        ret = write_pos(fd, bnum * bsize, data, num_blocks * bsize);
    else {
        /* issue full CHUNK-sized writes while a whole chunk still fits */
        for(sum=0; (sum + CHUNK) <= (num_blocks * bsize); sum += CHUNK) {
            ret = write_pos(fd, (bnum * bsize) + sum, data, CHUNK);
            if (ret != CHUNK)
                break;

            data = (void *)((char *)data + CHUNK);
        }

        /* write the partial tail (only if the chunk loop didn't fail) */
        if (ret == CHUNK && ((num_blocks * bsize) - sum) > 0) {
            ret = write_pos(fd, (bnum * bsize) + sum, data,
                            (num_blocks * bsize) - sum);

            if (ret == (num_blocks * bsize) - sum)
                ret = num_blocks * bsize;
        } else if (ret == CHUNK) {
            ret = num_blocks * bsize;
        }
    }


    /* NOTE(review): as in the read path, a negative write_pos() result
       relies on the unsigned comparison below to reach EBADF -- verify */
    if (ret == num_blocks * bsize)
        return 0;
    else
        return EBADF;
}
169 
170 //	#pragma mark -
171 
172 static int
173 init_hash_table(hash_table *ht)
174 {
175     ht->max   = HT_DEFAULT_MAX;
176     ht->mask  = ht->max - 1;
177     ht->num_elements = 0;
178 
179     ht->table = (hash_ent **)calloc(ht->max, sizeof(hash_ent *));
180     if (ht->table == NULL)
181         return ENOMEM;
182 
183     return 0;
184 }
185 
186 
187 static void
188 shutdown_hash_table(hash_table *ht)
189 {
190     int       i, hash_len;
191     hash_ent *he, *next;
192 
193     for(i=0; i < ht->max; i++) {
194         he = ht->table[i];
195 
196         for(hash_len=0; he; hash_len++, he=next) {
197             next = he->next;
198             free(he);
199         }
200     }
201 
202     if (ht->table)
203         free(ht->table);
204     ht->table = NULL;
205 }
206 
207 #if 0
208 static void
209 print_hash_stats(hash_table *ht)
210 {
211     int       i, hash_len, max = -1, sum = 0;
212     hash_ent *he, *next;
213 
214     for(i=0; i < ht->max; i++) {
215         he = ht->table[i];
216 
217         for(hash_len=0; he; hash_len++, he=next) {
218             next = he->next;
219         }
220         if (hash_len)
221             printf("bucket %3d : %3d\n", i, hash_len);
222 
223         sum += hash_len;
224         if (hash_len > max)
225             max = hash_len;
226     }
227 
228     printf("max # of chains: %d,  average chain length %d\n", max,sum/ht->max);
229 }
230 #endif
231 
232 #define HASH(d, b)   ((((fs_off_t)d) << (sizeof(fs_off_t)*8 - 6)) | (b))
233 
234 static hash_ent *
235 new_hash_ent(int dev, fs_off_t bnum, void *data)
236 {
237     hash_ent *he;
238 
239     he = (hash_ent *)malloc(sizeof(*he));
240     if (he == NULL)
241         return NULL;
242 
243     he->hash_val = HASH(dev, bnum);
244     he->dev      = dev;
245     he->bnum     = bnum;
246     he->data     = data;
247     he->next     = NULL;
248 
249     return he;
250 }
251 
252 
253 static int
254 grow_hash_table(hash_table *ht)
255 {
256     int        i, omax, newsize, newmask;
257     fs_off_t      hash;
258     hash_ent **new_table, *he, *next;
259 
260     if (ht->max & ht->mask) {
261         printf("*** hashtable size %d or mask %d looks weird!\n", ht->max,
262                ht->mask);
263     }
264 
265     omax    = ht->max;
266     newsize = omax * 2;        /* have to grow in powers of two */
267     newmask = newsize - 1;
268 
269     new_table = (hash_ent **)calloc(newsize, sizeof(hash_ent *));
270     if (new_table == NULL)
271         return ENOMEM;
272 
273     for(i=0; i < omax; i++) {
274         for(he=ht->table[i]; he; he=next) {
275             hash = he->hash_val & newmask;
276             next = he->next;
277 
278             he->next        = new_table[hash];
279             new_table[hash] = he;
280         }
281     }
282 
283     free(ht->table);
284     ht->table = new_table;
285     ht->max   = newsize;
286     ht->mask  = newmask;
287 
288     return 0;
289 }
290 
291 
292 
293 
294 static int
295 hash_insert(hash_table *ht, int dev, fs_off_t bnum, void *data)
296 {
297     fs_off_t    hash;
298     hash_ent *he, *curr;
299 
300     hash = HASH(dev, bnum) & ht->mask;
301 
302     curr = ht->table[hash];
303     for(; curr != NULL; curr=curr->next)
304         if (curr->dev == dev && curr->bnum == bnum)
305             break;
306 
307     if (curr && curr->dev == dev && curr->bnum == bnum) {
308         printf("entry %d:%" B_PRIdOFF " already in the hash table!\n",
309             dev, bnum);
310         return EEXIST;
311     }
312 
313     he = new_hash_ent(dev, bnum, data);
314     if (he == NULL)
315         return ENOMEM;
316 
317     he->next        = ht->table[hash];
318     ht->table[hash] = he;
319 
320     ht->num_elements++;
321     if (ht->num_elements >= ((ht->max * 3) / 4)) {
322         if (grow_hash_table(ht) != 0)
323             return ENOMEM;
324     }
325 
326     return 0;
327 }
328 
329 static void *
330 hash_lookup(hash_table *ht, int dev, fs_off_t bnum)
331 {
332     hash_ent *he;
333 
334     he = ht->table[HASH(dev, bnum) & ht->mask];
335 
336     for(; he != NULL; he=he->next) {
337         if (he->dev == dev && he->bnum == bnum)
338             break;
339     }
340 
341     if (he)
342         return he->data;
343     else
344         return NULL;
345 }
346 
347 
348 static void *
349 hash_delete(hash_table *ht, int dev, fs_off_t bnum)
350 {
351     void     *data;
352     fs_off_t     hash;
353     hash_ent *he, *prev = NULL;
354 
355     hash = HASH(dev, bnum) & ht->mask;
356     he = ht->table[hash];
357 
358     for(; he != NULL; prev=he,he=he->next) {
359         if (he->dev == dev && he->bnum == bnum)
360             break;
361     }
362 
363     if (he == NULL) {
364         printf("*** hash_delete: tried to delete non-existent block %d:%"
365             B_PRIdOFF "\n", dev, bnum);
366         return NULL;
367     }
368 
369     data = he->data;
370 
371     if (ht->table[hash] == he)
372         ht->table[hash] = he->next;
373     else if (prev)
374         prev->next = he->next;
375     else
376         beos_panic("hash table is inconsistent\n");
377 
378     free(he);
379     ht->num_elements--;
380 
381     return data;
382 }
383 
384 //	#pragma mark -
385 
386 /*
387   These are the global variables for the cache.
388 */
389 static block_cache  bc;
390 
391 #define       MAX_IOVECS  64           /* # of iovecs for use by cache code */
392 static beos_lock   iovec_lock;
393 static struct iovec *iovec_pool[MAX_IOVECS];  /* each ptr is to an array of iovecs */
394 static int    iovec_used[MAX_IOVECS];  /* non-zero == iovec is in use */
395 
396 #define NUM_FLUSH_BLOCKS 64    /* size of the iovec array pointed by each ptr */
397 
398 
399 #define DEFAULT_READ_AHEAD_SIZE  (32 * 1024)
400 static int read_ahead_size = DEFAULT_READ_AHEAD_SIZE;
401 
402 /* this array stores the size of each device so we can error check requests */
403 #define MAX_DEVICES  256
404 fs_off_t max_device_blocks[MAX_DEVICES];
405 
406 
407 /* has the time of the last cache access so cache flushing doesn't interfere */
408 static bigtime_t last_cache_access = 0;
409 
410 
/*
  Initialize the global block cache.  max_blocks caps the number of
  cached blocks; flags are stored in bc.flags for later use.  Returns
  0 on success or ENOMEM, with all partially-created state torn down.
*/
int
beos_init_block_cache(int max_blocks, int flags)
{
    memset(&bc, 0, sizeof(bc));
    memset(iovec_pool, 0, sizeof(iovec_pool));
    memset(iovec_used, 0, sizeof(iovec_used));
    memset(&max_device_blocks, 0, sizeof(max_device_blocks));

    if (init_hash_table(&bc.ht) != 0)
        return ENOMEM;

    /* mark both semaphores invalid so the error path below can tell
       which locks were actually created */
    bc.lock.s = iovec_lock.s = -1;

    bc.max_blocks = max_blocks;
    bc.flags      = flags;
    if (beos_new_lock(&bc.lock, "bollockcache") != 0)
        goto err;

    if (beos_new_lock(&iovec_lock, "iovec_lock") != 0)
        goto err;

    /* allocate two of these up front so vm won't accidently re-enter itself */
    /* NOTE(review): these mallocs are unchecked; get_iovec_array()
       re-tries the allocation lazily when a slot is NULL, so a failure
       here is tolerated rather than fatal */
    iovec_pool[0] = (struct iovec *)malloc(sizeof(struct iovec)*NUM_FLUSH_BLOCKS);
    iovec_pool[1] = (struct iovec *)malloc(sizeof(struct iovec)*NUM_FLUSH_BLOCKS);

#ifndef USER
#ifdef DEBUG
    add_debugger_command("bcache", do_dump, "dump the block cache list");
    add_debugger_command("fblock", do_find_block, "find a block in the cache");
    add_debugger_command("fdata",  do_find_data, "find a data block ptr in the cache");
#endif
    /* NOTE(review): cache_flusher (and the debugger commands above) are
       #if 0'd out in this file -- presumably the kernel build provides
       them elsewhere; verify before building with USER undefined */
    register_kernel_daemon(cache_flusher, NULL, 3);
#endif

    return 0;

 err:
    if (bc.lock.s >= 0)
        beos_free_lock(&bc.lock);

    if (iovec_lock.s >= 0)
        beos_free_lock(&iovec_lock);

    shutdown_hash_table(&bc.ht);
    memset((void *)&bc, 0, sizeof(bc));
    return ENOMEM;
}
458 
459 
460 static struct iovec *
461 get_iovec_array(void)
462 {
463     int i;
464     struct iovec *iov;
465 
466     LOCK(iovec_lock);
467 
468     for(i = 0; i < MAX_IOVECS; i++) {
469         if (iovec_used[i] == 0)
470             break;
471     }
472 
473     if (i >= MAX_IOVECS)       /* uh-oh */
474         beos_panic("cache: ran out of iovecs (pool %p, used %p)!\n",
475               &iovec_pool[0], &iovec_used[0]);
476 
477     if (iovec_pool[i] == NULL) {
478         iovec_pool[i] = (struct iovec *)malloc(sizeof(struct iovec)*NUM_FLUSH_BLOCKS);
479         if (iovec_pool[i] == NULL)
480             beos_panic("can't allocate an iovec!\n");
481     }
482 
483     iov = iovec_pool[i];
484     iovec_used[i] = 1;
485 
486     UNLOCK(iovec_lock);
487 
488     return iov;
489 }
490 
491 
492 static void
493 release_iovec_array(struct iovec *iov)
494 {
495     int i;
496 
497     LOCK(iovec_lock);
498 
499     for(i=0; i < MAX_IOVECS; i++) {
500         if (iov == iovec_pool[i])
501             break;
502     }
503 
504     if (i < MAX_IOVECS)
505         iovec_used[i] = 0;
506     else                     /* uh-oh */
507         printf("cache: released an iovec I don't own (iov %p)\n", iov);
508 
509 
510     UNLOCK(iovec_lock);
511 }
512 
513 
514 
515 
516 static void
517 real_dump_cache_list(cache_ent_list *cel)
518 {
519     cache_ent *ce;
520 
521     kprintf("starting from LRU end:\n");
522 
523     for (ce = cel->lru; ce; ce = ce->next) {
524         kprintf("ce %p dev %2d bnum %6" B_PRIdOFF " lock %d flag %d arg %p "
525                "clone %p\n", ce, ce->dev, ce->block_num, ce->lock,
526                ce->flags, ce->arg, ce->clone);
527     }
528     kprintf("MRU end\n");
529 }
530 
/*
  Debugger helper: dump both cache lists (normal and locked) plus the
  overall block counters and hash table address.
*/
static void
dump_cache_list(void)
{
    kprintf("NORMAL BLOCKS\n");
    real_dump_cache_list(&bc.normal);

    kprintf("LOCKED BLOCKS\n");
    real_dump_cache_list(&bc.locked);

    kprintf("cur blocks %d, max blocks %d ht @ %p\n", bc.cur_blocks,
           bc.max_blocks, &bc.ht);
}
543 
544 #if 0
545 static void
546 check_bcache(char *str)
547 {
548     int count = 0;
549     cache_ent *ce, *prev = NULL;
550 
551     LOCK(bc.lock);
552 
553     for(ce=bc.normal.lru; ce; prev=ce, ce=ce->next) {
554         count++;
555     }
556 
557     for(ce=bc.locked.lru; ce; prev=ce, ce=ce->next) {
558         count++;
559     }
560 
561     if (count != bc.cur_blocks) {
562         if (count < bc.cur_blocks - 16)
563             beos_panic("%s: count == %d, cur_blocks %d, prev 0x%x\n",
564                     str, count, bc.cur_blocks, prev);
565         else
566             printf("%s: count == %d, cur_blocks %d, prev 0x%x\n",
567                     str, count, bc.cur_blocks, prev);
568     }
569 
570     UNLOCK(bc.lock);
571 }
572 
573 
574 static void
575 dump_lists(void)
576 {
577     cache_ent *nce;
578 
579     printf("LOCKED 0x%x  (tail 0x%x, head 0x%x)\n", &bc.locked,
580            bc.locked.lru, bc.locked.mru);
581     for(nce=bc.locked.lru; nce; nce=nce->next)
582         printf("nce @ 0x%x dev %d bnum %ld flags %d lock %d clone 0x%x func 0x%x\n",
583                nce, nce->dev, nce->block_num, nce->flags, nce->lock, nce->clone,
584                nce->func);
585 
586     printf("NORMAL 0x%x  (tail 0x%x, head 0x%x)\n", &bc.normal,
587            bc.normal.lru, bc.normal.mru);
588     for(nce=bc.normal.lru; nce; nce=nce->next)
589         printf("nce @ 0x%x dev %d bnum %ld flags %d lock %d clone 0x%x func 0x%x\n",
590                nce, nce->dev, nce->block_num, nce->flags, nce->lock, nce->clone,
591                nce->func);
592 }
593 
594 
595 static void
596 check_lists(void)
597 {
598     cache_ent *ce, *prev, *oce;
599     cache_ent_list *cel;
600 
601     cel = &bc.normal;
602     for(ce=cel->lru,prev=NULL; ce; prev=ce, ce=ce->next) {
603         for(oce=bc.locked.lru; oce; oce=oce->next) {
604             if (oce == ce) {
605                 dump_lists();
606                 beos_panic("1:ce @ 0x%x is in two lists(cel 0x%x &LOCKED)\n",ce,cel);
607             }
608         }
609     }
610     if (prev && prev != cel->mru) {
611         dump_lists();
612         beos_panic("*** last element in list != cel mru (ce 0x%x, cel 0x%x)\n",
613               prev, cel);
614     }
615 
616     cel = &bc.locked;
617     for(ce=cel->lru,prev=NULL; ce; prev=ce, ce=ce->next) {
618         for(oce=bc.normal.lru; oce; oce=oce->next) {
619             if (oce == ce) {
620                 dump_lists();
621                 beos_panic("3:ce @ 0x%x is in two lists(cel 0x%x & DIRTY)\n",ce,cel);
622             }
623         }
624     }
625     if (prev && prev != cel->mru) {
626         dump_lists();
627         beos_panic("*** last element in list != cel mru (ce 0x%x, cel 0x%x)\n",
628               prev, cel);
629     }
630 }
631 #endif
632 
633 
634 #ifdef DEBUG
635 #if 0
636 static int
637 do_dump(int argc, char **argv)
638 {
639     dump_cache_list();
640     return 1;
641 }
642 
643 
644 static int
645 do_find_block(int argc, char **argv)
646 {
647     int        i;
648     fs_off_t  bnum;
649     cache_ent *ce;
650 
651     if (argc < 2) {
652         kprintf("%s: needs a block # argument\n", argv[0]);
653         return 1;
654     }
655 
656     for(i=1; i < argc; i++) {
657         bnum = strtoul(argv[i], NULL, 0);
658 
659         for(ce=bc.normal.lru; ce; ce=ce->next) {
660             if (ce->block_num == bnum) {
661                 kprintf("found clean bnum %ld @ 0x%lx (data @ 0x%lx)\n",
662                         bnum, ce, ce->data);
663             }
664         }
665 
666         for(ce=bc.locked.lru; ce; ce=ce->next) {
667             if (ce->block_num == bnum) {
668                 kprintf("found locked bnum %ld @ 0x%lx (data @ 0x%lx)\n",
669                         bnum, ce, ce->data);
670             }
671         }
672     }
673 
674     return 0;
675 }
676 
677 static int
678 do_find_data(int argc, char **argv)
679 {
680     int        i;
681     void      *data;
682     cache_ent *ce;
683 
684     if (argc < 2) {
685         kprintf("%s: needs a block # argument\n", argv[0]);
686         return 1;
687     }
688 
689     for(i=1; i < argc; i++) {
690         data = (void *)strtoul(argv[i], NULL, 0);
691 
692         for(ce=bc.normal.lru; ce; ce=ce->next) {
693             if (ce->data == data) {
694                 kprintf("found normal data ptr for bnum %ld @ ce 0x%lx\n",
695                         ce->block_num, ce);
696             }
697         }
698 
699         for(ce=bc.locked.lru; ce; ce=ce->next) {
700             if (ce->data == data) {
701                 kprintf("found locked data ptr for bnum %ld @ ce 0x%lx\n",
702                         ce->block_num, ce);
703             }
704         }
705     }
706 
707     return 0;
708 }
709 #endif
710 #endif /* DEBUG */
711 
712 
713 
714 /*
715   this function detaches the cache_ent from the list.
716 */
717 static void
718 delete_from_list(cache_ent_list *cel, cache_ent *ce)
719 {
720     if (ce->next)
721         ce->next->prev = ce->prev;
722     if (ce->prev)
723         ce->prev->next = ce->next;
724 
725     if (cel->lru == ce)
726         cel->lru = ce->next;
727     if (cel->mru == ce)
728         cel->mru = ce->prev;
729 
730     ce->next = NULL;
731     ce->prev = NULL;
732 }
733 
734 
735 
736 /*
737   this function adds the cache_ent ce to the head of the
738   list (i.e. the MRU end).  the cache_ent should *not*
739   be in any lists.
740 */
741 static void
742 add_to_head(cache_ent_list *cel, cache_ent *ce)
743 {
744 if (ce->next != NULL || ce->prev != NULL) {
745     beos_panic("*** ath: ce has non-null next/prev ptr (ce %p nxt %p, prv %p)\n",
746            ce, ce->next, ce->prev);
747 }
748 
749     ce->next = NULL;
750     ce->prev = cel->mru;
751 
752     if (cel->mru)
753         cel->mru->next = ce;
754     cel->mru = ce;
755 
756     if (cel->lru == NULL)
757         cel->lru = ce;
758 }
759 
760 
761 /*
762   this function adds the cache_ent ce to the tail of the
763   list (i.e. the MRU end).  the cache_ent should *not*
764   be in any lists.
765 */
766 static void
767 add_to_tail(cache_ent_list *cel, cache_ent *ce)
768 {
769 if (ce->next != NULL || ce->prev != NULL) {
770     beos_panic("*** att: ce has non-null next/prev ptr (ce %p nxt %p, prv %p)\n",
771            ce, ce->next, ce->prev);
772 }
773 
774     ce->next = cel->lru;
775     ce->prev = NULL;
776 
777     if (cel->lru)
778         cel->lru->prev = ce;
779     cel->lru = ce;
780 
781     if (cel->mru == NULL)
782         cel->mru = ce;
783 }
784 
785 
786 static int
787 cache_ent_cmp(const void *a, const void *b)
788 {
789     fs_off_t  diff;
790     cache_ent *p1 = *(cache_ent **)a, *p2 = *(cache_ent **)b;
791 
792     if (p1 == NULL || p2 == NULL)
793         beos_panic("cache_ent pointers are null?!? (a %p, b %p\n)\n", a, b);
794 
795     if (p1->dev == p2->dev) {
796         diff = p1->block_num - p2->block_num;
797         return (int)diff;
798     } else {
799         return p1->dev - p2->dev;
800     }
801 }
802 
803 #if 0
804 // ToDo: add this to the fsh (as a background thread)?
805 static void
806 cache_flusher(void *arg, int phase)
807 {
808     int    i, num_ents, err;
809     bigtime_t now = system_time();
810     static cache_ent *ce = NULL;
811     static cache_ent *ents[NUM_FLUSH_BLOCKS];
812 
813     /*
814        if someone else was in the cache recently then just bail out so
815        we don't lock them out unnecessarily
816     */
817     if ((now - last_cache_access) < 1000000)
818         return;
819 
820     LOCK(bc.lock);
821 
822     ce = bc.normal.lru;
823 
824     for(num_ents=0; ce && num_ents < NUM_FLUSH_BLOCKS; ce=ce->next) {
825         if (ce->flags & CE_BUSY)
826             continue;
827 
828         if ((ce->flags & CE_DIRTY) == 0 && ce->clone == NULL)
829             continue;
830 
831         ents[num_ents] = ce;
832         ents[num_ents]->flags |= CE_BUSY;
833         num_ents++;
834     }
835 
836     /* if we've got some room left over, look for cloned locked blocks */
837     if (num_ents < NUM_FLUSH_BLOCKS) {
838         ce = bc.locked.lru;
839 
840         for(; num_ents < NUM_FLUSH_BLOCKS;) {
841             for(;
842                 ce && ((ce->flags & CE_BUSY) || ce->clone == NULL);
843                 ce=ce->next)
844                 /* skip ents that meet the above criteria */;
845 
846             if (ce == NULL)
847                 break;
848 
849             ents[num_ents] = ce;
850             ents[num_ents]->flags |= CE_BUSY;
851             ce = ce->next;
852             num_ents++;
853         }
854     }
855 
856     UNLOCK(bc.lock);
857 
858     if (num_ents == 0)
859         return;
860 
861     qsort(ents, num_ents, sizeof(cache_ent **), cache_ent_cmp);
862 
863     if ((err = flush_ents(ents, num_ents)) != 0) {
864         printf("flush ents failed (ents @ 0x%lx, num_ents %d!\n",
865                (ulong)ents, num_ents);
866     }
867 
868     for(i=0; i < num_ents; i++) {       /* clear the busy bit on each of ent */
869         ents[i]->flags &= ~CE_BUSY;
870     }
871 }
872 #endif
873 
874 
/*
  Write one cache entry's out-of-date data to disk.  The clone (the
  journaled copy, if any) is written first; if the real data is also
  dirty and unlocked we loop back and write it too, so the journal
  callback fires for every copy that reaches the disk.
  Returns 0 on success or the beos_write_phys_blocks() error.
*/
static int
flush_cache_ent(cache_ent *ce)
{
    int   ret = 0;
    void *data;

    /* if true, then there's nothing to flush */
    if ((ce->flags & CE_DIRTY) == 0 && ce->clone == NULL)
        return 0;

    /* same thing here */
    if (ce->clone == NULL && ce->lock != 0)
        return 0;

 restart:
	/* prefer the clone: it's the stable journaled copy */
	if (ce->clone)
		data = ce->clone;
	else
		data = ce->data;

	if (chatty_io > 2) printf("flush: %7" B_PRIdOFF "\n", ce->block_num);
	ret = beos_write_phys_blocks(ce->dev, ce->block_num, data, 1, ce->bsize);

    /* fire the journal callback now that the block has been written */
    if (ce->func) {
        ce->func(ce->logged_bnum, 1, ce->arg);
        ce->func = NULL;
    }

    if (ce->clone) {
        free(ce->clone);
        ce->clone = NULL;

        if (ce->lock == 0 && (ce->flags & CE_DIRTY))
            goto restart;     /* also write the real data ptr */
    } else {
        ce->flags &= ~CE_DIRTY;
    }

    return ret;
}
915 
916 
/*
  Flush an array of cache entries to disk, coalescing runs of blocks
  that are contiguous on the same device into single writev_pos()
  calls (up to NUM_FLUSH_BLOCKS at a time).  The caller is expected
  to have sorted ents (see cache_ent_cmp) and marked them CE_BUSY.
  Returns 0 on success or an error from the underlying writes.
*/
static int
flush_ents(cache_ent **ents, int n_ents)
{
    int    i, j, k, ret = 0, bsize, iocnt, do_again = 0;
    fs_off_t  start_bnum;
    struct iovec *iov;

    iov = get_iovec_array();
    if (iov == NULL)
        return ENOMEM;

restart:
    for(i=0; i < n_ents; i++) {
        /* if true, then there's nothing to flush */
        if ((ents[i]->flags & CE_DIRTY) == 0 && ents[i]->clone == NULL)
            continue;

        /* if true we can't touch the dirty data yet because it's locked */
        if (ents[i]->clone == NULL && ents[i]->lock != 0)
            continue;


        bsize      = ents[i]->bsize;
        start_bnum = ents[i]->block_num;

        /* extend the run: same device, consecutive block numbers, and
           each entry flushable (cloned or unlocked) */
        for(j=i+1; j < n_ents && (j - i) < NUM_FLUSH_BLOCKS; j++) {
            if (ents[j]->dev != ents[i]->dev ||
                ents[j]->block_num != start_bnum + (j - i))
                break;

            if (ents[j]->clone == NULL && ents[j]->lock != 0)
                break;
        }

        if (j == i+1) {           /* only one block, just flush it directly */
            if ((ret = flush_cache_ent(ents[i])) != 0)
                break;
            continue;
        }


        /* build the iovec for the run; each entry contributes its
           clone if present, otherwise its live data */
        for(k=i,iocnt=0; k < j; k++,iocnt++) {
            if (ents[k]->clone)
                iov[iocnt].iov_base = ents[k]->clone;
            else
                iov[iocnt].iov_base = ents[k]->data;

            iov[iocnt].iov_len = bsize;
        }

		if (chatty_io)
	        printf("writev @ %" B_PRIdOFF " for %d blocks\n",
                start_bnum, iocnt);

        ret = writev_pos(ents[i]->dev, start_bnum * (fs_off_t)bsize,
                         &iov[0], iocnt);
        if (ret != iocnt*bsize) {
            int idx;

            printf("flush_ents: writev failed: "
                "iocnt %d start bnum %" B_PRIdOFF " bsize %d, ret %d\n",
                iocnt, start_bnum, bsize, ret);

            for(idx=0; idx < iocnt; idx++)
                printf("iov[%2d] = %p :: %ld\n", idx, iov[idx].iov_base,
                       iov[idx].iov_len);

            printf("error %s writing blocks %" B_PRIdOFF ":%d (%d != %d)\n",
                   strerror(errno), start_bnum, iocnt, ret, iocnt*bsize);
            ret = EINVAL;
            break;
        }
        ret = 0;


        /* post-write bookkeeping for the whole run: fire journal
           callbacks, drop clones, clear dirty bits */
        for(k=i; k < j; k++) {
            if (ents[k]->func) {
                ents[k]->func(ents[k]->logged_bnum, 1, ents[k]->arg);
                ents[k]->func = NULL;
            }

            if (ents[k]->clone) {
                free(ents[k]->clone);
                ents[k]->clone = NULL;
            } else {
                ents[k]->flags &= ~CE_DIRTY;
            }
        }


        i = j - 1;  /* i gets incremented by the outer for loop */
    }

    /*
       here we have to go back through and flush any blocks that are
       still dirty.  with an arched brow you astutely ask, "but how
       could this happen given the above loop?"  Ahhh young grasshopper
       I say, the path through the cache is long and twisty and fraught
       with peril.  The reason it can happen is that a block can be both
       cloned and dirty.  The above loop would only flush the cloned half
       of the data, not the main dirty block.  So we have to go back
       through and see if there are any blocks that are still dirty.  If
       there are we go back to the top of the function and do the whole
       thing over.  Kind of grody but it is necessary to insure the
       correctness of the log for the Be file system.
    */
    if (do_again == 0) {
        for(i=0; i < n_ents; i++) {
            if ((ents[i]->flags & CE_DIRTY) == 0 || ents[i]->lock)
                continue;

            do_again = 1;
            break;
        }

        if (do_again)
            goto restart;
    }

    release_iovec_array(iov);


    return ret;
}
1041 
/*
  Shutdown helper: flush and free every cache_ent on the given list,
  removing each one from the hash table as it goes.  Locked or busy
  blocks are reported but torn down anyway -- this only runs when the
  whole cache is being destroyed.
*/
static void
delete_cache_list(cache_ent_list *cel)
{
    void      *junk;
    cache_ent *ce, *next;

    for (ce = cel->lru; ce; ce = next) {
        next = ce->next;
        if (ce->lock != 0) {
            if (ce->func)
                printf("*** shutdown_block_cache: "
                    "block %" B_PRIdOFF ", lock == %d (arg %p)!\n",
                    ce->block_num, ce->lock, ce->arg);
            else
                printf("*** shutdown_block_cache: "
                    "block %" B_PRIdOFF ", lock == %d!\n",
                    ce->block_num, ce->lock);
        }

        if (ce->flags & CE_BUSY) {
            printf("* shutdown block cache: "
                "bnum %" B_PRIdOFF " is busy? ce %p\n", ce->block_num, ce);
        }

        /* write dirty/cloned data back before the memory disappears */
        if ((ce->flags & CE_DIRTY) || ce->clone) {
            flush_cache_ent(ce);
        }

        if (ce->clone)
            free(ce->clone);
        ce->clone = NULL;

        if (ce->data)
            free(ce->data);
        ce->data = NULL;

        if ((junk = hash_delete(&bc.ht, ce->dev, ce->block_num)) != ce) {
            printf("*** free_device_cache: "
                "bad hash table entry %" B_PRIdOFF " %p != %p\n",
                ce->block_num, junk, ce);
        }

        /* scrub the entry so a stale pointer can't look like live data */
        explicit_bzero(ce, sizeof(*ce));
        free(ce);

        bc.cur_blocks--;
    }
}
1090 
1091 
1092 void
1093 beos_shutdown_block_cache(void)
1094 {
1095     /* print_hash_stats(&bc.ht); */
1096 
1097     if (bc.lock.s > 0)
1098         LOCK(bc.lock);
1099 
1100 #ifndef USER
1101     unregister_kernel_daemon(cache_flusher, NULL);
1102 #endif
1103 
1104     delete_cache_list(&bc.normal);
1105     delete_cache_list(&bc.locked);
1106 
1107     bc.normal.lru = bc.normal.mru = NULL;
1108     bc.locked.lru = bc.locked.mru = NULL;
1109 
1110     shutdown_hash_table(&bc.ht);
1111 
1112     if (bc.lock.s > 0)
1113         beos_free_lock(&bc.lock);
1114     bc.lock.s = -1;
1115 
1116     if (iovec_lock.s >= 0)
1117         beos_free_lock(&iovec_lock);
1118 }
1119 
1120 
1121 
1122 int
1123 beos_init_cache_for_device(int fd, fs_off_t max_blocks)
1124 {
1125     int ret = 0;
1126 
1127     if (fd >= MAX_DEVICES)
1128         return -1;
1129 
1130     LOCK(bc.lock);
1131 
1132     if (max_device_blocks[fd] != 0) {
1133         printf("device %d is already initialized!\n", fd);
1134         ret = -1;
1135     } else {
1136         max_device_blocks[fd] = max_blocks;
1137     }
1138 
1139     UNLOCK(bc.lock);
1140 
1141     return ret;
1142 }
1143 
1144 
1145 
1146 /*
1147    this routine assumes that bc.lock has been acquired
1148 */
1149 static cache_ent *
1150 block_lookup(int dev, fs_off_t bnum)
1151 {
1152     int        count = 0;
1153     cache_ent *ce;
1154 
1155     while (1) {
1156         ce = hash_lookup(&bc.ht, dev, bnum);
1157         if (ce == NULL)
1158             return NULL;
1159 
1160         if ((ce->flags & CE_BUSY) == 0) /* it's ok, break out and return it */
1161             break;
1162 
1163         /* else, it's busy and we need to retry our lookup */
1164         UNLOCK(bc.lock);
1165 
1166         snooze(5000);
1167         if (count++ == 5000) {  /* then a lot of time has elapsed */
1168             printf("block %" B_PRIdOFF " isn't coming un-busy (ce @ %p)\n",
1169                    ce->block_num, ce);
1170         }
1171 
1172         LOCK(bc.lock);
1173     }
1174 
1175     if (ce->flags & CE_BUSY)
1176         beos_panic("block lookup: returning a busy block @ %p?!?\n", ce);
1177 
1178     return ce;
1179 }
1180 
1181 
1182 
1183 int
1184 beos_set_blocks_info(int dev, fs_off_t *blocks, int nblocks,
1185                void (*func)(fs_off_t bnum, size_t nblocks, void *arg), void *arg)
1186 {
1187     int        i, j, cur;
1188     cache_ent *ce;
1189     cache_ent *ents[NUM_FLUSH_BLOCKS];
1190 
1191     LOCK(bc.lock);
1192 
1193 
1194     for(i=0, cur=0; i < nblocks; i++) {
1195 
1196         /* printf("sbi:   %ld (arg 0x%x)\n", blocks[i], arg); */
1197         ce = block_lookup(dev, blocks[i]);
1198         if (ce == NULL) {
1199             beos_panic("*** set_block_info can't find bnum %ld!\n", blocks[i]);
1200             UNLOCK(bc.lock);
1201             return ENOENT;   /* hopefully this doesn't happen... */
1202         }
1203 
1204 
1205         if (blocks[i] != ce->block_num || dev != ce->dev) {
1206             UNLOCK(bc.lock);
1207             beos_panic("** error1: looked up dev %d block %ld but found dev %d "
1208                     "bnum %ld\n", dev, blocks[i], ce->dev, ce->block_num);
1209             return EBADF;
1210         }
1211 
1212         if (ce->lock == 0) {
1213             beos_panic("* set_block_info on bnum %ld (%d) but it's not locked!\n",
1214                     blocks[i], nblocks);
1215         }
1216 
1217 
1218         if ((ce->flags & CE_DIRTY) == 0) {
1219             beos_panic("*** set_block_info on non-dirty block bnum %ld (%d)!\n",
1220                     blocks[i], nblocks);
1221         }
1222 
1223         ce->flags |= CE_BUSY;     /* mark all blocks as busy till we're done */
1224 
1225         /* if there is cloned data, it needs to be flushed now */
1226         if (ce->clone && ce->func) {
1227             ents[cur++] = ce;
1228 
1229             if (cur >= NUM_FLUSH_BLOCKS) {
1230                 UNLOCK(bc.lock);
1231 
1232                 qsort(ents, cur, sizeof(cache_ent **), cache_ent_cmp);
1233 
1234                 flush_ents(ents, cur);
1235 
1236                 LOCK(bc.lock);
1237                 for(j=0; j < cur; j++)
1238                     ents[j]->flags &= ~CE_BUSY;
1239                 cur = 0;
1240             }
1241         }
1242     }
1243 
1244 
1245     if (cur != 0) {
1246         UNLOCK(bc.lock);
1247 
1248         qsort(ents, cur, sizeof(cache_ent **), cache_ent_cmp);
1249 
1250         flush_ents(ents, cur);
1251 
1252         LOCK(bc.lock);
1253         for(j=0; j < cur; j++)
1254             ents[j]->flags &= ~CE_BUSY;
1255         cur = 0;
1256     }
1257 
1258 
1259     /* now go through and set the info that we were asked to */
1260     for (i = 0; i < nblocks; i++) {
1261         /* we can call hash_lookup() here because we know it's around */
1262         ce = hash_lookup(&bc.ht, dev, blocks[i]);
1263         if (ce == NULL) {
1264             beos_panic("*** set_block_info can't find bnum %lld!\n", blocks[i]);
1265             UNLOCK(bc.lock);
1266             return ENOENT;   /* hopefully this doesn't happen... */
1267         }
1268 
1269         ce->flags &= ~(CE_DIRTY | CE_BUSY);
1270 
1271         if (ce->func != NULL) {
1272             beos_panic("*** set_block_info non-null callback on bnum %lld\n",
1273                     ce->block_num);
1274         }
1275 
1276         if (ce->clone != NULL) {
1277             beos_panic("*** ce->clone == %p, not NULL in set_block_info\n",
1278                     ce->clone);
1279         }
1280 
1281         ce->clone = (void *)malloc(ce->bsize);
1282         if (ce->clone == NULL)
1283             beos_panic("*** can't clone bnum %lld (bsize %d)\n",
1284                     ce->block_num, ce->bsize);
1285 
1286 
1287         memcpy(ce->clone, ce->data, ce->bsize);
1288 
1289         ce->func   = func;
1290         ce->arg    = arg;
1291 
1292         ce->logged_bnum = blocks[i];
1293 
1294         ce->lock--;
1295         if (ce->lock < 0) {
1296             printf("sbi: "
1297                 "whoa nellie! ce @ %p (%" B_PRIdOFF ") has lock == %d\n",
1298                    ce, ce->block_num, ce->lock);
1299         }
1300 
1301         if (ce->lock == 0) {
1302             delete_from_list(&bc.locked, ce);
1303             add_to_head(&bc.normal, ce);
1304         }
1305     }
1306 
1307     UNLOCK(bc.lock);
1308 
1309     return 0;
1310 }
1311 
1312 
1313 /* this function is only for use by flush_device() */
1314 static void
1315 do_flush(cache_ent **ents, int max)
1316 {
1317     int i;
1318 
1319     for(i=0; i < max; i++) {
1320         ents[i]->flags |= CE_BUSY;
1321     }
1322 
1323     UNLOCK(bc.lock);
1324 
1325     qsort(ents, max, sizeof(cache_ent **), cache_ent_cmp);
1326     flush_ents(ents, max);
1327 
1328     LOCK(bc.lock);
1329     for(i=0; i < max; i++) {
1330         ents[i]->flags &= ~CE_BUSY;
1331     }
1332 }
1333 
1334 int
1335 beos_flush_device(int dev, int warn_locked)
1336 {
1337     int cur;
1338     cache_ent *ce;
1339     cache_ent *ents[NUM_FLUSH_BLOCKS];
1340 
1341     LOCK(bc.lock);
1342 
1343     cur = 0;
1344     ce = bc.normal.lru;
1345     while (ce) {
1346         if (ce->dev != dev || (ce->flags & CE_BUSY)) {
1347             ce = ce->next;
1348             continue;
1349         }
1350 
1351         if ((ce->flags & CE_DIRTY) || ce->clone) {
1352             ents[cur++] = ce;
1353             if (cur >= NUM_FLUSH_BLOCKS) {
1354                 do_flush(ents, cur);
1355 
1356                 ce = bc.normal.lru;
1357                 cur = 0;
1358                 continue;
1359             }
1360         }
1361 
1362         ce = ce->next;
1363     }
1364 
1365     if (cur != 0)
1366         do_flush(ents, cur);
1367 
1368     cur = 0;
1369     ce = bc.locked.lru;
1370     while (ce) {
1371         if (ce->dev != dev || (ce->flags & CE_BUSY)) {
1372             ce = ce->next;
1373             continue;
1374         }
1375 
1376         if (ce->clone) {
1377             ents[cur++] = ce;
1378             if (cur >= NUM_FLUSH_BLOCKS) {
1379                 do_flush(ents, cur);
1380 
1381                 ce = bc.locked.lru;
1382                 cur = 0;
1383                 continue;
1384             }
1385         }
1386 
1387         ce = ce->next;
1388     }
1389 
1390     if (cur != 0)
1391         do_flush(ents, cur);
1392 
1393     UNLOCK(bc.lock);
1394 
1395     return 0;
1396 }
1397 
1398 
1399 static void
1400 real_remove_cached_blocks(int dev, int allow_writes, cache_ent_list *cel)
1401 {
1402     void      *junk;
1403     cache_ent *ce, *next = NULL;
1404 
1405     for(ce=cel->lru; ce; ce=next) {
1406         next = ce->next;
1407 
1408         if (ce->dev != dev) {
1409             continue;
1410         }
1411 
1412         if (ce->lock != 0 || (ce->flags & CE_BUSY)) {
1413             printf("*** remove_cached_dev: "
1414                 "block %" B_PRIdOFF " has lock = %d, flags 0x%x! ce @ %p\n",
1415                 ce->block_num, ce->lock, ce->flags, ce);
1416         }
1417 
1418         if (allow_writes == ALLOW_WRITES &&
1419             ((ce->flags & CE_DIRTY) || ce->clone)) {
1420             ce->flags |= CE_BUSY;
1421             flush_cache_ent(ce);
1422             ce->flags &= ~CE_BUSY;
1423         }
1424 
1425         /* unlink this guy */
1426         if (cel->lru == ce)
1427             cel->lru = ce->next;
1428 
1429         if (cel->mru == ce)
1430             cel->mru = ce->prev;
1431 
1432         if (ce->prev)
1433             ce->prev->next = ce->next;
1434         if (ce->next)
1435             ce->next->prev = ce->prev;
1436 
1437         if (ce->clone)
1438             free(ce->clone);
1439         ce->clone = NULL;
1440 
1441         if (ce->data)
1442             free(ce->data);
1443         ce->data = NULL;
1444 
1445         if ((junk = hash_delete(&bc.ht, ce->dev, ce->block_num)) != ce) {
1446             beos_panic("*** remove_cached_device: bad hash table entry %ld "
1447                    "%p != %p\n", ce->block_num, junk, ce);
1448         }
1449 
1450         free(ce);
1451 
1452         bc.cur_blocks--;
1453     }
1454 
1455 }
1456 
1457 int
1458 beos_remove_cached_device_blocks(int dev, int allow_writes)
1459 {
1460     LOCK(bc.lock);
1461 
1462     real_remove_cached_blocks(dev, allow_writes, &bc.normal);
1463     real_remove_cached_blocks(dev, allow_writes, &bc.locked);
1464 
1465     max_device_blocks[dev] = 0;
1466 
1467     UNLOCK(bc.lock);
1468 
1469     return 0;
1470 }
1471 
1472 
1473 
1474 int
1475 beos_flush_blocks(int dev, fs_off_t bnum, int nblocks)
1476 {
1477     int        cur, i;
1478     cache_ent *ce;
1479     cache_ent *ents[NUM_FLUSH_BLOCKS];
1480 
1481     if (nblocks == 0)   /* might as well check for this */
1482         return 0;
1483 
1484     LOCK(bc.lock);
1485 
1486     cur = 0;
1487     for(; nblocks > 0; nblocks--, bnum++) {
1488         ce = block_lookup(dev, bnum);
1489         if (ce == NULL)
1490             continue;
1491 
1492         if (bnum != ce->block_num || dev != ce->dev) {
1493             UNLOCK(bc.lock);
1494             beos_panic("error2: looked up dev %d block %ld but found %d %ld\n",
1495                   dev, bnum, ce->dev, ce->block_num);
1496             return EBADF;
1497         }
1498 
1499         if ((ce->flags & CE_DIRTY) == 0 && ce->clone == NULL)
1500             continue;
1501 
1502         ce->flags |= CE_BUSY;
1503         ents[cur++] = ce;
1504 
1505         if (cur >= NUM_FLUSH_BLOCKS) {
1506             UNLOCK(bc.lock);
1507 
1508             qsort(ents, cur, sizeof(cache_ent **), cache_ent_cmp);
1509             flush_ents(ents, cur);
1510 
1511             LOCK(bc.lock);
1512             for(i=0; i < cur; i++) {
1513                 ents[i]->flags &= ~CE_BUSY;
1514             }
1515             cur = 0;
1516         }
1517     }
1518 
1519     UNLOCK(bc.lock);
1520 
1521     if (cur == 0)     /* nothing more to do */
1522         return 0;
1523 
1524     /* flush out the last few buggers */
1525     qsort(ents, cur, sizeof(cache_ent **), cache_ent_cmp);
1526     flush_ents(ents, cur);
1527 
1528     for(i=0; i < cur; i++) {
1529         ents[i]->flags &= ~CE_BUSY;
1530     }
1531 
1532     return 0;
1533 }
1534 
1535 
1536 int
1537 beos_mark_blocks_dirty(int dev, fs_off_t bnum, int nblocks)
1538 {
1539     int        ret = 0;
1540     cache_ent *ce;
1541 
1542     LOCK(bc.lock);
1543 
1544     while(nblocks > 0) {
1545         ce = block_lookup(dev, bnum);
1546         if (ce) {
1547             ce->flags |= CE_DIRTY;
1548             bnum      += 1;
1549             nblocks   -= 1;
1550         } else {     /* hmmm, that's odd, didn't find it */
1551             printf("** mark_blocks_diry couldn't find block %" B_PRIdOFF " "
1552                 "(len %d)\n", bnum, nblocks);
1553             ret = ENOENT;
1554             break;
1555         }
1556     }
1557 
1558     UNLOCK(bc.lock);
1559 
1560     return ret;
1561 }
1562 
1563 
1564 
1565 int
1566 beos_release_block(int dev, fs_off_t bnum)
1567 {
1568     cache_ent *ce;
1569 
1570     /* printf("rlsb: %ld\n", bnum); */
1571     LOCK(bc.lock);
1572 
1573     ce = block_lookup(dev, bnum);
1574     if (ce) {
1575         if (bnum != ce->block_num || dev != ce->dev) {
1576             beos_panic("*** error3: looked up dev %d block %ld but found %d %ld\n",
1577                     dev, bnum, ce->dev, ce->block_num);
1578             UNLOCK(bc.lock);
1579             return EBADF;
1580         }
1581 
1582         ce->lock--;
1583 
1584         if (ce->lock < 0) {
1585             printf("rlsb: whoa nellie! ce %" B_PRIdOFF " has lock == %d\n",
1586                    ce->block_num, ce->lock);
1587         }
1588 
1589         if (ce->lock == 0) {
1590             delete_from_list(&bc.locked, ce);
1591             add_to_head(&bc.normal, ce);
1592         }
1593 
1594     } else {     /* hmmm, that's odd, didn't find it */
1595         beos_panic("** release_block asked to find %ld but it's not here\n",
1596                bnum);
1597     }
1598 
1599     UNLOCK(bc.lock);
1600 
1601     return 0;
1602 }
1603 
1604 
1605 static cache_ent *
1606 new_cache_ent(int bsize)
1607 {
1608     cache_ent *ce;
1609 
1610     ce = (cache_ent *)calloc(1, sizeof(cache_ent));
1611     if (ce == NULL) {
1612         beos_panic("*** error: cache can't allocate memory!\n");
1613         return NULL;
1614     }
1615 
1616     ce->data = malloc(bsize);
1617     if (ce->data == NULL) {
1618         free(ce);
1619         beos_panic("** error cache can't allocate data memory\n");
1620         UNLOCK(bc.lock);
1621         return NULL;
1622     }
1623 
1624     ce->dev       = -1;
1625     ce->block_num = -1;
1626 
1627     return ce;
1628 }
1629 
1630 
/*
   Acquire `num_needed' cache entries for the caller, reporting the
   number actually obtained (up to `max') through `num_gotten'.  While
   the cache is below its size limit new entries are allocated; after
   that, victims are taken from the LRU end of the normal list.  Every
   returned entry is marked CE_BUSY.  Dirty victims are NOT flushed
   here -- that is the caller's job.  Assumes bc.lock is held; it is
   dropped and re-acquired while waiting for busy blocks to free up.
*/
static void
get_ents(cache_ent **ents, int num_needed, int max, int *num_gotten, int bsize)
{
    int        cur, retry_counter = 0, max_retry = num_needed * 256;
    cache_ent *ce;

    if (num_needed > max)
        beos_panic("get_ents: num_needed %d but max %d (doh!)\n", num_needed, max);

    /* if the cache isn't full yet, just allocate the blocks */
    for(cur=0; bc.cur_blocks < bc.max_blocks && cur < num_needed; cur++) {
        ents[cur] = new_cache_ent(bsize);
        if (ents[cur] == NULL)
            break;
        bc.cur_blocks++;
    }

    /* pluck off blocks from the LRU end of the normal list, keep trying too */
    while(cur < num_needed && retry_counter < max_retry) {
        for(ce=bc.normal.lru; ce && cur < num_needed; ce=ce->next) {
            /* the normal list must never hold locked blocks */
            if (ce->lock)
                beos_panic("get_ents: normal list has locked blocks (ce %p)\n",ce);

            if (ce->flags & CE_BUSY)   /* don't touch busy blocks */
                continue;

            ce->flags   |= CE_BUSY;
            ents[cur++]  = ce;
        }

        /* not enough yet: drop the lock and give other threads a
           chance to un-busy some blocks before the next scan */
        if (cur < num_needed) {
            UNLOCK(bc.lock);
            snooze(10000);
            LOCK(bc.lock);
            retry_counter++;
        }
    }

    if (cur < num_needed && retry_counter >= max_retry) {  /* oh shit! */
        dump_cache_list();
        UNLOCK(bc.lock);
        beos_panic("get_ents: waited too long; can't get enough ce's (c %d n %d)\n",
              cur, num_needed);
    }

    /*
      If the last block is a dirty one, try to get more of 'em so
      that we can flush a bunch of blocks at once.
    */
    if (cur && cur < max &&
        ((ents[cur-1]->flags & CE_DIRTY) || ents[cur-1]->clone)) {

        for(ce=ents[cur-1]->next; ce && cur < max; ce=ce->next) {
            if (ce->flags & CE_BUSY)   /* don't touch busy blocks */
                continue;

            if (ce->lock)
                beos_panic("get_ents:2 dirty list has locked blocks (ce %p)\n",ce);

            ce->flags   |= CE_BUSY;
            ents[cur++]  = ce;
        }
    }

    *num_gotten = cur;
}
1697 
1698 
1699 static int
1700 read_into_ents(int dev, fs_off_t bnum, cache_ent **ents, int num, int bsize)
1701 {
1702     int    i, ret;
1703     struct iovec *iov;
1704 
1705     iov = get_iovec_array();
1706 
1707     for (i = 0; i < num; i++) {
1708         iov[i].iov_base = ents[i]->data;
1709         iov[i].iov_len  = bsize;
1710     }
1711 
1712 	if (chatty_io > 2)
1713 		printf("readv @ %" B_PRIdOFF " for %d blocks "
1714 			"(at %" B_PRIdOFF ", block_size = %d)\n",
1715 			bnum, num, bnum*bsize, bsize);
1716     ret = readv_pos(dev, bnum*bsize, iov, num);
1717 
1718     release_iovec_array(iov);
1719 
1720     if (ret != num*bsize) {
1721         printf("read_into_ents: asked to read %d bytes but got %d\n",
1722                num*bsize, ret);
1723         printf("*** iov @ %p (num %d)\n", iov, num);
1724         return EINVAL;
1725     } else
1726         return 0;
1727 }
1728 
1729 
1730 
#define CACHE_READ          0x0001
#define CACHE_WRITE         0x0002
#define CACHE_NOOP          0x0004     /* for getting empty blocks */
#define CACHE_LOCKED        0x0008
#define CACHE_READ_AHEAD_OK 0x0010     /* it's ok to do read-ahead */


/*
   Render a cache op bitmask as a human-readable string for debug
   output.  Returns a pointer to a static buffer, so the result is
   only valid until the next call (and this is not thread-safe).
*/
static char *
op_to_str(int op)
{
    static char buff[128];

    /* reset the buffer: it is static, so without this a previous
       call's text would leak into the result whenever `op' contains
       none of the base READ/WRITE/NOOP bits */
    buff[0] = '\0';

    if (op & CACHE_READ)
        strcpy(buff, "READ");
    else if (op & CACHE_WRITE)
        strcpy(buff, "WRITE");
    else if (op & CACHE_NOOP)
        strcpy(buff, "NOP");

    if (op & CACHE_LOCKED)
        strcat(buff, " LOCKED");

    if (op & CACHE_READ_AHEAD_OK)
        strcat(buff, " (AHEAD)");

    return buff;
}
1758 
1759 static int
1760 cache_block_io(int dev, fs_off_t bnum, void *data, fs_off_t num_blocks, int bsize,
1761                int op, void **dataptr)
1762 {
1763     size_t          err = 0;
1764     cache_ent      *ce;
1765     cache_ent_list *cel;
1766 
1767     if (chatty_io > 1)
1768         printf("cbio: bnum = %" B_PRIdOFF ", num_blocks = %" B_PRIdOFF ", "
1769             "bsize = %d, op = %s\n", bnum, num_blocks, bsize, op_to_str(op));
1770 
1771     /* some sanity checks first */
1772     if (bsize == 0)
1773         beos_panic("cache_io: block size == 0 for bnum %lld?!?\n", bnum);
1774 
1775     if (num_blocks == 0)
1776         beos_panic("cache_io: bnum %lld has num_blocks == 0!\n", bnum);
1777 
1778     if (data == NULL && dataptr == NULL) {
1779         printf("major butthead move: "
1780             "null data and dataptr! bnum %" B_PRIdOFF ":%" B_PRIdOFF "\n",
1781             bnum, num_blocks);
1782         return ENOMEM;
1783     }
1784 
1785     if (data == NULL) {
1786         if (num_blocks != 1)    /* get_block() should never do that */
1787             beos_panic("cache_io: num_blocks %lld but should be 1\n",
1788                   num_blocks);
1789 
1790         if (op & CACHE_WRITE)
1791             beos_panic("cache_io: get_block() asked to write?!?\n");
1792     }
1793 
1794     if (bnum + num_blocks > max_device_blocks[dev]) {
1795         printf("dev %d: "
1796             "access to blocks %" B_PRIdOFF ":%" B_PRIdOFF " "
1797             "but max_dev_blocks is %" B_PRIdOFF "\n",
1798             dev, bnum, num_blocks, max_device_blocks[dev]);
1799 
1800 		// let the app crash here
1801 		*(int *)0x3100 = 0xc0debabe;
1802         return EINVAL;
1803     }
1804 
1805     last_cache_access = system_time();
1806 
1807     /* if the i/o is greater than 64k, do it directly */
1808     if (num_blocks * bsize >= 64 * 1024) {
1809         char  *ptr;
1810         fs_off_t  tmp;
1811 
1812         if (data == NULL || (op & CACHE_LOCKED)) {
1813             beos_panic("*** asked to do a large locked io that's too hard!\n");
1814         }
1815 
1816 
1817         if (op & CACHE_READ) {
1818             if (beos_read_phys_blocks(dev, bnum, data, num_blocks, bsize) != 0) {
1819                 printf("cache read:read_phys_blocks failed "
1820                     "(%s on blocks %" B_PRIdOFF ":%" B_PRIdOFF ")!\n",
1821                      strerror(errno), bnum, num_blocks);
1822                 return EINVAL;
1823             }
1824 
1825             LOCK(bc.lock);
1826 
1827             /* if any of the blocks are in the cache, grab them instead */
1828             ptr = data;
1829             for(tmp=bnum; tmp < bnum+num_blocks; tmp++, ptr+=bsize) {
1830                 ce = block_lookup(dev, tmp);
1831                 /*
1832                     if we find a block in the cache we have to copy its
1833                     data just in case it is more recent than what we just
1834                     read from disk (which could happen if someone wrote
1835                     these blocks after we did the read but before we locked
1836                     the cache and entered this loop).
1837                 */
1838                 if (ce) {
1839                     if (tmp != ce->block_num || dev != ce->dev) {
1840                         UNLOCK(bc.lock);
1841                         beos_panic("*** error4: looked up dev %d block %lld but "
1842                                 "found %d %lld\n", dev, tmp, ce->dev,
1843                                 ce->block_num);
1844                     }
1845 
1846                     memcpy(ptr, ce->data, bsize);
1847                 }
1848             }
1849 
1850             UNLOCK(bc.lock);
1851         } else if (op & CACHE_WRITE) {
1852             LOCK(bc.lock);
1853 
1854             /* if any of the blocks are in the cache, update them too */
1855             ptr = data;
1856             for(tmp=bnum; tmp < bnum+num_blocks; tmp++, ptr+=bsize) {
1857                 ce = block_lookup(dev, tmp);
1858                 if (ce) {
1859                     if (tmp != ce->block_num || dev != ce->dev) {
1860                         UNLOCK(bc.lock);
1861                         beos_panic("*** error5: looked up dev %d block %lld but "
1862                                 "found %d %lld\n", dev, tmp, ce->dev,
1863                                 ce->block_num);
1864                         return EBADF;
1865                     }
1866 
1867                     /* XXXdbg -- this isn't strictly necessary */
1868                     if (ce->clone) {
1869                         printf("over-writing cloned data "
1870                             "(ce %p bnum %" B_PRIdOFF ")...\n", ce, tmp);
1871                         flush_cache_ent(ce);
1872                     }
1873 
1874                     /* copy the data into the cache */
1875                     memcpy(ce->data, ptr, bsize);
1876                 }
1877             }
1878 
1879             UNLOCK(bc.lock);
1880 
1881             if (beos_write_phys_blocks(dev, bnum, data, num_blocks, bsize) != 0) {
1882                 printf("cache write: write_phys_blocks failed (%s on blocks "
1883                     "%" B_PRIdOFF ":%" B_PRIdOFF ")!\n",
1884                     strerror(errno), bnum, num_blocks);
1885                 return EINVAL;
1886             }
1887         } else {
1888             printf("bad cache op %d "
1889                 "(bnum %" B_PRIdOFF " nblocks %" B_PRIdOFF ")\n",
1890                 op, bnum, num_blocks);
1891             return EINVAL;
1892         }
1893 
1894         return 0;
1895     }
1896 
1897 
1898     LOCK(bc.lock);
1899     while(num_blocks) {
1900 
1901         ce = block_lookup(dev, bnum);
1902         if (ce) {
1903             if (bnum != ce->block_num || dev != ce->dev) {
1904                 UNLOCK(bc.lock);
1905                 beos_panic("*** error6: looked up dev %d block %ld but found "
1906                         "%d %ld\n", dev, bnum, ce->dev, ce->block_num);
1907                 return EBADF;
1908             }
1909 
1910             if (bsize != ce->bsize) {
1911                 beos_panic("*** requested bsize %d but ce->bsize %d ce @ %p\n",
1912                         bsize, ce->bsize, ce);
1913             }
1914 
1915             /* delete this ent from the list it is in because it may change */
1916             if (ce->lock)
1917                 cel = &bc.locked;
1918             else
1919                 cel = &bc.normal;
1920 
1921             delete_from_list(cel, ce);
1922 
1923             if (op & CACHE_READ) {
1924                 if (data && data != ce->data) {
1925                     memcpy(data, ce->data, bsize);
1926                 } else if (dataptr) {
1927                     *dataptr = ce->data;
1928                 } else {
1929                     printf("cbio:data %p dptr %p ce @ %p ce->data %p\n",
1930                            data, dataptr, ce, ce->data);
1931                 }
1932             } else if (op & CACHE_WRITE) {
1933                 if (data && data != ce->data)
1934                     memcpy(ce->data, data, bsize);
1935 
1936                 ce->flags |= CE_DIRTY;
1937             } else if (op & CACHE_NOOP) {
1938                 memset(ce->data, 0, bsize);
1939                 if (data)
1940                     memset(data, 0, bsize);
1941 
1942                 if (dataptr)
1943                     *dataptr = ce->data;
1944 
1945                 ce->flags |= CE_DIRTY;
1946             } else {
1947                 beos_panic("cached_block_io: bogus op %d\n", op);
1948             }
1949 
1950             if (op & CACHE_LOCKED)
1951                 ce->lock++;
1952 
1953             if (ce->lock)
1954                 cel = &bc.locked;
1955             else
1956                 cel = &bc.normal;
1957 
1958             /* now put this ent at the head of the appropriate list */
1959             add_to_head(cel, ce);
1960 
1961             if (data != NULL)
1962                 data = (void *)((char *)data + bsize);
1963 
1964             bnum       += 1;
1965             num_blocks -= 1;
1966 
1967             continue;
1968         } else {                                  /* it's not in the cache */
1969             int        cur, cur_nblocks, num_dirty, real_nblocks, num_needed;
1970             cache_ent *ents[NUM_FLUSH_BLOCKS];
1971 
1972             /*
1973                here we find out how many additional blocks in this request
1974                are not in the cache.  the idea is that then we can do one
1975                big i/o on that many blocks at once.
1976             */
1977             for(cur_nblocks=1;
1978                 cur_nblocks < num_blocks && cur_nblocks < NUM_FLUSH_BLOCKS;
1979                 cur_nblocks++) {
1980 
1981                 /* we can call hash_lookup() directly instead of
1982                    block_lookup() because we don't care about the
1983                    state of the busy bit of the block at this point
1984                 */
1985                 if (hash_lookup(&bc.ht, dev, bnum + cur_nblocks))
1986                     break;
1987             }
1988 
1989             /*
1990               here we try to figure out how many extra blocks we should read
1991               for read-ahead.  we want to read as many as possible that are
1992               not already in the cache and that don't cause us to try and
1993               read beyond the end of the disk.
1994             */
1995             if ((op & CACHE_READ) && (op & CACHE_READ_AHEAD_OK) &&
1996                 (cur_nblocks * bsize) < read_ahead_size) {
1997 
1998                 for(num_needed=cur_nblocks;
1999                     num_needed < (read_ahead_size / bsize);
2000                     num_needed++) {
2001 
2002                     if ((bnum + num_needed) >= max_device_blocks[dev])
2003                         break;
2004 
2005                     if (hash_lookup(&bc.ht, dev, bnum + num_needed))
2006                         break;
2007                 }
2008             } else {
2009                 num_needed = cur_nblocks;
2010             }
2011 
2012             /* this will get us pointers to a bunch of cache_ents we can use */
2013             get_ents(ents, num_needed, NUM_FLUSH_BLOCKS, &real_nblocks, bsize);
2014 
2015             if (real_nblocks < num_needed) {
2016                 beos_panic("don't have enough cache ents "
2017                      "(need %d got %d %ld::%" B_PRIdOFF ")\n",
2018                       num_needed, real_nblocks, bnum, num_blocks);
2019             }
2020 
2021             /*
2022               There are now three variables used as limits within the ents
2023               array.  This is how they are related:
2024 
2025                  cur_nblocks <= num_needed <= real_nblocks
2026 
2027               Ents from 0 to cur_nblocks-1 are going to be used to fulfill
2028               this IO request.  Ents from cur_nblocks to num_needed-1 are
2029               for read-ahead.  Ents from num_needed to real_nblocks are
2030               extra blocks that get_ents() asked us to flush.  Often (and
2031               always on writes) cur_nblocks == num_needed.
2032 
2033               Below, we sort the list of ents so that when we flush them
2034               they go out in order.
2035             */
2036 
2037             qsort(ents, real_nblocks, sizeof(cache_ent **), cache_ent_cmp);
2038 
2039             /*
2040               delete each ent from its list because it will change.  also
2041               count up how many dirty blocks there are and insert into the
2042               hash table any new blocks so that no one else will try to
2043               read them in when we release the cache semaphore to do our I/O.
2044             */
2045             for(cur=0,num_dirty=0; cur < real_nblocks; cur++) {
2046                 ce = ents[cur];
2047                 ce->flags |= CE_BUSY;
2048 
2049                 /*
2050                    insert the new block into the hash table with its new block
2051                    number. note that the block is still in the hash table for
2052                    its old block number -- and it has to be until we are done
2053                    flushing it from the cache (to prevent someone else from
2054                    sneaking in in front of us and trying to read the same
2055                    block that we're flushing).
2056                 */
2057                 if (cur < num_needed) {
2058                     if (hash_insert(&bc.ht, dev, bnum + cur, ce) != 0)
2059                         beos_panic("could not insert cache ent for %d %ld (%p)\n",
2060                               dev, bnum + cur, ents[cur]);
2061                 }
2062 
2063                 if (ce->dev == -1)
2064                     continue;
2065 
2066                 if ((ce->flags & CE_DIRTY) || ce->clone)
2067                     num_dirty++;
2068 
2069                 if (ce->lock) {
2070                     beos_panic("cbio: can't use locked blocks here ce @ %p\n",ce);
2071                 } else {
2072                     cel = &bc.normal;
2073 	                delete_from_list(cel, ce);
2074 				}
2075             }
2076             ce = NULL;
2077 
2078 
2079             /*
2080                we release the block cache semaphore here so that we can
2081                go do all the i/o we need to do (flushing dirty blocks
2082                that we're kicking out as well as reading any new data).
2083 
2084                because all the blocks we're touching are marked busy
2085                no one else should mess with them while we're doing this.
2086             */
2087             if (num_dirty || (op & CACHE_READ)) {
2088                 UNLOCK(bc.lock);
2089 
2090                 /* this flushes any blocks we're kicking out that are dirty */
2091                 if (num_dirty && (err = flush_ents(ents, real_nblocks)) != 0) {
2092                     printf("flush ents failed (ents @ %p, nblocks %d!\n",
2093                            ents, cur_nblocks);
2094                     goto handle_err;
2095                 }
2096 
2097             }
2098 
2099             /*
2100                now that everything is flushed to disk, go through and
2101                make sure that the data blocks we're going to use are
2102                the right block size for this current request (it's
2103                possible we're kicking out some smaller blocks and need
2104                to reallocate the data block pointer). We do this in two
2105                steps, first free'ing everything and then going through
2106                and doing the malloc's to try and be nice to the memory
2107                system (i.e. allow it to coalesce stuff, etc).
2108             */
2109             err = 0;
2110             for(cur=0; cur < num_needed; cur++) {
2111                 if (ents[cur]->bsize != bsize) {
2112                     free(ents[cur]->data);
2113                     ents[cur]->data = NULL;
2114 
2115                     if (ents[cur]->clone) {
2116                         free(ents[cur]->clone);
2117                         ents[cur]->clone = NULL;
2118                     }
2119                 }
2120             }
2121 
2122             for(cur=0; cur < num_needed; cur++) {
2123                 if (ents[cur]->data == NULL) {
2124                     ents[cur]->data  = (void *)malloc(bsize);
2125                     ents[cur]->bsize = bsize;
2126                 }
2127 
2128                 if (ents[cur]->data == NULL) {
2129                     printf("cache: no memory for block (bsize %d)!\n",
2130                            bsize);
2131                     err = ENOMEM;
2132                     break;
2133                 }
2134             }
2135 
2136             /*
2137                if this condition is true it's a pretty serious error.
2138                we'll try and back out gracefully but we're in pretty
2139                deep at this point and it ain't going to be easy.
2140             */
2141   handle_err:
2142             if (err) {
2143                 for(cur=0; cur < num_needed; cur++) {
2144                     cache_ent *tmp_ce;
2145 
2146                     tmp_ce = (cache_ent *)hash_delete(&bc.ht,dev,bnum+cur);
2147                     if (tmp_ce != ents[cur]) {
2148                         beos_panic("hash_del0: %d %ld got %p, not %p\n",
2149                                 dev, bnum+cur, tmp_ce,
2150                                 ents[cur]);
2151                     }
2152 
2153                     tmp_ce = (cache_ent *)hash_delete(&bc.ht,ents[cur]->dev,
2154                                                         ents[cur]->block_num);
2155                     if (tmp_ce != ents[cur]) {
2156                         beos_panic("hash_del1: %d %ld got %p, not %p\n",
2157                                 ents[cur]->dev, ents[cur]->block_num, tmp_ce,
2158                                 ents[cur]);
2159                     }
2160 
2161                     ents[cur]->flags &= ~CE_BUSY;
2162                     if (ents[cur]->data)
2163                         free(ents[cur]->data);
2164                     free(ents[cur]);
2165                     ents[cur] = NULL;
2166 
2167                     bc.cur_blocks--;
2168                 }
2169 
2170                 if (cur < real_nblocks) {
2171                     LOCK(bc.lock);
2172                     for(; cur < real_nblocks; cur++) {
2173                         ents[cur]->flags &= ~CE_BUSY;
2174 
2175                         /* we have to put them back here */
2176                         add_to_tail(&bc.normal, ents[cur]);
2177                     }
2178                     UNLOCK(bc.lock);
2179                 }
2180 
2181                 return ENOMEM;
2182             }
2183 
2184 
2185             /*
2186                If we go into this if statement, the block cache lock
2187                has *already been released* up above when we flushed the
2188                dirty entries.  As always, since the blocks we're mucking
2189                with are marked busy, they shouldn't get messed with.
2190             */
2191             err = 0;
2192             if (num_dirty || (op & CACHE_READ)) {
2193                 /* this section performs the i/o that we need to do */
2194                 if (op & CACHE_READ) {
2195                     err = read_into_ents(dev, bnum, ents, num_needed, bsize);
2196                 } else {
2197                     err = 0;
2198                 }
2199 
2200                 if (err != 0) {
2201                     printf("err %s on dev %d block %" B_PRIdOFF ":%d (%d) "
2202                            "data %p, ents[0] %p\n",
2203                            strerror(errno), dev, bnum, cur_nblocks,
2204                            bsize, data, ents[0]);
2205                 }
2206 
2207                 /*
2208                    acquire the semaphore here so that we can go on mucking
2209                    with the cache data structures.  We need to delete old
2210                    block numbers from the hash table and set the new block
2211                    number's for the blocks we just read in.  We also put the
2212                    read-ahead blocks at the head of mru list.
2213                 */
2214 
2215                 LOCK(bc.lock);
2216             }
2217 
2218             for(cur=0; cur < num_needed; cur++) {
2219                 cache_ent *tmp_ce;
2220 
2221                 ce = ents[cur];
2222                 if (ce->dev != -1) {
2223                     tmp_ce = hash_delete(&bc.ht, ce->dev, ce->block_num);
2224                     if (tmp_ce == NULL || tmp_ce != ce) {
2225                         beos_panic("*** hash_delete failure (ce %p tce %p)\n",
2226                               ce, tmp_ce);
2227                     }
2228                 }
2229 
2230                 if (err == 0 && cur >= cur_nblocks) {
2231                     ce->dev       = dev;
2232                     ce->block_num = bnum + cur;
2233                     ce->flags    &= ~CE_BUSY;
2234                     add_to_head(&bc.normal, ce);
2235                 }
2236             }
2237             ce = NULL;
2238 
2239             /*
2240               clear the busy bit on the blocks we force-flushed and
2241               put them on the normal list since they're now clean.
2242             */
2243             for(; cur < real_nblocks; cur++) {
2244                 ents[cur]->flags &= ~CE_BUSY;
2245 
2246                 if (ents[cur]->lock)
2247                     beos_panic("should not have locked blocks here (ce %p)\n",
2248                           ents[cur]);
2249 
2250                 add_to_tail(&bc.normal, ents[cur]);
2251             }
2252 
2253             if (err) {   /* then we have some cleanup to do */
2254                 for(cur=0; cur < num_needed; cur++) {
2255                     cache_ent *tmp_ce;
2256 
2257                     /* we delete all blocks from the cache so we don't
2258                        leave partially written blocks in the cache */
2259 
2260                     tmp_ce = (cache_ent *)hash_delete(&bc.ht,dev,bnum+cur);
2261                     if (tmp_ce != ents[cur]) {
2262                         beos_panic("hash_del: %d %ld got %p, not %p\n",
2263                                 dev, bnum+cur, tmp_ce,
2264                                 ents[cur]);
2265                     }
2266 
2267                     ce = ents[cur];
2268                     ce->flags &= ~CE_BUSY;
2269 
2270                     free(ce->data);
2271                     ce->data = NULL;
2272 
2273                     free(ce);
2274                     ents[cur] = NULL;
2275 
2276                     bc.cur_blocks--;
2277                 }
2278                 ce = NULL;
2279 
2280                 UNLOCK(bc.lock);
2281                 return err;
2282             }
2283 
2284 
2285             /*
2286                last step: go through and make sure all the cache_ent
2287                structures have the right data in them, delete old guys, etc.
2288             */
2289             for(cur=0; cur < cur_nblocks; cur++) {
2290                 ce = ents[cur];
2291 
2292                 if (ce->dev != -1) {   /* then clean this guy up */
2293                     if (ce->next || ce->prev)
2294                         beos_panic("ce @ %p should not be in a list yet!\n", ce);
2295 
2296                     if (ce->clone)
2297                         free(ce->clone);
2298 
2299                     if (ce->data == NULL)
2300                         beos_panic("ce @ %p has a null data ptr\n", ce);
2301                 }
2302 
2303                 ce->dev        = dev;
2304                 ce->block_num  = bnum + cur;
2305                 ce->bsize      = bsize;
2306                 ce->flags      = CE_NORMAL;
2307                 ce->lock       = 0;
2308                 ce->clone      = NULL;
2309                 ce->func = ce->arg  = NULL;
2310                 ce->next = ce->prev = NULL;
2311 
2312                 if (op & CACHE_READ) {
2313                     if (data)
2314                         memcpy(data, ce->data, bsize);
2315                 } else if (op & CACHE_WRITE) {
2316                     ce->flags  |= CE_DIRTY;
2317                     memcpy(ce->data, data, bsize);
2318                 } else if (op & CACHE_NOOP) {
2319                     memset(ce->data, 0, bsize);
2320                     if (data)
2321                         memset(data, 0, bsize);
2322 
2323                     ce->flags |= CE_DIRTY;
2324                 }
2325 
2326                 if (op & CACHE_LOCKED) {
2327                     ce->lock++;
2328                     cel = &bc.locked;
2329                 } else {
2330                     cel = &bc.normal;
2331                 }
2332 
2333                 /* now stick this puppy at the head of the mru list */
2334                 add_to_head(cel, ce);
2335 
2336 
2337                 if (dataptr) {
2338                     *dataptr = ce->data;
2339                 }
2340 
2341                 if (data != NULL)
2342                     data = (void *)((char *)data + bsize);
2343                 else if (cur_nblocks != 1)
2344                     beos_panic("cache can't handle setting data_ptr twice!\n");
2345             }  /* end of for(cur=0; cur < cur_nblocks; cur++) */
2346 
2347             bnum       += cur_nblocks;
2348             num_blocks -= cur_nblocks;
2349 
2350         }   /* end of else it's not in the cache */
2351 
2352     }   /* end of while(num_blocks) */
2353 
2354     UNLOCK(bc.lock);
2355 
2356     return 0;
2357 }
2358 
2359 
2360 void *
2361 beos_get_block(int dev, fs_off_t bnum, int bsize)
2362 {
2363     void *data;
2364 
2365     if (cache_block_io(dev, bnum, NULL, 1, bsize, CACHE_READ|CACHE_LOCKED|CACHE_READ_AHEAD_OK,
2366                        &data) != 0)
2367         return NULL;
2368 
2369     return data;
2370 }
2371 
2372 void *
2373 beos_get_empty_block(int dev, fs_off_t bnum, int bsize)
2374 {
2375     void *data;
2376 
2377     if (cache_block_io(dev, bnum, NULL, 1, bsize, CACHE_NOOP|CACHE_LOCKED,
2378                        &data) != 0)
2379         return NULL;
2380 
2381     return data;
2382 }
2383 
2384 int
2385 beos_cached_read(int dev, fs_off_t bnum, void *data, fs_off_t num_blocks, int bsize)
2386 {
2387     return cache_block_io(dev, bnum, data, num_blocks, bsize,
2388                           CACHE_READ | CACHE_READ_AHEAD_OK, NULL);
2389 }
2390 
2391 
2392 int
2393 beos_cached_write(int dev, fs_off_t bnum, const void *data, fs_off_t num_blocks,int bsize)
2394 {
2395     return cache_block_io(dev, bnum, (void *)data, num_blocks, bsize,
2396                           CACHE_WRITE, NULL);
2397 }
2398 
2399 int
2400 beos_cached_write_locked(int dev, fs_off_t bnum, const void *data,
2401                     fs_off_t num_blocks, int bsize)
2402 {
2403     return cache_block_io(dev, bnum, (void *)data, num_blocks, bsize,
2404                           CACHE_WRITE | CACHE_LOCKED, NULL);
2405 }
2406 
2407 
2408 void
2409 beos_force_cache_flush(int dev, int prefer_log_blocks)
2410 {
2411     int        i, count = 0;
2412     cache_ent *ce;
2413     cache_ent *ents[NUM_FLUSH_BLOCKS];
2414 
2415 
2416     LOCK(bc.lock);
2417 
2418     for(ce=bc.normal.lru; ce; ce=ce->next) {
2419         if ((ce->dev == dev) &&
2420             (ce->flags & CE_BUSY) == 0 &&
2421             ((ce->flags & CE_DIRTY) || ce->clone) &&
2422             ((prefer_log_blocks && ce->func) || (prefer_log_blocks == 0))) {
2423 
2424             ce->flags |= CE_BUSY;
2425             ents[count++] = ce;
2426 
2427             if (count >= NUM_FLUSH_BLOCKS) {
2428                 break;
2429             }
2430         }
2431     }
2432 
2433     /* if we've got some room left, try and grab any cloned blocks */
2434     if (count < NUM_FLUSH_BLOCKS) {
2435         for(ce=bc.locked.lru; ce; ce=ce->next) {
2436             if ((ce->dev == dev) &&
2437                 (ce->flags & CE_BUSY) == 0 &&
2438                 (ce->clone)) {
2439 
2440                 ce->flags |= CE_BUSY;
2441                 ents[count++] = ce;
2442 
2443                 if (count >= NUM_FLUSH_BLOCKS) {
2444                     break;
2445                 }
2446             }
2447         }
2448     }
2449 
2450     UNLOCK(bc.lock);
2451 
2452     if (count != 0) {
2453         qsort(ents, count, sizeof(cache_ent **), cache_ent_cmp);
2454         flush_ents(ents, count);
2455 
2456         for(i=0; i < count; i++)
2457             ents[i]->flags &= ~CE_BUSY;
2458     }
2459 }
2460