/*
 * Copyright 2019-2022, Haiku, Inc. All rights reserved.
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 *		Augustin Cavalier <waddlesplash>
 */


#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <condition_variable.h>
#include <AutoDeleter.h>
#include <kernel.h>
#include <smp.h>
#include <util/AutoLock.h>

#include <fs/devfs.h>
#include <bus/PCI.h>
#include <vm/vm.h>

#include "IORequest.h"

extern "C" {
#include <libnvme/nvme.h>
#include <libnvme/nvme_internal.h>
}


//#define TRACE_NVME_DISK
#ifdef TRACE_NVME_DISK
#	define TRACE(x...) dprintf("nvme_disk: " x)
#else
#	define TRACE(x...) ;
#endif
#define TRACE_ALWAYS(x...)	dprintf("nvme_disk: " x)
#define TRACE_ERROR(x...)	dprintf("\33[33mnvme_disk:\33[0m " x)
#define CALLED()			TRACE("CALLED %s\n", __PRETTY_FUNCTION__)


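// Raw HVIF ("ncif") vector icon data for the drive, returned by the
// B_GET_VECTOR_ICON ioctl below.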
static const uint8 kDriveIcon[] = {
	0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16,
	0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39,
	0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02,
	0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01,
	0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47,
	0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f,
	0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0,
	0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38,
	0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48,
	0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2,
	0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80,
	0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a,
	0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39,
	0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a,
	0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27,
	0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a,
	0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08,
	0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17,
	0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02,
	0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01,
	0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99,
	0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2,
	0xe6, 0x01, 0x17, 0x82, 0x00, 0x04
};


#define NVME_DISK_DRIVER_MODULE_NAME "drivers/disk/nvme_disk/driver_v1"
#define NVME_DISK_DEVICE_MODULE_NAME "drivers/disk/nvme_disk/device_v1"
#define NVME_DISK_DEVICE_ID_GENERATOR "nvme_disk/device_id"

#define NVME_MAX_QPAIRS (16)


static device_manager_info* sDeviceManager;

typedef struct {
	device_node* node;
	pci_info info;

	struct nvme_ctrlr* ctrlr;

	struct nvme_ns* ns;
	uint64 capacity;
	uint32 block_size;
	uint32 max_io_blocks;
	status_t media_status;

	DMAResource dma_resource;
	sem_id dma_buffers_sem;

	rw_lock rounded_write_lock;

	ConditionVariable interrupt;
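	// 0 while interrupts are in use; 1 if we started in polling mode (no
	// usable IRQ) or switched to it after timeouts in await_status(); the
	// interrupt handler sets this to -1 to record that interrupts arrive.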
	int32 polling;

	struct qpair_info {
		struct nvme_qpair* qpair;
	} qpairs[NVME_MAX_QPAIRS];
	uint32 qpair_count;
} nvme_disk_driver_info;
typedef nvme_disk_driver_info::qpair_info qpair_info;


typedef struct {
	nvme_disk_driver_info* info;
} nvme_disk_handle;


static status_t
get_geometry(nvme_disk_handle* handle, device_geometry* geometry)
{
	nvme_disk_driver_info* info = handle->info;

	devfs_compute_geometry_size(geometry, info->capacity, info->block_size);
	geometry->bytes_per_physical_sector = info->block_size;

	geometry->device_type = B_DISK;
	geometry->removable = false;

	geometry->read_only = false;
	geometry->write_once = false;

	TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %d, %d, %d, %d\n",
		geometry->bytes_per_sector, geometry->sectors_per_track,
		geometry->cylinder_count, geometry->head_count, geometry->device_type,
		geometry->removable, geometry->read_only, geometry->write_once);

	return B_OK;
}


static void
nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity,
	uint32 blockSize)
{
	TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %" B_PRIu32 ")\n",
		info, capacity, blockSize);

	info->capacity = capacity;
	info->block_size = blockSize;
}


// #pragma mark - device module API


static int32 nvme_interrupt_handler(void* _info);


static status_t
nvme_disk_init_device(void* _info, void** _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	ASSERT(info->ctrlr == NULL);

	pci_device_module_info* pci;
	pci_device* pcidev;
	device_node* parent = sDeviceManager->get_parent_node(info->node);
	sDeviceManager->get_driver(parent, (driver_module_info**)&pci,
		(void**)&pcidev);
	pci->get_pci_info(pcidev, &info->info);
	sDeviceManager->put_node(parent);

	// construct the libnvme pci_device struct
	pci_device* device = new pci_device;
	device->vendor_id = info->info.vendor_id;
	device->device_id = info->info.device_id;
	device->subvendor_id = 0;
	device->subdevice_id = 0;

	device->domain = 0;
	device->bus = info->info.bus;
	device->dev = info->info.device;
	device->func = info->info.function;

	device->pci_info = &info->info;

	// enable busmaster and memory mapped access
	uint16 command = pci->read_pci_config(pcidev, PCI_command, 2);
	command |= PCI_command_master | PCI_command_memory;
	pci->write_pci_config(pcidev, PCI_command, 2, command);

	// open the controller
	info->ctrlr = nvme_ctrlr_open(device, NULL);
	if (info->ctrlr == NULL) {
		TRACE_ERROR("failed to open the controller!\n");
		return B_ERROR;
	}

	struct nvme_ctrlr_stat cstat;
	int err = nvme_ctrlr_stat(info->ctrlr, &cstat);
	if (err != 0) {
		TRACE_ERROR("failed to get controller information!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn);
	TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size);
	TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs);

	// TODO: export more than just the first namespace!
	info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]);
	if (info->ns == NULL) {
		TRACE_ERROR("failed to open namespace!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_ERROR;
	}
	TRACE_ALWAYS("namespace 0\n");

	struct nvme_ns_stat nsstat;
	err = nvme_ns_stat(info->ns, &nsstat);
	if (err != 0) {
		TRACE_ERROR("failed to get namespace information!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	// store capacity information
	TRACE_ALWAYS("\tblock size: %" B_PRIuSIZE ", stripe size: %u\n",
		nsstat.sector_size, info->ns->stripe_size);
	nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size);

	command = pci->read_pci_config(pcidev, PCI_command, 2);
	command &= ~(PCI_command_int_disable);
	pci->write_pci_config(pcidev, PCI_command, 2, command);

	uint32 irq = info->info.u.h0.interrupt_line;
	if (irq == 0xFF)
		irq = 0;

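	// Prefer MSI-X, then MSI, then fall back to the legacy interrupt line.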
	if (pci->get_msix_count(pcidev)) {
		uint32 msixVector = 0;
		if (pci->configure_msix(pcidev, 1, &msixVector) == B_OK
			&& pci->enable_msix(pcidev) == B_OK) {
			TRACE_ALWAYS("using MSI-X\n");
			irq = msixVector;
		}
	} else if (pci->get_msi_count(pcidev) >= 1) {
		uint32 msiVector = 0;
		if (pci->configure_msi(pcidev, 1, &msiVector) == B_OK
			&& pci->enable_msi(pcidev) == B_OK) {
			TRACE_ALWAYS("using message signaled interrupts\n");
			irq = msiVector;
		}
	}

	if (irq == 0) {
		TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n",
			info->info.bus, info->info.device, info->info.function);
		info->polling = 1;
	} else {
		info->polling = 0;
	}
	info->interrupt.Init(NULL, NULL);
	install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info, B_NO_HANDLED_INFO);

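	// If supported, enable interrupt coalescing to reduce interrupt load.
	// Per the NVMe spec, dword11 of this Set Features command packs the
	// aggregation time (bits 15:8, in 100-microsecond units) and the
	// aggregation threshold (bits 7:0, in completion-queue entries).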
	if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) {
		uint32 microseconds = 16, threshold = 32;
		nvme_admin_set_feature(info->ctrlr, false, NVME_FEAT_INTERRUPT_COALESCING,
			((microseconds / 100) << 8) | threshold, 0, NULL);
	}

	// allocate qpairs
	uint32 try_qpairs = cstat.io_qpairs;
	try_qpairs = min_c(try_qpairs, NVME_MAX_QPAIRS);
	if (try_qpairs >= (uint32)smp_get_num_cpus()) {
		try_qpairs = smp_get_num_cpus();
	} else {
		// Find the highest number of qpairs that evenly divides the number of CPUs.
		while ((smp_get_num_cpus() % try_qpairs) != 0)
			try_qpairs--;
	}
	info->qpair_count = 0;
	for (uint32 i = 0; i < try_qpairs; i++) {
		info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr,
			(enum nvme_qprio)0, 0);
		if (info->qpairs[i].qpair == NULL)
			break;

		info->qpair_count++;
	}
	if (info->qpair_count == 0) {
		TRACE_ERROR("failed to allocate qpairs!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_NO_MEMORY;
	}
	if (info->qpair_count != try_qpairs) {
		TRACE_ALWAYS("warning: did not get expected number of qpairs\n");
	}

	// allocate DMA buffers
	int buffers = info->qpair_count * 2;

	dma_restrictions restrictions = {};
	restrictions.alignment = B_PAGE_SIZE;
		// Technically, the first and last segments in a transfer can be aligned
		// only on 32-bits, and the rest only need to have sizes that are a multiple
		// of the block size.
	restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2);
	restrictions.max_transfer_size = cstat.max_xfer_size;
	info->max_io_blocks = cstat.max_xfer_size / nsstat.sector_size;

	err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers);
	if (err != 0) {
		TRACE_ERROR("failed to initialize DMA resource!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem");
	if (info->dma_buffers_sem < 0) {
		TRACE_ERROR("failed to create DMA buffers semaphore!\n");
		nvme_ctrlr_close(info->ctrlr);
		return info->dma_buffers_sem;
	}

	// set up rounded-write lock
	rw_lock_init(&info->rounded_write_lock, "nvme rounded writes");

	*_cookie = info;
	return B_OK;
}


static void
nvme_disk_uninit_device(void* _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;

	remove_io_interrupt_handler(info->info.u.h0.interrupt_line,
		nvme_interrupt_handler, (void*)info);

	rw_lock_destroy(&info->rounded_write_lock);

	nvme_ns_close(info->ns);
	nvme_ctrlr_close(info->ctrlr);

	// TODO: Deallocate MSI(-X).
	// TODO: Deallocate PCI.
}


static status_t
nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	nvme_disk_handle* handle = (nvme_disk_handle*)malloc(
		sizeof(nvme_disk_handle));
	if (handle == NULL)
		return B_NO_MEMORY;

	handle->info = info;

	*_cookie = handle;
	return B_OK;
}


static status_t
nvme_disk_close(void* cookie)
{
	CALLED();

	//nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	return B_OK;
}


static status_t
nvme_disk_free(void* cookie)
{
	CALLED();

	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	free(handle);
	return B_OK;
}


// #pragma mark - I/O


static int32
nvme_interrupt_handler(void* _info)
{
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	info->interrupt.NotifyAll();
	info->polling = -1;
	return 0;
}


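// Spread I/O across the allocated queue pairs by current CPU. Since the
// qpair count evenly divides the CPU count (see init), each qpair serves a
// fixed subset of CPUs.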
static qpair_info*
get_qpair(nvme_disk_driver_info* info)
{
	return &info->qpairs[smp_get_current_cpu() % info->qpair_count];
}


static void
io_finished_callback(status_t* status, const struct nvme_cpl* cpl)
{
	*status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK;
}


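// Poll the qpair until `status` (set to EINPROGRESS before submission and
// updated by io_finished_callback) changes, sleeping on the shared interrupt
// condition variable in between. After repeated interrupt timeouts this
// switches the driver into polling mode, and eventually fails the qpair.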
static void
await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status)
{
	CALLED();

	ConditionVariableEntry entry;
	int timeouts = 0;
	while (status == EINPROGRESS) {
		info->interrupt.Add(&entry);

		nvme_qpair_poll(qpair, 0);

		if (status != EINPROGRESS)
			return;

		if (info->polling > 0) {
			entry.Wait(B_RELATIVE_TIMEOUT, min_c(5 * 1000 * 1000,
				(1 << timeouts) * 1000));
			timeouts++;
		} else if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) {
			// This should never happen, as we are woken up on every interrupt
			// no matter the qpair or transfer within; so if it does occur,
			// that probably means the controller stalled, or maybe cannot
			// generate interrupts at all.

			TRACE_ERROR("timed out waiting for interrupt!\n");
			if (timeouts++ >= 3) {
				nvme_qpair_fail(qpair);
				status = B_TIMED_OUT;
				return;
			}

			info->polling++;
			if (info->polling > 0) {
				TRACE_ALWAYS("switching to polling mode, performance will be affected!\n");
			}
		}

		nvme_qpair_poll(qpair, 0);
	}
}


struct nvme_io_request {
	status_t status;

	bool write;

	off_t lba_start;
	size_t lba_count;

	physical_entry* iovecs;
	int32 iovec_count;

	int32 iovec_i;
	uint32 iovec_offset;
};


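// Scatter/gather callbacks for libnvme: it first calls the reset callback
// with a byte offset into the transfer, then calls the next-SGE callback
// repeatedly to fetch one physical segment at a time.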
static void
ior_reset_sgl(nvme_io_request* request, uint32_t offset)
{
	TRACE("IOR Reset: %" B_PRIu32 "\n", offset);

	int32 i = 0;
	while (offset > 0 && request->iovecs[i].size <= offset) {
		offset -= request->iovecs[i].size;
		i++;
	}
	request->iovec_i = i;
	request->iovec_offset = offset;
}


static int
ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length)
{
	int32 index = request->iovec_i;
	if (index < 0 || index >= request->iovec_count)
		return -1;

	*address = request->iovecs[index].address + request->iovec_offset;
	*length = request->iovecs[index].size - request->iovec_offset;

	TRACE("IOV %" B_PRId32 " (+ %" B_PRIu32 "): 0x%" B_PRIx64 ", %" B_PRIu32 "\n",
		request->iovec_i, request->iovec_offset, *address, *length);

	request->iovec_i++;
	request->iovec_offset = 0;
	return 0;
}


static status_t
do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request)
{
	request->status = EINPROGRESS;

	qpair_info* qpinfo = get_qpair(info);
	int ret = -1;
	if (request->write) {
		ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	} else {
		ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	}
	if (ret != 0) {
		TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
		return ret;
	}

	await_status(info, qpinfo->qpair, request->status);

	if (request->status != B_OK) {
		TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
	}
	return request->status;
}


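// Perform the request using the DMAResource's bounce buffers. Writes take
// the rounded-write lock exclusively, so that rounded (read-modify-write)
// writes cannot overlap with the unbounced writes in nvme_disk_io(), which
// hold the same lock shared.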
static status_t
nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request)
{
	CALLED();

	WriteLocker writeLocker;
	if (request->IsWrite())
		writeLocker.SetTo(handle->info->rounded_write_lock, false);

	status_t status = acquire_sem(handle->info->dma_buffers_sem);
	if (status != B_OK) {
		request->SetStatusAndNotify(status);
		return status;
	}

	const size_t block_size = handle->info->block_size;

	TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR
		"; Write %s\n", request, request->Offset(), request->Length(),
		request->IsWrite() ? "yes" : "no");

	nvme_io_request nvme_request;
	while (request->RemainingBytes() > 0) {
		IOOperation operation;
		status = handle->info->dma_resource.TranslateNext(request, &operation, 0);
		if (status != B_OK)
			break;

		do {
			TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR
				", write: %s\n", request, operation.Offset(),
				operation.Length(), operation.IsWrite() ? "yes" : "no");

			nvme_request.write = operation.IsWrite();
			nvme_request.lba_start = operation.Offset() / block_size;
			nvme_request.lba_count = operation.Length() / block_size;
			nvme_request.iovecs = (physical_entry*)operation.Vecs();
			nvme_request.iovec_count = operation.VecCount();

			status = do_nvme_io_request(handle->info, &nvme_request);

			operation.SetStatus(status,
				status == B_OK ? operation.Length() : 0);
		} while (status == B_OK && !operation.Finish());

		if (status == B_OK && operation.Status() != B_OK) {
			TRACE_ERROR("I/O succeeded but IOOperation failed!\n");
			status = operation.Status();
		}

		request->OperationFinished(&operation);

		handle->info->dma_resource.RecycleBuffer(operation.Buffer());

		TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request,
			strerror(status), request->RemainingBytes());
		if (status != B_OK)
			break;
	}

	release_sem(handle->info->dma_buffers_sem);

	// Notify() also takes care of UnlockMemory().
	if (status != B_OK && request->Status() == B_OK)
		request->SetStatusAndNotify(status);
	else
		request->NotifyFinished();
	return status;
}


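// Main I/O entry point: perform the request directly from the caller's
// buffer when every physical segment satisfies the controller's alignment
// and size rules; otherwise fall back to nvme_disk_bounced_io().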
static status_t
nvme_disk_io(void* cookie, io_request* request)
{
	CALLED();

	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	const off_t ns_end = (handle->info->capacity * handle->info->block_size);
	if ((request->Offset() + (off_t)request->Length()) > ns_end)
		return ERANGE;

	nvme_io_request nvme_request;
	memset(&nvme_request, 0, sizeof(nvme_io_request));

	nvme_request.write = request->IsWrite();

	physical_entry* vtophys = NULL;
	MemoryDeleter vtophysDeleter;

	IOBuffer* buffer = request->Buffer();
	status_t status = B_OK;
	if (!buffer->IsPhysical()) {
		status = buffer->LockMemory(request->TeamID(), request->IsWrite());
		if (status != B_OK) {
			TRACE_ERROR("failed to lock memory: %s\n", strerror(status));
			return status;
		}
		// SetStatusAndNotify() takes care of unlocking memory if necessary.

		const int32 vtophysLength = (request->Length() / B_PAGE_SIZE) + 2;
		if (vtophysLength <= 8) {
			vtophys = (physical_entry*)alloca(sizeof(physical_entry) * vtophysLength);
		} else {
			vtophys = (physical_entry*)malloc(sizeof(physical_entry) * vtophysLength);
			vtophysDeleter.SetTo(vtophys);
		}
		if (vtophys == NULL) {
			TRACE_ERROR("failed to allocate memory for iovecs\n");
			request->SetStatusAndNotify(B_NO_MEMORY);
			return B_NO_MEMORY;
		}

		for (size_t i = 0; i < buffer->VecCount(); i++) {
			generic_io_vec virt = buffer->VecAt(i);
			uint32 entries = vtophysLength - nvme_request.iovec_count;

			// Avoid copies by going straight into the vtophys array.
			status = get_memory_map_etc(request->TeamID(), (void*)virt.base,
				virt.length, vtophys + nvme_request.iovec_count, &entries);

			if (status == B_BAD_VALUE && entries == 0)
				status = B_BUFFER_OVERFLOW;
			if (status == B_BUFFER_OVERFLOW) {
				// Too many physical_entries to use unbounced I/O.
				vtophysDeleter.Delete();
				vtophys = NULL;
				break;
			}
			if (status != B_OK) {
				TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status));
				request->SetStatusAndNotify(status);
				return status;
			}

			nvme_request.iovec_count += entries;
		}

		nvme_request.iovecs = vtophys;
	} else {
		nvme_request.iovecs = (physical_entry*)buffer->Vecs();
		nvme_request.iovec_count = buffer->VecCount();
	}

	// See if we need to bounce anything other than the first or last vec.
	const size_t block_size = handle->info->block_size;
	bool bounceAll = (nvme_request.iovecs == NULL);
	for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) {
		if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0)
			bounceAll = true;
		if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0)
			bounceAll = true;
	}

	// See if we need to bounce due to the first or last vecs.
	if (nvme_request.iovec_count > 1) {
		// There are middle vecs, so the first and last vecs have different restrictions: they
		// need only be a multiple of the block size, and must end and start on a page boundary,
		// respectively, though the start address must always be 32-bit-aligned.
		physical_entry* entry = &nvme_request.iovecs[0];
		if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0
				|| (entry->address & 0x3) != 0 || (entry->size % block_size) != 0))
			bounceAll = true;

		entry = &nvme_request.iovecs[nvme_request.iovec_count - 1];
		if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0
				|| (entry->size % block_size) != 0))
			bounceAll = true;
	} else {
		// There is only one vec. Check that it is a multiple of the block size,
		// and that its address is 32-bit-aligned.
		physical_entry* entry = &nvme_request.iovecs[0];
		if (!bounceAll && ((entry->address & 0x3) != 0 || (entry->size % block_size) != 0))
			bounceAll = true;
	}

	// See if we need to bounce due to rounding.
	const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size);
	const phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset()
		- rounded_pos), block_size);
	if (rounded_pos != request->Offset() || rounded_len != request->Length())
		bounceAll = true;

	if (bounceAll) {
		// Let the bounced I/O routine take care of everything from here.
		return nvme_disk_bounced_io(handle, request);
	}

	// No bouncing was required.
	ReadLocker readLocker;
	if (nvme_request.write)
		readLocker.SetTo(handle->info->rounded_write_lock, false);

	// Error check before actually doing I/O.
	if (status != B_OK) {
		TRACE_ERROR("I/O failed early: %s\n", strerror(status));
		request->SetStatusAndNotify(status);
		return status;
	}

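	// Submit the transfer in chunks: each chunk is capped both by half the
	// SGL descriptor limit and by the controller's maximum transfer size
	// (max_io_blocks).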
	const uint32 max_io_blocks = handle->info->max_io_blocks;
	int32 remaining = nvme_request.iovec_count;
	nvme_request.lba_start = rounded_pos / block_size;
	while (remaining > 0) {
		nvme_request.iovec_count = min_c(remaining,
			NVME_MAX_SGL_DESCRIPTORS / 2);

		nvme_request.lba_count = 0;
		for (int i = 0; i < nvme_request.iovec_count; i++) {
			uint32 new_lba_count = nvme_request.lba_count
				+ (nvme_request.iovecs[i].size / block_size);
			if (nvme_request.lba_count > 0 && new_lba_count > max_io_blocks) {
				// We already have a nonzero length, and adding this vec would
				// make us go over (or we already are over.) Stop adding.
				nvme_request.iovec_count = i;
				break;
			}

			nvme_request.lba_count = new_lba_count;
		}

		status = do_nvme_io_request(handle->info, &nvme_request);
		if (status != B_OK)
			break;

		nvme_request.iovecs += nvme_request.iovec_count;
		remaining -= nvme_request.iovec_count;
		nvme_request.lba_start += nvme_request.lba_count;
	}

	if (status != B_OK)
		TRACE_ERROR("I/O failed: %s\n", strerror(status));

	readLocker.Unlock();

	request->SetTransferredBytes(status != B_OK,
		(nvme_request.lba_start * block_size) - rounded_pos);
	request->SetStatusAndNotify(status);
	return status;
}


static status_t
nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	const off_t ns_end = (handle->info->capacity * handle->info->block_size);
	if (pos >= ns_end)
		return B_BAD_VALUE;
	if ((pos + (off_t)*length) > ns_end)
		*length = ns_end - pos;

	IORequest request;
	status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0);
	if (status != B_OK)
		return status;

	status = nvme_disk_io(handle, &request);
	*length = request.TransferredBytes();
	return status;
}


static status_t
nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	const off_t ns_end = (handle->info->capacity * handle->info->block_size);
	if (pos >= ns_end)
		return B_BAD_VALUE;
	if ((pos + (off_t)*length) > ns_end)
		*length = ns_end - pos;

	IORequest request;
	status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0);
	if (status != B_OK)
		return status;

	status = nvme_disk_io(handle, &request);
	*length = request.TransferredBytes();
	return status;
}


static status_t
nvme_disk_flush(nvme_disk_driver_info* info)
{
	CALLED();
	status_t status = EINPROGRESS;

	qpair_info* qpinfo = get_qpair(info);
	int ret = nvme_ns_flush(info->ns, qpinfo->qpair,
		(nvme_cmd_cb)io_finished_callback, &status);
	if (ret != 0)
		return ret;

	await_status(info, qpinfo->qpair, status);
	return status;
}


static status_t
nvme_disk_trim(nvme_disk_driver_info* info, fs_trim_data* trimData)
{
	CALLED();
	trimData->trimmed_size = 0;

	const off_t deviceSize = info->capacity * info->block_size; // in bytes
	if (deviceSize < 0)
		return B_BAD_VALUE;

	STATIC_ASSERT(sizeof(deviceSize) <= sizeof(uint64));
	ASSERT(deviceSize >= 0);

	// Do not trim past device end.
	for (uint32 i = 0; i < trimData->range_count; i++) {
		uint64 offset = trimData->ranges[i].offset;
		uint64& size = trimData->ranges[i].size;

		if (offset >= (uint64)deviceSize)
			return B_BAD_VALUE;
		size = std::min(size, (uint64)deviceSize - offset);
	}

	// We need contiguous memory for the DSM ranges.
	nvme_dsm_range* dsmRanges = (nvme_dsm_range*)nvme_mem_alloc_node(
		trimData->range_count * sizeof(nvme_dsm_range), 0, 0, NULL);
	if (dsmRanges == NULL)
		return B_NO_MEMORY;
	CObjectDeleter<void, void, nvme_free> dsmRangesDeleter(dsmRanges);

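	// Build the Dataset Management range list. Each entry is expressed in
	// LBAs, and the DSM command caps a range's length at 32 bits. Ranges
	// that round down to nothing are skipped, so the number of entries
	// actually filled in is tracked separately.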
	uint64 trimmingSize = 0;
	uint32 dsmRangeCount = 0;
	for (uint32 i = 0; i < trimData->range_count; i++) {
		uint64 offset = trimData->ranges[i].offset;
		uint64 length = trimData->ranges[i].size;

		// Round the offset up and the length down to the block size.
		// (Some space at the beginning and end may thus not be trimmed.)
		offset = ROUNDUP(offset, info->block_size);
		length -= offset - trimData->ranges[i].offset;
		length = ROUNDDOWN(length, info->block_size);

		if (length == 0)
			continue;
		if ((length / info->block_size) > UINT32_MAX)
			length = uint64(UINT32_MAX) * info->block_size;
			// TODO: Break into smaller trim ranges!

		TRACE("trim %" B_PRIu64 " bytes from %" B_PRIu64 "\n", length, offset);

		dsmRanges[dsmRangeCount].attributes = 0;
		dsmRanges[dsmRangeCount].length = length / info->block_size;
		dsmRanges[dsmRangeCount].starting_lba = offset / info->block_size;
		dsmRangeCount++;

		trimmingSize += length;
	}
	if (dsmRangeCount == 0)
		return B_OK;

	status_t status = EINPROGRESS;
	qpair_info* qpair = get_qpair(info);
	if (nvme_ns_deallocate(info->ns, qpair->qpair, dsmRanges, dsmRangeCount,
			(nvme_cmd_cb)io_finished_callback, &status) != 0)
		return B_IO_ERROR;

	await_status(info, qpair->qpair, status);
	if (status != B_OK)
		return status;

	trimData->trimmed_size = trimmingSize;
	return B_OK;
}


static status_t
nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	nvme_disk_driver_info* info = handle->info;

	TRACE("ioctl(op = %" B_PRId32 ")\n", op);

	switch (op) {
		case B_GET_MEDIA_STATUS:
		{
			return user_memcpy(buffer, &info->media_status, sizeof(status_t));
		}

		case B_GET_DEVICE_SIZE:
		{
			size_t size = info->capacity * info->block_size;
			return user_memcpy(buffer, &size, sizeof(size_t));
		}

		case B_GET_GEOMETRY:
		{
			if (buffer == NULL || length > sizeof(device_geometry))
				return B_BAD_VALUE;

			device_geometry geometry;
			status_t status = get_geometry(handle, &geometry);
			if (status != B_OK)
				return status;

			return user_memcpy(buffer, &geometry, length);
		}

		case B_GET_ICON_NAME:
			return user_strlcpy((char*)buffer, "devices/drive-harddisk",
				B_FILE_NAME_LENGTH);

		case B_GET_VECTOR_ICON:
		{
			device_icon iconData;
			if (length != sizeof(device_icon))
				return B_BAD_VALUE;
			if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK)
				return B_BAD_ADDRESS;

			if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) {
				if (user_memcpy(iconData.icon_data, kDriveIcon,
						sizeof(kDriveIcon)) != B_OK)
					return B_BAD_ADDRESS;
			}

			iconData.icon_size = sizeof(kDriveIcon);
			return user_memcpy(buffer, &iconData, sizeof(device_icon));
		}

		case B_FLUSH_DRIVE_CACHE:
			return nvme_disk_flush(info);

		case B_TRIM_DEVICE:
			ASSERT(IS_KERNEL_ADDRESS(buffer));
			return nvme_disk_trim(info, (fs_trim_data*)buffer);
	}

	return B_DEV_INVALID_IOCTL;
}


// #pragma mark - driver module API


static float
nvme_disk_supports_device(device_node* parent)
{
	CALLED();

	const char* bus;
	uint16 baseClass, subClass;

	if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false) != B_OK
		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass, false) != B_OK
		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE, &subClass, false) != B_OK)
		return -1.0f;

	if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage)
		return 0.0f;

	if (subClass != PCI_nvm)
		return 0.0f;

	TRACE("NVMe device found!\n");
	return 1.0f;
}


static status_t
nvme_disk_register_device(device_node* parent)
{
	CALLED();

	device_attr attrs[] = {
		{ B_DEVICE_PRETTY_NAME, B_STRING_TYPE, { .string = "NVMe Disk" } },
		{ NULL }
	};

	return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME,
		attrs, NULL, NULL);
}


static status_t
nvme_disk_init_driver(device_node* node, void** cookie)
{
	CALLED();

	int ret = nvme_lib_init((enum nvme_log_level)0, (enum nvme_log_facility)0, NULL);
	if (ret != 0) {
		TRACE_ERROR("libnvme initialization failed!\n");
		return ret;
	}

	nvme_disk_driver_info* info = new nvme_disk_driver_info;
	if (info == NULL)
		return B_NO_MEMORY;

	info->media_status = B_OK;
	info->node = node;

	info->ctrlr = NULL;

	*cookie = info;
	return B_OK;
}


static void
nvme_disk_uninit_driver(void* _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
	delete info;
		// allocated with new in init_driver, so it must be deleted, not freed
}


static status_t
nvme_disk_register_child_devices(void* _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
	status_t status;

	int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR);
	if (id < 0)
		return id;

	char name[64];
	snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw",
		id);

	status = sDeviceManager->publish_device(info->node, name,
		NVME_DISK_DEVICE_MODULE_NAME);

	return status;
}


// #pragma mark -


module_dependency module_dependencies[] = {
	{ B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager },
	{ NULL }
};

struct device_module_info sNvmeDiskDevice = {
	{
		NVME_DISK_DEVICE_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_init_device,
	nvme_disk_uninit_device,
	NULL, // remove,

	nvme_disk_open,
	nvme_disk_close,
	nvme_disk_free,
	nvme_disk_read,
	nvme_disk_write,
	nvme_disk_io,
	nvme_disk_ioctl,

	NULL, // select
	NULL, // deselect
};

struct driver_module_info sNvmeDiskDriver = {
	{
		NVME_DISK_DRIVER_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_supports_device,
	nvme_disk_register_device,
	nvme_disk_init_driver,
	nvme_disk_uninit_driver,
	nvme_disk_register_child_devices,
	NULL, // rescan
	NULL, // removed
};

module_info* modules[] = {
	(module_info*)&sNvmeDiskDriver,
	(module_info*)&sNvmeDiskDevice,
	NULL
};