1 /* 2 * Copyright 2019-2022, Haiku, Inc. All rights reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Augustin Cavalier <waddlesplash> 7 */ 8 9 10 #include <stdio.h> 11 #include <stdlib.h> 12 13 #include <algorithm> 14 #include <condition_variable.h> 15 #include <AutoDeleter.h> 16 #include <kernel.h> 17 #include <smp.h> 18 #include <util/AutoLock.h> 19 20 #include <fs/devfs.h> 21 #include <bus/PCI.h> 22 #include <PCI_x86.h> 23 #include <vm/vm.h> 24 25 #include "IORequest.h" 26 27 extern "C" { 28 #include <libnvme/nvme.h> 29 #include <libnvme/nvme_internal.h> 30 } 31 32 33 //#define TRACE_NVME_DISK 34 #ifdef TRACE_NVME_DISK 35 # define TRACE(x...) dprintf("nvme_disk: " x) 36 #else 37 # define TRACE(x...) ; 38 #endif 39 #define TRACE_ALWAYS(x...) dprintf("nvme_disk: " x) 40 #define TRACE_ERROR(x...) dprintf("\33[33mnvme_disk:\33[0m " x) 41 #define CALLED() TRACE("CALLED %s\n", __PRETTY_FUNCTION__) 42 43 44 static const uint8 kDriveIcon[] = { 45 0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16, 46 0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39, 47 0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02, 48 0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01, 49 0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47, 50 0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f, 51 0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0, 52 0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38, 53 0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48, 54 0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2, 55 0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80, 56 0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 57 0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39, 58 0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a, 59 0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27, 60 0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a, 61 0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08, 62 0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17, 63 0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02, 64 0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01, 65 0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99, 66 0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2, 67 0xe6, 0x01, 0x17, 0x82, 0x00, 0x04 68 }; 69 70 71 #define NVME_DISK_DRIVER_MODULE_NAME "drivers/disk/nvme_disk/driver_v1" 72 #define NVME_DISK_DEVICE_MODULE_NAME "drivers/disk/nvme_disk/device_v1" 73 #define NVME_DISK_DEVICE_ID_GENERATOR "nvme_disk/device_id" 74 75 #define NVME_MAX_QPAIRS (16) 76 77 78 static device_manager_info* sDeviceManager; 79 static pci_x86_module_info* sPCIx86Module; 80 81 typedef struct { 82 device_node* node; 83 pci_info info; 84 85 struct nvme_ctrlr* ctrlr; 86 87 struct nvme_ns* ns; 88 uint64 capacity; 89 uint32 block_size; 90 uint32 max_io_blocks; 91 status_t media_status; 92 93 DMAResource dma_resource; 94 sem_id dma_buffers_sem; 95 96 rw_lock rounded_write_lock; 97 98 ConditionVariable interrupt; 99 int32 polling; 100 101 struct qpair_info { 102 struct nvme_qpair* qpair; 103 } qpairs[NVME_MAX_QPAIRS]; 104 uint32 qpair_count; 105 } nvme_disk_driver_info; 106 typedef nvme_disk_driver_info::qpair_info qpair_info; 107 108 109 typedef struct { 110 nvme_disk_driver_info* info; 111 } nvme_disk_handle; 112 113 114 static status_t 115 get_geometry(nvme_disk_handle* handle, device_geometry* geometry) 116 { 117 nvme_disk_driver_info* info = handle->info; 118 119 devfs_compute_geometry_size(geometry, info->capacity, info->block_size); 120 geometry->bytes_per_physical_sector = info->block_size; 121 122 geometry->device_type = B_DISK; 123 geometry->removable = false; 124 125 geometry->read_only = false; 126 geometry->write_once = false; 127 128 TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %d, %d, %d, %d\n", 129 geometry->bytes_per_sector, geometry->sectors_per_track, 130 geometry->cylinder_count, geometry->head_count, geometry->device_type, 131 geometry->removable, geometry->read_only, geometry->write_once); 132 133 return B_OK; 134 } 135 136 137 static void 138 nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity, 139 uint32 blockSize) 140 { 141 TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %" B_PRIu32 ")\n", 142 info, capacity, blockSize); 143 144 info->capacity = capacity; 145 info->block_size = blockSize; 146 } 147 148 149 // #pragma mark - device module API 150 151 152 static int32 nvme_interrupt_handler(void* _info); 153 154 155 static status_t 156 nvme_disk_init_device(void* _info, void** _cookie) 157 { 158 CALLED(); 159 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 160 ASSERT(info->ctrlr == NULL); 161 162 pci_device_module_info* pci; 163 pci_device* pcidev; 164 device_node* parent = sDeviceManager->get_parent_node(info->node); 165 sDeviceManager->get_driver(parent, (driver_module_info**)&pci, 166 (void**)&pcidev); 167 pci->get_pci_info(pcidev, &info->info); 168 sDeviceManager->put_node(parent); 169 170 // construct the libnvme pci_device struct 171 pci_device* device = new pci_device; 172 device->vendor_id = info->info.vendor_id; 173 device->device_id = info->info.device_id; 174 device->subvendor_id = 0; 175 device->subdevice_id = 0; 176 177 device->domain = 0; 178 device->bus = info->info.bus; 179 device->dev = info->info.device; 180 device->func = info->info.function; 181 182 device->pci_info = &info->info; 183 184 // enable busmaster and memory mapped access 185 uint16 command = pci->read_pci_config(pcidev, PCI_command, 2); 186 command |= PCI_command_master | PCI_command_memory; 187 pci->write_pci_config(pcidev, PCI_command, 2, command); 188 189 // open the controller 190 info->ctrlr = nvme_ctrlr_open(device, NULL); 191 if (info->ctrlr == NULL) { 192 TRACE_ERROR("failed to open the controller!\n"); 193 return B_ERROR; 194 } 195 196 struct nvme_ctrlr_stat cstat; 197 int err = nvme_ctrlr_stat(info->ctrlr, &cstat); 198 if (err != 0) { 199 TRACE_ERROR("failed to get controller information!\n"); 200 nvme_ctrlr_close(info->ctrlr); 201 return err; 202 } 203 204 TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn); 205 TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size); 206 TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs); 207 208 // TODO: export more than just the first namespace! 209 info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]); 210 if (info->ns == NULL) { 211 TRACE_ERROR("failed to open namespace!\n"); 212 nvme_ctrlr_close(info->ctrlr); 213 return B_ERROR; 214 } 215 TRACE_ALWAYS("namespace 0\n"); 216 217 struct nvme_ns_stat nsstat; 218 err = nvme_ns_stat(info->ns, &nsstat); 219 if (err != 0) { 220 TRACE_ERROR("failed to get namespace information!\n"); 221 nvme_ctrlr_close(info->ctrlr); 222 return err; 223 } 224 225 // store capacity information 226 TRACE_ALWAYS("\tblock size: %" B_PRIuSIZE ", stripe size: %u\n", 227 nsstat.sector_size, info->ns->stripe_size); 228 nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size); 229 230 // set up interrupts 231 if (get_module(B_PCI_X86_MODULE_NAME, (module_info**)&sPCIx86Module) 232 != B_OK) { 233 sPCIx86Module = NULL; 234 } 235 236 command = pci->read_pci_config(pcidev, PCI_command, 2); 237 command &= ~(PCI_command_int_disable); 238 pci->write_pci_config(pcidev, PCI_command, 2, command); 239 240 uint8 irq = info->info.u.h0.interrupt_line; 241 if (sPCIx86Module != NULL) { 242 if (sPCIx86Module->get_msix_count(info->info.bus, info->info.device, 243 info->info.function)) { 244 uint8 msixVector = 0; 245 if (sPCIx86Module->configure_msix(info->info.bus, info->info.device, 246 info->info.function, 1, &msixVector) == B_OK 247 && sPCIx86Module->enable_msix(info->info.bus, info->info.device, 248 info->info.function) == B_OK) { 249 TRACE_ALWAYS("using MSI-X\n"); 250 irq = msixVector; 251 } 252 } else if (sPCIx86Module->get_msi_count(info->info.bus, 253 info->info.device, info->info.function) >= 1) { 254 uint8 msiVector = 0; 255 if (sPCIx86Module->configure_msi(info->info.bus, info->info.device, 256 info->info.function, 1, &msiVector) == B_OK 257 && sPCIx86Module->enable_msi(info->info.bus, info->info.device, 258 info->info.function) == B_OK) { 259 TRACE_ALWAYS("using message signaled interrupts\n"); 260 irq = msiVector; 261 } 262 } 263 } 264 265 if (irq == 0 || irq == 0xFF) { 266 TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n", 267 info->info.bus, info->info.device, info->info.function); 268 info->polling = 1; 269 } else { 270 info->polling = 0; 271 } 272 info->interrupt.Init(NULL, NULL); 273 install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info, B_NO_HANDLED_INFO); 274 275 if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) { 276 uint32 microseconds = 16, threshold = 32; 277 nvme_admin_set_feature(info->ctrlr, false, NVME_FEAT_INTERRUPT_COALESCING, 278 ((microseconds / 100) << 8) | threshold, 0, NULL); 279 } 280 281 // allocate qpairs 282 uint32 try_qpairs = cstat.io_qpairs; 283 try_qpairs = min_c(try_qpairs, NVME_MAX_QPAIRS); 284 if (try_qpairs >= (uint32)smp_get_num_cpus()) { 285 try_qpairs = smp_get_num_cpus(); 286 } else { 287 // Find the highest number of qpairs that evenly divides the number of CPUs. 288 while ((smp_get_num_cpus() % try_qpairs) != 0) 289 try_qpairs--; 290 } 291 info->qpair_count = 0; 292 for (uint32 i = 0; i < try_qpairs; i++) { 293 info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr, 294 (enum nvme_qprio)0, 0); 295 if (info->qpairs[i].qpair == NULL) 296 break; 297 298 info->qpair_count++; 299 } 300 if (info->qpair_count == 0) { 301 TRACE_ERROR("failed to allocate qpairs!\n"); 302 nvme_ctrlr_close(info->ctrlr); 303 return B_NO_MEMORY; 304 } 305 if (info->qpair_count != try_qpairs) { 306 TRACE_ALWAYS("warning: did not get expected number of qpairs\n"); 307 } 308 309 // allocate DMA buffers 310 int buffers = info->qpair_count * 2; 311 312 dma_restrictions restrictions = {}; 313 restrictions.alignment = B_PAGE_SIZE; 314 // Technically, the first and last segments in a transfer can be aligned 315 // only on 32-bits, and the rest only need to have sizes that are a multiple 316 // of the block size. 317 restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2); 318 restrictions.max_transfer_size = cstat.max_xfer_size; 319 info->max_io_blocks = cstat.max_xfer_size / nsstat.sector_size; 320 321 err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers); 322 if (err != 0) { 323 TRACE_ERROR("failed to initialize DMA resource!\n"); 324 nvme_ctrlr_close(info->ctrlr); 325 return err; 326 } 327 328 info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem"); 329 if (info->dma_buffers_sem < 0) { 330 TRACE_ERROR("failed to create DMA buffers semaphore!\n"); 331 nvme_ctrlr_close(info->ctrlr); 332 return info->dma_buffers_sem; 333 } 334 335 // set up rounded-write lock 336 rw_lock_init(&info->rounded_write_lock, "nvme rounded writes"); 337 338 *_cookie = info; 339 return B_OK; 340 } 341 342 343 static void 344 nvme_disk_uninit_device(void* _cookie) 345 { 346 CALLED(); 347 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 348 349 remove_io_interrupt_handler(info->info.u.h0.interrupt_line, 350 nvme_interrupt_handler, (void*)info); 351 352 rw_lock_destroy(&info->rounded_write_lock); 353 354 nvme_ns_close(info->ns); 355 nvme_ctrlr_close(info->ctrlr); 356 357 // TODO: Deallocate MSI(-X). 358 // TODO: Deallocate PCI. 359 } 360 361 362 static status_t 363 nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie) 364 { 365 CALLED(); 366 367 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 368 nvme_disk_handle* handle = (nvme_disk_handle*)malloc( 369 sizeof(nvme_disk_handle)); 370 if (handle == NULL) 371 return B_NO_MEMORY; 372 373 handle->info = info; 374 375 *_cookie = handle; 376 return B_OK; 377 } 378 379 380 static status_t 381 nvme_disk_close(void* cookie) 382 { 383 CALLED(); 384 385 //nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 386 return B_OK; 387 } 388 389 390 static status_t 391 nvme_disk_free(void* cookie) 392 { 393 CALLED(); 394 395 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 396 free(handle); 397 return B_OK; 398 } 399 400 401 // #pragma mark - I/O 402 403 404 static int32 405 nvme_interrupt_handler(void* _info) 406 { 407 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 408 info->interrupt.NotifyAll(); 409 info->polling = -1; 410 return 0; 411 } 412 413 414 static qpair_info* 415 get_qpair(nvme_disk_driver_info* info) 416 { 417 return &info->qpairs[smp_get_current_cpu() % info->qpair_count]; 418 } 419 420 421 static void 422 io_finished_callback(status_t* status, const struct nvme_cpl* cpl) 423 { 424 *status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK; 425 } 426 427 428 static void 429 await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status) 430 { 431 CALLED(); 432 433 ConditionVariableEntry entry; 434 int timeouts = 0; 435 while (status == EINPROGRESS) { 436 info->interrupt.Add(&entry); 437 438 nvme_qpair_poll(qpair, 0); 439 440 if (status != EINPROGRESS) 441 return; 442 443 if (info->polling > 0) { 444 entry.Wait(B_RELATIVE_TIMEOUT, min_c(5 * 1000 * 1000, 445 (1 << timeouts) * 1000)); 446 timeouts++; 447 } else if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) { 448 // This should never happen, as we are woken up on every interrupt 449 // no matter the qpair or transfer within; so if it does occur, 450 // that probably means the controller stalled, or maybe cannot 451 // generate interrupts at all. 452 453 TRACE_ERROR("timed out waiting for interrupt!\n"); 454 if (timeouts++ >= 3) { 455 nvme_qpair_fail(qpair); 456 status = B_TIMED_OUT; 457 return; 458 } 459 460 info->polling++; 461 if (info->polling > 0) { 462 TRACE_ALWAYS("switching to polling mode, performance will be affected!\n"); 463 } 464 } 465 466 nvme_qpair_poll(qpair, 0); 467 } 468 } 469 470 471 struct nvme_io_request { 472 status_t status; 473 474 bool write; 475 476 off_t lba_start; 477 size_t lba_count; 478 479 physical_entry* iovecs; 480 int32 iovec_count; 481 482 int32 iovec_i; 483 uint32 iovec_offset; 484 }; 485 486 487 static void 488 ior_reset_sgl(nvme_io_request* request, uint32_t offset) 489 { 490 TRACE("IOR Reset: %" B_PRIu32 "\n", offset); 491 492 int32 i = 0; 493 while (offset > 0 && request->iovecs[i].size <= offset) { 494 offset -= request->iovecs[i].size; 495 i++; 496 } 497 request->iovec_i = i; 498 request->iovec_offset = offset; 499 } 500 501 502 static int 503 ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length) 504 { 505 int32 index = request->iovec_i; 506 if (index < 0 || index > request->iovec_count) 507 return -1; 508 509 *address = request->iovecs[index].address + request->iovec_offset; 510 *length = request->iovecs[index].size - request->iovec_offset; 511 512 TRACE("IOV %d (+ " B_PRIu32 "): 0x%" B_PRIx64 ", %" B_PRIu32 "\n", 513 request->iovec_i, request->iovec_offset, *address, *length); 514 515 request->iovec_i++; 516 request->iovec_offset = 0; 517 return 0; 518 } 519 520 521 static status_t 522 do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request) 523 { 524 request->status = EINPROGRESS; 525 526 qpair_info* qpinfo = get_qpair(info); 527 int ret = -1; 528 if (request->write) { 529 ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start, 530 request->lba_count, (nvme_cmd_cb)io_finished_callback, request, 531 0, (nvme_req_reset_sgl_cb)ior_reset_sgl, 532 (nvme_req_next_sge_cb)ior_next_sge); 533 } else { 534 ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start, 535 request->lba_count, (nvme_cmd_cb)io_finished_callback, request, 536 0, (nvme_req_reset_sgl_cb)ior_reset_sgl, 537 (nvme_req_next_sge_cb)ior_next_sge); 538 } 539 if (ret != 0) { 540 TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE 541 " blocks failed!\n", request->write ? "write" : "read", 542 request->lba_start, request->lba_count); 543 544 request->lba_count = 0; 545 return ret; 546 } 547 548 await_status(info, qpinfo->qpair, request->status); 549 550 if (request->status != B_OK) { 551 TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE 552 " blocks failed!\n", request->write ? "write" : "read", 553 request->lba_start, request->lba_count); 554 555 request->lba_count = 0; 556 } 557 return request->status; 558 } 559 560 561 static status_t 562 nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request) 563 { 564 CALLED(); 565 566 WriteLocker writeLocker; 567 if (request->IsWrite()) 568 writeLocker.SetTo(handle->info->rounded_write_lock, false); 569 570 status_t status = acquire_sem(handle->info->dma_buffers_sem); 571 if (status != B_OK) { 572 request->SetStatusAndNotify(status); 573 return status; 574 } 575 576 const size_t block_size = handle->info->block_size; 577 578 TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR 579 "; Write %s\n", request, request->Offset(), request->Length(), 580 request->IsWrite() ? "yes" : "no"); 581 582 nvme_io_request nvme_request; 583 while (request->RemainingBytes() > 0) { 584 IOOperation operation; 585 status = handle->info->dma_resource.TranslateNext(request, &operation, 0); 586 if (status != B_OK) 587 break; 588 589 size_t transferredBytes = 0; 590 do { 591 TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR 592 ", write: %s\n", request, operation.Offset(), 593 operation.Length(), operation.IsWrite() ? "yes" : "no"); 594 595 nvme_request.write = operation.IsWrite(); 596 nvme_request.lba_start = operation.Offset() / block_size; 597 nvme_request.lba_count = operation.Length() / block_size; 598 nvme_request.iovecs = (physical_entry*)operation.Vecs(); 599 nvme_request.iovec_count = operation.VecCount(); 600 601 status = do_nvme_io_request(handle->info, &nvme_request); 602 if (status == B_OK && nvme_request.write == request->IsWrite()) 603 transferredBytes += operation.OriginalLength(); 604 605 operation.SetStatus(status); 606 } while (status == B_OK && !operation.Finish()); 607 608 if (status == B_OK && operation.Status() != B_OK) { 609 TRACE_ERROR("I/O succeeded but IOOperation failed!\n"); 610 status = operation.Status(); 611 } 612 613 operation.SetTransferredBytes(transferredBytes); 614 request->OperationFinished(&operation, status, status != B_OK, 615 operation.OriginalOffset() + transferredBytes); 616 617 handle->info->dma_resource.RecycleBuffer(operation.Buffer()); 618 619 TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request, 620 strerror(status), request->RemainingBytes()); 621 if (status != B_OK) 622 break; 623 } 624 625 release_sem(handle->info->dma_buffers_sem); 626 627 // Notify() also takes care of UnlockMemory(). 628 if (status != B_OK && request->Status() == B_OK) 629 request->SetStatusAndNotify(status); 630 else 631 request->NotifyFinished(); 632 return status; 633 } 634 635 636 static status_t 637 nvme_disk_io(void* cookie, io_request* request) 638 { 639 CALLED(); 640 641 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 642 643 nvme_io_request nvme_request; 644 memset(&nvme_request, 0, sizeof(nvme_io_request)); 645 646 nvme_request.write = request->IsWrite(); 647 648 physical_entry* vtophys = NULL; 649 MemoryDeleter vtophysDeleter; 650 651 IOBuffer* buffer = request->Buffer(); 652 status_t status = B_OK; 653 if (!buffer->IsPhysical()) { 654 status = buffer->LockMemory(request->TeamID(), request->IsWrite()); 655 if (status != B_OK) { 656 TRACE_ERROR("failed to lock memory: %s\n", strerror(status)); 657 return status; 658 } 659 // SetStatusAndNotify() takes care of unlocking memory if necessary. 660 661 // This is slightly inefficient, as we could use a BStackOrHeapArray in 662 // the optimal case (few physical entries required), but we would not 663 // know whether or not that was possible until calling get_memory_map() 664 // and then potentially reallocating, which would complicate the logic. 665 666 int32 vtophys_length = (request->Length() / B_PAGE_SIZE) + 2; 667 nvme_request.iovecs = vtophys = (physical_entry*)malloc(sizeof(physical_entry) 668 * vtophys_length); 669 if (vtophys == NULL) { 670 TRACE_ERROR("failed to allocate memory for iovecs\n"); 671 request->SetStatusAndNotify(B_NO_MEMORY); 672 return B_NO_MEMORY; 673 } 674 vtophysDeleter.SetTo(vtophys); 675 676 for (size_t i = 0; i < buffer->VecCount(); i++) { 677 generic_io_vec virt = buffer->VecAt(i); 678 uint32 entries = vtophys_length - nvme_request.iovec_count; 679 680 // Avoid copies by going straight into the vtophys array. 681 status = get_memory_map_etc(request->TeamID(), (void*)virt.base, 682 virt.length, vtophys + nvme_request.iovec_count, &entries); 683 if (status == B_BUFFER_OVERFLOW) { 684 TRACE("vtophys array was too small, reallocating\n"); 685 686 vtophysDeleter.Detach(); 687 vtophys_length *= 2; 688 nvme_request.iovecs = vtophys = (physical_entry*)realloc(vtophys, 689 sizeof(physical_entry) * vtophys_length); 690 vtophysDeleter.SetTo(vtophys); 691 if (vtophys == NULL) { 692 status = B_NO_MEMORY; 693 } else { 694 // Try again, with the larger buffer this time. 695 i--; 696 continue; 697 } 698 } 699 if (status != B_OK) { 700 TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status)); 701 request->SetStatusAndNotify(status); 702 return status; 703 } 704 705 nvme_request.iovec_count += entries; 706 } 707 } else { 708 nvme_request.iovecs = (physical_entry*)buffer->Vecs(); 709 nvme_request.iovec_count = buffer->VecCount(); 710 } 711 712 // See if we need to bounce anything other than the first or last vec. 713 const size_t block_size = handle->info->block_size; 714 bool bounceAll = false; 715 for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) { 716 if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0) 717 bounceAll = true; 718 if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0) 719 bounceAll = true; 720 } 721 722 // See if we need to bounce due to the first or last vecs. 723 if (nvme_request.iovec_count > 1) { 724 // There are middle vecs, so the first and last vecs have different restrictions: they 725 // need only be a multiple of the block size, and must end and start on a page boundary, 726 // respectively, though the start address must always be 32-bit-aligned. 727 physical_entry* entry = &nvme_request.iovecs[0]; 728 if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0 729 || (entry->address & 0x3) != 0 || (entry->size % block_size) != 0)) 730 bounceAll = true; 731 732 entry = &nvme_request.iovecs[nvme_request.iovec_count - 1]; 733 if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0 734 || (entry->size % block_size) != 0)) 735 bounceAll = true; 736 } else { 737 // There is only one vec. Check that it is a multiple of the block size, 738 // and that its address is 32-bit-aligned. 739 physical_entry* entry = &nvme_request.iovecs[0]; 740 if (!bounceAll && ((entry->address & 0x3) != 0 || (entry->size % block_size) != 0)) 741 bounceAll = true; 742 } 743 744 // See if we need to bounce due to rounding. 745 const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size); 746 phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset() 747 - rounded_pos), block_size); 748 if (rounded_pos != request->Offset() || rounded_len != request->Length()) 749 bounceAll = true; 750 751 if (bounceAll) { 752 // Let the bounced I/O routine take care of everything from here. 753 return nvme_disk_bounced_io(handle, request); 754 } 755 756 nvme_request.lba_start = rounded_pos / block_size; 757 nvme_request.lba_count = rounded_len / block_size; 758 759 // No bouncing was required. 760 ReadLocker readLocker; 761 if (nvme_request.write) 762 readLocker.SetTo(handle->info->rounded_write_lock, false); 763 764 // Error check before actually doing I/O. 765 if (status != B_OK) { 766 TRACE_ERROR("I/O failed early: %s\n", strerror(status)); 767 request->SetStatusAndNotify(status); 768 return status; 769 } 770 771 const uint32 max_io_blocks = handle->info->max_io_blocks; 772 int32 remaining = nvme_request.iovec_count; 773 while (remaining > 0) { 774 nvme_request.iovec_count = min_c(remaining, 775 NVME_MAX_SGL_DESCRIPTORS / 2); 776 777 nvme_request.lba_count = 0; 778 for (int i = 0; i < nvme_request.iovec_count; i++) { 779 uint32 new_lba_count = nvme_request.lba_count 780 + (nvme_request.iovecs[i].size / block_size); 781 if (nvme_request.lba_count > 0 && new_lba_count > max_io_blocks) { 782 // We already have a nonzero length, and adding this vec would 783 // make us go over (or we already are over.) Stop adding. 784 nvme_request.iovec_count = i; 785 break; 786 } 787 788 nvme_request.lba_count = new_lba_count; 789 } 790 791 status = do_nvme_io_request(handle->info, &nvme_request); 792 if (status != B_OK) 793 break; 794 795 nvme_request.iovecs += nvme_request.iovec_count; 796 remaining -= nvme_request.iovec_count; 797 nvme_request.lba_start += nvme_request.lba_count; 798 } 799 800 if (status != B_OK) 801 TRACE_ERROR("I/O failed: %s\n", strerror(status)); 802 803 request->SetTransferredBytes(status != B_OK, 804 (nvme_request.lba_start * block_size) - rounded_pos); 805 request->SetStatusAndNotify(status); 806 return status; 807 } 808 809 810 static status_t 811 nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length) 812 { 813 CALLED(); 814 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 815 816 const off_t end = (handle->info->capacity * handle->info->block_size); 817 if (pos >= end) 818 return B_BAD_VALUE; 819 if (pos + (off_t)*length > end) 820 *length = end - pos; 821 822 IORequest request; 823 status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0); 824 if (status != B_OK) 825 return status; 826 827 status = nvme_disk_io(handle, &request); 828 *length = request.TransferredBytes(); 829 return status; 830 } 831 832 833 static status_t 834 nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length) 835 { 836 CALLED(); 837 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 838 839 const off_t end = (handle->info->capacity * handle->info->block_size); 840 if (pos >= end) 841 return B_BAD_VALUE; 842 if (pos + (off_t)*length > end) 843 *length = end - pos; 844 845 IORequest request; 846 status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0); 847 if (status != B_OK) 848 return status; 849 850 status = nvme_disk_io(handle, &request); 851 *length = request.TransferredBytes(); 852 return status; 853 } 854 855 856 static status_t 857 nvme_disk_flush(nvme_disk_driver_info* info) 858 { 859 CALLED(); 860 status_t status = EINPROGRESS; 861 862 qpair_info* qpinfo = get_qpair(info); 863 int ret = nvme_ns_flush(info->ns, qpinfo->qpair, 864 (nvme_cmd_cb)io_finished_callback, &status); 865 if (ret != 0) 866 return ret; 867 868 await_status(info, qpinfo->qpair, status); 869 return status; 870 } 871 872 873 static status_t 874 nvme_disk_trim(nvme_disk_driver_info* info, fs_trim_data* trimData) 875 { 876 CALLED(); 877 trimData->trimmed_size = 0; 878 879 const off_t deviceSize = info->capacity * info->block_size; // in bytes 880 if (deviceSize < 0) 881 return B_BAD_VALUE; 882 883 STATIC_ASSERT(sizeof(deviceSize) <= sizeof(uint64)); 884 ASSERT(deviceSize >= 0); 885 886 // Do not trim past device end. 887 for (uint32 i = 0; i < trimData->range_count; i++) { 888 uint64 offset = trimData->ranges[i].offset; 889 uint64& size = trimData->ranges[i].size; 890 891 if (offset >= (uint64)deviceSize) 892 return B_BAD_VALUE; 893 size = std::min(size, (uint64)deviceSize - offset); 894 } 895 896 // We need contiguous memory for the DSM ranges. 897 nvme_dsm_range* dsmRanges = (nvme_dsm_range*)nvme_mem_alloc_node( 898 trimData->range_count * sizeof(nvme_dsm_range), 0, 0, NULL); 899 if (dsmRanges == NULL) 900 return B_NO_MEMORY; 901 CObjectDeleter<void, void, nvme_free> dsmRangesDeleter(dsmRanges); 902 903 uint64 trimmingSize = 0; 904 for (uint32 i = 0; i < trimData->range_count; i++) { 905 uint64 offset = trimData->ranges[i].offset; 906 uint64 length = trimData->ranges[i].size; 907 908 // Round up offset and length to the block size. 909 // (Some space at the beginning and end may thus not be trimmed.) 910 offset = ROUNDUP(offset, info->block_size); 911 length -= offset - trimData->ranges[i].offset; 912 length = ROUNDDOWN(length, info->block_size); 913 914 if (length == 0) 915 continue; 916 if ((length / info->block_size) > UINT32_MAX) 917 length = uint64(UINT32_MAX) * info->block_size; 918 // TODO: Break into smaller trim ranges! 919 920 TRACE("trim %" B_PRIu64 " bytes from %" B_PRIu64 "\n", length, offset); 921 922 dsmRanges[i].attributes = 0; 923 dsmRanges[i].length = length / info->block_size; 924 dsmRanges[i].starting_lba = offset / info->block_size; 925 926 trimmingSize += length; 927 } 928 929 status_t status = EINPROGRESS; 930 qpair_info* qpair = get_qpair(info); 931 if (nvme_ns_deallocate(info->ns, qpair->qpair, dsmRanges, trimData->range_count, 932 (nvme_cmd_cb)io_finished_callback, &status) != 0) 933 return B_IO_ERROR; 934 935 await_status(info, qpair->qpair, status); 936 if (status != B_OK) 937 return status; 938 939 trimData->trimmed_size = trimmingSize; 940 return B_OK; 941 } 942 943 944 static status_t 945 nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length) 946 { 947 CALLED(); 948 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 949 nvme_disk_driver_info* info = handle->info; 950 951 TRACE("ioctl(op = %" B_PRId32 ")\n", op); 952 953 switch (op) { 954 case B_GET_MEDIA_STATUS: 955 { 956 *(status_t *)buffer = info->media_status; 957 info->media_status = B_OK; 958 return B_OK; 959 break; 960 } 961 962 case B_GET_DEVICE_SIZE: 963 { 964 size_t size = info->capacity * info->block_size; 965 return user_memcpy(buffer, &size, sizeof(size_t)); 966 } 967 968 case B_GET_GEOMETRY: 969 { 970 if (buffer == NULL || length > sizeof(device_geometry)) 971 return B_BAD_VALUE; 972 973 device_geometry geometry; 974 status_t status = get_geometry(handle, &geometry); 975 if (status != B_OK) 976 return status; 977 978 return user_memcpy(buffer, &geometry, length); 979 } 980 981 case B_GET_ICON_NAME: 982 return user_strlcpy((char*)buffer, "devices/drive-harddisk", 983 B_FILE_NAME_LENGTH); 984 985 case B_GET_VECTOR_ICON: 986 { 987 device_icon iconData; 988 if (length != sizeof(device_icon)) 989 return B_BAD_VALUE; 990 if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK) 991 return B_BAD_ADDRESS; 992 993 if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) { 994 if (user_memcpy(iconData.icon_data, kDriveIcon, 995 sizeof(kDriveIcon)) != B_OK) 996 return B_BAD_ADDRESS; 997 } 998 999 iconData.icon_size = sizeof(kDriveIcon); 1000 return user_memcpy(buffer, &iconData, sizeof(device_icon)); 1001 } 1002 1003 case B_FLUSH_DRIVE_CACHE: 1004 return nvme_disk_flush(info); 1005 1006 case B_TRIM_DEVICE: 1007 ASSERT(IS_KERNEL_ADDRESS(buffer)); 1008 return nvme_disk_trim(info, (fs_trim_data*)buffer); 1009 } 1010 1011 return B_DEV_INVALID_IOCTL; 1012 } 1013 1014 1015 // #pragma mark - driver module API 1016 1017 1018 static float 1019 nvme_disk_supports_device(device_node *parent) 1020 { 1021 CALLED(); 1022 1023 const char* bus; 1024 uint16 baseClass, subClass; 1025 1026 if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false) != B_OK 1027 || sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass, false) != B_OK 1028 || sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE, &subClass, false) != B_OK) 1029 return -1.0f; 1030 1031 if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage) 1032 return 0.0f; 1033 1034 if (subClass != PCI_nvm) 1035 return 0.0f; 1036 1037 TRACE("NVMe device found!\n"); 1038 return 1.0f; 1039 } 1040 1041 1042 static status_t 1043 nvme_disk_register_device(device_node* parent) 1044 { 1045 CALLED(); 1046 1047 device_attr attrs[] = { 1048 { B_DEVICE_PRETTY_NAME, B_STRING_TYPE, { .string = "NVMe Disk" } }, 1049 { NULL } 1050 }; 1051 1052 return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME, 1053 attrs, NULL, NULL); 1054 } 1055 1056 1057 static status_t 1058 nvme_disk_init_driver(device_node* node, void** cookie) 1059 { 1060 CALLED(); 1061 1062 int ret = nvme_lib_init((enum nvme_log_level)0, (enum nvme_log_facility)0, NULL); 1063 if (ret != 0) { 1064 TRACE_ERROR("libnvme initialization failed!\n"); 1065 return ret; 1066 } 1067 1068 nvme_disk_driver_info* info = new nvme_disk_driver_info; 1069 if (info == NULL) 1070 return B_NO_MEMORY; 1071 1072 info->media_status = B_OK; 1073 info->node = node; 1074 1075 info->ctrlr = NULL; 1076 1077 *cookie = info; 1078 return B_OK; 1079 } 1080 1081 1082 static void 1083 nvme_disk_uninit_driver(void* _cookie) 1084 { 1085 CALLED(); 1086 1087 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 1088 free(info); 1089 } 1090 1091 1092 static status_t 1093 nvme_disk_register_child_devices(void* _cookie) 1094 { 1095 CALLED(); 1096 1097 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 1098 status_t status; 1099 1100 int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR); 1101 if (id < 0) 1102 return id; 1103 1104 char name[64]; 1105 snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw", 1106 id); 1107 1108 status = sDeviceManager->publish_device(info->node, name, 1109 NVME_DISK_DEVICE_MODULE_NAME); 1110 1111 return status; 1112 } 1113 1114 1115 // #pragma mark - 1116 1117 1118 module_dependency module_dependencies[] = { 1119 { B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager }, 1120 { NULL } 1121 }; 1122 1123 struct device_module_info sNvmeDiskDevice = { 1124 { 1125 NVME_DISK_DEVICE_MODULE_NAME, 1126 0, 1127 NULL 1128 }, 1129 1130 nvme_disk_init_device, 1131 nvme_disk_uninit_device, 1132 NULL, // remove, 1133 1134 nvme_disk_open, 1135 nvme_disk_close, 1136 nvme_disk_free, 1137 nvme_disk_read, 1138 nvme_disk_write, 1139 nvme_disk_io, 1140 nvme_disk_ioctl, 1141 1142 NULL, // select 1143 NULL, // deselect 1144 }; 1145 1146 struct driver_module_info sNvmeDiskDriver = { 1147 { 1148 NVME_DISK_DRIVER_MODULE_NAME, 1149 0, 1150 NULL 1151 }, 1152 1153 nvme_disk_supports_device, 1154 nvme_disk_register_device, 1155 nvme_disk_init_driver, 1156 nvme_disk_uninit_driver, 1157 nvme_disk_register_child_devices, 1158 NULL, // rescan 1159 NULL, // removed 1160 }; 1161 1162 module_info* modules[] = { 1163 (module_info*)&sNvmeDiskDriver, 1164 (module_info*)&sNvmeDiskDevice, 1165 NULL 1166 }; 1167