/*
 * Copyright 2019-2022, Haiku, Inc. All rights reserved.
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 *		Augustin Cavalier <waddlesplash>
 */


#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <condition_variable.h>
#include <AutoDeleter.h>
#include <kernel.h>
#include <smp.h>
#include <util/AutoLock.h>

#include <fs/devfs.h>
#include <bus/PCI.h>
#include <vm/vm.h>

#include "IORequest.h"

extern "C" {
#include <libnvme/nvme.h>
#include <libnvme/nvme_internal.h>
}


//#define TRACE_NVME_DISK
#ifdef TRACE_NVME_DISK
#	define TRACE(x...) dprintf("nvme_disk: " x)
#else
#	define TRACE(x...) ;
#endif
#define TRACE_ALWAYS(x...)	dprintf("nvme_disk: " x)
#define TRACE_ERROR(x...)	dprintf("\33[33mnvme_disk:\33[0m " x)
#define CALLED()			TRACE("CALLED %s\n", __PRETTY_FUNCTION__)


static const uint8 kDriveIcon[] = {
	0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16,
	0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39,
	0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02,
	0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01,
	0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47,
	0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f,
	0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0,
	0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38,
	0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48,
	0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2,
	0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80,
	0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a,
	0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39,
	0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a,
	0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27,
	0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a,
	0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08,
	0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17,
	0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02,
	0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01,
	0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99,
	0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2,
	0xe6, 0x01, 0x17, 0x82, 0x00, 0x04
};


#define NVME_DISK_DRIVER_MODULE_NAME	"drivers/disk/nvme_disk/driver_v1"
#define NVME_DISK_DEVICE_MODULE_NAME	"drivers/disk/nvme_disk/device_v1"
#define NVME_DISK_DEVICE_ID_GENERATOR	"nvme_disk/device_id"

#define NVME_MAX_QPAIRS					(16)


static device_manager_info* sDeviceManager;

typedef struct {
	device_node*			node;
	pci_info				info;

	struct nvme_ctrlr*		ctrlr;

	struct nvme_ns*			ns;
	uint64					capacity;
	uint32					block_size;
	uint32					max_io_blocks;
	status_t				media_status;

	DMAResource				dma_resource;
	sem_id					dma_buffers_sem;

	rw_lock					rounded_write_lock;

	ConditionVariable		interrupt;
	int32					polling;

	struct qpair_info {
		struct nvme_qpair*	qpair;
	}						qpairs[NVME_MAX_QPAIRS];
	uint32					qpair_count;
} nvme_disk_driver_info;
typedef nvme_disk_driver_info::qpair_info qpair_info;


typedef struct {
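	// One of these is created per open(); it only carries a back-pointer
	// to the shared per-device driver state.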
	nvme_disk_driver_info*	info;
} nvme_disk_handle;


static status_t
get_geometry(nvme_disk_handle* handle, device_geometry* geometry)
{
	nvme_disk_driver_info* info = handle->info;

	devfs_compute_geometry_size(geometry, info->capacity, info->block_size);
	geometry->bytes_per_physical_sector = info->block_size;

	geometry->device_type = B_DISK;
	geometry->removable = false;

	geometry->read_only = false;
	geometry->write_once = false;

	TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %"
		B_PRId32 ", %d, %d, %d, %d\n",
		geometry->bytes_per_sector, geometry->sectors_per_track,
		geometry->cylinder_count, geometry->head_count, geometry->device_type,
		geometry->removable, geometry->read_only, geometry->write_once);

	return B_OK;
}


static void
nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity,
	uint32 blockSize)
{
	TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %"
		B_PRIu32 ")\n", info, capacity, blockSize);

	info->capacity = capacity;
	info->block_size = blockSize;
}


// #pragma mark - device module API


static int32 nvme_interrupt_handler(void* _info);


static status_t
nvme_disk_init_device(void* _info, void** _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	ASSERT(info->ctrlr == NULL);

	pci_device_module_info* pci;
	pci_device* pcidev;
	device_node* parent = sDeviceManager->get_parent_node(info->node);
	sDeviceManager->get_driver(parent, (driver_module_info**)&pci,
		(void**)&pcidev);
	pci->get_pci_info(pcidev, &info->info);
	sDeviceManager->put_node(parent);

	// construct the libnvme pci_device struct
	pci_device* device = new pci_device;
	device->vendor_id = info->info.vendor_id;
	device->device_id = info->info.device_id;
	device->subvendor_id = 0;
	device->subdevice_id = 0;

	device->domain = 0;
	device->bus = info->info.bus;
	device->dev = info->info.device;
	device->func = info->info.function;

	device->pci_info = &info->info;

	// enable busmaster and memory-mapped access
	uint16 command = pci->read_pci_config(pcidev, PCI_command, 2);
	command |= PCI_command_master | PCI_command_memory;
	pci->write_pci_config(pcidev, PCI_command, 2, command);

	// open the controller
	info->ctrlr = nvme_ctrlr_open(device, NULL);
	if (info->ctrlr == NULL) {
		TRACE_ERROR("failed to open the controller!\n");
		return B_ERROR;
	}

	struct nvme_ctrlr_stat cstat;
	int err = nvme_ctrlr_stat(info->ctrlr, &cstat);
	if (err != 0) {
		TRACE_ERROR("failed to get controller information!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn);
	TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size);
	TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs);

	// TODO: export more than just the first namespace!
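	// (NVMe namespace IDs are 1-based; cstat.ns_ids[0] is simply the first
	// active namespace the controller reported, which is why the trace
	// below says "namespace 0": it is an index, not an NSID.)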
	info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]);
	if (info->ns == NULL) {
		TRACE_ERROR("failed to open namespace!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_ERROR;
	}
	TRACE_ALWAYS("namespace 0\n");

	struct nvme_ns_stat nsstat;
	err = nvme_ns_stat(info->ns, &nsstat);
	if (err != 0) {
		TRACE_ERROR("failed to get namespace information!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	// store capacity information
	TRACE_ALWAYS("\tblock size: %" B_PRIuSIZE ", stripe size: %u\n",
		nsstat.sector_size, info->ns->stripe_size);
	nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size);

	command = pci->read_pci_config(pcidev, PCI_command, 2);
	command &= ~(PCI_command_int_disable);
	pci->write_pci_config(pcidev, PCI_command, 2, command);

	uint32 irq = info->info.u.h0.interrupt_line;
	if (irq == 0xFF)
		irq = 0;

	if (pci->get_msix_count(pcidev)) {
		uint32 msixVector = 0;
		if (pci->configure_msix(pcidev, 1, &msixVector) == B_OK
				&& pci->enable_msix(pcidev) == B_OK) {
			TRACE_ALWAYS("using MSI-X\n");
			irq = msixVector;
		}
	} else if (pci->get_msi_count(pcidev) >= 1) {
		uint32 msiVector = 0;
		if (pci->configure_msi(pcidev, 1, &msiVector) == B_OK
				&& pci->enable_msi(pcidev) == B_OK) {
			TRACE_ALWAYS("using message signaled interrupts\n");
			irq = msiVector;
		}
	}

	if (irq == 0) {
		TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n",
			info->info.bus, info->info.device, info->info.function);
		info->polling = 1;
	} else {
		info->polling = 0;
	}
	info->interrupt.Init(NULL, NULL);
	install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info,
		B_NO_HANDLED_INFO);

	if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) {
		uint32 microseconds = 16, threshold = 32;
		nvme_admin_set_feature(info->ctrlr, false,
			NVME_FEAT_INTERRUPT_COALESCING,
			((microseconds / 100) << 8) | threshold, 0, NULL);
	}

	// allocate qpairs
	uint32 try_qpairs = cstat.io_qpairs;
	try_qpairs = min_c(try_qpairs, NVME_MAX_QPAIRS);
	if (try_qpairs >= (uint32)smp_get_num_cpus()) {
		try_qpairs = smp_get_num_cpus();
	} else {
		// Find the highest number of qpairs that evenly divides the number
		// of CPUs.
		while ((smp_get_num_cpus() % try_qpairs) != 0)
			try_qpairs--;
	}
	info->qpair_count = 0;
	for (uint32 i = 0; i < try_qpairs; i++) {
		info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr,
			(enum nvme_qprio)0, 0);
		if (info->qpairs[i].qpair == NULL)
			break;

		info->qpair_count++;
	}
	if (info->qpair_count == 0) {
		TRACE_ERROR("failed to allocate qpairs!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_NO_MEMORY;
	}
	if (info->qpair_count != try_qpairs)
		TRACE_ALWAYS("warning: did not get the expected number of qpairs\n");

	// allocate DMA buffers
	int buffers = info->qpair_count * 2;

	dma_restrictions restrictions = {};
	restrictions.alignment = B_PAGE_SIZE;
		// Technically, the first and last segments in a transfer can be
		// aligned only on 32-bits, and the rest only need to have sizes
		// that are a multiple of the block size.
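		// Requiring page alignment for every segment is stricter than the
		// hardware needs, but it keeps these preallocated bounce buffers
		// usable for any request the direct path in nvme_disk_io() rejects.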
	restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2);
	restrictions.max_transfer_size = cstat.max_xfer_size;
	info->max_io_blocks = cstat.max_xfer_size / nsstat.sector_size;

	err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers);
	if (err != 0) {
		TRACE_ERROR("failed to initialize DMA resource!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem");
	if (info->dma_buffers_sem < 0) {
		TRACE_ERROR("failed to create DMA buffers semaphore!\n");
		nvme_ctrlr_close(info->ctrlr);
		return info->dma_buffers_sem;
	}

	// set up rounded-write lock
	rw_lock_init(&info->rounded_write_lock, "nvme rounded writes");

	*_cookie = info;
	return B_OK;
}


static void
nvme_disk_uninit_device(void* _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;

	remove_io_interrupt_handler(info->info.u.h0.interrupt_line,
		nvme_interrupt_handler, (void*)info);

	rw_lock_destroy(&info->rounded_write_lock);

	nvme_ns_close(info->ns);
	nvme_ctrlr_close(info->ctrlr);

	// TODO: Deallocate MSI(-X).
	// TODO: Deallocate PCI.
}


static status_t
nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	nvme_disk_handle* handle = (nvme_disk_handle*)malloc(
		sizeof(nvme_disk_handle));
	if (handle == NULL)
		return B_NO_MEMORY;

	handle->info = info;

	*_cookie = handle;
	return B_OK;
}


static status_t
nvme_disk_close(void* cookie)
{
	CALLED();

	//nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	return B_OK;
}


static status_t
nvme_disk_free(void* cookie)
{
	CALLED();

	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	free(handle);
	return B_OK;
}


// #pragma mark - I/O


static int32
nvme_interrupt_handler(void* _info)
{
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	info->interrupt.NotifyAll();
	info->polling = -1;
	return 0;
}


static qpair_info*
get_qpair(nvme_disk_driver_info* info)
{
	return &info->qpairs[smp_get_current_cpu() % info->qpair_count];
}


static void
io_finished_callback(status_t* status, const struct nvme_cpl* cpl)
{
	*status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK;
}


static void
await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair,
	status_t& status)
{
	CALLED();

	ConditionVariableEntry entry;
	int timeouts = 0;
	while (status == EINPROGRESS) {
		info->interrupt.Add(&entry);

		nvme_qpair_poll(qpair, 0);

		if (status != EINPROGRESS)
			return;

		if (info->polling > 0) {
			entry.Wait(B_RELATIVE_TIMEOUT, min_c(5 * 1000 * 1000,
				(1 << timeouts) * 1000));
			timeouts++;
		} else if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) {
			// This should never happen, as we are woken up on every interrupt
			// no matter the qpair or transfer within; so if it does occur,
			// that probably means the controller stalled, or maybe cannot
			// generate interrupts at all.
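			// After a few missed wakeups we give up on the qpair entirely;
			// until then, we fall back to the timed polling above.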
			TRACE_ERROR("timed out waiting for interrupt!\n");
			if (timeouts++ >= 3) {
				nvme_qpair_fail(qpair);
				status = B_TIMED_OUT;
				return;
			}

			info->polling++;
			if (info->polling > 0) {
				TRACE_ALWAYS("switching to polling mode, "
					"performance will be affected!\n");
			}
		}

		nvme_qpair_poll(qpair, 0);
	}
}


struct nvme_io_request {
	status_t		status;

	bool			write;

	off_t			lba_start;
	size_t			lba_count;

	physical_entry*	iovecs;
	int32			iovec_count;

	int32			iovec_i;
	uint32			iovec_offset;
};


static void
ior_reset_sgl(nvme_io_request* request, uint32_t offset)
{
	TRACE("IOR Reset: %" B_PRIu32 "\n", offset);

	int32 i = 0;
	while (offset > 0 && request->iovecs[i].size <= offset) {
		offset -= request->iovecs[i].size;
		i++;
	}
	request->iovec_i = i;
	request->iovec_offset = offset;
}


static int
ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length)
{
	int32 index = request->iovec_i;
	if (index < 0 || index >= request->iovec_count)
		return -1;

	*address = request->iovecs[index].address + request->iovec_offset;
	*length = request->iovecs[index].size - request->iovec_offset;

	TRACE("IOV %d (+ %" B_PRIu32 "): 0x%" B_PRIx64 ", %" B_PRIu32 "\n",
		request->iovec_i, request->iovec_offset, *address, *length);

	request->iovec_i++;
	request->iovec_offset = 0;
	return 0;
}


static status_t
do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request)
{
	request->status = EINPROGRESS;

	qpair_info* qpinfo = get_qpair(info);
	int ret = -1;
	if (request->write) {
		ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	} else {
		ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	}
	if (ret != 0) {
		TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %"
			B_PRIuSIZE " blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
		return ret;
	}

	await_status(info, qpinfo->qpair, request->status);

	if (request->status != B_OK) {
		TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
	}
	return request->status;
}


static status_t
nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request)
{
	CALLED();

	WriteLocker writeLocker;
	if (request->IsWrite())
		writeLocker.SetTo(handle->info->rounded_write_lock, false);

	status_t status = acquire_sem(handle->info->dma_buffers_sem);
	if (status != B_OK) {
		request->SetStatusAndNotify(status);
		return status;
	}

	const size_t block_size = handle->info->block_size;

	TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR
		"; Write %s\n", request, request->Offset(), request->Length(),
		request->IsWrite() ? "yes" : "no");

	nvme_io_request nvme_request;

	// Each pass translates as much of the request as fits into one bounce
	// buffer: TranslateNext() hands back an IOOperation that satisfies the
	// DMA restrictions set up at init time, bouncing data as needed.
	while (request->RemainingBytes() > 0) {
		IOOperation operation;
		status = handle->info->dma_resource.TranslateNext(request,
			&operation, 0);
		if (status != B_OK)
			break;

		do {
			TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR
				", write: %s\n", request, operation.Offset(),
				operation.Length(), operation.IsWrite() ? "yes" : "no");

			nvme_request.write = operation.IsWrite();
			nvme_request.lba_start = operation.Offset() / block_size;
			nvme_request.lba_count = operation.Length() / block_size;
			nvme_request.iovecs = (physical_entry*)operation.Vecs();
			nvme_request.iovec_count = operation.VecCount();

			status = do_nvme_io_request(handle->info, &nvme_request);

			operation.SetStatus(status,
				status == B_OK ? operation.Length() : 0);
		} while (status == B_OK && !operation.Finish());

		if (status == B_OK && operation.Status() != B_OK) {
			TRACE_ERROR("I/O succeeded but IOOperation failed!\n");
			status = operation.Status();
		}

		request->OperationFinished(&operation);

		handle->info->dma_resource.RecycleBuffer(operation.Buffer());

		TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request,
			strerror(status), request->RemainingBytes());
		if (status != B_OK)
			break;
	}

	release_sem(handle->info->dma_buffers_sem);

	// Notify() also takes care of UnlockMemory().
	if (status != B_OK && request->Status() == B_OK)
		request->SetStatusAndNotify(status);
	else
		request->NotifyFinished();
	return status;
}


static status_t
nvme_disk_io(void* cookie, io_request* request)
{
	CALLED();

	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	const off_t ns_end = (handle->info->capacity * handle->info->block_size);
	if ((request->Offset() + (off_t)request->Length()) > ns_end)
		return ERANGE;

	nvme_io_request nvme_request;
	memset(&nvme_request, 0, sizeof(nvme_io_request));

	nvme_request.write = request->IsWrite();

	physical_entry* vtophys = NULL;
	MemoryDeleter vtophysDeleter;

	IOBuffer* buffer = request->Buffer();
	status_t status = B_OK;
	if (!buffer->IsPhysical()) {
		status = buffer->LockMemory(request->TeamID(), request->IsWrite());
		if (status != B_OK) {
			TRACE_ERROR("failed to lock memory: %s\n", strerror(status));
			return status;
		}
		// SetStatusAndNotify() takes care of unlocking memory if necessary.

		// This is slightly inefficient, as we could use a BStackOrHeapArray
		// in the optimal case (few physical entries required), but we would
		// not know whether or not that was possible until calling
		// get_memory_map() and then potentially reallocating, which would
		// complicate the logic.

		int32 vtophys_length = (request->Length() / B_PAGE_SIZE) + 2;
		nvme_request.iovecs = vtophys = (physical_entry*)malloc(
			sizeof(physical_entry) * vtophys_length);
		if (vtophys == NULL) {
			TRACE_ERROR("failed to allocate memory for iovecs\n");
			request->SetStatusAndNotify(B_NO_MEMORY);
			return B_NO_MEMORY;
		}
		vtophysDeleter.SetTo(vtophys);

		for (size_t i = 0; i < buffer->VecCount(); i++) {
			generic_io_vec virt = buffer->VecAt(i);
			uint32 entries = vtophys_length - nvme_request.iovec_count;

			// Avoid copies by going straight into the vtophys array.
			// (If the array turns out to be too small, the call fails and
			// we grow the array and retry below.)
			status = get_memory_map_etc(request->TeamID(), (void*)virt.base,
				virt.length, vtophys + nvme_request.iovec_count, &entries);
			if (status == B_BAD_VALUE && entries == 0)
				status = B_BUFFER_OVERFLOW;
			if (status == B_BUFFER_OVERFLOW) {
				TRACE("vtophys array was too small, reallocating\n");

				vtophys_length *= 2;
				nvme_request.iovecs = vtophys = (physical_entry*)realloc(
					vtophys, sizeof(physical_entry) * vtophys_length);
				if (vtophys != NULL) {
					vtophysDeleter.Detach();
					vtophysDeleter.SetTo(vtophys);

					// Try again, with the larger buffer this time.
					i--;
					continue;
				} else {
					status = B_NO_MEMORY;
				}
			}
			if (status != B_OK) {
				TRACE_ERROR("I/O get_memory_map failed: %s\n",
					strerror(status));
				request->SetStatusAndNotify(status);
				return status;
			}

			nvme_request.iovec_count += entries;
		}
	} else {
		nvme_request.iovecs = (physical_entry*)buffer->Vecs();
		nvme_request.iovec_count = buffer->VecCount();
	}

	// See if we need to bounce anything other than the first or last vec.
	const size_t block_size = handle->info->block_size;
	bool bounceAll = false;
	for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) {
		if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0)
			bounceAll = true;
		if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0)
			bounceAll = true;
	}

	// See if we need to bounce due to the first or last vecs.
	if (nvme_request.iovec_count > 1) {
		// There are middle vecs, so the first and last vecs have different
		// restrictions: they need only be a multiple of the block size,
		// and must end and start on a page boundary, respectively, though
		// the start address must always be 32-bit-aligned.
		physical_entry* entry = &nvme_request.iovecs[0];
		if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0
				|| (entry->address & 0x3) != 0
				|| (entry->size % block_size) != 0))
			bounceAll = true;

		entry = &nvme_request.iovecs[nvme_request.iovec_count - 1];
		if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0
				|| (entry->size % block_size) != 0))
			bounceAll = true;
	} else {
		// There is only one vec. Check that it is a multiple of the block
		// size, and that its address is 32-bit-aligned.
		physical_entry* entry = &nvme_request.iovecs[0];
		if (!bounceAll && ((entry->address & 0x3) != 0
				|| (entry->size % block_size) != 0))
			bounceAll = true;
	}

	// See if we need to bounce due to rounding.
	const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size);
	phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset()
		- rounded_pos), block_size);
	if (rounded_pos != request->Offset() || rounded_len != request->Length())
		bounceAll = true;

	if (bounceAll) {
		// Let the bounced I/O routine take care of everything from here.
		return nvme_disk_bounced_io(handle, request);
	}

	nvme_request.lba_start = rounded_pos / block_size;
	nvme_request.lba_count = rounded_len / block_size;

	// No bouncing was required.
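	// Note the locking asymmetry: direct writes take the rounded-write lock
	// shared, while nvme_disk_bounced_io() takes it exclusively for writes,
	// so a rounded read-modify-write can never interleave with an
	// overlapping direct write.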
	ReadLocker readLocker;
	if (nvme_request.write)
		readLocker.SetTo(handle->info->rounded_write_lock, false);

	// Error check before actually doing I/O.
	if (status != B_OK) {
		TRACE_ERROR("I/O failed early: %s\n", strerror(status));
		request->SetStatusAndNotify(status);
		return status;
	}

	const uint32 max_io_blocks = handle->info->max_io_blocks;
	int32 remaining = nvme_request.iovec_count;
	while (remaining > 0) {
		nvme_request.iovec_count = min_c(remaining,
			NVME_MAX_SGL_DESCRIPTORS / 2);

		nvme_request.lba_count = 0;
		for (int i = 0; i < nvme_request.iovec_count; i++) {
			uint32 new_lba_count = nvme_request.lba_count
				+ (nvme_request.iovecs[i].size / block_size);
			if (nvme_request.lba_count > 0 && new_lba_count > max_io_blocks) {
				// We already have a nonzero length, and adding this vec would
				// make us go over (or we already are over.) Stop adding.
				nvme_request.iovec_count = i;
				break;
			}

			nvme_request.lba_count = new_lba_count;
		}

		status = do_nvme_io_request(handle->info, &nvme_request);
		if (status != B_OK)
			break;

		nvme_request.iovecs += nvme_request.iovec_count;
		remaining -= nvme_request.iovec_count;
		nvme_request.lba_start += nvme_request.lba_count;
	}

	if (status != B_OK)
		TRACE_ERROR("I/O failed: %s\n", strerror(status));

	request->SetTransferredBytes(status != B_OK,
		(nvme_request.lba_start * block_size) - rounded_pos);
	request->SetStatusAndNotify(status);
	return status;
}


static status_t
nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	const off_t ns_end = (handle->info->capacity * handle->info->block_size);
	if (pos >= ns_end)
		return B_BAD_VALUE;
	if ((pos + (off_t)*length) > ns_end)
		*length = ns_end - pos;

	IORequest request;
	status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0);
	if (status != B_OK)
		return status;

	status = nvme_disk_io(handle, &request);
	*length = request.TransferredBytes();
	return status;
}


static status_t
nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	const off_t ns_end = (handle->info->capacity * handle->info->block_size);
	if (pos >= ns_end)
		return B_BAD_VALUE;
	if ((pos + (off_t)*length) > ns_end)
		*length = ns_end - pos;

	IORequest request;
	status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0);
	if (status != B_OK)
		return status;

	status = nvme_disk_io(handle, &request);
	*length = request.TransferredBytes();
	return status;
}


static status_t
nvme_disk_flush(nvme_disk_driver_info* info)
{
	CALLED();
	status_t status = EINPROGRESS;

	qpair_info* qpinfo = get_qpair(info);
	int ret = nvme_ns_flush(info->ns, qpinfo->qpair,
		(nvme_cmd_cb)io_finished_callback, &status);
	if (ret != 0)
		return ret;

	await_status(info, qpinfo->qpair, status);
	return status;
}


static status_t
nvme_disk_trim(nvme_disk_driver_info* info, fs_trim_data* trimData)
{
	CALLED();
	trimData->trimmed_size = 0;

	const off_t deviceSize = info->capacity * info->block_size; // in bytes
	if (deviceSize < 0)
		return B_BAD_VALUE;

	STATIC_ASSERT(sizeof(deviceSize) <= sizeof(uint64));
	ASSERT(deviceSize >= 0);
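	// An NVMe DSM range covers whole logical blocks: a 64-bit starting LBA
	// plus a 32-bit block count, which is why oversized ranges are clamped
	// to UINT32_MAX blocks further below.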
	// Do not trim past device end.
	for (uint32 i = 0; i < trimData->range_count; i++) {
		uint64 offset = trimData->ranges[i].offset;
		uint64& size = trimData->ranges[i].size;

		if (offset >= (uint64)deviceSize)
			return B_BAD_VALUE;
		size = std::min(size, (uint64)deviceSize - offset);
	}

	// We need contiguous memory for the DSM ranges.
	nvme_dsm_range* dsmRanges = (nvme_dsm_range*)nvme_mem_alloc_node(
		trimData->range_count * sizeof(nvme_dsm_range), 0, 0, NULL);
	if (dsmRanges == NULL)
		return B_NO_MEMORY;
	CObjectDeleter<void, void, nvme_free> dsmRangesDeleter(dsmRanges);

	uint32 dsmRangeCount = 0;
	uint64 trimmingSize = 0;
	for (uint32 i = 0; i < trimData->range_count; i++) {
		uint64 offset = trimData->ranges[i].offset;
		uint64 length = trimData->ranges[i].size;

		// Round up offset and length to the block size.
		// (Some space at the beginning and end may thus not be trimmed.)
		offset = ROUNDUP(offset, info->block_size);
		length -= offset - trimData->ranges[i].offset;
		length = ROUNDDOWN(length, info->block_size);

		if (length == 0)
			continue;
		if ((length / info->block_size) > UINT32_MAX)
			length = uint64(UINT32_MAX) * info->block_size;
			// TODO: Break into smaller trim ranges!

		TRACE("trim %" B_PRIu64 " bytes from %" B_PRIu64 "\n", length, offset);

		// Only ranges that survived the rounding are submitted; using a
		// separate count here ensures we never pass uninitialized entries
		// to the device.
		dsmRanges[dsmRangeCount].attributes = 0;
		dsmRanges[dsmRangeCount].length = length / info->block_size;
		dsmRanges[dsmRangeCount].starting_lba = offset / info->block_size;
		dsmRangeCount++;

		trimmingSize += length;
	}
	if (dsmRangeCount == 0)
		return B_OK;

	status_t status = EINPROGRESS;
	qpair_info* qpair = get_qpair(info);
	if (nvme_ns_deallocate(info->ns, qpair->qpair, dsmRanges, dsmRangeCount,
			(nvme_cmd_cb)io_finished_callback, &status) != 0)
		return B_IO_ERROR;

	await_status(info, qpair->qpair, status);
	if (status != B_OK)
		return status;

	trimData->trimmed_size = trimmingSize;
	return B_OK;
}


static status_t
nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	nvme_disk_driver_info* info = handle->info;

	TRACE("ioctl(op = %" B_PRId32 ")\n", op);

	switch (op) {
		case B_GET_MEDIA_STATUS:
		{
			return user_memcpy(buffer, &info->media_status,
				sizeof(status_t));
		}

		case B_GET_DEVICE_SIZE:
		{
			size_t size = info->capacity * info->block_size;
			return user_memcpy(buffer, &size, sizeof(size_t));
		}

		case B_GET_GEOMETRY:
		{
			if (buffer == NULL || length > sizeof(device_geometry))
				return B_BAD_VALUE;

			device_geometry geometry;
			status_t status = get_geometry(handle, &geometry);
			if (status != B_OK)
				return status;

			return user_memcpy(buffer, &geometry, length);
		}

		case B_GET_ICON_NAME:
			return user_strlcpy((char*)buffer, "devices/drive-harddisk",
				B_FILE_NAME_LENGTH);

		case B_GET_VECTOR_ICON:
		{
			device_icon iconData;
			if (length != sizeof(device_icon))
				return B_BAD_VALUE;
			if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK)
				return B_BAD_ADDRESS;

			if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) {
				if (user_memcpy(iconData.icon_data, kDriveIcon,
						sizeof(kDriveIcon)) != B_OK)
					return B_BAD_ADDRESS;
			}

			iconData.icon_size = sizeof(kDriveIcon);
			return user_memcpy(buffer, &iconData, sizeof(device_icon));
		}

		case B_FLUSH_DRIVE_CACHE:
			return nvme_disk_flush(info);

		case B_TRIM_DEVICE:
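			// The trim data is expected to have been copied into kernel
			// space before the driver is called, hence the ASSERT rather
			// than a user_memcpy().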
			ASSERT(IS_KERNEL_ADDRESS(buffer));
			return nvme_disk_trim(info, (fs_trim_data*)buffer);
	}

	return B_DEV_INVALID_IOCTL;
}


// #pragma mark - driver module API


static float
nvme_disk_supports_device(device_node* parent)
{
	CALLED();

	const char* bus;
	uint16 baseClass, subClass;

	if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false)
			!= B_OK
		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass,
			false) != B_OK
		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE,
			&subClass, false) != B_OK)
		return -1.0f;

	if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage)
		return 0.0f;

	if (subClass != PCI_nvm)
		return 0.0f;

	TRACE("NVMe device found!\n");
	return 1.0f;
}


static status_t
nvme_disk_register_device(device_node* parent)
{
	CALLED();

	device_attr attrs[] = {
		{ B_DEVICE_PRETTY_NAME, B_STRING_TYPE, { .string = "NVMe Disk" } },
		{ NULL }
	};

	return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME,
		attrs, NULL, NULL);
}


static status_t
nvme_disk_init_driver(device_node* node, void** cookie)
{
	CALLED();

	int ret = nvme_lib_init((enum nvme_log_level)0,
		(enum nvme_log_facility)0, NULL);
	if (ret != 0) {
		TRACE_ERROR("libnvme initialization failed!\n");
		return ret;
	}

	nvme_disk_driver_info* info = new nvme_disk_driver_info;
	if (info == NULL)
		return B_NO_MEMORY;

	info->media_status = B_OK;
	info->node = node;

	info->ctrlr = NULL;

	*cookie = info;
	return B_OK;
}


static void
nvme_disk_uninit_driver(void* _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
	delete info;
		// allocated with new in nvme_disk_init_driver(), so freeing with
		// free() would skip the destructors of the C++ members
}


static status_t
nvme_disk_register_child_devices(void* _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
	status_t status;

	int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR);
	if (id < 0)
		return id;

	char name[64];
	snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw", id);

	status = sDeviceManager->publish_device(info->node, name,
		NVME_DISK_DEVICE_MODULE_NAME);

	return status;
}


// #pragma mark -


module_dependency module_dependencies[] = {
	{ B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager },
	{ NULL }
};

struct device_module_info sNvmeDiskDevice = {
	{
		NVME_DISK_DEVICE_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_init_device,
	nvme_disk_uninit_device,
	NULL,	// remove,

	nvme_disk_open,
	nvme_disk_close,
	nvme_disk_free,
	nvme_disk_read,
	nvme_disk_write,
	nvme_disk_io,
	nvme_disk_ioctl,

	NULL,	// select
	NULL,	// deselect
};

struct driver_module_info sNvmeDiskDriver = {
	{
		NVME_DISK_DRIVER_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_supports_device,
	nvme_disk_register_device,
	nvme_disk_init_driver,
	nvme_disk_uninit_driver,
	nvme_disk_register_child_devices,
	NULL,	// rescan
	NULL,	// removed
};

module_info* modules[] = {
	(module_info*)&sNvmeDiskDriver,
	(module_info*)&sNvmeDiskDevice,
	NULL
};