1 /* 2 * Copyright 2019-2022, Haiku, Inc. All rights reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Augustin Cavalier <waddlesplash> 7 */ 8 9 10 #include <stdio.h> 11 #include <stdlib.h> 12 13 #include <algorithm> 14 #include <condition_variable.h> 15 #include <AutoDeleter.h> 16 #include <kernel.h> 17 #include <smp.h> 18 #include <util/AutoLock.h> 19 20 #include <fs/devfs.h> 21 #include <bus/PCI.h> 22 #include <vm/vm.h> 23 24 #include "IORequest.h" 25 26 extern "C" { 27 #include <libnvme/nvme.h> 28 #include <libnvme/nvme_internal.h> 29 } 30 31 32 //#define TRACE_NVME_DISK 33 #ifdef TRACE_NVME_DISK 34 # define TRACE(x...) dprintf("nvme_disk: " x) 35 #else 36 # define TRACE(x...) ; 37 #endif 38 #define TRACE_ALWAYS(x...) dprintf("nvme_disk: " x) 39 #define TRACE_ERROR(x...) dprintf("\33[33mnvme_disk:\33[0m " x) 40 #define CALLED() TRACE("CALLED %s\n", __PRETTY_FUNCTION__) 41 42 43 static const uint8 kDriveIcon[] = { 44 0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16, 45 0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39, 46 0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02, 47 0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01, 48 0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47, 49 0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f, 50 0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0, 51 0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38, 52 0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48, 53 0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2, 54 0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80, 55 0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 56 0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39, 57 0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a, 58 0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27, 59 0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a, 60 0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08, 61 0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17, 62 0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02, 63 0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01, 64 0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99, 65 0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2, 66 0xe6, 0x01, 0x17, 0x82, 0x00, 0x04 67 }; 68 69 70 #define NVME_DISK_DRIVER_MODULE_NAME "drivers/disk/nvme_disk/driver_v1" 71 #define NVME_DISK_DEVICE_MODULE_NAME "drivers/disk/nvme_disk/device_v1" 72 #define NVME_DISK_DEVICE_ID_GENERATOR "nvme_disk/device_id" 73 74 #define NVME_MAX_QPAIRS (16) 75 76 77 static device_manager_info* sDeviceManager; 78 79 typedef struct { 80 device_node* node; 81 pci_info info; 82 83 struct nvme_ctrlr* ctrlr; 84 85 struct nvme_ns* ns; 86 uint64 capacity; 87 uint32 block_size; 88 uint32 max_io_blocks; 89 status_t media_status; 90 91 DMAResource dma_resource; 92 sem_id dma_buffers_sem; 93 94 rw_lock rounded_write_lock; 95 96 ConditionVariable interrupt; 97 int32 polling; 98 99 struct qpair_info { 100 struct nvme_qpair* qpair; 101 } qpairs[NVME_MAX_QPAIRS]; 102 uint32 qpair_count; 103 } nvme_disk_driver_info; 104 typedef nvme_disk_driver_info::qpair_info qpair_info; 105 106 107 typedef struct { 108 nvme_disk_driver_info* info; 109 } nvme_disk_handle; 110 111 112 static status_t 113 get_geometry(nvme_disk_handle* handle, device_geometry* geometry) 114 { 115 nvme_disk_driver_info* info = handle->info; 116 117 devfs_compute_geometry_size(geometry, info->capacity, info->block_size); 118 geometry->bytes_per_physical_sector = info->block_size; 119 120 geometry->device_type = B_DISK; 121 geometry->removable = false; 122 123 geometry->read_only = false; 124 geometry->write_once = false; 125 126 TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %d, %d, %d, %d\n", 127 geometry->bytes_per_sector, geometry->sectors_per_track, 128 geometry->cylinder_count, geometry->head_count, geometry->device_type, 129 geometry->removable, geometry->read_only, geometry->write_once); 130 131 return B_OK; 132 } 133 134 135 static void 136 nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity, 137 uint32 blockSize) 138 { 139 TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %" B_PRIu32 ")\n", 140 info, capacity, blockSize); 141 142 info->capacity = capacity; 143 info->block_size = blockSize; 144 } 145 146 147 // #pragma mark - device module API 148 149 150 static int32 nvme_interrupt_handler(void* _info); 151 152 153 static status_t 154 nvme_disk_init_device(void* _info, void** _cookie) 155 { 156 CALLED(); 157 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 158 ASSERT(info->ctrlr == NULL); 159 160 pci_device_module_info* pci; 161 pci_device* pcidev; 162 device_node* parent = sDeviceManager->get_parent_node(info->node); 163 sDeviceManager->get_driver(parent, (driver_module_info**)&pci, 164 (void**)&pcidev); 165 pci->get_pci_info(pcidev, &info->info); 166 sDeviceManager->put_node(parent); 167 168 // construct the libnvme pci_device struct 169 pci_device* device = new pci_device; 170 device->vendor_id = info->info.vendor_id; 171 device->device_id = info->info.device_id; 172 device->subvendor_id = 0; 173 device->subdevice_id = 0; 174 175 device->domain = 0; 176 device->bus = info->info.bus; 177 device->dev = info->info.device; 178 device->func = info->info.function; 179 180 device->pci_info = &info->info; 181 182 // enable busmaster and memory mapped access 183 uint16 command = pci->read_pci_config(pcidev, PCI_command, 2); 184 command |= PCI_command_master | PCI_command_memory; 185 pci->write_pci_config(pcidev, PCI_command, 2, command); 186 187 // open the controller 188 info->ctrlr = nvme_ctrlr_open(device, NULL); 189 if (info->ctrlr == NULL) { 190 TRACE_ERROR("failed to open the controller!\n"); 191 return B_ERROR; 192 } 193 194 struct nvme_ctrlr_stat cstat; 195 int err = nvme_ctrlr_stat(info->ctrlr, &cstat); 196 if (err != 0) { 197 TRACE_ERROR("failed to get controller information!\n"); 198 nvme_ctrlr_close(info->ctrlr); 199 return err; 200 } 201 202 TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn); 203 TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size); 204 TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs); 205 206 // TODO: export more than just the first namespace! 207 info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]); 208 if (info->ns == NULL) { 209 TRACE_ERROR("failed to open namespace!\n"); 210 nvme_ctrlr_close(info->ctrlr); 211 return B_ERROR; 212 } 213 TRACE_ALWAYS("namespace 0\n"); 214 215 struct nvme_ns_stat nsstat; 216 err = nvme_ns_stat(info->ns, &nsstat); 217 if (err != 0) { 218 TRACE_ERROR("failed to get namespace information!\n"); 219 nvme_ctrlr_close(info->ctrlr); 220 return err; 221 } 222 223 // store capacity information 224 TRACE_ALWAYS("\tblock size: %" B_PRIuSIZE ", stripe size: %u\n", 225 nsstat.sector_size, info->ns->stripe_size); 226 nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size); 227 228 command = pci->read_pci_config(pcidev, PCI_command, 2); 229 command &= ~(PCI_command_int_disable); 230 pci->write_pci_config(pcidev, PCI_command, 2, command); 231 232 uint32 irq = info->info.u.h0.interrupt_line; 233 if (irq == 0xFF) 234 irq = 0; 235 236 if (pci->get_msix_count(pcidev)) { 237 uint32 msixVector = 0; 238 if (pci->configure_msix(pcidev, 1, &msixVector) == B_OK 239 && pci->enable_msix(pcidev) == B_OK) { 240 TRACE_ALWAYS("using MSI-X\n"); 241 irq = msixVector; 242 } 243 } else if (pci->get_msi_count(pcidev) >= 1) { 244 uint32 msiVector = 0; 245 if (pci->configure_msi(pcidev, 1, &msiVector) == B_OK 246 && pci->enable_msi(pcidev) == B_OK) { 247 TRACE_ALWAYS("using message signaled interrupts\n"); 248 irq = msiVector; 249 } 250 } 251 252 if (irq == 0) { 253 TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n", 254 info->info.bus, info->info.device, info->info.function); 255 info->polling = 1; 256 } else { 257 info->polling = 0; 258 } 259 info->interrupt.Init(NULL, NULL); 260 install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info, B_NO_HANDLED_INFO); 261 262 if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) { 263 uint32 microseconds = 16, threshold = 32; 264 nvme_admin_set_feature(info->ctrlr, false, NVME_FEAT_INTERRUPT_COALESCING, 265 ((microseconds / 100) << 8) | threshold, 0, NULL); 266 } 267 268 // allocate qpairs 269 uint32 try_qpairs = cstat.io_qpairs; 270 try_qpairs = min_c(try_qpairs, NVME_MAX_QPAIRS); 271 if (try_qpairs >= (uint32)smp_get_num_cpus()) { 272 try_qpairs = smp_get_num_cpus(); 273 } else { 274 // Find the highest number of qpairs that evenly divides the number of CPUs. 275 while ((smp_get_num_cpus() % try_qpairs) != 0) 276 try_qpairs--; 277 } 278 info->qpair_count = 0; 279 for (uint32 i = 0; i < try_qpairs; i++) { 280 info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr, 281 (enum nvme_qprio)0, 0); 282 if (info->qpairs[i].qpair == NULL) 283 break; 284 285 info->qpair_count++; 286 } 287 if (info->qpair_count == 0) { 288 TRACE_ERROR("failed to allocate qpairs!\n"); 289 nvme_ctrlr_close(info->ctrlr); 290 return B_NO_MEMORY; 291 } 292 if (info->qpair_count != try_qpairs) { 293 TRACE_ALWAYS("warning: did not get expected number of qpairs\n"); 294 } 295 296 // allocate DMA buffers 297 int buffers = info->qpair_count * 2; 298 299 dma_restrictions restrictions = {}; 300 restrictions.alignment = B_PAGE_SIZE; 301 // Technically, the first and last segments in a transfer can be aligned 302 // only on 32-bits, and the rest only need to have sizes that are a multiple 303 // of the block size. 304 restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2); 305 restrictions.max_transfer_size = cstat.max_xfer_size; 306 info->max_io_blocks = cstat.max_xfer_size / nsstat.sector_size; 307 308 err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers); 309 if (err != 0) { 310 TRACE_ERROR("failed to initialize DMA resource!\n"); 311 nvme_ctrlr_close(info->ctrlr); 312 return err; 313 } 314 315 info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem"); 316 if (info->dma_buffers_sem < 0) { 317 TRACE_ERROR("failed to create DMA buffers semaphore!\n"); 318 nvme_ctrlr_close(info->ctrlr); 319 return info->dma_buffers_sem; 320 } 321 322 // set up rounded-write lock 323 rw_lock_init(&info->rounded_write_lock, "nvme rounded writes"); 324 325 *_cookie = info; 326 return B_OK; 327 } 328 329 330 static void 331 nvme_disk_uninit_device(void* _cookie) 332 { 333 CALLED(); 334 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 335 336 remove_io_interrupt_handler(info->info.u.h0.interrupt_line, 337 nvme_interrupt_handler, (void*)info); 338 339 rw_lock_destroy(&info->rounded_write_lock); 340 341 nvme_ns_close(info->ns); 342 nvme_ctrlr_close(info->ctrlr); 343 344 // TODO: Deallocate MSI(-X). 345 // TODO: Deallocate PCI. 346 } 347 348 349 static status_t 350 nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie) 351 { 352 CALLED(); 353 354 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 355 nvme_disk_handle* handle = (nvme_disk_handle*)malloc( 356 sizeof(nvme_disk_handle)); 357 if (handle == NULL) 358 return B_NO_MEMORY; 359 360 handle->info = info; 361 362 *_cookie = handle; 363 return B_OK; 364 } 365 366 367 static status_t 368 nvme_disk_close(void* cookie) 369 { 370 CALLED(); 371 372 //nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 373 return B_OK; 374 } 375 376 377 static status_t 378 nvme_disk_free(void* cookie) 379 { 380 CALLED(); 381 382 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 383 free(handle); 384 return B_OK; 385 } 386 387 388 // #pragma mark - I/O 389 390 391 static int32 392 nvme_interrupt_handler(void* _info) 393 { 394 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 395 info->interrupt.NotifyAll(); 396 info->polling = -1; 397 return 0; 398 } 399 400 401 static qpair_info* 402 get_qpair(nvme_disk_driver_info* info) 403 { 404 return &info->qpairs[smp_get_current_cpu() % info->qpair_count]; 405 } 406 407 408 static void 409 io_finished_callback(status_t* status, const struct nvme_cpl* cpl) 410 { 411 *status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK; 412 } 413 414 415 static void 416 await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status) 417 { 418 CALLED(); 419 420 ConditionVariableEntry entry; 421 int timeouts = 0; 422 while (status == EINPROGRESS) { 423 info->interrupt.Add(&entry); 424 425 nvme_qpair_poll(qpair, 0); 426 427 if (status != EINPROGRESS) 428 return; 429 430 if (info->polling > 0) { 431 entry.Wait(B_RELATIVE_TIMEOUT, min_c(5 * 1000 * 1000, 432 (1 << timeouts) * 1000)); 433 timeouts++; 434 } else if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) { 435 // This should never happen, as we are woken up on every interrupt 436 // no matter the qpair or transfer within; so if it does occur, 437 // that probably means the controller stalled, or maybe cannot 438 // generate interrupts at all. 439 440 TRACE_ERROR("timed out waiting for interrupt!\n"); 441 if (timeouts++ >= 3) { 442 nvme_qpair_fail(qpair); 443 status = B_TIMED_OUT; 444 return; 445 } 446 447 info->polling++; 448 if (info->polling > 0) { 449 TRACE_ALWAYS("switching to polling mode, performance will be affected!\n"); 450 } 451 } 452 453 nvme_qpair_poll(qpair, 0); 454 } 455 } 456 457 458 struct nvme_io_request { 459 status_t status; 460 461 bool write; 462 463 off_t lba_start; 464 size_t lba_count; 465 466 physical_entry* iovecs; 467 int32 iovec_count; 468 469 int32 iovec_i; 470 uint32 iovec_offset; 471 }; 472 473 474 static void 475 ior_reset_sgl(nvme_io_request* request, uint32_t offset) 476 { 477 TRACE("IOR Reset: %" B_PRIu32 "\n", offset); 478 479 int32 i = 0; 480 while (offset > 0 && request->iovecs[i].size <= offset) { 481 offset -= request->iovecs[i].size; 482 i++; 483 } 484 request->iovec_i = i; 485 request->iovec_offset = offset; 486 } 487 488 489 static int 490 ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length) 491 { 492 int32 index = request->iovec_i; 493 if (index < 0 || index > request->iovec_count) 494 return -1; 495 496 *address = request->iovecs[index].address + request->iovec_offset; 497 *length = request->iovecs[index].size - request->iovec_offset; 498 499 TRACE("IOV %d (+ " B_PRIu32 "): 0x%" B_PRIx64 ", %" B_PRIu32 "\n", 500 request->iovec_i, request->iovec_offset, *address, *length); 501 502 request->iovec_i++; 503 request->iovec_offset = 0; 504 return 0; 505 } 506 507 508 static status_t 509 do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request) 510 { 511 request->status = EINPROGRESS; 512 513 qpair_info* qpinfo = get_qpair(info); 514 int ret = -1; 515 if (request->write) { 516 ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start, 517 request->lba_count, (nvme_cmd_cb)io_finished_callback, request, 518 0, (nvme_req_reset_sgl_cb)ior_reset_sgl, 519 (nvme_req_next_sge_cb)ior_next_sge); 520 } else { 521 ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start, 522 request->lba_count, (nvme_cmd_cb)io_finished_callback, request, 523 0, (nvme_req_reset_sgl_cb)ior_reset_sgl, 524 (nvme_req_next_sge_cb)ior_next_sge); 525 } 526 if (ret != 0) { 527 TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE 528 " blocks failed!\n", request->write ? "write" : "read", 529 request->lba_start, request->lba_count); 530 531 request->lba_count = 0; 532 return ret; 533 } 534 535 await_status(info, qpinfo->qpair, request->status); 536 537 if (request->status != B_OK) { 538 TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE 539 " blocks failed!\n", request->write ? "write" : "read", 540 request->lba_start, request->lba_count); 541 542 request->lba_count = 0; 543 } 544 return request->status; 545 } 546 547 548 static status_t 549 nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request) 550 { 551 CALLED(); 552 553 WriteLocker writeLocker; 554 if (request->IsWrite()) 555 writeLocker.SetTo(handle->info->rounded_write_lock, false); 556 557 status_t status = acquire_sem(handle->info->dma_buffers_sem); 558 if (status != B_OK) { 559 request->SetStatusAndNotify(status); 560 return status; 561 } 562 563 const size_t block_size = handle->info->block_size; 564 565 TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR 566 "; Write %s\n", request, request->Offset(), request->Length(), 567 request->IsWrite() ? "yes" : "no"); 568 569 nvme_io_request nvme_request; 570 while (request->RemainingBytes() > 0) { 571 IOOperation operation; 572 status = handle->info->dma_resource.TranslateNext(request, &operation, 0); 573 if (status != B_OK) 574 break; 575 576 do { 577 TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR 578 ", write: %s\n", request, operation.Offset(), 579 operation.Length(), operation.IsWrite() ? "yes" : "no"); 580 581 nvme_request.write = operation.IsWrite(); 582 nvme_request.lba_start = operation.Offset() / block_size; 583 nvme_request.lba_count = operation.Length() / block_size; 584 nvme_request.iovecs = (physical_entry*)operation.Vecs(); 585 nvme_request.iovec_count = operation.VecCount(); 586 587 status = do_nvme_io_request(handle->info, &nvme_request); 588 589 operation.SetStatus(status, 590 status == B_OK ? operation.Length() : 0); 591 } while (status == B_OK && !operation.Finish()); 592 593 if (status == B_OK && operation.Status() != B_OK) { 594 TRACE_ERROR("I/O succeeded but IOOperation failed!\n"); 595 status = operation.Status(); 596 } 597 598 request->OperationFinished(&operation); 599 600 handle->info->dma_resource.RecycleBuffer(operation.Buffer()); 601 602 TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request, 603 strerror(status), request->RemainingBytes()); 604 if (status != B_OK) 605 break; 606 } 607 608 release_sem(handle->info->dma_buffers_sem); 609 610 // Notify() also takes care of UnlockMemory(). 611 if (status != B_OK && request->Status() == B_OK) 612 request->SetStatusAndNotify(status); 613 else 614 request->NotifyFinished(); 615 return status; 616 } 617 618 619 static status_t 620 nvme_disk_io(void* cookie, io_request* request) 621 { 622 CALLED(); 623 624 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 625 626 const off_t ns_end = (handle->info->capacity * handle->info->block_size); 627 if ((request->Offset() + (off_t)request->Length()) > ns_end) 628 return ERANGE; 629 630 nvme_io_request nvme_request; 631 memset(&nvme_request, 0, sizeof(nvme_io_request)); 632 633 nvme_request.write = request->IsWrite(); 634 635 physical_entry* vtophys = NULL; 636 MemoryDeleter vtophysDeleter; 637 638 IOBuffer* buffer = request->Buffer(); 639 status_t status = B_OK; 640 if (!buffer->IsPhysical()) { 641 status = buffer->LockMemory(request->TeamID(), request->IsWrite()); 642 if (status != B_OK) { 643 TRACE_ERROR("failed to lock memory: %s\n", strerror(status)); 644 return status; 645 } 646 // SetStatusAndNotify() takes care of unlocking memory if necessary. 647 648 const int32 vtophysLength = (request->Length() / B_PAGE_SIZE) + 2; 649 if (vtophysLength <= 8) { 650 vtophys = (physical_entry*)alloca(sizeof(physical_entry) * vtophysLength); 651 } else { 652 vtophys = (physical_entry*)malloc(sizeof(physical_entry) * vtophysLength); 653 vtophysDeleter.SetTo(vtophys); 654 } 655 if (vtophys == NULL) { 656 TRACE_ERROR("failed to allocate memory for iovecs\n"); 657 request->SetStatusAndNotify(B_NO_MEMORY); 658 return B_NO_MEMORY; 659 } 660 661 for (size_t i = 0; i < buffer->VecCount(); i++) { 662 generic_io_vec virt = buffer->VecAt(i); 663 uint32 entries = vtophysLength - nvme_request.iovec_count; 664 665 // Avoid copies by going straight into the vtophys array. 666 status = get_memory_map_etc(request->TeamID(), (void*)virt.base, 667 virt.length, vtophys + nvme_request.iovec_count, &entries); 668 669 if (status == B_BAD_VALUE && entries == 0) 670 status = B_BUFFER_OVERFLOW; 671 if (status == B_BUFFER_OVERFLOW) { 672 // Too many physical_entries to use unbounced I/O. 673 vtophysDeleter.Delete(); 674 vtophys = NULL; 675 break; 676 } 677 if (status != B_OK) { 678 TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status)); 679 request->SetStatusAndNotify(status); 680 return status; 681 } 682 683 nvme_request.iovec_count += entries; 684 } 685 686 nvme_request.iovecs = vtophys; 687 } else { 688 nvme_request.iovecs = (physical_entry*)buffer->Vecs(); 689 nvme_request.iovec_count = buffer->VecCount(); 690 } 691 692 // See if we need to bounce anything other than the first or last vec. 693 const size_t block_size = handle->info->block_size; 694 bool bounceAll = (nvme_request.iovecs == NULL); 695 for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) { 696 if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0) 697 bounceAll = true; 698 if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0) 699 bounceAll = true; 700 } 701 702 // See if we need to bounce due to the first or last vecs. 703 if (nvme_request.iovec_count > 1) { 704 // There are middle vecs, so the first and last vecs have different restrictions: they 705 // need only be a multiple of the block size, and must end and start on a page boundary, 706 // respectively, though the start address must always be 32-bit-aligned. 707 physical_entry* entry = &nvme_request.iovecs[0]; 708 if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0 709 || (entry->address & 0x3) != 0 || (entry->size % block_size) != 0)) 710 bounceAll = true; 711 712 entry = &nvme_request.iovecs[nvme_request.iovec_count - 1]; 713 if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0 714 || (entry->size % block_size) != 0)) 715 bounceAll = true; 716 } else { 717 // There is only one vec. Check that it is a multiple of the block size, 718 // and that its address is 32-bit-aligned. 719 physical_entry* entry = &nvme_request.iovecs[0]; 720 if (!bounceAll && ((entry->address & 0x3) != 0 || (entry->size % block_size) != 0)) 721 bounceAll = true; 722 } 723 724 // See if we need to bounce due to rounding. 725 const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size); 726 phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset() 727 - rounded_pos), block_size); 728 if (rounded_pos != request->Offset() || rounded_len != request->Length()) 729 bounceAll = true; 730 731 if (bounceAll) { 732 // Let the bounced I/O routine take care of everything from here. 733 return nvme_disk_bounced_io(handle, request); 734 } 735 736 nvme_request.lba_start = rounded_pos / block_size; 737 nvme_request.lba_count = rounded_len / block_size; 738 739 // No bouncing was required. 740 ReadLocker readLocker; 741 if (nvme_request.write) 742 readLocker.SetTo(handle->info->rounded_write_lock, false); 743 744 // Error check before actually doing I/O. 745 if (status != B_OK) { 746 TRACE_ERROR("I/O failed early: %s\n", strerror(status)); 747 request->SetStatusAndNotify(status); 748 return status; 749 } 750 751 const uint32 max_io_blocks = handle->info->max_io_blocks; 752 int32 remaining = nvme_request.iovec_count; 753 while (remaining > 0) { 754 nvme_request.iovec_count = min_c(remaining, 755 NVME_MAX_SGL_DESCRIPTORS / 2); 756 757 nvme_request.lba_count = 0; 758 for (int i = 0; i < nvme_request.iovec_count; i++) { 759 uint32 new_lba_count = nvme_request.lba_count 760 + (nvme_request.iovecs[i].size / block_size); 761 if (nvme_request.lba_count > 0 && new_lba_count > max_io_blocks) { 762 // We already have a nonzero length, and adding this vec would 763 // make us go over (or we already are over.) Stop adding. 764 nvme_request.iovec_count = i; 765 break; 766 } 767 768 nvme_request.lba_count = new_lba_count; 769 } 770 771 status = do_nvme_io_request(handle->info, &nvme_request); 772 if (status != B_OK) 773 break; 774 775 nvme_request.iovecs += nvme_request.iovec_count; 776 remaining -= nvme_request.iovec_count; 777 nvme_request.lba_start += nvme_request.lba_count; 778 } 779 780 if (status != B_OK) 781 TRACE_ERROR("I/O failed: %s\n", strerror(status)); 782 783 request->SetTransferredBytes(status != B_OK, 784 (nvme_request.lba_start * block_size) - rounded_pos); 785 request->SetStatusAndNotify(status); 786 return status; 787 } 788 789 790 static status_t 791 nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length) 792 { 793 CALLED(); 794 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 795 796 const off_t ns_end = (handle->info->capacity * handle->info->block_size); 797 if (pos >= ns_end) 798 return B_BAD_VALUE; 799 if ((pos + (off_t)*length) > ns_end) 800 *length = ns_end - pos; 801 802 IORequest request; 803 status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0); 804 if (status != B_OK) 805 return status; 806 807 status = nvme_disk_io(handle, &request); 808 *length = request.TransferredBytes(); 809 return status; 810 } 811 812 813 static status_t 814 nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length) 815 { 816 CALLED(); 817 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 818 819 const off_t ns_end = (handle->info->capacity * handle->info->block_size); 820 if (pos >= ns_end) 821 return B_BAD_VALUE; 822 if ((pos + (off_t)*length) > ns_end) 823 *length = ns_end - pos; 824 825 IORequest request; 826 status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0); 827 if (status != B_OK) 828 return status; 829 830 status = nvme_disk_io(handle, &request); 831 *length = request.TransferredBytes(); 832 return status; 833 } 834 835 836 static status_t 837 nvme_disk_flush(nvme_disk_driver_info* info) 838 { 839 CALLED(); 840 status_t status = EINPROGRESS; 841 842 qpair_info* qpinfo = get_qpair(info); 843 int ret = nvme_ns_flush(info->ns, qpinfo->qpair, 844 (nvme_cmd_cb)io_finished_callback, &status); 845 if (ret != 0) 846 return ret; 847 848 await_status(info, qpinfo->qpair, status); 849 return status; 850 } 851 852 853 static status_t 854 nvme_disk_trim(nvme_disk_driver_info* info, fs_trim_data* trimData) 855 { 856 CALLED(); 857 trimData->trimmed_size = 0; 858 859 const off_t deviceSize = info->capacity * info->block_size; // in bytes 860 if (deviceSize < 0) 861 return B_BAD_VALUE; 862 863 STATIC_ASSERT(sizeof(deviceSize) <= sizeof(uint64)); 864 ASSERT(deviceSize >= 0); 865 866 // Do not trim past device end. 867 for (uint32 i = 0; i < trimData->range_count; i++) { 868 uint64 offset = trimData->ranges[i].offset; 869 uint64& size = trimData->ranges[i].size; 870 871 if (offset >= (uint64)deviceSize) 872 return B_BAD_VALUE; 873 size = std::min(size, (uint64)deviceSize - offset); 874 } 875 876 // We need contiguous memory for the DSM ranges. 877 nvme_dsm_range* dsmRanges = (nvme_dsm_range*)nvme_mem_alloc_node( 878 trimData->range_count * sizeof(nvme_dsm_range), 0, 0, NULL); 879 if (dsmRanges == NULL) 880 return B_NO_MEMORY; 881 CObjectDeleter<void, void, nvme_free> dsmRangesDeleter(dsmRanges); 882 883 uint64 trimmingSize = 0; 884 for (uint32 i = 0; i < trimData->range_count; i++) { 885 uint64 offset = trimData->ranges[i].offset; 886 uint64 length = trimData->ranges[i].size; 887 888 // Round up offset and length to the block size. 889 // (Some space at the beginning and end may thus not be trimmed.) 890 offset = ROUNDUP(offset, info->block_size); 891 length -= offset - trimData->ranges[i].offset; 892 length = ROUNDDOWN(length, info->block_size); 893 894 if (length == 0) 895 continue; 896 if ((length / info->block_size) > UINT32_MAX) 897 length = uint64(UINT32_MAX) * info->block_size; 898 // TODO: Break into smaller trim ranges! 899 900 TRACE("trim %" B_PRIu64 " bytes from %" B_PRIu64 "\n", length, offset); 901 902 dsmRanges[i].attributes = 0; 903 dsmRanges[i].length = length / info->block_size; 904 dsmRanges[i].starting_lba = offset / info->block_size; 905 906 trimmingSize += length; 907 } 908 909 status_t status = EINPROGRESS; 910 qpair_info* qpair = get_qpair(info); 911 if (nvme_ns_deallocate(info->ns, qpair->qpair, dsmRanges, trimData->range_count, 912 (nvme_cmd_cb)io_finished_callback, &status) != 0) 913 return B_IO_ERROR; 914 915 await_status(info, qpair->qpair, status); 916 if (status != B_OK) 917 return status; 918 919 trimData->trimmed_size = trimmingSize; 920 return B_OK; 921 } 922 923 924 static status_t 925 nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length) 926 { 927 CALLED(); 928 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 929 nvme_disk_driver_info* info = handle->info; 930 931 TRACE("ioctl(op = %" B_PRId32 ")\n", op); 932 933 switch (op) { 934 case B_GET_MEDIA_STATUS: 935 { 936 return user_memcpy(buffer, &info->media_status, sizeof(status_t)); 937 } 938 939 case B_GET_DEVICE_SIZE: 940 { 941 size_t size = info->capacity * info->block_size; 942 return user_memcpy(buffer, &size, sizeof(size_t)); 943 } 944 945 case B_GET_GEOMETRY: 946 { 947 if (buffer == NULL || length > sizeof(device_geometry)) 948 return B_BAD_VALUE; 949 950 device_geometry geometry; 951 status_t status = get_geometry(handle, &geometry); 952 if (status != B_OK) 953 return status; 954 955 return user_memcpy(buffer, &geometry, length); 956 } 957 958 case B_GET_ICON_NAME: 959 return user_strlcpy((char*)buffer, "devices/drive-harddisk", 960 B_FILE_NAME_LENGTH); 961 962 case B_GET_VECTOR_ICON: 963 { 964 device_icon iconData; 965 if (length != sizeof(device_icon)) 966 return B_BAD_VALUE; 967 if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK) 968 return B_BAD_ADDRESS; 969 970 if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) { 971 if (user_memcpy(iconData.icon_data, kDriveIcon, 972 sizeof(kDriveIcon)) != B_OK) 973 return B_BAD_ADDRESS; 974 } 975 976 iconData.icon_size = sizeof(kDriveIcon); 977 return user_memcpy(buffer, &iconData, sizeof(device_icon)); 978 } 979 980 case B_FLUSH_DRIVE_CACHE: 981 return nvme_disk_flush(info); 982 983 case B_TRIM_DEVICE: 984 ASSERT(IS_KERNEL_ADDRESS(buffer)); 985 return nvme_disk_trim(info, (fs_trim_data*)buffer); 986 } 987 988 return B_DEV_INVALID_IOCTL; 989 } 990 991 992 // #pragma mark - driver module API 993 994 995 static float 996 nvme_disk_supports_device(device_node *parent) 997 { 998 CALLED(); 999 1000 const char* bus; 1001 uint16 baseClass, subClass; 1002 1003 if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false) != B_OK 1004 || sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass, false) != B_OK 1005 || sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE, &subClass, false) != B_OK) 1006 return -1.0f; 1007 1008 if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage) 1009 return 0.0f; 1010 1011 if (subClass != PCI_nvm) 1012 return 0.0f; 1013 1014 TRACE("NVMe device found!\n"); 1015 return 1.0f; 1016 } 1017 1018 1019 static status_t 1020 nvme_disk_register_device(device_node* parent) 1021 { 1022 CALLED(); 1023 1024 device_attr attrs[] = { 1025 { B_DEVICE_PRETTY_NAME, B_STRING_TYPE, { .string = "NVMe Disk" } }, 1026 { NULL } 1027 }; 1028 1029 return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME, 1030 attrs, NULL, NULL); 1031 } 1032 1033 1034 static status_t 1035 nvme_disk_init_driver(device_node* node, void** cookie) 1036 { 1037 CALLED(); 1038 1039 int ret = nvme_lib_init((enum nvme_log_level)0, (enum nvme_log_facility)0, NULL); 1040 if (ret != 0) { 1041 TRACE_ERROR("libnvme initialization failed!\n"); 1042 return ret; 1043 } 1044 1045 nvme_disk_driver_info* info = new nvme_disk_driver_info; 1046 if (info == NULL) 1047 return B_NO_MEMORY; 1048 1049 info->media_status = B_OK; 1050 info->node = node; 1051 1052 info->ctrlr = NULL; 1053 1054 *cookie = info; 1055 return B_OK; 1056 } 1057 1058 1059 static void 1060 nvme_disk_uninit_driver(void* _cookie) 1061 { 1062 CALLED(); 1063 1064 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 1065 free(info); 1066 } 1067 1068 1069 static status_t 1070 nvme_disk_register_child_devices(void* _cookie) 1071 { 1072 CALLED(); 1073 1074 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 1075 status_t status; 1076 1077 int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR); 1078 if (id < 0) 1079 return id; 1080 1081 char name[64]; 1082 snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw", 1083 id); 1084 1085 status = sDeviceManager->publish_device(info->node, name, 1086 NVME_DISK_DEVICE_MODULE_NAME); 1087 1088 return status; 1089 } 1090 1091 1092 // #pragma mark - 1093 1094 1095 module_dependency module_dependencies[] = { 1096 { B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager }, 1097 { NULL } 1098 }; 1099 1100 struct device_module_info sNvmeDiskDevice = { 1101 { 1102 NVME_DISK_DEVICE_MODULE_NAME, 1103 0, 1104 NULL 1105 }, 1106 1107 nvme_disk_init_device, 1108 nvme_disk_uninit_device, 1109 NULL, // remove, 1110 1111 nvme_disk_open, 1112 nvme_disk_close, 1113 nvme_disk_free, 1114 nvme_disk_read, 1115 nvme_disk_write, 1116 nvme_disk_io, 1117 nvme_disk_ioctl, 1118 1119 NULL, // select 1120 NULL, // deselect 1121 }; 1122 1123 struct driver_module_info sNvmeDiskDriver = { 1124 { 1125 NVME_DISK_DRIVER_MODULE_NAME, 1126 0, 1127 NULL 1128 }, 1129 1130 nvme_disk_supports_device, 1131 nvme_disk_register_device, 1132 nvme_disk_init_driver, 1133 nvme_disk_uninit_driver, 1134 nvme_disk_register_child_devices, 1135 NULL, // rescan 1136 NULL, // removed 1137 }; 1138 1139 module_info* modules[] = { 1140 (module_info*)&sNvmeDiskDriver, 1141 (module_info*)&sNvmeDiskDevice, 1142 NULL 1143 }; 1144