1 /* 2 * Copyright 2019-2022, Haiku, Inc. All rights reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Augustin Cavalier <waddlesplash> 7 */ 8 9 10 #include <stdio.h> 11 #include <stdlib.h> 12 13 #include <algorithm> 14 #include <condition_variable.h> 15 #include <AutoDeleter.h> 16 #include <kernel.h> 17 #include <smp.h> 18 #include <util/AutoLock.h> 19 20 #include <fs/devfs.h> 21 #include <bus/PCI.h> 22 #include <vm/vm.h> 23 24 #include "IORequest.h" 25 26 extern "C" { 27 #include <libnvme/nvme.h> 28 #include <libnvme/nvme_internal.h> 29 } 30 31 32 //#define TRACE_NVME_DISK 33 #ifdef TRACE_NVME_DISK 34 # define TRACE(x...) dprintf("nvme_disk: " x) 35 #else 36 # define TRACE(x...) ; 37 #endif 38 #define TRACE_ALWAYS(x...) dprintf("nvme_disk: " x) 39 #define TRACE_ERROR(x...) dprintf("\33[33mnvme_disk:\33[0m " x) 40 #define CALLED() TRACE("CALLED %s\n", __PRETTY_FUNCTION__) 41 42 43 static const uint8 kDriveIcon[] = { 44 0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16, 45 0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39, 46 0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02, 47 0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01, 48 0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47, 49 0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f, 50 0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0, 51 0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38, 52 0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48, 53 0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2, 54 0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80, 55 0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 56 0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39, 57 0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a, 58 0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27, 59 0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a, 60 0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08, 61 0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17, 62 0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02, 63 0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01, 64 0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99, 65 0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2, 66 0xe6, 0x01, 0x17, 0x82, 0x00, 0x04 67 }; 68 69 70 #define NVME_DISK_DRIVER_MODULE_NAME "drivers/disk/nvme_disk/driver_v1" 71 #define NVME_DISK_DEVICE_MODULE_NAME "drivers/disk/nvme_disk/device_v1" 72 #define NVME_DISK_DEVICE_ID_GENERATOR "nvme_disk/device_id" 73 74 #define NVME_MAX_QPAIRS (16) 75 76 77 static device_manager_info* sDeviceManager; 78 79 typedef struct { 80 device_node* node; 81 pci_info info; 82 83 struct nvme_ctrlr* ctrlr; 84 85 struct nvme_ns* ns; 86 uint64 capacity; 87 uint32 block_size; 88 uint32 max_io_blocks; 89 status_t media_status; 90 91 DMAResource dma_resource; 92 sem_id dma_buffers_sem; 93 94 rw_lock rounded_write_lock; 95 96 ConditionVariable interrupt; 97 int32 polling; 98 99 struct qpair_info { 100 struct nvme_qpair* qpair; 101 } qpairs[NVME_MAX_QPAIRS]; 102 uint32 qpair_count; 103 } nvme_disk_driver_info; 104 typedef nvme_disk_driver_info::qpair_info qpair_info; 105 106 107 typedef struct { 108 nvme_disk_driver_info* info; 109 } nvme_disk_handle; 110 111 112 static status_t 113 get_geometry(nvme_disk_handle* handle, device_geometry* geometry) 114 { 115 nvme_disk_driver_info* info = handle->info; 116 117 devfs_compute_geometry_size(geometry, info->capacity, info->block_size); 118 geometry->bytes_per_physical_sector = info->block_size; 119 120 geometry->device_type = B_DISK; 121 geometry->removable = false; 122 123 geometry->read_only = false; 124 geometry->write_once = false; 125 126 TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %d, %d, %d, %d\n", 127 geometry->bytes_per_sector, geometry->sectors_per_track, 128 geometry->cylinder_count, geometry->head_count, geometry->device_type, 129 geometry->removable, geometry->read_only, geometry->write_once); 130 131 return B_OK; 132 } 133 134 135 static void 136 nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity, 137 uint32 blockSize) 138 { 139 TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %" B_PRIu32 ")\n", 140 info, capacity, blockSize); 141 142 info->capacity = capacity; 143 info->block_size = blockSize; 144 } 145 146 147 // #pragma mark - device module API 148 149 150 static int32 nvme_interrupt_handler(void* _info); 151 152 153 static status_t 154 nvme_disk_init_device(void* _info, void** _cookie) 155 { 156 CALLED(); 157 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 158 ASSERT(info->ctrlr == NULL); 159 160 pci_device_module_info* pci; 161 pci_device* pcidev; 162 device_node* parent = sDeviceManager->get_parent_node(info->node); 163 sDeviceManager->get_driver(parent, (driver_module_info**)&pci, 164 (void**)&pcidev); 165 pci->get_pci_info(pcidev, &info->info); 166 sDeviceManager->put_node(parent); 167 168 // construct the libnvme pci_device struct 169 pci_device* device = new pci_device; 170 device->vendor_id = info->info.vendor_id; 171 device->device_id = info->info.device_id; 172 device->subvendor_id = 0; 173 device->subdevice_id = 0; 174 175 device->domain = 0; 176 device->bus = info->info.bus; 177 device->dev = info->info.device; 178 device->func = info->info.function; 179 180 device->pci_info = &info->info; 181 182 // enable busmaster and memory mapped access 183 uint16 command = pci->read_pci_config(pcidev, PCI_command, 2); 184 command |= PCI_command_master | PCI_command_memory; 185 pci->write_pci_config(pcidev, PCI_command, 2, command); 186 187 // open the controller 188 info->ctrlr = nvme_ctrlr_open(device, NULL); 189 if (info->ctrlr == NULL) { 190 TRACE_ERROR("failed to open the controller!\n"); 191 return B_ERROR; 192 } 193 194 struct nvme_ctrlr_stat cstat; 195 int err = nvme_ctrlr_stat(info->ctrlr, &cstat); 196 if (err != 0) { 197 TRACE_ERROR("failed to get controller information!\n"); 198 nvme_ctrlr_close(info->ctrlr); 199 return err; 200 } 201 202 TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn); 203 TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size); 204 TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs); 205 206 // TODO: export more than just the first namespace! 207 info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]); 208 if (info->ns == NULL) { 209 TRACE_ERROR("failed to open namespace!\n"); 210 nvme_ctrlr_close(info->ctrlr); 211 return B_ERROR; 212 } 213 TRACE_ALWAYS("namespace 0\n"); 214 215 struct nvme_ns_stat nsstat; 216 err = nvme_ns_stat(info->ns, &nsstat); 217 if (err != 0) { 218 TRACE_ERROR("failed to get namespace information!\n"); 219 nvme_ctrlr_close(info->ctrlr); 220 return err; 221 } 222 223 // store capacity information 224 TRACE_ALWAYS("\tblock size: %" B_PRIuSIZE ", stripe size: %u\n", 225 nsstat.sector_size, info->ns->stripe_size); 226 nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size); 227 228 command = pci->read_pci_config(pcidev, PCI_command, 2); 229 command &= ~(PCI_command_int_disable); 230 pci->write_pci_config(pcidev, PCI_command, 2, command); 231 232 uint32 irq = info->info.u.h0.interrupt_line; 233 if (irq == 0xFF) 234 irq = 0; 235 236 if (pci->get_msix_count(pcidev)) { 237 uint32 msixVector = 0; 238 if (pci->configure_msix(pcidev, 1, &msixVector) == B_OK 239 && pci->enable_msix(pcidev) == B_OK) { 240 TRACE_ALWAYS("using MSI-X\n"); 241 irq = msixVector; 242 } 243 } else if (pci->get_msi_count(pcidev) >= 1) { 244 uint32 msiVector = 0; 245 if (pci->configure_msi(pcidev, 1, &msiVector) == B_OK 246 && pci->enable_msi(pcidev) == B_OK) { 247 TRACE_ALWAYS("using message signaled interrupts\n"); 248 irq = msiVector; 249 } 250 } 251 252 if (irq == 0) { 253 TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n", 254 info->info.bus, info->info.device, info->info.function); 255 info->polling = 1; 256 } else { 257 info->polling = 0; 258 } 259 info->interrupt.Init(NULL, NULL); 260 install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info, B_NO_HANDLED_INFO); 261 262 if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) { 263 uint32 microseconds = 16, threshold = 32; 264 nvme_admin_set_feature(info->ctrlr, false, NVME_FEAT_INTERRUPT_COALESCING, 265 ((microseconds / 100) << 8) | threshold, 0, NULL); 266 } 267 268 // allocate qpairs 269 uint32 try_qpairs = cstat.io_qpairs; 270 try_qpairs = min_c(try_qpairs, NVME_MAX_QPAIRS); 271 if (try_qpairs >= (uint32)smp_get_num_cpus()) { 272 try_qpairs = smp_get_num_cpus(); 273 } else { 274 // Find the highest number of qpairs that evenly divides the number of CPUs. 275 while ((smp_get_num_cpus() % try_qpairs) != 0) 276 try_qpairs--; 277 } 278 info->qpair_count = 0; 279 for (uint32 i = 0; i < try_qpairs; i++) { 280 info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr, 281 (enum nvme_qprio)0, 0); 282 if (info->qpairs[i].qpair == NULL) 283 break; 284 285 info->qpair_count++; 286 } 287 if (info->qpair_count == 0) { 288 TRACE_ERROR("failed to allocate qpairs!\n"); 289 nvme_ctrlr_close(info->ctrlr); 290 return B_NO_MEMORY; 291 } 292 if (info->qpair_count != try_qpairs) { 293 TRACE_ALWAYS("warning: did not get expected number of qpairs\n"); 294 } 295 296 // allocate DMA buffers 297 int buffers = info->qpair_count * 2; 298 299 dma_restrictions restrictions = {}; 300 restrictions.alignment = B_PAGE_SIZE; 301 // Technically, the first and last segments in a transfer can be aligned 302 // only on 32-bits, and the rest only need to have sizes that are a multiple 303 // of the block size. 304 restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2); 305 restrictions.max_transfer_size = cstat.max_xfer_size; 306 info->max_io_blocks = cstat.max_xfer_size / nsstat.sector_size; 307 308 err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers); 309 if (err != 0) { 310 TRACE_ERROR("failed to initialize DMA resource!\n"); 311 nvme_ctrlr_close(info->ctrlr); 312 return err; 313 } 314 315 info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem"); 316 if (info->dma_buffers_sem < 0) { 317 TRACE_ERROR("failed to create DMA buffers semaphore!\n"); 318 nvme_ctrlr_close(info->ctrlr); 319 return info->dma_buffers_sem; 320 } 321 322 // set up rounded-write lock 323 rw_lock_init(&info->rounded_write_lock, "nvme rounded writes"); 324 325 *_cookie = info; 326 return B_OK; 327 } 328 329 330 static void 331 nvme_disk_uninit_device(void* _cookie) 332 { 333 CALLED(); 334 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 335 336 remove_io_interrupt_handler(info->info.u.h0.interrupt_line, 337 nvme_interrupt_handler, (void*)info); 338 339 rw_lock_destroy(&info->rounded_write_lock); 340 341 nvme_ns_close(info->ns); 342 nvme_ctrlr_close(info->ctrlr); 343 344 // TODO: Deallocate MSI(-X). 345 // TODO: Deallocate PCI. 346 } 347 348 349 static status_t 350 nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie) 351 { 352 CALLED(); 353 354 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 355 nvme_disk_handle* handle = (nvme_disk_handle*)malloc( 356 sizeof(nvme_disk_handle)); 357 if (handle == NULL) 358 return B_NO_MEMORY; 359 360 handle->info = info; 361 362 *_cookie = handle; 363 return B_OK; 364 } 365 366 367 static status_t 368 nvme_disk_close(void* cookie) 369 { 370 CALLED(); 371 372 //nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 373 return B_OK; 374 } 375 376 377 static status_t 378 nvme_disk_free(void* cookie) 379 { 380 CALLED(); 381 382 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 383 free(handle); 384 return B_OK; 385 } 386 387 388 // #pragma mark - I/O 389 390 391 static int32 392 nvme_interrupt_handler(void* _info) 393 { 394 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 395 info->interrupt.NotifyAll(); 396 info->polling = -1; 397 return 0; 398 } 399 400 401 static qpair_info* 402 get_qpair(nvme_disk_driver_info* info) 403 { 404 return &info->qpairs[smp_get_current_cpu() % info->qpair_count]; 405 } 406 407 408 static void 409 io_finished_callback(status_t* status, const struct nvme_cpl* cpl) 410 { 411 *status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK; 412 } 413 414 415 static void 416 await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status) 417 { 418 CALLED(); 419 420 ConditionVariableEntry entry; 421 int timeouts = 0; 422 while (status == EINPROGRESS) { 423 info->interrupt.Add(&entry); 424 425 nvme_qpair_poll(qpair, 0); 426 427 if (status != EINPROGRESS) 428 return; 429 430 if (info->polling > 0) { 431 entry.Wait(B_RELATIVE_TIMEOUT, min_c(5 * 1000 * 1000, 432 (1 << timeouts) * 1000)); 433 timeouts++; 434 } else if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) { 435 // This should never happen, as we are woken up on every interrupt 436 // no matter the qpair or transfer within; so if it does occur, 437 // that probably means the controller stalled, or maybe cannot 438 // generate interrupts at all. 439 440 TRACE_ERROR("timed out waiting for interrupt!\n"); 441 if (timeouts++ >= 3) { 442 nvme_qpair_fail(qpair); 443 status = B_TIMED_OUT; 444 return; 445 } 446 447 info->polling++; 448 if (info->polling > 0) { 449 TRACE_ALWAYS("switching to polling mode, performance will be affected!\n"); 450 } 451 } 452 453 nvme_qpair_poll(qpair, 0); 454 } 455 } 456 457 458 struct nvme_io_request { 459 status_t status; 460 461 bool write; 462 463 off_t lba_start; 464 size_t lba_count; 465 466 physical_entry* iovecs; 467 int32 iovec_count; 468 469 int32 iovec_i; 470 uint32 iovec_offset; 471 }; 472 473 474 static void 475 ior_reset_sgl(nvme_io_request* request, uint32_t offset) 476 { 477 TRACE("IOR Reset: %" B_PRIu32 "\n", offset); 478 479 int32 i = 0; 480 while (offset > 0 && request->iovecs[i].size <= offset) { 481 offset -= request->iovecs[i].size; 482 i++; 483 } 484 request->iovec_i = i; 485 request->iovec_offset = offset; 486 } 487 488 489 static int 490 ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length) 491 { 492 int32 index = request->iovec_i; 493 if (index < 0 || index > request->iovec_count) 494 return -1; 495 496 *address = request->iovecs[index].address + request->iovec_offset; 497 *length = request->iovecs[index].size - request->iovec_offset; 498 499 TRACE("IOV %d (+ " B_PRIu32 "): 0x%" B_PRIx64 ", %" B_PRIu32 "\n", 500 request->iovec_i, request->iovec_offset, *address, *length); 501 502 request->iovec_i++; 503 request->iovec_offset = 0; 504 return 0; 505 } 506 507 508 static status_t 509 do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request) 510 { 511 request->status = EINPROGRESS; 512 513 qpair_info* qpinfo = get_qpair(info); 514 int ret = -1; 515 if (request->write) { 516 ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start, 517 request->lba_count, (nvme_cmd_cb)io_finished_callback, request, 518 0, (nvme_req_reset_sgl_cb)ior_reset_sgl, 519 (nvme_req_next_sge_cb)ior_next_sge); 520 } else { 521 ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start, 522 request->lba_count, (nvme_cmd_cb)io_finished_callback, request, 523 0, (nvme_req_reset_sgl_cb)ior_reset_sgl, 524 (nvme_req_next_sge_cb)ior_next_sge); 525 } 526 if (ret != 0) { 527 TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE 528 " blocks failed!\n", request->write ? "write" : "read", 529 request->lba_start, request->lba_count); 530 531 request->lba_count = 0; 532 return ret; 533 } 534 535 await_status(info, qpinfo->qpair, request->status); 536 537 if (request->status != B_OK) { 538 TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE 539 " blocks failed!\n", request->write ? "write" : "read", 540 request->lba_start, request->lba_count); 541 542 request->lba_count = 0; 543 } 544 return request->status; 545 } 546 547 548 static status_t 549 nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request) 550 { 551 CALLED(); 552 553 WriteLocker writeLocker; 554 if (request->IsWrite()) 555 writeLocker.SetTo(handle->info->rounded_write_lock, false); 556 557 status_t status = acquire_sem(handle->info->dma_buffers_sem); 558 if (status != B_OK) { 559 request->SetStatusAndNotify(status); 560 return status; 561 } 562 563 const size_t block_size = handle->info->block_size; 564 565 TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR 566 "; Write %s\n", request, request->Offset(), request->Length(), 567 request->IsWrite() ? "yes" : "no"); 568 569 nvme_io_request nvme_request; 570 while (request->RemainingBytes() > 0) { 571 IOOperation operation; 572 status = handle->info->dma_resource.TranslateNext(request, &operation, 0); 573 if (status != B_OK) 574 break; 575 576 do { 577 TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR 578 ", write: %s\n", request, operation.Offset(), 579 operation.Length(), operation.IsWrite() ? "yes" : "no"); 580 581 nvme_request.write = operation.IsWrite(); 582 nvme_request.lba_start = operation.Offset() / block_size; 583 nvme_request.lba_count = operation.Length() / block_size; 584 nvme_request.iovecs = (physical_entry*)operation.Vecs(); 585 nvme_request.iovec_count = operation.VecCount(); 586 587 status = do_nvme_io_request(handle->info, &nvme_request); 588 589 operation.SetStatus(status, 590 status == B_OK ? operation.Length() : 0); 591 } while (status == B_OK && !operation.Finish()); 592 593 if (status == B_OK && operation.Status() != B_OK) { 594 TRACE_ERROR("I/O succeeded but IOOperation failed!\n"); 595 status = operation.Status(); 596 } 597 598 request->OperationFinished(&operation); 599 600 handle->info->dma_resource.RecycleBuffer(operation.Buffer()); 601 602 TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request, 603 strerror(status), request->RemainingBytes()); 604 if (status != B_OK) 605 break; 606 } 607 608 release_sem(handle->info->dma_buffers_sem); 609 610 // Notify() also takes care of UnlockMemory(). 611 if (status != B_OK && request->Status() == B_OK) 612 request->SetStatusAndNotify(status); 613 else 614 request->NotifyFinished(); 615 return status; 616 } 617 618 619 static status_t 620 nvme_disk_io(void* cookie, io_request* request) 621 { 622 CALLED(); 623 624 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 625 626 const off_t ns_end = (handle->info->capacity * handle->info->block_size); 627 if ((request->Offset() + (off_t)request->Length()) > ns_end) 628 return ERANGE; 629 630 nvme_io_request nvme_request; 631 memset(&nvme_request, 0, sizeof(nvme_io_request)); 632 633 nvme_request.write = request->IsWrite(); 634 635 physical_entry* vtophys = NULL; 636 MemoryDeleter vtophysDeleter; 637 638 IOBuffer* buffer = request->Buffer(); 639 status_t status = B_OK; 640 if (!buffer->IsPhysical()) { 641 status = buffer->LockMemory(request->TeamID(), request->IsWrite()); 642 if (status != B_OK) { 643 TRACE_ERROR("failed to lock memory: %s\n", strerror(status)); 644 return status; 645 } 646 // SetStatusAndNotify() takes care of unlocking memory if necessary. 647 648 // This is slightly inefficient, as we could use a BStackOrHeapArray in 649 // the optimal case (few physical entries required), but we would not 650 // know whether or not that was possible until calling get_memory_map() 651 // and then potentially reallocating, which would complicate the logic. 652 653 int32 vtophys_length = (request->Length() / B_PAGE_SIZE) + 2; 654 nvme_request.iovecs = vtophys = (physical_entry*)malloc(sizeof(physical_entry) 655 * vtophys_length); 656 if (vtophys == NULL) { 657 TRACE_ERROR("failed to allocate memory for iovecs\n"); 658 request->SetStatusAndNotify(B_NO_MEMORY); 659 return B_NO_MEMORY; 660 } 661 vtophysDeleter.SetTo(vtophys); 662 663 for (size_t i = 0; i < buffer->VecCount(); i++) { 664 generic_io_vec virt = buffer->VecAt(i); 665 uint32 entries = vtophys_length - nvme_request.iovec_count; 666 667 // Avoid copies by going straight into the vtophys array. 668 status = get_memory_map_etc(request->TeamID(), (void*)virt.base, 669 virt.length, vtophys + nvme_request.iovec_count, &entries); 670 if (status == B_BUFFER_OVERFLOW) { 671 TRACE("vtophys array was too small, reallocating\n"); 672 673 vtophysDeleter.Detach(); 674 vtophys_length *= 2; 675 nvme_request.iovecs = vtophys = (physical_entry*)realloc(vtophys, 676 sizeof(physical_entry) * vtophys_length); 677 vtophysDeleter.SetTo(vtophys); 678 if (vtophys == NULL) { 679 status = B_NO_MEMORY; 680 } else { 681 // Try again, with the larger buffer this time. 682 i--; 683 continue; 684 } 685 } 686 if (status != B_OK) { 687 TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status)); 688 request->SetStatusAndNotify(status); 689 return status; 690 } 691 692 nvme_request.iovec_count += entries; 693 } 694 } else { 695 nvme_request.iovecs = (physical_entry*)buffer->Vecs(); 696 nvme_request.iovec_count = buffer->VecCount(); 697 } 698 699 // See if we need to bounce anything other than the first or last vec. 700 const size_t block_size = handle->info->block_size; 701 bool bounceAll = false; 702 for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) { 703 if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0) 704 bounceAll = true; 705 if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0) 706 bounceAll = true; 707 } 708 709 // See if we need to bounce due to the first or last vecs. 710 if (nvme_request.iovec_count > 1) { 711 // There are middle vecs, so the first and last vecs have different restrictions: they 712 // need only be a multiple of the block size, and must end and start on a page boundary, 713 // respectively, though the start address must always be 32-bit-aligned. 714 physical_entry* entry = &nvme_request.iovecs[0]; 715 if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0 716 || (entry->address & 0x3) != 0 || (entry->size % block_size) != 0)) 717 bounceAll = true; 718 719 entry = &nvme_request.iovecs[nvme_request.iovec_count - 1]; 720 if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0 721 || (entry->size % block_size) != 0)) 722 bounceAll = true; 723 } else { 724 // There is only one vec. Check that it is a multiple of the block size, 725 // and that its address is 32-bit-aligned. 726 physical_entry* entry = &nvme_request.iovecs[0]; 727 if (!bounceAll && ((entry->address & 0x3) != 0 || (entry->size % block_size) != 0)) 728 bounceAll = true; 729 } 730 731 // See if we need to bounce due to rounding. 732 const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size); 733 phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset() 734 - rounded_pos), block_size); 735 if (rounded_pos != request->Offset() || rounded_len != request->Length()) 736 bounceAll = true; 737 738 if (bounceAll) { 739 // Let the bounced I/O routine take care of everything from here. 740 return nvme_disk_bounced_io(handle, request); 741 } 742 743 nvme_request.lba_start = rounded_pos / block_size; 744 nvme_request.lba_count = rounded_len / block_size; 745 746 // No bouncing was required. 747 ReadLocker readLocker; 748 if (nvme_request.write) 749 readLocker.SetTo(handle->info->rounded_write_lock, false); 750 751 // Error check before actually doing I/O. 752 if (status != B_OK) { 753 TRACE_ERROR("I/O failed early: %s\n", strerror(status)); 754 request->SetStatusAndNotify(status); 755 return status; 756 } 757 758 const uint32 max_io_blocks = handle->info->max_io_blocks; 759 int32 remaining = nvme_request.iovec_count; 760 while (remaining > 0) { 761 nvme_request.iovec_count = min_c(remaining, 762 NVME_MAX_SGL_DESCRIPTORS / 2); 763 764 nvme_request.lba_count = 0; 765 for (int i = 0; i < nvme_request.iovec_count; i++) { 766 uint32 new_lba_count = nvme_request.lba_count 767 + (nvme_request.iovecs[i].size / block_size); 768 if (nvme_request.lba_count > 0 && new_lba_count > max_io_blocks) { 769 // We already have a nonzero length, and adding this vec would 770 // make us go over (or we already are over.) Stop adding. 771 nvme_request.iovec_count = i; 772 break; 773 } 774 775 nvme_request.lba_count = new_lba_count; 776 } 777 778 status = do_nvme_io_request(handle->info, &nvme_request); 779 if (status != B_OK) 780 break; 781 782 nvme_request.iovecs += nvme_request.iovec_count; 783 remaining -= nvme_request.iovec_count; 784 nvme_request.lba_start += nvme_request.lba_count; 785 } 786 787 if (status != B_OK) 788 TRACE_ERROR("I/O failed: %s\n", strerror(status)); 789 790 request->SetTransferredBytes(status != B_OK, 791 (nvme_request.lba_start * block_size) - rounded_pos); 792 request->SetStatusAndNotify(status); 793 return status; 794 } 795 796 797 static status_t 798 nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length) 799 { 800 CALLED(); 801 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 802 803 const off_t ns_end = (handle->info->capacity * handle->info->block_size); 804 if (pos >= ns_end) 805 return B_BAD_VALUE; 806 if ((pos + (off_t)*length) > ns_end) 807 *length = ns_end - pos; 808 809 IORequest request; 810 status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0); 811 if (status != B_OK) 812 return status; 813 814 status = nvme_disk_io(handle, &request); 815 *length = request.TransferredBytes(); 816 return status; 817 } 818 819 820 static status_t 821 nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length) 822 { 823 CALLED(); 824 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 825 826 const off_t ns_end = (handle->info->capacity * handle->info->block_size); 827 if (pos >= ns_end) 828 return B_BAD_VALUE; 829 if ((pos + (off_t)*length) > ns_end) 830 *length = ns_end - pos; 831 832 IORequest request; 833 status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0); 834 if (status != B_OK) 835 return status; 836 837 status = nvme_disk_io(handle, &request); 838 *length = request.TransferredBytes(); 839 return status; 840 } 841 842 843 static status_t 844 nvme_disk_flush(nvme_disk_driver_info* info) 845 { 846 CALLED(); 847 status_t status = EINPROGRESS; 848 849 qpair_info* qpinfo = get_qpair(info); 850 int ret = nvme_ns_flush(info->ns, qpinfo->qpair, 851 (nvme_cmd_cb)io_finished_callback, &status); 852 if (ret != 0) 853 return ret; 854 855 await_status(info, qpinfo->qpair, status); 856 return status; 857 } 858 859 860 static status_t 861 nvme_disk_trim(nvme_disk_driver_info* info, fs_trim_data* trimData) 862 { 863 CALLED(); 864 trimData->trimmed_size = 0; 865 866 const off_t deviceSize = info->capacity * info->block_size; // in bytes 867 if (deviceSize < 0) 868 return B_BAD_VALUE; 869 870 STATIC_ASSERT(sizeof(deviceSize) <= sizeof(uint64)); 871 ASSERT(deviceSize >= 0); 872 873 // Do not trim past device end. 874 for (uint32 i = 0; i < trimData->range_count; i++) { 875 uint64 offset = trimData->ranges[i].offset; 876 uint64& size = trimData->ranges[i].size; 877 878 if (offset >= (uint64)deviceSize) 879 return B_BAD_VALUE; 880 size = std::min(size, (uint64)deviceSize - offset); 881 } 882 883 // We need contiguous memory for the DSM ranges. 884 nvme_dsm_range* dsmRanges = (nvme_dsm_range*)nvme_mem_alloc_node( 885 trimData->range_count * sizeof(nvme_dsm_range), 0, 0, NULL); 886 if (dsmRanges == NULL) 887 return B_NO_MEMORY; 888 CObjectDeleter<void, void, nvme_free> dsmRangesDeleter(dsmRanges); 889 890 uint64 trimmingSize = 0; 891 for (uint32 i = 0; i < trimData->range_count; i++) { 892 uint64 offset = trimData->ranges[i].offset; 893 uint64 length = trimData->ranges[i].size; 894 895 // Round up offset and length to the block size. 896 // (Some space at the beginning and end may thus not be trimmed.) 897 offset = ROUNDUP(offset, info->block_size); 898 length -= offset - trimData->ranges[i].offset; 899 length = ROUNDDOWN(length, info->block_size); 900 901 if (length == 0) 902 continue; 903 if ((length / info->block_size) > UINT32_MAX) 904 length = uint64(UINT32_MAX) * info->block_size; 905 // TODO: Break into smaller trim ranges! 906 907 TRACE("trim %" B_PRIu64 " bytes from %" B_PRIu64 "\n", length, offset); 908 909 dsmRanges[i].attributes = 0; 910 dsmRanges[i].length = length / info->block_size; 911 dsmRanges[i].starting_lba = offset / info->block_size; 912 913 trimmingSize += length; 914 } 915 916 status_t status = EINPROGRESS; 917 qpair_info* qpair = get_qpair(info); 918 if (nvme_ns_deallocate(info->ns, qpair->qpair, dsmRanges, trimData->range_count, 919 (nvme_cmd_cb)io_finished_callback, &status) != 0) 920 return B_IO_ERROR; 921 922 await_status(info, qpair->qpair, status); 923 if (status != B_OK) 924 return status; 925 926 trimData->trimmed_size = trimmingSize; 927 return B_OK; 928 } 929 930 931 static status_t 932 nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length) 933 { 934 CALLED(); 935 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 936 nvme_disk_driver_info* info = handle->info; 937 938 TRACE("ioctl(op = %" B_PRId32 ")\n", op); 939 940 switch (op) { 941 case B_GET_MEDIA_STATUS: 942 { 943 return user_memcpy(buffer, &info->media_status, sizeof(status_t)); 944 } 945 946 case B_GET_DEVICE_SIZE: 947 { 948 size_t size = info->capacity * info->block_size; 949 return user_memcpy(buffer, &size, sizeof(size_t)); 950 } 951 952 case B_GET_GEOMETRY: 953 { 954 if (buffer == NULL || length > sizeof(device_geometry)) 955 return B_BAD_VALUE; 956 957 device_geometry geometry; 958 status_t status = get_geometry(handle, &geometry); 959 if (status != B_OK) 960 return status; 961 962 return user_memcpy(buffer, &geometry, length); 963 } 964 965 case B_GET_ICON_NAME: 966 return user_strlcpy((char*)buffer, "devices/drive-harddisk", 967 B_FILE_NAME_LENGTH); 968 969 case B_GET_VECTOR_ICON: 970 { 971 device_icon iconData; 972 if (length != sizeof(device_icon)) 973 return B_BAD_VALUE; 974 if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK) 975 return B_BAD_ADDRESS; 976 977 if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) { 978 if (user_memcpy(iconData.icon_data, kDriveIcon, 979 sizeof(kDriveIcon)) != B_OK) 980 return B_BAD_ADDRESS; 981 } 982 983 iconData.icon_size = sizeof(kDriveIcon); 984 return user_memcpy(buffer, &iconData, sizeof(device_icon)); 985 } 986 987 case B_FLUSH_DRIVE_CACHE: 988 return nvme_disk_flush(info); 989 990 case B_TRIM_DEVICE: 991 ASSERT(IS_KERNEL_ADDRESS(buffer)); 992 return nvme_disk_trim(info, (fs_trim_data*)buffer); 993 } 994 995 return B_DEV_INVALID_IOCTL; 996 } 997 998 999 // #pragma mark - driver module API 1000 1001 1002 static float 1003 nvme_disk_supports_device(device_node *parent) 1004 { 1005 CALLED(); 1006 1007 const char* bus; 1008 uint16 baseClass, subClass; 1009 1010 if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false) != B_OK 1011 || sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass, false) != B_OK 1012 || sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE, &subClass, false) != B_OK) 1013 return -1.0f; 1014 1015 if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage) 1016 return 0.0f; 1017 1018 if (subClass != PCI_nvm) 1019 return 0.0f; 1020 1021 TRACE("NVMe device found!\n"); 1022 return 1.0f; 1023 } 1024 1025 1026 static status_t 1027 nvme_disk_register_device(device_node* parent) 1028 { 1029 CALLED(); 1030 1031 device_attr attrs[] = { 1032 { B_DEVICE_PRETTY_NAME, B_STRING_TYPE, { .string = "NVMe Disk" } }, 1033 { NULL } 1034 }; 1035 1036 return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME, 1037 attrs, NULL, NULL); 1038 } 1039 1040 1041 static status_t 1042 nvme_disk_init_driver(device_node* node, void** cookie) 1043 { 1044 CALLED(); 1045 1046 int ret = nvme_lib_init((enum nvme_log_level)0, (enum nvme_log_facility)0, NULL); 1047 if (ret != 0) { 1048 TRACE_ERROR("libnvme initialization failed!\n"); 1049 return ret; 1050 } 1051 1052 nvme_disk_driver_info* info = new nvme_disk_driver_info; 1053 if (info == NULL) 1054 return B_NO_MEMORY; 1055 1056 info->media_status = B_OK; 1057 info->node = node; 1058 1059 info->ctrlr = NULL; 1060 1061 *cookie = info; 1062 return B_OK; 1063 } 1064 1065 1066 static void 1067 nvme_disk_uninit_driver(void* _cookie) 1068 { 1069 CALLED(); 1070 1071 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 1072 free(info); 1073 } 1074 1075 1076 static status_t 1077 nvme_disk_register_child_devices(void* _cookie) 1078 { 1079 CALLED(); 1080 1081 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 1082 status_t status; 1083 1084 int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR); 1085 if (id < 0) 1086 return id; 1087 1088 char name[64]; 1089 snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw", 1090 id); 1091 1092 status = sDeviceManager->publish_device(info->node, name, 1093 NVME_DISK_DEVICE_MODULE_NAME); 1094 1095 return status; 1096 } 1097 1098 1099 // #pragma mark - 1100 1101 1102 module_dependency module_dependencies[] = { 1103 { B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager }, 1104 { NULL } 1105 }; 1106 1107 struct device_module_info sNvmeDiskDevice = { 1108 { 1109 NVME_DISK_DEVICE_MODULE_NAME, 1110 0, 1111 NULL 1112 }, 1113 1114 nvme_disk_init_device, 1115 nvme_disk_uninit_device, 1116 NULL, // remove, 1117 1118 nvme_disk_open, 1119 nvme_disk_close, 1120 nvme_disk_free, 1121 nvme_disk_read, 1122 nvme_disk_write, 1123 nvme_disk_io, 1124 nvme_disk_ioctl, 1125 1126 NULL, // select 1127 NULL, // deselect 1128 }; 1129 1130 struct driver_module_info sNvmeDiskDriver = { 1131 { 1132 NVME_DISK_DRIVER_MODULE_NAME, 1133 0, 1134 NULL 1135 }, 1136 1137 nvme_disk_supports_device, 1138 nvme_disk_register_device, 1139 nvme_disk_init_driver, 1140 nvme_disk_uninit_driver, 1141 nvme_disk_register_child_devices, 1142 NULL, // rescan 1143 NULL, // removed 1144 }; 1145 1146 module_info* modules[] = { 1147 (module_info*)&sNvmeDiskDriver, 1148 (module_info*)&sNvmeDiskDevice, 1149 NULL 1150 }; 1151