1 /* 2 * Copyright 2019-2022, Haiku, Inc. All rights reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Augustin Cavalier <waddlesplash> 7 */ 8 9 10 #include <stdio.h> 11 #include <stdlib.h> 12 13 #include <algorithm> 14 #include <condition_variable.h> 15 #include <AutoDeleter.h> 16 #include <kernel.h> 17 #include <smp.h> 18 #include <util/AutoLock.h> 19 20 #include <fs/devfs.h> 21 #include <bus/PCI.h> 22 #include <vm/vm.h> 23 24 #include "IORequest.h" 25 26 extern "C" { 27 #include <libnvme/nvme.h> 28 #include <libnvme/nvme_internal.h> 29 } 30 31 32 //#define TRACE_NVME_DISK 33 #ifdef TRACE_NVME_DISK 34 # define TRACE(x...) dprintf("nvme_disk: " x) 35 #else 36 # define TRACE(x...) ; 37 #endif 38 #define TRACE_ALWAYS(x...) dprintf("nvme_disk: " x) 39 #define TRACE_ERROR(x...) dprintf("\33[33mnvme_disk:\33[0m " x) 40 #define CALLED() TRACE("CALLED %s\n", __PRETTY_FUNCTION__) 41 42 43 static const uint8 kDriveIcon[] = { 44 0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16, 45 0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39, 46 0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02, 47 0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01, 48 0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47, 49 0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f, 50 0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0, 51 0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38, 52 0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48, 53 0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2, 54 0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80, 55 0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 56 0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39, 57 0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a, 58 0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27, 59 0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a, 60 0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08, 61 0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17, 62 0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02, 63 0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01, 64 0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99, 65 0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2, 66 0xe6, 0x01, 0x17, 0x82, 0x00, 0x04 67 }; 68 69 70 #define NVME_DISK_DRIVER_MODULE_NAME "drivers/disk/nvme_disk/driver_v1" 71 #define NVME_DISK_DEVICE_MODULE_NAME "drivers/disk/nvme_disk/device_v1" 72 #define NVME_DISK_DEVICE_ID_GENERATOR "nvme_disk/device_id" 73 74 #define NVME_MAX_QPAIRS (16) 75 76 77 static device_manager_info* sDeviceManager; 78 79 typedef struct { 80 device_node* node; 81 pci_info info; 82 83 struct nvme_ctrlr* ctrlr; 84 85 struct nvme_ns* ns; 86 uint64 capacity; 87 uint32 block_size; 88 uint32 max_io_blocks; 89 status_t media_status; 90 91 DMAResource dma_resource; 92 sem_id dma_buffers_sem; 93 94 rw_lock rounded_write_lock; 95 96 ConditionVariable interrupt; 97 int32 polling; 98 99 struct qpair_info { 100 struct nvme_qpair* qpair; 101 } qpairs[NVME_MAX_QPAIRS]; 102 uint32 qpair_count; 103 } nvme_disk_driver_info; 104 typedef nvme_disk_driver_info::qpair_info qpair_info; 105 106 107 typedef struct { 108 nvme_disk_driver_info* info; 109 } nvme_disk_handle; 110 111 112 static status_t 113 get_geometry(nvme_disk_handle* handle, device_geometry* geometry) 114 { 115 nvme_disk_driver_info* info = handle->info; 116 117 devfs_compute_geometry_size(geometry, info->capacity, info->block_size); 118 geometry->bytes_per_physical_sector = info->block_size; 119 120 geometry->device_type = B_DISK; 121 geometry->removable = false; 122 123 geometry->read_only = false; 124 geometry->write_once = false; 125 126 TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %d, %d, %d, %d\n", 127 geometry->bytes_per_sector, geometry->sectors_per_track, 128 geometry->cylinder_count, geometry->head_count, geometry->device_type, 129 geometry->removable, geometry->read_only, geometry->write_once); 130 131 return B_OK; 132 } 133 134 135 static void 136 nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity, 137 uint32 blockSize) 138 { 139 TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %" B_PRIu32 ")\n", 140 info, capacity, blockSize); 141 142 info->capacity = capacity; 143 info->block_size = blockSize; 144 } 145 146 147 // #pragma mark - device module API 148 149 150 static int32 nvme_interrupt_handler(void* _info); 151 152 153 static status_t 154 nvme_disk_init_device(void* _info, void** _cookie) 155 { 156 CALLED(); 157 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 158 ASSERT(info->ctrlr == NULL); 159 160 pci_device_module_info* pci; 161 pci_device* pcidev; 162 device_node* parent = sDeviceManager->get_parent_node(info->node); 163 sDeviceManager->get_driver(parent, (driver_module_info**)&pci, 164 (void**)&pcidev); 165 pci->get_pci_info(pcidev, &info->info); 166 sDeviceManager->put_node(parent); 167 168 // construct the libnvme pci_device struct 169 pci_device* device = new pci_device; 170 device->vendor_id = info->info.vendor_id; 171 device->device_id = info->info.device_id; 172 device->subvendor_id = 0; 173 device->subdevice_id = 0; 174 175 device->domain = 0; 176 device->bus = info->info.bus; 177 device->dev = info->info.device; 178 device->func = info->info.function; 179 180 device->pci_info = &info->info; 181 182 // enable busmaster and memory mapped access 183 uint16 command = pci->read_pci_config(pcidev, PCI_command, 2); 184 command |= PCI_command_master | PCI_command_memory; 185 pci->write_pci_config(pcidev, PCI_command, 2, command); 186 187 // open the controller 188 info->ctrlr = nvme_ctrlr_open(device, NULL); 189 if (info->ctrlr == NULL) { 190 TRACE_ERROR("failed to open the controller!\n"); 191 return B_ERROR; 192 } 193 194 struct nvme_ctrlr_stat cstat; 195 int err = nvme_ctrlr_stat(info->ctrlr, &cstat); 196 if (err != 0) { 197 TRACE_ERROR("failed to get controller information!\n"); 198 nvme_ctrlr_close(info->ctrlr); 199 return err; 200 } 201 202 TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn); 203 TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size); 204 TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs); 205 206 // TODO: export more than just the first namespace! 207 info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]); 208 if (info->ns == NULL) { 209 TRACE_ERROR("failed to open namespace!\n"); 210 nvme_ctrlr_close(info->ctrlr); 211 return B_ERROR; 212 } 213 TRACE_ALWAYS("namespace 0\n"); 214 215 struct nvme_ns_stat nsstat; 216 err = nvme_ns_stat(info->ns, &nsstat); 217 if (err != 0) { 218 TRACE_ERROR("failed to get namespace information!\n"); 219 nvme_ctrlr_close(info->ctrlr); 220 return err; 221 } 222 223 // store capacity information 224 TRACE_ALWAYS("\tblock size: %" B_PRIuSIZE ", stripe size: %u\n", 225 nsstat.sector_size, info->ns->stripe_size); 226 nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size); 227 228 command = pci->read_pci_config(pcidev, PCI_command, 2); 229 command &= ~(PCI_command_int_disable); 230 pci->write_pci_config(pcidev, PCI_command, 2, command); 231 232 uint8 irq = info->info.u.h0.interrupt_line; 233 if (pci->get_msix_count(pcidev)) { 234 uint8 msixVector = 0; 235 if (pci->configure_msix(pcidev, 1, &msixVector) == B_OK 236 && pci->enable_msix(pcidev) == B_OK) { 237 TRACE_ALWAYS("using MSI-X\n"); 238 irq = msixVector; 239 } 240 } else if (pci->get_msi_count(pcidev) >= 1) { 241 uint8 msiVector = 0; 242 if (pci->configure_msi(pcidev, 1, &msiVector) == B_OK 243 && pci->enable_msi(pcidev) == B_OK) { 244 TRACE_ALWAYS("using message signaled interrupts\n"); 245 irq = msiVector; 246 } 247 } 248 249 if (irq == 0 || irq == 0xFF) { 250 TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n", 251 info->info.bus, info->info.device, info->info.function); 252 info->polling = 1; 253 } else { 254 info->polling = 0; 255 } 256 info->interrupt.Init(NULL, NULL); 257 install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info, B_NO_HANDLED_INFO); 258 259 if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) { 260 uint32 microseconds = 16, threshold = 32; 261 nvme_admin_set_feature(info->ctrlr, false, NVME_FEAT_INTERRUPT_COALESCING, 262 ((microseconds / 100) << 8) | threshold, 0, NULL); 263 } 264 265 // allocate qpairs 266 uint32 try_qpairs = cstat.io_qpairs; 267 try_qpairs = min_c(try_qpairs, NVME_MAX_QPAIRS); 268 if (try_qpairs >= (uint32)smp_get_num_cpus()) { 269 try_qpairs = smp_get_num_cpus(); 270 } else { 271 // Find the highest number of qpairs that evenly divides the number of CPUs. 272 while ((smp_get_num_cpus() % try_qpairs) != 0) 273 try_qpairs--; 274 } 275 info->qpair_count = 0; 276 for (uint32 i = 0; i < try_qpairs; i++) { 277 info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr, 278 (enum nvme_qprio)0, 0); 279 if (info->qpairs[i].qpair == NULL) 280 break; 281 282 info->qpair_count++; 283 } 284 if (info->qpair_count == 0) { 285 TRACE_ERROR("failed to allocate qpairs!\n"); 286 nvme_ctrlr_close(info->ctrlr); 287 return B_NO_MEMORY; 288 } 289 if (info->qpair_count != try_qpairs) { 290 TRACE_ALWAYS("warning: did not get expected number of qpairs\n"); 291 } 292 293 // allocate DMA buffers 294 int buffers = info->qpair_count * 2; 295 296 dma_restrictions restrictions = {}; 297 restrictions.alignment = B_PAGE_SIZE; 298 // Technically, the first and last segments in a transfer can be aligned 299 // only on 32-bits, and the rest only need to have sizes that are a multiple 300 // of the block size. 301 restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2); 302 restrictions.max_transfer_size = cstat.max_xfer_size; 303 info->max_io_blocks = cstat.max_xfer_size / nsstat.sector_size; 304 305 err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers); 306 if (err != 0) { 307 TRACE_ERROR("failed to initialize DMA resource!\n"); 308 nvme_ctrlr_close(info->ctrlr); 309 return err; 310 } 311 312 info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem"); 313 if (info->dma_buffers_sem < 0) { 314 TRACE_ERROR("failed to create DMA buffers semaphore!\n"); 315 nvme_ctrlr_close(info->ctrlr); 316 return info->dma_buffers_sem; 317 } 318 319 // set up rounded-write lock 320 rw_lock_init(&info->rounded_write_lock, "nvme rounded writes"); 321 322 *_cookie = info; 323 return B_OK; 324 } 325 326 327 static void 328 nvme_disk_uninit_device(void* _cookie) 329 { 330 CALLED(); 331 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 332 333 remove_io_interrupt_handler(info->info.u.h0.interrupt_line, 334 nvme_interrupt_handler, (void*)info); 335 336 rw_lock_destroy(&info->rounded_write_lock); 337 338 nvme_ns_close(info->ns); 339 nvme_ctrlr_close(info->ctrlr); 340 341 // TODO: Deallocate MSI(-X). 342 // TODO: Deallocate PCI. 343 } 344 345 346 static status_t 347 nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie) 348 { 349 CALLED(); 350 351 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 352 nvme_disk_handle* handle = (nvme_disk_handle*)malloc( 353 sizeof(nvme_disk_handle)); 354 if (handle == NULL) 355 return B_NO_MEMORY; 356 357 handle->info = info; 358 359 *_cookie = handle; 360 return B_OK; 361 } 362 363 364 static status_t 365 nvme_disk_close(void* cookie) 366 { 367 CALLED(); 368 369 //nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 370 return B_OK; 371 } 372 373 374 static status_t 375 nvme_disk_free(void* cookie) 376 { 377 CALLED(); 378 379 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 380 free(handle); 381 return B_OK; 382 } 383 384 385 // #pragma mark - I/O 386 387 388 static int32 389 nvme_interrupt_handler(void* _info) 390 { 391 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 392 info->interrupt.NotifyAll(); 393 info->polling = -1; 394 return 0; 395 } 396 397 398 static qpair_info* 399 get_qpair(nvme_disk_driver_info* info) 400 { 401 return &info->qpairs[smp_get_current_cpu() % info->qpair_count]; 402 } 403 404 405 static void 406 io_finished_callback(status_t* status, const struct nvme_cpl* cpl) 407 { 408 *status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK; 409 } 410 411 412 static void 413 await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status) 414 { 415 CALLED(); 416 417 ConditionVariableEntry entry; 418 int timeouts = 0; 419 while (status == EINPROGRESS) { 420 info->interrupt.Add(&entry); 421 422 nvme_qpair_poll(qpair, 0); 423 424 if (status != EINPROGRESS) 425 return; 426 427 if (info->polling > 0) { 428 entry.Wait(B_RELATIVE_TIMEOUT, min_c(5 * 1000 * 1000, 429 (1 << timeouts) * 1000)); 430 timeouts++; 431 } else if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) { 432 // This should never happen, as we are woken up on every interrupt 433 // no matter the qpair or transfer within; so if it does occur, 434 // that probably means the controller stalled, or maybe cannot 435 // generate interrupts at all. 436 437 TRACE_ERROR("timed out waiting for interrupt!\n"); 438 if (timeouts++ >= 3) { 439 nvme_qpair_fail(qpair); 440 status = B_TIMED_OUT; 441 return; 442 } 443 444 info->polling++; 445 if (info->polling > 0) { 446 TRACE_ALWAYS("switching to polling mode, performance will be affected!\n"); 447 } 448 } 449 450 nvme_qpair_poll(qpair, 0); 451 } 452 } 453 454 455 struct nvme_io_request { 456 status_t status; 457 458 bool write; 459 460 off_t lba_start; 461 size_t lba_count; 462 463 physical_entry* iovecs; 464 int32 iovec_count; 465 466 int32 iovec_i; 467 uint32 iovec_offset; 468 }; 469 470 471 static void 472 ior_reset_sgl(nvme_io_request* request, uint32_t offset) 473 { 474 TRACE("IOR Reset: %" B_PRIu32 "\n", offset); 475 476 int32 i = 0; 477 while (offset > 0 && request->iovecs[i].size <= offset) { 478 offset -= request->iovecs[i].size; 479 i++; 480 } 481 request->iovec_i = i; 482 request->iovec_offset = offset; 483 } 484 485 486 static int 487 ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length) 488 { 489 int32 index = request->iovec_i; 490 if (index < 0 || index > request->iovec_count) 491 return -1; 492 493 *address = request->iovecs[index].address + request->iovec_offset; 494 *length = request->iovecs[index].size - request->iovec_offset; 495 496 TRACE("IOV %d (+ " B_PRIu32 "): 0x%" B_PRIx64 ", %" B_PRIu32 "\n", 497 request->iovec_i, request->iovec_offset, *address, *length); 498 499 request->iovec_i++; 500 request->iovec_offset = 0; 501 return 0; 502 } 503 504 505 static status_t 506 do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request) 507 { 508 request->status = EINPROGRESS; 509 510 qpair_info* qpinfo = get_qpair(info); 511 int ret = -1; 512 if (request->write) { 513 ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start, 514 request->lba_count, (nvme_cmd_cb)io_finished_callback, request, 515 0, (nvme_req_reset_sgl_cb)ior_reset_sgl, 516 (nvme_req_next_sge_cb)ior_next_sge); 517 } else { 518 ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start, 519 request->lba_count, (nvme_cmd_cb)io_finished_callback, request, 520 0, (nvme_req_reset_sgl_cb)ior_reset_sgl, 521 (nvme_req_next_sge_cb)ior_next_sge); 522 } 523 if (ret != 0) { 524 TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE 525 " blocks failed!\n", request->write ? "write" : "read", 526 request->lba_start, request->lba_count); 527 528 request->lba_count = 0; 529 return ret; 530 } 531 532 await_status(info, qpinfo->qpair, request->status); 533 534 if (request->status != B_OK) { 535 TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE 536 " blocks failed!\n", request->write ? "write" : "read", 537 request->lba_start, request->lba_count); 538 539 request->lba_count = 0; 540 } 541 return request->status; 542 } 543 544 545 static status_t 546 nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request) 547 { 548 CALLED(); 549 550 WriteLocker writeLocker; 551 if (request->IsWrite()) 552 writeLocker.SetTo(handle->info->rounded_write_lock, false); 553 554 status_t status = acquire_sem(handle->info->dma_buffers_sem); 555 if (status != B_OK) { 556 request->SetStatusAndNotify(status); 557 return status; 558 } 559 560 const size_t block_size = handle->info->block_size; 561 562 TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR 563 "; Write %s\n", request, request->Offset(), request->Length(), 564 request->IsWrite() ? "yes" : "no"); 565 566 nvme_io_request nvme_request; 567 while (request->RemainingBytes() > 0) { 568 IOOperation operation; 569 status = handle->info->dma_resource.TranslateNext(request, &operation, 0); 570 if (status != B_OK) 571 break; 572 573 do { 574 TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR 575 ", write: %s\n", request, operation.Offset(), 576 operation.Length(), operation.IsWrite() ? "yes" : "no"); 577 578 nvme_request.write = operation.IsWrite(); 579 nvme_request.lba_start = operation.Offset() / block_size; 580 nvme_request.lba_count = operation.Length() / block_size; 581 nvme_request.iovecs = (physical_entry*)operation.Vecs(); 582 nvme_request.iovec_count = operation.VecCount(); 583 584 status = do_nvme_io_request(handle->info, &nvme_request); 585 586 operation.SetStatus(status, 587 status == B_OK ? operation.Length() : 0); 588 } while (status == B_OK && !operation.Finish()); 589 590 if (status == B_OK && operation.Status() != B_OK) { 591 TRACE_ERROR("I/O succeeded but IOOperation failed!\n"); 592 status = operation.Status(); 593 } 594 595 request->OperationFinished(&operation); 596 597 handle->info->dma_resource.RecycleBuffer(operation.Buffer()); 598 599 TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request, 600 strerror(status), request->RemainingBytes()); 601 if (status != B_OK) 602 break; 603 } 604 605 release_sem(handle->info->dma_buffers_sem); 606 607 // Notify() also takes care of UnlockMemory(). 608 if (status != B_OK && request->Status() == B_OK) 609 request->SetStatusAndNotify(status); 610 else 611 request->NotifyFinished(); 612 return status; 613 } 614 615 616 static status_t 617 nvme_disk_io(void* cookie, io_request* request) 618 { 619 CALLED(); 620 621 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 622 623 const off_t ns_end = (handle->info->capacity * handle->info->block_size); 624 if ((request->Offset() + (off_t)request->Length()) > ns_end) 625 return ERANGE; 626 627 nvme_io_request nvme_request; 628 memset(&nvme_request, 0, sizeof(nvme_io_request)); 629 630 nvme_request.write = request->IsWrite(); 631 632 physical_entry* vtophys = NULL; 633 MemoryDeleter vtophysDeleter; 634 635 IOBuffer* buffer = request->Buffer(); 636 status_t status = B_OK; 637 if (!buffer->IsPhysical()) { 638 status = buffer->LockMemory(request->TeamID(), request->IsWrite()); 639 if (status != B_OK) { 640 TRACE_ERROR("failed to lock memory: %s\n", strerror(status)); 641 return status; 642 } 643 // SetStatusAndNotify() takes care of unlocking memory if necessary. 644 645 // This is slightly inefficient, as we could use a BStackOrHeapArray in 646 // the optimal case (few physical entries required), but we would not 647 // know whether or not that was possible until calling get_memory_map() 648 // and then potentially reallocating, which would complicate the logic. 649 650 int32 vtophys_length = (request->Length() / B_PAGE_SIZE) + 2; 651 nvme_request.iovecs = vtophys = (physical_entry*)malloc(sizeof(physical_entry) 652 * vtophys_length); 653 if (vtophys == NULL) { 654 TRACE_ERROR("failed to allocate memory for iovecs\n"); 655 request->SetStatusAndNotify(B_NO_MEMORY); 656 return B_NO_MEMORY; 657 } 658 vtophysDeleter.SetTo(vtophys); 659 660 for (size_t i = 0; i < buffer->VecCount(); i++) { 661 generic_io_vec virt = buffer->VecAt(i); 662 uint32 entries = vtophys_length - nvme_request.iovec_count; 663 664 // Avoid copies by going straight into the vtophys array. 665 status = get_memory_map_etc(request->TeamID(), (void*)virt.base, 666 virt.length, vtophys + nvme_request.iovec_count, &entries); 667 if (status == B_BUFFER_OVERFLOW) { 668 TRACE("vtophys array was too small, reallocating\n"); 669 670 vtophysDeleter.Detach(); 671 vtophys_length *= 2; 672 nvme_request.iovecs = vtophys = (physical_entry*)realloc(vtophys, 673 sizeof(physical_entry) * vtophys_length); 674 vtophysDeleter.SetTo(vtophys); 675 if (vtophys == NULL) { 676 status = B_NO_MEMORY; 677 } else { 678 // Try again, with the larger buffer this time. 679 i--; 680 continue; 681 } 682 } 683 if (status != B_OK) { 684 TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status)); 685 request->SetStatusAndNotify(status); 686 return status; 687 } 688 689 nvme_request.iovec_count += entries; 690 } 691 } else { 692 nvme_request.iovecs = (physical_entry*)buffer->Vecs(); 693 nvme_request.iovec_count = buffer->VecCount(); 694 } 695 696 // See if we need to bounce anything other than the first or last vec. 697 const size_t block_size = handle->info->block_size; 698 bool bounceAll = false; 699 for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) { 700 if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0) 701 bounceAll = true; 702 if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0) 703 bounceAll = true; 704 } 705 706 // See if we need to bounce due to the first or last vecs. 707 if (nvme_request.iovec_count > 1) { 708 // There are middle vecs, so the first and last vecs have different restrictions: they 709 // need only be a multiple of the block size, and must end and start on a page boundary, 710 // respectively, though the start address must always be 32-bit-aligned. 711 physical_entry* entry = &nvme_request.iovecs[0]; 712 if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0 713 || (entry->address & 0x3) != 0 || (entry->size % block_size) != 0)) 714 bounceAll = true; 715 716 entry = &nvme_request.iovecs[nvme_request.iovec_count - 1]; 717 if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0 718 || (entry->size % block_size) != 0)) 719 bounceAll = true; 720 } else { 721 // There is only one vec. Check that it is a multiple of the block size, 722 // and that its address is 32-bit-aligned. 723 physical_entry* entry = &nvme_request.iovecs[0]; 724 if (!bounceAll && ((entry->address & 0x3) != 0 || (entry->size % block_size) != 0)) 725 bounceAll = true; 726 } 727 728 // See if we need to bounce due to rounding. 729 const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size); 730 phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset() 731 - rounded_pos), block_size); 732 if (rounded_pos != request->Offset() || rounded_len != request->Length()) 733 bounceAll = true; 734 735 if (bounceAll) { 736 // Let the bounced I/O routine take care of everything from here. 737 return nvme_disk_bounced_io(handle, request); 738 } 739 740 nvme_request.lba_start = rounded_pos / block_size; 741 nvme_request.lba_count = rounded_len / block_size; 742 743 // No bouncing was required. 744 ReadLocker readLocker; 745 if (nvme_request.write) 746 readLocker.SetTo(handle->info->rounded_write_lock, false); 747 748 // Error check before actually doing I/O. 749 if (status != B_OK) { 750 TRACE_ERROR("I/O failed early: %s\n", strerror(status)); 751 request->SetStatusAndNotify(status); 752 return status; 753 } 754 755 const uint32 max_io_blocks = handle->info->max_io_blocks; 756 int32 remaining = nvme_request.iovec_count; 757 while (remaining > 0) { 758 nvme_request.iovec_count = min_c(remaining, 759 NVME_MAX_SGL_DESCRIPTORS / 2); 760 761 nvme_request.lba_count = 0; 762 for (int i = 0; i < nvme_request.iovec_count; i++) { 763 uint32 new_lba_count = nvme_request.lba_count 764 + (nvme_request.iovecs[i].size / block_size); 765 if (nvme_request.lba_count > 0 && new_lba_count > max_io_blocks) { 766 // We already have a nonzero length, and adding this vec would 767 // make us go over (or we already are over.) Stop adding. 768 nvme_request.iovec_count = i; 769 break; 770 } 771 772 nvme_request.lba_count = new_lba_count; 773 } 774 775 status = do_nvme_io_request(handle->info, &nvme_request); 776 if (status != B_OK) 777 break; 778 779 nvme_request.iovecs += nvme_request.iovec_count; 780 remaining -= nvme_request.iovec_count; 781 nvme_request.lba_start += nvme_request.lba_count; 782 } 783 784 if (status != B_OK) 785 TRACE_ERROR("I/O failed: %s\n", strerror(status)); 786 787 request->SetTransferredBytes(status != B_OK, 788 (nvme_request.lba_start * block_size) - rounded_pos); 789 request->SetStatusAndNotify(status); 790 return status; 791 } 792 793 794 static status_t 795 nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length) 796 { 797 CALLED(); 798 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 799 800 const off_t ns_end = (handle->info->capacity * handle->info->block_size); 801 if (pos >= ns_end) 802 return B_BAD_VALUE; 803 if ((pos + (off_t)*length) > ns_end) 804 *length = ns_end - pos; 805 806 IORequest request; 807 status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0); 808 if (status != B_OK) 809 return status; 810 811 status = nvme_disk_io(handle, &request); 812 *length = request.TransferredBytes(); 813 return status; 814 } 815 816 817 static status_t 818 nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length) 819 { 820 CALLED(); 821 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 822 823 const off_t ns_end = (handle->info->capacity * handle->info->block_size); 824 if (pos >= ns_end) 825 return B_BAD_VALUE; 826 if ((pos + (off_t)*length) > ns_end) 827 *length = ns_end - pos; 828 829 IORequest request; 830 status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0); 831 if (status != B_OK) 832 return status; 833 834 status = nvme_disk_io(handle, &request); 835 *length = request.TransferredBytes(); 836 return status; 837 } 838 839 840 static status_t 841 nvme_disk_flush(nvme_disk_driver_info* info) 842 { 843 CALLED(); 844 status_t status = EINPROGRESS; 845 846 qpair_info* qpinfo = get_qpair(info); 847 int ret = nvme_ns_flush(info->ns, qpinfo->qpair, 848 (nvme_cmd_cb)io_finished_callback, &status); 849 if (ret != 0) 850 return ret; 851 852 await_status(info, qpinfo->qpair, status); 853 return status; 854 } 855 856 857 static status_t 858 nvme_disk_trim(nvme_disk_driver_info* info, fs_trim_data* trimData) 859 { 860 CALLED(); 861 trimData->trimmed_size = 0; 862 863 const off_t deviceSize = info->capacity * info->block_size; // in bytes 864 if (deviceSize < 0) 865 return B_BAD_VALUE; 866 867 STATIC_ASSERT(sizeof(deviceSize) <= sizeof(uint64)); 868 ASSERT(deviceSize >= 0); 869 870 // Do not trim past device end. 871 for (uint32 i = 0; i < trimData->range_count; i++) { 872 uint64 offset = trimData->ranges[i].offset; 873 uint64& size = trimData->ranges[i].size; 874 875 if (offset >= (uint64)deviceSize) 876 return B_BAD_VALUE; 877 size = std::min(size, (uint64)deviceSize - offset); 878 } 879 880 // We need contiguous memory for the DSM ranges. 881 nvme_dsm_range* dsmRanges = (nvme_dsm_range*)nvme_mem_alloc_node( 882 trimData->range_count * sizeof(nvme_dsm_range), 0, 0, NULL); 883 if (dsmRanges == NULL) 884 return B_NO_MEMORY; 885 CObjectDeleter<void, void, nvme_free> dsmRangesDeleter(dsmRanges); 886 887 uint64 trimmingSize = 0; 888 for (uint32 i = 0; i < trimData->range_count; i++) { 889 uint64 offset = trimData->ranges[i].offset; 890 uint64 length = trimData->ranges[i].size; 891 892 // Round up offset and length to the block size. 893 // (Some space at the beginning and end may thus not be trimmed.) 894 offset = ROUNDUP(offset, info->block_size); 895 length -= offset - trimData->ranges[i].offset; 896 length = ROUNDDOWN(length, info->block_size); 897 898 if (length == 0) 899 continue; 900 if ((length / info->block_size) > UINT32_MAX) 901 length = uint64(UINT32_MAX) * info->block_size; 902 // TODO: Break into smaller trim ranges! 903 904 TRACE("trim %" B_PRIu64 " bytes from %" B_PRIu64 "\n", length, offset); 905 906 dsmRanges[i].attributes = 0; 907 dsmRanges[i].length = length / info->block_size; 908 dsmRanges[i].starting_lba = offset / info->block_size; 909 910 trimmingSize += length; 911 } 912 913 status_t status = EINPROGRESS; 914 qpair_info* qpair = get_qpair(info); 915 if (nvme_ns_deallocate(info->ns, qpair->qpair, dsmRanges, trimData->range_count, 916 (nvme_cmd_cb)io_finished_callback, &status) != 0) 917 return B_IO_ERROR; 918 919 await_status(info, qpair->qpair, status); 920 if (status != B_OK) 921 return status; 922 923 trimData->trimmed_size = trimmingSize; 924 return B_OK; 925 } 926 927 928 static status_t 929 nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length) 930 { 931 CALLED(); 932 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 933 nvme_disk_driver_info* info = handle->info; 934 935 TRACE("ioctl(op = %" B_PRId32 ")\n", op); 936 937 switch (op) { 938 case B_GET_MEDIA_STATUS: 939 { 940 return user_memcpy(buffer, &info->media_status, sizeof(status_t)); 941 } 942 943 case B_GET_DEVICE_SIZE: 944 { 945 size_t size = info->capacity * info->block_size; 946 return user_memcpy(buffer, &size, sizeof(size_t)); 947 } 948 949 case B_GET_GEOMETRY: 950 { 951 if (buffer == NULL || length > sizeof(device_geometry)) 952 return B_BAD_VALUE; 953 954 device_geometry geometry; 955 status_t status = get_geometry(handle, &geometry); 956 if (status != B_OK) 957 return status; 958 959 return user_memcpy(buffer, &geometry, length); 960 } 961 962 case B_GET_ICON_NAME: 963 return user_strlcpy((char*)buffer, "devices/drive-harddisk", 964 B_FILE_NAME_LENGTH); 965 966 case B_GET_VECTOR_ICON: 967 { 968 device_icon iconData; 969 if (length != sizeof(device_icon)) 970 return B_BAD_VALUE; 971 if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK) 972 return B_BAD_ADDRESS; 973 974 if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) { 975 if (user_memcpy(iconData.icon_data, kDriveIcon, 976 sizeof(kDriveIcon)) != B_OK) 977 return B_BAD_ADDRESS; 978 } 979 980 iconData.icon_size = sizeof(kDriveIcon); 981 return user_memcpy(buffer, &iconData, sizeof(device_icon)); 982 } 983 984 case B_FLUSH_DRIVE_CACHE: 985 return nvme_disk_flush(info); 986 987 case B_TRIM_DEVICE: 988 ASSERT(IS_KERNEL_ADDRESS(buffer)); 989 return nvme_disk_trim(info, (fs_trim_data*)buffer); 990 } 991 992 return B_DEV_INVALID_IOCTL; 993 } 994 995 996 // #pragma mark - driver module API 997 998 999 static float 1000 nvme_disk_supports_device(device_node *parent) 1001 { 1002 CALLED(); 1003 1004 const char* bus; 1005 uint16 baseClass, subClass; 1006 1007 if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false) != B_OK 1008 || sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass, false) != B_OK 1009 || sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE, &subClass, false) != B_OK) 1010 return -1.0f; 1011 1012 if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage) 1013 return 0.0f; 1014 1015 if (subClass != PCI_nvm) 1016 return 0.0f; 1017 1018 TRACE("NVMe device found!\n"); 1019 return 1.0f; 1020 } 1021 1022 1023 static status_t 1024 nvme_disk_register_device(device_node* parent) 1025 { 1026 CALLED(); 1027 1028 device_attr attrs[] = { 1029 { B_DEVICE_PRETTY_NAME, B_STRING_TYPE, { .string = "NVMe Disk" } }, 1030 { NULL } 1031 }; 1032 1033 return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME, 1034 attrs, NULL, NULL); 1035 } 1036 1037 1038 static status_t 1039 nvme_disk_init_driver(device_node* node, void** cookie) 1040 { 1041 CALLED(); 1042 1043 int ret = nvme_lib_init((enum nvme_log_level)0, (enum nvme_log_facility)0, NULL); 1044 if (ret != 0) { 1045 TRACE_ERROR("libnvme initialization failed!\n"); 1046 return ret; 1047 } 1048 1049 nvme_disk_driver_info* info = new nvme_disk_driver_info; 1050 if (info == NULL) 1051 return B_NO_MEMORY; 1052 1053 info->media_status = B_OK; 1054 info->node = node; 1055 1056 info->ctrlr = NULL; 1057 1058 *cookie = info; 1059 return B_OK; 1060 } 1061 1062 1063 static void 1064 nvme_disk_uninit_driver(void* _cookie) 1065 { 1066 CALLED(); 1067 1068 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 1069 free(info); 1070 } 1071 1072 1073 static status_t 1074 nvme_disk_register_child_devices(void* _cookie) 1075 { 1076 CALLED(); 1077 1078 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 1079 status_t status; 1080 1081 int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR); 1082 if (id < 0) 1083 return id; 1084 1085 char name[64]; 1086 snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw", 1087 id); 1088 1089 status = sDeviceManager->publish_device(info->node, name, 1090 NVME_DISK_DEVICE_MODULE_NAME); 1091 1092 return status; 1093 } 1094 1095 1096 // #pragma mark - 1097 1098 1099 module_dependency module_dependencies[] = { 1100 { B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager }, 1101 { NULL } 1102 }; 1103 1104 struct device_module_info sNvmeDiskDevice = { 1105 { 1106 NVME_DISK_DEVICE_MODULE_NAME, 1107 0, 1108 NULL 1109 }, 1110 1111 nvme_disk_init_device, 1112 nvme_disk_uninit_device, 1113 NULL, // remove, 1114 1115 nvme_disk_open, 1116 nvme_disk_close, 1117 nvme_disk_free, 1118 nvme_disk_read, 1119 nvme_disk_write, 1120 nvme_disk_io, 1121 nvme_disk_ioctl, 1122 1123 NULL, // select 1124 NULL, // deselect 1125 }; 1126 1127 struct driver_module_info sNvmeDiskDriver = { 1128 { 1129 NVME_DISK_DRIVER_MODULE_NAME, 1130 0, 1131 NULL 1132 }, 1133 1134 nvme_disk_supports_device, 1135 nvme_disk_register_device, 1136 nvme_disk_init_driver, 1137 nvme_disk_uninit_driver, 1138 nvme_disk_register_child_devices, 1139 NULL, // rescan 1140 NULL, // removed 1141 }; 1142 1143 module_info* modules[] = { 1144 (module_info*)&sNvmeDiskDriver, 1145 (module_info*)&sNvmeDiskDevice, 1146 NULL 1147 }; 1148