1 /* 2 * Copyright 2019-2020, Haiku, Inc. All rights reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Augustin Cavalier <waddlesplash> 7 */ 8 9 10 #include <stdio.h> 11 #include <stdlib.h> 12 13 #include <algorithm> 14 #include <condition_variable.h> 15 #include <AutoDeleter.h> 16 #include <kernel.h> 17 #include <smp.h> 18 #include <util/AutoLock.h> 19 20 #include <fs/devfs.h> 21 #include <bus/PCI.h> 22 #include <PCI_x86.h> 23 #include <vm/vm.h> 24 25 #include "IORequest.h" 26 27 extern "C" { 28 #include <libnvme/nvme.h> 29 #include <libnvme/nvme_internal.h> 30 } 31 32 33 //#define TRACE_NVME_DISK 34 #ifdef TRACE_NVME_DISK 35 # define TRACE(x...) dprintf("nvme_disk: " x) 36 #else 37 # define TRACE(x...) ; 38 #endif 39 #define TRACE_ALWAYS(x...) dprintf("nvme_disk: " x) 40 #define TRACE_ERROR(x...) dprintf("\33[33mnvme_disk:\33[0m " x) 41 #define CALLED() TRACE("CALLED %s\n", __PRETTY_FUNCTION__) 42 43 44 static const uint8 kDriveIcon[] = { 45 0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16, 46 0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39, 47 0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02, 48 0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01, 49 0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47, 50 0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f, 51 0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0, 52 0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38, 53 0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48, 54 0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2, 55 0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80, 56 0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 57 0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39, 58 0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a, 59 0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27, 60 0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a, 61 0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08, 62 0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17, 63 0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02, 64 0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01, 65 0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99, 66 0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2, 67 0xe6, 0x01, 0x17, 0x82, 0x00, 0x04 68 }; 69 70 71 #define NVME_DISK_DRIVER_MODULE_NAME "drivers/disk/nvme_disk/driver_v1" 72 #define NVME_DISK_DEVICE_MODULE_NAME "drivers/disk/nvme_disk/device_v1" 73 #define NVME_DISK_DEVICE_ID_GENERATOR "nvme_disk/device_id" 74 75 #define NVME_MAX_QPAIRS (16) 76 77 78 static device_manager_info* sDeviceManager; 79 static pci_x86_module_info* sPCIx86Module; 80 81 typedef struct { 82 device_node* node; 83 pci_info info; 84 85 struct nvme_ctrlr* ctrlr; 86 87 struct nvme_ns* ns; 88 uint64 capacity; 89 uint32 block_size; 90 uint32 max_io_blocks; 91 status_t media_status; 92 93 DMAResource dma_resource; 94 sem_id dma_buffers_sem; 95 96 rw_lock rounded_write_lock; 97 98 ConditionVariable interrupt; 99 100 struct qpair_info { 101 struct nvme_qpair* qpair; 102 } qpairs[NVME_MAX_QPAIRS]; 103 uint32 qpair_count; 104 } nvme_disk_driver_info; 105 typedef nvme_disk_driver_info::qpair_info qpair_info; 106 107 108 typedef struct { 109 nvme_disk_driver_info* info; 110 } nvme_disk_handle; 111 112 113 static status_t 114 get_geometry(nvme_disk_handle* handle, device_geometry* geometry) 115 { 116 nvme_disk_driver_info* info = handle->info; 117 118 devfs_compute_geometry_size(geometry, info->capacity, info->block_size); 119 120 geometry->device_type = B_DISK; 121 geometry->removable = false; 122 123 geometry->read_only = false; 124 geometry->write_once = false; 125 126 TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %d, %d, %d, %d\n", 127 geometry->bytes_per_sector, geometry->sectors_per_track, 128 geometry->cylinder_count, geometry->head_count, geometry->device_type, 129 geometry->removable, geometry->read_only, geometry->write_once); 130 131 return B_OK; 132 } 133 134 135 static void 136 nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity, 137 uint32 blockSize) 138 { 139 TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %" B_PRIu32 ")\n", 140 info, capacity, blockSize); 141 142 info->capacity = capacity; 143 info->block_size = blockSize; 144 } 145 146 147 // #pragma mark - device module API 148 149 150 static int32 nvme_interrupt_handler(void* _info); 151 152 153 static status_t 154 nvme_disk_init_device(void* _info, void** _cookie) 155 { 156 CALLED(); 157 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 158 ASSERT(info->ctrlr == NULL); 159 160 pci_device_module_info* pci; 161 pci_device* pcidev; 162 device_node* parent = sDeviceManager->get_parent_node(info->node); 163 sDeviceManager->get_driver(parent, (driver_module_info**)&pci, 164 (void**)&pcidev); 165 pci->get_pci_info(pcidev, &info->info); 166 sDeviceManager->put_node(parent); 167 168 // construct the libnvme pci_device struct 169 pci_device* device = new pci_device; 170 device->vendor_id = info->info.vendor_id; 171 device->device_id = info->info.device_id; 172 device->subvendor_id = 0; 173 device->subdevice_id = 0; 174 175 device->domain = 0; 176 device->bus = info->info.bus; 177 device->dev = info->info.device; 178 device->func = info->info.function; 179 180 device->pci_info = &info->info; 181 182 // enable busmaster and memory mapped access 183 uint16 command = pci->read_pci_config(pcidev, PCI_command, 2); 184 command |= PCI_command_master | PCI_command_memory; 185 pci->write_pci_config(pcidev, PCI_command, 2, command); 186 187 // open the controller 188 info->ctrlr = nvme_ctrlr_open(device, NULL); 189 if (info->ctrlr == NULL) { 190 TRACE_ERROR("failed to open the controller!\n"); 191 return B_ERROR; 192 } 193 194 struct nvme_ctrlr_stat cstat; 195 int err = nvme_ctrlr_stat(info->ctrlr, &cstat); 196 if (err != 0) { 197 TRACE_ERROR("failed to get controller information!\n"); 198 nvme_ctrlr_close(info->ctrlr); 199 return err; 200 } 201 202 TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn); 203 TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size); 204 TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs); 205 206 // TODO: export more than just the first namespace! 207 info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]); 208 if (info->ns == NULL) { 209 TRACE_ERROR("failed to open namespace!\n"); 210 nvme_ctrlr_close(info->ctrlr); 211 return B_ERROR; 212 } 213 TRACE_ALWAYS("namespace 0\n"); 214 215 struct nvme_ns_stat nsstat; 216 err = nvme_ns_stat(info->ns, &nsstat); 217 if (err != 0) { 218 TRACE_ERROR("failed to get namespace information!\n"); 219 nvme_ctrlr_close(info->ctrlr); 220 return err; 221 } 222 223 // store capacity information 224 TRACE_ALWAYS("\tblock size: %" B_PRIuSIZE ", stripe size: %" B_PRIu32 "\n", 225 nsstat.sector_size, info->ns->stripe_size); 226 nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size); 227 228 // set up interrupts 229 if (get_module(B_PCI_X86_MODULE_NAME, (module_info**)&sPCIx86Module) 230 != B_OK) { 231 sPCIx86Module = NULL; 232 } 233 234 command = pci->read_pci_config(pcidev, PCI_command, 2); 235 command &= ~(PCI_command_int_disable); 236 pci->write_pci_config(pcidev, PCI_command, 2, command); 237 238 uint8 irq = info->info.u.h0.interrupt_line; 239 if (sPCIx86Module != NULL) { 240 if (sPCIx86Module->get_msix_count(info->info.bus, info->info.device, 241 info->info.function)) { 242 uint8 msixVector = 0; 243 if (sPCIx86Module->configure_msix(info->info.bus, info->info.device, 244 info->info.function, 1, &msixVector) == B_OK 245 && sPCIx86Module->enable_msix(info->info.bus, info->info.device, 246 info->info.function) == B_OK) { 247 TRACE_ALWAYS("using MSI-X\n"); 248 irq = msixVector; 249 } 250 } else if (sPCIx86Module->get_msi_count(info->info.bus, 251 info->info.device, info->info.function) >= 1) { 252 uint8 msiVector = 0; 253 if (sPCIx86Module->configure_msi(info->info.bus, info->info.device, 254 info->info.function, 1, &msiVector) == B_OK 255 && sPCIx86Module->enable_msi(info->info.bus, info->info.device, 256 info->info.function) == B_OK) { 257 TRACE_ALWAYS("using message signaled interrupts\n"); 258 irq = msiVector; 259 } 260 } 261 } 262 263 if (irq == 0 || irq == 0xFF) { 264 TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n", 265 info->info.bus, info->info.device, info->info.function); 266 return B_ERROR; 267 } 268 info->interrupt.Init(NULL, NULL); 269 install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info, B_NO_HANDLED_INFO); 270 271 if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) { 272 uint32 microseconds = 16, threshold = 32; 273 nvme_admin_set_feature(info->ctrlr, false, NVME_FEAT_INTERRUPT_COALESCING, 274 ((microseconds / 100) << 8) | threshold, 0, NULL); 275 } 276 277 // allocate qpairs 278 int32 try_qpairs = cstat.io_qpairs; 279 try_qpairs = min_c(try_qpairs, NVME_MAX_QPAIRS); 280 if (try_qpairs >= smp_get_num_cpus()) { 281 try_qpairs = smp_get_num_cpus(); 282 } else { 283 // Find the highest number of qpairs that evenly divides the number of CPUs. 284 while ((smp_get_num_cpus() % try_qpairs) != 0) 285 try_qpairs--; 286 } 287 info->qpair_count = 0; 288 for (uint32 i = 0; i < try_qpairs; i++) { 289 info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr, 290 (enum nvme_qprio)0, 0); 291 if (info->qpairs[i].qpair == NULL) 292 break; 293 294 info->qpair_count++; 295 } 296 if (info->qpair_count == 0) { 297 TRACE_ERROR("failed to allocate qpairs!\n"); 298 nvme_ctrlr_close(info->ctrlr); 299 return B_NO_MEMORY; 300 } 301 if (info->qpair_count != try_qpairs) { 302 TRACE_ALWAYS("warning: did not get expected number of qpairs\n"); 303 } 304 305 // allocate DMA buffers 306 int buffers = info->qpair_count * 2; 307 308 dma_restrictions restrictions = {}; 309 restrictions.alignment = B_PAGE_SIZE; 310 // Technically, the first and last segments in a transfer can be aligned 311 // only on 32-bits, and the rest only need to have sizes that are a multiple 312 // of the block size. 313 restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2); 314 restrictions.max_transfer_size = cstat.max_xfer_size; 315 info->max_io_blocks = cstat.max_xfer_size / nsstat.sector_size; 316 317 err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers); 318 if (err != 0) { 319 TRACE_ERROR("failed to initialize DMA resource!\n"); 320 nvme_ctrlr_close(info->ctrlr); 321 return err; 322 } 323 324 info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem"); 325 if (info->dma_buffers_sem < 0) { 326 TRACE_ERROR("failed to create DMA buffers semaphore!\n"); 327 nvme_ctrlr_close(info->ctrlr); 328 return info->dma_buffers_sem; 329 } 330 331 // set up rounded-write lock 332 rw_lock_init(&info->rounded_write_lock, "nvme rounded writes"); 333 334 *_cookie = info; 335 return B_OK; 336 } 337 338 339 static void 340 nvme_disk_uninit_device(void* _cookie) 341 { 342 CALLED(); 343 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 344 345 remove_io_interrupt_handler(info->info.u.h0.interrupt_line, 346 nvme_interrupt_handler, (void*)info); 347 348 rw_lock_destroy(&info->rounded_write_lock); 349 350 nvme_ns_close(info->ns); 351 nvme_ctrlr_close(info->ctrlr); 352 353 // TODO: Deallocate MSI(-X). 354 // TODO: Deallocate PCI. 355 } 356 357 358 static status_t 359 nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie) 360 { 361 CALLED(); 362 363 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 364 nvme_disk_handle* handle = (nvme_disk_handle*)malloc( 365 sizeof(nvme_disk_handle)); 366 if (handle == NULL) 367 return B_NO_MEMORY; 368 369 handle->info = info; 370 371 *_cookie = handle; 372 return B_OK; 373 } 374 375 376 static status_t 377 nvme_disk_close(void* cookie) 378 { 379 CALLED(); 380 381 //nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 382 return B_OK; 383 } 384 385 386 static status_t 387 nvme_disk_free(void* cookie) 388 { 389 CALLED(); 390 391 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 392 free(handle); 393 return B_OK; 394 } 395 396 397 // #pragma mark - I/O 398 399 400 static int32 401 nvme_interrupt_handler(void* _info) 402 { 403 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 404 info->interrupt.NotifyAll(); 405 return 0; 406 } 407 408 409 static qpair_info* 410 get_qpair(nvme_disk_driver_info* info) 411 { 412 return &info->qpairs[smp_get_current_cpu() % info->qpair_count]; 413 } 414 415 416 static void 417 io_finished_callback(status_t* status, const struct nvme_cpl* cpl) 418 { 419 *status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK; 420 } 421 422 423 static void 424 await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status) 425 { 426 CALLED(); 427 428 ConditionVariableEntry entry; 429 int timeouts = 0; 430 while (status == EINPROGRESS) { 431 info->interrupt.Add(&entry); 432 433 nvme_qpair_poll(qpair, 0); 434 435 if (status != EINPROGRESS) 436 return; 437 438 if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) { 439 // This should never happen, as we are woken up on every interrupt 440 // no matter the qpair or transfer within; so if it does occur, 441 // that probably means the controller stalled or something. 442 443 TRACE_ERROR("timed out waiting for interrupt!\n"); 444 if (timeouts++ >= 3) { 445 nvme_qpair_fail(qpair); 446 status = B_TIMED_OUT; 447 return; 448 } 449 } 450 451 nvme_qpair_poll(qpair, 0); 452 } 453 } 454 455 456 struct nvme_io_request { 457 status_t status; 458 459 bool write; 460 461 off_t lba_start; 462 size_t lba_count; 463 464 physical_entry* iovecs; 465 int32 iovec_count; 466 467 int32 iovec_i; 468 uint32 iovec_offset; 469 }; 470 471 472 void ior_reset_sgl(nvme_io_request* request, uint32_t offset) 473 { 474 TRACE("IOR Reset: %" B_PRIu32 "\n", offset); 475 476 int32 i = 0; 477 while (offset > 0 && request->iovecs[i].size <= offset) { 478 offset -= request->iovecs[i].size; 479 i++; 480 } 481 request->iovec_i = i; 482 request->iovec_offset = offset; 483 } 484 485 486 int ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length) 487 { 488 int32 index = request->iovec_i; 489 if (index < 0 || index > request->iovec_count) 490 return -1; 491 492 *address = request->iovecs[index].address + request->iovec_offset; 493 *length = request->iovecs[index].size - request->iovec_offset; 494 495 TRACE("IOV %d (+ " B_PRIu32 "): 0x%" B_PRIx64 ", %" B_PRIu32 "\n", 496 request->iovec_i, request->iovec_offset, *address, *length); 497 498 request->iovec_i++; 499 request->iovec_offset = 0; 500 return 0; 501 } 502 503 504 static status_t 505 do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request) 506 { 507 request->status = EINPROGRESS; 508 509 qpair_info* qpinfo = get_qpair(info); 510 int ret = -1; 511 if (request->write) { 512 ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start, 513 request->lba_count, (nvme_cmd_cb)io_finished_callback, request, 514 0, (nvme_req_reset_sgl_cb)ior_reset_sgl, 515 (nvme_req_next_sge_cb)ior_next_sge); 516 } else { 517 ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start, 518 request->lba_count, (nvme_cmd_cb)io_finished_callback, request, 519 0, (nvme_req_reset_sgl_cb)ior_reset_sgl, 520 (nvme_req_next_sge_cb)ior_next_sge); 521 } 522 if (ret != 0) { 523 TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE 524 " blocks failed!\n", request->write ? "write" : "read", 525 request->lba_start, request->lba_count); 526 527 request->lba_count = 0; 528 return ret; 529 } 530 531 await_status(info, qpinfo->qpair, request->status); 532 533 if (request->status != B_OK) { 534 TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE 535 " blocks failed!\n", request->write ? "write" : "read", 536 request->lba_start, request->lba_count); 537 538 request->lba_count = 0; 539 } 540 return request->status; 541 } 542 543 544 static status_t 545 nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request) 546 { 547 CALLED(); 548 549 WriteLocker writeLocker; 550 if (request->IsWrite()) 551 writeLocker.SetTo(handle->info->rounded_write_lock, false); 552 553 status_t status = acquire_sem(handle->info->dma_buffers_sem); 554 if (status != B_OK) { 555 request->SetStatusAndNotify(status); 556 return status; 557 } 558 559 const size_t block_size = handle->info->block_size; 560 561 TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR 562 "; Write %s\n", request, request->Offset(), request->Length(), 563 request->IsWrite() ? "yes" : "no"); 564 565 nvme_io_request nvme_request; 566 while (request->RemainingBytes() > 0) { 567 IOOperation operation; 568 status = handle->info->dma_resource.TranslateNext(request, &operation, 0); 569 if (status != B_OK) 570 break; 571 572 size_t transferredBytes = 0; 573 do { 574 TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR 575 ", write: %s\n", request, operation.Offset(), 576 operation.Length(), operation.IsWrite() ? "yes" : "no"); 577 578 nvme_request.write = operation.IsWrite(); 579 nvme_request.lba_start = operation.Offset() / block_size; 580 nvme_request.lba_count = operation.Length() / block_size; 581 nvme_request.iovecs = (physical_entry*)operation.Vecs(); 582 nvme_request.iovec_count = operation.VecCount(); 583 584 status = do_nvme_io_request(handle->info, &nvme_request); 585 if (status == B_OK && nvme_request.write == request->IsWrite()) 586 transferredBytes += operation.OriginalLength(); 587 588 operation.SetStatus(status); 589 } while (status == B_OK && !operation.Finish()); 590 591 if (status == B_OK && operation.Status() != B_OK) { 592 TRACE_ERROR("I/O succeeded but IOOperation failed!\n"); 593 status = operation.Status(); 594 } 595 596 operation.SetTransferredBytes(transferredBytes); 597 request->OperationFinished(&operation, status, status != B_OK, 598 operation.OriginalOffset() + transferredBytes); 599 600 handle->info->dma_resource.RecycleBuffer(operation.Buffer()); 601 602 TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request, 603 strerror(status), request->RemainingBytes()); 604 if (status != B_OK) 605 break; 606 } 607 608 release_sem(handle->info->dma_buffers_sem); 609 610 // Notify() also takes care of UnlockMemory(). 611 if (status != B_OK && request->Status() == B_OK) 612 request->SetStatusAndNotify(status); 613 else 614 request->NotifyFinished(); 615 return status; 616 } 617 618 619 static status_t 620 nvme_disk_io(void* cookie, io_request* request) 621 { 622 CALLED(); 623 624 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 625 626 nvme_io_request nvme_request; 627 memset(&nvme_request, 0, sizeof(nvme_io_request)); 628 629 nvme_request.write = request->IsWrite(); 630 631 physical_entry* vtophys = NULL; 632 MemoryDeleter vtophysDeleter; 633 634 IOBuffer* buffer = request->Buffer(); 635 status_t status = B_OK; 636 if (!buffer->IsPhysical()) { 637 status = buffer->LockMemory(request->TeamID(), request->IsWrite()); 638 if (status != B_OK) { 639 TRACE_ERROR("failed to lock memory: %s\n", strerror(status)); 640 return status; 641 } 642 // SetStatusAndNotify() takes care of unlocking memory if necessary. 643 644 // This is slightly inefficient, as we could use a BStackOrHeapArray in 645 // the optimal case (few physical entries required), but we would not 646 // know whether or not that was possible until calling get_memory_map() 647 // and then potentially reallocating, which would complicate the logic. 648 649 int32 vtophys_length = (request->Length() / B_PAGE_SIZE) + 2; 650 nvme_request.iovecs = vtophys = (physical_entry*)malloc(sizeof(physical_entry) 651 * vtophys_length); 652 if (vtophys == NULL) { 653 TRACE_ERROR("failed to allocate memory for iovecs\n"); 654 request->SetStatusAndNotify(B_NO_MEMORY); 655 return B_NO_MEMORY; 656 } 657 vtophysDeleter.SetTo(vtophys); 658 659 for (size_t i = 0; i < buffer->VecCount(); i++) { 660 generic_io_vec virt = buffer->VecAt(i); 661 uint32 entries = vtophys_length - nvme_request.iovec_count; 662 663 // Avoid copies by going straight into the vtophys array. 664 status = get_memory_map_etc(request->TeamID(), (void*)virt.base, 665 virt.length, vtophys + nvme_request.iovec_count, &entries); 666 if (status == B_BUFFER_OVERFLOW) { 667 TRACE("vtophys array was too small, reallocating\n"); 668 669 vtophysDeleter.Detach(); 670 vtophys_length *= 2; 671 nvme_request.iovecs = vtophys = (physical_entry*)realloc(vtophys, 672 sizeof(physical_entry) * vtophys_length); 673 vtophysDeleter.SetTo(vtophys); 674 if (vtophys == NULL) { 675 status = B_NO_MEMORY; 676 } else { 677 // Try again, with the larger buffer this time. 678 i--; 679 continue; 680 } 681 } 682 if (status != B_OK) { 683 TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status)); 684 request->SetStatusAndNotify(status); 685 return status; 686 } 687 688 nvme_request.iovec_count += entries; 689 } 690 } else { 691 nvme_request.iovecs = (physical_entry*)buffer->Vecs(); 692 nvme_request.iovec_count = buffer->VecCount(); 693 } 694 695 // See if we need to bounce anything other than the first or last vec. 696 const size_t block_size = handle->info->block_size; 697 bool bounceAll = false; 698 for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) { 699 if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0) 700 bounceAll = true; 701 if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0) 702 bounceAll = true; 703 } 704 705 // See if we need to bounce due to the first or last vecs. 706 if (nvme_request.iovec_count > 1) { 707 // There are middle vecs, so the first and last vecs have different restrictions: they 708 // need only be a multiple of the block size, and must end and start on a page boundary, 709 // respectively, though the start address must always be 32-bit-aligned. 710 physical_entry* entry = &nvme_request.iovecs[0]; 711 if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0 712 || (entry->address & 0x3) != 0 || (entry->size % block_size) != 0)) 713 bounceAll = true; 714 715 entry = &nvme_request.iovecs[nvme_request.iovec_count - 1]; 716 if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0 717 || (entry->size % block_size) != 0)) 718 bounceAll = true; 719 } else { 720 // There is only one vec. Check that it is a multiple of the block size, 721 // and that its address is 32-bit-aligned. 722 physical_entry* entry = &nvme_request.iovecs[0]; 723 if (!bounceAll && ((entry->address & 0x3) != 0 || (entry->size % block_size) != 0)) 724 bounceAll = true; 725 } 726 727 // See if we need to bounce due to rounding. 728 const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size); 729 phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset() 730 - rounded_pos), block_size); 731 if (rounded_pos != request->Offset() || rounded_len != request->Length()) 732 bounceAll = true; 733 734 if (bounceAll) { 735 // Let the bounced I/O routine take care of everything from here. 736 return nvme_disk_bounced_io(handle, request); 737 } 738 739 nvme_request.lba_start = rounded_pos / block_size; 740 nvme_request.lba_count = rounded_len / block_size; 741 742 // No bouncing was required. 743 ReadLocker readLocker; 744 if (nvme_request.write) 745 readLocker.SetTo(handle->info->rounded_write_lock, false); 746 747 // Error check before actually doing I/O. 748 if (status != B_OK) { 749 TRACE_ERROR("I/O failed early: %s\n", strerror(status)); 750 request->SetStatusAndNotify(status); 751 return status; 752 } 753 754 const uint32 max_io_blocks = handle->info->max_io_blocks; 755 int32 remaining = nvme_request.iovec_count; 756 while (remaining > 0) { 757 nvme_request.iovec_count = min_c(remaining, 758 NVME_MAX_SGL_DESCRIPTORS / 2); 759 760 nvme_request.lba_count = 0; 761 for (int i = 0; i < nvme_request.iovec_count; i++) { 762 int32 new_lba_count = nvme_request.lba_count 763 + (nvme_request.iovecs[i].size / block_size); 764 if (nvme_request.lba_count > 0 && new_lba_count > max_io_blocks) { 765 // We already have a nonzero length, and adding this vec would 766 // make us go over (or we already are over.) Stop adding. 767 nvme_request.iovec_count = i; 768 break; 769 } 770 771 nvme_request.lba_count = new_lba_count; 772 } 773 774 status = do_nvme_io_request(handle->info, &nvme_request); 775 if (status != B_OK) 776 break; 777 778 nvme_request.iovecs += nvme_request.iovec_count; 779 remaining -= nvme_request.iovec_count; 780 nvme_request.lba_start += nvme_request.lba_count; 781 } 782 783 if (status != B_OK) 784 TRACE_ERROR("I/O failed: %s\n", strerror(status)); 785 786 request->SetTransferredBytes(status != B_OK, 787 (nvme_request.lba_start * block_size) - rounded_pos); 788 request->SetStatusAndNotify(status); 789 return status; 790 } 791 792 793 static status_t 794 nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length) 795 { 796 CALLED(); 797 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 798 799 const off_t end = (handle->info->capacity * handle->info->block_size); 800 if (pos >= end) 801 return B_BAD_VALUE; 802 if (pos + (off_t)*length > end) 803 *length = end - pos; 804 805 IORequest request; 806 status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0); 807 if (status != B_OK) 808 return status; 809 810 status = nvme_disk_io(handle, &request); 811 *length = request.TransferredBytes(); 812 return status; 813 } 814 815 816 static status_t 817 nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length) 818 { 819 CALLED(); 820 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 821 822 const off_t end = (handle->info->capacity * handle->info->block_size); 823 if (pos >= end) 824 return B_BAD_VALUE; 825 if (pos + (off_t)*length > end) 826 *length = end - pos; 827 828 IORequest request; 829 status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0); 830 if (status != B_OK) 831 return status; 832 833 status = nvme_disk_io(handle, &request); 834 *length = request.TransferredBytes(); 835 return status; 836 } 837 838 839 static status_t 840 nvme_disk_flush(nvme_disk_driver_info* info) 841 { 842 CALLED(); 843 status_t status = EINPROGRESS; 844 845 qpair_info* qpinfo = get_qpair(info); 846 int ret = nvme_ns_flush(info->ns, qpinfo->qpair, 847 (nvme_cmd_cb)io_finished_callback, &status); 848 if (ret != 0) 849 return ret; 850 851 await_status(info, qpinfo->qpair, status); 852 return status; 853 } 854 855 856 static status_t 857 nvme_disk_trim(nvme_disk_driver_info* info, fs_trim_data* trimData) 858 { 859 CALLED(); 860 trimData->trimmed_size = 0; 861 862 const off_t deviceSize = info->capacity * info->block_size; // in bytes 863 if (deviceSize < 0) 864 return B_BAD_VALUE; 865 866 STATIC_ASSERT(sizeof(deviceSize) <= sizeof(uint64)); 867 ASSERT(deviceSize >= 0); 868 869 // Do not trim past device end. 870 for (uint32 i = 0; i < trimData->range_count; i++) { 871 uint64 offset = trimData->ranges[i].offset; 872 uint64& size = trimData->ranges[i].size; 873 874 if (offset >= (uint64)deviceSize) 875 return B_BAD_VALUE; 876 size = std::min(size, (uint64)deviceSize - offset); 877 } 878 879 // We need contiguous memory for the DSM ranges. 880 nvme_dsm_range* dsmRanges = (nvme_dsm_range*)nvme_mem_alloc_node( 881 trimData->range_count * sizeof(nvme_dsm_range), 0, 0, NULL); 882 if (dsmRanges == NULL) 883 return B_NO_MEMORY; 884 CObjectDeleter<void, void, nvme_free> dsmRangesDeleter(dsmRanges); 885 886 uint64 trimmingSize = 0; 887 for (uint32 i = 0; i < trimData->range_count; i++) { 888 uint64 offset = trimData->ranges[i].offset; 889 uint64 length = trimData->ranges[i].size; 890 891 // Round up offset and length to the block size. 892 // (Some space at the beginning and end may thus not be trimmed.) 893 offset = ROUNDUP(offset, info->block_size); 894 length -= offset - trimData->ranges[i].offset; 895 length = ROUNDDOWN(length, info->block_size); 896 897 if (length == 0) 898 continue; 899 if ((length / info->block_size) > UINT32_MAX) 900 length = uint64(UINT32_MAX) * info->block_size; 901 // TODO: Break into smaller trim ranges! 902 903 TRACE("trim %" B_PRIu64 " bytes from %" B_PRIu64 "\n", length, offset); 904 905 dsmRanges[i].attributes = 0; 906 dsmRanges[i].length = length / info->block_size; 907 dsmRanges[i].starting_lba = offset / info->block_size; 908 909 trimmingSize += dsmRanges[i].length; 910 } 911 912 status_t status = EINPROGRESS; 913 qpair_info* qpair = get_qpair(info); 914 if (nvme_ns_deallocate(info->ns, qpair->qpair, dsmRanges, trimData->range_count, 915 (nvme_cmd_cb)io_finished_callback, &status) != 0) 916 return B_IO_ERROR; 917 918 await_status(info, qpair->qpair, status); 919 if (status != B_OK) 920 return status; 921 922 trimData->trimmed_size = trimmingSize; 923 return B_OK; 924 } 925 926 927 static status_t 928 nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length) 929 { 930 CALLED(); 931 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 932 nvme_disk_driver_info* info = handle->info; 933 934 TRACE("ioctl(op = %" B_PRId32 ")\n", op); 935 936 switch (op) { 937 case B_GET_MEDIA_STATUS: 938 { 939 *(status_t *)buffer = info->media_status; 940 info->media_status = B_OK; 941 return B_OK; 942 break; 943 } 944 945 case B_GET_DEVICE_SIZE: 946 { 947 size_t size = info->capacity * info->block_size; 948 return user_memcpy(buffer, &size, sizeof(size_t)); 949 } 950 951 case B_GET_GEOMETRY: 952 { 953 if (buffer == NULL /*|| length != sizeof(device_geometry)*/) 954 return B_BAD_VALUE; 955 956 device_geometry geometry; 957 status_t status = get_geometry(handle, &geometry); 958 if (status != B_OK) 959 return status; 960 961 return user_memcpy(buffer, &geometry, sizeof(device_geometry)); 962 } 963 964 case B_GET_ICON_NAME: 965 return user_strlcpy((char*)buffer, "devices/drive-harddisk", 966 B_FILE_NAME_LENGTH); 967 968 case B_GET_VECTOR_ICON: 969 { 970 device_icon iconData; 971 if (length != sizeof(device_icon)) 972 return B_BAD_VALUE; 973 if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK) 974 return B_BAD_ADDRESS; 975 976 if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) { 977 if (user_memcpy(iconData.icon_data, kDriveIcon, 978 sizeof(kDriveIcon)) != B_OK) 979 return B_BAD_ADDRESS; 980 } 981 982 iconData.icon_size = sizeof(kDriveIcon); 983 return user_memcpy(buffer, &iconData, sizeof(device_icon)); 984 } 985 986 case B_FLUSH_DRIVE_CACHE: 987 return nvme_disk_flush(info); 988 989 case B_TRIM_DEVICE: 990 ASSERT(IS_KERNEL_ADDRESS(buffer)); 991 return nvme_disk_trim(info, (fs_trim_data*)buffer); 992 } 993 994 return B_DEV_INVALID_IOCTL; 995 } 996 997 998 // #pragma mark - driver module API 999 1000 1001 static float 1002 nvme_disk_supports_device(device_node *parent) 1003 { 1004 CALLED(); 1005 1006 const char* bus; 1007 uint16 baseClass, subClass; 1008 1009 if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false) != B_OK 1010 || sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass, false) != B_OK 1011 || sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE, &subClass, false) != B_OK) 1012 return -1.0f; 1013 1014 if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage) 1015 return 0.0f; 1016 1017 if (subClass != PCI_nvm) 1018 return 0.0f; 1019 1020 TRACE("NVMe device found!\n"); 1021 return 1.0f; 1022 } 1023 1024 1025 static status_t 1026 nvme_disk_register_device(device_node* parent) 1027 { 1028 CALLED(); 1029 1030 device_attr attrs[] = { 1031 { B_DEVICE_PRETTY_NAME, B_STRING_TYPE, { string: "NVMe Disk" } }, 1032 { NULL } 1033 }; 1034 1035 return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME, 1036 attrs, NULL, NULL); 1037 } 1038 1039 1040 static status_t 1041 nvme_disk_init_driver(device_node* node, void** cookie) 1042 { 1043 CALLED(); 1044 1045 int ret = nvme_lib_init((enum nvme_log_level)0, (enum nvme_log_facility)0, NULL); 1046 if (ret != 0) { 1047 TRACE_ERROR("libnvme initialization failed!\n"); 1048 return ret; 1049 } 1050 1051 nvme_disk_driver_info* info = new nvme_disk_driver_info; 1052 if (info == NULL) 1053 return B_NO_MEMORY; 1054 1055 info->media_status = B_OK; 1056 info->node = node; 1057 1058 info->ctrlr = NULL; 1059 1060 *cookie = info; 1061 return B_OK; 1062 } 1063 1064 1065 static void 1066 nvme_disk_uninit_driver(void* _cookie) 1067 { 1068 CALLED(); 1069 1070 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 1071 free(info); 1072 } 1073 1074 1075 static status_t 1076 nvme_disk_register_child_devices(void* _cookie) 1077 { 1078 CALLED(); 1079 1080 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 1081 status_t status; 1082 1083 int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR); 1084 if (id < 0) 1085 return id; 1086 1087 char name[64]; 1088 snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw", 1089 id); 1090 1091 status = sDeviceManager->publish_device(info->node, name, 1092 NVME_DISK_DEVICE_MODULE_NAME); 1093 1094 return status; 1095 } 1096 1097 1098 // #pragma mark - 1099 1100 1101 module_dependency module_dependencies[] = { 1102 { B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager }, 1103 { NULL } 1104 }; 1105 1106 struct device_module_info sNvmeDiskDevice = { 1107 { 1108 NVME_DISK_DEVICE_MODULE_NAME, 1109 0, 1110 NULL 1111 }, 1112 1113 nvme_disk_init_device, 1114 nvme_disk_uninit_device, 1115 NULL, // remove, 1116 1117 nvme_disk_open, 1118 nvme_disk_close, 1119 nvme_disk_free, 1120 nvme_disk_read, 1121 nvme_disk_write, 1122 nvme_disk_io, 1123 nvme_disk_ioctl, 1124 1125 NULL, // select 1126 NULL, // deselect 1127 }; 1128 1129 struct driver_module_info sNvmeDiskDriver = { 1130 { 1131 NVME_DISK_DRIVER_MODULE_NAME, 1132 0, 1133 NULL 1134 }, 1135 1136 nvme_disk_supports_device, 1137 nvme_disk_register_device, 1138 nvme_disk_init_driver, 1139 nvme_disk_uninit_driver, 1140 nvme_disk_register_child_devices, 1141 NULL, // rescan 1142 NULL, // removed 1143 }; 1144 1145 module_info* modules[] = { 1146 (module_info*)&sNvmeDiskDriver, 1147 (module_info*)&sNvmeDiskDevice, 1148 NULL 1149 }; 1150