1 /* 2 * Copyright 2019-2020, Haiku, Inc. All rights reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Augustin Cavalier <waddlesplash> 7 */ 8 9 10 #include <stdio.h> 11 #include <stdlib.h> 12 13 #include <algorithm> 14 #include <condition_variable.h> 15 #include <AutoDeleter.h> 16 #include <kernel.h> 17 #include <util/AutoLock.h> 18 19 #include <fs/devfs.h> 20 #include <bus/PCI.h> 21 #include <PCI_x86.h> 22 #include <vm/vm.h> 23 24 #include "IORequest.h" 25 26 extern "C" { 27 #include <libnvme/nvme.h> 28 #include <libnvme/nvme_internal.h> 29 } 30 31 32 //#define TRACE_NVME_DISK 33 #ifdef TRACE_NVME_DISK 34 # define TRACE(x...) dprintf("nvme_disk: " x) 35 #else 36 # define TRACE(x...) ; 37 #endif 38 #define TRACE_ALWAYS(x...) dprintf("nvme_disk: " x) 39 #define TRACE_ERROR(x...) dprintf("\33[33mnvme_disk:\33[0m " x) 40 #define CALLED() TRACE("CALLED %s\n", __PRETTY_FUNCTION__) 41 42 43 static const uint8 kDriveIcon[] = { 44 0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16, 45 0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39, 46 0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02, 47 0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01, 48 0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47, 49 0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f, 50 0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0, 51 0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38, 52 0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48, 53 0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2, 54 0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80, 55 0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 56 0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39, 57 0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a, 58 0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27, 59 0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a, 60 0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08, 61 0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17, 62 0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02, 63 0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01, 64 0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99, 65 0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2, 66 0xe6, 0x01, 0x17, 0x82, 0x00, 0x04 67 }; 68 69 70 #define NVME_DISK_DRIVER_MODULE_NAME "drivers/disk/nvme_disk/driver_v1" 71 #define NVME_DISK_DEVICE_MODULE_NAME "drivers/disk/nvme_disk/device_v1" 72 #define NVME_DISK_DEVICE_ID_GENERATOR "nvme_disk/device_id" 73 74 #define NVME_MAX_QPAIRS (8) 75 76 77 static device_manager_info* sDeviceManager; 78 static pci_x86_module_info* sPCIx86Module; 79 80 typedef struct { 81 device_node* node; 82 pci_info info; 83 84 struct nvme_ctrlr* ctrlr; 85 86 struct nvme_ns* ns; 87 uint64 capacity; 88 uint32 block_size; 89 uint32 max_io_blocks; 90 status_t media_status; 91 92 struct qpair_info { 93 struct nvme_qpair* qpair; 94 } qpairs[NVME_MAX_QPAIRS]; 95 uint32 qpair_count; 96 uint32 next_qpair; 97 98 DMAResource dma_resource; 99 sem_id dma_buffers_sem; 100 101 rw_lock rounded_write_lock; 102 103 ConditionVariable interrupt; 104 } nvme_disk_driver_info; 105 typedef nvme_disk_driver_info::qpair_info qpair_info; 106 107 108 typedef struct { 109 nvme_disk_driver_info* info; 110 } nvme_disk_handle; 111 112 113 static status_t 114 get_geometry(nvme_disk_handle* handle, device_geometry* geometry) 115 { 116 nvme_disk_driver_info* info = handle->info; 117 118 devfs_compute_geometry_size(geometry, info->capacity, info->block_size); 119 120 geometry->device_type = B_DISK; 121 geometry->removable = false; 122 123 geometry->read_only = false; 124 geometry->write_once = false; 125 126 TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %d, %d, %d, %d\n", 127 geometry->bytes_per_sector, geometry->sectors_per_track, 128 geometry->cylinder_count, geometry->head_count, geometry->device_type, 129 geometry->removable, geometry->read_only, geometry->write_once); 130 131 return B_OK; 132 } 133 134 135 static int 136 log2(uint32 x) 137 { 138 int y; 139 140 for (y = 31; y >= 0; --y) { 141 if (x == ((uint32)1 << y)) 142 break; 143 } 144 145 return y; 146 } 147 148 149 static void 150 nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity, 151 uint32 blockSize) 152 { 153 TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %" B_PRIu32 ")\n", 154 info, capacity, blockSize); 155 156 // get log2, if possible 157 uint32 blockShift = log2(blockSize); 158 159 if ((1UL << blockShift) != blockSize) 160 blockShift = 0; 161 162 info->capacity = capacity; 163 info->block_size = blockSize; 164 } 165 166 167 // #pragma mark - device module API 168 169 170 static int32 nvme_interrupt_handler(void* _info); 171 172 173 static status_t 174 nvme_disk_init_device(void* _info, void** _cookie) 175 { 176 CALLED(); 177 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 178 179 pci_device_module_info* pci; 180 pci_device* pcidev; 181 device_node* parent = sDeviceManager->get_parent_node(info->node); 182 sDeviceManager->get_driver(parent, (driver_module_info**)&pci, 183 (void**)&pcidev); 184 pci->get_pci_info(pcidev, &info->info); 185 sDeviceManager->put_node(parent); 186 187 // construct the libnvme pci_device struct 188 pci_device* device = new pci_device; 189 device->vendor_id = info->info.vendor_id; 190 device->device_id = info->info.device_id; 191 device->subvendor_id = 0; 192 device->subdevice_id = 0; 193 194 device->domain = 0; 195 device->bus = info->info.bus; 196 device->dev = info->info.device; 197 device->func = info->info.function; 198 199 device->pci_info = &info->info; 200 201 // open the controller 202 info->ctrlr = nvme_ctrlr_open(device, NULL); 203 if (info->ctrlr == NULL) { 204 TRACE_ERROR("failed to open the controller!\n"); 205 return B_ERROR; 206 } 207 208 struct nvme_ctrlr_stat cstat; 209 int err = nvme_ctrlr_stat(info->ctrlr, &cstat); 210 if (err != 0) { 211 TRACE_ERROR("failed to get controller information!\n"); 212 nvme_ctrlr_close(info->ctrlr); 213 return err; 214 } 215 216 TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn); 217 TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size); 218 TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs); 219 220 // TODO: export more than just the first namespace! 221 info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]); 222 if (info->ns == NULL) { 223 TRACE_ERROR("failed to open namespace!\n"); 224 nvme_ctrlr_close(info->ctrlr); 225 return B_ERROR; 226 } 227 228 struct nvme_ns_stat nsstat; 229 err = nvme_ns_stat(info->ns, &nsstat); 230 if (err != 0) { 231 TRACE_ERROR("failed to get namespace information!\n"); 232 nvme_ctrlr_close(info->ctrlr); 233 return err; 234 } 235 236 // store capacity information 237 nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size); 238 239 TRACE("capacity: %" B_PRIu64 ", block_size %" B_PRIu32 "\n", 240 info->capacity, info->block_size); 241 242 // allocate qpairs 243 info->qpair_count = info->next_qpair = 0; 244 for (uint32 i = 0; i < NVME_MAX_QPAIRS && i < cstat.io_qpairs; i++) { 245 info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr, 246 (enum nvme_qprio)0, 0); 247 if (info->qpairs[i].qpair == NULL) 248 break; 249 250 info->qpair_count++; 251 } 252 if (info->qpair_count == 0) { 253 TRACE_ERROR("failed to allocate qpairs!\n"); 254 nvme_ctrlr_close(info->ctrlr); 255 return B_NO_MEMORY; 256 } 257 258 // allocate DMA buffers 259 int buffers = info->qpair_count * 2; 260 261 dma_restrictions restrictions = {}; 262 restrictions.alignment = B_PAGE_SIZE; 263 // Technically, the first and last segments in a transfer can be 264 // unaligned, and the rest only need to have sizes that are a multiple 265 // of the block size. 266 restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2); 267 restrictions.max_transfer_size = cstat.max_xfer_size; 268 info->max_io_blocks = cstat.max_xfer_size / nsstat.sector_size; 269 270 err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers); 271 if (err != 0) { 272 TRACE_ERROR("failed to initialize DMA resource!\n"); 273 nvme_ctrlr_close(info->ctrlr); 274 return err; 275 } 276 277 info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem"); 278 if (info->dma_buffers_sem < 0) { 279 TRACE_ERROR("failed to create DMA buffers semaphore!\n"); 280 nvme_ctrlr_close(info->ctrlr); 281 return info->dma_buffers_sem; 282 } 283 284 // set up rounded-write lock 285 rw_lock_init(&info->rounded_write_lock, "nvme rounded writes"); 286 287 // set up interrupt 288 if (get_module(B_PCI_X86_MODULE_NAME, (module_info**)&sPCIx86Module) 289 != B_OK) { 290 sPCIx86Module = NULL; 291 } 292 293 uint16 command = pci->read_pci_config(pcidev, PCI_command, 2); 294 command &= ~(PCI_command_int_disable); 295 pci->write_pci_config(pcidev, PCI_command, 2, command); 296 297 uint8 irq = info->info.u.h0.interrupt_line; 298 if (sPCIx86Module != NULL) { 299 if (sPCIx86Module->get_msix_count(info->info.bus, info->info.device, 300 info->info.function)) { 301 uint8 msixVector = 0; 302 if (sPCIx86Module->configure_msix(info->info.bus, info->info.device, 303 info->info.function, 1, &msixVector) == B_OK 304 && sPCIx86Module->enable_msix(info->info.bus, info->info.device, 305 info->info.function) == B_OK) { 306 TRACE_ALWAYS("using MSI-X\n"); 307 irq = msixVector; 308 } 309 } else if (sPCIx86Module->get_msi_count(info->info.bus, 310 info->info.device, info->info.function) >= 1) { 311 uint8 msiVector = 0; 312 if (sPCIx86Module->configure_msi(info->info.bus, info->info.device, 313 info->info.function, 1, &msiVector) == B_OK 314 && sPCIx86Module->enable_msi(info->info.bus, info->info.device, 315 info->info.function) == B_OK) { 316 TRACE_ALWAYS("using message signaled interrupts\n"); 317 irq = msiVector; 318 } 319 } 320 } 321 322 if (irq == 0 || irq == 0xFF) { 323 TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n", 324 info->info.bus, info->info.device, info->info.function); 325 return B_ERROR; 326 } 327 info->interrupt.Init(NULL, NULL); 328 install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info, B_NO_HANDLED_INFO); 329 330 if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) { 331 uint32 microseconds = 16, threshold = 32; 332 nvme_admin_set_feature(info->ctrlr, false, NVME_FEAT_INTERRUPT_COALESCING, 333 ((microseconds / 100) << 8) | threshold, 0, NULL); 334 } 335 336 *_cookie = info; 337 return B_OK; 338 } 339 340 341 static void 342 nvme_disk_uninit_device(void* _cookie) 343 { 344 CALLED(); 345 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 346 347 remove_io_interrupt_handler(info->info.u.h0.interrupt_line, 348 nvme_interrupt_handler, (void*)info); 349 350 rw_lock_destroy(&info->rounded_write_lock); 351 352 nvme_ns_close(info->ns); 353 nvme_ctrlr_close(info->ctrlr); 354 355 // TODO: Deallocate MSI(-X). 356 // TODO: Deallocate PCI. 357 } 358 359 360 static status_t 361 nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie) 362 { 363 CALLED(); 364 365 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 366 nvme_disk_handle* handle = (nvme_disk_handle*)malloc( 367 sizeof(nvme_disk_handle)); 368 if (handle == NULL) 369 return B_NO_MEMORY; 370 371 handle->info = info; 372 373 *_cookie = handle; 374 return B_OK; 375 } 376 377 378 static status_t 379 nvme_disk_close(void* cookie) 380 { 381 CALLED(); 382 383 //nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 384 return B_OK; 385 } 386 387 388 static status_t 389 nvme_disk_free(void* cookie) 390 { 391 CALLED(); 392 393 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 394 free(handle); 395 return B_OK; 396 } 397 398 399 // #pragma mark - I/O 400 401 402 static int32 403 nvme_interrupt_handler(void* _info) 404 { 405 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 406 info->interrupt.NotifyAll(); 407 return 0; 408 } 409 410 411 static qpair_info* 412 get_qpair(nvme_disk_driver_info* info) 413 { 414 return &info->qpairs[atomic_add((int32*)&info->next_qpair, 1) 415 % info->qpair_count]; 416 } 417 418 419 static void 420 io_finished_callback(status_t* status, const struct nvme_cpl* cpl) 421 { 422 *status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK; 423 } 424 425 426 static void 427 await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status) 428 { 429 CALLED(); 430 431 ConditionVariableEntry entry; 432 int timeouts = 0; 433 while (status == EINPROGRESS) { 434 info->interrupt.Add(&entry); 435 436 nvme_qpair_poll(qpair, 0); 437 438 if (status != EINPROGRESS) 439 return; 440 441 if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) { 442 // This should never happen, as we are woken up on every interrupt 443 // no matter the qpair or transfer within; so if it does occur, 444 // that probably means the controller stalled or something. 445 446 TRACE_ERROR("timed out waiting for interrupt!\n"); 447 if (timeouts++ >= 3) { 448 nvme_qpair_fail(qpair); 449 status = B_TIMED_OUT; 450 return; 451 } 452 } 453 454 nvme_qpair_poll(qpair, 0); 455 } 456 } 457 458 459 struct nvme_io_request { 460 status_t status; 461 462 bool write; 463 464 off_t lba_start; 465 size_t lba_count; 466 467 physical_entry* iovecs; 468 int32 iovec_count; 469 470 int32 iovec_i; 471 uint32 iovec_offset; 472 }; 473 474 475 void ior_reset_sgl(nvme_io_request* request, uint32_t offset) 476 { 477 TRACE("IOR Reset: %" B_PRIu32 "\n", offset); 478 479 int32 i = 0; 480 while (offset > 0 && request->iovecs[i].size <= offset) { 481 offset -= request->iovecs[i].size; 482 i++; 483 } 484 request->iovec_i = i; 485 request->iovec_offset = offset; 486 } 487 488 489 int ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length) 490 { 491 int32 index = request->iovec_i; 492 if (index < 0 || index > request->iovec_count) 493 return -1; 494 495 *address = request->iovecs[index].address + request->iovec_offset; 496 *length = request->iovecs[index].size - request->iovec_offset; 497 498 TRACE("IOV %d (+ " B_PRIu32 "): 0x%" B_PRIx64 ", %" B_PRIu32 "\n", 499 request->iovec_i, request->iovec_offset, *address, *length); 500 501 request->iovec_i++; 502 request->iovec_offset = 0; 503 return 0; 504 } 505 506 507 static status_t 508 do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request) 509 { 510 request->status = EINPROGRESS; 511 512 qpair_info* qpinfo = get_qpair(info); 513 int ret = -1; 514 if (request->write) { 515 ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start, 516 request->lba_count, (nvme_cmd_cb)io_finished_callback, request, 517 0, (nvme_req_reset_sgl_cb)ior_reset_sgl, 518 (nvme_req_next_sge_cb)ior_next_sge); 519 } else { 520 ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start, 521 request->lba_count, (nvme_cmd_cb)io_finished_callback, request, 522 0, (nvme_req_reset_sgl_cb)ior_reset_sgl, 523 (nvme_req_next_sge_cb)ior_next_sge); 524 } 525 if (ret != 0) { 526 TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE 527 " blocks failed!\n", request->write ? "write" : "read", 528 request->lba_start, request->lba_count); 529 530 request->lba_count = 0; 531 return ret; 532 } 533 534 await_status(info, qpinfo->qpair, request->status); 535 536 if (request->status != B_OK) { 537 TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE 538 " blocks failed!\n", request->write ? "write" : "read", 539 request->lba_start, request->lba_count); 540 541 request->lba_count = 0; 542 } 543 return request->status; 544 } 545 546 547 static status_t 548 nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request) 549 { 550 CALLED(); 551 552 WriteLocker writeLocker; 553 if (request->IsWrite()) 554 writeLocker.SetTo(handle->info->rounded_write_lock, false); 555 556 status_t status = acquire_sem(handle->info->dma_buffers_sem); 557 if (status != B_OK) { 558 request->SetStatusAndNotify(status); 559 return status; 560 } 561 562 const size_t block_size = handle->info->block_size; 563 564 TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR 565 "; Write %s\n", request, request->Offset(), request->Length(), 566 request->IsWrite() ? "yes" : "no"); 567 568 nvme_io_request nvme_request; 569 while (request->RemainingBytes() > 0) { 570 IOOperation operation; 571 status = handle->info->dma_resource.TranslateNext(request, &operation, 0); 572 if (status != B_OK) 573 break; 574 575 size_t transferredBytes = 0; 576 do { 577 TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR 578 ", write: %s\n", request, operation.Offset(), 579 operation.Length(), operation.IsWrite() ? "yes" : "no"); 580 581 nvme_request.write = operation.IsWrite(); 582 nvme_request.lba_start = operation.Offset() / block_size; 583 nvme_request.lba_count = operation.Length() / block_size; 584 nvme_request.iovecs = (physical_entry*)operation.Vecs(); 585 nvme_request.iovec_count = operation.VecCount(); 586 587 status = do_nvme_io_request(handle->info, &nvme_request); 588 if (status == B_OK && nvme_request.write == request->IsWrite()) 589 transferredBytes += operation.OriginalLength(); 590 591 operation.SetStatus(status); 592 } while (status == B_OK && !operation.Finish()); 593 594 if (status == B_OK && operation.Status() != B_OK) { 595 TRACE_ERROR("I/O succeeded but IOOperation failed!\n"); 596 status = operation.Status(); 597 } 598 599 operation.SetTransferredBytes(transferredBytes); 600 request->OperationFinished(&operation, status, status != B_OK, 601 operation.OriginalOffset() + transferredBytes); 602 603 handle->info->dma_resource.RecycleBuffer(operation.Buffer()); 604 605 TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request, 606 strerror(status), request->RemainingBytes()); 607 if (status != B_OK) 608 break; 609 } 610 611 release_sem(handle->info->dma_buffers_sem); 612 613 // Notify() also takes care of UnlockMemory(). 614 if (status != B_OK && request->Status() == B_OK) 615 request->SetStatusAndNotify(status); 616 else 617 request->NotifyFinished(); 618 return status; 619 } 620 621 622 static status_t 623 nvme_disk_io(void* cookie, io_request* request) 624 { 625 CALLED(); 626 627 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 628 629 nvme_io_request nvme_request; 630 memset(&nvme_request, 0, sizeof(nvme_io_request)); 631 632 nvme_request.write = request->IsWrite(); 633 634 physical_entry* vtophys = NULL; 635 MemoryDeleter vtophysDeleter; 636 637 IOBuffer* buffer = request->Buffer(); 638 status_t status = B_OK; 639 if (!buffer->IsPhysical()) { 640 status = buffer->LockMemory(request->TeamID(), request->IsWrite()); 641 if (status != B_OK) { 642 TRACE_ERROR("failed to lock memory: %s\n", strerror(status)); 643 return status; 644 } 645 // SetStatusAndNotify() takes care of unlocking memory if necessary. 646 647 // This is slightly inefficient, as we could use a BStackOrHeapArray in 648 // the optimal case (few physical entries required), but we would not 649 // know whether or not that was possible until calling get_memory_map() 650 // and then potentially reallocating, which would complicate the logic. 651 652 int32 vtophys_length = (request->Length() / B_PAGE_SIZE) + 2; 653 nvme_request.iovecs = vtophys = (physical_entry*)malloc(sizeof(physical_entry) 654 * vtophys_length); 655 if (vtophys == NULL) { 656 TRACE_ERROR("failed to allocate memory for iovecs\n"); 657 request->SetStatusAndNotify(B_NO_MEMORY); 658 return B_NO_MEMORY; 659 } 660 vtophysDeleter.SetTo(vtophys); 661 662 for (size_t i = 0; i < buffer->VecCount(); i++) { 663 generic_io_vec virt = buffer->VecAt(i); 664 uint32 entries = vtophys_length - nvme_request.iovec_count; 665 666 // Avoid copies by going straight into the vtophys array. 667 status = get_memory_map_etc(request->TeamID(), (void*)virt.base, 668 virt.length, vtophys + nvme_request.iovec_count, &entries); 669 if (status == B_BUFFER_OVERFLOW) { 670 TRACE("vtophys array was too small, reallocating\n"); 671 672 vtophysDeleter.Detach(); 673 vtophys_length *= 2; 674 nvme_request.iovecs = vtophys = (physical_entry*)realloc(vtophys, 675 sizeof(physical_entry) * vtophys_length); 676 vtophysDeleter.SetTo(vtophys); 677 if (vtophys == NULL) { 678 status = B_NO_MEMORY; 679 } else { 680 // Try again, with the larger buffer this time. 681 i--; 682 continue; 683 } 684 } 685 if (status != B_OK) { 686 TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status)); 687 request->SetStatusAndNotify(status); 688 return status; 689 } 690 691 nvme_request.iovec_count += entries; 692 } 693 } else { 694 nvme_request.iovecs = (physical_entry*)buffer->Vecs(); 695 nvme_request.iovec_count = buffer->VecCount(); 696 } 697 698 // See if we need to bounce anything other than the first or last vec. 699 const size_t block_size = handle->info->block_size; 700 bool bounceAll = false; 701 for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) { 702 if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0) 703 bounceAll = true; 704 if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0) 705 bounceAll = true; 706 } 707 708 // See if we need to bounce due to the first or last vec. 709 if (nvme_request.iovec_count > 1) { 710 physical_entry* entry = &nvme_request.iovecs[0]; 711 if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0 712 || (entry->size % block_size) != 0)) 713 bounceAll = true; 714 715 entry = &nvme_request.iovecs[nvme_request.iovec_count - 1]; 716 if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0 717 || (entry->size % block_size) != 0)) 718 bounceAll = true; 719 } 720 721 // See if we need to bounce due to rounding. 722 const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size); 723 phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset() 724 - rounded_pos), block_size); 725 if (rounded_pos != request->Offset() || rounded_len != request->Length()) 726 bounceAll = true; 727 728 if (bounceAll) { 729 // Let the bounced I/O routine take care of everything from here. 730 return nvme_disk_bounced_io(handle, request); 731 } 732 733 nvme_request.lba_start = rounded_pos / block_size; 734 nvme_request.lba_count = rounded_len / block_size; 735 736 // No bouncing was required. 737 ReadLocker readLocker; 738 if (nvme_request.write) 739 readLocker.SetTo(handle->info->rounded_write_lock, false); 740 741 // Error check before actually doing I/O. 742 if (status != B_OK) { 743 TRACE_ERROR("I/O failed early: %s\n", strerror(status)); 744 request->SetStatusAndNotify(status); 745 return status; 746 } 747 748 const uint32 max_io_blocks = handle->info->max_io_blocks; 749 int32 remaining = nvme_request.iovec_count; 750 while (remaining > 0) { 751 nvme_request.iovec_count = min_c(remaining, 752 NVME_MAX_SGL_DESCRIPTORS / 2); 753 754 nvme_request.lba_count = 0; 755 for (int i = 0; i < nvme_request.iovec_count; i++) { 756 int32 new_lba_count = nvme_request.lba_count 757 + (nvme_request.iovecs[i].size / block_size); 758 if (nvme_request.lba_count > 0 && new_lba_count > max_io_blocks) { 759 // We already have a nonzero length, and adding this vec would 760 // make us go over (or we already are over.) Stop adding. 761 nvme_request.iovec_count = i; 762 break; 763 } 764 765 nvme_request.lba_count = new_lba_count; 766 } 767 768 status = do_nvme_io_request(handle->info, &nvme_request); 769 if (status != B_OK) 770 break; 771 772 nvme_request.iovecs += nvme_request.iovec_count; 773 remaining -= nvme_request.iovec_count; 774 nvme_request.lba_start += nvme_request.lba_count; 775 } 776 777 if (status != B_OK) 778 TRACE_ERROR("I/O failed: %s\n", strerror(status)); 779 780 request->SetTransferredBytes(status != B_OK, 781 (nvme_request.lba_start * block_size) - rounded_pos); 782 request->SetStatusAndNotify(status); 783 return status; 784 } 785 786 787 static status_t 788 nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length) 789 { 790 CALLED(); 791 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 792 793 const off_t end = (handle->info->capacity * handle->info->block_size); 794 if (pos >= end) 795 return B_BAD_VALUE; 796 if (pos + (off_t)*length > end) 797 *length = end - pos; 798 799 IORequest request; 800 status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0); 801 if (status != B_OK) 802 return status; 803 804 status = nvme_disk_io(handle, &request); 805 *length = request.TransferredBytes(); 806 return status; 807 } 808 809 810 static status_t 811 nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length) 812 { 813 CALLED(); 814 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 815 816 const off_t end = (handle->info->capacity * handle->info->block_size); 817 if (pos >= end) 818 return B_BAD_VALUE; 819 if (pos + (off_t)*length > end) 820 *length = end - pos; 821 822 IORequest request; 823 status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0); 824 if (status != B_OK) 825 return status; 826 827 status = nvme_disk_io(handle, &request); 828 *length = request.TransferredBytes(); 829 return status; 830 } 831 832 833 static status_t 834 nvme_disk_flush(nvme_disk_driver_info* info) 835 { 836 status_t status = EINPROGRESS; 837 838 qpair_info* qpinfo = get_qpair(info); 839 int ret = nvme_ns_flush(info->ns, qpinfo->qpair, 840 (nvme_cmd_cb)io_finished_callback, &status); 841 if (ret != 0) 842 return ret; 843 844 await_status(info, qpinfo->qpair, status); 845 return status; 846 } 847 848 849 static status_t 850 nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length) 851 { 852 CALLED(); 853 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 854 nvme_disk_driver_info* info = handle->info; 855 856 TRACE("ioctl(op = %" B_PRId32 ")\n", op); 857 858 switch (op) { 859 case B_GET_MEDIA_STATUS: 860 { 861 *(status_t *)buffer = info->media_status; 862 info->media_status = B_OK; 863 return B_OK; 864 break; 865 } 866 867 case B_GET_DEVICE_SIZE: 868 { 869 size_t size = info->capacity * info->block_size; 870 return user_memcpy(buffer, &size, sizeof(size_t)); 871 } 872 873 case B_GET_GEOMETRY: 874 { 875 if (buffer == NULL /*|| length != sizeof(device_geometry)*/) 876 return B_BAD_VALUE; 877 878 device_geometry geometry; 879 status_t status = get_geometry(handle, &geometry); 880 if (status != B_OK) 881 return status; 882 883 return user_memcpy(buffer, &geometry, sizeof(device_geometry)); 884 } 885 886 case B_GET_ICON_NAME: 887 return user_strlcpy((char*)buffer, "devices/drive-harddisk", 888 B_FILE_NAME_LENGTH); 889 890 case B_GET_VECTOR_ICON: 891 { 892 device_icon iconData; 893 if (length != sizeof(device_icon)) 894 return B_BAD_VALUE; 895 if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK) 896 return B_BAD_ADDRESS; 897 898 if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) { 899 if (user_memcpy(iconData.icon_data, kDriveIcon, 900 sizeof(kDriveIcon)) != B_OK) 901 return B_BAD_ADDRESS; 902 } 903 904 iconData.icon_size = sizeof(kDriveIcon); 905 return user_memcpy(buffer, &iconData, sizeof(device_icon)); 906 } 907 908 case B_FLUSH_DRIVE_CACHE: 909 return nvme_disk_flush(info); 910 } 911 912 return B_DEV_INVALID_IOCTL; 913 } 914 915 916 // #pragma mark - driver module API 917 918 919 static float 920 nvme_disk_supports_device(device_node *parent) 921 { 922 CALLED(); 923 924 const char* bus; 925 uint16 baseClass, subClass; 926 927 if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false) != B_OK 928 || sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass, false) != B_OK 929 || sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE, &subClass, false) != B_OK) 930 return -1.0f; 931 932 if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage) 933 return 0.0f; 934 935 if (subClass != PCI_nvm) 936 return 0.0f; 937 938 TRACE("NVMe device found!\n"); 939 return 1.0f; 940 } 941 942 943 static status_t 944 nvme_disk_register_device(device_node* parent) 945 { 946 CALLED(); 947 948 device_attr attrs[] = { 949 { NULL } 950 }; 951 952 return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME, 953 attrs, NULL, NULL); 954 } 955 956 957 static status_t 958 nvme_disk_init_driver(device_node* node, void** cookie) 959 { 960 CALLED(); 961 962 int ret = nvme_lib_init((enum nvme_log_level)0, (enum nvme_log_facility)0, NULL); 963 if (ret != 0) { 964 TRACE_ERROR("libnvme initialization failed!\n"); 965 return ret; 966 } 967 968 nvme_disk_driver_info* info = new nvme_disk_driver_info; 969 if (info == NULL) 970 return B_NO_MEMORY; 971 972 info->media_status = B_OK; 973 info->node = node; 974 975 info->ctrlr = NULL; 976 977 *cookie = info; 978 return B_OK; 979 } 980 981 982 static void 983 nvme_disk_uninit_driver(void* _cookie) 984 { 985 CALLED(); 986 987 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 988 free(info); 989 } 990 991 992 static status_t 993 nvme_disk_register_child_devices(void* _cookie) 994 { 995 CALLED(); 996 997 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 998 status_t status; 999 1000 int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR); 1001 if (id < 0) 1002 return id; 1003 1004 char name[64]; 1005 snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw", 1006 id); 1007 1008 status = sDeviceManager->publish_device(info->node, name, 1009 NVME_DISK_DEVICE_MODULE_NAME); 1010 1011 return status; 1012 } 1013 1014 1015 // #pragma mark - 1016 1017 1018 module_dependency module_dependencies[] = { 1019 {B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager}, 1020 {} 1021 }; 1022 1023 struct device_module_info sNvmeDiskDevice = { 1024 { 1025 NVME_DISK_DEVICE_MODULE_NAME, 1026 0, 1027 NULL 1028 }, 1029 1030 nvme_disk_init_device, 1031 nvme_disk_uninit_device, 1032 NULL, // remove, 1033 1034 nvme_disk_open, 1035 nvme_disk_close, 1036 nvme_disk_free, 1037 nvme_disk_read, 1038 nvme_disk_write, 1039 nvme_disk_io, 1040 nvme_disk_ioctl, 1041 1042 NULL, // select 1043 NULL, // deselect 1044 }; 1045 1046 struct driver_module_info sNvmeDiskDriver = { 1047 { 1048 NVME_DISK_DRIVER_MODULE_NAME, 1049 0, 1050 NULL 1051 }, 1052 1053 nvme_disk_supports_device, 1054 nvme_disk_register_device, 1055 nvme_disk_init_driver, 1056 nvme_disk_uninit_driver, 1057 nvme_disk_register_child_devices, 1058 NULL, // rescan 1059 NULL, // removed 1060 }; 1061 1062 module_info* modules[] = { 1063 (module_info*)&sNvmeDiskDriver, 1064 (module_info*)&sNvmeDiskDevice, 1065 NULL 1066 }; 1067