/*
 * Copyright 2019-2020, Haiku, Inc. All rights reserved.
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 *		Augustin Cavalier <waddlesplash>
 */

// NVMe disk driver: exposes the first namespace of an NVMe controller as a
// Haiku disk device ("disk/nvme/<id>/raw"), built on top of libnvme.

#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <condition_variable.h>
#include <AutoDeleter.h>
#include <kernel.h>
#include <util/AutoLock.h>

#include <fs/devfs.h>
#include <bus/PCI.h>
#include <PCI_x86.h>
#include <vm/vm.h>

#include "IORequest.h"

extern "C" {
#include <libnvme/nvme.h>
#include <libnvme/nvme_internal.h>
}


//#define TRACE_NVME_DISK
#ifdef TRACE_NVME_DISK
#	define TRACE(x...) dprintf("nvme_disk: " x)
#else
#	define TRACE(x...) ;
#endif
#define TRACE_ALWAYS(x...)	dprintf("nvme_disk: " x)
#define TRACE_ERROR(x...)	dprintf("\33[33mnvme_disk:\33[0m " x)
#define CALLED()			TRACE("CALLED %s\n", __PRETTY_FUNCTION__)


// HVIF vector icon data served via B_GET_VECTOR_ICON.
static const uint8 kDriveIcon[] = {
	0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16,
	0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39,
	0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02,
	0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01,
	0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47,
	0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f,
	0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0,
	0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38,
	0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48,
	0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2,
	0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80,
	0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a,
	0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39,
	0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a,
	0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27,
	0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a,
	0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08,
	0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17,
	0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02,
	0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01,
	0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99,
	0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2,
	0xe6, 0x01, 0x17, 0x82, 0x00, 0x04
};


#define NVME_DISK_DRIVER_MODULE_NAME	"drivers/disk/nvme_disk/driver_v1"
#define NVME_DISK_DEVICE_MODULE_NAME	"drivers/disk/nvme_disk/device_v1"
#define NVME_DISK_DEVICE_ID_GENERATOR	"nvme_disk/device_id"

// Upper bound on I/O queue pairs this driver will use, regardless of how many
// the controller offers.
#define NVME_MAX_QPAIRS					(8)


static device_manager_info* sDeviceManager;
static pci_x86_module_info* sPCIx86Module;

// Per-controller driver state. One of these exists per published NVMe disk.
typedef struct {
	device_node* node;
	pci_info info;

	struct nvme_ctrlr* ctrlr;

	// Only the first namespace is exported (see TODO in init_device).
	struct nvme_ns* ns;
	uint64 capacity;			// in blocks
	uint32 block_size;			// bytes per block (namespace sector size)
	uint32 max_io_blocks;		// controller max transfer size, in blocks
	status_t media_status;

	struct qpair_info {
		struct nvme_qpair*	qpair;
	} qpairs[NVME_MAX_QPAIRS];
	uint32 qpair_count;
	uint32 next_qpair;			// round-robin cursor, see get_qpair()

	// Bounce-buffer machinery for requests that violate the controller's
	// alignment/size restrictions; dma_buffers_sem counts free buffers.
	DMAResource dma_resource;
	sem_id dma_buffers_sem;

	// Serializes "rounded" (read-modify-write) writes against other I/O;
	// writers take it exclusively in bounced_io, direct I/O takes it shared.
	rw_lock rounded_write_lock;

	// Notified from the interrupt handler; waiters poll their qpair after
	// each wakeup (see await_status()).
	ConditionVariable interrupt;
} nvme_disk_driver_info;
typedef nvme_disk_driver_info::qpair_info qpair_info;


// Per-open-handle cookie; currently just a pointer back to the driver info.
typedef struct {
	nvme_disk_driver_info*		info;
} nvme_disk_handle;


// Fill in a device_geometry for this disk from the stored capacity and
// block size. Always returns B_OK.
static status_t
get_geometry(nvme_disk_handle* handle, device_geometry* geometry)
{
	nvme_disk_driver_info* info = handle->info;

	devfs_compute_geometry_size(geometry, info->capacity, info->block_size);

	geometry->device_type = B_DISK;
	geometry->removable = false;

	geometry->read_only = false;
	geometry->write_once = false;

	TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %d, %d, %d, %d\n",
		geometry->bytes_per_sector, geometry->sectors_per_track,
		geometry->cylinder_count, geometry->head_count, geometry->device_type,
		geometry->removable, geometry->read_only, geometry->write_once);

	return B_OK;
}


// Returns the base-2 logarithm of x if x is an exact power of two;
// otherwise returns the position of the highest set bit (or -1 for x == 0,
// since no iteration matches and y underflows the loop).
static int
log2(uint32 x)
{
	int y;

	for (y = 31; y >= 0; --y) {
		if (x == ((uint32)1 << y))
			break;
	}

	return y;
}


// Record the namespace capacity (in blocks) and block size in the driver
// info structure.
static void
nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity,
	uint32 blockSize)
{
	TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %" B_PRIu32 ")\n",
		info, capacity, blockSize);

	// get log2, if possible
	// NOTE(review): blockShift is computed but never used afterwards —
	// looks like leftover from an earlier shift-based implementation.
	uint32 blockShift = log2(blockSize);

	if ((1UL << blockShift) != blockSize)
		blockShift = 0;

	info->capacity = capacity;
	info->block_size = blockSize;
}


// #pragma mark - device module API


static int32 nvme_interrupt_handler(void* _info);


// Bring up the controller: read PCI config, open controller and first
// namespace via libnvme, allocate qpairs and DMA bounce buffers, and hook
// up the (MSI or legacy) interrupt.
static status_t
nvme_disk_init_device(void* _info, void** _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;

	pci_device_module_info* pci;
	pci_device* pcidev;
	device_node* parent = sDeviceManager->get_parent_node(info->node);
	sDeviceManager->get_driver(parent, (driver_module_info**)&pci,
		(void**)&pcidev);
	pci->get_pci_info(pcidev, &info->info);
	sDeviceManager->put_node(parent);

	// construct the libnvme pci_device struct
	// NOTE(review): `device` is heap-allocated and never deleted on the
	// error paths below (nor in uninit_device) — see the "Deallocate PCI"
	// TODO there; presumably libnvme keeps referencing it while open.
	pci_device* device = new pci_device;
	device->vendor_id = info->info.vendor_id;
	device->device_id = info->info.device_id;
	device->subvendor_id = 0;
	device->subdevice_id = 0;

	device->domain = 0;
	device->bus = info->info.bus;
	device->dev = info->info.device;
	device->func = info->info.function;

	device->pci_info = &info->info;

	// enable busmaster and memory mapped access
	uint16 command = pci->read_pci_config(pcidev, PCI_command, 2);
	command |= PCI_command_master | PCI_command_memory;
	pci->write_pci_config(pcidev, PCI_command, 2, command);

	// open the controller
	info->ctrlr = nvme_ctrlr_open(device, NULL);
	if (info->ctrlr == NULL) {
		TRACE_ERROR("failed to open the controller!\n");
		return B_ERROR;
	}

	struct nvme_ctrlr_stat cstat;
	int err = nvme_ctrlr_stat(info->ctrlr, &cstat);
	if (err != 0) {
		TRACE_ERROR("failed to get controller information!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn);
	TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size);
	TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs);

	// TODO: export more than just the first namespace!
	info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]);
	if (info->ns == NULL) {
		TRACE_ERROR("failed to open namespace!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_ERROR;
	}

	struct nvme_ns_stat nsstat;
	err = nvme_ns_stat(info->ns, &nsstat);
	if (err != 0) {
		TRACE_ERROR("failed to get namespace information!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	// store capacity information
	nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size);

	TRACE("capacity: %" B_PRIu64 ", block_size %" B_PRIu32 "\n",
		info->capacity, info->block_size);

	// allocate qpairs
	info->qpair_count = info->next_qpair = 0;
	for (uint32 i = 0; i < NVME_MAX_QPAIRS && i < cstat.io_qpairs; i++) {
		info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr,
			(enum nvme_qprio)0, 0);
		if (info->qpairs[i].qpair == NULL)
			break;

		info->qpair_count++;
	}
	if (info->qpair_count == 0) {
		TRACE_ERROR("failed to allocate qpairs!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_NO_MEMORY;
	}

	// allocate DMA buffers
	int buffers = info->qpair_count * 2;

	dma_restrictions restrictions = {};
	restrictions.alignment = B_PAGE_SIZE;
		// Technically, the first and last segments in a transfer can be
		// unaligned, and the rest only need to have sizes that are a multiple
		// of the block size.
	restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2);
	restrictions.max_transfer_size = cstat.max_xfer_size;
	info->max_io_blocks = cstat.max_xfer_size / nsstat.sector_size;

	err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers);
	if (err != 0) {
		TRACE_ERROR("failed to initialize DMA resource!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem");
	if (info->dma_buffers_sem < 0) {
		TRACE_ERROR("failed to create DMA buffers semaphore!\n");
		nvme_ctrlr_close(info->ctrlr);
		return info->dma_buffers_sem;
	}

	// set up rounded-write lock
	rw_lock_init(&info->rounded_write_lock, "nvme rounded writes");

	// set up interrupt
	if (get_module(B_PCI_X86_MODULE_NAME, (module_info**)&sPCIx86Module)
			!= B_OK) {
		sPCIx86Module = NULL;
	}

	command = pci->read_pci_config(pcidev, PCI_command, 2);
	command &= ~(PCI_command_int_disable);
	pci->write_pci_config(pcidev, PCI_command, 2, command);

	uint8 irq = info->info.u.h0.interrupt_line;
	if (sPCIx86Module != NULL) {
#if 0
		// MSI-X support disabled (left here for future use).
		if (sPCIx86Module->get_msix_count(info->info.bus, info->info.device,
				info->info.function)) {
			uint8 msixVector = 0;
			if (sPCIx86Module->configure_msix(info->info.bus, info->info.device,
					info->info.function, 1, &msixVector) == B_OK
				&& sPCIx86Module->enable_msix(info->info.bus, info->info.device,
					info->info.function) == B_OK) {
				TRACE_ALWAYS("using MSI-X\n");
				irq = msixVector;
			}
		} else
#endif
		if (sPCIx86Module->get_msi_count(info->info.bus,
				info->info.device, info->info.function) >= 1) {
			uint8 msiVector = 0;
			if (sPCIx86Module->configure_msi(info->info.bus, info->info.device,
					info->info.function, 1, &msiVector) == B_OK
				&& sPCIx86Module->enable_msi(info->info.bus, info->info.device,
					info->info.function) == B_OK) {
				TRACE_ALWAYS("using message signaled interrupts\n");
				irq = msiVector;
			}
		}
	}

	if (irq == 0 || irq == 0xFF) {
		// NOTE(review): this error path leaks the sem, rw_lock and qpairs
		// allocated above — verify against upstream whether cleanup is
		// intended here.
		TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n",
			info->info.bus, info->info.device, info->info.function);
		return B_ERROR;
	}
	info->interrupt.Init(NULL, NULL);
	install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info, B_NO_HANDLED_INFO);

	if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) {
		// Encoding per NVMe spec: aggregation time (100us units) in bits
		// 15:8, aggregation threshold in bits 7:0.
		uint32 microseconds = 16, threshold = 32;
		nvme_admin_set_feature(info->ctrlr, false, NVME_FEAT_INTERRUPT_COALESCING,
			((microseconds / 100) << 8) | threshold, 0, NULL);
	}

	*_cookie = info;
	return B_OK;
}


// Tear down what init_device set up: interrupt handler, rounded-write lock,
// namespace and controller handles.
static void
nvme_disk_uninit_device(void* _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;

	// NOTE(review): if MSI was enabled, the vector installed may differ from
	// interrupt_line — see the MSI TODO below; verify the handler is removed
	// from the right vector.
	remove_io_interrupt_handler(info->info.u.h0.interrupt_line,
		nvme_interrupt_handler, (void*)info);

	rw_lock_destroy(&info->rounded_write_lock);

	nvme_ns_close(info->ns);
	nvme_ctrlr_close(info->ctrlr);

	// TODO: Deallocate MSI(-X).
	// TODO: Deallocate PCI.
}


// Open hook: allocate a per-handle cookie referencing the driver info.
static status_t
nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	nvme_disk_handle* handle = (nvme_disk_handle*)malloc(
		sizeof(nvme_disk_handle));
	if (handle == NULL)
		return B_NO_MEMORY;

	handle->info = info;

	*_cookie = handle;
	return B_OK;
}


// Close hook: nothing to do; the handle is released in free().
static status_t
nvme_disk_close(void* cookie)
{
	CALLED();

	//nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	return B_OK;
}


// Free hook: release the per-handle cookie.
static status_t
nvme_disk_free(void* cookie)
{
	CALLED();

	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	free(handle);
	return B_OK;
}


// #pragma mark - I/O


// Interrupt handler: wake every thread waiting in await_status(); the
// waiters themselves poll their qpair for completions.
static int32
nvme_interrupt_handler(void* _info)
{
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	info->interrupt.NotifyAll();
	return 0;
}


// Pick a qpair round-robin. atomic_add on next_qpair keeps concurrent
// callers spread across the allocated qpairs.
static qpair_info*
get_qpair(nvme_disk_driver_info* info)
{
	return &info->qpairs[atomic_add((int32*)&info->next_qpair, 1)
		% info->qpair_count];
}


// libnvme completion callback: translate the completion status into a
// status_t stored at the caller-provided address (watched by await_status).
static void
io_finished_callback(status_t* status, const struct nvme_cpl* cpl)
{
	*status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK;
}


// Block until `status` leaves EINPROGRESS: repeatedly poll the qpair and
// sleep on the interrupt condition variable in between. After ~4 timeouts
// of 5 seconds each, fail the qpair and give up with B_TIMED_OUT.
static void
await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status)
{
	CALLED();

	ConditionVariableEntry entry;
	int timeouts = 0;
	while (status == EINPROGRESS) {
		// Add ourselves to the wait queue *before* polling, so a completion
		// that fires in between still wakes us.
		info->interrupt.Add(&entry);

		nvme_qpair_poll(qpair, 0);

		if (status != EINPROGRESS)
			return;

		if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) {
			// This should never happen, as we are woken up on every interrupt
			// no matter the qpair or transfer within; so if it does occur,
			// that probably means the controller stalled or something.
			TRACE_ERROR("timed out waiting for interrupt!\n");
			if (timeouts++ >= 3) {
				nvme_qpair_fail(qpair);
				status = B_TIMED_OUT;
				return;
			}
		}

		nvme_qpair_poll(qpair, 0);
	}
}


// State for one libnvme scatter-gather transfer; iovec_i/iovec_offset form
// the cursor used by the ior_reset_sgl/ior_next_sge callbacks below.
struct nvme_io_request {
	status_t status;

	bool write;

	off_t lba_start;
	size_t lba_count;

	physical_entry* iovecs;
	int32 iovec_count;

	int32 iovec_i;
	uint32 iovec_offset;
};


// libnvme SGL reset callback: position the cursor at byte `offset` into the
// iovec list.
void ior_reset_sgl(nvme_io_request* request, uint32_t offset)
{
	TRACE("IOR Reset: %" B_PRIu32 "\n", offset);

	int32 i = 0;
	while (offset > 0 && request->iovecs[i].size <= offset) {
		offset -= request->iovecs[i].size;
		i++;
	}
	request->iovec_i = i;
	request->iovec_offset = offset;
}


// libnvme SGL next-segment callback: emit the current iovec (adjusted by
// the cursor offset) and advance. Returns -1 when the list is exhausted.
int ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length)
{
	int32 index = request->iovec_i;
	// NOTE(review): the bound check uses `>` — index == iovec_count would
	// read one past the array; `>=` looks intended. Verify against upstream.
	if (index < 0 || index > request->iovec_count)
		return -1;

	*address = request->iovecs[index].address + request->iovec_offset;
	*length = request->iovecs[index].size - request->iovec_offset;

	// NOTE(review): format string appears to be missing a '%' before the
	// first B_PRIu32 — the offset is passed but not formatted.
	TRACE("IOV %d (+ " B_PRIu32 "): 0x%" B_PRIx64 ", %" B_PRIu32 "\n",
		request->iovec_i, request->iovec_offset, *address, *length);

	request->iovec_i++;
	request->iovec_offset = 0;
	return 0;
}


// Submit one read or write described by `request` on a round-robin qpair
// and wait for it to complete. On failure, lba_count is zeroed so callers
// don't account the blocks as transferred.
static status_t
do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request)
{
	request->status = EINPROGRESS;

	qpair_info* qpinfo = get_qpair(info);
	int ret = -1;
	if (request->write) {
		ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	} else {
		ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	}
	if (ret != 0) {
		TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
		return ret;
	}

	await_status(info, qpinfo->qpair, request->status);

	if (request->status != B_OK) {
		TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
	}
	return request->status;
}


// Slow path: run the io_request through the DMAResource bounce buffers,
// one IOOperation at a time. Used when the request's physical vectors or
// offset/length violate the controller's alignment/size restrictions.
// Writes take the rounded_write_lock exclusively so read-modify-write
// cycles can't interleave with other I/O to the same blocks.
static status_t
nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request)
{
	CALLED();

	WriteLocker writeLocker;
	if (request->IsWrite())
		writeLocker.SetTo(handle->info->rounded_write_lock, false);

	status_t status = acquire_sem(handle->info->dma_buffers_sem);
	if (status != B_OK) {
		request->SetStatusAndNotify(status);
		return status;
	}

	const size_t block_size = handle->info->block_size;

	TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR
		"; Write %s\n", request, request->Offset(), request->Length(),
		request->IsWrite() ? "yes" : "no");

	nvme_io_request nvme_request;
	while (request->RemainingBytes() > 0) {
		IOOperation operation;
		status = handle->info->dma_resource.TranslateNext(request, &operation, 0);
		if (status != B_OK)
			break;

		size_t transferredBytes = 0;
		do {
			TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR
				", write: %s\n", request, operation.Offset(),
				operation.Length(), operation.IsWrite() ? "yes" : "no");

			nvme_request.write = operation.IsWrite();
			nvme_request.lba_start = operation.Offset() / block_size;
			nvme_request.lba_count = operation.Length() / block_size;
			nvme_request.iovecs = (physical_entry*)operation.Vecs();
			nvme_request.iovec_count = operation.VecCount();

			status = do_nvme_io_request(handle->info, &nvme_request);

			// Only count bytes toward the caller when the operation phase
			// matches the request direction (a rounded write first issues a
			// read phase, which must not be counted).
			if (status == B_OK && nvme_request.write == request->IsWrite())
				transferredBytes += operation.OriginalLength();

			operation.SetStatus(status);
		} while (status == B_OK && !operation.Finish());

		if (status == B_OK && operation.Status() != B_OK) {
			TRACE_ERROR("I/O succeeded but IOOperation failed!\n");
			status = operation.Status();
		}

		operation.SetTransferredBytes(transferredBytes);
		request->OperationFinished(&operation, status, status != B_OK,
			operation.OriginalOffset() + transferredBytes);

		handle->info->dma_resource.RecycleBuffer(operation.Buffer());

		TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request,
			strerror(status), request->RemainingBytes());
		if (status != B_OK)
			break;
	}

	release_sem(handle->info->dma_buffers_sem);

	// Notify() also takes care of UnlockMemory().
	if (status != B_OK && request->Status() == B_OK)
		request->SetStatusAndNotify(status);
	else
		request->NotifyFinished();
	return status;
}


// Main I/O hook: translate the io_request's buffer into physical vectors,
// decide whether the transfer can go to the controller directly, and either
// submit it (split to respect max_io_blocks and SGL limits) or fall back to
// nvme_disk_bounced_io().
static status_t
nvme_disk_io(void* cookie, io_request* request)
{
	CALLED();

	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	nvme_io_request nvme_request;
	memset(&nvme_request, 0, sizeof(nvme_io_request));

	nvme_request.write = request->IsWrite();

	physical_entry* vtophys = NULL;
	MemoryDeleter vtophysDeleter;

	IOBuffer* buffer = request->Buffer();
	status_t status = B_OK;
	if (!buffer->IsPhysical()) {
		status = buffer->LockMemory(request->TeamID(), request->IsWrite());
		if (status != B_OK) {
			TRACE_ERROR("failed to lock memory: %s\n", strerror(status));
			return status;
		}
		// SetStatusAndNotify() takes care of unlocking memory if necessary.

		// This is slightly inefficient, as we could use a BStackOrHeapArray in
		// the optimal case (few physical entries required), but we would not
		// know whether or not that was possible until calling get_memory_map()
		// and then potentially reallocating, which would complicate the logic.

		int32 vtophys_length = (request->Length() / B_PAGE_SIZE) + 2;
		nvme_request.iovecs = vtophys = (physical_entry*)malloc(sizeof(physical_entry)
			* vtophys_length);
		if (vtophys == NULL) {
			TRACE_ERROR("failed to allocate memory for iovecs\n");
			request->SetStatusAndNotify(B_NO_MEMORY);
			return B_NO_MEMORY;
		}
		vtophysDeleter.SetTo(vtophys);

		for (size_t i = 0; i < buffer->VecCount(); i++) {
			generic_io_vec virt = buffer->VecAt(i);
			uint32 entries = vtophys_length - nvme_request.iovec_count;

			// Avoid copies by going straight into the vtophys array.
			status = get_memory_map_etc(request->TeamID(), (void*)virt.base,
				virt.length, vtophys + nvme_request.iovec_count, &entries);
			if (status == B_BUFFER_OVERFLOW) {
				TRACE("vtophys array was too small, reallocating\n");

				// Detach before realloc: on realloc failure the old block is
				// freed by realloc's contract... NOTE(review): if realloc
				// fails, the original allocation is NOT freed by realloc and
				// the deleter was detached — verify this path doesn't leak.
				vtophysDeleter.Detach();
				vtophys_length *= 2;
				nvme_request.iovecs = vtophys = (physical_entry*)realloc(vtophys,
					sizeof(physical_entry) * vtophys_length);
				vtophysDeleter.SetTo(vtophys);
				if (vtophys == NULL) {
					status = B_NO_MEMORY;
				} else {
					// Try again, with the larger buffer this time.
					i--;
					continue;
				}
			}
			if (status != B_OK) {
				TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status));
				request->SetStatusAndNotify(status);
				return status;
			}

			nvme_request.iovec_count += entries;
		}
	} else {
		nvme_request.iovecs = (physical_entry*)buffer->Vecs();
		nvme_request.iovec_count = buffer->VecCount();
	}

	// See if we need to bounce anything other than the first or last vec.
	const size_t block_size = handle->info->block_size;
	bool bounceAll = false;
	for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) {
		if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0)
			bounceAll = true;
		if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0)
			bounceAll = true;
	}

	// See if we need to bounce due to the first or last vec (which, unlike middle vecs,
	// need only be a multiple of the block size, and must end and start on a page boundary,
	// respectively, though the start address must always be 32-bit-aligned.)
	if (nvme_request.iovec_count > 1) {
		physical_entry* entry = &nvme_request.iovecs[0];
		if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0
				|| (entry->address & 0x3) != 0 || (entry->size % block_size) != 0))
			bounceAll = true;

		entry = &nvme_request.iovecs[nvme_request.iovec_count - 1];
		if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0
				|| (entry->address & 0x3) != 0 || (entry->size % block_size) != 0))
			bounceAll = true;
	}

	// See if we need to bounce due to rounding.
	const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size);
	phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset()
		- rounded_pos), block_size);
	if (rounded_pos != request->Offset() || rounded_len != request->Length())
		bounceAll = true;

	if (bounceAll) {
		// Let the bounced I/O routine take care of everything from here.
		return nvme_disk_bounced_io(handle, request);
	}

	nvme_request.lba_start = rounded_pos / block_size;
	nvme_request.lba_count = rounded_len / block_size;

	// No bouncing was required.
	// Shared lock: direct I/O may run concurrently with other direct I/O,
	// but not with an in-flight rounded (read-modify-write) write.
	ReadLocker readLocker;
	if (nvme_request.write)
		readLocker.SetTo(handle->info->rounded_write_lock, false);

	// Error check before actually doing I/O.
	if (status != B_OK) {
		TRACE_ERROR("I/O failed early: %s\n", strerror(status));
		request->SetStatusAndNotify(status);
		return status;
	}

	// Split the transfer so each submission fits both the SGL descriptor
	// limit and the controller's max transfer size (max_io_blocks).
	const uint32 max_io_blocks = handle->info->max_io_blocks;
	int32 remaining = nvme_request.iovec_count;
	while (remaining > 0) {
		nvme_request.iovec_count = min_c(remaining,
			NVME_MAX_SGL_DESCRIPTORS / 2);

		nvme_request.lba_count = 0;
		for (int i = 0; i < nvme_request.iovec_count; i++) {
			int32 new_lba_count = nvme_request.lba_count
				+ (nvme_request.iovecs[i].size / block_size);
			if (nvme_request.lba_count > 0 && new_lba_count > max_io_blocks) {
				// We already have a nonzero length, and adding this vec would
				// make us go over (or we already are over.) Stop adding.
				nvme_request.iovec_count = i;
				break;
			}

			nvme_request.lba_count = new_lba_count;
		}

		status = do_nvme_io_request(handle->info, &nvme_request);
		if (status != B_OK)
			break;

		nvme_request.iovecs += nvme_request.iovec_count;
		remaining -= nvme_request.iovec_count;
		nvme_request.lba_start += nvme_request.lba_count;
	}

	if (status != B_OK)
		TRACE_ERROR("I/O failed: %s\n", strerror(status));

	request->SetTransferredBytes(status != B_OK,
		(nvme_request.lba_start * block_size) - rounded_pos);
	request->SetStatusAndNotify(status);
	return status;
}


// read() hook: clamp to device bounds, then delegate to nvme_disk_io()
// through a synchronous IORequest.
static status_t
nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	const off_t end = (handle->info->capacity * handle->info->block_size);
	if (pos >= end)
		return B_BAD_VALUE;
	if (pos + (off_t)*length > end)
		*length = end - pos;

	IORequest request;
	status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0);
	if (status != B_OK)
		return status;

	status = nvme_disk_io(handle, &request);
	*length = request.TransferredBytes();
	return status;
}


// write() hook: same as read, but with an IORequest marked as a write.
static status_t
nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	const off_t end = (handle->info->capacity * handle->info->block_size);
	if (pos >= end)
		return B_BAD_VALUE;
	if (pos + (off_t)*length > end)
		*length = end - pos;

	IORequest request;
	status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0);
	if (status != B_OK)
		return status;

	status = nvme_disk_io(handle, &request);
	*length = request.TransferredBytes();
	return status;
}


// Issue an NVMe FLUSH on a round-robin qpair and wait for completion.
static status_t
nvme_disk_flush(nvme_disk_driver_info* info)
{
	status_t status = EINPROGRESS;

	qpair_info* qpinfo = get_qpair(info);
	int ret = nvme_ns_flush(info->ns, qpinfo->qpair,
		(nvme_cmd_cb)io_finished_callback, &status);
	if (ret != 0)
		return ret;

	await_status(info, qpinfo->qpair, status);
	return status;
}


// ioctl hook: media status, size, geometry, icon, and cache flush.
static status_t
nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	nvme_disk_driver_info* info = handle->info;

	TRACE("ioctl(op = %" B_PRId32 ")\n", op);

	switch (op) {
		case B_GET_MEDIA_STATUS:
		{
			// NOTE(review): writes through the user pointer directly, unlike
			// the other cases which use user_memcpy — verify this is safe
			// for userland callers.
			*(status_t *)buffer = info->media_status;
			info->media_status = B_OK;
			return B_OK;
			break;
		}

		case B_GET_DEVICE_SIZE:
		{
			size_t size = info->capacity * info->block_size;
			return user_memcpy(buffer, &size, sizeof(size_t));
		}

		case B_GET_GEOMETRY:
		{
			if (buffer == NULL /*|| length != sizeof(device_geometry)*/)
				return B_BAD_VALUE;

			device_geometry geometry;
			status_t status = get_geometry(handle, &geometry);
			if (status != B_OK)
				return status;

			return user_memcpy(buffer, &geometry, sizeof(device_geometry));
		}

		case B_GET_ICON_NAME:
			return user_strlcpy((char*)buffer, "devices/drive-harddisk",
				B_FILE_NAME_LENGTH);

		case B_GET_VECTOR_ICON:
		{
			device_icon iconData;
			if (length != sizeof(device_icon))
				return B_BAD_VALUE;
			if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK)
				return B_BAD_ADDRESS;

			// Copy the icon only if the caller's buffer is large enough;
			// icon_size is always updated so the caller can retry.
			if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) {
				if (user_memcpy(iconData.icon_data, kDriveIcon,
						sizeof(kDriveIcon)) != B_OK)
					return B_BAD_ADDRESS;
			}

			iconData.icon_size = sizeof(kDriveIcon);
			return user_memcpy(buffer, &iconData, sizeof(device_icon));
		}

		case B_FLUSH_DRIVE_CACHE:
			return nvme_disk_flush(info);
	}

	return B_DEV_INVALID_IOCTL;
}


// #pragma mark - driver module API


// Probe: accept PCI devices with mass-storage base class and NVM subclass.
static float
nvme_disk_supports_device(device_node *parent)
{
	CALLED();

	const char* bus;
	uint16 baseClass, subClass;

	if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false) != B_OK
		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass, false) != B_OK
		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE, &subClass, false) != B_OK)
		return -1.0f;

	if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage)
		return 0.0f;

	if (subClass != PCI_nvm)
		return 0.0f;

	TRACE("NVMe device found!\n");
	return 1.0f;
}


// Register this driver's node under the matched PCI device.
static status_t
nvme_disk_register_device(device_node* parent)
{
	CALLED();

	device_attr attrs[] = {
		{ B_DEVICE_PRETTY_NAME, B_STRING_TYPE, { string: "NVMe Disk" } },
		{ NULL }
	};

	return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME,
		attrs, NULL, NULL);
}


// Initialize libnvme and allocate the per-controller driver info.
static status_t
nvme_disk_init_driver(device_node* node, void** cookie)
{
	CALLED();

	int ret = nvme_lib_init((enum nvme_log_level)0, (enum nvme_log_facility)0, NULL);
	if (ret != 0) {
		TRACE_ERROR("libnvme initialization failed!\n");
		return ret;
	}

	nvme_disk_driver_info* info = new nvme_disk_driver_info;
	if (info == NULL)
		return B_NO_MEMORY;

	info->media_status = B_OK;
	info->node = node;

	info->ctrlr = NULL;

	*cookie = info;
	return B_OK;
}


// Release the driver info allocated in init_driver.
static void
nvme_disk_uninit_driver(void* _cookie)
{
	CALLED();

	// NOTE(review): info was allocated with `new` (and contains members with
	// destructors, e.g. DMAResource) but is released with free() — this skips
	// the destructors; `delete info` looks intended. Verify against upstream.
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
	free(info);
}


// Publish the raw device node ("disk/nvme/<id>/raw") for this controller.
static status_t
nvme_disk_register_child_devices(void* _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
	status_t status;

	int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR);
	if (id < 0)
		return id;

	char name[64];
	snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw",
		id);

	status = sDeviceManager->publish_device(info->node, name,
		NVME_DISK_DEVICE_MODULE_NAME);

	return status;
}


// #pragma mark -


module_dependency module_dependencies[] = {
	{ B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager },
	{ NULL }
};

struct device_module_info sNvmeDiskDevice = {
	{
		NVME_DISK_DEVICE_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_init_device,
	nvme_disk_uninit_device,
	NULL,	// remove,

	nvme_disk_open,
	nvme_disk_close,
	nvme_disk_free,
	nvme_disk_read,
	nvme_disk_write,
	nvme_disk_io,
	nvme_disk_ioctl,

	NULL,	// select
	NULL,	// deselect
};

struct driver_module_info sNvmeDiskDriver = {
	{
		NVME_DISK_DRIVER_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_supports_device,
	nvme_disk_register_device,
	nvme_disk_init_driver,
	nvme_disk_uninit_driver,
	nvme_disk_register_child_devices,
	NULL,	// rescan
	NULL,	// removed
};

module_info* modules[] = {
	(module_info*)&sNvmeDiskDriver,
	(module_info*)&sNvmeDiskDevice,
	NULL
};