1 /* 2 * Copyright 2019-2020, Haiku, Inc. All rights reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Augustin Cavalier <waddlesplash> 7 */ 8 9 10 #include <stdio.h> 11 #include <stdlib.h> 12 13 #include <algorithm> 14 #include <condition_variable.h> 15 #include <AutoDeleter.h> 16 #include <kernel.h> 17 #include <util/AutoLock.h> 18 19 #include <fs/devfs.h> 20 #include <bus/PCI.h> 21 #include <PCI_x86.h> 22 #include <vm/vm.h> 23 24 #include "IORequest.h" 25 26 extern "C" { 27 #include <libnvme/nvme.h> 28 #include <libnvme/nvme_internal.h> 29 } 30 31 32 //#define TRACE_NVME_DISK 33 #ifdef TRACE_NVME_DISK 34 # define TRACE(x...) dprintf("nvme_disk: " x) 35 #else 36 # define TRACE(x...) ; 37 #endif 38 #define TRACE_ALWAYS(x...) dprintf("nvme_disk: " x) 39 #define TRACE_ERROR(x...) dprintf("\33[33mnvme_disk:\33[0m " x) 40 #define CALLED() TRACE("CALLED %s\n", __PRETTY_FUNCTION__) 41 42 43 static const uint8 kDriveIcon[] = { 44 0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16, 45 0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39, 46 0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02, 47 0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01, 48 0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47, 49 0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f, 50 0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0, 51 0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38, 52 0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48, 53 0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2, 54 0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80, 55 0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 56 0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39, 57 0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a, 58 0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27, 59 0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a, 60 0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08, 61 0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17, 62 0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02, 63 0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01, 64 0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99, 65 0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2, 66 0xe6, 0x01, 0x17, 0x82, 0x00, 0x04 67 }; 68 69 70 #define NVME_DISK_DRIVER_MODULE_NAME "drivers/disk/nvme_disk/driver_v1" 71 #define NVME_DISK_DEVICE_MODULE_NAME "drivers/disk/nvme_disk/device_v1" 72 #define NVME_DISK_DEVICE_ID_GENERATOR "nvme_disk/device_id" 73 74 #define NVME_MAX_QPAIRS (8) 75 76 77 static device_manager_info* sDeviceManager; 78 static pci_x86_module_info* sPCIx86Module; 79 80 typedef struct { 81 device_node* node; 82 pci_info info; 83 84 struct nvme_ctrlr* ctrlr; 85 86 struct nvme_ns* ns; 87 uint64 capacity; 88 uint32 block_size; 89 uint32 max_io_blocks; 90 status_t media_status; 91 92 struct qpair_info { 93 struct nvme_qpair* qpair; 94 } qpairs[NVME_MAX_QPAIRS]; 95 uint32 qpair_count; 96 uint32 next_qpair; 97 98 DMAResource dma_resource; 99 sem_id dma_buffers_sem; 100 101 rw_lock rounded_write_lock; 102 103 ConditionVariable interrupt; 104 } nvme_disk_driver_info; 105 typedef nvme_disk_driver_info::qpair_info qpair_info; 106 107 108 typedef struct { 109 nvme_disk_driver_info* info; 110 } nvme_disk_handle; 111 112 113 static status_t 114 get_geometry(nvme_disk_handle* handle, device_geometry* geometry) 115 { 116 nvme_disk_driver_info* info = handle->info; 117 118 devfs_compute_geometry_size(geometry, info->capacity, info->block_size); 119 120 geometry->device_type = B_DISK; 121 geometry->removable = false; 122 123 geometry->read_only = false; 124 geometry->write_once = false; 125 126 TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %d, %d, %d, %d\n", 127 geometry->bytes_per_sector, geometry->sectors_per_track, 128 geometry->cylinder_count, geometry->head_count, geometry->device_type, 129 geometry->removable, geometry->read_only, geometry->write_once); 130 131 return B_OK; 132 } 133 134 135 static int 136 log2(uint32 x) 137 { 138 int y; 139 140 for (y = 31; y >= 0; --y) { 141 if (x == ((uint32)1 << y)) 142 break; 143 } 144 145 return y; 146 } 147 148 149 static void 150 nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity, 151 uint32 blockSize) 152 { 153 TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %" B_PRIu32 ")\n", 154 info, capacity, blockSize); 155 156 // get log2, if possible 157 uint32 blockShift = log2(blockSize); 158 159 if ((1UL << blockShift) != blockSize) 160 blockShift = 0; 161 162 info->capacity = capacity; 163 info->block_size = blockSize; 164 } 165 166 167 // #pragma mark - device module API 168 169 170 static int32 nvme_interrupt_handler(void* _info); 171 172 173 static status_t 174 nvme_disk_init_device(void* _info, void** _cookie) 175 { 176 CALLED(); 177 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 178 179 pci_device_module_info* pci; 180 pci_device* pcidev; 181 device_node* parent = sDeviceManager->get_parent_node(info->node); 182 sDeviceManager->get_driver(parent, (driver_module_info**)&pci, 183 (void**)&pcidev); 184 pci->get_pci_info(pcidev, &info->info); 185 sDeviceManager->put_node(parent); 186 187 // construct the libnvme pci_device struct 188 pci_device* device = new pci_device; 189 device->vendor_id = info->info.vendor_id; 190 device->device_id = info->info.device_id; 191 device->subvendor_id = 0; 192 device->subdevice_id = 0; 193 194 device->domain = 0; 195 device->bus = info->info.bus; 196 device->dev = info->info.device; 197 device->func = info->info.function; 198 199 device->pci_info = &info->info; 200 201 // enable busmaster and memory mapped access 202 uint16 command = pci->read_pci_config(pcidev, PCI_command, 2); 203 command |= PCI_command_master | PCI_command_memory; 204 pci->write_pci_config(pcidev, PCI_command, 2, command); 205 206 // open the controller 207 info->ctrlr = nvme_ctrlr_open(device, NULL); 208 if (info->ctrlr == NULL) { 209 TRACE_ERROR("failed to open the controller!\n"); 210 return B_ERROR; 211 } 212 213 struct nvme_ctrlr_stat cstat; 214 int err = nvme_ctrlr_stat(info->ctrlr, &cstat); 215 if (err != 0) { 216 TRACE_ERROR("failed to get controller information!\n"); 217 nvme_ctrlr_close(info->ctrlr); 218 return err; 219 } 220 221 TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn); 222 TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size); 223 TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs); 224 225 // TODO: export more than just the first namespace! 226 info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]); 227 if (info->ns == NULL) { 228 TRACE_ERROR("failed to open namespace!\n"); 229 nvme_ctrlr_close(info->ctrlr); 230 return B_ERROR; 231 } 232 233 struct nvme_ns_stat nsstat; 234 err = nvme_ns_stat(info->ns, &nsstat); 235 if (err != 0) { 236 TRACE_ERROR("failed to get namespace information!\n"); 237 nvme_ctrlr_close(info->ctrlr); 238 return err; 239 } 240 241 // store capacity information 242 nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size); 243 244 TRACE("capacity: %" B_PRIu64 ", block_size %" B_PRIu32 "\n", 245 info->capacity, info->block_size); 246 247 // allocate qpairs 248 info->qpair_count = info->next_qpair = 0; 249 for (uint32 i = 0; i < NVME_MAX_QPAIRS && i < cstat.io_qpairs; i++) { 250 info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr, 251 (enum nvme_qprio)0, 0); 252 if (info->qpairs[i].qpair == NULL) 253 break; 254 255 info->qpair_count++; 256 } 257 if (info->qpair_count == 0) { 258 TRACE_ERROR("failed to allocate qpairs!\n"); 259 nvme_ctrlr_close(info->ctrlr); 260 return B_NO_MEMORY; 261 } 262 263 // allocate DMA buffers 264 int buffers = info->qpair_count * 2; 265 266 dma_restrictions restrictions = {}; 267 restrictions.alignment = B_PAGE_SIZE; 268 // Technically, the first and last segments in a transfer can be 269 // unaligned, and the rest only need to have sizes that are a multiple 270 // of the block size. 271 restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2); 272 restrictions.max_transfer_size = cstat.max_xfer_size; 273 info->max_io_blocks = cstat.max_xfer_size / nsstat.sector_size; 274 275 err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers); 276 if (err != 0) { 277 TRACE_ERROR("failed to initialize DMA resource!\n"); 278 nvme_ctrlr_close(info->ctrlr); 279 return err; 280 } 281 282 info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem"); 283 if (info->dma_buffers_sem < 0) { 284 TRACE_ERROR("failed to create DMA buffers semaphore!\n"); 285 nvme_ctrlr_close(info->ctrlr); 286 return info->dma_buffers_sem; 287 } 288 289 // set up rounded-write lock 290 rw_lock_init(&info->rounded_write_lock, "nvme rounded writes"); 291 292 // set up interrupt 293 if (get_module(B_PCI_X86_MODULE_NAME, (module_info**)&sPCIx86Module) 294 != B_OK) { 295 sPCIx86Module = NULL; 296 } 297 298 uint8 irq = info->info.u.h0.interrupt_line; 299 if (sPCIx86Module != NULL) { 300 if (sPCIx86Module->get_msix_count(info->info.bus, info->info.device, 301 info->info.function)) { 302 uint8 msixVector = 0; 303 if (sPCIx86Module->configure_msix(info->info.bus, info->info.device, 304 info->info.function, 1, &msixVector) == B_OK 305 && sPCIx86Module->enable_msix(info->info.bus, info->info.device, 306 info->info.function) == B_OK) { 307 TRACE_ALWAYS("using MSI-X\n"); 308 irq = msixVector; 309 } 310 } else if (sPCIx86Module->get_msi_count(info->info.bus, 311 info->info.device, info->info.function) >= 1) { 312 uint8 msiVector = 0; 313 if (sPCIx86Module->configure_msi(info->info.bus, info->info.device, 314 info->info.function, 1, &msiVector) == B_OK 315 && sPCIx86Module->enable_msi(info->info.bus, info->info.device, 316 info->info.function) == B_OK) { 317 TRACE_ALWAYS("using message signaled interrupts\n"); 318 irq = msiVector; 319 } 320 } 321 } else { 322 uint16 command = pci->read_pci_config(pcidev, PCI_command, 2); 323 command &= ~(PCI_command_int_disable); 324 pci->write_pci_config(pcidev, PCI_command, 2, command); 325 } 326 327 if (irq == 0 || irq == 0xFF) { 328 TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n", 329 info->info.bus, info->info.device, info->info.function); 330 return B_ERROR; 331 } 332 info->interrupt.Init(NULL, NULL); 333 install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info, B_NO_HANDLED_INFO); 334 335 if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) { 336 uint32 microseconds = 16, threshold = 32; 337 nvme_admin_set_feature(info->ctrlr, false, NVME_FEAT_INTERRUPT_COALESCING, 338 ((microseconds / 100) << 8) | threshold, 0, NULL); 339 } 340 341 *_cookie = info; 342 return B_OK; 343 } 344 345 346 static void 347 nvme_disk_uninit_device(void* _cookie) 348 { 349 CALLED(); 350 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 351 352 remove_io_interrupt_handler(info->info.u.h0.interrupt_line, 353 nvme_interrupt_handler, (void*)info); 354 355 rw_lock_destroy(&info->rounded_write_lock); 356 357 nvme_ns_close(info->ns); 358 nvme_ctrlr_close(info->ctrlr); 359 360 // TODO: Deallocate MSI(-X). 361 // TODO: Deallocate PCI. 362 } 363 364 365 static status_t 366 nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie) 367 { 368 CALLED(); 369 370 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 371 nvme_disk_handle* handle = (nvme_disk_handle*)malloc( 372 sizeof(nvme_disk_handle)); 373 if (handle == NULL) 374 return B_NO_MEMORY; 375 376 handle->info = info; 377 378 *_cookie = handle; 379 return B_OK; 380 } 381 382 383 static status_t 384 nvme_disk_close(void* cookie) 385 { 386 CALLED(); 387 388 //nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 389 return B_OK; 390 } 391 392 393 static status_t 394 nvme_disk_free(void* cookie) 395 { 396 CALLED(); 397 398 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 399 free(handle); 400 return B_OK; 401 } 402 403 404 // #pragma mark - I/O 405 406 407 static int32 408 nvme_interrupt_handler(void* _info) 409 { 410 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info; 411 info->interrupt.NotifyAll(); 412 return 0; 413 } 414 415 416 static qpair_info* 417 get_qpair(nvme_disk_driver_info* info) 418 { 419 return &info->qpairs[atomic_add((int32*)&info->next_qpair, 1) 420 % info->qpair_count]; 421 } 422 423 424 static void 425 io_finished_callback(status_t* status, const struct nvme_cpl* cpl) 426 { 427 *status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK; 428 } 429 430 431 static void 432 await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status) 433 { 434 CALLED(); 435 436 ConditionVariableEntry entry; 437 int timeouts = 0; 438 while (status == EINPROGRESS) { 439 info->interrupt.Add(&entry); 440 441 nvme_qpair_poll(qpair, 0); 442 443 if (status != EINPROGRESS) 444 return; 445 446 if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) { 447 // This should never happen, as we are woken up on every interrupt 448 // no matter the qpair or transfer within; so if it does occur, 449 // that probably means the controller stalled or something. 450 451 TRACE_ERROR("timed out waiting for interrupt!\n"); 452 if (timeouts++ >= 3) { 453 nvme_qpair_fail(qpair); 454 status = B_TIMED_OUT; 455 return; 456 } 457 } 458 459 nvme_qpair_poll(qpair, 0); 460 } 461 } 462 463 464 struct nvme_io_request { 465 status_t status; 466 467 bool write; 468 469 off_t lba_start; 470 size_t lba_count; 471 472 physical_entry* iovecs; 473 int32 iovec_count; 474 475 int32 iovec_i; 476 uint32 iovec_offset; 477 }; 478 479 480 void ior_reset_sgl(nvme_io_request* request, uint32_t offset) 481 { 482 TRACE("IOR Reset: %" B_PRIu32 "\n", offset); 483 484 int32 i = 0; 485 while (offset > 0 && request->iovecs[i].size <= offset) { 486 offset -= request->iovecs[i].size; 487 i++; 488 } 489 request->iovec_i = i; 490 request->iovec_offset = offset; 491 } 492 493 494 int ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length) 495 { 496 int32 index = request->iovec_i; 497 if (index < 0 || index > request->iovec_count) 498 return -1; 499 500 *address = request->iovecs[index].address + request->iovec_offset; 501 *length = request->iovecs[index].size - request->iovec_offset; 502 503 TRACE("IOV %d (+ " B_PRIu32 "): 0x%" B_PRIx64 ", %" B_PRIu32 "\n", 504 request->iovec_i, request->iovec_offset, *address, *length); 505 506 request->iovec_i++; 507 request->iovec_offset = 0; 508 return 0; 509 } 510 511 512 static status_t 513 do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request) 514 { 515 request->status = EINPROGRESS; 516 517 qpair_info* qpinfo = get_qpair(info); 518 int ret = -1; 519 if (request->write) { 520 ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start, 521 request->lba_count, (nvme_cmd_cb)io_finished_callback, request, 522 0, (nvme_req_reset_sgl_cb)ior_reset_sgl, 523 (nvme_req_next_sge_cb)ior_next_sge); 524 } else { 525 ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start, 526 request->lba_count, (nvme_cmd_cb)io_finished_callback, request, 527 0, (nvme_req_reset_sgl_cb)ior_reset_sgl, 528 (nvme_req_next_sge_cb)ior_next_sge); 529 } 530 if (ret != 0) { 531 TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE 532 " blocks failed!\n", request->write ? "write" : "read", 533 request->lba_start, request->lba_count); 534 535 request->lba_count = 0; 536 return ret; 537 } 538 539 await_status(info, qpinfo->qpair, request->status); 540 541 if (request->status != B_OK) { 542 TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE 543 " blocks failed!\n", request->write ? "write" : "read", 544 request->lba_start, request->lba_count); 545 546 request->lba_count = 0; 547 } 548 return request->status; 549 } 550 551 552 static status_t 553 nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request) 554 { 555 CALLED(); 556 557 WriteLocker writeLocker; 558 if (request->IsWrite()) 559 writeLocker.SetTo(handle->info->rounded_write_lock, false); 560 561 status_t status = acquire_sem(handle->info->dma_buffers_sem); 562 if (status != B_OK) { 563 request->SetStatusAndNotify(status); 564 return status; 565 } 566 567 const size_t block_size = handle->info->block_size; 568 569 TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR 570 "; Write %s\n", request, request->Offset(), request->Length(), 571 request->IsWrite() ? "yes" : "no"); 572 573 nvme_io_request nvme_request; 574 while (request->RemainingBytes() > 0) { 575 IOOperation operation; 576 status = handle->info->dma_resource.TranslateNext(request, &operation, 0); 577 if (status != B_OK) 578 break; 579 580 size_t transferredBytes = 0; 581 do { 582 TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR 583 ", write: %s\n", request, operation.Offset(), 584 operation.Length(), operation.IsWrite() ? "yes" : "no"); 585 586 nvme_request.write = operation.IsWrite(); 587 nvme_request.lba_start = operation.Offset() / block_size; 588 nvme_request.lba_count = operation.Length() / block_size; 589 nvme_request.iovecs = (physical_entry*)operation.Vecs(); 590 nvme_request.iovec_count = operation.VecCount(); 591 592 status = do_nvme_io_request(handle->info, &nvme_request); 593 if (status == B_OK && nvme_request.write == request->IsWrite()) 594 transferredBytes += operation.OriginalLength(); 595 596 operation.SetStatus(status); 597 } while (status == B_OK && !operation.Finish()); 598 599 if (status == B_OK && operation.Status() != B_OK) { 600 TRACE_ERROR("I/O succeeded but IOOperation failed!\n"); 601 status = operation.Status(); 602 } 603 604 operation.SetTransferredBytes(transferredBytes); 605 request->OperationFinished(&operation, status, status != B_OK, 606 operation.OriginalOffset() + transferredBytes); 607 608 handle->info->dma_resource.RecycleBuffer(operation.Buffer()); 609 610 TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request, 611 strerror(status), request->RemainingBytes()); 612 if (status != B_OK) 613 break; 614 } 615 616 release_sem(handle->info->dma_buffers_sem); 617 618 // Notify() also takes care of UnlockMemory(). 619 if (status != B_OK && request->Status() == B_OK) 620 request->SetStatusAndNotify(status); 621 else 622 request->NotifyFinished(); 623 return status; 624 } 625 626 627 static status_t 628 nvme_disk_io(void* cookie, io_request* request) 629 { 630 CALLED(); 631 632 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 633 634 nvme_io_request nvme_request; 635 memset(&nvme_request, 0, sizeof(nvme_io_request)); 636 637 nvme_request.write = request->IsWrite(); 638 639 physical_entry* vtophys = NULL; 640 MemoryDeleter vtophysDeleter; 641 642 IOBuffer* buffer = request->Buffer(); 643 status_t status = B_OK; 644 if (!buffer->IsPhysical()) { 645 status = buffer->LockMemory(request->TeamID(), request->IsWrite()); 646 if (status != B_OK) { 647 TRACE_ERROR("failed to lock memory: %s\n", strerror(status)); 648 return status; 649 } 650 // SetStatusAndNotify() takes care of unlocking memory if necessary. 651 652 // This is slightly inefficient, as we could use a BStackOrHeapArray in 653 // the optimal case (few physical entries required), but we would not 654 // know whether or not that was possible until calling get_memory_map() 655 // and then potentially reallocating, which would complicate the logic. 656 657 int32 vtophys_length = (request->Length() / B_PAGE_SIZE) + 2; 658 nvme_request.iovecs = vtophys = (physical_entry*)malloc(sizeof(physical_entry) 659 * vtophys_length); 660 if (vtophys == NULL) { 661 TRACE_ERROR("failed to allocate memory for iovecs\n"); 662 request->SetStatusAndNotify(B_NO_MEMORY); 663 return B_NO_MEMORY; 664 } 665 vtophysDeleter.SetTo(vtophys); 666 667 for (size_t i = 0; i < buffer->VecCount(); i++) { 668 generic_io_vec virt = buffer->VecAt(i); 669 uint32 entries = vtophys_length - nvme_request.iovec_count; 670 671 // Avoid copies by going straight into the vtophys array. 672 status = get_memory_map_etc(request->TeamID(), (void*)virt.base, 673 virt.length, vtophys + nvme_request.iovec_count, &entries); 674 if (status == B_BUFFER_OVERFLOW) { 675 TRACE("vtophys array was too small, reallocating\n"); 676 677 vtophysDeleter.Detach(); 678 vtophys_length *= 2; 679 nvme_request.iovecs = vtophys = (physical_entry*)realloc(vtophys, 680 sizeof(physical_entry) * vtophys_length); 681 vtophysDeleter.SetTo(vtophys); 682 if (vtophys == NULL) { 683 status = B_NO_MEMORY; 684 } else { 685 // Try again, with the larger buffer this time. 686 i--; 687 continue; 688 } 689 } 690 if (status != B_OK) { 691 TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status)); 692 request->SetStatusAndNotify(status); 693 return status; 694 } 695 696 nvme_request.iovec_count += entries; 697 } 698 } else { 699 nvme_request.iovecs = (physical_entry*)buffer->Vecs(); 700 nvme_request.iovec_count = buffer->VecCount(); 701 } 702 703 // See if we need to bounce anything other than the first or last vec. 704 const size_t block_size = handle->info->block_size; 705 bool bounceAll = false; 706 for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) { 707 if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0) 708 bounceAll = true; 709 if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0) 710 bounceAll = true; 711 } 712 713 // See if we need to bounce due to the first or last vec. 714 if (nvme_request.iovec_count > 1) { 715 physical_entry* entry = &nvme_request.iovecs[0]; 716 if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0 717 || (entry->size % block_size) != 0)) 718 bounceAll = true; 719 720 entry = &nvme_request.iovecs[nvme_request.iovec_count - 1]; 721 if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0 722 || (entry->size % block_size) != 0)) 723 bounceAll = true; 724 } 725 726 // See if we need to bounce due to rounding. 727 const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size); 728 phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset() 729 - rounded_pos), block_size); 730 if (rounded_pos != request->Offset() || rounded_len != request->Length()) 731 bounceAll = true; 732 733 if (bounceAll) { 734 // Let the bounced I/O routine take care of everything from here. 735 return nvme_disk_bounced_io(handle, request); 736 } 737 738 nvme_request.lba_start = rounded_pos / block_size; 739 nvme_request.lba_count = rounded_len / block_size; 740 741 // No bouncing was required. 742 ReadLocker readLocker; 743 if (nvme_request.write) 744 readLocker.SetTo(handle->info->rounded_write_lock, false); 745 746 // Error check before actually doing I/O. 747 if (status != B_OK) { 748 TRACE_ERROR("I/O failed early: %s\n", strerror(status)); 749 request->SetStatusAndNotify(status); 750 return status; 751 } 752 753 const uint32 max_io_blocks = handle->info->max_io_blocks; 754 int32 remaining = nvme_request.iovec_count; 755 while (remaining > 0) { 756 nvme_request.iovec_count = min_c(remaining, 757 NVME_MAX_SGL_DESCRIPTORS / 2); 758 759 nvme_request.lba_count = 0; 760 for (int i = 0; i < nvme_request.iovec_count; i++) { 761 int32 new_lba_count = nvme_request.lba_count 762 + (nvme_request.iovecs[i].size / block_size); 763 if (nvme_request.lba_count > 0 && new_lba_count > max_io_blocks) { 764 // We already have a nonzero length, and adding this vec would 765 // make us go over (or we already are over.) Stop adding. 766 nvme_request.iovec_count = i; 767 break; 768 } 769 770 nvme_request.lba_count = new_lba_count; 771 } 772 773 status = do_nvme_io_request(handle->info, &nvme_request); 774 if (status != B_OK) 775 break; 776 777 nvme_request.iovecs += nvme_request.iovec_count; 778 remaining -= nvme_request.iovec_count; 779 nvme_request.lba_start += nvme_request.lba_count; 780 } 781 782 if (status != B_OK) 783 TRACE_ERROR("I/O failed: %s\n", strerror(status)); 784 785 request->SetTransferredBytes(status != B_OK, 786 (nvme_request.lba_start * block_size) - rounded_pos); 787 request->SetStatusAndNotify(status); 788 return status; 789 } 790 791 792 static status_t 793 nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length) 794 { 795 CALLED(); 796 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 797 798 const off_t end = (handle->info->capacity * handle->info->block_size); 799 if (pos >= end) 800 return B_BAD_VALUE; 801 if (pos + (off_t)*length > end) 802 *length = end - pos; 803 804 IORequest request; 805 status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0); 806 if (status != B_OK) 807 return status; 808 809 status = nvme_disk_io(handle, &request); 810 *length = request.TransferredBytes(); 811 return status; 812 } 813 814 815 static status_t 816 nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length) 817 { 818 CALLED(); 819 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 820 821 const off_t end = (handle->info->capacity * handle->info->block_size); 822 if (pos >= end) 823 return B_BAD_VALUE; 824 if (pos + (off_t)*length > end) 825 *length = end - pos; 826 827 IORequest request; 828 status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0); 829 if (status != B_OK) 830 return status; 831 832 status = nvme_disk_io(handle, &request); 833 *length = request.TransferredBytes(); 834 return status; 835 } 836 837 838 static status_t 839 nvme_disk_flush(nvme_disk_driver_info* info) 840 { 841 status_t status = EINPROGRESS; 842 843 qpair_info* qpinfo = get_qpair(info); 844 int ret = nvme_ns_flush(info->ns, qpinfo->qpair, 845 (nvme_cmd_cb)io_finished_callback, &status); 846 if (ret != 0) 847 return ret; 848 849 await_status(info, qpinfo->qpair, status); 850 return status; 851 } 852 853 854 static status_t 855 nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length) 856 { 857 CALLED(); 858 nvme_disk_handle* handle = (nvme_disk_handle*)cookie; 859 nvme_disk_driver_info* info = handle->info; 860 861 TRACE("ioctl(op = %" B_PRId32 ")\n", op); 862 863 switch (op) { 864 case B_GET_MEDIA_STATUS: 865 { 866 *(status_t *)buffer = info->media_status; 867 info->media_status = B_OK; 868 return B_OK; 869 break; 870 } 871 872 case B_GET_DEVICE_SIZE: 873 { 874 size_t size = info->capacity * info->block_size; 875 return user_memcpy(buffer, &size, sizeof(size_t)); 876 } 877 878 case B_GET_GEOMETRY: 879 { 880 if (buffer == NULL /*|| length != sizeof(device_geometry)*/) 881 return B_BAD_VALUE; 882 883 device_geometry geometry; 884 status_t status = get_geometry(handle, &geometry); 885 if (status != B_OK) 886 return status; 887 888 return user_memcpy(buffer, &geometry, sizeof(device_geometry)); 889 } 890 891 case B_GET_ICON_NAME: 892 return user_strlcpy((char*)buffer, "devices/drive-harddisk", 893 B_FILE_NAME_LENGTH); 894 895 case B_GET_VECTOR_ICON: 896 { 897 device_icon iconData; 898 if (length != sizeof(device_icon)) 899 return B_BAD_VALUE; 900 if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK) 901 return B_BAD_ADDRESS; 902 903 if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) { 904 if (user_memcpy(iconData.icon_data, kDriveIcon, 905 sizeof(kDriveIcon)) != B_OK) 906 return B_BAD_ADDRESS; 907 } 908 909 iconData.icon_size = sizeof(kDriveIcon); 910 return user_memcpy(buffer, &iconData, sizeof(device_icon)); 911 } 912 913 case B_FLUSH_DRIVE_CACHE: 914 return nvme_disk_flush(info); 915 } 916 917 return B_DEV_INVALID_IOCTL; 918 } 919 920 921 // #pragma mark - driver module API 922 923 924 static float 925 nvme_disk_supports_device(device_node *parent) 926 { 927 CALLED(); 928 929 const char* bus; 930 uint16 baseClass, subClass; 931 932 if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false) != B_OK 933 || sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass, false) != B_OK 934 || sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE, &subClass, false) != B_OK) 935 return -1.0f; 936 937 if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage) 938 return 0.0f; 939 940 if (subClass != PCI_nvm) 941 return 0.0f; 942 943 TRACE("NVMe device found!\n"); 944 return 1.0f; 945 } 946 947 948 static status_t 949 nvme_disk_register_device(device_node* parent) 950 { 951 CALLED(); 952 953 device_attr attrs[] = { 954 { B_DEVICE_PRETTY_NAME, B_STRING_TYPE, { string: "NVMe Disk" } }, 955 { NULL } 956 }; 957 958 return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME, 959 attrs, NULL, NULL); 960 } 961 962 963 static status_t 964 nvme_disk_init_driver(device_node* node, void** cookie) 965 { 966 CALLED(); 967 968 int ret = nvme_lib_init((enum nvme_log_level)0, (enum nvme_log_facility)0, NULL); 969 if (ret != 0) { 970 TRACE_ERROR("libnvme initialization failed!\n"); 971 return ret; 972 } 973 974 nvme_disk_driver_info* info = new nvme_disk_driver_info; 975 if (info == NULL) 976 return B_NO_MEMORY; 977 978 info->media_status = B_OK; 979 info->node = node; 980 981 info->ctrlr = NULL; 982 983 *cookie = info; 984 return B_OK; 985 } 986 987 988 static void 989 nvme_disk_uninit_driver(void* _cookie) 990 { 991 CALLED(); 992 993 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 994 free(info); 995 } 996 997 998 static status_t 999 nvme_disk_register_child_devices(void* _cookie) 1000 { 1001 CALLED(); 1002 1003 nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie; 1004 status_t status; 1005 1006 int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR); 1007 if (id < 0) 1008 return id; 1009 1010 char name[64]; 1011 snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw", 1012 id); 1013 1014 status = sDeviceManager->publish_device(info->node, name, 1015 NVME_DISK_DEVICE_MODULE_NAME); 1016 1017 return status; 1018 } 1019 1020 1021 // #pragma mark - 1022 1023 1024 module_dependency module_dependencies[] = { 1025 { B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager }, 1026 { NULL } 1027 }; 1028 1029 struct device_module_info sNvmeDiskDevice = { 1030 { 1031 NVME_DISK_DEVICE_MODULE_NAME, 1032 0, 1033 NULL 1034 }, 1035 1036 nvme_disk_init_device, 1037 nvme_disk_uninit_device, 1038 NULL, // remove, 1039 1040 nvme_disk_open, 1041 nvme_disk_close, 1042 nvme_disk_free, 1043 nvme_disk_read, 1044 nvme_disk_write, 1045 nvme_disk_io, 1046 nvme_disk_ioctl, 1047 1048 NULL, // select 1049 NULL, // deselect 1050 }; 1051 1052 struct driver_module_info sNvmeDiskDriver = { 1053 { 1054 NVME_DISK_DRIVER_MODULE_NAME, 1055 0, 1056 NULL 1057 }, 1058 1059 nvme_disk_supports_device, 1060 nvme_disk_register_device, 1061 nvme_disk_init_driver, 1062 nvme_disk_uninit_driver, 1063 nvme_disk_register_child_devices, 1064 NULL, // rescan 1065 NULL, // removed 1066 }; 1067 1068 module_info* modules[] = { 1069 (module_info*)&sNvmeDiskDriver, 1070 (module_info*)&sNvmeDiskDevice, 1071 NULL 1072 }; 1073