/*-
 * BSD LICENSE
 *
 * Copyright (c) Intel Corporation. All rights reserved.
 * Copyright (c) 2017, Western Digital Corporation or its affiliates.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of Intel Corporation nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef __NVME_INTERNAL_H__
#define __NVME_INTERNAL_H__

#include "nvme_common.h"
#include "nvme_pci.h"
#include "nvme_intel.h"
#include "nvme_mem.h"

#ifndef __HAIKU__
#include <pthread.h>
#include <sys/user.h> /* PAGE_SIZE */
#else
#include "nvme_platform.h"
#endif

/*
 * List functions.
 */
#define LIST_FOREACH_SAFE(var, head, field, tvar)		\
	for ((var) = LIST_FIRST((head));			\
	     (var) && ((tvar) = LIST_NEXT((var), field), 1);	\
	     (var) = (tvar))

/*
 * Tail queue functions.
 */
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)		\
	for ((var) = TAILQ_FIRST((head));			\
	     (var) && ((tvar) = TAILQ_NEXT((var), field), 1);	\
	     (var) = (tvar))

#define INTEL_DC_P3X00_DEVID	0x09538086

#define NVME_TIMEOUT_INFINITE	UINT64_MAX

/*
 * Some Intel devices support vendor-unique read latency log page even
 * though the log page directory says otherwise.
 */
#define NVME_INTEL_QUIRK_READ_LATENCY	0x1

/*
 * Some Intel devices support vendor-unique write latency log page even
 * though the log page directory says otherwise.
 */
#define NVME_INTEL_QUIRK_WRITE_LATENCY	0x2

/*
 * Some controllers need a delay before starting to check the device
 * readiness, which is done by reading the controller status register
 * rdy bit.
 */
#define NVME_QUIRK_DELAY_BEFORE_CHK_RDY	0x4

/*
 * Some controllers need a delay once the controller status register rdy
 * bit switches from 0 to 1.
 */
#define NVME_QUIRK_DELAY_AFTER_RDY	0x8
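/*
 * Illustrative example (not part of the driver): unlike the plain
 * LIST_FOREACH/TAILQ_FOREACH macros, the *_FOREACH_SAFE variants above
 * keep a look-ahead pointer, so the current element may be unlinked or
 * freed inside the loop body. A minimal sketch, using the tracker lists
 * defined later in this file and a hypothetical tracker_is_stale()
 * predicate:
 *
 *	struct nvme_tracker *tr, *tmp;
 *
 *	LIST_FOREACH_SAFE(tr, &qpair->outstanding_tr, list, tmp) {
 *		if (tracker_is_stale(tr)) {
 *			LIST_REMOVE(tr, list);
 *			LIST_INSERT_HEAD(&qpair->free_tr, tr, list);
 *		}
 *	}
 */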
/*
 * Queues may consist of a contiguous block of physical memory or
 * optionally a non-contiguous set of physical memory pages (defined by
 * a Physical Region Page list).
 */
#define NVME_MAX_PRP_LIST_ENTRIES	(506)

/*
 * For commands requiring more than 2 PRP entries, one PRP will be
 * embedded in the command (prp1), and the rest of the PRP entries
 * will be in a list pointed to by the command (prp2). This means
 * that the real maximum number of PRP entries we support is 506 + 1,
 * which results in a max transfer size of 506 * PAGE_SIZE.
 */
#define NVME_MAX_XFER_SIZE	(NVME_MAX_PRP_LIST_ENTRIES * PAGE_SIZE)

#define NVME_ADMIN_TRACKERS	(16)
#define NVME_ADMIN_ENTRIES	(128)

/*
 * NVME_IO_ENTRIES defines the size of an I/O qpair's submission and
 * completion queues, while NVME_IO_TRACKERS defines the maximum number
 * of I/O commands that we will allow to be outstanding on an I/O qpair
 * at any time. The only advantage of having IO_ENTRIES > IO_TRACKERS
 * is for debugging purposes: when dumping the contents of the submission
 * and completion queues, they will show a longer history of data.
 */
#define NVME_IO_ENTRIES		(1024U)
#define NVME_IO_TRACKERS	(128U)
#define NVME_IO_ENTRIES_VS_TRACKERS_RATIO	(NVME_IO_ENTRIES / NVME_IO_TRACKERS)

/*
 * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in
 * one SGL segment.
 */
#define NVME_MAX_SGL_DESCRIPTORS	(253)

/*
 * NVME_MAX_IO_ENTRIES is not defined, since it is specified in CAP.MQES
 * for each controller.
 */

#define NVME_MAX_ASYNC_EVENTS	(8)

/*
 * NVME_MAX_IO_QUEUES in nvme_spec.h defines the 64K spec limit, but this
 * define specifies the maximum number of queues this driver will
 * actually try to configure, if available.
 */
#define DEFAULT_MAX_IO_QUEUES	(1024)

/*
 * Maximum number of times a failed command can be retried.
 */
#define NVME_MAX_RETRY_COUNT	(3)

/*
 * I/O queue type.
 */
enum nvme_io_queue_type {

	NVME_IO_QTYPE_INVALID = 0,
	NVME_IO_SUBMISSION_QUEUE,
	NVME_IO_COMPLETION_QUEUE,
};

enum nvme_payload_type {

	NVME_PAYLOAD_TYPE_INVALID = 0,

	/*
	 * nvme_request::payload.u.contig is valid for this request.
	 */
	NVME_PAYLOAD_TYPE_CONTIG,

	/*
	 * nvme_request::payload.u.sgl is valid for this request.
	 */
	NVME_PAYLOAD_TYPE_SGL,
};

/*
 * Controller support flags.
 */
enum nvme_ctrlr_flags {

	/*
	 * SGLs are supported.
	 */
	NVME_CTRLR_SGL_SUPPORTED = 0x1,

};

/*
 * Descriptor for a request data payload.
 *
 * This struct is arranged so that it fits nicely in struct nvme_request.
 */
struct __attribute__((packed)) nvme_payload {

	union {
		/*
		 * Virtual memory address of a single physically
		 * contiguous buffer.
		 */
		void *contig;

		/*
		 * Callback functions for retrieving physical
		 * addresses for scattered payloads.
		 */
		struct {
			nvme_req_reset_sgl_cb reset_sgl_fn;
			nvme_req_next_sge_cb next_sge_fn;
			void *cb_arg;
		} sgl;
	} u;

	/*
	 * Virtual memory address of a single physically
	 * contiguous metadata buffer.
	 */
	void *md;

	/*
	 * Payload type.
	 */
	uint8_t type;

};
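/*
 * Worked example (illustrative only): with 4 KiB pages, the limits above
 * give
 *
 *	NVME_MAX_XFER_SIZE = 506 * 4096 = 2,072,576 bytes (~2 MiB)
 *
 * A maximal request uses prp1 embedded in the command plus a 506-entry
 * PRP list referenced by prp2, i.e. 507 PRP entries in total. Capping
 * the transfer at 506 * PAGE_SIZE (rather than 507) presumably leaves
 * room for prp1 to carry a non-zero page offset for unaligned buffers.
 */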
struct nvme_request {

	/*
	 * NVMe command: must be aligned on a 64B boundary.
	 */
	struct nvme_cmd cmd;

	/*
	 * Data payload for this request's command.
	 */
	struct nvme_payload payload;

	uint8_t retries;

	/*
	 * Number of child requests still outstanding for a request
	 * that was split into multiple child requests.
	 */
	uint8_t child_reqs;
	uint32_t payload_size;

	/*
	 * Offset in bytes from the beginning of the payload for this
	 * request. This is used for I/O commands that are split into
	 * multiple requests.
	 */
	uint32_t payload_offset;
	uint32_t md_offset;

	nvme_cmd_cb cb_fn;
	void *cb_arg;

	/*
	 * The following members should not be reordered with members
	 * above. These members are only needed when splitting
	 * requests, which is done rarely, and the driver is careful
	 * not to touch the following fields until a split operation is
	 * needed, to avoid touching an extra cacheline.
	 */

	/*
	 * Points to the outstanding child requests for a parent request.
	 * Only valid if a request was split into multiple child
	 * requests, and is not initialized for non-split requests.
	 */
	TAILQ_HEAD(, nvme_request) children;

	/*
	 * Linked-list pointers for a child request in its parent's list.
	 */
	TAILQ_ENTRY(nvme_request) child_tailq;

	/*
	 * For queueing in qpair queued_req or free_req.
	 */
	struct nvme_qpair *qpair;
	STAILQ_ENTRY(nvme_request) stailq;

	/*
	 * Points to a parent request if part of a split request,
	 * NULL otherwise.
	 */
	struct nvme_request *parent;

	/*
	 * Completion status for a parent request. Initialized to all 0's
	 * (SUCCESS) before child requests are submitted. If a child
	 * request completes with an error, the error status is copied here,
	 * to ensure that the parent request is also completed with an error
	 * status once all child requests are completed.
	 */
	struct nvme_cpl parent_status;

} __attribute__((aligned(64)));

struct nvme_completion_poll_status {
	struct nvme_cpl cpl;
	bool done;
};

struct nvme_async_event_request {
	struct nvme_ctrlr *ctrlr;
	struct nvme_request *req;
	struct nvme_cpl cpl;
};

struct nvme_tracker {

	LIST_ENTRY(nvme_tracker) list;

	struct nvme_request *req;
#if INTPTR_MAX == INT32_MAX
	int32_t __pad[3];
#elif !defined(INTPTR_MAX)
# error Need definition of INTPTR_MAX!
#endif

	uint16_t cid;

	uint16_t rsvd1: 15;
	uint16_t active: 1;

	uint32_t rsvd2;

	uint64_t prp_sgl_bus_addr;

	union {
		uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES];
		struct nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS];
	} u;

	uint64_t rsvd3;
};

/*
 * struct nvme_tracker must be exactly 4K so that the prp[] array does
 * not cross a page boundary and so that there is no padding required
 * to meet alignment requirements.
 */
nvme_static_assert(sizeof(struct nvme_tracker) == 4096,
		   "nvme_tracker is not 4K");
nvme_static_assert((offsetof(struct nvme_tracker, u.sgl) & 7) == 0,
		   "SGL must be Qword aligned");
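/*
 * Size check (illustrative, LP64 layout): the fields of struct
 * nvme_tracker add up to exactly one 4 KiB page,
 *
 *	list (2 pointers)        16
 *	req                       8
 *	cid + rsvd1/active        4
 *	rsvd2                     4
 *	prp_sgl_bus_addr          8
 *	u (506 * 8 == 253 * 16)  4048
 *	rsvd3                     8
 *	                        -----
 *	                         4096
 *
 * so, provided trackers are carved out of page-aligned memory, the
 * embedded PRP list never straddles a page boundary. (An NVMe SGL
 * descriptor is 16 bytes, hence 253 * 16 == 506 * 8.)
 */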
struct nvme_qpair {

	/*
	 * Guards access to this structure.
	 */
	pthread_mutex_t lock;

	volatile uint32_t *sq_tdbl;
	volatile uint32_t *cq_hdbl;

	/*
	 * Submission queue.
	 */
	struct nvme_cmd *cmd;

	/*
	 * Completion queue.
	 */
	struct nvme_cpl *cpl;

	LIST_HEAD(, nvme_tracker) free_tr;
	LIST_HEAD(, nvme_tracker) outstanding_tr;

	/*
	 * Number of trackers, and array of trackers indexed by command ID.
	 */
	uint16_t trackers;
	struct nvme_tracker *tr;

	struct nvme_request *reqs;
	unsigned int num_reqs;
	STAILQ_HEAD(, nvme_request) free_req;
	STAILQ_HEAD(, nvme_request) queued_req;

	uint16_t id;

	uint16_t entries;
	uint16_t sq_tail;
	uint16_t cq_head;

	uint8_t phase;

	bool enabled;
	bool sq_in_cmb;

	/*
	 * Fields below this point should not be touched on the
	 * normal I/O happy path.
	 */

	uint8_t qprio;

	struct nvme_ctrlr *ctrlr;

	/* List entry for nvme_ctrlr::free_io_qpairs and active_io_qpairs */
	TAILQ_ENTRY(nvme_qpair) tailq;

	phys_addr_t cmd_bus_addr;
	phys_addr_t cpl_bus_addr;
};

struct nvme_ns {

	struct nvme_ctrlr *ctrlr;

	uint32_t stripe_size;
	uint32_t sector_size;

	uint32_t md_size;
	uint32_t pi_type;

	uint32_t sectors_per_max_io;
	uint32_t sectors_per_stripe;

	uint16_t id;
	uint16_t flags;

	int open_count;

};

/*
 * State of struct nvme_ctrlr (in particular, during initialization).
 */
enum nvme_ctrlr_state {

	/*
	 * Controller has not been initialized yet.
	 */
	NVME_CTRLR_STATE_INIT = 0,

	/*
	 * Waiting for CSTS.RDY to transition from 0 to 1
	 * so that CC.EN may be set to 0.
	 */
	NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1,

	/*
	 * Waiting for CSTS.RDY to transition from 1 to 0
	 * so that CC.EN may be set to 1.
	 */
	NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0,

	/*
	 * Waiting for CSTS.RDY to transition from 0 to 1
	 * after enabling the controller.
	 */
	NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1,

	/*
	 * Controller initialization has completed and
	 * the controller is ready.
	 */
	NVME_CTRLR_STATE_READY
};
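/*
 * Illustrative sketch (not the driver's actual completion path): the
 * phase, cq_head and cq_hdbl fields of struct nvme_qpair above are used
 * the way the NVMe specification describes completion queue consumption.
 * The controller toggles the phase tag of each completion entry on every
 * pass through the queue, so "new" entries are those whose phase tag
 * matches qpair->phase. Assuming struct nvme_cpl exposes the phase tag
 * and command ID as status.p and cid (field names assumed here), the
 * consume loop looks roughly like:
 *
 *	struct nvme_cpl *cpl = &qpair->cpl[qpair->cq_head];
 *
 *	while (cpl->status.p == qpair->phase) {
 *		// ... complete the tracker identified by cpl->cid ...
 *		if (++qpair->cq_head == qpair->entries) {
 *			qpair->cq_head = 0;
 *			qpair->phase = !qpair->phase;  // wrapped: expect the other phase
 *		}
 *		cpl = &qpair->cpl[qpair->cq_head];
 *	}
 *	nvme_mmio_write_4((__u32 *)qpair->cq_hdbl, qpair->cq_head);
 */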
/*
 * One of these per allocated PCI device.
 */
struct nvme_ctrlr {

	/*
	 * NVMe MMIO register space.
	 */
	volatile struct nvme_registers *regs;

	/*
	 * Array of I/O queue pairs.
	 */
	struct nvme_qpair *ioq;

	/*
	 * Size of the array of I/O queue pairs.
	 */
	unsigned int io_queues;

	/*
	 * Maximum number of I/O queue pairs.
	 */
	unsigned int max_io_queues;

	/*
	 * Number of I/O queue pairs enabled.
	 */
	unsigned int enabled_io_qpairs;

	/*
	 * Maximum number of entries for I/O qpairs.
	 */
	unsigned int io_qpairs_max_entries;

	/*
	 * Number of namespaces and array of namespaces.
	 */
	unsigned int nr_ns;
	struct nvme_ns *ns;

	/*
	 * Controller state.
	 */
	bool resetting;
	bool failed;

	/*
	 * Controller support flags.
	 */
	uint64_t flags;

	/*
	 * Cold data (not accessed in the normal I/O path) is after this point.
	 */
	enum nvme_ctrlr_state state;
	uint64_t state_timeout_ms;

	/*
	 * All the log pages supported.
	 */
	bool log_page_supported[256];

	/*
	 * All the features supported.
	 */
	bool feature_supported[256];

	/*
	 * Associated PCI device information.
	 */
	struct pci_device *pci_dev;

	/*
	 * Maximum I/O size in bytes.
	 */
	uint32_t max_xfer_size;

	/*
	 * Minimum page size supported by this controller in bytes.
	 */
	uint32_t min_page_size;

	/*
	 * Stride in uint32_t units between doorbell registers
	 * (1 = 4 bytes, 2 = 8 bytes, ...).
	 */
	uint32_t doorbell_stride_u32;

	uint32_t num_aers;
	struct nvme_async_event_request aer[NVME_MAX_ASYNC_EVENTS];
	nvme_aer_cb aer_cb_fn;
	void *aer_cb_arg;

	/*
	 * Admin queue pair.
	 */
	struct nvme_qpair adminq;

	/*
	 * Guards access to the controller itself.
	 */
	pthread_mutex_t lock;

	/*
	 * Identify Controller data.
	 */
	struct nvme_ctrlr_data cdata;

	/*
	 * Array of Identify Namespace data.
	 * Stored separately from ns since nsdata should
	 * not normally be accessed during I/O.
	 */
	struct nvme_ns_data *nsdata;

	TAILQ_HEAD(, nvme_qpair) free_io_qpairs;
	TAILQ_HEAD(, nvme_qpair) active_io_qpairs;

	/*
	 * Controller options set on open.
	 */
	struct nvme_ctrlr_opts opts;

	/*
	 * BAR mapping address which contains the controller memory buffer.
	 */
	void *cmb_bar_virt_addr;

	/*
	 * BAR physical address which contains the controller memory buffer.
	 */
	uint64_t cmb_bar_phys_addr;

	/*
	 * Controller memory buffer size in bytes.
	 */
	uint64_t cmb_size;

	/*
	 * Current offset of the controller memory buffer.
	 */
	uint64_t cmb_current_offset;

	/*
	 * Quirks flags.
	 */
	unsigned int quirks;

	/*
	 * For the controller list.
	 */
	LIST_ENTRY(nvme_ctrlr) link;

} __attribute__((aligned(PAGE_SIZE)));

/*
 * Admin functions.
 */
extern int nvme_admin_identify_ctrlr(struct nvme_ctrlr *ctrlr,
				     struct nvme_ctrlr_data *cdata);

extern int nvme_admin_get_feature(struct nvme_ctrlr *ctrlr,
				  enum nvme_feat_sel sel,
				  enum nvme_feat feature,
				  uint32_t cdw11, uint32_t *attributes);

extern int nvme_admin_set_feature(struct nvme_ctrlr *ctrlr,
				  bool save,
				  enum nvme_feat feature,
				  uint32_t cdw11, uint32_t cdw12,
				  uint32_t *attributes);

extern int nvme_admin_format_nvm(struct nvme_ctrlr *ctrlr,
				 unsigned int nsid,
				 struct nvme_format *format);

extern int nvme_admin_get_log_page(struct nvme_ctrlr *ctrlr,
				   uint8_t log_page, uint32_t nsid,
				   void *payload, uint32_t payload_size);

extern int nvme_admin_abort_cmd(struct nvme_ctrlr *ctrlr,
				uint16_t cid, uint16_t sqid);

extern int nvme_admin_create_ioq(struct nvme_ctrlr *ctrlr,
				 struct nvme_qpair *io_que,
				 enum nvme_io_queue_type io_qtype);

extern int nvme_admin_delete_ioq(struct nvme_ctrlr *ctrlr,
				 struct nvme_qpair *qpair,
				 enum nvme_io_queue_type io_qtype);

extern int nvme_admin_identify_ns(struct nvme_ctrlr *ctrlr,
				  uint16_t nsid,
				  struct nvme_ns_data *nsdata);

extern int nvme_admin_attach_ns(struct nvme_ctrlr *ctrlr,
				uint32_t nsid,
				struct nvme_ctrlr_list *clist);

extern int nvme_admin_detach_ns(struct nvme_ctrlr *ctrlr,
				uint32_t nsid,
				struct nvme_ctrlr_list *clist);

extern int nvme_admin_create_ns(struct nvme_ctrlr *ctrlr,
				struct nvme_ns_data *nsdata,
				unsigned int *nsid);

extern int nvme_admin_delete_ns(struct nvme_ctrlr *ctrlr,
				unsigned int nsid);

extern int nvme_admin_fw_commit(struct nvme_ctrlr *ctrlr,
				const struct nvme_fw_commit *fw_commit);

extern int nvme_admin_fw_image_dl(struct nvme_ctrlr *ctrlr,
				  void *fw, uint32_t size, uint32_t offset);
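/*
 * Note on doorbell addressing (illustrative): per the NVMe specification,
 * the doorbell registers start at byte offset 0x1000 of the MMIO register
 * space and are spaced (4 << CAP.DSTRD) bytes apart, which is what the
 * doorbell_stride_u32 field of struct nvme_ctrlr above caches in uint32_t
 * units (doorbell_stride_u32 = 1 << CAP.DSTRD). For a queue pair with id
 * 'qid', the submission queue tail and completion queue head doorbells
 * therefore sit at the following dword indexes from the first doorbell:
 *
 *	sq_tdbl index = (2 * qid)     * ctrlr->doorbell_stride_u32;
 *	cq_hdbl index = (2 * qid + 1) * ctrlr->doorbell_stride_u32;
 *
 * which is presumably how the sq_tdbl/cq_hdbl pointers in struct
 * nvme_qpair are derived when a queue pair is constructed.
 */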
extern void nvme_request_completion_poll_cb(void *arg,
					    const struct nvme_cpl *cpl);

extern struct nvme_ctrlr *nvme_ctrlr_attach(struct pci_device *pci_dev,
					    struct nvme_ctrlr_opts *opts);

extern void nvme_ctrlr_detach(struct nvme_ctrlr *ctrlr);

extern int nvme_qpair_construct(struct nvme_ctrlr *ctrlr,
				struct nvme_qpair *qpair, enum nvme_qprio qprio,
				uint16_t entries, uint16_t trackers);

extern void nvme_qpair_destroy(struct nvme_qpair *qpair);
extern void nvme_qpair_enable(struct nvme_qpair *qpair);
extern void nvme_qpair_disable(struct nvme_qpair *qpair);
extern int nvme_qpair_submit_request(struct nvme_qpair *qpair,
				     struct nvme_request *req);
extern void nvme_qpair_reset(struct nvme_qpair *qpair);
extern void nvme_qpair_fail(struct nvme_qpair *qpair);

extern int nvme_request_pool_construct(struct nvme_qpair *qpair);

extern void nvme_request_pool_destroy(struct nvme_qpair *qpair);

extern struct nvme_request *nvme_request_allocate(struct nvme_qpair *qpair,
		const struct nvme_payload *payload, uint32_t payload_size,
		nvme_cmd_cb cb_fn, void *cb_arg);

extern struct nvme_request *nvme_request_allocate_null(struct nvme_qpair *qpair,
							nvme_cmd_cb cb_fn,
							void *cb_arg);

extern struct nvme_request *
nvme_request_allocate_contig(struct nvme_qpair *qpair,
			     void *buffer, uint32_t payload_size,
			     nvme_cmd_cb cb_fn, void *cb_arg);

extern void nvme_request_free(struct nvme_request *req);
extern void nvme_request_free_locked(struct nvme_request *req);

extern void nvme_request_add_child(struct nvme_request *parent,
				   struct nvme_request *child);

extern void nvme_request_remove_child(struct nvme_request *parent,
				      struct nvme_request *child);

extern unsigned int nvme_ctrlr_get_quirks(struct pci_device *pdev);

extern int nvme_ns_construct(struct nvme_ctrlr *ctrlr,
			     struct nvme_ns *ns, unsigned int id);

/*
 * MMIO register access.
 */
#define nvme_reg_mmio_read_4(sc, reg)		\
	nvme_mmio_read_4((__u32 *)&(sc)->regs->reg)

#define nvme_reg_mmio_read_8(sc, reg)		\
	nvme_mmio_read_8((__u64 *)&(sc)->regs->reg)

#define nvme_reg_mmio_write_4(sc, reg, val)	\
	nvme_mmio_write_4((__u32 *)&(sc)->regs->reg, val)

#define nvme_reg_mmio_write_8(sc, reg, val)	\
	nvme_mmio_write_8((__u64 *)&(sc)->regs->reg, val)

#endif /* __NVME_INTERNAL_H__ */