xref: /haiku/src/add-ons/kernel/drivers/disk/nvme/libnvme/nvme_ctrlr.c (revision 0dbb417d43214d4b72df6a8a383c1f91926ce521)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2017, Western Digital Corporation or its affiliates.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "nvme_internal.h"
35 
36 /*
37  * Host software shall wait a minimum of CAP.TO x 500 milliseconds for CSTS.RDY
38  * to be set to '1' after setting CC.EN to '1' from a previous value of '0'.
39  */
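/*
 * Example: a controller reporting CAP.TO == 10 requires the host to allow
 * up to 10 x 500 ms = 5000 ms for CSTS.RDY to change state.
 */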
40 static inline unsigned int
41 nvme_ctrlr_get_ready_to_in_ms(struct nvme_ctrlr *ctrlr)
42 {
43 	union nvme_cap_register	cap;
44 
45 /* The CAP.TO unit, in milliseconds */
46 #define NVME_READY_TIMEOUT_UNIT 500
47 
48 	cap.raw = nvme_reg_mmio_read_8(ctrlr, cap.raw);
49 
50 	return (NVME_READY_TIMEOUT_UNIT * cap.bits.to);
51 }
52 
53 /*
54  * Create a queue pair.
55  */
56 static int nvme_ctrlr_create_qpair(struct nvme_ctrlr *ctrlr,
57 				   struct nvme_qpair *qpair)
58 {
59 	int ret;
60 
61 	/* Create the completion queue */
62 	ret = nvme_admin_create_ioq(ctrlr, qpair, NVME_IO_COMPLETION_QUEUE);
63 	if (ret != 0) {
64 		nvme_notice("Create completion queue %u failed\n",
65 			    qpair->id);
66 		return ret;
67 	}
68 
69 	/* Create the submission queue */
70 	ret = nvme_admin_create_ioq(ctrlr, qpair, NVME_IO_SUBMISSION_QUEUE);
71 	if (ret != 0) {
72 		/* Attempt to delete the completion queue */
73 		nvme_notice("Create submission queue %u failed\n",
74 			    qpair->id);
75 		nvme_admin_delete_ioq(ctrlr, qpair, NVME_IO_COMPLETION_QUEUE);
76 		return ret;
77 	}
78 
79 	nvme_qpair_reset(qpair);
80 
81 	return 0;
82 }
83 
84 /*
85  * Delete a queue pair.
86  */
87 static int nvme_ctrlr_delete_qpair(struct nvme_ctrlr *ctrlr,
88 				   struct nvme_qpair *qpair)
89 {
90 	int ret;
91 
92 	/* Delete the submission queue */
93 	ret = nvme_admin_delete_ioq(ctrlr, qpair, NVME_IO_SUBMISSION_QUEUE);
94 	if (ret != 0) {
95 		nvme_notice("Delete submission queue %u failed\n",
96 			    qpair->id);
97 		return ret;
98 	}
99 
100 	/* Delete the completion queue */
101 	ret = nvme_admin_delete_ioq(ctrlr, qpair, NVME_IO_COMPLETION_QUEUE);
102 	if (ret != 0) {
103 		nvme_notice("Delete completion queue %u failed\n",
104 			    qpair->id);
105 		return ret;
106 	}
107 
108 	return 0;
109 }
110 
111 /*
112  * Build the list of supported Intel vendor-specific log pages.
113  */
114 static void
115 nvme_ctrlr_construct_intel_support_log_page_list(struct nvme_ctrlr *ctrlr,
116 				struct nvme_intel_log_page_dir *log_page_dir)
117 {
118 
119 	if (ctrlr->cdata.vid != NVME_PCI_VID_INTEL ||
120 	    log_page_dir == NULL)
121 		return;
122 
123 	ctrlr->log_page_supported[NVME_INTEL_LOG_PAGE_DIR] = true;
124 
125 	if (log_page_dir->read_latency_log_len ||
126 	    (ctrlr->quirks & NVME_INTEL_QUIRK_READ_LATENCY))
127 		ctrlr->log_page_supported[NVME_INTEL_LOG_READ_CMD_LATENCY] = true;
128 
129 	if (log_page_dir->write_latency_log_len ||
130 	    (ctrlr->quirks & NVME_INTEL_QUIRK_WRITE_LATENCY))
131 		ctrlr->log_page_supported[NVME_INTEL_LOG_WRITE_CMD_LATENCY] = true;
132 
133 	if (log_page_dir->temperature_statistics_log_len)
134 		ctrlr->log_page_supported[NVME_INTEL_LOG_TEMPERATURE] = true;
135 
136 	if (log_page_dir->smart_log_len)
137 		ctrlr->log_page_supported[NVME_INTEL_LOG_SMART] = true;
138 
139 	if (log_page_dir->marketing_description_log_len)
140 		ctrlr->log_page_supported[NVME_INTEL_MARKETING_DESCRIPTION] = true;
141 }
142 
143 /*
144  * Get the Intel log page directory and record the supported log pages.
145  */
146 static int nvme_ctrlr_set_intel_support_log_pages(struct nvme_ctrlr *ctrlr)
147 {
148 	struct nvme_intel_log_page_dir *log_page_dir;
149 	int ret;
150 
151 	log_page_dir = nvme_zmalloc(sizeof(struct nvme_intel_log_page_dir), 64);
152 	if (!log_page_dir) {
153 		nvme_err("Allocate log_page_directory failed\n");
154 		return ENOMEM;
155 	}
156 
157 	ret = nvme_admin_get_log_page(ctrlr, NVME_INTEL_LOG_PAGE_DIR,
158 				      NVME_GLOBAL_NS_TAG,
159 				      log_page_dir,
160 				      sizeof(struct nvme_intel_log_page_dir));
161 	if (ret != 0)
162 		nvme_notice("Get NVME_INTEL_LOG_PAGE_DIR log page failed\n");
163 	else
164 		nvme_ctrlr_construct_intel_support_log_page_list(ctrlr,
165 								 log_page_dir);
166 
167 	nvme_free(log_page_dir);
168 
169 	return ret;
170 }
171 
172 /*
173  * Initialize log page support directory.
174  */
175 static void nvme_ctrlr_set_supported_log_pages(struct nvme_ctrlr *ctrlr)
176 {
177 
178 	memset(ctrlr->log_page_supported, 0, sizeof(ctrlr->log_page_supported));
179 
180 	/* Mandatory pages */
181 	ctrlr->log_page_supported[NVME_LOG_ERROR] = true;
182 	ctrlr->log_page_supported[NVME_LOG_HEALTH_INFORMATION] = true;
183 	ctrlr->log_page_supported[NVME_LOG_FIRMWARE_SLOT] = true;
184 
185 	if (ctrlr->cdata.lpa.celp)
186 		ctrlr->log_page_supported[NVME_LOG_COMMAND_EFFECTS_LOG] = true;
187 
188 	if (ctrlr->cdata.vid == NVME_PCI_VID_INTEL)
189 		nvme_ctrlr_set_intel_support_log_pages(ctrlr);
190 }
191 
192 /*
193  * Set Intel device features.
194  */
195 static void nvme_ctrlr_set_intel_supported_features(struct nvme_ctrlr *ctrlr)
196 {
197 	bool *supported_feature = ctrlr->feature_supported;
198 
199 	supported_feature[NVME_INTEL_FEAT_MAX_LBA] = true;
201 	supported_feature[NVME_INTEL_FEAT_NATIVE_MAX_LBA] = true;
202 	supported_feature[NVME_INTEL_FEAT_POWER_GOVERNOR_SETTING] = true;
203 	supported_feature[NVME_INTEL_FEAT_SMBUS_ADDRESS] = true;
204 	supported_feature[NVME_INTEL_FEAT_LED_PATTERN] = true;
205 	supported_feature[NVME_INTEL_FEAT_RESET_TIMED_WORKLOAD_COUNTERS] = true;
206 	supported_feature[NVME_INTEL_FEAT_LATENCY_TRACKING] = true;
207 }
208 
209 /*
210  * Set device features.
211  */
212 static void nvme_ctrlr_set_supported_features(struct nvme_ctrlr *ctrlr)
213 {
214 	bool *supported_feature = ctrlr->feature_supported;
215 
216 	memset(ctrlr->feature_supported, 0, sizeof(ctrlr->feature_supported));
217 
218 	/* Mandatory features */
219 	supported_feature[NVME_FEAT_ARBITRATION] = true;
220 	supported_feature[NVME_FEAT_POWER_MANAGEMENT] = true;
221 	supported_feature[NVME_FEAT_TEMPERATURE_THRESHOLD] = true;
222 	supported_feature[NVME_FEAT_ERROR_RECOVERY] = true;
223 	supported_feature[NVME_FEAT_NUMBER_OF_QUEUES] = true;
224 	supported_feature[NVME_FEAT_INTERRUPT_COALESCING] = true;
225 	supported_feature[NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION] = true;
226 	supported_feature[NVME_FEAT_WRITE_ATOMICITY] = true;
227 	supported_feature[NVME_FEAT_ASYNC_EVENT_CONFIGURATION] = true;
228 
229 	/* Optional features */
230 	if (ctrlr->cdata.vwc.present)
231 		supported_feature[NVME_FEAT_VOLATILE_WRITE_CACHE] = true;
232 	if (ctrlr->cdata.apsta.supported)
233 		supported_feature[NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION]
234 			= true;
235 	if (ctrlr->cdata.hmpre)
236 		supported_feature[NVME_FEAT_HOST_MEM_BUFFER] = true;
237 	if (ctrlr->cdata.vid == NVME_PCI_VID_INTEL)
238 		nvme_ctrlr_set_intel_supported_features(ctrlr);
239 }
240 
241 /*
242  * Initialize I/O queue pairs.
243  */
244 static int nvme_ctrlr_init_io_qpairs(struct nvme_ctrlr *ctrlr)
245 {
246 	struct nvme_qpair *qpair;
247 	union nvme_cap_register	cap;
248 	uint32_t i;
249 
250 	if (ctrlr->ioq != NULL)
251 		/*
252 		 * io_qpairs were already constructed, so just return.
253 		 * This typically happens when the controller is
254 		 * initialized a second (or subsequent) time after a
255 		 * controller reset.
256 		 */
257 		return 0;
258 
259 	/*
260 	 * The NVMe spec sets a hard limit of 64K max queue entries, but
261 	 * devices may specify a smaller limit, so we need to check the MQES
262 	 * field in the capabilities register (MQES is 0-based, hence the + 1).
263 	 */
264 	cap.raw = nvme_reg_mmio_read_8(ctrlr, cap.raw);
265 	ctrlr->io_qpairs_max_entries =
266 		nvme_min(NVME_IO_ENTRIES, (unsigned int)cap.bits.mqes + 1);
267 
268 	ctrlr->ioq = calloc(ctrlr->io_queues, sizeof(struct nvme_qpair));
269 	if (!ctrlr->ioq)
270 		return ENOMEM;
271 
272 	/* Keep queue pair ID 0 for the admin queue */
273 	for (i = 0; i < ctrlr->io_queues; i++) {
274 		qpair = &ctrlr->ioq[i];
275 		qpair->id = i + 1;
276 		TAILQ_INSERT_TAIL(&ctrlr->free_io_qpairs, qpair, tailq);
277 	}
278 
279 	return 0;
280 }
281 
282 /*
283  * Shutdown a controller.
284  */
285 static void nvme_ctrlr_shutdown(struct nvme_ctrlr *ctrlr)
286 {
287 	union nvme_cc_register	cc;
288 	union nvme_csts_register csts;
289 	int ms_waited = 0;
290 
291 	cc.raw = nvme_reg_mmio_read_4(ctrlr, cc.raw);
292 	cc.bits.shn = NVME_SHN_NORMAL;
293 	nvme_reg_mmio_write_4(ctrlr, cc.raw, cc.raw);
294 
295 	csts.raw = nvme_reg_mmio_read_4(ctrlr, csts.raw);
296 	/*
297 	 * The NVMe spec does not define a timeout period for shutdown
298 	 * notification, so we just pick 5 seconds as a reasonable amount
299 	 * of time to wait before proceeding.
300 	 */
301 #define NVME_CTRLR_SHUTDOWN_TIMEOUT 5000
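	/*
	 * The timeout value is in milliseconds; the loop below polls
	 * CSTS.SHST once per millisecond.
	 */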
302 	while (csts.bits.shst != NVME_SHST_COMPLETE) {
303 		nvme_usleep(1000);
304 		csts.raw = nvme_reg_mmio_read_4(ctrlr, csts.raw);
305 		if (ms_waited++ >= NVME_CTRLR_SHUTDOWN_TIMEOUT)
306 			break;
307 	}
308 
309 	if (csts.bits.shst != NVME_SHST_COMPLETE)
310 		nvme_err("Controller did not shutdown within %d seconds\n",
311 			 NVME_CTRLR_SHUTDOWN_TIMEOUT / 1000);
312 }
313 
314 /*
315  * Enable a controller.
316  */
317 static int nvme_ctrlr_enable(struct nvme_ctrlr *ctrlr)
318 {
319 	union nvme_cc_register	cc;
320 	union nvme_aqa_register	aqa;
321 	union nvme_cap_register	cap;
322 
323 	cc.raw = nvme_reg_mmio_read_4(ctrlr, cc.raw);
324 
325 	if (cc.bits.en != 0) {
326 		nvme_err("COntroller enable called with CC.EN = 1\n");
327 		return EINVAL;
328 	}
329 
330 	nvme_reg_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
331 	nvme_reg_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
332 
333 	aqa.raw = 0;
334 	/* acqs and asqs are 0-based. */
335 	aqa.bits.acqs = ctrlr->adminq.entries - 1;
336 	aqa.bits.asqs = ctrlr->adminq.entries - 1;
337 	nvme_reg_mmio_write_4(ctrlr, aqa.raw, aqa.raw);
338 
339 	cc.bits.en = 1;
340 	cc.bits.css = 0;
341 	cc.bits.shn = 0;
342 	cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */
343 	cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */
344 
345 	/* Page size is 2 ^ (12 + mps). */
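	/* e.g. with 4 KB host pages (PAGE_SHIFT == 12), mps is set to 0. */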
346 	cc.bits.mps = PAGE_SHIFT - 12;
347 
348 	cap.raw = nvme_reg_mmio_read_8(ctrlr, cap.raw);
349 
350 	switch (ctrlr->opts.arb_mechanism) {
351 	case NVME_CC_AMS_RR:
352 		break;
353 	case NVME_CC_AMS_WRR:
354 		if (NVME_CAP_AMS_WRR & cap.bits.ams)
355 			break;
356 		return EINVAL;
357 	case NVME_CC_AMS_VS:
358 		if (NVME_CAP_AMS_VS & cap.bits.ams)
359 			break;
360 		return EINVAL;
361 	default:
362 		return EINVAL;
363 	}
364 
365 	cc.bits.ams = ctrlr->opts.arb_mechanism;
366 
367 	nvme_reg_mmio_write_4(ctrlr, cc.raw, cc.raw);
368 
369 	return 0;
370 }
371 
372 /*
373  * Disable a controller.
374  */
375 static inline void nvme_ctrlr_disable(struct nvme_ctrlr *ctrlr)
376 {
377 	union nvme_cc_register cc;
378 
379 	cc.raw = nvme_reg_mmio_read_4(ctrlr, cc.raw);
380 	cc.bits.en = 0;
381 
382 	nvme_reg_mmio_write_4(ctrlr, cc.raw, cc.raw);
383 }
384 
385 /*
386  * Test if a controller is enabled.
387  */
388 static inline int nvme_ctrlr_enabled(struct nvme_ctrlr *ctrlr)
389 {
390 	union nvme_cc_register cc;
391 
392 	cc.raw = nvme_reg_mmio_read_4(ctrlr, cc.raw);
393 
394 	return cc.bits.en;
395 }
396 
397 /*
398  * Test if a controller is ready.
399  */
400 static inline int nvme_ctrlr_ready(struct nvme_ctrlr *ctrlr)
401 {
402 	union nvme_csts_register csts;
403 
404 	csts.raw = nvme_reg_mmio_read_4(ctrlr, csts.raw);
405 
406 	return csts.bits.rdy;
407 }
408 
409 /*
410  * Set a controller state.
411  */
412 static void nvme_ctrlr_set_state(struct nvme_ctrlr *ctrlr,
413 				 enum nvme_ctrlr_state state,
414 				 uint64_t timeout_in_ms)
415 {
416 	ctrlr->state = state;
417 	if (timeout_in_ms == NVME_TIMEOUT_INFINITE)
418 		ctrlr->state_timeout_ms = NVME_TIMEOUT_INFINITE;
419 	else
420 		ctrlr->state_timeout_ms = nvme_time_msec() + timeout_in_ms;
421 }
422 
423 /*
424  * Get the controller identify data.
425  */
426 static int nvme_ctrlr_identify(struct nvme_ctrlr *ctrlr)
427 {
428 	int ret;
429 
430 	ret = nvme_admin_identify_ctrlr(ctrlr, &ctrlr->cdata);
431 	if (ret != 0) {
432 		nvme_notice("Identify controller failed\n");
433 		return ret;
434 	}
435 
436 	/*
437 	 * Use MDTS to ensure our default max_xfer_size doesn't
438 	 * exceed what the controller supports.
439 	 */
440 	if (ctrlr->cdata.mdts > 0)
441 		ctrlr->max_xfer_size = nvme_min(ctrlr->max_xfer_size,
442 						ctrlr->min_page_size
443 						* (1 << (ctrlr->cdata.mdts)));
444 	return 0;
445 }
446 
447 /*
448  * Get the maximum number of I/O queue pairs.
449  */
450 static int nvme_ctrlr_get_max_io_qpairs(struct nvme_ctrlr *ctrlr)
451 {
452 	unsigned int cdw0, cq_allocated, sq_allocated;
453 	int ret;
454 
455 	ret = nvme_admin_get_feature(ctrlr, NVME_FEAT_CURRENT,
456 				     NVME_FEAT_NUMBER_OF_QUEUES,
457 				     0, &cdw0);
458 	if (ret != 0) {
459 		nvme_notice("Get feature NVME_FEAT_NUMBER_OF_QUEUES failed\n");
460 		return ret;
461 	}
462 
463 	/*
464 	 * Data in cdw0 is 0-based.
465 	 * Lower 16-bits indicate number of submission queues allocated.
466 	 * Upper 16-bits indicate number of completion queues allocated.
467 	 */
468 	sq_allocated = (cdw0 & 0xFFFF) + 1;
469 	cq_allocated = (cdw0 >> 16) + 1;
470 
471 	ctrlr->max_io_queues = nvme_min(sq_allocated, cq_allocated);
472 
473 	return 0;
474 }
475 
476 /*
477  * Set the number of I/O queue pairs.
478  */
479 static int nvme_ctrlr_set_num_qpairs(struct nvme_ctrlr *ctrlr)
480 {
481 	unsigned int num_queues, cdw0;
482 	unsigned int cq_allocated, sq_allocated;
483 	int ret;
484 
485 	ret = nvme_ctrlr_get_max_io_qpairs(ctrlr);
486 	if (ret != 0) {
487 		nvme_notice("Failed to get the maximum of I/O qpairs\n");
488 		return ret;
489 	}
490 
491 	/*
492 	 * Format the number of I/O queues:
493 	 * subtract 1 as the value is 0-based;
494 	 * bits 31:16 represent the number of completion queues,
495 	 * bits 15:0 represent the number of submission queues.
496 	 */
497 	num_queues = ((ctrlr->opts.io_queues - 1) << 16) |
498 		(ctrlr->opts.io_queues - 1);
499 
500 	/*
501 	 * Set the number of I/O queues.
502 	 * Note: The value allocated may be smaller or larger than the number
503 	 * of queues requested (see specifications).
504 	 */
505 	ret = nvme_admin_set_feature(ctrlr, false, NVME_FEAT_NUMBER_OF_QUEUES,
506 				     num_queues, 0, &cdw0);
507 	if (ret != 0) {
508 		nvme_notice("Set feature NVME_FEAT_NUMBER_OF_QUEUES failed\n");
509 		return ret;
510 	}
511 
512 	/*
513 	 * Data in cdw0 is 0-based.
514 	 * Lower 16-bits indicate number of submission queues allocated.
515 	 * Upper 16-bits indicate number of completion queues allocated.
516 	 */
517 	sq_allocated = (cdw0 & 0xFFFF) + 1;
518 	cq_allocated = (cdw0 >> 16) + 1;
519 	ctrlr->io_queues = nvme_min(sq_allocated, cq_allocated);
520 
521 	/*
522 	 * Make sure the number of constructed qpairs listed in free_io_qpairs
523 	 * does not exceed the requested number.
524 	 */
525 	ctrlr->io_queues = nvme_min(ctrlr->io_queues, ctrlr->opts.io_queues);
526 
527 	return 0;
528 }
529 
530 static void nvme_ctrlr_destruct_namespaces(struct nvme_ctrlr *ctrlr)
531 {
532 
533 	if (ctrlr->ns) {
534 		free(ctrlr->ns);
535 		ctrlr->ns = NULL;
536 		ctrlr->nr_ns = 0;
537 	}
538 
539 	if (ctrlr->nsdata) {
540 		nvme_free(ctrlr->nsdata);
541 		ctrlr->nsdata = NULL;
542 	}
543 }
544 
545 static int nvme_ctrlr_construct_namespaces(struct nvme_ctrlr *ctrlr)
546 {
547 	unsigned int i, nr_ns = ctrlr->cdata.nn;
548 	struct nvme_ns *ns = NULL;
549 
550 	/*
551 	 * ctrlr->nr_ns may be 0 (startup) or a different number of
552 	 * namespaces (reset), so check if we need to reallocate.
553 	 */
554 	if (nr_ns != ctrlr->nr_ns) {
555 
556 		nvme_ctrlr_destruct_namespaces(ctrlr);
557 
558 		ctrlr->ns = calloc(nr_ns, sizeof(struct nvme_ns));
559 		if (!ctrlr->ns)
560 			goto fail;
561 
562 		nvme_debug("Allocate %u namespace data\n", nr_ns);
563 		ctrlr->nsdata = nvme_calloc(nr_ns, sizeof(struct nvme_ns_data),
564 					    PAGE_SIZE);
565 		if (!ctrlr->nsdata)
566 			goto fail;
567 
568 		ctrlr->nr_ns = nr_ns;
569 
570 	}
571 
572 	for (i = 0; i < nr_ns; i++) {
573 		ns = &ctrlr->ns[i];
574 		if (nvme_ns_construct(ctrlr, ns, i + 1) != 0)
575 			goto fail;
576 	}
577 
578 	return 0;
579 
580 fail:
581 	nvme_ctrlr_destruct_namespaces(ctrlr);
582 
583 	return -1;
584 }
585 
586 /*
587  * Forward declaration.
588  */
589 static int nvme_ctrlr_construct_and_submit_aer(struct nvme_ctrlr *ctrlr,
590 				struct nvme_async_event_request *aer);
591 
592 /*
593  * Async event completion callback.
594  */
595 static void nvme_ctrlr_async_event_cb(void *arg, const struct nvme_cpl *cpl)
596 {
597 	struct nvme_async_event_request	*aer = arg;
598 	struct nvme_ctrlr *ctrlr = aer->ctrlr;
599 
600 	if (cpl->status.sc == NVME_SC_ABORTED_SQ_DELETION)
601 		/*
602 		 *  This is simulated when controller is being shut down, to
603 		 *  effectively abort outstanding asynchronous event requests
604 		 *  and make sure all memory is freed. Do not repost the
605 		 *  request in this case.
606 		 */
607 		return;
608 
609 	if (ctrlr->aer_cb_fn != NULL)
610 		ctrlr->aer_cb_fn(ctrlr->aer_cb_arg, cpl);
611 
612 	/*
613 	 * Repost another asynchronous event request to replace
614 	 * the one that just completed.
615 	 */
616 	if (nvme_ctrlr_construct_and_submit_aer(ctrlr, aer))
617 		/*
618 		 * We can't do anything to recover from a failure here,
619 		 * so just print a warning message and leave the
620 		 * AER unsubmitted.
621 		 */
622 		nvme_err("Initialize AER failed\n");
623 }
624 
625 /*
626  * Issue an async event request.
627  */
628 static int nvme_ctrlr_construct_and_submit_aer(struct nvme_ctrlr *ctrlr,
629 					       struct nvme_async_event_request *aer)
630 {
631 	struct nvme_request *req;
632 
633 	req = nvme_request_allocate_null(&ctrlr->adminq,
634 					 nvme_ctrlr_async_event_cb, aer);
635 	if (req == NULL)
636 		return -1;
637 
638 	aer->ctrlr = ctrlr;
639 	aer->req = req;
640 	req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
641 
642 	return nvme_qpair_submit_request(&ctrlr->adminq, req);
643 }
644 
645 /*
646  * Configure async event management.
647  */
648 static int nvme_ctrlr_configure_aer(struct nvme_ctrlr *ctrlr)
649 {
650 	union nvme_critical_warning_state state;
651 	struct nvme_async_event_request	*aer;
652 	unsigned int i;
653 	int ret;
654 
655 	state.raw = 0xFF;
656 	state.bits.reserved = 0;
657 
658 	ret = nvme_admin_set_feature(ctrlr, false,
659 				      NVME_FEAT_ASYNC_EVENT_CONFIGURATION,
660 				      state.raw, 0, NULL);
661 	if (ret != 0) {
662 		nvme_notice("Set feature ASYNC_EVENT_CONFIGURATION failed\n");
663 		return ret;
664 	}
665 
666 	/* aerl is a zero-based value, so we need to add 1 here. */
667 	ctrlr->num_aers = nvme_min(NVME_MAX_ASYNC_EVENTS,
668 				   (ctrlr->cdata.aerl + 1));
669 
670 	for (i = 0; i < ctrlr->num_aers; i++) {
671 		aer = &ctrlr->aer[i];
672 		if (nvme_ctrlr_construct_and_submit_aer(ctrlr, aer)) {
673 			nvme_notice("Construct AER failed\n");
674 			return -1;
675 		}
676 	}
677 
678 	return 0;
679 }
680 
681 /*
682  * Start a controller.
683  */
684 static int nvme_ctrlr_start(struct nvme_ctrlr *ctrlr)
685 {
686 
687 	nvme_qpair_reset(&ctrlr->adminq);
688 	nvme_qpair_enable(&ctrlr->adminq);
689 
690 	if (nvme_ctrlr_identify(ctrlr) != 0)
691 		return -1;
692 
693 	if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0)
694 		return -1;
695 
696 	if (nvme_ctrlr_init_io_qpairs(ctrlr))
697 		return -1;
698 
699 	if (nvme_ctrlr_construct_namespaces(ctrlr) != 0)
700 		return -1;
701 
702 	if (nvme_ctrlr_configure_aer(ctrlr) != 0)
703 		nvme_warning("controller does not support AER!\n");
704 
705 	nvme_ctrlr_set_supported_log_pages(ctrlr);
706 	nvme_ctrlr_set_supported_features(ctrlr);
707 
708 	if (ctrlr->cdata.sgls.supported)
709 		ctrlr->flags |= NVME_CTRLR_SGL_SUPPORTED;
710 
711 	return 0;
712 }
713 
714 /*
715  * Memory map the controller memory buffer (CMB).
716  */
717 static void nvme_ctrlr_map_cmb(struct nvme_ctrlr *ctrlr)
718 {
719 	int ret;
720 	void *addr;
721 	uint32_t bir;
722 	union nvme_cmbsz_register cmbsz;
723 	union nvme_cmbloc_register cmbloc;
724 	uint64_t size, unit_size, offset, bar_size, bar_phys_addr;
725 
726 	cmbsz.raw = nvme_reg_mmio_read_4(ctrlr, cmbsz.raw);
727 	cmbloc.raw = nvme_reg_mmio_read_4(ctrlr, cmbloc.raw);
728 	if (!cmbsz.bits.sz)
729 		goto out;
730 
731 	/* Valid BAR indicator (BIR) values are 0 and 2 through 5 */
732 	bir = cmbloc.bits.bir;
733 	if (bir > 5 || bir == 1)
734 		goto out;
735 
736 	/* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */
737 	unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu);
738 
739 	/* controller memory buffer size in Bytes */
740 	size = unit_size * cmbsz.bits.sz;
741 
742 	/* controller memory buffer offset from BAR in Bytes */
743 	offset = unit_size * cmbloc.bits.ofst;
744 
745 	nvme_pcicfg_get_bar_addr_len(ctrlr->pci_dev, bir, &bar_phys_addr,
746 				     &bar_size);
747 
748 	if (offset > bar_size)
749 		goto out;
750 
751 	if (size > bar_size - offset)
752 		goto out;
753 
754 	ret = nvme_pcicfg_map_bar_write_combine(ctrlr->pci_dev, bir, &addr);
755 	if ((ret != 0) || addr == NULL)
756 		goto out;
757 
758 	ctrlr->cmb_bar_virt_addr = addr;
759 	ctrlr->cmb_bar_phys_addr = bar_phys_addr;
760 	ctrlr->cmb_size = size;
761 	ctrlr->cmb_current_offset = offset;
762 
763 	if (!cmbsz.bits.sqs)
764 		ctrlr->opts.use_cmb_sqs = false;
765 
766 	return;
767 
768 out:
769 	ctrlr->cmb_bar_virt_addr = NULL;
770 	ctrlr->opts.use_cmb_sqs = false;
771 
772 	return;
773 }
774 
775 /*
776  * Unmap the controller memory buffer (CMB).
777  */
778 static int nvme_ctrlr_unmap_cmb(struct nvme_ctrlr *ctrlr)
779 {
780 	union nvme_cmbloc_register cmbloc;
781 	void *addr = ctrlr->cmb_bar_virt_addr;
782 	int ret = 0;
783 
784 	if (addr) {
785 		cmbloc.raw = nvme_reg_mmio_read_4(ctrlr, cmbloc.raw);
786 		ret = nvme_pcicfg_unmap_bar(ctrlr->pci_dev, cmbloc.bits.bir,
787 					    addr);
788 	}
789 	return ret;
790 }
791 
792 /*
793  * Map the controller PCI bars.
794  */
795 static int nvme_ctrlr_map_bars(struct nvme_ctrlr *ctrlr)
796 {
797 	void *addr;
798 	int ret;
799 
800 	ret = nvme_pcicfg_map_bar(ctrlr->pci_dev, 0, 0, &addr);
801 	if (ret != 0 || addr == NULL) {
802 		nvme_err("Map PCI device bar failed %d (%s)\n",
803 			 ret, strerror(ret));
804 		return ret;
805 	}
806 
807 	nvme_debug("Controller BAR mapped at %p\n", addr);
808 
809 	ctrlr->regs = (volatile struct nvme_registers *)addr;
810 	nvme_ctrlr_map_cmb(ctrlr);
811 
812 	return 0;
813 }
814 
815 /*
816  * Unmap the controller PCI bars.
817  */
818 static int nvme_ctrlr_unmap_bars(struct nvme_ctrlr *ctrlr)
819 {
820 	void *addr = (void *)ctrlr->regs;
821 	int ret;
822 
823 	ret = nvme_ctrlr_unmap_cmb(ctrlr);
824 	if (ret != 0) {
825 		nvme_err("Unmap controller side buffer failed %d\n", ret);
826 		return ret;
827 	}
828 
829 	if (addr) {
830 		ret = nvme_pcicfg_unmap_bar(ctrlr->pci_dev, 0, addr);
831 		if (ret != 0) {
832 			nvme_err("Unmap PCI device bar failed %d\n", ret);
833 			return ret;
834 		}
835 	}
836 
837 	return 0;
838 }
839 
840 /*
841  * Set a controller in the failed state.
842  */
843 static void nvme_ctrlr_fail(struct nvme_ctrlr *ctrlr)
844 {
845 	unsigned int i;
846 
847 	ctrlr->failed = true;
848 
849 	nvme_qpair_fail(&ctrlr->adminq);
850 	if (ctrlr->ioq)
851 		for (i = 0; i < ctrlr->io_queues; i++)
852 			nvme_qpair_fail(&ctrlr->ioq[i]);
853 }
854 
855 /*
856  * This function will be called repeatedly during initialization
857  * until the controller is ready.
858  */
859 static int nvme_ctrlr_init(struct nvme_ctrlr *ctrlr)
860 {
861 	unsigned int ready_timeout_in_ms = nvme_ctrlr_get_ready_to_in_ms(ctrlr);
862 	int ret;
863 
864 	/*
865 	 * Check if the current initialization step is done or has timed out.
866 	 */
867 	switch (ctrlr->state) {
868 
869 	case NVME_CTRLR_STATE_INIT:
870 
871 		/* Begin the hardware initialization by making
872 		 * sure the controller is disabled. */
873 		if (nvme_ctrlr_enabled(ctrlr)) {
874 			/*
875 			 * Disable the controller to cause a reset.
876 			 */
877 			if (!nvme_ctrlr_ready(ctrlr)) {
878 				/* Wait for the controller to be ready */
879 				nvme_ctrlr_set_state(ctrlr,
880 				      NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1,
881 				      ready_timeout_in_ms);
882 				return 0;
883 			}
884 
885 			/*
886 			 * The controller is enabled and ready.
887 			 * It can be immediately disabled.
888 			 */
889 			nvme_ctrlr_disable(ctrlr);
890 			nvme_ctrlr_set_state(ctrlr,
891 				      NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0,
892 				      ready_timeout_in_ms);
893 
894 			if (ctrlr->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
895 				nvme_msleep(2000);
896 
897 			return 0;
898 		}
899 
900 		if (nvme_ctrlr_ready(ctrlr)) {
901 			/*
902 			 * Controller is in the process of shutting down.
903 			 * We need to wait for CSTS.RDY to become 0.
904 			 */
905 			nvme_ctrlr_set_state(ctrlr,
906 				      NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0,
907 				      ready_timeout_in_ms);
908 			return 0;
909 		}
910 
911 		/*
912 		 * Controller is currently disabled.
913 		 * We can jump straight to enabling it.
914 		 */
915 		ret = nvme_ctrlr_enable(ctrlr);
916 		if (ret)
917 			nvme_err("Enable controller failed\n");
918 		else
919 			nvme_ctrlr_set_state(ctrlr,
920 				       NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1,
921 				       ready_timeout_in_ms);
922 		return ret;
923 
924 	case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1:
925 
926 		if (nvme_ctrlr_ready(ctrlr)) {
927 			/* CC.EN = 1 && CSTS.RDY = 1,
928 			 * so we can disable the controller now. */
929 			nvme_ctrlr_disable(ctrlr);
930 			nvme_ctrlr_set_state(ctrlr,
931 				      NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0,
932 				      ready_timeout_in_ms);
933 			return 0;
934 		}
935 
936 		break;
937 
938 	case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0:
939 
940 		if (!nvme_ctrlr_ready(ctrlr)) {
941 			/* CC.EN = 0 && CSTS.RDY = 0,
942 			 * so we can enable the controller now. */
943 			ret = nvme_ctrlr_enable(ctrlr);
944 			if (ret)
945 				nvme_err("Enable controller failed\n");
946 			else
947 				nvme_ctrlr_set_state(ctrlr,
948 				       NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1,
949 				       ready_timeout_in_ms);
950 			return ret;
951 		}
952 		break;
953 
954 	case NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1:
955 
956 		if (nvme_ctrlr_ready(ctrlr)) {
957 			if (ctrlr->quirks & NVME_QUIRK_DELAY_AFTER_RDY)
958 				nvme_msleep(2000);
959 
960 			ret = nvme_ctrlr_start(ctrlr);
961 			if (ret)
962 				nvme_err("Start controller failed\n");
963 			else
964 				nvme_ctrlr_set_state(ctrlr,
965 						     NVME_CTRLR_STATE_READY,
966 						     NVME_TIMEOUT_INFINITE);
967 			return ret;
968 		}
969 		break;
970 
971 	default:
972 		nvme_panic("Unhandled ctrlr state %d\n", ctrlr->state);
973 		nvme_ctrlr_fail(ctrlr);
974 		return -1;
975 	}
976 
977 	if ((ctrlr->state_timeout_ms != NVME_TIMEOUT_INFINITE) &&
978 	    (nvme_time_msec() > ctrlr->state_timeout_ms)) {
979 		nvme_err("Initialization timed out in state %d\n",
980 			 ctrlr->state);
981 		nvme_ctrlr_fail(ctrlr);
982 		return -1;
983 	}
984 
985 	return 0;
986 }
987 
988 /*
989  * Reset a controller.
990  */
991 static int nvme_ctrlr_reset(struct nvme_ctrlr *ctrlr)
992 {
993 	struct nvme_qpair *qpair;
994 	unsigned int i;
995 
996 	if (ctrlr->resetting || ctrlr->failed)
997 		/*
998 		 * Controller is already resetting or has failed. Return
999 		 * immediately since there is no need to kick off another
1000 		 * reset in these cases.
1001 		 */
1002 		return 0;
1003 
1004 	ctrlr->resetting = true;
1005 
1006 	/* Disable all queues before disabling the controller hardware. */
1007 	nvme_qpair_disable(&ctrlr->adminq);
1008 	for (i = 0; i < ctrlr->io_queues; i++)
1009 		nvme_qpair_disable(&ctrlr->ioq[i]);
1010 
1011 	/* Set the state back to INIT to cause a full hardware reset. */
1012 	nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT,
1013 			     NVME_TIMEOUT_INFINITE);
1014 
1015 	while (ctrlr->state != NVME_CTRLR_STATE_READY) {
1016 		if (nvme_ctrlr_init(ctrlr) != 0) {
1017 			nvme_crit("Controller reset failed\n");
1018 			nvme_ctrlr_fail(ctrlr);
1019 			goto out;
1020 		}
1021 	}
1022 
1023 	/* Reinitialize qpairs */
1024 	TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) {
1025 		if (nvme_ctrlr_create_qpair(ctrlr, qpair) != 0)
1026 			nvme_ctrlr_fail(ctrlr);
1027 	}
1028 
1029 out:
1030 	ctrlr->resetting = false;
1031 
1032 	return ctrlr->failed ? -1 : 0;
1033 }
1034 
1035 /*
1036  * Set controller options.
1037  */
1038 static void nvme_ctrlr_set_opts(struct nvme_ctrlr *ctrlr,
1039 				struct nvme_ctrlr_opts *opts)
1040 {
1041 	if (opts)
1042 		memcpy(&ctrlr->opts, opts, sizeof(struct nvme_ctrlr_opts));
1043 	else
1044 		memset(&ctrlr->opts, 0, sizeof(struct nvme_ctrlr_opts));
1045 
1046 	if (ctrlr->opts.io_queues == 0)
1047 		ctrlr->opts.io_queues = DEFAULT_MAX_IO_QUEUES;
1048 
1049 	if (ctrlr->opts.io_queues > NVME_MAX_IO_QUEUES) {
1050 		nvme_info("Limiting requested I/O queues %u to %d\n",
1051 			  ctrlr->opts.io_queues, NVME_MAX_IO_QUEUES);
1052 		ctrlr->opts.io_queues = NVME_MAX_IO_QUEUES;
1053 	}
1054 }
1055 
1056 /*
1057  * Attach a PCI controller.
1058  */
1059 struct nvme_ctrlr *
1060 nvme_ctrlr_attach(struct pci_device *pci_dev,
1061 		  struct nvme_ctrlr_opts *opts)
1062 {
1063 	struct nvme_ctrlr *ctrlr;
1064 	union nvme_cap_register	cap;
1065 	uint32_t cmd_reg;
1066 	int ret;
1067 
1068 	/* Get a new controller handle */
1069 	ctrlr = malloc(sizeof(struct nvme_ctrlr));
1070 	if (!ctrlr) {
1071 		nvme_err("Allocate controller handle failed\n");
1072 		return NULL;
1073 	}
1074 
1075 	nvme_debug("New controller handle %p\n", ctrlr);
1076 
1077 	/* Initialize the handle */
1078 	memset(ctrlr, 0, sizeof(struct nvme_ctrlr));
1079 	ctrlr->pci_dev = pci_dev;
1080 	ctrlr->resetting = false;
1081 	ctrlr->failed = false;
1082 	TAILQ_INIT(&ctrlr->free_io_qpairs);
1083 	TAILQ_INIT(&ctrlr->active_io_qpairs);
1084 	pthread_mutex_init(&ctrlr->lock, NULL);
1085 	ctrlr->quirks = nvme_ctrlr_get_quirks(pci_dev);
1086 
1087 	nvme_ctrlr_set_state(ctrlr,
1088 			     NVME_CTRLR_STATE_INIT,
1089 			     NVME_TIMEOUT_INFINITE);
1090 
1091 	ret = nvme_ctrlr_map_bars(ctrlr);
1092 	if (ret != 0) {
1093 		nvme_err("Map controller BAR failed\n");
1094 		pthread_mutex_destroy(&ctrlr->lock);
1095 		free(ctrlr);
1096 		return NULL;
1097 	}
1098 
1099 	/* Enable PCI busmaster and disable INTx */
1100 	nvme_pcicfg_read32(pci_dev, &cmd_reg, 4);
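	/*
	 * In the PCI command register, bit 2 (0x0004) is Bus Master Enable
	 * and bit 10 (0x0400) is Interrupt Disable, hence the 0x0404 mask.
	 */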
1101 	cmd_reg |= 0x0404;
1102 	nvme_pcicfg_write32(pci_dev, cmd_reg, 4);
1103 
1104 	/*
1105 	 * The doorbell stride is 2 ^ (2 + CAP.DSTRD) bytes, but we track it
1106 	 * in units of 32-bit (4-byte) registers, so drop the '+ 2'.
1107 	 */
1108 	cap.raw = nvme_reg_mmio_read_8(ctrlr, cap.raw);
1109 	ctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd;
1110 	ctrlr->min_page_size = 1 << (12 + cap.bits.mpsmin);
1111 
1112 	/* Set default transfer size */
1113 	ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
1114 
1115 	/* Create the admin queue pair */
1116 	ret = nvme_qpair_construct(ctrlr, &ctrlr->adminq, 0,
1117 				   NVME_ADMIN_ENTRIES, NVME_ADMIN_TRACKERS);
1118 	if (ret != 0) {
1119 		nvme_err("Initialize admin queue pair failed\n");
1120 		goto err;
1121 	}
1122 
1123 	/* Set options and then initialize */
1124 	nvme_ctrlr_set_opts(ctrlr, opts);
1125 	do {
1126 		ret = nvme_ctrlr_init(ctrlr);
1127 		if (ret)
1128 			goto err;
1129 	} while (ctrlr->state != NVME_CTRLR_STATE_READY);
1130 
1131 	return ctrlr;
1132 
1133 err:
1134 	nvme_ctrlr_detach(ctrlr);
1135 
1136 	return NULL;
1137 }
1138 
1139 /*
1140  * Detach a PCI controller.
1141  */
1142 void nvme_ctrlr_detach(struct nvme_ctrlr *ctrlr)
1143 {
1144 	struct nvme_qpair *qpair;
1145 	uint32_t i;
1146 
1147 	while (!TAILQ_EMPTY(&ctrlr->active_io_qpairs)) {
1148 		qpair = TAILQ_FIRST(&ctrlr->active_io_qpairs);
1149 		nvme_ioqp_release(qpair);
1150 	}
1151 
1152 	nvme_ctrlr_shutdown(ctrlr);
1153 
1154 	nvme_ctrlr_destruct_namespaces(ctrlr);
1155 	if (ctrlr->ioq) {
1156 		for (i = 0; i < ctrlr->io_queues; i++)
1157 			nvme_qpair_destroy(&ctrlr->ioq[i]);
1158 		free(ctrlr->ioq);
1159 	}
1160 
1161 	nvme_qpair_destroy(&ctrlr->adminq);
1162 
1163 	nvme_ctrlr_unmap_bars(ctrlr);
1164 
1165 	pthread_mutex_destroy(&ctrlr->lock);
1166 	free(ctrlr);
1167 }
1168 
1169 /*
1170  * Get a controller feature.
1171  */
1172 int nvme_ctrlr_get_feature(struct nvme_ctrlr *ctrlr,
1173 			   enum nvme_feat_sel sel, enum nvme_feat feature,
1174 			   uint32_t cdw11,
1175 			   uint32_t *attributes)
1176 {
1177 	int ret;
1178 
1179 	pthread_mutex_lock(&ctrlr->lock);
1180 
1181 	ret = nvme_admin_get_feature(ctrlr, sel, feature, cdw11, attributes);
1182 	if (ret != 0)
1183 		nvme_notice("Get feature 0x%08x failed\n",
1184 			    (unsigned int) feature);
1185 
1186 	pthread_mutex_unlock(&ctrlr->lock);
1187 
1188 	return ret;
1189 }
1190 
1191 /*
1192  * Set a controller feature.
1193  */
1194 int nvme_ctrlr_set_feature(struct nvme_ctrlr *ctrlr,
1195 			   bool save, enum nvme_feat feature,
1196 			   uint32_t cdw11, uint32_t cdw12,
1197 			   uint32_t *attributes)
1198 {
1199 	int ret;
1200 
1201 	pthread_mutex_lock(&ctrlr->lock);
1202 
1203 	ret = nvme_admin_set_feature(ctrlr, save, feature,
1204 				     cdw11, cdw12, attributes);
1205 	if (ret != 0)
1206 		nvme_notice("Set feature 0x%08x failed\n",
1207 			    (unsigned int) feature);
1208 
1209 	pthread_mutex_unlock(&ctrlr->lock);
1210 
1211 	return ret;
1212 }
1213 
1214 /*
1215  * Attach a namespace.
1216  */
1217 int nvme_ctrlr_attach_ns(struct nvme_ctrlr *ctrlr, unsigned int nsid,
1218 			 struct nvme_ctrlr_list *clist)
1219 {
1220 	int ret;
1221 
1222 	pthread_mutex_lock(&ctrlr->lock);
1223 
1224 	ret = nvme_admin_attach_ns(ctrlr, nsid, clist);
1225 	if (ret) {
1226 		nvme_notice("Attach namespace %u failed\n", nsid);
1227 		goto out;
1228 	}
1229 
1230 	ret = nvme_ctrlr_reset(ctrlr);
1231 	if (ret != 0)
1232 		nvme_notice("Reset controller failed\n");
1233 
1234 out:
1235 	pthread_mutex_unlock(&ctrlr->lock);
1236 
1237 	return ret;
1238 }
1239 
1240 /*
1241  * Detach a namespace.
1242  */
1243 int nvme_ctrlr_detach_ns(struct nvme_ctrlr *ctrlr, unsigned int nsid,
1244 			 struct nvme_ctrlr_list *clist)
1245 {
1246 	int ret;
1247 
1248 	pthread_mutex_lock(&ctrlr->lock);
1249 
1250 	ret = nvme_admin_detach_ns(ctrlr, nsid, clist);
1251 	if (ret != 0) {
1252 		nvme_notice("Detach namespace %u failed\n", nsid);
1253 		goto out;
1254 	}
1255 
1256 	ret = nvme_ctrlr_reset(ctrlr);
1257 	if (ret)
1258 		nvme_notice("Reset controller failed\n");
1259 
1260 out:
1261 	pthread_mutex_unlock(&ctrlr->lock);
1262 
1263 	return ret;
1264 }
1265 
1266 /*
1267  * Create a namespace.
1268  */
1269 unsigned int nvme_ctrlr_create_ns(struct nvme_ctrlr *ctrlr,
1270 				  struct nvme_ns_data *nsdata)
1271 {
1272 	unsigned int nsid;
1273 	int ret;
1274 
1275 	pthread_mutex_lock(&ctrlr->lock);
1276 
1277 	ret = nvme_admin_create_ns(ctrlr, nsdata, &nsid);
1278 	if (ret != 0) {
1279 		nvme_notice("Create namespace failed\n");
1280 		nsid = 0;
1281 	}
1282 
1283 	pthread_mutex_unlock(&ctrlr->lock);
1284 
1285 	return nsid;
1286 }
1287 
1288 /*
1289  * Delete a namespace.
1290  */
1291 int nvme_ctrlr_delete_ns(struct nvme_ctrlr *ctrlr, unsigned int nsid)
1292 {
1293 	int ret;
1294 
1295 	pthread_mutex_lock(&ctrlr->lock);
1296 
1297 	ret = nvme_admin_delete_ns(ctrlr, nsid);
1298 	if (ret != 0) {
1299 		nvme_notice("Delete namespace %u failed\n", nsid);
1300 		goto out;
1301 	}
1302 
1303 	ret = nvme_ctrlr_reset(ctrlr);
1304 	if (ret)
1305 		nvme_notice("Reset controller failed\n");
1306 
1307 out:
1308 	pthread_mutex_unlock(&ctrlr->lock);
1309 
1310 	return ret;
1311 }
1312 
1313 /*
1314  * Format NVM media.
1315  */
1316 int nvme_ctrlr_format_ns(struct nvme_ctrlr *ctrlr, unsigned int nsid,
1317 			 struct nvme_format *format)
1318 {
1319 	int ret;
1320 
1321 	pthread_mutex_lock(&ctrlr->lock);
1322 
1323 	ret = nvme_admin_format_nvm(ctrlr, nsid, format);
1324 	if (ret != 0) {
1325 		if (nsid == NVME_GLOBAL_NS_TAG)
1326 			nvme_notice("Format device failed\n");
1327 		else
1328 			nvme_notice("Format namespace %u failed\n", nsid);
1329 		goto out;
1330 	}
1331 
1332 	ret = nvme_ctrlr_reset(ctrlr);
1333 	if (ret)
1334 		nvme_notice("Reset controller failed\n");
1335 
1336 out:
1337 	pthread_mutex_unlock(&ctrlr->lock);
1338 
1339 	return ret;
1340 }
1341 
1342 /*
1343  * Update a device firmware.
1344  */
1345 int nvme_ctrlr_update_firmware(struct nvme_ctrlr *ctrlr,
1346 			       void *fw, size_t size, int slot)
1347 {
1348 	struct nvme_fw_commit fw_commit;
1349 	unsigned int size_remaining = size, offset = 0, transfer;
1350 	void *f = fw;
1351 	int ret;
1352 
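	/* The firmware image must be a whole number of dwords (multiple of 4 bytes). */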
1353 	if (size & 0x3) {
1354 		nvme_err("Invalid firmware size\n");
1355 		return EINVAL;
1356 	}
1357 
1358 	pthread_mutex_lock(&ctrlr->lock);
1359 
1360 	/* Download firmware */
1361 	while (size_remaining > 0) {
1362 
1363 		transfer = nvme_min(size_remaining, ctrlr->min_page_size);
1364 
1365 		ret = nvme_admin_fw_image_dl(ctrlr, f, transfer, offset);
1366 		if (ret != 0) {
1367 			nvme_err("Download FW (%u B at %u) failed\n",
1368 				 transfer, offset);
1369 			goto out;
1370 		}
1371 
1372 		f += transfer;
1373 		offset += transfer;
1374 		size_remaining -= transfer;
1375 
1376 	}
1377 
1378 	/* Commit firmware */
1379 	memset(&fw_commit, 0, sizeof(struct nvme_fw_commit));
1380 	fw_commit.fs = slot;
1381 	fw_commit.ca = NVME_FW_COMMIT_REPLACE_IMG;
1382 
1383 	ret = nvme_admin_fw_commit(ctrlr, &fw_commit);
1384 	if (ret != 0) {
1385 		nvme_err("Commit downloaded FW (%zu B) failed\n",
1386 			 size);
1387 		goto out;
1388 	}
1389 
1390 	ret = nvme_ctrlr_reset(ctrlr);
1391 	if (ret)
1392 		nvme_notice("Reset controller failed\n");
1393 
1394 out:
1395 	pthread_mutex_unlock(&ctrlr->lock);
1396 
1397 	return ret;
1398 }
1399 
1400 /*
1401  * Get an unused I/O queue pair.
1402  */
1403 struct nvme_qpair *nvme_ioqp_get(struct nvme_ctrlr *ctrlr,
1404 				 enum nvme_qprio qprio, unsigned int qd)
1405 {
1406 	struct nvme_qpair *qpair = NULL;
1407 	union nvme_cc_register cc;
1408 	uint32_t trackers;
1409 	int ret;
1410 
1411 	cc.raw = nvme_reg_mmio_read_4(ctrlr, cc.raw);
1412 
1413 	/* Only the low 2 bits (values 0, 1, 2, 3) of QPRIO are valid. */
1414 	if ((qprio & 3) != qprio)
1415 		return NULL;
1416 
1417 	/*
1418 	 * Only value NVME_QPRIO_URGENT(0) is valid for the
1419 	 * default round robin arbitration method.
1420 	 */
1421 	if ((cc.bits.ams == NVME_CC_AMS_RR) && (qprio != NVME_QPRIO_URGENT)) {
1422 		nvme_err("Invalid queue priority for default round "
1423 			 "robin arbitration method\n");
1424 		return NULL;
1425 	}
1426 
1427 	/* The number of I/O qpair entries must be in [2, io_qpairs_max_entries] */
1428 	if (qd == 1) {
1429 		nvme_err("Invalid queue depth\n");
1430 		return NULL;
1431 	}
1432 
1433 	if (qd == 0 || qd > ctrlr->io_qpairs_max_entries)
1434 		qd = ctrlr->io_qpairs_max_entries;
1435 
1436 	/*
1437 	 * No need to have more trackers than entries in the submit queue.
1438 	 * Note also that for a queue size of N, we can only have (N-1)
1439 	 * commands outstanding, hence the "-1" here.
1440 	 */
1441 	trackers = nvme_min(NVME_IO_TRACKERS, (qd - 1));
1442 
1443 	pthread_mutex_lock(&ctrlr->lock);
1444 
1445 	/* Get the first available qpair structure */
1446 	qpair = TAILQ_FIRST(&ctrlr->free_io_qpairs);
1447 	if (qpair == NULL) {
1448 		/* No free queue IDs */
1449 		nvme_err("No free I/O queue pairs\n");
1450 		goto out;
1451 	}
1452 
1453 	/* Construct the qpair */
1454 	ret = nvme_qpair_construct(ctrlr, qpair, qprio, qd, trackers);
1455 	if (ret != 0) {
1456 		nvme_qpair_destroy(qpair);
1457 		qpair = NULL;
1458 		goto out;
1459 	}
1460 
1461 	/*
1462 	 * At this point, qpair contains a preallocated submission
1463 	 * and completion queue and a unique queue ID, but it is not
1464 	 * yet created on the controller.
1465 	 * Fill out the submission queue priority and send out the
1466 	 * Create I/O Queue commands.
1467 	 */
1468 	if (nvme_ctrlr_create_qpair(ctrlr, qpair) != 0) {
1469 		nvme_err("Create queue pair on the controller failed\n");
1470 		nvme_qpair_destroy(qpair);
1471 		qpair = NULL;
1472 		goto out;
1473 	}
1474 
1475 	TAILQ_REMOVE(&ctrlr->free_io_qpairs, qpair, tailq);
1476 	TAILQ_INSERT_TAIL(&ctrlr->active_io_qpairs, qpair, tailq);
1477 
1478 out:
1479 	pthread_mutex_unlock(&ctrlr->lock);
1480 
1481 	return qpair;
1482 }
1483 
1484 /*
1485  * Free an I/O queue pair.
1486  */
1487 int nvme_ioqp_release(struct nvme_qpair *qpair)
1488 {
1489 	struct nvme_ctrlr *ctrlr;
1490 	int ret;
1491 
1492 	if (qpair == NULL)
1493 		return 0;
1494 
1495 	ctrlr = qpair->ctrlr;
1496 
1497 	pthread_mutex_lock(&ctrlr->lock);
1498 
1499 	/* Delete the I/O submission and completion queues */
1500 	ret = nvme_ctrlr_delete_qpair(ctrlr, qpair);
1501 	if (ret != 0) {
1502 		nvme_notice("Delete queue pair %u failed\n", qpair->id);
1503 	} else {
1504 		TAILQ_REMOVE(&ctrlr->active_io_qpairs, qpair, tailq);
1505 		TAILQ_INSERT_HEAD(&ctrlr->free_io_qpairs, qpair, tailq);
1506 	}
1507 
1508 	pthread_mutex_unlock(&ctrlr->lock);
1509 
1510 	return ret;
1511 }
1512 
1513 /*
1514  * Submit an NVMe command using the specified I/O queue pair.
1515  */
1516 int nvme_ioqp_submit_cmd(struct nvme_qpair *qpair,
1517 			 struct nvme_cmd *cmd,
1518 			 void *buf, size_t len,
1519 			 nvme_cmd_cb cb_fn, void *cb_arg)
1520 {
1521 	struct nvme_ctrlr *ctrlr = qpair->ctrlr;
1522 	struct nvme_request *req;
1523 	int ret = ENOMEM;
1524 
1525 	pthread_mutex_lock(&ctrlr->lock);
1526 
1527 	req = nvme_request_allocate_contig(qpair, buf, len, cb_fn, cb_arg);
1528 	if (req) {
1529 		memcpy(&req->cmd, cmd, sizeof(req->cmd));
1530 		ret = nvme_qpair_submit_request(qpair, req);
1531 	}
1532 
1533 	pthread_mutex_unlock(&ctrlr->lock);
1534 
1535 	return ret;
1536 }
1537 
1538 /*
1539  * Poll for completion of NVMe commands submitted to the
1540  * specified I/O queue pair.
1541  */
1542 unsigned int nvme_ioqp_poll(struct nvme_qpair *qpair,
1543 			    unsigned int max_completions)
1544 {
1545 	struct nvme_ctrlr *ctrlr = qpair->ctrlr;
1546 	int ret;
1547 
1548 	pthread_mutex_lock(&ctrlr->lock);
1549 	ret = nvme_qpair_poll(qpair, max_completions);
1550 	pthread_mutex_unlock(&ctrlr->lock);
1551 
1552 	return ret;
1553 }
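
/*
 * Typical usage of the controller and I/O queue pair API defined above
 * (illustrative sketch only: error handling is omitted, and "pdev", "cmd",
 * "buf", "len" and "cb" are placeholders supplied by the caller):
 *
 *	struct nvme_ctrlr *ctrlr = nvme_ctrlr_attach(pdev, NULL);
 *	struct nvme_qpair *qpair = nvme_ioqp_get(ctrlr, NVME_QPRIO_URGENT, 0);
 *
 *	nvme_ioqp_submit_cmd(qpair, &cmd, buf, len, cb, NULL);
 *	while (nvme_ioqp_poll(qpair, 1) == 0)
 *		;
 *
 *	nvme_ioqp_release(qpair);
 *	nvme_ctrlr_detach(ctrlr);
 */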
1554