xref: /haiku/src/add-ons/kernel/drivers/disk/nvme/libnvme/nvme_qpair.c (revision dc0bc42494264bb8d17b6b081647377b5601570a)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2017, Western Digital Corporation or its affiliates.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "nvme_internal.h"
35 
36 struct nvme_qpair_string {
37 	uint16_t	value;
38 	const char 	*str;
39 };
40 
41 static const struct nvme_qpair_string admin_opcode[] = {
42 	{ NVME_OPC_DELETE_IO_SQ,	"DELETE IO SQ" },
43 	{ NVME_OPC_CREATE_IO_SQ,	"CREATE IO SQ" },
44 	{ NVME_OPC_GET_LOG_PAGE,	"GET LOG PAGE" },
45 	{ NVME_OPC_DELETE_IO_CQ,	"DELETE IO CQ" },
46 	{ NVME_OPC_CREATE_IO_CQ,	"CREATE IO CQ" },
47 	{ NVME_OPC_IDENTIFY, 		"IDENTIFY" },
48 	{ NVME_OPC_ABORT,		"ABORT" },
49 	{ NVME_OPC_SET_FEATURES,	"SET FEATURES" },
50 	{ NVME_OPC_GET_FEATURES,	"GET FEATURES" },
51 	{ NVME_OPC_ASYNC_EVENT_REQUEST, "ASYNC EVENT REQUEST" },
52 	{ NVME_OPC_NS_MANAGEMENT,	"NAMESPACE MANAGEMENT" },
53 	{ NVME_OPC_FIRMWARE_COMMIT,	"FIRMWARE COMMIT" },
54 	{ NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD, "FIRMWARE IMAGE DOWNLOAD" },
55 	{ NVME_OPC_NS_ATTACHMENT,	"NAMESPACE ATTACHMENT" },
56 	{ NVME_OPC_FORMAT_NVM,		"FORMAT NVM" },
57 	{ NVME_OPC_SECURITY_SEND,	"SECURITY SEND" },
58 	{ NVME_OPC_SECURITY_RECEIVE,	"SECURITY RECEIVE" },
59 	{ 0xFFFF,			"ADMIN COMMAND" }
60 };
61 
62 static const struct nvme_qpair_string io_opcode[] = {
63 	{ NVME_OPC_FLUSH,		"FLUSH" },
64 	{ NVME_OPC_WRITE,		"WRITE" },
65 	{ NVME_OPC_READ,		"READ" },
66 	{ NVME_OPC_WRITE_UNCORRECTABLE, "WRITE UNCORRECTABLE" },
67 	{ NVME_OPC_COMPARE,		"COMPARE" },
68 	{ NVME_OPC_WRITE_ZEROES,	"WRITE ZEROES" },
69 	{ NVME_OPC_DATASET_MANAGEMENT,	"DATASET MANAGEMENT" },
70 	{ NVME_OPC_RESERVATION_REGISTER, "RESERVATION REGISTER" },
71 	{ NVME_OPC_RESERVATION_REPORT,	"RESERVATION REPORT" },
72 	{ NVME_OPC_RESERVATION_ACQUIRE, "RESERVATION ACQUIRE" },
73 	{ NVME_OPC_RESERVATION_RELEASE, "RESERVATION RELEASE" },
74 	{ 0xFFFF,			"IO COMMAND" }
75 };
76 
77 static const struct nvme_qpair_string generic_status[] = {
78 	{ NVME_SC_SUCCESS,			"SUCCESS" },
79 	{ NVME_SC_INVALID_OPCODE,		"INVALID OPCODE" },
80 	{ NVME_SC_INVALID_FIELD,		"INVALID FIELD" },
81 	{ NVME_SC_COMMAND_ID_CONFLICT,		"COMMAND ID CONFLICT" },
82 	{ NVME_SC_DATA_TRANSFER_ERROR,		"DATA TRANSFER ERROR" },
83 	{ NVME_SC_ABORTED_POWER_LOSS,		"ABORTED - POWER LOSS" },
84 	{ NVME_SC_INTERNAL_DEVICE_ERROR,	"INTERNAL DEVICE ERROR" },
85 	{ NVME_SC_ABORTED_BY_REQUEST,		"ABORTED - BY REQUEST" },
86 	{ NVME_SC_ABORTED_SQ_DELETION,		"ABORTED - SQ DELETION" },
87 	{ NVME_SC_ABORTED_FAILED_FUSED,		"ABORTED - FAILED FUSED" },
88 	{ NVME_SC_ABORTED_MISSING_FUSED,	"ABORTED - MISSING FUSED" },
89 	{ NVME_SC_INVALID_NAMESPACE_OR_FORMAT,	"INVALID NAMESPACE OR FORMAT" },
90 	{ NVME_SC_COMMAND_SEQUENCE_ERROR,	"COMMAND SEQUENCE ERROR" },
91 	{ NVME_SC_INVALID_SGL_SEG_DESCRIPTOR,	"INVALID SGL SEGMENT DESCRIPTOR" },
92 	{ NVME_SC_INVALID_NUM_SGL_DESCIRPTORS,	"INVALID NUMBER OF SGL DESCRIPTORS" },
93 	{ NVME_SC_DATA_SGL_LENGTH_INVALID,	"DATA SGL LENGTH INVALID" },
94 	{ NVME_SC_METADATA_SGL_LENGTH_INVALID,	"METADATA SGL LENGTH INVALID" },
95 	{ NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID,	"SGL DESCRIPTOR TYPE INVALID" },
96 	{ NVME_SC_INVALID_CONTROLLER_MEM_BUF,	"INVALID CONTROLLER MEMORY BUFFER" },
97 	{ NVME_SC_INVALID_PRP_OFFSET,		"INVALID PRP OFFSET" },
98 	{ NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED,	"ATOMIC WRITE UNIT EXCEEDED" },
99 	{ NVME_SC_LBA_OUT_OF_RANGE,		"LBA OUT OF RANGE" },
100 	{ NVME_SC_CAPACITY_EXCEEDED,		"CAPACITY EXCEEDED" },
101 	{ NVME_SC_NAMESPACE_NOT_READY,		"NAMESPACE NOT READY" },
102 	{ NVME_SC_RESERVATION_CONFLICT,		"RESERVATION CONFLICT" },
103 	{ NVME_SC_FORMAT_IN_PROGRESS,		"FORMAT IN PROGRESS" },
104 	{ 0xFFFF,				"GENERIC" }
105 };
106 
107 static const struct nvme_qpair_string command_specific_status[] = {
108 	{ NVME_SC_COMPLETION_QUEUE_INVALID,	"INVALID COMPLETION QUEUE" },
109 	{ NVME_SC_INVALID_QUEUE_IDENTIFIER,	"INVALID QUEUE IDENTIFIER" },
110 	{ NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED,	"MAX QUEUE SIZE EXCEEDED" },
111 	{ NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED,	"ABORT CMD LIMIT EXCEEDED" },
112 	{ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED,"ASYNC LIMIT EXCEEDED" },
113 	{ NVME_SC_INVALID_FIRMWARE_SLOT,	"INVALID FIRMWARE SLOT" },
114 	{ NVME_SC_INVALID_FIRMWARE_IMAGE,	"INVALID FIRMWARE IMAGE" },
115 	{ NVME_SC_INVALID_INTERRUPT_VECTOR,	"INVALID INTERRUPT VECTOR" },
116 	{ NVME_SC_INVALID_LOG_PAGE,		"INVALID LOG PAGE" },
117 	{ NVME_SC_INVALID_FORMAT,		"INVALID FORMAT" },
118 	{ NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET,"FIRMWARE REQUIRES CONVENTIONAL RESET" },
119 	{ NVME_SC_INVALID_QUEUE_DELETION,	"INVALID QUEUE DELETION" },
120 	{ NVME_SC_FEATURE_ID_NOT_SAVEABLE,	"FEATURE ID NOT SAVEABLE" },
121 	{ NVME_SC_FEATURE_NOT_CHANGEABLE,	"FEATURE NOT CHANGEABLE" },
122 	{ NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC,"FEATURE NOT NAMESPACE SPECIFIC" },
123 	{ NVME_SC_FIRMWARE_REQ_NVM_RESET,	"FIRMWARE REQUIRES NVM RESET" },
124 	{ NVME_SC_FIRMWARE_REQ_RESET,		"FIRMWARE REQUIRES RESET" },
125 	{ NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION,"FIRMWARE REQUIRES MAX TIME VIOLATION" },
126 	{ NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED,"FIRMWARE ACTIVATION PROHIBITED" },
127 	{ NVME_SC_OVERLAPPING_RANGE,		"OVERLAPPING RANGE" },
128 	{ NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY,"NAMESPACE INSUFFICIENT CAPACITY" },
129 	{ NVME_SC_NAMESPACE_ID_UNAVAILABLE,	"NAMESPACE ID UNAVAILABLE" },
130 	{ NVME_SC_NAMESPACE_ALREADY_ATTACHED,	"NAMESPACE ALREADY ATTACHED" },
131 	{ NVME_SC_NAMESPACE_IS_PRIVATE,		"NAMESPACE IS PRIVATE" },
132 	{ NVME_SC_NAMESPACE_NOT_ATTACHED,	"NAMESPACE NOT ATTACHED" },
133 	{ NVME_SC_THINPROVISIONING_NOT_SUPPORTED,"THINPROVISIONING NOT SUPPORTED" },
134 	{ NVME_SC_CONTROLLER_LIST_INVALID,	"CONTROLLER LIST INVALID" },
135 	{ NVME_SC_CONFLICTING_ATTRIBUTES,	"CONFLICTING ATTRIBUTES" },
136 	{ NVME_SC_INVALID_PROTECTION_INFO,	"INVALID PROTECTION INFO" },
137 	{ NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE,	"WRITE TO RO PAGE" },
138 	{ 0xFFFF,				"COMMAND SPECIFIC" }
139 };
140 
141 static const struct nvme_qpair_string media_error_status[] = {
142 	{ NVME_SC_WRITE_FAULTS, 		"WRITE FAULTS" },
143 	{ NVME_SC_UNRECOVERED_READ_ERROR, 	"UNRECOVERED READ ERROR" },
144 	{ NVME_SC_GUARD_CHECK_ERROR, 		"GUARD CHECK ERROR" },
145 	{ NVME_SC_APPLICATION_TAG_CHECK_ERROR, 	"APPLICATION TAG CHECK ERROR" },
146 	{ NVME_SC_REFERENCE_TAG_CHECK_ERROR, 	"REFERENCE TAG CHECK ERROR" },
147 	{ NVME_SC_COMPARE_FAILURE, 		"COMPARE FAILURE" },
148 	{ NVME_SC_ACCESS_DENIED, 		"ACCESS DENIED" },
149 	{ NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK, "DEALLOCATED OR UNWRITTEN BLOCK" },
150 	{ 0xFFFF, 				"MEDIA ERROR" }
151 };
152 
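/*
 * Queue pair 0 is always the admin queue; every other ID is an I/O queue.
 */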
153 static inline bool nvme_qpair_is_admin_queue(struct nvme_qpair *qpair)
154 {
155 	return qpair->id == 0;
156 }
157 
158 static inline bool nvme_qpair_is_io_queue(struct nvme_qpair *qpair)
159 {
160 	return qpair->id != 0;
161 }
162 
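/*
 * Map a value to its display string. The 0xFFFF entry terminates each
 * table and doubles as the fallback string for unknown values.
 */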
163 static const char *nvme_qpair_get_string(const struct nvme_qpair_string *strings,
164 					uint16_t value)
165 {
166 	const struct nvme_qpair_string *entry;
167 
168 	entry = strings;
169 
170 	while (entry->value != 0xFFFF) {
171 		if (entry->value == value)
172 			return entry->str;
173 		entry++;
174 	}
175 	return entry->str;
176 }
177 
178 static void nvme_qpair_admin_qpair_print_command(struct nvme_qpair *qpair,
179 						 struct nvme_cmd *cmd)
180 {
181 	nvme_info("%s (%02x) sqid:%d cid:%d nsid:%x cdw10:%08x cdw11:%08x\n",
182 		  nvme_qpair_get_string(admin_opcode, cmd->opc), cmd->opc,
183 		  qpair->id, cmd->cid,
184 		  cmd->nsid, cmd->cdw10, cmd->cdw11);
185 }
186 
187 static void nvme_qpair_io_qpair_print_command(struct nvme_qpair *qpair,
188 					      struct nvme_cmd *cmd)
189 {
190 	nvme_assert(qpair != NULL, "print_command: qpair == NULL\n");
191 	nvme_assert(cmd != NULL, "print_command: cmd == NULL\n");
192 
193 	switch ((int)cmd->opc) {
194 	case NVME_OPC_WRITE:
195 	case NVME_OPC_READ:
196 	case NVME_OPC_WRITE_UNCORRECTABLE:
197 	case NVME_OPC_COMPARE:
198 		nvme_info("%s sqid:%d cid:%d nsid:%d lba:%llu len:%d\n",
199 			  nvme_qpair_get_string(io_opcode, cmd->opc),
200 			  qpair->id, cmd->cid, cmd->nsid,
201 			  ((unsigned long long)cmd->cdw11 << 32) + cmd->cdw10,
202 			  (cmd->cdw12 & 0xFFFF) + 1);
203 		break;
204 	case NVME_OPC_FLUSH:
205 	case NVME_OPC_DATASET_MANAGEMENT:
206 		nvme_info("%s sqid:%d cid:%d nsid:%d\n",
207 			  nvme_qpair_get_string(io_opcode, cmd->opc),
208 			  qpair->id, cmd->cid, cmd->nsid);
209 		break;
210 	default:
211 		nvme_info("%s (%02x) sqid:%d cid:%d nsid:%d\n",
212 			  nvme_qpair_get_string(io_opcode, cmd->opc),
213 			  cmd->opc, qpair->id, cmd->cid, cmd->nsid);
214 		break;
215 	}
216 }
217 
218 static void nvme_qpair_print_command(struct nvme_qpair *qpair,
219 				     struct nvme_cmd *cmd)
220 {
221 	nvme_assert(qpair != NULL, "qpair cannot be NULL\n");
222 	nvme_assert(cmd != NULL, "cmd cannot be NULL\n");
223 
224 	if (nvme_qpair_is_admin_queue(qpair))
225 		return nvme_qpair_admin_qpair_print_command(qpair, cmd);
226 
227 	return nvme_qpair_io_qpair_print_command(qpair, cmd);
228 }
229 
230 static const char *get_status_string(uint16_t sct, uint16_t sc)
231 {
232 	const struct nvme_qpair_string *entry;
233 
234 	switch (sct) {
235 	case NVME_SCT_GENERIC:
236 		entry = generic_status;
237 		break;
238 	case NVME_SCT_COMMAND_SPECIFIC:
239 		entry = command_specific_status;
240 		break;
241 	case NVME_SCT_MEDIA_ERROR:
242 		entry = media_error_status;
243 		break;
244 	case NVME_SCT_VENDOR_SPECIFIC:
245 		return "VENDOR SPECIFIC";
246 	default:
247 		return "RESERVED";
248 	}
249 
250 	return nvme_qpair_get_string(entry, sc);
251 }
252 
253 static void nvme_qpair_print_completion(struct nvme_qpair *qpair,
254 					struct nvme_cpl *cpl)
255 {
256 	nvme_info("Cpl: %s (%02x/%02x) sqid:%d cid:%d "
257 		  "cdw0:%x sqhd:%04x p:%x m:%x dnr:%x\n",
258 		  get_status_string(cpl->status.sct, cpl->status.sc),
259 		  cpl->status.sct,
260 		  cpl->status.sc,
261 		  cpl->sqid,
262 		  cpl->cid,
263 		  cpl->cdw0,
264 		  cpl->sqhd,
265 		  cpl->status.p,
266 		  cpl->status.m,
267 		  cpl->status.dnr);
268 }
269 
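/*
 * Decide whether a failed command should be resubmitted. Only the transient
 * generic status codes (namespace not ready, format in progress) are
 * retried, and only when the Do Not Retry bit is clear.
 */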
270 static bool nvme_qpair_completion_retry(const struct nvme_cpl *cpl)
271 {
272 	/*
273 	 * TODO: the spec is not clear on how commands aborted due to
274 	 *  TLER will be marked, so for now NAMESPACE_NOT_READY and
275 	 *  FORMAT_IN_PROGRESS are the only cases where we look at
276 	 *  the DNR bit.
277 	 */
278 	switch ((int)cpl->status.sct) {
279 	case NVME_SCT_GENERIC:
280 		switch ((int)cpl->status.sc) {
281 		case NVME_SC_NAMESPACE_NOT_READY:
282 		case NVME_SC_FORMAT_IN_PROGRESS:
283 			if (cpl->status.dnr)
284 				return false;
285 			return true;
286 		case NVME_SC_INVALID_OPCODE:
287 		case NVME_SC_INVALID_FIELD:
288 		case NVME_SC_COMMAND_ID_CONFLICT:
289 		case NVME_SC_DATA_TRANSFER_ERROR:
290 		case NVME_SC_ABORTED_POWER_LOSS:
291 		case NVME_SC_INTERNAL_DEVICE_ERROR:
292 		case NVME_SC_ABORTED_BY_REQUEST:
293 		case NVME_SC_ABORTED_SQ_DELETION:
294 		case NVME_SC_ABORTED_FAILED_FUSED:
295 		case NVME_SC_ABORTED_MISSING_FUSED:
296 		case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
297 		case NVME_SC_COMMAND_SEQUENCE_ERROR:
298 		case NVME_SC_LBA_OUT_OF_RANGE:
299 		case NVME_SC_CAPACITY_EXCEEDED:
300 		default:
301 			return false;
302 		}
303 	case NVME_SCT_COMMAND_SPECIFIC:
304 	case NVME_SCT_MEDIA_ERROR:
305 	case NVME_SCT_VENDOR_SPECIFIC:
306 	default:
307 		return false;
308 	}
309 }
310 
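/*
 * Initialize a tracker, recording the bus address of its embedded PRP/SGL
 * scratch area so it can later be referenced from PRP2 or SGL1.
 */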
311 static void nvme_qpair_construct_tracker(struct nvme_tracker *tr,
312 					 uint16_t cid, uint64_t phys_addr)
313 {
314 	tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
315 	tr->cid = cid;
316 	tr->active = false;
317 }
318 
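/*
 * Copy a 64-byte command into a submission queue slot, using 32-byte or
 * 16-byte vector moves when AVX or SSE2 is available.
 */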
319 static inline void nvme_qpair_copy_command(struct nvme_cmd *dst,
320 					   const struct nvme_cmd *src)
321 {
322 	/* dst and src are known to be non-overlapping and 64-byte aligned. */
323 #if defined(__AVX__)
324 	__m256i *d256 = (__m256i *)dst;
325 	const __m256i *s256 = (const __m256i *)src;
326 
327 	_mm256_store_si256(&d256[0], _mm256_load_si256(&s256[0]));
328 	_mm256_store_si256(&d256[1], _mm256_load_si256(&s256[1]));
329 #elif defined(__SSE2__)
330 	__m128i *d128 = (__m128i *)dst;
331 	const __m128i *s128 = (const __m128i *)src;
332 
333 	_mm_store_si128(&d128[0], _mm_load_si128(&s128[0]));
334 	_mm_store_si128(&d128[1], _mm_load_si128(&s128[1]));
335 	_mm_store_si128(&d128[2], _mm_load_si128(&s128[2]));
336 	_mm_store_si128(&d128[3], _mm_load_si128(&s128[3]));
337 #else
338 	*dst = *src;
339 #endif
340 }
341 
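/*
 * Post a tracker's command: mark the tracker active, copy the command into
 * the submission queue slot at the tail, advance the tail with wrap-around,
 * and ring the SQ tail doorbell after a write memory barrier.
 */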
342 static void nvme_qpair_submit_tracker(struct nvme_qpair *qpair,
343 				      struct nvme_tracker *tr)
344 {
345 	struct nvme_request *req = tr->req;
346 
347 	/*
348 	 * Set the tracker active and copy its command
349 	 * to the submission queue.
350 	 */
351 	nvme_debug("qpair %d: Submit command, tail %d, cid %d / %d\n",
352 		   qpair->id,
353 		   (int)qpair->sq_tail,
354 		   (int)tr->cid,
355 		   (int)tr->req->cmd.cid);
356 
357 	qpair->tr[tr->cid].active = true;
358 	nvme_qpair_copy_command(&qpair->cmd[qpair->sq_tail], &req->cmd);
359 
360 	if (++qpair->sq_tail == qpair->entries)
361 		qpair->sq_tail = 0;
362 
363 	nvme_wmb();
364 	nvme_mmio_write_4(qpair->sq_tdbl, qpair->sq_tail);
365 }
366 
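/*
 * Process one completion: optionally log failed commands, resubmit
 * retryable failures up to NVME_MAX_RETRY_COUNT, otherwise invoke the
 * request callback and free the request, then return the tracker to the
 * free list.
 */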
367 static void nvme_qpair_complete_tracker(struct nvme_qpair *qpair,
368 					struct nvme_tracker *tr,
369 					struct nvme_cpl *cpl,
370 					bool print_on_error)
371 {
372 	struct nvme_request *req = tr->req;
373 	bool retry, error;
374 
375 	if (!req) {
376 		nvme_crit("tracker has no request\n");
377 		qpair->tr[cpl->cid].active = false;
378 		goto done;
379 	}
380 
381 	error = nvme_cpl_is_error(cpl);
382 	retry = error && nvme_qpair_completion_retry(cpl) &&
383 		(req->retries < NVME_MAX_RETRY_COUNT);
384 	if (error && print_on_error) {
385 		nvme_qpair_print_command(qpair, &req->cmd);
386 		nvme_qpair_print_completion(qpair, cpl);
387 	}
388 
389 	qpair->tr[cpl->cid].active = false;
390 
391 	if (cpl->cid != req->cmd.cid)
392 		nvme_crit("cpl and command CID mismatch (%d / %d)\n",
393 			  (int)cpl->cid, (int)req->cmd.cid);
394 
395 	if (retry) {
396 		req->retries++;
397 		nvme_qpair_submit_tracker(qpair, tr);
398 		return;
399 	}
400 
401 	if (req->cb_fn)
402 		req->cb_fn(req->cb_arg, cpl);
403 
404 	nvme_request_free_locked(req);
405 
406 done:
407 	tr->req = NULL;
408 
409 	LIST_REMOVE(tr, list);
410 	LIST_INSERT_HEAD(&qpair->free_tr, tr, list);
411 }
412 
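/*
 * Resubmit requests that were queued while no tracker was available. The
 * qpair lock is dropped around each submission because
 * nvme_qpair_submit_request() takes the lock itself.
 */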
413 static void nvme_qpair_submit_queued_requests(struct nvme_qpair *qpair)
414 {
415 	pthread_mutex_lock(&qpair->lock);
416 
417 	/*
418 	 * If the controller is in the middle of a reset, don't
419 	 * try to submit queued requests - let the reset logic
420 	 * handle that instead.
421 	 */
422 	while (!STAILQ_EMPTY(&qpair->queued_req) && !qpair->ctrlr->resetting) {
423 		struct nvme_request *req = STAILQ_FIRST(&qpair->queued_req);
424 		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
425 
426 		pthread_mutex_unlock(&qpair->lock);
427 		nvme_qpair_submit_request(qpair, req);
428 		pthread_mutex_lock(&qpair->lock);
429 	}
430 
431 	pthread_mutex_unlock(&qpair->lock);
432 }
433 
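/*
 * Complete a tracker with a locally built completion entry, used for
 * commands that the controller will never complete (aborts during reset,
 * queue teardown, bad physical addresses).
 */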
434 static void nvme_qpair_manual_complete_tracker(struct nvme_qpair *qpair,
435 					       struct nvme_tracker *tr,
436 					       uint32_t sct,
437 					       uint32_t sc,
438 					       uint32_t dnr,
439 					       bool print_on_error)
440 {
441 	struct nvme_cpl	cpl;
442 
443 	memset(&cpl, 0, sizeof(cpl));
444 	cpl.sqid = qpair->id;
445 	cpl.cid = tr->cid;
446 	cpl.status.sct = sct;
447 	cpl.status.sc = sc;
448 	cpl.status.dnr = dnr;
449 
450 	nvme_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
451 }
452 
453 static void nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
454 					       struct nvme_request *req,
455 					       uint32_t sct, uint32_t sc,
456 					       bool print_on_error)
457 {
458 	struct nvme_cpl	cpl;
459 	bool error;
460 
461 	memset(&cpl, 0, sizeof(cpl));
462 	cpl.sqid = qpair->id;
463 	cpl.status.sct = sct;
464 	cpl.status.sc = sc;
465 
466 	error = nvme_cpl_is_error(&cpl);
467 
468 	if (error && print_on_error) {
469 		nvme_qpair_print_command(qpair, &req->cmd);
470 		nvme_qpair_print_completion(qpair, &cpl);
471 	}
472 
473 	if (req->cb_fn)
474 		req->cb_fn(req->cb_arg, &cpl);
475 
476 	nvme_request_free_locked(req);
477 }
478 
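/*
 * Manually complete all outstanding Asynchronous Event Requests. AERs can
 * stay outstanding indefinitely, so they must be aborted explicitly when
 * the queue is disabled or destroyed. The list walk restarts after each
 * abort because completing a tracker unlinks it from the outstanding list.
 */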
479 static void nvme_qpair_abort_aers(struct nvme_qpair *qpair)
480 {
481 	struct nvme_tracker *tr;
482 
483 	tr = LIST_FIRST(&qpair->outstanding_tr);
484 	while (tr != NULL) {
485 		nvme_assert(tr->req != NULL,
486 			    "tr->req == NULL in abort_aers\n");
487 		if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) {
488 			nvme_qpair_manual_complete_tracker(qpair, tr,
489 					      NVME_SCT_GENERIC,
490 					      NVME_SC_ABORTED_SQ_DELETION,
491 					      0, false);
492 			tr = LIST_FIRST(&qpair->outstanding_tr);
493 			continue;
494 		}
495 		tr = LIST_NEXT(tr, list);
496 	}
497 }
498 
499 static inline void _nvme_qpair_admin_qpair_destroy(struct nvme_qpair *qpair)
500 {
501 	nvme_qpair_abort_aers(qpair);
502 }
503 
504 static inline void _nvme_qpair_req_bad_phys(struct nvme_qpair *qpair,
505 					    struct nvme_tracker *tr)
506 {
507 	/*
508 	 * Bad vtophys translation, so abort this request
509 	 * and return immediately, without retry.
510 	 */
511 	nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
512 					   NVME_SC_INVALID_FIELD,
513 					   1, true);
514 }
515 
516 /*
517  * Build PRP list describing physically contiguous payload buffer.
518  */
519 static int _nvme_qpair_build_contig_request(struct nvme_qpair *qpair,
520 					    struct nvme_request *req,
521 					    struct nvme_tracker *tr)
522 {
523 	uint64_t phys_addr;
524 	void *seg_addr;
525 	uint32_t nseg, cur_nseg, modulo, unaligned;
526 	void *md_payload;
527 	void *payload = req->payload.u.contig + req->payload_offset;
528 
529 	phys_addr = nvme_mem_vtophys(payload);
530 	if (phys_addr == NVME_VTOPHYS_ERROR) {
531 		_nvme_qpair_req_bad_phys(qpair, tr);
532 		return -1;
533 	}
534 	nseg = req->payload_size >> PAGE_SHIFT;
535 	modulo = req->payload_size & (PAGE_SIZE - 1);
536 	unaligned = phys_addr & (PAGE_SIZE - 1);
537 	if (modulo || unaligned)
538 		nseg += 1 + ((modulo + unaligned - 1) >> PAGE_SHIFT);
539 
540 	if (req->payload.md) {
541 		md_payload = req->payload.md + req->md_offset;
542 		tr->req->cmd.mptr = nvme_mem_vtophys(md_payload);
543 		if (tr->req->cmd.mptr == NVME_VTOPHYS_ERROR) {
544 			_nvme_qpair_req_bad_phys(qpair, tr);
545 			return -1;
546 		}
547 	}
548 
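	/*
	 * PRP1 points at the first (possibly unaligned) page of the buffer.
	 * PRP2 holds the second page for two-page transfers, or the bus
	 * address of the PRP list stored in the tracker for larger ones.
	 */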
549 	tr->req->cmd.psdt = NVME_PSDT_PRP;
550 	tr->req->cmd.dptr.prp.prp1 = phys_addr;
551 	if (nseg == 2) {
552 		seg_addr = payload + PAGE_SIZE - unaligned;
553 		tr->req->cmd.dptr.prp.prp2 = nvme_mem_vtophys(seg_addr);
554 	} else if (nseg > 2) {
555 		cur_nseg = 1;
556 		tr->req->cmd.dptr.prp.prp2 = (uint64_t)tr->prp_sgl_bus_addr;
557 		while (cur_nseg < nseg) {
558 			seg_addr = payload + cur_nseg * PAGE_SIZE - unaligned;
559 			phys_addr = nvme_mem_vtophys(seg_addr);
560 			if (phys_addr == NVME_VTOPHYS_ERROR) {
561 				_nvme_qpair_req_bad_phys(qpair, tr);
562 				return -1;
563 			}
564 			tr->u.prp[cur_nseg - 1] = phys_addr;
565 			cur_nseg++;
566 		}
567 	}
568 
569 	return 0;
570 }
571 
572 /*
573  * Build SGL list describing scattered payload buffer.
574  */
575 static int _nvme_qpair_build_hw_sgl_request(struct nvme_qpair *qpair,
576 					    struct nvme_request *req,
577 					    struct nvme_tracker *tr)
578 {
579 	struct nvme_sgl_descriptor *sgl;
580 	uint64_t phys_addr;
581 	uint32_t remaining_transfer_len, length, nseg = 0;
582 	int ret;
583 
584 	/*
585 	 * Build scattered payloads.
586 	 */
587 	nvme_assert(req->payload_size != 0,
588 		    "cannot build SGL for zero-length transfer\n");
589 	nvme_assert(req->payload.type == NVME_PAYLOAD_TYPE_SGL,
590 		    "sgl payload type required\n");
591 	nvme_assert(req->payload.u.sgl.reset_sgl_fn != NULL,
592 		    "sgl reset callback required\n");
593 	nvme_assert(req->payload.u.sgl.next_sge_fn != NULL,
594 		    "sgl callback required\n");
595 	req->payload.u.sgl.reset_sgl_fn(req->payload.u.sgl.cb_arg,
596 					req->payload_offset);
597 
598 	sgl = tr->u.sgl;
599 	req->cmd.psdt = NVME_PSDT_SGL_MPTR_SGL;
600 	req->cmd.dptr.sgl1.unkeyed.subtype = 0;
601 
602 	remaining_transfer_len = req->payload_size;
603 
604 	while (remaining_transfer_len > 0) {
605 
606 		if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
607 			_nvme_qpair_req_bad_phys(qpair, tr);
608 			return -1;
609 		}
610 
611 		ret = req->payload.u.sgl.next_sge_fn(req->payload.u.sgl.cb_arg,
612 						     &phys_addr, &length);
613 		if (ret != 0) {
614 			_nvme_qpair_req_bad_phys(qpair, tr);
615 			return ret;
616 		}
617 
618 		length = nvme_min(remaining_transfer_len, length);
619 		remaining_transfer_len -= length;
620 
621 		sgl->unkeyed.type = NVME_SGL_TYPE_DATA_BLOCK;
622 		sgl->unkeyed.length = length;
623 		sgl->address = phys_addr;
624 		sgl->unkeyed.subtype = 0;
625 
626 		sgl++;
627 		nseg++;
628 
629 	}
630 
631 	if (nseg == 1) {
632 		/*
633 		 * The whole transfer can be described by a single Scatter
634 		 * Gather List descriptor. Use the special case described
635 		 * by the spec where SGL1's type is Data Block.
636 		 * This means the SGL in the tracker is not used at all,
637 		 * so copy the first (and only) SGL element into SGL1.
638 		 */
639 		req->cmd.dptr.sgl1.unkeyed.type = NVME_SGL_TYPE_DATA_BLOCK;
640 		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
641 		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
642 	} else {
643 		/* For now, only a single SGL segment is supported per command. */
644 		req->cmd.dptr.sgl1.unkeyed.type = NVME_SGL_TYPE_LAST_SEGMENT;
645 		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
646 		req->cmd.dptr.sgl1.unkeyed.length =
647 			nseg * sizeof(struct nvme_sgl_descriptor);
648 	}
649 
650 	return 0;
651 }
652 
653 /*
654  * Build Physical Region Page list describing scattered payload buffer.
655  */
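/*
 * The scatter list provided by the next_sge callback is flattened into a
 * single PRP list. Every SGE except the first must start on a page boundary
 * and every SGE except the last must end on one (asserted below).
 */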
656 static int _nvme_qpair_build_prps_sgl_request(struct nvme_qpair *qpair,
657 					      struct nvme_request *req,
658 					      struct nvme_tracker *tr)
659 {
660 	uint64_t phys_addr, prp2 = 0;
661 	uint32_t data_transferred, remaining_transfer_len, length;
662 	uint32_t nseg, cur_nseg, total_nseg = 0, last_nseg = 0;
663 	uint32_t modulo, unaligned, sge_count = 0;
664 	int ret;
665 
666 	/*
667 	 * Build scattered payloads.
668 	 */
669 	nvme_assert(req->payload.type == NVME_PAYLOAD_TYPE_SGL,
670 		    "sgl payload type required\n");
671 	nvme_assert(req->payload.u.sgl.reset_sgl_fn != NULL,
672 		    "sgl reset callback required\n");
673 	req->payload.u.sgl.reset_sgl_fn(req->payload.u.sgl.cb_arg,
674 					req->payload_offset);
675 
676 	remaining_transfer_len = req->payload_size;
677 
678 	while (remaining_transfer_len > 0) {
679 
680 		nvme_assert(req->payload.u.sgl.next_sge_fn != NULL,
681 			    "sgl callback required\n");
682 
683 		ret = req->payload.u.sgl.next_sge_fn(req->payload.u.sgl.cb_arg,
684 						    &phys_addr, &length);
685 		if (ret != 0) {
686 			_nvme_qpair_req_bad_phys(qpair, tr);
687 			return -1;
688 		}
689 
690 		nvme_assert((phys_addr & 0x3) == 0, "address must be dword aligned\n");
691 		nvme_assert((length >= remaining_transfer_len) || ((phys_addr + length) % PAGE_SIZE) == 0,
692 			"All SGEs except last must end on a page boundary\n");
693 		nvme_assert((sge_count == 0) || (phys_addr % PAGE_SIZE) == 0,
694 			"All SGEs except first must start on a page boundary\n");
695 
696 		data_transferred = nvme_min(remaining_transfer_len, length);
697 
698 		nseg = data_transferred >> PAGE_SHIFT;
699 		modulo = data_transferred & (PAGE_SIZE - 1);
700 		unaligned = phys_addr & (PAGE_SIZE - 1);
701 		if (modulo || unaligned)
702 			nseg += 1 + ((modulo + unaligned - 1) >> PAGE_SHIFT);
703 
704 		if (total_nseg == 0) {
705 			req->cmd.psdt = NVME_PSDT_PRP;
706 			req->cmd.dptr.prp.prp1 = phys_addr;
707 		}
708 
709 		total_nseg += nseg;
710 		sge_count++;
711 		remaining_transfer_len -= data_transferred;
712 
713 		if (total_nseg == 2) {
714 			if (sge_count == 1)
715 				tr->req->cmd.dptr.prp.prp2 = phys_addr +
716 					PAGE_SIZE - unaligned;
717 			else if (sge_count == 2)
718 				tr->req->cmd.dptr.prp.prp2 = phys_addr;
719 			/* save prp2 value */
720 			prp2 = tr->req->cmd.dptr.prp.prp2;
721 		} else if (total_nseg > 2) {
722 			if (sge_count == 1)
723 				cur_nseg = 1;
724 			else
725 				cur_nseg = 0;
726 
727 			tr->req->cmd.dptr.prp.prp2 =
728 				(uint64_t)tr->prp_sgl_bus_addr;
729 
730 			while (cur_nseg < nseg) {
731 				if (prp2) {
732 					tr->u.prp[0] = prp2;
733 					tr->u.prp[last_nseg + 1] = phys_addr +
734 						cur_nseg * PAGE_SIZE - unaligned;
735 				} else {
736 					tr->u.prp[last_nseg] = phys_addr +
737 						cur_nseg * PAGE_SIZE - unaligned;
738 				}
739 				last_nseg++;
740 				cur_nseg++;
741 			}
742 		}
743 	}
744 
745 	return 0;
746 }
747 
748 static void _nvme_qpair_admin_qpair_enable(struct nvme_qpair *qpair)
749 {
750 	struct nvme_tracker *tr, *tr_temp;
751 
752 	/*
753 	 * Manually abort each outstanding admin command.  Do not retry
754 	 * admin commands found here, since they will be left over from
755 	 * a controller reset and it's likely the context in which the
756 	 * command was issued no longer applies.
757 	 */
758 	LIST_FOREACH_SAFE(tr, &qpair->outstanding_tr, list, tr_temp) {
759 		nvme_info("Aborting outstanding admin command\n");
760 		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
761 						   NVME_SC_ABORTED_BY_REQUEST,
762 						   1 /* do not retry */, true);
763 	}
764 
765 	qpair->enabled = true;
766 }
767 
768 static void _nvme_qpair_io_qpair_enable(struct nvme_qpair *qpair)
769 {
770 	struct nvme_tracker *tr, *temp;
771 	struct nvme_request *req;
772 
773 	qpair->enabled = true;
774 
775 	qpair->ctrlr->enabled_io_qpairs++;
776 
777 	/* Manually abort each queued I/O. */
778 	while (!STAILQ_EMPTY(&qpair->queued_req)) {
779 		req = STAILQ_FIRST(&qpair->queued_req);
780 		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
781 		nvme_info("Aborting queued I/O command\n");
782 		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
783 						   NVME_SC_ABORTED_BY_REQUEST,
784 						   true);
785 	}
786 
787 	/* Manually abort each outstanding I/O. */
788 	LIST_FOREACH_SAFE(tr, &qpair->outstanding_tr, list, temp) {
789 		nvme_info("Aborting outstanding I/O command\n");
790 		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
791 						   NVME_SC_ABORTED_BY_REQUEST,
792 						   0, true);
793 	}
794 }
795 
796 static inline void _nvme_qpair_admin_qpair_disable(struct nvme_qpair *qpair)
797 {
798 	qpair->enabled = false;
799 	nvme_qpair_abort_aers(qpair);
800 }
801 
802 static inline void _nvme_qpair_io_qpair_disable(struct nvme_qpair *qpair)
803 {
804 	qpair->enabled = false;
805 
806 	qpair->ctrlr->enabled_io_qpairs--;
807 }
808 
809 /*
810  * Reserve room for the submission queue
811  * in the controller memory buffer
812  */
813 static int nvme_ctrlr_reserve_sq_in_cmb(struct nvme_ctrlr *ctrlr,
814 					uint16_t entries,
815 					uint64_t aligned, uint64_t *offset)
816 {
817 	uint64_t round_offset;
818 	const uint64_t length = entries * sizeof(struct nvme_cmd);
819 
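	/*
	 * Round the current CMB offset up to the next multiple of the
	 * (power-of-two) alignment, then check that the queue still fits.
	 */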
820 	round_offset = ctrlr->cmb_current_offset;
821 	round_offset = (round_offset + (aligned - 1)) & ~(aligned - 1);
822 
823 	if (round_offset + length > ctrlr->cmb_size)
824 		return -1;
825 
826 	*offset = round_offset;
827 	ctrlr->cmb_current_offset = round_offset + length;
828 
829 	return 0;
830 }
831 
832 /*
833  * Initialize a queue pair on the host side.
834  */
835 int nvme_qpair_construct(struct nvme_ctrlr *ctrlr, struct nvme_qpair *qpair,
836 			 enum nvme_qprio qprio,
837 			 uint16_t entries, uint16_t trackers)
838 {
839 	volatile uint32_t *doorbell_base;
840 	struct nvme_tracker *tr;
841 	uint64_t offset;
842 	unsigned long phys_addr = 0;
843 	uint16_t i;
844 	int ret;
845 
846 	nvme_assert(entries != 0, "Invalid number of entries\n");
847 	nvme_assert(trackers != 0, "Invalid trackers\n");
848 
849 	pthread_mutex_init(&qpair->lock, NULL);
850 
851 	qpair->entries = entries;
852 	qpair->trackers = trackers;
853 	qpair->qprio = qprio;
854 	qpair->sq_in_cmb = false;
855 	qpair->ctrlr = ctrlr;
856 
857 	if (ctrlr->opts.use_cmb_sqs) {
858 		/*
859 		 * Reserve room for the submission queue in ctrlr
860 		 * memory buffer.
861 		 */
862 		ret = nvme_ctrlr_reserve_sq_in_cmb(ctrlr, entries,
863 						   PAGE_SIZE,
864 						   &offset);
865 		if (ret == 0) {
866 
867 			qpair->cmd = ctrlr->cmb_bar_virt_addr + offset;
868 			qpair->cmd_bus_addr = ctrlr->cmb_bar_phys_addr + offset;
869 			qpair->sq_in_cmb = true;
870 
871 			nvme_debug("Allocated qpair %d cmd in cmb at %p / 0x%llx\n",
872 				   qpair->id,
873 				   qpair->cmd, qpair->cmd_bus_addr);
874 
875 		}
876 	}
877 
878 	if (qpair->sq_in_cmb == false) {
879 
880 		qpair->cmd =
881 			nvme_mem_alloc_node(sizeof(struct nvme_cmd) * entries,
882 				    PAGE_SIZE, NVME_NODE_ID_ANY,
883 				    (unsigned long *) &qpair->cmd_bus_addr);
884 		if (!qpair->cmd) {
885 			nvme_err("Allocate qpair commands failed\n");
886 			goto fail;
887 		}
888 		memset(qpair->cmd, 0, sizeof(struct nvme_cmd) * entries);
889 
890 		nvme_debug("Allocated qpair %d cmd %p / 0x%llx\n",
891 			   qpair->id,
892 			   qpair->cmd, qpair->cmd_bus_addr);
893 	}
894 
895 	qpair->cpl = nvme_mem_alloc_node(sizeof(struct nvme_cpl) * entries,
896 				 PAGE_SIZE, NVME_NODE_ID_ANY,
897 				 (unsigned long *) &qpair->cpl_bus_addr);
898 	if (!qpair->cpl) {
899 		nvme_err("Allocate qpair completions failed\n");
900 		goto fail;
901 	}
902 	memset(qpair->cpl, 0, sizeof(struct nvme_cpl) * entries);
903 
904 	nvme_debug("Allocated qpair %d cpl at %p / 0x%llx\n",
905 		   qpair->id,
906 		   qpair->cpl,
907 		   qpair->cpl_bus_addr);
908 
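	/*
	 * Each queue pair owns two doorbells: the SQ tail doorbell at index
	 * 2 * id and the CQ head doorbell at index 2 * id + 1, spaced by the
	 * controller's doorbell stride.
	 */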
909 	doorbell_base = &ctrlr->regs->doorbell[0].sq_tdbl;
910 	qpair->sq_tdbl = doorbell_base +
911 		(2 * qpair->id + 0) * ctrlr->doorbell_stride_u32;
912 	qpair->cq_hdbl = doorbell_base +
913 		(2 * qpair->id + 1) * ctrlr->doorbell_stride_u32;
914 
915 	LIST_INIT(&qpair->free_tr);
916 	LIST_INIT(&qpair->outstanding_tr);
917 	STAILQ_INIT(&qpair->free_req);
918 	STAILQ_INIT(&qpair->queued_req);
919 
920 	/* Request pool */
921 	if (nvme_request_pool_construct(qpair)) {
922 		nvme_err("Create request pool failed\n");
923 		goto fail;
924 	}
925 
926 	/*
927 	 * Reserve space for all of the trackers in a single allocation.
928 	 * struct nvme_tracker must be padded so that its size is already
929 	 * a power of 2. This ensures the PRP list embedded in the nvme_tracker
930 	 * object will not span a 4KB boundary, while allowing access to
931 	 * trackers in tr[] via normal array indexing.
932 	 */
933 	qpair->tr = nvme_mem_alloc_node(sizeof(struct nvme_tracker) * trackers,
934 					sizeof(struct nvme_tracker),
935 					NVME_NODE_ID_ANY, &phys_addr);
936 	if (!qpair->tr) {
937 		nvme_err("Allocate request trackers failed\n");
938 		goto fail;
939 	}
940 	memset(qpair->tr, 0, sizeof(struct nvme_tracker) * trackers);
941 
942 	nvme_debug("Allocated qpair %d trackers at %p / 0x%lx\n",
943 		   qpair->id, qpair->tr, phys_addr);
944 
945 	for (i = 0; i < trackers; i++) {
946 		tr = &qpair->tr[i];
947 		nvme_qpair_construct_tracker(tr, i, phys_addr);
948 		LIST_INSERT_HEAD(&qpair->free_tr, tr, list);
949 		phys_addr += sizeof(struct nvme_tracker);
950 	}
951 
952 	nvme_qpair_reset(qpair);
953 
954 	return 0;
955 
956 fail:
957 	nvme_qpair_destroy(qpair);
958 
959 	return -1;
960 }
961 
962 void nvme_qpair_destroy(struct nvme_qpair *qpair)
963 {
964 	if (!qpair->ctrlr)
965 		return; /* Not initialized. */
966 
967 	if (nvme_qpair_is_admin_queue(qpair))
968 		_nvme_qpair_admin_qpair_destroy(qpair);
969 
970 	if (qpair->cmd && !qpair->sq_in_cmb) {
971 		nvme_free(qpair->cmd);
972 		qpair->cmd = NULL;
973 	}
974 	if (qpair->cpl) {
975 		nvme_free(qpair->cpl);
976 		qpair->cpl = NULL;
977 	}
978 	if (qpair->tr) {
979 		nvme_free(qpair->tr);
980 		qpair->tr = NULL;
981 	}
982 	nvme_request_pool_destroy(qpair);
983 
984 	qpair->ctrlr = NULL;
985 
986 	pthread_mutex_destroy(&qpair->lock);
987 }
988 
989 static bool nvme_qpair_enabled(struct nvme_qpair *qpair)
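/*
 * Re-enable the qpair if it was disabled and the controller is no longer
 * resetting.
 */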
990 {
991 	if (!qpair->enabled && !qpair->ctrlr->resetting)
992 		nvme_qpair_enable(qpair);
993 
994 	return qpair->enabled;
995 }
996 
997 int nvme_qpair_submit_request(struct nvme_qpair *qpair,
998 			      struct nvme_request *req)
999 {
1000 	struct nvme_tracker *tr;
1001 	struct nvme_request *child_req, *tmp;
1002 	struct nvme_ctrlr *ctrlr = qpair->ctrlr;
1003 	bool child_req_failed = false;
1004 	int ret = 0;
1005 
1006 	if (ctrlr->failed) {
1007 		nvme_request_free(req);
1008 		return ENXIO;
1009 	}
1010 
1011 	nvme_qpair_enabled(qpair);
1012 
1013 	if (req->child_reqs) {
1014 
1015 		/*
1016 		 * This is a split (parent) request. Submit all of its
1017 		 * children, but not the parent request itself, since the
1018 		 * parent is the original unsplit request.
1019 		 */
1020 		TAILQ_FOREACH_SAFE(child_req, &req->children, child_tailq, tmp) {
1021 			if (!child_req_failed) {
1022 				ret = nvme_qpair_submit_request(qpair, child_req);
1023 				if (ret != 0)
1024 					child_req_failed = true;
1025 			} else {
1026 				/* free remaining child_reqs since
1027 				 * one child_req fails */
1028 				nvme_request_remove_child(req, child_req);
1029 				nvme_request_free(child_req);
1030 			}
1031 		}
1032 
1033 		return ret;
1034 	}
1035 
1036 	pthread_mutex_lock(&qpair->lock);
1037 
1038 	tr = LIST_FIRST(&qpair->free_tr);
1039 	if (tr == NULL || !qpair->enabled || !STAILQ_EMPTY(&qpair->queued_req)) {
1040 		/*
1041 		 * No tracker is available, the qpair is disabled due
1042 		 * to an in-progress controller-level reset, or
1043 		 * there are already queued requests.
1044 		 *
1045 		 * Put the request on the qpair's request queue to be
1046 		 * processed when a tracker frees up via a command
1047 		 * completion or when the controller reset is completed.
1048 		 */
1049 		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
1050 		pthread_mutex_unlock(&qpair->lock);
1051 
1052 		if (tr)
1053 			nvme_qpair_submit_queued_requests(qpair);
1054 		return 0;
1055 	}
1056 
1057 	/* remove tr from free_tr */
1058 	LIST_REMOVE(tr, list);
1059 	LIST_INSERT_HEAD(&qpair->outstanding_tr, tr, list);
1060 	tr->req = req;
1061 	req->cmd.cid = tr->cid;
1062 
1063 	if (req->payload_size == 0) {
1064 		/* Null payload - leave PRP fields zeroed */
1065 		ret = 0;
1066 	} else if (req->payload.type == NVME_PAYLOAD_TYPE_CONTIG) {
1067 		ret = _nvme_qpair_build_contig_request(qpair, req, tr);
1068 	} else if (req->payload.type == NVME_PAYLOAD_TYPE_SGL) {
1069 		if (ctrlr->flags & NVME_CTRLR_SGL_SUPPORTED)
1070 			ret = _nvme_qpair_build_hw_sgl_request(qpair, req, tr);
1071 		else
1072 			ret = _nvme_qpair_build_prps_sgl_request(qpair, req, tr);
1073 	} else {
1074 		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
1075 						   NVME_SC_INVALID_FIELD,
1076 						   1 /* do not retry */, true);
1077 		ret = -EINVAL;
1078 	}
1079 
1080 	if (ret == 0)
1081 		nvme_qpair_submit_tracker(qpair, tr);
1082 
1083 	pthread_mutex_unlock(&qpair->lock);
1084 
1085 	return ret;
1086 }
1087 
1088 /*
1089  * Poll for completion of NVMe commands submitted to the
1090  * specified I/O queue pair.
1091  */
1092 unsigned int nvme_qpair_poll(struct nvme_qpair *qpair,
1093 			     unsigned int max_completions)
1094 {
1095 	struct nvme_tracker *tr;
1096 	struct nvme_cpl	*cpl;
1097 	uint32_t num_completions = 0;
1098 
1099 	if (!nvme_qpair_enabled(qpair))
1100 		/*
1101 		 * The qpair is not enabled, likely because a controller
1102 		 * reset is in progress.  Ignore the interrupt - any I/O
1103 		 * associated with this interrupt will get retried when the
1104 		 * reset is complete.
1105 		 */
1106 		return 0;
1107 
1108 	if ((max_completions == 0) ||
1109 	    (max_completions > (qpair->entries - 1U)))
1110 		/*
1111 		 * max_completions == 0 means unlimited, but complete at most
1112 		 * one queue depth batch of I/O at a time so that the completion
1113 		 * queue doorbells don't wrap around.
1114 		 */
1115 		max_completions = qpair->entries - 1;
1116 
1117 	pthread_mutex_lock(&qpair->lock);
1118 
1119 	while (1) {
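	/*
	 * Consume completion entries in order. A slot holds a new completion
	 * when its phase tag matches the expected phase; the expected phase
	 * is toggled each time the completion queue head wraps around.
	 */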
1120 
1121 		cpl = &qpair->cpl[qpair->cq_head];
1122 		if (cpl->status.p != qpair->phase)
1123 			break;
1124 
1125 		tr = &qpair->tr[cpl->cid];
1126 		if (tr->active) {
1127 			nvme_qpair_complete_tracker(qpair, tr, cpl, true);
1128 		} else {
1129 			nvme_info("cpl does not map to outstanding cmd\n");
1130 			nvme_qpair_print_completion(qpair, cpl);
1131 			nvme_panic("received completion for unknown cmd\n");
1132 		}
1133 
1134 		if (++qpair->cq_head == qpair->entries) {
1135 			qpair->cq_head = 0;
1136 			qpair->phase = !qpair->phase;
1137 		}
1138 
1139 		if (++num_completions == max_completions)
1140 			break;
1141 	}
1142 
1143 	if (num_completions > 0)
1144 		nvme_mmio_write_4(qpair->cq_hdbl, qpair->cq_head);
1145 
1146 	pthread_mutex_unlock(&qpair->lock);
1147 
1148 	if (!STAILQ_EMPTY(&qpair->queued_req))
1149 		nvme_qpair_submit_queued_requests(qpair);
1150 
1151 	return num_completions;
1152 }
1153 
1154 void nvme_qpair_reset(struct nvme_qpair *qpair)
1155 {
1156 	pthread_mutex_lock(&qpair->lock);
1157 
1158 	qpair->sq_tail = qpair->cq_head = 0;
1159 
1160 	/*
1161 	 * On the first pass through the completion queue, the hardware
1162 	 * sets the phase bit of new completions to 1, so start by
1163 	 * expecting a phase of 1 here.  The expected phase is toggled
1164 	 * each time the completion queue wraps around.
1165 	 */
1166 	qpair->phase = 1;
1167 
1168 	memset(qpair->cmd, 0, qpair->entries * sizeof(struct nvme_cmd));
1169 	memset(qpair->cpl, 0, qpair->entries * sizeof(struct nvme_cpl));
1170 
1171 	pthread_mutex_unlock(&qpair->lock);
1172 }
1173 
1174 void nvme_qpair_enable(struct nvme_qpair *qpair)
1175 {
1176 	pthread_mutex_lock(&qpair->lock);
1177 
1178 	if (nvme_qpair_is_io_queue(qpair))
1179 		_nvme_qpair_io_qpair_enable(qpair);
1180 	else
1181 		_nvme_qpair_admin_qpair_enable(qpair);
1182 
1183 	pthread_mutex_unlock(&qpair->lock);
1184 }
1185 
1186 void nvme_qpair_disable(struct nvme_qpair *qpair)
1187 {
1188 	pthread_mutex_lock(&qpair->lock);
1189 
1190 	if (nvme_qpair_is_io_queue(qpair))
1191 		_nvme_qpair_io_qpair_disable(qpair);
1192 	else
1193 		_nvme_qpair_admin_qpair_disable(qpair);
1194 
1195 	pthread_mutex_unlock(&qpair->lock);
1196 }
1197 
1198 void nvme_qpair_fail(struct nvme_qpair *qpair)
1199 {
1200 	struct nvme_tracker *tr;
1201 	struct nvme_request *req;
1202 
1203 	pthread_mutex_lock(&qpair->lock);
1204 
1205 	while (!STAILQ_EMPTY(&qpair->queued_req)) {
1206 
1207 		nvme_notice("Failing queued I/O command\n");
1208 		req = STAILQ_FIRST(&qpair->queued_req);
1209 		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
1210 		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
1211 						   NVME_SC_ABORTED_BY_REQUEST,
1212 						   true);
1213 
1214 	}
1215 
1216 	/* Manually abort each outstanding I/O. */
1217 	while (!LIST_EMPTY(&qpair->outstanding_tr)) {
1218 
1219 		/*
1220 		 * Do not remove the tracker. The abort_tracker path
1221 		 * Do not remove the tracker from the outstanding list; the
1222 		 * manual completion path does that for us.
1223 		nvme_notice("Failing outstanding I/O command\n");
1224 		tr = LIST_FIRST(&qpair->outstanding_tr);
1225 		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
1226 						   NVME_SC_ABORTED_BY_REQUEST,
1227 						   1, true);
1228 
1229 	}
1230 
1231 	pthread_mutex_unlock(&qpair->lock);
1232 }
1233 
1234