xref: /haiku/src/add-ons/kernel/drivers/disk/nvme/libnvme/nvme_qpair.c (revision c237c4ce593ee823d9867fd997e51e4c447f5623)
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) Intel Corporation. All rights reserved.
5  *   Copyright (c) 2017, Western Digital Corporation or its affiliates.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "nvme_internal.h"
35 
36 struct nvme_qpair_string {
37 	uint16_t	value;
38 	const char 	*str;
39 };
40 
41 static const struct nvme_qpair_string admin_opcode[] = {
42 	{ NVME_OPC_DELETE_IO_SQ,	"DELETE IO SQ" },
43 	{ NVME_OPC_CREATE_IO_SQ,	"CREATE IO SQ" },
44 	{ NVME_OPC_GET_LOG_PAGE,	"GET LOG PAGE" },
45 	{ NVME_OPC_DELETE_IO_CQ,	"DELETE IO CQ" },
46 	{ NVME_OPC_CREATE_IO_CQ,	"CREATE IO CQ" },
47 	{ NVME_OPC_IDENTIFY, 		"IDENTIFY" },
48 	{ NVME_OPC_ABORT,		"ABORT" },
49 	{ NVME_OPC_SET_FEATURES,	"SET FEATURES" },
50 	{ NVME_OPC_GET_FEATURES,	"GET FEATURES" },
51 	{ NVME_OPC_ASYNC_EVENT_REQUEST, "ASYNC EVENT REQUEST" },
52 	{ NVME_OPC_NS_MANAGEMENT,	"NAMESPACE MANAGEMENT" },
53 	{ NVME_OPC_FIRMWARE_COMMIT,	"FIRMWARE COMMIT" },
54 	{ NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD, "FIRMWARE IMAGE DOWNLOAD" },
55 	{ NVME_OPC_NS_ATTACHMENT,	"NAMESPACE ATTACHMENT" },
56 	{ NVME_OPC_FORMAT_NVM,		"FORMAT NVM" },
57 	{ NVME_OPC_SECURITY_SEND,	"SECURITY SEND" },
58 	{ NVME_OPC_SECURITY_RECEIVE,	"SECURITY RECEIVE" },
59 	{ 0xFFFF,			"ADMIN COMMAND" }
60 };
61 
62 static const struct nvme_qpair_string io_opcode[] = {
63 	{ NVME_OPC_FLUSH,		"FLUSH" },
64 	{ NVME_OPC_WRITE,		"WRITE" },
65 	{ NVME_OPC_READ,		"READ" },
66 	{ NVME_OPC_WRITE_UNCORRECTABLE, "WRITE UNCORRECTABLE" },
67 	{ NVME_OPC_COMPARE,		"COMPARE" },
68 	{ NVME_OPC_WRITE_ZEROES,	"WRITE ZEROES" },
69 	{ NVME_OPC_DATASET_MANAGEMENT,	"DATASET MANAGEMENT" },
70 	{ NVME_OPC_RESERVATION_REGISTER, "RESERVATION REGISTER" },
71 	{ NVME_OPC_RESERVATION_REPORT,	"RESERVATION REPORT" },
72 	{ NVME_OPC_RESERVATION_ACQUIRE, "RESERVATION ACQUIRE" },
73 	{ NVME_OPC_RESERVATION_RELEASE, "RESERVATION RELEASE" },
74 	{ 0xFFFF,			"IO COMMAND" }
75 };
76 
77 static const struct nvme_qpair_string generic_status[] = {
78 	{ NVME_SC_SUCCESS,			"SUCCESS" },
79 	{ NVME_SC_INVALID_OPCODE,		"INVALID OPCODE" },
80 	{ NVME_SC_INVALID_FIELD,		"INVALID FIELD" },
81 	{ NVME_SC_COMMAND_ID_CONFLICT,		"COMMAND ID CONFLICT" },
82 	{ NVME_SC_DATA_TRANSFER_ERROR,		"DATA TRANSFER ERROR" },
83 	{ NVME_SC_ABORTED_POWER_LOSS,		"ABORTED - POWER LOSS" },
84 	{ NVME_SC_INTERNAL_DEVICE_ERROR,	"INTERNAL DEVICE ERROR" },
85 	{ NVME_SC_ABORTED_BY_REQUEST,		"ABORTED - BY REQUEST" },
86 	{ NVME_SC_ABORTED_SQ_DELETION,		"ABORTED - SQ DELETION" },
87 	{ NVME_SC_ABORTED_FAILED_FUSED,		"ABORTED - FAILED FUSED" },
88 	{ NVME_SC_ABORTED_MISSING_FUSED,	"ABORTED - MISSING FUSED" },
89 	{ NVME_SC_INVALID_NAMESPACE_OR_FORMAT,	"INVALID NAMESPACE OR FORMAT" },
90 	{ NVME_SC_COMMAND_SEQUENCE_ERROR,	"COMMAND SEQUENCE ERROR" },
91 	{ NVME_SC_INVALID_SGL_SEG_DESCRIPTOR,	"INVALID SGL SEGMENT DESCRIPTOR" },
92 	{ NVME_SC_INVALID_NUM_SGL_DESCIRPTORS,	"INVALID NUMBER OF SGL DESCRIPTORS" },
93 	{ NVME_SC_DATA_SGL_LENGTH_INVALID,	"DATA SGL LENGTH INVALID" },
94 	{ NVME_SC_METADATA_SGL_LENGTH_INVALID,	"METADATA SGL LENGTH INVALID" },
95 	{ NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID,	"SGL DESCRIPTOR TYPE INVALID" },
96 	{ NVME_SC_INVALID_CONTROLLER_MEM_BUF,	"INVALID CONTROLLER MEMORY BUFFER" },
97 	{ NVME_SC_INVALID_PRP_OFFSET,		"INVALID PRP OFFSET" },
98 	{ NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED,	"ATOMIC WRITE UNIT EXCEEDED" },
99 	{ NVME_SC_LBA_OUT_OF_RANGE,		"LBA OUT OF RANGE" },
100 	{ NVME_SC_CAPACITY_EXCEEDED,		"CAPACITY EXCEEDED" },
101 	{ NVME_SC_NAMESPACE_NOT_READY,		"NAMESPACE NOT READY" },
102 	{ NVME_SC_RESERVATION_CONFLICT,		"RESERVATION CONFLICT" },
103 	{ NVME_SC_FORMAT_IN_PROGRESS,		"FORMAT IN PROGRESS" },
104 	{ 0xFFFF,				"GENERIC" }
105 };
106 
107 static const struct nvme_qpair_string command_specific_status[] = {
108 	{ NVME_SC_COMPLETION_QUEUE_INVALID,	"INVALID COMPLETION QUEUE" },
109 	{ NVME_SC_INVALID_QUEUE_IDENTIFIER,	"INVALID QUEUE IDENTIFIER" },
110 	{ NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED,	"MAX QUEUE SIZE EXCEEDED" },
111 	{ NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED,	"ABORT CMD LIMIT EXCEEDED" },
112 	{ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED,"ASYNC LIMIT EXCEEDED" },
113 	{ NVME_SC_INVALID_FIRMWARE_SLOT,	"INVALID FIRMWARE SLOT" },
114 	{ NVME_SC_INVALID_FIRMWARE_IMAGE,	"INVALID FIRMWARE IMAGE" },
115 	{ NVME_SC_INVALID_INTERRUPT_VECTOR,	"INVALID INTERRUPT VECTOR" },
116 	{ NVME_SC_INVALID_LOG_PAGE,		"INVALID LOG PAGE" },
117 	{ NVME_SC_INVALID_FORMAT,		"INVALID FORMAT" },
118 	{ NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET,"FIRMWARE REQUIRES CONVENTIONAL RESET" },
119 	{ NVME_SC_INVALID_QUEUE_DELETION,	"INVALID QUEUE DELETION" },
120 	{ NVME_SC_FEATURE_ID_NOT_SAVEABLE,	"FEATURE ID NOT SAVEABLE" },
121 	{ NVME_SC_FEATURE_NOT_CHANGEABLE,	"FEATURE NOT CHANGEABLE" },
122 	{ NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC,"FEATURE NOT NAMESPACE SPECIFIC" },
123 	{ NVME_SC_FIRMWARE_REQ_NVM_RESET,	"FIRMWARE REQUIRES NVM RESET" },
124 	{ NVME_SC_FIRMWARE_REQ_RESET,		"FIRMWARE REQUIRES RESET" },
125 	{ NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION,"FIRMWARE REQUIRES MAX TIME VIOLATION" },
126 	{ NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED,"FIRMWARE ACTIVATION PROHIBITED" },
127 	{ NVME_SC_OVERLAPPING_RANGE,		"OVERLAPPING RANGE" },
128 	{ NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY,"NAMESPACE INSUFFICIENT CAPACITY" },
129 	{ NVME_SC_NAMESPACE_ID_UNAVAILABLE,	"NAMESPACE ID UNAVAILABLE" },
130 	{ NVME_SC_NAMESPACE_ALREADY_ATTACHED,	"NAMESPACE ALREADY ATTACHED" },
131 	{ NVME_SC_NAMESPACE_IS_PRIVATE,		"NAMESPACE IS PRIVATE" },
132 	{ NVME_SC_NAMESPACE_NOT_ATTACHED,	"NAMESPACE NOT ATTACHED" },
133 	{ NVME_SC_THINPROVISIONING_NOT_SUPPORTED,"THINPROVISIONING NOT SUPPORTED" },
134 	{ NVME_SC_CONTROLLER_LIST_INVALID,	"CONTROLLER LIST INVALID" },
135 	{ NVME_SC_CONFLICTING_ATTRIBUTES,	"CONFLICTING ATTRIBUTES" },
136 	{ NVME_SC_INVALID_PROTECTION_INFO,	"INVALID PROTECTION INFO" },
137 	{ NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE,	"WRITE TO RO PAGE" },
138 	{ 0xFFFF,				"COMMAND SPECIFIC" }
139 };
140 
141 static const struct nvme_qpair_string media_error_status[] = {
142 	{ NVME_SC_WRITE_FAULTS, 		"WRITE FAULTS" },
143 	{ NVME_SC_UNRECOVERED_READ_ERROR, 	"UNRECOVERED READ ERROR" },
144 	{ NVME_SC_GUARD_CHECK_ERROR, 		"GUARD CHECK ERROR" },
145 	{ NVME_SC_APPLICATION_TAG_CHECK_ERROR, 	"APPLICATION TAG CHECK ERROR" },
146 	{ NVME_SC_REFERENCE_TAG_CHECK_ERROR, 	"REFERENCE TAG CHECK ERROR" },
147 	{ NVME_SC_COMPARE_FAILURE, 		"COMPARE FAILURE" },
148 	{ NVME_SC_ACCESS_DENIED, 		"ACCESS DENIED" },
149 	{ NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK, "DEALLOCATED OR UNWRITTEN BLOCK" },
150 	{ 0xFFFF, 				"MEDIA ERROR" }
151 };
152 
153 static inline bool nvme_qpair_is_admin_queue(struct nvme_qpair *qpair)
154 {
155 	return qpair->id == 0;
156 }
157 
158 static inline bool nvme_qpair_is_io_queue(struct nvme_qpair *qpair)
159 {
160 	return qpair->id != 0;
161 }
162 
163 static const char *nvme_qpair_get_string(const struct nvme_qpair_string *strings,
164 					uint16_t value)
165 {
166 	const struct nvme_qpair_string *entry;
167 
168 	entry = strings;
169 
170 	while (entry->value != 0xFFFF) {
171 		if (entry->value == value)
172 			return entry->str;
173 		entry++;
174 	}
175 	return entry->str;
176 }
177 
178 static void nvme_qpair_admin_qpair_print_command(struct nvme_qpair *qpair,
179 						 struct nvme_cmd *cmd)
180 {
181 	nvme_info("%s (%02x) sqid:%d cid:%d nsid:%x cdw10:%08x cdw11:%08x\n",
182 		  nvme_qpair_get_string(admin_opcode, cmd->opc), cmd->opc,
183 		  qpair->id, cmd->cid,
184 		  cmd->nsid, cmd->cdw10, cmd->cdw11);
185 }
186 
187 static void nvme_qpair_io_qpair_print_command(struct nvme_qpair *qpair,
188 					      struct nvme_cmd *cmd)
189 {
190 	nvme_assert(qpair != NULL, "print_command: qpair == NULL\n");
191 	nvme_assert(cmd != NULL, "print_command: cmd == NULL\n");
192 
193 	switch ((int)cmd->opc) {
194 	case NVME_OPC_WRITE:
195 	case NVME_OPC_READ:
196 	case NVME_OPC_WRITE_UNCORRECTABLE:
197 	case NVME_OPC_COMPARE:
198 		nvme_info("%s sqid:%d cid:%d nsid:%d lba:%llu len:%d\n",
199 			  nvme_qpair_get_string(io_opcode, cmd->opc),
200 			  qpair->id, cmd->cid, cmd->nsid,
201 			  ((unsigned long long)cmd->cdw11 << 32) + cmd->cdw10,
202 			  (cmd->cdw12 & 0xFFFF) + 1);
203 		break;
204 	case NVME_OPC_FLUSH:
205 	case NVME_OPC_DATASET_MANAGEMENT:
206 		nvme_info("%s sqid:%d cid:%d nsid:%d\n",
207 			  nvme_qpair_get_string(io_opcode, cmd->opc),
208 			  qpair->id, cmd->cid, cmd->nsid);
209 		break;
210 	default:
211 		nvme_info("%s (%02x) sqid:%d cid:%d nsid:%d\n",
212 			  nvme_qpair_get_string(io_opcode, cmd->opc),
213 			  cmd->opc, qpair->id, cmd->cid, cmd->nsid);
214 		break;
215 	}
216 }
217 
218 static void nvme_qpair_print_command(struct nvme_qpair *qpair,
219 				     struct nvme_cmd *cmd)
220 {
221 	nvme_assert(qpair != NULL, "qpair cannot be NULL");
222 	nvme_assert(cmd != NULL, "cmd cannot be NULL");
223 
224 	if (nvme_qpair_is_admin_queue(qpair))
225 		return nvme_qpair_admin_qpair_print_command(qpair, cmd);
226 
227 	return nvme_qpair_io_qpair_print_command(qpair, cmd);
228 }
229 
230 static const char *get_status_string(uint16_t sct, uint16_t sc)
231 {
232 	const struct nvme_qpair_string *entry;
233 
234 	switch (sct) {
235 	case NVME_SCT_GENERIC:
236 		entry = generic_status;
237 		break;
238 	case NVME_SCT_COMMAND_SPECIFIC:
239 		entry = command_specific_status;
240 		break;
241 	case NVME_SCT_MEDIA_ERROR:
242 		entry = media_error_status;
243 		break;
244 	case NVME_SCT_VENDOR_SPECIFIC:
245 		return "VENDOR SPECIFIC";
246 	default:
247 		return "RESERVED";
248 	}
249 
250 	return nvme_qpair_get_string(entry, sc);
251 }
252 
253 static void nvme_qpair_print_completion(struct nvme_qpair *qpair,
254 					struct nvme_cpl *cpl)
255 {
256 	nvme_info("Cpl: %s (%02x/%02x) sqid:%d cid:%d "
257 		  "cdw0:%x sqhd:%04x p:%x m:%x dnr:%x\n",
258 		  get_status_string(cpl->status.sct, cpl->status.sc),
259 		  cpl->status.sct,
260 		  cpl->status.sc,
261 		  cpl->sqid,
262 		  cpl->cid,
263 		  cpl->cdw0,
264 		  cpl->sqhd,
265 		  cpl->status.p,
266 		  cpl->status.m,
267 		  cpl->status.dnr);
268 }
269 
270 static bool nvme_qpair_completion_retry(const struct nvme_cpl *cpl)
271 {
272 	/*
273 	 * TODO: spec is not clear how commands that are aborted due
274 	 *  to TLER will be marked.  So for now, it seems
275 	 *  NAMESPACE_NOT_READY is the only case where we should
276 	 *  look at the DNR bit.
277 	 */
278 	switch ((int)cpl->status.sct) {
279 	case NVME_SCT_GENERIC:
280 		switch ((int)cpl->status.sc) {
281 		case NVME_SC_NAMESPACE_NOT_READY:
282 		case NVME_SC_FORMAT_IN_PROGRESS:
283 			if (cpl->status.dnr)
284 				return false;
285 			return true;
286 		case NVME_SC_INVALID_OPCODE:
287 		case NVME_SC_INVALID_FIELD:
288 		case NVME_SC_COMMAND_ID_CONFLICT:
289 		case NVME_SC_DATA_TRANSFER_ERROR:
290 		case NVME_SC_ABORTED_POWER_LOSS:
291 		case NVME_SC_INTERNAL_DEVICE_ERROR:
292 		case NVME_SC_ABORTED_BY_REQUEST:
293 		case NVME_SC_ABORTED_SQ_DELETION:
294 		case NVME_SC_ABORTED_FAILED_FUSED:
295 		case NVME_SC_ABORTED_MISSING_FUSED:
296 		case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
297 		case NVME_SC_COMMAND_SEQUENCE_ERROR:
298 		case NVME_SC_LBA_OUT_OF_RANGE:
299 		case NVME_SC_CAPACITY_EXCEEDED:
300 		default:
301 			return false;
302 		}
303 	case NVME_SCT_COMMAND_SPECIFIC:
304 	case NVME_SCT_MEDIA_ERROR:
305 	case NVME_SCT_VENDOR_SPECIFIC:
306 	default:
307 		return false;
308 	}
309 }
310 
311 static void nvme_qpair_construct_tracker(struct nvme_tracker *tr,
312 					 uint16_t cid, uint64_t phys_addr)
313 {
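	/*
	 * The PRP/SGL list is embedded in the tracker itself, so its bus
	 * address is simply the tracker's physical address plus the offset
	 * of the embedded list.
	 */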
314 	tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
315 	tr->cid = cid;
316 	tr->active = false;
317 }
318 
319 static inline void nvme_qpair_copy_command(struct nvme_cmd *dst,
320 					   const struct nvme_cmd *src)
321 {
322 	/* dst and src are known to be non-overlapping and 64-byte aligned. */
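	/*
	 * A struct nvme_cmd is 64 bytes: two 32-byte AVX chunks, four
	 * 16-byte SSE2 chunks, or a plain structure assignment otherwise.
	 */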
323 #if defined(__AVX__)
324 	__m256i *d256 = (__m256i *)dst;
325 	const __m256i *s256 = (const __m256i *)src;
326 
327 	_mm256_store_si256(&d256[0], _mm256_load_si256(&s256[0]));
328 	_mm256_store_si256(&d256[1], _mm256_load_si256(&s256[1]));
329 #elif defined(__SSE2__)
330 	__m128i *d128 = (__m128i *)dst;
331 	const __m128i *s128 = (const __m128i *)src;
332 
333 	_mm_store_si128(&d128[0], _mm_load_si128(&s128[0]));
334 	_mm_store_si128(&d128[1], _mm_load_si128(&s128[1]));
335 	_mm_store_si128(&d128[2], _mm_load_si128(&s128[2]));
336 	_mm_store_si128(&d128[3], _mm_load_si128(&s128[3]));
337 #else
338 	*dst = *src;
339 #endif
340 }
341 
342 static void nvme_qpair_submit_tracker(struct nvme_qpair *qpair,
343 				      struct nvme_tracker *tr)
344 {
345 	struct nvme_request *req = tr->req;
346 
347 	/*
348 	 * Set the tracker active and copy its command
349 	 * to the submission queue.
350 	 */
351 	nvme_debug("qpair %d: Submit command, tail %d, cid %d / %d\n",
352 		   qpair->id,
353 		   (int)qpair->sq_tail,
354 		   (int)tr->cid,
355 		   (int)tr->req->cmd.cid);
356 
357 	qpair->tr[tr->cid].active = true;
358 	nvme_qpair_copy_command(&qpair->cmd[qpair->sq_tail], &req->cmd);
359 
360 	if (++qpair->sq_tail == qpair->entries)
361 		qpair->sq_tail = 0;
362 
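	/*
	 * Make sure the command is fully written to host memory before
	 * ringing the submission queue tail doorbell.
	 */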
363 	nvme_wmb();
364 	nvme_mmio_write_4(qpair->sq_tdbl, qpair->sq_tail);
365 }
366 
367 static void nvme_qpair_complete_tracker(struct nvme_qpair *qpair,
368 					struct nvme_tracker *tr,
369 					struct nvme_cpl *cpl,
370 					bool print_on_error)
371 {
372 	struct nvme_request *req = tr->req;
373 	bool retry, error;
374 
375 	if (!req) {
376 		nvme_crit("tracker has no request\n");
377 		qpair->tr[cpl->cid].active = false;
378 		goto done;
379 	}
380 
381 	error = nvme_cpl_is_error(cpl);
382 	retry = error && nvme_qpair_completion_retry(cpl) &&
383 		(req->retries < NVME_MAX_RETRY_COUNT);
384 	if (error && print_on_error) {
385 		nvme_qpair_print_command(qpair, &req->cmd);
386 		nvme_qpair_print_completion(qpair, cpl);
387 	}
388 
389 	qpair->tr[cpl->cid].active = false;
390 
391 	if (cpl->cid != req->cmd.cid)
392 		nvme_crit("cpl and command CID mismatch (%d / %d)\n",
393 			  (int)cpl->cid, (int)req->cmd.cid);
394 
395 	if (retry) {
396 		req->retries++;
397 		nvme_qpair_submit_tracker(qpair, tr);
398 		return;
399 	}
400 
401 	if (req->cb_fn)
402 		req->cb_fn(req->cb_arg, cpl);
403 
404 	nvme_request_free_locked(req);
405 
406 done:
407 	tr->req = NULL;
408 
409 	LIST_REMOVE(tr, list);
410 	LIST_INSERT_HEAD(&qpair->free_tr, tr, list);
411 }
412 
413 static void nvme_qpair_submit_queued_requests(struct nvme_qpair *qpair)
414 {
415 	STAILQ_HEAD(, nvme_request) req_queue;
416 	STAILQ_INIT(&req_queue);
417 
418 	pthread_mutex_lock(&qpair->lock);
419 
420 	STAILQ_CONCAT(&req_queue, &qpair->queued_req);
421 
422 	/*
423 	 * If the controller is in the middle of a reset, don't
424 	 * try to submit queued requests - let the reset logic
425 	 * handle that instead.
426 	 */
427 	while (!qpair->ctrlr->resetting && LIST_FIRST(&qpair->free_tr)
428 			&& !STAILQ_EMPTY(&req_queue)) {
429 		struct nvme_request *req = STAILQ_FIRST(&req_queue);
430 		STAILQ_REMOVE_HEAD(&req_queue, stailq);
431 
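		/*
		 * Drop the qpair lock across resubmission, since
		 * nvme_qpair_submit_request() acquires it itself.
		 */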
432 		pthread_mutex_unlock(&qpair->lock);
433 		nvme_qpair_submit_request(qpair, req);
434 		pthread_mutex_lock(&qpair->lock);
435 	}
436 
437 	STAILQ_CONCAT(&qpair->queued_req, &req_queue);
438 
439 	pthread_mutex_unlock(&qpair->lock);
440 }
441 
442 static void nvme_qpair_manual_complete_tracker(struct nvme_qpair *qpair,
443 					       struct nvme_tracker *tr,
444 					       uint32_t sct,
445 					       uint32_t sc,
446 					       uint32_t dnr,
447 					       bool print_on_error)
448 {
449 	struct nvme_cpl	cpl;
450 
451 	memset(&cpl, 0, sizeof(cpl));
452 	cpl.sqid = qpair->id;
453 	cpl.cid = tr->cid;
454 	cpl.status.sct = sct;
455 	cpl.status.sc = sc;
456 	cpl.status.dnr = dnr;
457 
458 	nvme_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
459 }
460 
461 static void nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
462 					       struct nvme_request *req,
463 					       uint32_t sct, uint32_t sc,
464 					       bool print_on_error)
465 {
466 	struct nvme_cpl	cpl;
467 	bool error;
468 
469 	memset(&cpl, 0, sizeof(cpl));
470 	cpl.sqid = qpair->id;
471 	cpl.status.sct = sct;
472 	cpl.status.sc = sc;
473 
474 	error = nvme_cpl_is_error(&cpl);
475 
476 	if (error && print_on_error) {
477 		nvme_qpair_print_command(qpair, &req->cmd);
478 		nvme_qpair_print_completion(qpair, &cpl);
479 	}
480 
481 	if (req->cb_fn)
482 		req->cb_fn(req->cb_arg, &cpl);
483 
484 	nvme_request_free_locked(req);
485 }
486 
487 static void nvme_qpair_abort_aers(struct nvme_qpair *qpair)
488 {
489 	struct nvme_tracker *tr;
490 
491 	tr = LIST_FIRST(&qpair->outstanding_tr);
492 	while (tr != NULL) {
493 		nvme_assert(tr->req != NULL,
494 			    "tr->req == NULL in abort_aers\n");
495 		if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) {
496 			nvme_qpair_manual_complete_tracker(qpair, tr,
497 					      NVME_SCT_GENERIC,
498 					      NVME_SC_ABORTED_SQ_DELETION,
499 					      0, false);
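			/*
			 * Completing the tracker unlinked it from the
			 * outstanding list, so restart the scan from the head.
			 */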
500 			tr = LIST_FIRST(&qpair->outstanding_tr);
501 			continue;
502 		}
503 		tr = LIST_NEXT(tr, list);
504 	}
505 }
506 
507 static inline void _nvme_qpair_admin_qpair_destroy(struct nvme_qpair *qpair)
508 {
509 	nvme_qpair_abort_aers(qpair);
510 }
511 
512 static inline void _nvme_qpair_req_bad_phys(struct nvme_qpair *qpair,
513 					    struct nvme_tracker *tr)
514 {
515 	/*
516 	 * Bad vtophys translation, so abort this request
517 	 * and return immediately, without retry.
518 	 */
519 	nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
520 					   NVME_SC_INVALID_FIELD,
521 					   1, true);
522 }
523 
524 /*
525  * Build PRP list describing physically contiguous payload buffer.
526  */
527 static int _nvme_qpair_build_contig_request(struct nvme_qpair *qpair,
528 					    struct nvme_request *req,
529 					    struct nvme_tracker *tr)
530 {
531 	uint64_t phys_addr;
532 	void *seg_addr;
533 	uint32_t nseg, cur_nseg, modulo, unaligned;
534 	void *md_payload;
535 	void *payload = req->payload.u.contig + req->payload_offset;
536 
537 	phys_addr = nvme_mem_vtophys(payload);
538 	if (phys_addr == NVME_VTOPHYS_ERROR) {
539 		_nvme_qpair_req_bad_phys(qpair, tr);
540 		return -1;
541 	}
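	/*
	 * Count the number of pages (PRP entries) spanned by the payload,
	 * accounting for a non page-aligned start and a partial last page.
	 */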
542 	nseg = req->payload_size >> PAGE_SHIFT;
543 	modulo = req->payload_size & (PAGE_SIZE - 1);
544 	unaligned = phys_addr & (PAGE_SIZE - 1);
545 	if (modulo || unaligned)
546 		nseg += 1 + ((modulo + unaligned - 1) >> PAGE_SHIFT);
547 
548 	if (req->payload.md) {
549 		md_payload = req->payload.md + req->md_offset;
550 		tr->req->cmd.mptr = nvme_mem_vtophys(md_payload);
551 		if (tr->req->cmd.mptr == NVME_VTOPHYS_ERROR) {
552 			_nvme_qpair_req_bad_phys(qpair, tr);
553 			return -1;
554 		}
555 	}
556 
557 	tr->req->cmd.psdt = NVME_PSDT_PRP;
558 	tr->req->cmd.dptr.prp.prp1 = phys_addr;
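	/*
	 * A single page needs only PRP1. With exactly two pages, PRP2 points
	 * directly at the second page. With more, PRP2 points at the PRP
	 * list embedded in the tracker.
	 */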
559 	if (nseg == 2) {
560 		seg_addr = payload + PAGE_SIZE - unaligned;
561 		tr->req->cmd.dptr.prp.prp2 = nvme_mem_vtophys(seg_addr);
562 	} else if (nseg > 2) {
563 		cur_nseg = 1;
564 		tr->req->cmd.dptr.prp.prp2 = (uint64_t)tr->prp_sgl_bus_addr;
565 		while (cur_nseg < nseg) {
566 			seg_addr = payload + cur_nseg * PAGE_SIZE - unaligned;
567 			phys_addr = nvme_mem_vtophys(seg_addr);
568 			if (phys_addr == NVME_VTOPHYS_ERROR) {
569 				_nvme_qpair_req_bad_phys(qpair, tr);
570 				return -1;
571 			}
572 			tr->u.prp[cur_nseg - 1] = phys_addr;
573 			cur_nseg++;
574 		}
575 	}
576 
577 	return 0;
578 }
579 
580 /*
581  * Build SGL list describing scattered payload buffer.
582  */
583 static int _nvme_qpair_build_hw_sgl_request(struct nvme_qpair *qpair,
584 					    struct nvme_request *req,
585 					    struct nvme_tracker *tr)
586 {
587 	struct nvme_sgl_descriptor *sgl;
588 	uint64_t phys_addr;
589 	uint32_t remaining_transfer_len, length, nseg = 0;
590 	int ret;
591 
592 	/*
593 	 * Build scattered payloads.
594 	 */
595 	nvme_assert(req->payload_size != 0,
596 		    "cannot build SGL for zero-length transfer\n");
597 	nvme_assert(req->payload.type == NVME_PAYLOAD_TYPE_SGL,
598 		    "sgl payload type required\n");
599 	nvme_assert(req->payload.u.sgl.reset_sgl_fn != NULL,
600 		    "sgl reset callback required\n");
601 	nvme_assert(req->payload.u.sgl.next_sge_fn != NULL,
602 		    "sgl callback required\n");
603 	req->payload.u.sgl.reset_sgl_fn(req->payload.u.sgl.cb_arg,
604 					req->payload_offset);
605 
606 	sgl = tr->u.sgl;
607 	req->cmd.psdt = NVME_PSDT_SGL_MPTR_SGL;
608 	req->cmd.dptr.sgl1.unkeyed.subtype = 0;
609 
610 	remaining_transfer_len = req->payload_size;
611 
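	/*
	 * Walk the caller-provided SGEs, clamping each to the remaining
	 * transfer length, and emit one unkeyed data block descriptor per
	 * element into the tracker's SGL.
	 */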
612 	while (remaining_transfer_len > 0) {
613 
614 		if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
615 			_nvme_qpair_req_bad_phys(qpair, tr);
616 			return -1;
617 		}
618 
619 		ret = req->payload.u.sgl.next_sge_fn(req->payload.u.sgl.cb_arg,
620 						     &phys_addr, &length);
621 		if (ret != 0) {
622 			_nvme_qpair_req_bad_phys(qpair, tr);
623 			return ret;
624 		}
625 
626 		length = nvme_min(remaining_transfer_len, length);
627 		remaining_transfer_len -= length;
628 
629 		sgl->unkeyed.type = NVME_SGL_TYPE_DATA_BLOCK;
630 		sgl->unkeyed.length = length;
631 		sgl->address = phys_addr;
632 		sgl->unkeyed.subtype = 0;
633 
634 		sgl++;
635 		nseg++;
636 
637 	}
638 
639 	if (nseg == 1) {
640 		/*
641 		 * The whole transfer can be described by a single Scatter
642 		 * Gather List descriptor. Use the special case described
643 		 * by the spec where SGL1's type is Data Block.
644 		 * This means the SGL in the tracker is not used at all,
645 		 * so copy the first (and only) SGL element into SGL1.
646 		 */
647 		req->cmd.dptr.sgl1.unkeyed.type = NVME_SGL_TYPE_DATA_BLOCK;
648 		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
649 		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
650 	} else {
651 		/* For now, only a single SGL segment per command is supported. */
652 		req->cmd.dptr.sgl1.unkeyed.type = NVME_SGL_TYPE_LAST_SEGMENT;
653 		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
654 		req->cmd.dptr.sgl1.unkeyed.length =
655 			nseg * sizeof(struct nvme_sgl_descriptor);
656 	}
657 
658 	return 0;
659 }
660 
661 /*
662  * Build Physical Region Page list describing scattered payload buffer.
663  */
664 static int _nvme_qpair_build_prps_sgl_request(struct nvme_qpair *qpair,
665 					      struct nvme_request *req,
666 					      struct nvme_tracker *tr)
667 {
668 	uint64_t phys_addr, prp2 = 0;
669 	uint32_t data_transferred, remaining_transfer_len, length;
670 	uint32_t nseg, cur_nseg, total_nseg = 0, last_nseg = 0;
671 	uint32_t modulo, unaligned, sge_count = 0;
672 	int ret;
673 
674 	/*
675 	 * Build scattered payloads.
676 	 */
677 	nvme_assert(req->payload.type == NVME_PAYLOAD_TYPE_SGL,
678 		    "sgl payload type required\n");
679 	nvme_assert(req->payload.u.sgl.reset_sgl_fn != NULL,
680 		    "sgl reset callback required\n");
681 	req->payload.u.sgl.reset_sgl_fn(req->payload.u.sgl.cb_arg,
682 					req->payload_offset);
683 
684 	remaining_transfer_len = req->payload_size;
685 
686 	while (remaining_transfer_len > 0) {
687 
688 		nvme_assert(req->payload.u.sgl.next_sge_fn != NULL,
689 			    "sgl callback required\n");
690 
691 		ret = req->payload.u.sgl.next_sge_fn(req->payload.u.sgl.cb_arg,
692 						    &phys_addr, &length);
693 		if (ret != 0) {
694 			_nvme_qpair_req_bad_phys(qpair, tr);
695 			return -1;
696 		}
697 
698 		nvme_assert((phys_addr & 0x3) == 0, "address must be dword aligned\n");
699 		nvme_assert((length >= remaining_transfer_len) || ((phys_addr + length) % PAGE_SIZE) == 0,
700 			"All SGEs except last must end on a page boundary\n");
701 		nvme_assert((sge_count == 0) || (phys_addr % PAGE_SIZE) == 0,
702 			"All SGEs except first must start on a page boundary\n");
703 
704 		data_transferred = nvme_min(remaining_transfer_len, length);
705 
706 		nseg = data_transferred >> PAGE_SHIFT;
707 		modulo = data_transferred & (PAGE_SIZE - 1);
708 		unaligned = phys_addr & (PAGE_SIZE - 1);
709 		if (modulo || unaligned)
710 			nseg += 1 + ((modulo + unaligned - 1) >> PAGE_SHIFT);
711 
712 		if (total_nseg == 0) {
713 			req->cmd.psdt = NVME_PSDT_PRP;
714 			req->cmd.dptr.prp.prp1 = phys_addr;
715 		}
716 
717 		total_nseg += nseg;
718 		sge_count++;
719 		remaining_transfer_len -= data_transferred;
720 
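		/*
		 * While only two pages are needed, PRP2 points directly at
		 * the second page. As soon as a third page is needed, PRP2
		 * is redirected to the PRP list and any PRP2 value saved
		 * earlier becomes the first list entry.
		 */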
721 		if (total_nseg == 2) {
722 			if (sge_count == 1)
723 				tr->req->cmd.dptr.prp.prp2 = phys_addr +
724 					PAGE_SIZE - unaligned;
725 			else if (sge_count == 2)
726 				tr->req->cmd.dptr.prp.prp2 = phys_addr;
727 			/* save prp2 value */
728 			prp2 = tr->req->cmd.dptr.prp.prp2;
729 		} else if (total_nseg > 2) {
730 			if (sge_count == 1)
731 				cur_nseg = 1;
732 			else
733 				cur_nseg = 0;
734 
735 			tr->req->cmd.dptr.prp.prp2 =
736 				(uint64_t)tr->prp_sgl_bus_addr;
737 
738 			while (cur_nseg < nseg) {
739 				if (prp2) {
740 					tr->u.prp[0] = prp2;
741 					tr->u.prp[last_nseg + 1] = phys_addr +
742 						cur_nseg * PAGE_SIZE - unaligned;
743 				} else {
744 					tr->u.prp[last_nseg] = phys_addr +
745 						cur_nseg * PAGE_SIZE - unaligned;
746 				}
747 				last_nseg++;
748 				cur_nseg++;
749 			}
750 		}
751 	}
752 
753 	return 0;
754 }
755 
756 static void _nvme_qpair_admin_qpair_enable(struct nvme_qpair *qpair)
757 {
758 	struct nvme_tracker *tr, *tr_temp;
759 
760 	/*
761 	 * Manually abort each outstanding admin command. Do not retry
762 	 * admin commands found here: they are left over from a controller
763 	 * reset, and it is likely that the context in which they were
764 	 * issued no longer applies.
765 	 */
766 	LIST_FOREACH_SAFE(tr, &qpair->outstanding_tr, list, tr_temp) {
767 		nvme_info("Aborting outstanding admin command\n");
768 		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
769 						   NVME_SC_ABORTED_BY_REQUEST,
770 						   1 /* do not retry */, true);
771 	}
772 
773 	qpair->enabled = true;
774 }
775 
776 static void _nvme_qpair_io_qpair_enable(struct nvme_qpair *qpair)
777 {
778 	struct nvme_tracker *tr, *temp;
779 	struct nvme_request *req;
780 
781 	qpair->enabled = true;
782 
783 	qpair->ctrlr->enabled_io_qpairs++;
784 
785 	/* Manually abort each queued I/O. */
786 	while (!STAILQ_EMPTY(&qpair->queued_req)) {
787 		req = STAILQ_FIRST(&qpair->queued_req);
788 		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
789 		nvme_info("Aborting queued I/O command\n");
790 		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
791 						   NVME_SC_ABORTED_BY_REQUEST,
792 						   true);
793 	}
794 
795 	/* Manually abort each outstanding I/O. */
796 	LIST_FOREACH_SAFE(tr, &qpair->outstanding_tr, list, temp) {
797 		nvme_info("Aborting outstanding I/O command\n");
798 		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
799 						   NVME_SC_ABORTED_BY_REQUEST,
800 						   0, true);
801 	}
802 }
803 
804 static inline void _nvme_qpair_admin_qpair_disable(struct nvme_qpair *qpair)
805 {
806 	qpair->enabled = false;
807 	nvme_qpair_abort_aers(qpair);
808 }
809 
810 static inline void _nvme_qpair_io_qpair_disable(struct nvme_qpair *qpair)
811 {
812 	qpair->enabled = false;
813 
814 	qpair->ctrlr->enabled_io_qpairs--;
815 }
816 
817 /*
818  * Reserve room for the submission queue
819  * in the controller memory buffer
820  */
821 static int nvme_ctrlr_reserve_sq_in_cmb(struct nvme_ctrlr *ctrlr,
822 					uint16_t entries,
823 					uint64_t aligned, uint64_t *offset)
824 {
825 	uint64_t round_offset;
826 	const uint64_t length = entries * sizeof(struct nvme_cmd);
827 
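	/*
	 * Round the current CMB offset up to the requested alignment
	 * (assumed to be a power of two).
	 */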
828 	round_offset = ctrlr->cmb_current_offset;
829 	round_offset = (round_offset + (aligned - 1)) & ~(aligned - 1);
830 
831 	if (round_offset + length > ctrlr->cmb_size)
832 		return -1;
833 
834 	*offset = round_offset;
835 	ctrlr->cmb_current_offset = round_offset + length;
836 
837 	return 0;
838 }
839 
840 /*
841  * Initialize a queue pair on the host side.
842  */
843 int nvme_qpair_construct(struct nvme_ctrlr *ctrlr, struct nvme_qpair *qpair,
844 			 enum nvme_qprio qprio,
845 			 uint16_t entries, uint16_t trackers)
846 {
847 	volatile uint32_t *doorbell_base;
848 	struct nvme_tracker *tr;
849 	uint64_t offset;
850 	unsigned long phys_addr = 0;
851 	uint16_t i;
852 	int ret;
853 
854 	nvme_assert(entries != 0, "Invalid number of entries\n");
855 	nvme_assert(trackers != 0, "Invalid trackers\n");
856 
857 	pthread_mutex_init(&qpair->lock, NULL);
858 
859 	qpair->entries = entries;
860 	qpair->trackers = trackers;
861 	qpair->qprio = qprio;
862 	qpair->sq_in_cmb = false;
863 	qpair->ctrlr = ctrlr;
864 
865 	if (ctrlr->opts.use_cmb_sqs) {
866 		/*
867 		 * Reserve room for the submission queue in ctrlr
868 		 * memory buffer.
869 		 */
870 		ret = nvme_ctrlr_reserve_sq_in_cmb(ctrlr, entries,
871 						   PAGE_SIZE,
872 						   &offset);
873 		if (ret == 0) {
874 
875 			qpair->cmd = ctrlr->cmb_bar_virt_addr + offset;
876 			qpair->cmd_bus_addr = ctrlr->cmb_bar_phys_addr + offset;
877 			qpair->sq_in_cmb = true;
878 
879 			nvme_debug("Allocated qpair %d cmd in cmb at %p / 0x%llx\n",
880 				   qpair->id,
881 				   qpair->cmd, qpair->cmd_bus_addr);
882 
883 		}
884 	}
885 
886 	if (qpair->sq_in_cmb == false) {
887 
888 		qpair->cmd =
889 			nvme_mem_alloc_node(sizeof(struct nvme_cmd) * entries,
890 				    PAGE_SIZE, NVME_NODE_ID_ANY,
891 				    (unsigned long *) &qpair->cmd_bus_addr);
892 		if (!qpair->cmd) {
893 			nvme_err("Allocate qpair commands failed\n");
894 			goto fail;
895 		}
896 		memset(qpair->cmd, 0, sizeof(struct nvme_cmd) * entries);
897 
898 		nvme_debug("Allocated qpair %d cmd %p / 0x%llx\n",
899 			   qpair->id,
900 			   qpair->cmd, qpair->cmd_bus_addr);
901 	}
902 
903 	qpair->cpl = nvme_mem_alloc_node(sizeof(struct nvme_cpl) * entries,
904 				 PAGE_SIZE, NVME_NODE_ID_ANY,
905 				 (unsigned long *) &qpair->cpl_bus_addr);
906 	if (!qpair->cpl) {
907 		nvme_err("Allocate qpair completions failed\n");
908 		goto fail;
909 	}
910 	memset(qpair->cpl, 0, sizeof(struct nvme_cpl) * entries);
911 
912 	nvme_debug("Allocated qpair %d cpl at %p / 0x%llx\n",
913 		   qpair->id,
914 		   qpair->cpl,
915 		   qpair->cpl_bus_addr);
916 
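	/*
	 * Each queue pair owns a submission queue tail doorbell and a
	 * completion queue head doorbell, located back to back at the
	 * controller's doorbell stride from the doorbell base.
	 */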
917 	doorbell_base = &ctrlr->regs->doorbell[0].sq_tdbl;
918 	qpair->sq_tdbl = doorbell_base +
919 		(2 * qpair->id + 0) * ctrlr->doorbell_stride_u32;
920 	qpair->cq_hdbl = doorbell_base +
921 		(2 * qpair->id + 1) * ctrlr->doorbell_stride_u32;
922 
923 	LIST_INIT(&qpair->free_tr);
924 	LIST_INIT(&qpair->outstanding_tr);
925 	STAILQ_INIT(&qpair->free_req);
926 	STAILQ_INIT(&qpair->queued_req);
927 
928 	/* Request pool */
929 	if (nvme_request_pool_construct(qpair)) {
930 		nvme_err("Create request pool failed\n");
931 		goto fail;
932 	}
933 
934 	/*
935 	 * Reserve space for all of the trackers in a single allocation.
936 	 * struct nvme_tracker must be padded so that its size is a power
937 	 * of 2. This ensures that the PRP list embedded in the nvme_tracker
938 	 * object does not span a 4KB boundary, while still allowing the
939 	 * trackers in tr[] to be accessed via normal array indexing.
940 	 */
941 	qpair->tr = nvme_mem_alloc_node(sizeof(struct nvme_tracker) * trackers,
942 					sizeof(struct nvme_tracker),
943 					NVME_NODE_ID_ANY, &phys_addr);
944 	if (!qpair->tr) {
945 		nvme_err("Allocate request trackers failed\n");
946 		goto fail;
947 	}
948 	memset(qpair->tr, 0, sizeof(struct nvme_tracker) * trackers);
949 
950 	nvme_debug("Allocated qpair %d trackers at %p / 0x%lx\n",
951 		   qpair->id, qpair->tr, phys_addr);
952 
953 	for (i = 0; i < trackers; i++) {
954 		tr = &qpair->tr[i];
955 		nvme_qpair_construct_tracker(tr, i, phys_addr);
956 		LIST_INSERT_HEAD(&qpair->free_tr, tr, list);
957 		phys_addr += sizeof(struct nvme_tracker);
958 	}
959 
960 	nvme_qpair_reset(qpair);
961 
962 	return 0;
963 
964 fail:
965 	nvme_qpair_destroy(qpair);
966 
967 	return -1;
968 }
969 
970 void nvme_qpair_destroy(struct nvme_qpair *qpair)
971 {
972 	if (!qpair->ctrlr)
973 		return; // Not initialized.
974 
975 	if (nvme_qpair_is_admin_queue(qpair))
976 		_nvme_qpair_admin_qpair_destroy(qpair);
977 
978 	if (qpair->cmd && !qpair->sq_in_cmb) {
979 		nvme_free(qpair->cmd);
980 		qpair->cmd = NULL;
981 	}
982 	if (qpair->cpl) {
983 		nvme_free(qpair->cpl);
984 		qpair->cpl = NULL;
985 	}
986 	if (qpair->tr) {
987 		nvme_free(qpair->tr);
988 		qpair->tr = NULL;
989 	}
990 	nvme_request_pool_destroy(qpair);
991 
992 	qpair->ctrlr = NULL;
993 
994 	pthread_mutex_destroy(&qpair->lock);
995 }
996 
997 static bool nvme_qpair_enabled(struct nvme_qpair *qpair)
998 {
999 	if (!qpair->enabled && !qpair->ctrlr->resetting)
1000 		nvme_qpair_enable(qpair);
1001 
1002 	return qpair->enabled;
1003 }
1004 
1005 int nvme_qpair_submit_request(struct nvme_qpair *qpair,
1006 			      struct nvme_request *req)
1007 {
1008 	struct nvme_tracker *tr;
1009 	struct nvme_request *child_req, *tmp;
1010 	struct nvme_ctrlr *ctrlr = qpair->ctrlr;
1011 	bool child_req_failed = false;
1012 	int ret = 0;
1013 
1014 	if (ctrlr->failed) {
1015 		nvme_request_free(req);
1016 		return ENXIO;
1017 	}
1018 
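	/* Called for its side effect: re-enable the qpair if a reset has completed. */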
1019 	nvme_qpair_enabled(qpair);
1020 
1021 	if (req->child_reqs) {
1022 
1023 		/*
1024 		 * This is a split (parent) request. Submit all of the
1025 		 * children but not the parent request itself, since the
1026 		 * parent is the original unsplit request.
1027 		 */
1028 		TAILQ_FOREACH_SAFE(child_req, &req->children, child_tailq, tmp) {
1029 			if (!child_req_failed) {
1030 				ret = nvme_qpair_submit_request(qpair, child_req);
1031 				if (ret != 0)
1032 					child_req_failed = true;
1033 			} else {
1034 				/* free remaining child_reqs since
1035 				 * one child_req fails */
1036 				nvme_request_remove_child(req, child_req);
1037 				nvme_request_free(child_req);
1038 			}
1039 		}
1040 
1041 		return ret;
1042 	}
1043 
1044 	pthread_mutex_lock(&qpair->lock);
1045 
1046 	tr = LIST_FIRST(&qpair->free_tr);
1047 	if (tr == NULL || !qpair->enabled || !STAILQ_EMPTY(&qpair->queued_req)) {
1048 		/*
1049 		 * No tracker is available, the qpair is disabled due
1050 		 * to an in-progress controller-level reset, or
1051 		 * there are already queued requests.
1052 		 *
1053 		 * Put the request on the qpair's request queue to be
1054 		 * processed when a tracker frees up via a command
1055 		 * completion or when the controller reset is completed.
1056 		 */
1057 		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
1058 		pthread_mutex_unlock(&qpair->lock);
1059 
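		/*
		 * If a tracker is actually free, the request was queued only
		 * because the qpair is disabled or older requests are already
		 * queued; try to drain the queue right away.
		 */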
1060 		if (tr)
1061 			nvme_qpair_submit_queued_requests(qpair);
1062 		return 0;
1063 	}
1064 
1065 	/* remove tr from free_tr */
1066 	LIST_REMOVE(tr, list);
1067 	LIST_INSERT_HEAD(&qpair->outstanding_tr, tr, list);
1068 	tr->req = req;
1069 	req->cmd.cid = tr->cid;
1070 
1071 	if (req->payload_size == 0) {
1072 		/* Null payload - leave PRP fields zeroed */
1073 		ret = 0;
1074 	} else if (req->payload.type == NVME_PAYLOAD_TYPE_CONTIG) {
1075 		ret = _nvme_qpair_build_contig_request(qpair, req, tr);
1076 	} else if (req->payload.type == NVME_PAYLOAD_TYPE_SGL) {
1077 		if (ctrlr->flags & NVME_CTRLR_SGL_SUPPORTED)
1078 			ret = _nvme_qpair_build_hw_sgl_request(qpair, req, tr);
1079 		else
1080 			ret = _nvme_qpair_build_prps_sgl_request(qpair, req, tr);
1081 	} else {
1082 		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
1083 						   NVME_SC_INVALID_FIELD,
1084 						   1 /* do not retry */, true);
1085 		ret = -EINVAL;
1086 	}
1087 
1088 	if (ret == 0)
1089 		nvme_qpair_submit_tracker(qpair, tr);
1090 
1091 	pthread_mutex_unlock(&qpair->lock);
1092 
1093 	return ret;
1094 }
1095 
1096 /*
1097  * Poll for completion of NVMe commands submitted to the
1098  * specified I/O queue pair.
1099  */
1100 unsigned int nvme_qpair_poll(struct nvme_qpair *qpair,
1101 			     unsigned int max_completions)
1102 {
1103 	struct nvme_tracker *tr;
1104 	struct nvme_cpl	*cpl;
1105 	uint32_t num_completions = 0;
1106 
1107 	if (!nvme_qpair_enabled(qpair))
1108 		/*
1109 		 * The qpair is not enabled, likely because a controller
1110 		 * reset is in progress. Ignore the interrupt - any I/O that
1111 		 * was associated with this interrupt will get retried when
1112 		 * the reset is complete.
1113 		 */
1114 		return 0;
1115 
1116 	if ((max_completions == 0) ||
1117 	    (max_completions > (qpair->entries - 1U)))
1118 		/*
1119 		 * max_completions == 0 means unlimited, but complete at most
1120 		 * one queue depth's worth of I/O at a time so that the
1121 		 * completion queue head does not wrap around more than once.
1122 		 */
1123 		max_completions = qpair->entries - 1;
1124 
1125 	pthread_mutex_lock(&qpair->lock);
1126 
1127 	while (1) {
1128 
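		/*
		 * The entry at cq_head is a new completion only if its phase
		 * bit matches the phase we currently expect.
		 */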
1129 		cpl = &qpair->cpl[qpair->cq_head];
1130 		if (cpl->status.p != qpair->phase)
1131 			break;
1132 
1133 		tr = &qpair->tr[cpl->cid];
1134 		if (tr->active) {
1135 			nvme_qpair_complete_tracker(qpair, tr, cpl, true);
1136 		} else {
1137 			nvme_info("cpl does not map to outstanding cmd\n");
1138 			nvme_qpair_print_completion(qpair, cpl);
1139 			nvme_panic("received completion for unknown cmd\n");
1140 		}
1141 
1142 		if (++qpair->cq_head == qpair->entries) {
1143 			qpair->cq_head = 0;
1144 			qpair->phase = !qpair->phase;
1145 		}
1146 
1147 		if (++num_completions == max_completions)
1148 			break;
1149 	}
1150 
1151 	if (num_completions > 0)
1152 		nvme_mmio_write_4(qpair->cq_hdbl, qpair->cq_head);
1153 
1154 	pthread_mutex_unlock(&qpair->lock);
1155 
1156 	if (!STAILQ_EMPTY(&qpair->queued_req))
1157 		nvme_qpair_submit_queued_requests(qpair);
1158 
1159 	return num_completions;
1160 }
1161 
1162 void nvme_qpair_reset(struct nvme_qpair *qpair)
1163 {
1164 	pthread_mutex_lock(&qpair->lock);
1165 
1166 	qpair->sq_tail = qpair->cq_head = 0;
1167 
1168 	/*
1169 	 * On the first pass through the completion queue, the hardware
1170 	 * sets the phase bit of new completions to 1, so start out looking
1171 	 * for entries with the phase bit set to 1. The expected phase is
1172 	 * toggled each time the completion queue wraps around.
1173 	 */
1174 	qpair->phase = 1;
1175 
1176 	memset(qpair->cmd, 0, qpair->entries * sizeof(struct nvme_cmd));
1177 	memset(qpair->cpl, 0, qpair->entries * sizeof(struct nvme_cpl));
1178 
1179 	pthread_mutex_unlock(&qpair->lock);
1180 }
1181 
1182 void nvme_qpair_enable(struct nvme_qpair *qpair)
1183 {
1184 	pthread_mutex_lock(&qpair->lock);
1185 
1186 	if (nvme_qpair_is_io_queue(qpair))
1187 		_nvme_qpair_io_qpair_enable(qpair);
1188 	else
1189 		_nvme_qpair_admin_qpair_enable(qpair);
1190 
1191 	pthread_mutex_unlock(&qpair->lock);
1192 }
1193 
1194 void nvme_qpair_disable(struct nvme_qpair *qpair)
1195 {
1196 	pthread_mutex_lock(&qpair->lock);
1197 
1198 	if (nvme_qpair_is_io_queue(qpair))
1199 		_nvme_qpair_io_qpair_disable(qpair);
1200 	else
1201 		_nvme_qpair_admin_qpair_disable(qpair);
1202 
1203 	pthread_mutex_unlock(&qpair->lock);
1204 }
1205 
1206 void nvme_qpair_fail(struct nvme_qpair *qpair)
1207 {
1208 	struct nvme_tracker *tr;
1209 	struct nvme_request *req;
1210 
1211 	pthread_mutex_lock(&qpair->lock);
1212 
1213 	while (!STAILQ_EMPTY(&qpair->queued_req)) {
1214 
1215 		nvme_notice("Failing queued I/O command\n");
1216 		req = STAILQ_FIRST(&qpair->queued_req);
1217 		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
1218 		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
1219 						   NVME_SC_ABORTED_BY_REQUEST,
1220 						   true);
1221 
1222 	}
1223 
1224 	/* Manually abort each outstanding I/O. */
1225 	while (!LIST_EMPTY(&qpair->outstanding_tr)) {
1226 
1227 		/*
1228 		 * Do not remove the tracker from the list here; completing
1229 		 * the tracker below removes it for us.
1230 		 */
1231 		nvme_notice("Failing outstanding I/O command\n");
1232 		tr = LIST_FIRST(&qpair->outstanding_tr);
1233 		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
1234 						   NVME_SC_ABORTED_BY_REQUEST,
1235 						   1, true);
1236 
1237 	}
1238 
1239 	pthread_mutex_unlock(&qpair->lock);
1240 }
1241 
1242