xref: /haiku/src/add-ons/kernel/drivers/disk/nvme/libnvme/nvme_qpair.c (revision 5b189b0e1e2f51f367bfcb126b2f00a3702f352d)
/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2017, Western Digital Corporation or its affiliates.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "nvme_internal.h"

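/*
 * Lookup tables translating NVMe opcodes and status codes into
 * human-readable strings for log messages. Each table is terminated by a
 * 0xFFFF sentinel entry whose string serves as the fallback name.
 */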
struct nvme_qpair_string {
	uint16_t	value;
	const char	*str;
};

static const struct nvme_qpair_string admin_opcode[] = {
	{ NVME_OPC_DELETE_IO_SQ,	"DELETE IO SQ" },
	{ NVME_OPC_CREATE_IO_SQ,	"CREATE IO SQ" },
	{ NVME_OPC_GET_LOG_PAGE,	"GET LOG PAGE" },
	{ NVME_OPC_DELETE_IO_CQ,	"DELETE IO CQ" },
	{ NVME_OPC_CREATE_IO_CQ,	"CREATE IO CQ" },
	{ NVME_OPC_IDENTIFY,		"IDENTIFY" },
	{ NVME_OPC_ABORT,		"ABORT" },
	{ NVME_OPC_SET_FEATURES,	"SET FEATURES" },
	{ NVME_OPC_GET_FEATURES,	"GET FEATURES" },
	{ NVME_OPC_ASYNC_EVENT_REQUEST, "ASYNC EVENT REQUEST" },
	{ NVME_OPC_NS_MANAGEMENT,	"NAMESPACE MANAGEMENT" },
	{ NVME_OPC_FIRMWARE_COMMIT,	"FIRMWARE COMMIT" },
	{ NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD, "FIRMWARE IMAGE DOWNLOAD" },
	{ NVME_OPC_NS_ATTACHMENT,	"NAMESPACE ATTACHMENT" },
	{ NVME_OPC_FORMAT_NVM,		"FORMAT NVM" },
	{ NVME_OPC_SECURITY_SEND,	"SECURITY SEND" },
	{ NVME_OPC_SECURITY_RECEIVE,	"SECURITY RECEIVE" },
	{ 0xFFFF,			"ADMIN COMMAND" }
};

static const struct nvme_qpair_string io_opcode[] = {
	{ NVME_OPC_FLUSH,		"FLUSH" },
	{ NVME_OPC_WRITE,		"WRITE" },
	{ NVME_OPC_READ,		"READ" },
	{ NVME_OPC_WRITE_UNCORRECTABLE, "WRITE UNCORRECTABLE" },
	{ NVME_OPC_COMPARE,		"COMPARE" },
	{ NVME_OPC_WRITE_ZEROES,	"WRITE ZEROES" },
	{ NVME_OPC_DATASET_MANAGEMENT,	"DATASET MANAGEMENT" },
	{ NVME_OPC_RESERVATION_REGISTER, "RESERVATION REGISTER" },
	{ NVME_OPC_RESERVATION_REPORT,	"RESERVATION REPORT" },
	{ NVME_OPC_RESERVATION_ACQUIRE, "RESERVATION ACQUIRE" },
	{ NVME_OPC_RESERVATION_RELEASE, "RESERVATION RELEASE" },
	{ 0xFFFF,			"IO COMMAND" }
};

static const struct nvme_qpair_string generic_status[] = {
	{ NVME_SC_SUCCESS,			"SUCCESS" },
	{ NVME_SC_INVALID_OPCODE,		"INVALID OPCODE" },
	{ NVME_SC_INVALID_FIELD,		"INVALID FIELD" },
	{ NVME_SC_COMMAND_ID_CONFLICT,		"COMMAND ID CONFLICT" },
	{ NVME_SC_DATA_TRANSFER_ERROR,		"DATA TRANSFER ERROR" },
	{ NVME_SC_ABORTED_POWER_LOSS,		"ABORTED - POWER LOSS" },
	{ NVME_SC_INTERNAL_DEVICE_ERROR,	"INTERNAL DEVICE ERROR" },
	{ NVME_SC_ABORTED_BY_REQUEST,		"ABORTED - BY REQUEST" },
	{ NVME_SC_ABORTED_SQ_DELETION,		"ABORTED - SQ DELETION" },
	{ NVME_SC_ABORTED_FAILED_FUSED,		"ABORTED - FAILED FUSED" },
	{ NVME_SC_ABORTED_MISSING_FUSED,	"ABORTED - MISSING FUSED" },
	{ NVME_SC_INVALID_NAMESPACE_OR_FORMAT,	"INVALID NAMESPACE OR FORMAT" },
	{ NVME_SC_COMMAND_SEQUENCE_ERROR,	"COMMAND SEQUENCE ERROR" },
	{ NVME_SC_INVALID_SGL_SEG_DESCRIPTOR,	"INVALID SGL SEGMENT DESCRIPTOR" },
	{ NVME_SC_INVALID_NUM_SGL_DESCIRPTORS,	"INVALID NUMBER OF SGL DESCRIPTORS" },
	{ NVME_SC_DATA_SGL_LENGTH_INVALID,	"DATA SGL LENGTH INVALID" },
	{ NVME_SC_METADATA_SGL_LENGTH_INVALID,	"METADATA SGL LENGTH INVALID" },
	{ NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID,	"SGL DESCRIPTOR TYPE INVALID" },
	{ NVME_SC_INVALID_CONTROLLER_MEM_BUF,	"INVALID CONTROLLER MEMORY BUFFER" },
	{ NVME_SC_INVALID_PRP_OFFSET,		"INVALID PRP OFFSET" },
	{ NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED,	"ATOMIC WRITE UNIT EXCEEDED" },
	{ NVME_SC_LBA_OUT_OF_RANGE,		"LBA OUT OF RANGE" },
	{ NVME_SC_CAPACITY_EXCEEDED,		"CAPACITY EXCEEDED" },
	{ NVME_SC_NAMESPACE_NOT_READY,		"NAMESPACE NOT READY" },
	{ NVME_SC_RESERVATION_CONFLICT,		"RESERVATION CONFLICT" },
	{ NVME_SC_FORMAT_IN_PROGRESS,		"FORMAT IN PROGRESS" },
	{ 0xFFFF,				"GENERIC" }
};

static const struct nvme_qpair_string command_specific_status[] = {
	{ NVME_SC_COMPLETION_QUEUE_INVALID,	"INVALID COMPLETION QUEUE" },
	{ NVME_SC_INVALID_QUEUE_IDENTIFIER,	"INVALID QUEUE IDENTIFIER" },
	{ NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED,	"MAX QUEUE SIZE EXCEEDED" },
	{ NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED,	"ABORT CMD LIMIT EXCEEDED" },
	{ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" },
	{ NVME_SC_INVALID_FIRMWARE_SLOT,	"INVALID FIRMWARE SLOT" },
	{ NVME_SC_INVALID_FIRMWARE_IMAGE,	"INVALID FIRMWARE IMAGE" },
	{ NVME_SC_INVALID_INTERRUPT_VECTOR,	"INVALID INTERRUPT VECTOR" },
	{ NVME_SC_INVALID_LOG_PAGE,		"INVALID LOG PAGE" },
	{ NVME_SC_INVALID_FORMAT,		"INVALID FORMAT" },
	{ NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET, "FIRMWARE REQUIRES CONVENTIONAL RESET" },
	{ NVME_SC_INVALID_QUEUE_DELETION,	"INVALID QUEUE DELETION" },
	{ NVME_SC_FEATURE_ID_NOT_SAVEABLE,	"FEATURE ID NOT SAVEABLE" },
	{ NVME_SC_FEATURE_NOT_CHANGEABLE,	"FEATURE NOT CHANGEABLE" },
	{ NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" },
	{ NVME_SC_FIRMWARE_REQ_NVM_RESET,	"FIRMWARE REQUIRES NVM RESET" },
	{ NVME_SC_FIRMWARE_REQ_RESET,		"FIRMWARE REQUIRES RESET" },
	{ NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION, "FIRMWARE REQUIRES MAX TIME VIOLATION" },
	{ NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" },
	{ NVME_SC_OVERLAPPING_RANGE,		"OVERLAPPING RANGE" },
	{ NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" },
	{ NVME_SC_NAMESPACE_ID_UNAVAILABLE,	"NAMESPACE ID UNAVAILABLE" },
	{ NVME_SC_NAMESPACE_ALREADY_ATTACHED,	"NAMESPACE ALREADY ATTACHED" },
	{ NVME_SC_NAMESPACE_IS_PRIVATE,		"NAMESPACE IS PRIVATE" },
	{ NVME_SC_NAMESPACE_NOT_ATTACHED,	"NAMESPACE NOT ATTACHED" },
	{ NVME_SC_THINPROVISIONING_NOT_SUPPORTED, "THINPROVISIONING NOT SUPPORTED" },
	{ NVME_SC_CONTROLLER_LIST_INVALID,	"CONTROLLER LIST INVALID" },
	{ NVME_SC_CONFLICTING_ATTRIBUTES,	"CONFLICTING ATTRIBUTES" },
	{ NVME_SC_INVALID_PROTECTION_INFO,	"INVALID PROTECTION INFO" },
	{ NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE,	"WRITE TO RO PAGE" },
	{ 0xFFFF,				"COMMAND SPECIFIC" }
};

static const struct nvme_qpair_string media_error_status[] = {
	{ NVME_SC_WRITE_FAULTS,			"WRITE FAULTS" },
	{ NVME_SC_UNRECOVERED_READ_ERROR,	"UNRECOVERED READ ERROR" },
	{ NVME_SC_GUARD_CHECK_ERROR,		"GUARD CHECK ERROR" },
	{ NVME_SC_APPLICATION_TAG_CHECK_ERROR,	"APPLICATION TAG CHECK ERROR" },
	{ NVME_SC_REFERENCE_TAG_CHECK_ERROR,	"REFERENCE TAG CHECK ERROR" },
	{ NVME_SC_COMPARE_FAILURE,		"COMPARE FAILURE" },
	{ NVME_SC_ACCESS_DENIED,		"ACCESS DENIED" },
	{ NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK, "DEALLOCATED OR UNWRITTEN BLOCK" },
	{ 0xFFFF,				"MEDIA ERROR" }
};

static inline bool nvme_qpair_is_admin_queue(struct nvme_qpair *qpair)
{
	return qpair->id == 0;
}

static inline bool nvme_qpair_is_io_queue(struct nvme_qpair *qpair)
{
	return qpair->id != 0;
}

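/*
 * Look up a value in a string table; return the sentinel entry's generic
 * name if the value is not found.
 */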
static const char *nvme_qpair_get_string(const struct nvme_qpair_string *strings,
					 uint16_t value)
{
	const struct nvme_qpair_string *entry;

	entry = strings;

	while (entry->value != 0xFFFF) {
		if (entry->value == value)
			return entry->str;
		entry++;
	}
	return entry->str;
}

static void nvme_qpair_admin_qpair_print_command(struct nvme_qpair *qpair,
						 struct nvme_cmd *cmd)
{
	nvme_info("%s (%02x) sqid:%d cid:%d nsid:%x cdw10:%08x cdw11:%08x\n",
		  nvme_qpair_get_string(admin_opcode, cmd->opc), cmd->opc,
		  qpair->id, cmd->cid,
		  cmd->nsid, cmd->cdw10, cmd->cdw11);
}

static void nvme_qpair_io_qpair_print_command(struct nvme_qpair *qpair,
					      struct nvme_cmd *cmd)
{
	nvme_assert(qpair != NULL, "print_command: qpair == NULL\n");
	nvme_assert(cmd != NULL, "print_command: cmd == NULL\n");

	switch ((int)cmd->opc) {
	case NVME_OPC_WRITE:
	case NVME_OPC_READ:
	case NVME_OPC_WRITE_UNCORRECTABLE:
	case NVME_OPC_COMPARE:
		nvme_info("%s sqid:%d cid:%d nsid:%d lba:%llu len:%d\n",
			  nvme_qpair_get_string(io_opcode, cmd->opc),
			  qpair->id, cmd->cid, cmd->nsid,
			  ((unsigned long long)cmd->cdw11 << 32) + cmd->cdw10,
			  (cmd->cdw12 & 0xFFFF) + 1);
		break;
	case NVME_OPC_FLUSH:
	case NVME_OPC_DATASET_MANAGEMENT:
		nvme_info("%s sqid:%d cid:%d nsid:%d\n",
			  nvme_qpair_get_string(io_opcode, cmd->opc),
			  qpair->id, cmd->cid, cmd->nsid);
		break;
	default:
		nvme_info("%s (%02x) sqid:%d cid:%d nsid:%d\n",
			  nvme_qpair_get_string(io_opcode, cmd->opc),
			  cmd->opc, qpair->id, cmd->cid, cmd->nsid);
		break;
	}
}

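/* Log a command, dispatching to the admin or I/O formatter based on the queue. */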
static void nvme_qpair_print_command(struct nvme_qpair *qpair,
				     struct nvme_cmd *cmd)
{
	nvme_assert(qpair != NULL, "qpair can not be NULL");
	nvme_assert(cmd != NULL, "cmd can not be NULL");

	if (nvme_qpair_is_admin_queue(qpair))
		return nvme_qpair_admin_qpair_print_command(qpair, cmd);

	return nvme_qpair_io_qpair_print_command(qpair, cmd);
}

static const char *get_status_string(uint16_t sct, uint16_t sc)
{
	const struct nvme_qpair_string *entry;

	switch (sct) {
	case NVME_SCT_GENERIC:
		entry = generic_status;
		break;
	case NVME_SCT_COMMAND_SPECIFIC:
		entry = command_specific_status;
		break;
	case NVME_SCT_MEDIA_ERROR:
		entry = media_error_status;
		break;
	case NVME_SCT_VENDOR_SPECIFIC:
		return "VENDOR SPECIFIC";
	default:
		return "RESERVED";
	}

	return nvme_qpair_get_string(entry, sc);
}

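/* Log a completion entry with its status code translated to text. */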
static void nvme_qpair_print_completion(struct nvme_qpair *qpair,
					struct nvme_cpl *cpl)
{
	nvme_info("Cpl: %s (%02x/%02x) sqid:%d cid:%d "
		  "cdw0:%x sqhd:%04x p:%x m:%x dnr:%x\n",
		  get_status_string(cpl->status.sct, cpl->status.sc),
		  cpl->status.sct,
		  cpl->status.sc,
		  cpl->sqid,
		  cpl->cid,
		  cpl->cdw0,
		  cpl->sqhd,
		  cpl->status.p,
		  cpl->status.m,
		  cpl->status.dnr);
}

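/*
 * Decide whether a failed command should be retried, based on its status
 * code type, status code and the Do Not Retry bit.
 */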
static bool nvme_qpair_completion_retry(const struct nvme_cpl *cpl)
{
	/*
	 * TODO: spec is not clear how commands that are aborted due
	 *  to TLER will be marked.  So for now, it seems
	 *  NAMESPACE_NOT_READY is the only case where we should
	 *  look at the DNR bit.
	 */
	switch ((int)cpl->status.sct) {
	case NVME_SCT_GENERIC:
		switch ((int)cpl->status.sc) {
		case NVME_SC_NAMESPACE_NOT_READY:
		case NVME_SC_FORMAT_IN_PROGRESS:
			if (cpl->status.dnr)
				return false;
			return true;
		case NVME_SC_INVALID_OPCODE:
		case NVME_SC_INVALID_FIELD:
		case NVME_SC_COMMAND_ID_CONFLICT:
		case NVME_SC_DATA_TRANSFER_ERROR:
		case NVME_SC_ABORTED_POWER_LOSS:
		case NVME_SC_INTERNAL_DEVICE_ERROR:
		case NVME_SC_ABORTED_BY_REQUEST:
		case NVME_SC_ABORTED_SQ_DELETION:
		case NVME_SC_ABORTED_FAILED_FUSED:
		case NVME_SC_ABORTED_MISSING_FUSED:
		case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
		case NVME_SC_COMMAND_SEQUENCE_ERROR:
		case NVME_SC_LBA_OUT_OF_RANGE:
		case NVME_SC_CAPACITY_EXCEEDED:
		default:
			return false;
		}
	case NVME_SCT_COMMAND_SPECIFIC:
	case NVME_SCT_MEDIA_ERROR:
	case NVME_SCT_VENDOR_SPECIFIC:
	default:
		return false;
	}
}

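/*
 * Initialize a tracker with its command identifier and the bus address of
 * the PRP/SGL list embedded in the tracker itself.
 */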
static void nvme_qpair_construct_tracker(struct nvme_tracker *tr,
					 uint16_t cid, uint64_t phys_addr)
{
	tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
	tr->cid = cid;
	tr->active = false;
}

static inline void nvme_qpair_copy_command(struct nvme_cmd *dst,
					   const struct nvme_cmd *src)
{
	/* dst and src are known to be non-overlapping and 64-byte aligned. */
#if defined(__AVX__)
	__m256i *d256 = (__m256i *)dst;
	const __m256i *s256 = (const __m256i *)src;

	_mm256_store_si256(&d256[0], _mm256_load_si256(&s256[0]));
	_mm256_store_si256(&d256[1], _mm256_load_si256(&s256[1]));
#elif defined(__SSE2__)
	__m128i *d128 = (__m128i *)dst;
	const __m128i *s128 = (const __m128i *)src;

	_mm_store_si128(&d128[0], _mm_load_si128(&s128[0]));
	_mm_store_si128(&d128[1], _mm_load_si128(&s128[1]));
	_mm_store_si128(&d128[2], _mm_load_si128(&s128[2]));
	_mm_store_si128(&d128[3], _mm_load_si128(&s128[3]));
#else
	*dst = *src;
#endif
}

static void nvme_qpair_submit_tracker(struct nvme_qpair *qpair,
				      struct nvme_tracker *tr)
{
	struct nvme_request *req = tr->req;

	/*
	 * Set the tracker active and copy its command
	 * to the submission queue.
	 */
	nvme_debug("qpair %d: Submit command, tail %d, cid %d / %d\n",
		   qpair->id,
		   (int)qpair->sq_tail,
		   (int)tr->cid,
		   (int)tr->req->cmd.cid);

	qpair->tr[tr->cid].active = true;
	nvme_qpair_copy_command(&qpair->cmd[qpair->sq_tail], &req->cmd);

	if (++qpair->sq_tail == qpair->entries)
		qpair->sq_tail = 0;

	nvme_wmb();
	nvme_mmio_write_4(qpair->sq_tdbl, qpair->sq_tail);
}

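/*
 * Complete a tracker: invoke the request callback (or resubmit the request
 * if the failure is retryable), return the tracker to the free list and,
 * when possible, submit the next queued request.
 */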
static void nvme_qpair_complete_tracker(struct nvme_qpair *qpair,
					struct nvme_tracker *tr,
					struct nvme_cpl *cpl,
					bool print_on_error)
{
	struct nvme_request *req = tr->req;
	bool retry, error;

	if (!req) {
		nvme_crit("tracker has no request\n");
		qpair->tr[cpl->cid].active = false;
		goto done;
	}

	error = nvme_cpl_is_error(cpl);
	retry = error && nvme_qpair_completion_retry(cpl) &&
		(req->retries < NVME_MAX_RETRY_COUNT);
	if (error && print_on_error) {
		nvme_qpair_print_command(qpair, &req->cmd);
		nvme_qpair_print_completion(qpair, cpl);
	}

	qpair->tr[cpl->cid].active = false;

	if (cpl->cid != req->cmd.cid)
		nvme_crit("cpl and command CID mismatch (%d / %d)\n",
			  (int)cpl->cid, (int)req->cmd.cid);

	if (retry) {
		req->retries++;
		nvme_qpair_submit_tracker(qpair, tr);
		return;
	}

	if (req->cb_fn)
		req->cb_fn(req->cb_arg, cpl);

	nvme_request_free(req);

done:
	tr->req = NULL;

	LIST_REMOVE(tr, list);
	LIST_INSERT_HEAD(&qpair->free_tr, tr, list);

	/*
	 * If the controller is in the middle of a reset, don't
	 * try to submit queued requests here - let the reset logic
	 * handle that instead.
	 */
	if (!STAILQ_EMPTY(&qpair->queued_req) &&
	    !qpair->ctrlr->resetting) {
		req = STAILQ_FIRST(&qpair->queued_req);
		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
		nvme_qpair_submit_request(qpair, req);
	}
}

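/*
 * Complete a tracker with a locally built completion entry, used to abort
 * commands without a real completion from the controller.
 */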
static void nvme_qpair_manual_complete_tracker(struct nvme_qpair *qpair,
					       struct nvme_tracker *tr,
					       uint32_t sct,
					       uint32_t sc,
					       uint32_t dnr,
					       bool print_on_error)
{
	struct nvme_cpl	cpl;

	memset(&cpl, 0, sizeof(cpl));
	cpl.sqid = qpair->id;
	cpl.cid = tr->cid;
	cpl.status.sct = sct;
	cpl.status.sc = sc;
	cpl.status.dnr = dnr;

	nvme_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
}

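/*
 * Complete a request that was never assigned a tracker using a locally
 * built completion entry.
 */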
static void nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
					       struct nvme_request *req,
					       uint32_t sct, uint32_t sc,
					       bool print_on_error)
{
	struct nvme_cpl	cpl;
	bool error;

	memset(&cpl, 0, sizeof(cpl));
	cpl.sqid = qpair->id;
	cpl.status.sct = sct;
	cpl.status.sc = sc;

	error = nvme_cpl_is_error(&cpl);

	if (error && print_on_error) {
		nvme_qpair_print_command(qpair, &req->cmd);
		nvme_qpair_print_completion(qpair, &cpl);
	}

	if (req->cb_fn)
		req->cb_fn(req->cb_arg, &cpl);

	nvme_request_free(req);
}

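/* Abort all outstanding asynchronous event requests on this queue pair. */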
static void nvme_qpair_abort_aers(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;

	tr = LIST_FIRST(&qpair->outstanding_tr);
	while (tr != NULL) {
		nvme_assert(tr->req != NULL,
			    "tr->req == NULL in abort_aers\n");
		if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) {
			nvme_qpair_manual_complete_tracker(qpair, tr,
					      NVME_SCT_GENERIC,
					      NVME_SC_ABORTED_SQ_DELETION,
					      0, false);
			tr = LIST_FIRST(&qpair->outstanding_tr);
			continue;
		}
		tr = LIST_NEXT(tr, list);
	}
}

static inline void _nvme_qpair_admin_qpair_destroy(struct nvme_qpair *qpair)
{
	nvme_qpair_abort_aers(qpair);
}

static inline void _nvme_qpair_req_bad_phys(struct nvme_qpair *qpair,
					    struct nvme_tracker *tr)
{
	/*
	 * Bad vtophys translation, so abort this request
	 * and return immediately, without retry.
	 */
	nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
					   NVME_SC_INVALID_FIELD,
					   1, true);
}

/*
 * Build a PRP list describing a physically contiguous payload buffer.
 */
static int _nvme_qpair_build_contig_request(struct nvme_qpair *qpair,
					    struct nvme_request *req,
					    struct nvme_tracker *tr)
{
	uint64_t phys_addr;
	void *seg_addr;
	uint32_t nseg, cur_nseg, modulo, unaligned;
	void *md_payload;
	void *payload = req->payload.u.contig + req->payload_offset;

	phys_addr = nvme_mem_vtophys(payload);
	if (phys_addr == NVME_VTOPHYS_ERROR) {
		_nvme_qpair_req_bad_phys(qpair, tr);
		return -1;
	}
	nseg = req->payload_size >> PAGE_SHIFT;
	modulo = req->payload_size & (PAGE_SIZE - 1);
	unaligned = phys_addr & (PAGE_SIZE - 1);
	if (modulo || unaligned)
		nseg += 1 + ((modulo + unaligned - 1) >> PAGE_SHIFT);

	if (req->payload.md) {
		md_payload = req->payload.md + req->md_offset;
		tr->req->cmd.mptr = nvme_mem_vtophys(md_payload);
		if (tr->req->cmd.mptr == NVME_VTOPHYS_ERROR) {
			_nvme_qpair_req_bad_phys(qpair, tr);
			return -1;
		}
	}

	tr->req->cmd.psdt = NVME_PSDT_PRP;
	tr->req->cmd.dptr.prp.prp1 = phys_addr;
	if (nseg == 2) {
		seg_addr = payload + PAGE_SIZE - unaligned;
		tr->req->cmd.dptr.prp.prp2 = nvme_mem_vtophys(seg_addr);
	} else if (nseg > 2) {
		cur_nseg = 1;
		tr->req->cmd.dptr.prp.prp2 = (uint64_t)tr->prp_sgl_bus_addr;
		while (cur_nseg < nseg) {
			seg_addr = payload + cur_nseg * PAGE_SIZE - unaligned;
			phys_addr = nvme_mem_vtophys(seg_addr);
			if (phys_addr == NVME_VTOPHYS_ERROR) {
				_nvme_qpair_req_bad_phys(qpair, tr);
				return -1;
			}
			tr->u.prp[cur_nseg - 1] = phys_addr;
			cur_nseg++;
		}
	}

	return 0;
}

/*
 * Build an SGL list describing a scattered payload buffer.
 */
static int _nvme_qpair_build_hw_sgl_request(struct nvme_qpair *qpair,
					    struct nvme_request *req,
					    struct nvme_tracker *tr)
{
	struct nvme_sgl_descriptor *sgl;
	uint64_t phys_addr;
	uint32_t remaining_transfer_len, length, nseg = 0;
	int ret;

	/*
	 * Build scattered payloads.
	 */
	nvme_assert(req->payload_size != 0,
		    "cannot build SGL for zero-length transfer\n");
	nvme_assert(req->payload.type == NVME_PAYLOAD_TYPE_SGL,
		    "sgl payload type required\n");
	nvme_assert(req->payload.u.sgl.reset_sgl_fn != NULL,
		    "sgl reset callback required\n");
	nvme_assert(req->payload.u.sgl.next_sge_fn != NULL,
		    "sgl callback required\n");
	req->payload.u.sgl.reset_sgl_fn(req->payload.u.sgl.cb_arg,
					req->payload_offset);

	sgl = tr->u.sgl;
	req->cmd.psdt = NVME_PSDT_SGL_MPTR_SGL;
	req->cmd.dptr.sgl1.unkeyed.subtype = 0;

	remaining_transfer_len = req->payload_size;

	while (remaining_transfer_len > 0) {

		if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
			_nvme_qpair_req_bad_phys(qpair, tr);
			return -1;
		}

		ret = req->payload.u.sgl.next_sge_fn(req->payload.u.sgl.cb_arg,
						     &phys_addr, &length);
		if (ret != 0) {
			_nvme_qpair_req_bad_phys(qpair, tr);
			return ret;
		}

		length = nvme_min(remaining_transfer_len, length);
		remaining_transfer_len -= length;

		sgl->unkeyed.type = NVME_SGL_TYPE_DATA_BLOCK;
		sgl->unkeyed.length = length;
		sgl->address = phys_addr;
		sgl->unkeyed.subtype = 0;

		sgl++;
		nseg++;
	}

	if (nseg == 1) {
		/*
		 * The whole transfer can be described by a single Scatter
		 * Gather List descriptor. Use the special case described
		 * by the spec where SGL1's type is Data Block.
		 * This means the SGL in the tracker is not used at all,
		 * so copy the first (and only) SGL element into SGL1.
		 */
		req->cmd.dptr.sgl1.unkeyed.type = NVME_SGL_TYPE_DATA_BLOCK;
		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
	} else {
		/* For now, only a single SGL segment is supported. */
		req->cmd.dptr.sgl1.unkeyed.type = NVME_SGL_TYPE_LAST_SEGMENT;
		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
		req->cmd.dptr.sgl1.unkeyed.length =
			nseg * sizeof(struct nvme_sgl_descriptor);
	}

	return 0;
}

/*
 * Build a Physical Region Page (PRP) list describing a scattered payload buffer.
 */
static int _nvme_qpair_build_prps_sgl_request(struct nvme_qpair *qpair,
					      struct nvme_request *req,
					      struct nvme_tracker *tr)
{
	uint64_t phys_addr, prp2 = 0;
	uint32_t data_transferred, remaining_transfer_len, length;
	uint32_t nseg, cur_nseg, total_nseg = 0, last_nseg = 0;
	uint32_t modulo, unaligned, sge_count = 0;
	int ret;

	/*
	 * Build scattered payloads.
	 */
	nvme_assert(req->payload.type == NVME_PAYLOAD_TYPE_SGL,
		    "sgl payload type required\n");
	nvme_assert(req->payload.u.sgl.reset_sgl_fn != NULL,
		    "sgl reset callback required\n");
	req->payload.u.sgl.reset_sgl_fn(req->payload.u.sgl.cb_arg,
					req->payload_offset);

	remaining_transfer_len = req->payload_size;

	while (remaining_transfer_len > 0) {

		nvme_assert(req->payload.u.sgl.next_sge_fn != NULL,
			    "sgl callback required\n");

		ret = req->payload.u.sgl.next_sge_fn(req->payload.u.sgl.cb_arg,
						    &phys_addr, &length);
		if (ret != 0) {
			_nvme_qpair_req_bad_phys(qpair, tr);
			return -1;
		}

		data_transferred = nvme_min(remaining_transfer_len, length);

		nseg = data_transferred >> PAGE_SHIFT;
		modulo = data_transferred & (PAGE_SIZE - 1);
		unaligned = phys_addr & (PAGE_SIZE - 1);
		if (modulo || unaligned)
			nseg += 1 + ((modulo + unaligned - 1) >> PAGE_SHIFT);

		if (total_nseg == 0) {
			req->cmd.psdt = NVME_PSDT_PRP;
			req->cmd.dptr.prp.prp1 = phys_addr;
		}

		total_nseg += nseg;
		sge_count++;
		remaining_transfer_len -= data_transferred;

		if (total_nseg == 2) {
			if (sge_count == 1)
				tr->req->cmd.dptr.prp.prp2 = phys_addr +
					PAGE_SIZE - unaligned;
			else if (sge_count == 2)
				tr->req->cmd.dptr.prp.prp2 = phys_addr;
			/* save prp2 value */
			prp2 = tr->req->cmd.dptr.prp.prp2;
		} else if (total_nseg > 2) {
			if (sge_count == 1)
				cur_nseg = 1;
			else
				cur_nseg = 0;

			tr->req->cmd.dptr.prp.prp2 =
				(uint64_t)tr->prp_sgl_bus_addr;

			while (cur_nseg < nseg) {
				if (prp2) {
					tr->u.prp[0] = prp2;
					tr->u.prp[last_nseg + 1] = phys_addr +
						cur_nseg * PAGE_SIZE - unaligned;
				} else {
					tr->u.prp[last_nseg] = phys_addr +
						cur_nseg * PAGE_SIZE - unaligned;
				}
				last_nseg++;
				cur_nseg++;

				/* physical address and length check */
				if (remaining_transfer_len ||
				    (!remaining_transfer_len &&
				     (cur_nseg < nseg))) {
					if ((length & (PAGE_SIZE - 1)) ||
					    unaligned) {
						_nvme_qpair_req_bad_phys(qpair,
									 tr);
						return -1;
					}
				}
			}
		}
	}

	return 0;
}

static void _nvme_qpair_admin_qpair_enable(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr, *tr_temp;

	/*
	 * Manually abort each outstanding admin command. Do not retry
	 * admin commands found here, since they will be left over from
	 * a controller reset and it is likely that the context in which
	 * the command was issued no longer applies.
	 */
	LIST_FOREACH_SAFE(tr, &qpair->outstanding_tr, list, tr_temp) {
		nvme_info("Aborting outstanding admin command\n");
		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
						   NVME_SC_ABORTED_BY_REQUEST,
						   1 /* do not retry */, true);
	}

	qpair->enabled = true;
}

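/*
 * Enable an I/O queue pair after a controller reset, aborting any commands
 * that were queued or still outstanding across the reset.
 */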
static void _nvme_qpair_io_qpair_enable(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr, *temp;
	struct nvme_request *req;

	qpair->enabled = true;

	qpair->ctrlr->enabled_io_qpairs++;

	/* Manually abort each queued I/O. */
	while (!STAILQ_EMPTY(&qpair->queued_req)) {
		req = STAILQ_FIRST(&qpair->queued_req);
		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
		nvme_info("Aborting queued I/O command\n");
		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
						   NVME_SC_ABORTED_BY_REQUEST,
						   true);
	}

	/* Manually abort each outstanding I/O. */
	LIST_FOREACH_SAFE(tr, &qpair->outstanding_tr, list, temp) {
		nvme_info("Aborting outstanding I/O command\n");
		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
						   NVME_SC_ABORTED_BY_REQUEST,
						   0, true);
	}
}

static inline void _nvme_qpair_admin_qpair_disable(struct nvme_qpair *qpair)
{
	qpair->enabled = false;
	nvme_qpair_abort_aers(qpair);
}

static inline void _nvme_qpair_io_qpair_disable(struct nvme_qpair *qpair)
{
	qpair->enabled = false;

	qpair->ctrlr->enabled_io_qpairs--;
}

/*
 * Reserve room for the submission queue
 * in the controller memory buffer.
 */
static int nvme_ctrlr_reserve_sq_in_cmb(struct nvme_ctrlr *ctrlr,
					uint16_t entries,
					uint64_t aligned, uint64_t *offset)
{
	uint64_t round_offset;
	const uint64_t length = entries * sizeof(struct nvme_cmd);

	round_offset = ctrlr->cmb_current_offset;
	round_offset = (round_offset + (aligned - 1)) & ~(aligned - 1);

	if (round_offset + length > ctrlr->cmb_size)
		return -1;

	*offset = round_offset;
	ctrlr->cmb_current_offset = round_offset + length;

	return 0;
}

/*
 * Initialize a queue pair on the host side.
 */
int nvme_qpair_construct(struct nvme_ctrlr *ctrlr, struct nvme_qpair *qpair,
			 enum nvme_qprio qprio,
			 uint16_t entries, uint16_t trackers)
{
	volatile uint32_t *doorbell_base;
	struct nvme_tracker *tr;
	uint64_t offset;
	unsigned long phys_addr = 0;
	uint16_t i;
	int ret;

	nvme_assert(entries != 0, "Invalid number of entries\n");
	nvme_assert(trackers != 0, "Invalid trackers\n");

	qpair->entries = entries;
	qpair->trackers = trackers;
	qpair->qprio = qprio;
	qpair->sq_in_cmb = false;
	qpair->ctrlr = ctrlr;

	if (ctrlr->opts.use_cmb_sqs) {
		/*
		 * Reserve room for the submission queue in ctrlr
		 * memory buffer.
		 */
		ret = nvme_ctrlr_reserve_sq_in_cmb(ctrlr, entries,
						   PAGE_SIZE,
						   &offset);
		if (ret == 0) {

			qpair->cmd = ctrlr->cmb_bar_virt_addr + offset;
			qpair->cmd_bus_addr = ctrlr->cmb_bar_phys_addr + offset;
			qpair->sq_in_cmb = true;

			nvme_debug("Allocated qpair %d cmd in cmb at %p / 0x%llx\n",
				   qpair->id,
				   qpair->cmd, qpair->cmd_bus_addr);

		}
	}

	if (qpair->sq_in_cmb == false) {

		qpair->cmd =
			nvme_mem_alloc_node(sizeof(struct nvme_cmd) * entries,
				    PAGE_SIZE, NVME_NODE_ID_ANY,
				    (unsigned long *) &qpair->cmd_bus_addr);
		if (!qpair->cmd) {
			nvme_err("Allocate qpair commands failed\n");
			goto fail;
		}
		memset(qpair->cmd, 0, sizeof(struct nvme_cmd) * entries);

		nvme_debug("Allocated qpair %d cmd %p / 0x%llx\n",
			   qpair->id,
			   qpair->cmd, qpair->cmd_bus_addr);
	}

	qpair->cpl = nvme_mem_alloc_node(sizeof(struct nvme_cpl) * entries,
				 PAGE_SIZE, NVME_NODE_ID_ANY,
				 (unsigned long *) &qpair->cpl_bus_addr);
	if (!qpair->cpl) {
		nvme_err("Allocate qpair completions failed\n");
		goto fail;
	}
	memset(qpair->cpl, 0, sizeof(struct nvme_cpl) * entries);

	nvme_debug("Allocated qpair %d cpl at %p / 0x%llx\n",
		   qpair->id,
		   qpair->cpl,
		   qpair->cpl_bus_addr);

	doorbell_base = &ctrlr->regs->doorbell[0].sq_tdbl;
	qpair->sq_tdbl = doorbell_base +
		(2 * qpair->id + 0) * ctrlr->doorbell_stride_u32;
	qpair->cq_hdbl = doorbell_base +
		(2 * qpair->id + 1) * ctrlr->doorbell_stride_u32;

	LIST_INIT(&qpair->free_tr);
	LIST_INIT(&qpair->outstanding_tr);
	STAILQ_INIT(&qpair->free_req);
	STAILQ_INIT(&qpair->queued_req);

	/* Request pool */
	if (nvme_request_pool_construct(qpair)) {
		nvme_err("Create request pool failed\n");
		goto fail;
	}

	/*
	 * Reserve space for all of the trackers in a single allocation.
	 * struct nvme_tracker must be padded so that its size is already
	 * a power of 2. This ensures the PRP list embedded in the nvme_tracker
	 * object will not span a 4KB boundary, while allowing access to
	 * trackers in tr[] via normal array indexing.
	 */
	qpair->tr = nvme_mem_alloc_node(sizeof(struct nvme_tracker) * trackers,
					sizeof(struct nvme_tracker),
					NVME_NODE_ID_ANY, &phys_addr);
	if (!qpair->tr) {
		nvme_err("Allocate request trackers failed\n");
		goto fail;
	}
	memset(qpair->tr, 0, sizeof(struct nvme_tracker) * trackers);

	nvme_debug("Allocated qpair %d trackers at %p / 0x%lx\n",
		   qpair->id, qpair->tr, phys_addr);

	for (i = 0; i < trackers; i++) {
		tr = &qpair->tr[i];
		nvme_qpair_construct_tracker(tr, i, phys_addr);
		LIST_INSERT_HEAD(&qpair->free_tr, tr, list);
		phys_addr += sizeof(struct nvme_tracker);
	}

	nvme_qpair_reset(qpair);

	return 0;

fail:
	nvme_qpair_destroy(qpair);

	return -1;
}

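/* Free all host-side resources associated with a queue pair. */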
void nvme_qpair_destroy(struct nvme_qpair *qpair)
{
	if (nvme_qpair_is_admin_queue(qpair))
		_nvme_qpair_admin_qpair_destroy(qpair);

	if (qpair->cmd && !qpair->sq_in_cmb) {
		nvme_free(qpair->cmd);
		qpair->cmd = NULL;
	}
	if (qpair->cpl) {
		nvme_free(qpair->cpl);
		qpair->cpl = NULL;
	}
	if (qpair->tr) {
		nvme_free(qpair->tr);
		qpair->tr = NULL;
	}
	nvme_request_pool_destroy(qpair);
}

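/*
 * Return true if the queue pair is enabled, enabling it first when no
 * controller reset is in progress.
 */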
bool nvme_qpair_enabled(struct nvme_qpair *qpair)
{
	if (!qpair->enabled && !qpair->ctrlr->resetting)
		nvme_qpair_enable(qpair);

	return qpair->enabled;
}

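/*
 * Submit a request: split parent requests are submitted child by child;
 * other requests get a free tracker, have their PRP or SGL entries built
 * and are posted to the submission queue, or are queued if no tracker is
 * available or the queue pair is disabled.
 */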
int nvme_qpair_submit_request(struct nvme_qpair *qpair,
			      struct nvme_request *req)
{
	struct nvme_tracker *tr;
	struct nvme_request *child_req, *tmp;
	struct nvme_ctrlr *ctrlr = qpair->ctrlr;
	bool child_req_failed = false;
	int ret = 0;

	if (ctrlr->failed) {
		nvme_request_free(req);
		return ENXIO;
	}

	nvme_qpair_enabled(qpair);

	if (req->child_reqs) {

		/*
		 * This is a split (parent) request. Submit all of the
		 * children but not the parent request itself, since the
		 * parent is the original unsplit request.
		 */
		TAILQ_FOREACH_SAFE(child_req, &req->children, child_tailq, tmp) {
			if (!child_req_failed) {
				ret = nvme_qpair_submit_request(qpair, child_req);
				if (ret != 0)
					child_req_failed = true;
			} else {
				/* Free the remaining child requests since
				 * one child request failed. */
				nvme_request_remove_child(req, child_req);
				nvme_request_free(child_req);
			}
		}

		return ret;
	}

	tr = LIST_FIRST(&qpair->free_tr);
	if (tr == NULL || !qpair->enabled) {
		/*
		 * No tracker is available, or the qpair is disabled due
		 * to an in-progress controller-level reset.
		 *
		 * Put the request on the qpair's request queue to be
		 * processed when a tracker frees up via a command
		 * completion or when the controller reset is completed.
		 */
		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
		return 0;
	}

	/* Remove the tracker from the free list. */
	LIST_REMOVE(tr, list);
	LIST_INSERT_HEAD(&qpair->outstanding_tr, tr, list);
	tr->req = req;
	req->cmd.cid = tr->cid;

	if (req->payload_size == 0) {
		/* Null payload - leave PRP fields zeroed */
		ret = 0;
	} else if (req->payload.type == NVME_PAYLOAD_TYPE_CONTIG) {
		ret = _nvme_qpair_build_contig_request(qpair, req, tr);
	} else if (req->payload.type == NVME_PAYLOAD_TYPE_SGL) {
		if (ctrlr->flags & NVME_CTRLR_SGL_SUPPORTED)
			ret = _nvme_qpair_build_hw_sgl_request(qpair, req, tr);
		else
			ret = _nvme_qpair_build_prps_sgl_request(qpair, req, tr);
	} else {
		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
						   NVME_SC_INVALID_FIELD,
						   1 /* do not retry */, true);
		ret = -EINVAL;
	}

	if (ret == 0)
		nvme_qpair_submit_tracker(qpair, tr);

	return ret;
}

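/*
 * Reap up to max_completions entries from the completion queue, completing
 * the tracker of each entry, and ring the completion queue head doorbell.
 */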
unsigned int nvme_qpair_poll(struct nvme_qpair *qpair,
			     unsigned int max_completions)
{
	struct nvme_tracker *tr;
	struct nvme_cpl	*cpl;
	uint32_t num_completions = 0;

	if (!nvme_qpair_enabled(qpair))
		/*
		 * The qpair is not enabled, likely because a controller
		 * reset is in progress. Ignore the interrupt - any I/O
		 * that was associated with this interrupt will get
		 * retried when the reset is complete.
		 */
		return 0;

	if ((max_completions == 0) ||
	    (max_completions > (qpair->entries - 1U)))
		/*
		 * max_completions == 0 means unlimited, but complete at most
		 * one queue depth batch of I/O at a time so that the completion
		 * queue doorbells don't wrap around.
		 */
		max_completions = qpair->entries - 1;

	while (1) {

		cpl = &qpair->cpl[qpair->cq_head];
		if (cpl->status.p != qpair->phase)
			break;

		tr = &qpair->tr[cpl->cid];
		if (tr->active) {
			nvme_qpair_complete_tracker(qpair, tr, cpl, true);
		} else {
			nvme_info("cpl does not map to outstanding cmd\n");
			nvme_qpair_print_completion(qpair, cpl);
			nvme_panic("received completion for unknown cmd\n");
		}

		if (++qpair->cq_head == qpair->entries) {
			qpair->cq_head = 0;
			qpair->phase = !qpair->phase;
		}

		if (++num_completions == max_completions)
			break;
	}

	if (num_completions > 0)
		nvme_mmio_write_4(qpair->cq_hdbl, qpair->cq_head);

	return num_completions;
}

void nvme_qpair_reset(struct nvme_qpair *qpair)
{
	qpair->sq_tail = qpair->cq_head = 0;

	/*
	 * The first time through the completion queue, the hardware will
	 * set the phase bit of each completion to 1, so start out looking
	 * for entries with the phase bit set to 1. The expected phase is
	 * toggled each time the completion queue wraps around.
	 */
	qpair->phase = 1;

	memset(qpair->cmd, 0, qpair->entries * sizeof(struct nvme_cmd));
	memset(qpair->cpl, 0, qpair->entries * sizeof(struct nvme_cpl));
}

void nvme_qpair_enable(struct nvme_qpair *qpair)
{
	if (nvme_qpair_is_io_queue(qpair))
		_nvme_qpair_io_qpair_enable(qpair);
	else
		_nvme_qpair_admin_qpair_enable(qpair);
}

void nvme_qpair_disable(struct nvme_qpair *qpair)
{
	if (nvme_qpair_is_io_queue(qpair))
		_nvme_qpair_io_qpair_disable(qpair);
	else
		_nvme_qpair_admin_qpair_disable(qpair);
}

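/*
 * Fail a queue pair: complete every queued and outstanding command with an
 * aborted status so their callbacks can observe the failure.
 */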
void nvme_qpair_fail(struct nvme_qpair *qpair)
{
	struct nvme_tracker *tr;
	struct nvme_request *req;

	while (!STAILQ_EMPTY(&qpair->queued_req)) {

		nvme_notice("Failing queued I/O command\n");
		req = STAILQ_FIRST(&qpair->queued_req);
		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
						   NVME_SC_ABORTED_BY_REQUEST,
						   true);

	}

	/* Manually abort each outstanding I/O. */
	while (!LIST_EMPTY(&qpair->outstanding_tr)) {

		/*
		 * Do not remove the tracker. The abort_tracker path
		 * will do that for us.
		 */
		nvme_notice("Failing outstanding I/O command\n");
		tr = LIST_FIRST(&qpair->outstanding_tr);
		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
						   NVME_SC_ABORTED_BY_REQUEST,
						   1, true);

	}
}