xref: /haiku/src/add-ons/kernel/drivers/disk/nvme/libnvme/nvme_qpair.c (revision bb83316a5811a550c4f850d07fa8e328e7ac0a94)
1  /*-
2   *   BSD LICENSE
3   *
4   *   Copyright (c) Intel Corporation. All rights reserved.
5   *   Copyright (c) 2017, Western Digital Corporation or its affiliates.
6   *
7   *   Redistribution and use in source and binary forms, with or without
8   *   modification, are permitted provided that the following conditions
9   *   are met:
10   *
11   *     * Redistributions of source code must retain the above copyright
12   *       notice, this list of conditions and the following disclaimer.
13   *     * Redistributions in binary form must reproduce the above copyright
14   *       notice, this list of conditions and the following disclaimer in
15   *       the documentation and/or other materials provided with the
16   *       distribution.
17   *     * Neither the name of Intel Corporation nor the names of its
18   *       contributors may be used to endorse or promote products derived
19   *       from this software without specific prior written permission.
20   *
21   *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22   *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23   *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24   *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25   *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26   *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27   *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28   *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29   *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30   *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31   *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32   */
33  
34  #include "nvme_internal.h"
35  
36  struct nvme_qpair_string {
37  	uint16_t	value;
38  	const char 	*str;
39  };
40  
41  static const struct nvme_qpair_string admin_opcode[] = {
42  	{ NVME_OPC_DELETE_IO_SQ,	"DELETE IO SQ" },
43  	{ NVME_OPC_CREATE_IO_SQ,	"CREATE IO SQ" },
44  	{ NVME_OPC_GET_LOG_PAGE,	"GET LOG PAGE" },
45  	{ NVME_OPC_DELETE_IO_CQ,	"DELETE IO CQ" },
46  	{ NVME_OPC_CREATE_IO_CQ,	"CREATE IO CQ" },
47  	{ NVME_OPC_IDENTIFY, 		"IDENTIFY" },
48  	{ NVME_OPC_ABORT,		"ABORT" },
49  	{ NVME_OPC_SET_FEATURES,	"SET FEATURES" },
50  	{ NVME_OPC_GET_FEATURES,	"GET FEATURES" },
51  	{ NVME_OPC_ASYNC_EVENT_REQUEST, "ASYNC EVENT REQUEST" },
52  	{ NVME_OPC_NS_MANAGEMENT,	"NAMESPACE MANAGEMENT" },
53  	{ NVME_OPC_FIRMWARE_COMMIT,	"FIRMWARE COMMIT" },
54  	{ NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD, "FIRMWARE IMAGE DOWNLOAD" },
55  	{ NVME_OPC_NS_ATTACHMENT,	"NAMESPACE ATTACHMENT" },
56  	{ NVME_OPC_FORMAT_NVM,		"FORMAT NVM" },
57  	{ NVME_OPC_SECURITY_SEND,	"SECURITY SEND" },
58  	{ NVME_OPC_SECURITY_RECEIVE,	"SECURITY RECEIVE" },
59  	{ 0xFFFF,			"ADMIN COMMAND" }
60  };
61  
62  static const struct nvme_qpair_string io_opcode[] = {
63  	{ NVME_OPC_FLUSH,		"FLUSH" },
64  	{ NVME_OPC_WRITE,		"WRITE" },
65  	{ NVME_OPC_READ,		"READ" },
66  	{ NVME_OPC_WRITE_UNCORRECTABLE, "WRITE UNCORRECTABLE" },
67  	{ NVME_OPC_COMPARE,		"COMPARE" },
68  	{ NVME_OPC_WRITE_ZEROES,	"WRITE ZEROES" },
69  	{ NVME_OPC_DATASET_MANAGEMENT,	"DATASET MANAGEMENT" },
70  	{ NVME_OPC_RESERVATION_REGISTER, "RESERVATION REGISTER" },
71  	{ NVME_OPC_RESERVATION_REPORT,	"RESERVATION REPORT" },
72  	{ NVME_OPC_RESERVATION_ACQUIRE, "RESERVATION ACQUIRE" },
73  	{ NVME_OPC_RESERVATION_RELEASE, "RESERVATION RELEASE" },
74  	{ 0xFFFF,			"IO COMMAND" }
75  };
76  
77  static const struct nvme_qpair_string generic_status[] = {
78  	{ NVME_SC_SUCCESS,			"SUCCESS" },
79  	{ NVME_SC_INVALID_OPCODE,		"INVALID OPCODE" },
80  	{ NVME_SC_INVALID_FIELD,		"INVALID FIELD" },
81  	{ NVME_SC_COMMAND_ID_CONFLICT,		"COMMAND ID CONFLICT" },
82  	{ NVME_SC_DATA_TRANSFER_ERROR,		"DATA TRANSFER ERROR" },
83  	{ NVME_SC_ABORTED_POWER_LOSS,		"ABORTED - POWER LOSS" },
84  	{ NVME_SC_INTERNAL_DEVICE_ERROR,	"INTERNAL DEVICE ERROR" },
85  	{ NVME_SC_ABORTED_BY_REQUEST,		"ABORTED - BY REQUEST" },
86  	{ NVME_SC_ABORTED_SQ_DELETION,		"ABORTED - SQ DELETION" },
87  	{ NVME_SC_ABORTED_FAILED_FUSED,		"ABORTED - FAILED FUSED" },
88  	{ NVME_SC_ABORTED_MISSING_FUSED,	"ABORTED - MISSING FUSED" },
89  	{ NVME_SC_INVALID_NAMESPACE_OR_FORMAT,	"INVALID NAMESPACE OR FORMAT" },
90  	{ NVME_SC_COMMAND_SEQUENCE_ERROR,	"COMMAND SEQUENCE ERROR" },
91  	{ NVME_SC_INVALID_SGL_SEG_DESCRIPTOR,	"INVALID SGL SEGMENT DESCRIPTOR" },
92  	{ NVME_SC_INVALID_NUM_SGL_DESCIRPTORS,	"INVALID NUMBER OF SGL DESCRIPTORS" },
93  	{ NVME_SC_DATA_SGL_LENGTH_INVALID,	"DATA SGL LENGTH INVALID" },
94  	{ NVME_SC_METADATA_SGL_LENGTH_INVALID,	"METADATA SGL LENGTH INVALID" },
95  	{ NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID,	"SGL DESCRIPTOR TYPE INVALID" },
96  	{ NVME_SC_INVALID_CONTROLLER_MEM_BUF,	"INVALID CONTROLLER MEMORY BUFFER" },
97  	{ NVME_SC_INVALID_PRP_OFFSET,		"INVALID PRP OFFSET" },
98  	{ NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED,	"ATOMIC WRITE UNIT EXCEEDED" },
99  	{ NVME_SC_LBA_OUT_OF_RANGE,		"LBA OUT OF RANGE" },
100  	{ NVME_SC_CAPACITY_EXCEEDED,		"CAPACITY EXCEEDED" },
101  	{ NVME_SC_NAMESPACE_NOT_READY,		"NAMESPACE NOT READY" },
102  	{ NVME_SC_RESERVATION_CONFLICT,		"RESERVATION CONFLICT" },
103  	{ NVME_SC_FORMAT_IN_PROGRESS,		"FORMAT IN PROGRESS" },
104  	{ 0xFFFF,				"GENERIC" }
105  };
106  
107  static const struct nvme_qpair_string command_specific_status[] = {
108  	{ NVME_SC_COMPLETION_QUEUE_INVALID,	"INVALID COMPLETION QUEUE" },
109  	{ NVME_SC_INVALID_QUEUE_IDENTIFIER,	"INVALID QUEUE IDENTIFIER" },
110  	{ NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED,	"MAX QUEUE SIZE EXCEEDED" },
111  	{ NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED,	"ABORT CMD LIMIT EXCEEDED" },
112  	{ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED,"ASYNC LIMIT EXCEEDED" },
113  	{ NVME_SC_INVALID_FIRMWARE_SLOT,	"INVALID FIRMWARE SLOT" },
114  	{ NVME_SC_INVALID_FIRMWARE_IMAGE,	"INVALID FIRMWARE IMAGE" },
115  	{ NVME_SC_INVALID_INTERRUPT_VECTOR,	"INVALID INTERRUPT VECTOR" },
116  	{ NVME_SC_INVALID_LOG_PAGE,		"INVALID LOG PAGE" },
117  	{ NVME_SC_INVALID_FORMAT,		"INVALID FORMAT" },
118  	{ NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET,"FIRMWARE REQUIRES CONVENTIONAL RESET" },
119  	{ NVME_SC_INVALID_QUEUE_DELETION,	"INVALID QUEUE DELETION" },
120  	{ NVME_SC_FEATURE_ID_NOT_SAVEABLE,	"FEATURE ID NOT SAVEABLE" },
121  	{ NVME_SC_FEATURE_NOT_CHANGEABLE,	"FEATURE NOT CHANGEABLE" },
122  	{ NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC,"FEATURE NOT NAMESPACE SPECIFIC" },
123  	{ NVME_SC_FIRMWARE_REQ_NVM_RESET,	"FIRMWARE REQUIRES NVM RESET" },
124  	{ NVME_SC_FIRMWARE_REQ_RESET,		"FIRMWARE REQUIRES RESET" },
125  	{ NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION,"FIRMWARE REQUIRES MAX TIME VIOLATION" },
126  	{ NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED,"FIRMWARE ACTIVATION PROHIBITED" },
127  	{ NVME_SC_OVERLAPPING_RANGE,		"OVERLAPPING RANGE" },
128  	{ NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY,"NAMESPACE INSUFFICIENT CAPACITY" },
129  	{ NVME_SC_NAMESPACE_ID_UNAVAILABLE,	"NAMESPACE ID UNAVAILABLE" },
130  	{ NVME_SC_NAMESPACE_ALREADY_ATTACHED,	"NAMESPACE ALREADY ATTACHED" },
131  	{ NVME_SC_NAMESPACE_IS_PRIVATE,		"NAMESPACE IS PRIVATE" },
132  	{ NVME_SC_NAMESPACE_NOT_ATTACHED,	"NAMESPACE NOT ATTACHED" },
133  	{ NVME_SC_THINPROVISIONING_NOT_SUPPORTED,"THINPROVISIONING NOT SUPPORTED" },
134  	{ NVME_SC_CONTROLLER_LIST_INVALID,	"CONTROLLER LIST INVALID" },
135  	{ NVME_SC_CONFLICTING_ATTRIBUTES,	"CONFLICTING ATTRIBUTES" },
136  	{ NVME_SC_INVALID_PROTECTION_INFO,	"INVALID PROTECTION INFO" },
137  	{ NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE,	"WRITE TO RO PAGE" },
138  	{ 0xFFFF,				"COMMAND SPECIFIC" }
139  };
140  
141  static const struct nvme_qpair_string media_error_status[] = {
142  	{ NVME_SC_WRITE_FAULTS, 		"WRITE FAULTS" },
143  	{ NVME_SC_UNRECOVERED_READ_ERROR, 	"UNRECOVERED READ ERROR" },
144  	{ NVME_SC_GUARD_CHECK_ERROR, 		"GUARD CHECK ERROR" },
145  	{ NVME_SC_APPLICATION_TAG_CHECK_ERROR, 	"APPLICATION TAG CHECK ERROR" },
146  	{ NVME_SC_REFERENCE_TAG_CHECK_ERROR, 	"REFERENCE TAG CHECK ERROR" },
147  	{ NVME_SC_COMPARE_FAILURE, 		"COMPARE FAILURE" },
148  	{ NVME_SC_ACCESS_DENIED, 		"ACCESS DENIED" },
149  	{ NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK, "DEALLOCATED OR UNWRITTEN BLOCK" },
150  	{ 0xFFFF, 				"MEDIA ERROR" }
151  };
152  
153  static inline bool nvme_qpair_is_admin_queue(struct nvme_qpair *qpair)
154  {
155  	return qpair->id == 0;
156  }
157  
158  static inline bool nvme_qpair_is_io_queue(struct nvme_qpair *qpair)
159  {
160  	return qpair->id != 0;
161  }
162  
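/*
 * Look up the human readable name for an opcode or status code in one of
 * the tables above; the 0xFFFF sentinel entry doubles as the fallback string.
 */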
163  static const char *nvme_qpair_get_string(const struct nvme_qpair_string *strings,
164  					uint16_t value)
165  {
166  	const struct nvme_qpair_string *entry;
167  
168  	entry = strings;
169  
170  	while (entry->value != 0xFFFF) {
171  		if (entry->value == value)
172  			return entry->str;
173  		entry++;
174  	}
175  	return entry->str;
176  }
177  
178  static void nvme_qpair_admin_qpair_print_command(struct nvme_qpair *qpair,
179  						 struct nvme_cmd *cmd)
180  {
181  	nvme_info("%s (%02x) sqid:%d cid:%d nsid:%x cdw10:%08x cdw11:%08x\n",
182  		  nvme_qpair_get_string(admin_opcode, cmd->opc), cmd->opc,
183  		  qpair->id, cmd->cid,
184  		  cmd->nsid, cmd->cdw10, cmd->cdw11);
185  }
186  
187  static void nvme_qpair_io_qpair_print_command(struct nvme_qpair *qpair,
188  					      struct nvme_cmd *cmd)
189  {
190  	nvme_assert(qpair != NULL, "print_command: qpair == NULL\n");
191  	nvme_assert(cmd != NULL, "print_command: cmd == NULL\n");
192  
193  	switch ((int)cmd->opc) {
194  	case NVME_OPC_WRITE:
195  	case NVME_OPC_READ:
196  	case NVME_OPC_WRITE_UNCORRECTABLE:
197  	case NVME_OPC_COMPARE:
198  		nvme_info("%s sqid:%d cid:%d nsid:%d lba:%llu len:%d\n",
199  			  nvme_qpair_get_string(io_opcode, cmd->opc),
200  			  qpair->id, cmd->cid, cmd->nsid,
201  			  ((unsigned long long)cmd->cdw11 << 32) + cmd->cdw10,
202  			  (cmd->cdw12 & 0xFFFF) + 1);
203  		break;
204  	case NVME_OPC_FLUSH:
205  	case NVME_OPC_DATASET_MANAGEMENT:
206  		nvme_info("%s sqid:%d cid:%d nsid:%d\n",
207  			  nvme_qpair_get_string(io_opcode, cmd->opc),
208  			  qpair->id, cmd->cid, cmd->nsid);
209  		break;
210  	default:
211  		nvme_info("%s (%02x) sqid:%d cid:%d nsid:%d\n",
212  			  nvme_qpair_get_string(io_opcode, cmd->opc),
213  			  cmd->opc, qpair->id, cmd->cid, cmd->nsid);
214  		break;
215  	}
216  }
217  
218  static void nvme_qpair_print_command(struct nvme_qpair *qpair,
219  				     struct nvme_cmd *cmd)
220  {
221  	nvme_assert(qpair != NULL, "qpair cannot be NULL");
222  	nvme_assert(cmd != NULL, "cmd cannot be NULL");
223  
224  	if (nvme_qpair_is_admin_queue(qpair))
225  		return nvme_qpair_admin_qpair_print_command(qpair, cmd);
226  
227  	return nvme_qpair_io_qpair_print_command(qpair, cmd);
228  }
229  
230  static const char *get_status_string(uint16_t sct, uint16_t sc)
231  {
232  	const struct nvme_qpair_string *entry;
233  
234  	switch (sct) {
235  	case NVME_SCT_GENERIC:
236  		entry = generic_status;
237  		break;
238  	case NVME_SCT_COMMAND_SPECIFIC:
239  		entry = command_specific_status;
240  		break;
241  	case NVME_SCT_MEDIA_ERROR:
242  		entry = media_error_status;
243  		break;
244  	case NVME_SCT_VENDOR_SPECIFIC:
245  		return "VENDOR SPECIFIC";
246  	default:
247  		return "RESERVED";
248  	}
249  
250  	return nvme_qpair_get_string(entry, sc);
251  }
252  
253  static void nvme_qpair_print_completion(struct nvme_qpair *qpair,
254  					struct nvme_cpl *cpl)
255  {
256  	nvme_info("Cpl: %s (%02x/%02x) sqid:%d cid:%d "
257  		  "cdw0:%x sqhd:%04x p:%x m:%x dnr:%x\n",
258  		  get_status_string(cpl->status.sct, cpl->status.sc),
259  		  cpl->status.sct,
260  		  cpl->status.sc,
261  		  cpl->sqid,
262  		  cpl->cid,
263  		  cpl->cdw0,
264  		  cpl->sqhd,
265  		  cpl->status.p,
266  		  cpl->status.m,
267  		  cpl->status.dnr);
268  }
269  
270  static bool nvme_qpair_completion_retry(const struct nvme_cpl *cpl)
271  {
272  	/*
273  	 * TODO: spec is not clear how commands that are aborted due
274  	 *  to TLER will be marked.  So for now, it seems
275  	 *  NAMESPACE_NOT_READY is the only case where we should
276  	 *  look at the DNR bit.
277  	 */
278  	switch ((int)cpl->status.sct) {
279  	case NVME_SCT_GENERIC:
280  		switch ((int)cpl->status.sc) {
281  		case NVME_SC_NAMESPACE_NOT_READY:
282  		case NVME_SC_FORMAT_IN_PROGRESS:
283  			if (cpl->status.dnr)
284  				return false;
285  			return true;
286  		case NVME_SC_INVALID_OPCODE:
287  		case NVME_SC_INVALID_FIELD:
288  		case NVME_SC_COMMAND_ID_CONFLICT:
289  		case NVME_SC_DATA_TRANSFER_ERROR:
290  		case NVME_SC_ABORTED_POWER_LOSS:
291  		case NVME_SC_INTERNAL_DEVICE_ERROR:
292  		case NVME_SC_ABORTED_BY_REQUEST:
293  		case NVME_SC_ABORTED_SQ_DELETION:
294  		case NVME_SC_ABORTED_FAILED_FUSED:
295  		case NVME_SC_ABORTED_MISSING_FUSED:
296  		case NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
297  		case NVME_SC_COMMAND_SEQUENCE_ERROR:
298  		case NVME_SC_LBA_OUT_OF_RANGE:
299  		case NVME_SC_CAPACITY_EXCEEDED:
300  		default:
301  			return false;
302  		}
303  	case NVME_SCT_COMMAND_SPECIFIC:
304  	case NVME_SCT_MEDIA_ERROR:
305  	case NVME_SCT_VENDOR_SPECIFIC:
306  	default:
307  		return false;
308  	}
309  }
310  
311  static void nvme_qpair_construct_tracker(struct nvme_tracker *tr,
312  					 uint16_t cid, uint64_t phys_addr)
313  {
314  	tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
315  	tr->cid = cid;
316  	tr->active = false;
317  }
318  
319  static inline void nvme_qpair_copy_command(struct nvme_cmd *dst,
320  					   const struct nvme_cmd *src)
321  {
322  	/* dst and src are known to be non-overlapping and 64-byte aligned. */
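	/*
	 * A submission queue entry is exactly 64 bytes, so the copy is done
	 * with two 256-bit stores (AVX), four 128-bit stores (SSE2), or a
	 * plain struct assignment as the portable fallback.
	 */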
323  #if defined(__AVX__)
324  	__m256i *d256 = (__m256i *)dst;
325  	const __m256i *s256 = (const __m256i *)src;
326  
327  	_mm256_store_si256(&d256[0], _mm256_load_si256(&s256[0]));
328  	_mm256_store_si256(&d256[1], _mm256_load_si256(&s256[1]));
329  #elif defined(__SSE2__)
330  	__m128i *d128 = (__m128i *)dst;
331  	const __m128i *s128 = (const __m128i *)src;
332  
333  	_mm_store_si128(&d128[0], _mm_load_si128(&s128[0]));
334  	_mm_store_si128(&d128[1], _mm_load_si128(&s128[1]));
335  	_mm_store_si128(&d128[2], _mm_load_si128(&s128[2]));
336  	_mm_store_si128(&d128[3], _mm_load_si128(&s128[3]));
337  #else
338  	*dst = *src;
339  #endif
340  }
341  
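/*
 * Copy the tracker's command into the submission queue slot at sq_tail,
 * advance the tail (wrapping at the queue size), and ring the submission
 * queue tail doorbell. The write barrier orders the queue entry update
 * before the doorbell write so the controller never fetches a stale entry.
 */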
342  static void nvme_qpair_submit_tracker(struct nvme_qpair *qpair,
343  				      struct nvme_tracker *tr)
344  {
345  	struct nvme_request *req = tr->req;
346  
347  	/*
348  	 * Set the tracker active and copy its command
349  	 * to the submission queue.
350  	 */
351  	nvme_debug("qpair %d: Submit command, tail %d, cid %d / %d\n",
352  		   qpair->id,
353  		   (int)qpair->sq_tail,
354  		   (int)tr->cid,
355  		   (int)tr->req->cmd.cid);
356  
357  	qpair->tr[tr->cid].active = true;
358  	nvme_qpair_copy_command(&qpair->cmd[qpair->sq_tail], &req->cmd);
359  
360  	if (++qpair->sq_tail == qpair->entries)
361  		qpair->sq_tail = 0;
362  
363  	nvme_wmb();
364  	nvme_mmio_write_4(qpair->sq_tdbl, qpair->sq_tail);
365  }
366  
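/*
 * Process one completion for a tracker: mark it inactive, optionally log the
 * failed command and its completion, retry the command when the status allows
 * it, otherwise run the request callback, free the request and return the
 * tracker to the free list.
 */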
367  static void nvme_qpair_complete_tracker(struct nvme_qpair *qpair,
368  					struct nvme_tracker *tr,
369  					struct nvme_cpl *cpl,
370  					bool print_on_error)
371  {
372  	struct nvme_request *req = tr->req;
373  	bool retry, error;
374  
375  	if (!req) {
376  		nvme_crit("tracker has no request\n");
377  		qpair->tr[cpl->cid].active = false;
378  		goto done;
379  	}
380  
381  	error = nvme_cpl_is_error(cpl);
382  	retry = error && nvme_qpair_completion_retry(cpl) &&
383  		(req->retries < NVME_MAX_RETRY_COUNT);
384  	if (error && print_on_error) {
385  		nvme_qpair_print_command(qpair, &req->cmd);
386  		nvme_qpair_print_completion(qpair, cpl);
387  	}
388  
389  	qpair->tr[cpl->cid].active = false;
390  
391  	if (cpl->cid != req->cmd.cid)
392  		nvme_crit("cpl and command CID mismatch (%d / %d)\n",
393  			  (int)cpl->cid, (int)req->cmd.cid);
394  
395  	if (retry) {
396  		req->retries++;
397  		nvme_qpair_submit_tracker(qpair, tr);
398  		return;
399  	}
400  
401  	if (req->cb_fn)
402  		req->cb_fn(req->cb_arg, cpl);
403  
404  	nvme_request_free_locked(req);
405  
406  done:
407  	tr->req = NULL;
408  
409  	LIST_REMOVE(tr, list);
410  	LIST_INSERT_HEAD(&qpair->free_tr, tr, list);
411  }
412  
413  static void nvme_qpair_submit_queued_requests(struct nvme_qpair *qpair)
414  {
415  	STAILQ_HEAD(, nvme_request) req_queue;
416  	STAILQ_INIT(&req_queue);
417  
418  	pthread_mutex_lock(&qpair->lock);
419  
420  	STAILQ_CONCAT(&req_queue, &qpair->queued_req);
421  
422  	/*
423  	 * If the controller is in the middle of a reset, don't
424  	 * try to submit queued requests - let the reset logic
425  	 * handle that instead.
426  	 */
427  	while (!qpair->ctrlr->resetting && LIST_FIRST(&qpair->free_tr)
428  			&& !STAILQ_EMPTY(&req_queue)) {
429  		struct nvme_request *req = STAILQ_FIRST(&req_queue);
430  		STAILQ_REMOVE_HEAD(&req_queue, stailq);
431  
432  		pthread_mutex_unlock(&qpair->lock);
433  		nvme_qpair_submit_request(qpair, req);
434  		pthread_mutex_lock(&qpair->lock);
435  	}
436  
437  	STAILQ_CONCAT(&qpair->queued_req, &req_queue);
438  
439  	pthread_mutex_unlock(&qpair->lock);
440  }
441  
442  static void nvme_qpair_manual_complete_tracker(struct nvme_qpair *qpair,
443  					       struct nvme_tracker *tr,
444  					       uint32_t sct,
445  					       uint32_t sc,
446  					       uint32_t dnr,
447  					       bool print_on_error)
448  {
449  	struct nvme_cpl	cpl;
450  
451  	memset(&cpl, 0, sizeof(cpl));
452  	cpl.sqid = qpair->id;
453  	cpl.cid = tr->cid;
454  	cpl.status.sct = sct;
455  	cpl.status.sc = sc;
456  	cpl.status.dnr = dnr;
457  
458  	nvme_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
459  }
460  
461  static void nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
462  					       struct nvme_request *req,
463  					       uint32_t sct, uint32_t sc,
464  					       bool print_on_error)
465  {
466  	struct nvme_cpl	cpl;
467  	bool error;
468  
469  	memset(&cpl, 0, sizeof(cpl));
470  	cpl.sqid = qpair->id;
471  	cpl.status.sct = sct;
472  	cpl.status.sc = sc;
473  
474  	error = nvme_cpl_is_error(&cpl);
475  
476  	if (error && print_on_error) {
477  		nvme_qpair_print_command(qpair, &req->cmd);
478  		nvme_qpair_print_completion(qpair, &cpl);
479  	}
480  
481  	if (req->cb_fn)
482  		req->cb_fn(req->cb_arg, &cpl);
483  
484  	nvme_request_free_locked(req);
485  }
486  
487  static void nvme_qpair_abort_aers(struct nvme_qpair *qpair)
488  {
489  	struct nvme_tracker *tr;
490  
491  	tr = LIST_FIRST(&qpair->outstanding_tr);
492  	while (tr != NULL) {
493  		nvme_assert(tr->req != NULL,
494  			    "tr->req == NULL in abort_aers\n");
495  		if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) {
496  			nvme_qpair_manual_complete_tracker(qpair, tr,
497  					      NVME_SCT_GENERIC,
498  					      NVME_SC_ABORTED_SQ_DELETION,
499  					      0, false);
500  			tr = LIST_FIRST(&qpair->outstanding_tr);
501  			continue;
502  		}
503  		tr = LIST_NEXT(tr, list);
504  	}
505  }
506  
507  static inline void _nvme_qpair_admin_qpair_destroy(struct nvme_qpair *qpair)
508  {
509  	nvme_qpair_abort_aers(qpair);
510  }
511  
512  static inline void _nvme_qpair_req_bad_phys(struct nvme_qpair *qpair,
513  					    struct nvme_tracker *tr)
514  {
515  	/*
516  	 * Bad vtophys translation, so abort this request
517  	 * and return immediately, without retry.
518  	 */
519  	nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
520  					   NVME_SC_INVALID_FIELD,
521  					   1, true);
522  }
523  
524  /*
525   * Build PRP list describing physically contiguous payload buffer.
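 *
 * Worked example (a sketch assuming 4 KiB pages): a 12 KiB buffer starting
 * at offset 0x800 within a page touches four pages, so PRP1 gets the
 * unaligned start address and PRP2 points at the in-tracker PRP list holding
 * the remaining three page addresses. A buffer touching exactly two pages
 * instead gets the second page address directly in PRP2.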
526   */
527  static int _nvme_qpair_build_contig_request(struct nvme_qpair *qpair,
528  					    struct nvme_request *req,
529  					    struct nvme_tracker *tr)
530  {
531  	uint64_t phys_addr;
532  	void *seg_addr;
533  	uint32_t nseg, cur_nseg, modulo, unaligned;
534  	void *md_payload;
535  	void *payload = req->payload.u.contig + req->payload_offset;
536  
537  	phys_addr = nvme_mem_vtophys(payload);
538  	if (phys_addr == NVME_VTOPHYS_ERROR) {
539  		_nvme_qpair_req_bad_phys(qpair, tr);
540  		return -1;
541  	}
542  	nseg = req->payload_size >> PAGE_SHIFT;
543  	modulo = req->payload_size & (PAGE_SIZE - 1);
544  	unaligned = phys_addr & (PAGE_SIZE - 1);
545  	if (modulo || unaligned)
546  		nseg += 1 + ((modulo + unaligned - 1) >> PAGE_SHIFT);
547  
548  	if (req->payload.md) {
549  		md_payload = req->payload.md + req->md_offset;
550  		tr->req->cmd.mptr = nvme_mem_vtophys(md_payload);
551  		if (tr->req->cmd.mptr == NVME_VTOPHYS_ERROR) {
552  			_nvme_qpair_req_bad_phys(qpair, tr);
553  			return -1;
554  		}
555  	}
556  
557  	tr->req->cmd.psdt = NVME_PSDT_PRP;
558  	tr->req->cmd.dptr.prp.prp1 = phys_addr;
559  	if (nseg == 2) {
560  		seg_addr = payload + PAGE_SIZE - unaligned;
561  		tr->req->cmd.dptr.prp.prp2 = nvme_mem_vtophys(seg_addr);
562  	} else if (nseg > 2) {
563  		cur_nseg = 1;
564  		tr->req->cmd.dptr.prp.prp2 = (uint64_t)tr->prp_sgl_bus_addr;
565  		while (cur_nseg < nseg) {
566  			seg_addr = payload + cur_nseg * PAGE_SIZE - unaligned;
567  			phys_addr = nvme_mem_vtophys(seg_addr);
568  			if (phys_addr == NVME_VTOPHYS_ERROR) {
569  				_nvme_qpair_req_bad_phys(qpair, tr);
570  				return -1;
571  			}
572  			tr->u.prp[cur_nseg - 1] = phys_addr;
573  			cur_nseg++;
574  		}
575  	}
576  
577  	return 0;
578  }
579  
580  /*
581   * Build SGL list describing scattered payload buffer.
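 *
 * Each scatter element returned by next_sge_fn() becomes one unkeyed data
 * block descriptor in the tracker. When the payload fits in a single
 * descriptor it is placed directly in SGL1; otherwise SGL1 is written as a
 * last segment descriptor pointing at the tracker's descriptor array.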
582   */
583  static int _nvme_qpair_build_hw_sgl_request(struct nvme_qpair *qpair,
584  					    struct nvme_request *req,
585  					    struct nvme_tracker *tr)
586  {
587  	struct nvme_sgl_descriptor *sgl;
588  	uint64_t phys_addr;
589  	uint32_t remaining_transfer_len, length, nseg = 0;
590  	int ret;
591  
592  	/*
593  	 * Build scattered payloads.
594  	 */
595  	nvme_assert(req->payload_size != 0,
596  		    "cannot build SGL for zero-length transfer\n");
597  	nvme_assert(req->payload.type == NVME_PAYLOAD_TYPE_SGL,
598  		    "sgl payload type required\n");
599  	nvme_assert(req->payload.u.sgl.reset_sgl_fn != NULL,
600  		    "sgl reset callback required\n");
601  	nvme_assert(req->payload.u.sgl.next_sge_fn != NULL,
602  		    "sgl callback required\n");
603  	req->payload.u.sgl.reset_sgl_fn(req->payload.u.sgl.cb_arg,
604  					req->payload_offset);
605  
606  	sgl = tr->u.sgl;
607  	req->cmd.psdt = NVME_PSDT_SGL_MPTR_SGL;
608  	req->cmd.dptr.sgl1.unkeyed.subtype = 0;
609  
610  	remaining_transfer_len = req->payload_size;
611  
612  	while (remaining_transfer_len > 0) {
613  
614  		if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
615  			_nvme_qpair_req_bad_phys(qpair, tr);
616  			return -1;
617  		}
618  
619  		ret = req->payload.u.sgl.next_sge_fn(req->payload.u.sgl.cb_arg,
620  						     &phys_addr, &length);
621  		if (ret != 0) {
622  			_nvme_qpair_req_bad_phys(qpair, tr);
623  			return ret;
624  		}
625  
626  		length = nvme_min(remaining_transfer_len, length);
627  		remaining_transfer_len -= length;
628  
629  		sgl->unkeyed.type = NVME_SGL_TYPE_DATA_BLOCK;
630  		sgl->unkeyed.length = length;
631  		sgl->address = phys_addr;
632  		sgl->unkeyed.subtype = 0;
633  
634  		sgl++;
635  		nseg++;
636  
637  	}
638  
639  	if (nseg == 1) {
640  		/*
641  		 * The whole transfer can be described by a single Scatter
642  		 * Gather List descriptor. Use the special case described
643  		 * by the spec where SGL1's type is Data Block.
644  		 * This means the SGL in the tracker is not used at all,
645  		 * so copy the first (and only) SGL element into SGL1.
646  		 */
647  		req->cmd.dptr.sgl1.unkeyed.type = NVME_SGL_TYPE_DATA_BLOCK;
648  		req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
649  		req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
650  	} else {
651  		/* For now we only support a single SGL segment per command */
652  		req->cmd.dptr.sgl1.unkeyed.type = NVME_SGL_TYPE_LAST_SEGMENT;
653  		req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
654  		req->cmd.dptr.sgl1.unkeyed.length =
655  			nseg * sizeof(struct nvme_sgl_descriptor);
656  	}
657  
658  	return 0;
659  }
660  
661  /*
662   * Build Physical Region Page list describing scattered payload buffer.
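 *
 * Since PRP entries can only describe whole pages, every scatter element
 * except the first must start on a page boundary and every element except
 * the last must end on one; the asserts below enforce this.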
663   */
664  static int _nvme_qpair_build_prps_sgl_request(struct nvme_qpair *qpair,
665  					      struct nvme_request *req,
666  					      struct nvme_tracker *tr)
667  {
668  	uint64_t phys_addr, prp2 = 0;
669  	uint32_t data_transferred, remaining_transfer_len, length;
670  	uint32_t nseg, cur_nseg, total_nseg = 0, last_nseg = 0;
671  	uint32_t modulo, unaligned, sge_count = 0;
672  	int ret;
673  
674  	/*
675  	 * Build scattered payloads.
676  	 */
677  	nvme_assert(req->payload.type == NVME_PAYLOAD_TYPE_SGL,
678  		    "sgl payload type required\n");
679  	nvme_assert(req->payload.u.sgl.reset_sgl_fn != NULL,
680  		    "sgl reset callback required\n");
681  	req->payload.u.sgl.reset_sgl_fn(req->payload.u.sgl.cb_arg,
682  					req->payload_offset);
683  
684  	remaining_transfer_len = req->payload_size;
685  
686  	while (remaining_transfer_len > 0) {
687  
688  		nvme_assert(req->payload.u.sgl.next_sge_fn != NULL,
689  			    "sgl callback required\n");
690  
691  		ret = req->payload.u.sgl.next_sge_fn(req->payload.u.sgl.cb_arg,
692  						    &phys_addr, &length);
693  		if (ret != 0) {
694  			_nvme_qpair_req_bad_phys(qpair, tr);
695  			return -1;
696  		}
697  
698  		nvme_assert((phys_addr & 0x3) == 0, "address must be dword aligned\n");
699  		nvme_assert((length >= remaining_transfer_len) || ((phys_addr + length) % PAGE_SIZE) == 0,
700  			"All SGEs except last must end on a page boundary\n");
701  		nvme_assert((sge_count == 0) || (phys_addr % PAGE_SIZE) == 0,
702  			"All SGEs except first must start on a page boundary\n");
703  
704  		data_transferred = nvme_min(remaining_transfer_len, length);
705  
706  		nseg = data_transferred >> PAGE_SHIFT;
707  		modulo = data_transferred & (PAGE_SIZE - 1);
708  		unaligned = phys_addr & (PAGE_SIZE - 1);
709  		if (modulo || unaligned)
710  			nseg += 1 + ((modulo + unaligned - 1) >> PAGE_SHIFT);
711  
712  		if (total_nseg == 0) {
713  			req->cmd.psdt = NVME_PSDT_PRP;
714  			req->cmd.dptr.prp.prp1 = phys_addr;
715  		}
716  
717  		total_nseg += nseg;
718  		sge_count++;
719  		remaining_transfer_len -= data_transferred;
720  
721  		if (total_nseg == 2) {
722  			if (sge_count == 1)
723  				tr->req->cmd.dptr.prp.prp2 = phys_addr +
724  					PAGE_SIZE - unaligned;
725  			else if (sge_count == 2)
726  				tr->req->cmd.dptr.prp.prp2 = phys_addr;
727  			/* save prp2 value */
728  			prp2 = tr->req->cmd.dptr.prp.prp2;
729  		} else if (total_nseg > 2) {
730  			if (sge_count == 1)
731  				cur_nseg = 1;
732  			else
733  				cur_nseg = 0;
734  
735  			tr->req->cmd.dptr.prp.prp2 =
736  				(uint64_t)tr->prp_sgl_bus_addr;
737  
738  			while (cur_nseg < nseg) {
739  				if (prp2) {
740  					tr->u.prp[0] = prp2;
741  					tr->u.prp[last_nseg + 1] = phys_addr +
742  						cur_nseg * PAGE_SIZE - unaligned;
743  				} else {
744  					tr->u.prp[last_nseg] = phys_addr +
745  						cur_nseg * PAGE_SIZE - unaligned;
746  				}
747  				last_nseg++;
748  				cur_nseg++;
749  			}
750  		}
751  	}
752  
753  	return 0;
754  }
755  
756  static void _nvme_qpair_admin_qpair_enable(struct nvme_qpair *qpair)
757  {
758  	struct nvme_tracker *tr, *tr_temp;
759  
760  	/*
761  	 * Manually abort each outstanding admin command.  Do not retry
762  	 * admin commands found here, since they will be left over from
763   * a controller reset and it's likely the context in which the
764  	 * command was issued no longer applies.
765  	 */
766  	LIST_FOREACH_SAFE(tr, &qpair->outstanding_tr, list, tr_temp) {
767  		nvme_info("Aborting outstanding admin command\n");
768  		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
769  						   NVME_SC_ABORTED_BY_REQUEST,
770  						   1 /* do not retry */, true);
771  	}
772  
773  	qpair->enabled = true;
774  }
775  
776  static void _nvme_qpair_io_qpair_enable(struct nvme_qpair *qpair)
777  {
778  	struct nvme_tracker *tr, *temp;
779  	struct nvme_request *req;
780  
781  	qpair->enabled = true;
782  
783  	qpair->ctrlr->enabled_io_qpairs++;
784  
785  	/* Manually abort each queued I/O. */
786  	while (!STAILQ_EMPTY(&qpair->queued_req)) {
787  		req = STAILQ_FIRST(&qpair->queued_req);
788  		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
789  		nvme_info("Aborting queued I/O command\n");
790  		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
791  						   NVME_SC_ABORTED_BY_REQUEST,
792  						   true);
793  	}
794  
795  	/* Manually abort each outstanding I/O. */
796  	LIST_FOREACH_SAFE(tr, &qpair->outstanding_tr, list, temp) {
797  		nvme_info("Aborting outstanding I/O command\n");
798  		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
799  						   NVME_SC_ABORTED_BY_REQUEST,
800  						   0, true);
801  	}
802  }
803  
804  static inline void _nvme_qpair_admin_qpair_disable(struct nvme_qpair *qpair)
805  {
806  	qpair->enabled = false;
807  	nvme_qpair_abort_aers(qpair);
808  }
809  
810  static inline void _nvme_qpair_io_qpair_disable(struct nvme_qpair *qpair)
811  {
812  	qpair->enabled = false;
813  
814  	qpair->ctrlr->enabled_io_qpairs--;
815  }
816  
817  /*
818   * Reserve room for the submission queue
819   * in the controller memory buffer
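 *
 * The current CMB offset is rounded up to the requested (power of two)
 * alignment with (offset + aligned - 1) & ~(aligned - 1) before checking
 * that the queue still fits within the CMB.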
820   */
821  static int nvme_ctrlr_reserve_sq_in_cmb(struct nvme_ctrlr *ctrlr,
822  					uint16_t entries,
823  					uint64_t aligned, uint64_t *offset)
824  {
825  	uint64_t round_offset;
826  	const uint64_t length = entries * sizeof(struct nvme_cmd);
827  
828  	round_offset = ctrlr->cmb_current_offset;
829  	round_offset = (round_offset + (aligned - 1)) & ~(aligned - 1);
830  
831  	if (round_offset + length > ctrlr->cmb_size)
832  		return -1;
833  
834  	*offset = round_offset;
835  	ctrlr->cmb_current_offset = round_offset + length;
836  
837  	return 0;
838  }
839  
840  /*
841   * Initialize a queue pair on the host side.
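 *
 * This allocates the submission queue (or reserves it in the controller
 * memory buffer), allocates the completion queue, derives the doorbell
 * register addresses from the controller's doorbell stride, and builds the
 * request pool and tracker free list.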
842   */
843  int nvme_qpair_construct(struct nvme_ctrlr *ctrlr, struct nvme_qpair *qpair,
844  			 enum nvme_qprio qprio,
845  			 uint16_t entries, uint16_t trackers)
846  {
847  	volatile uint32_t *doorbell_base;
848  	struct nvme_tracker *tr;
849  	uint64_t offset;
850  	unsigned long phys_addr = 0;
851  	uint16_t i;
852  	int ret;
853  
854  	nvme_assert(entries != 0, "Invalid number of entries\n");
855  	nvme_assert(trackers != 0, "Invalid trackers\n");
856  
857  	pthread_mutex_init(&qpair->lock, NULL);
858  
859  	qpair->entries = entries;
860  	qpair->trackers = trackers;
861  	qpair->qprio = qprio;
862  	qpair->sq_in_cmb = false;
863  	qpair->ctrlr = ctrlr;
864  
865  	if (ctrlr->opts.use_cmb_sqs) {
866  		/*
867  		 * Reserve room for the submission queue in ctrlr
868  		 * memory buffer.
869  		 */
870  		ret = nvme_ctrlr_reserve_sq_in_cmb(ctrlr, entries,
871  						   PAGE_SIZE,
872  						   &offset);
873  		if (ret == 0) {
874  
875  			qpair->cmd = ctrlr->cmb_bar_virt_addr + offset;
876  			qpair->cmd_bus_addr = ctrlr->cmb_bar_phys_addr + offset;
877  			qpair->sq_in_cmb = true;
878  
879  			nvme_debug("Allocated qpair %d cmd in cmb at %p / 0x%llx\n",
880  				   qpair->id,
881  				   qpair->cmd, qpair->cmd_bus_addr);
882  
883  		}
884  	}
885  
886  	if (qpair->sq_in_cmb == false) {
887  
888  		qpair->cmd =
889  			nvme_mem_alloc_node(sizeof(struct nvme_cmd) * entries,
890  				    PAGE_SIZE, NVME_NODE_ID_ANY,
891  				    (unsigned long *) &qpair->cmd_bus_addr);
892  		if (!qpair->cmd) {
893  			nvme_err("Allocate qpair commands failed\n");
894  			goto fail;
895  		}
896  		memset(qpair->cmd, 0, sizeof(struct nvme_cmd) * entries);
897  
898  		nvme_debug("Allocated qpair %d cmd %p / 0x%llx\n",
899  			   qpair->id,
900  			   qpair->cmd, qpair->cmd_bus_addr);
901  	}
902  
903  	qpair->cpl = nvme_mem_alloc_node(sizeof(struct nvme_cpl) * entries,
904  				 PAGE_SIZE, NVME_NODE_ID_ANY,
905  				 (unsigned long *) &qpair->cpl_bus_addr);
906  	if (!qpair->cpl) {
907  		nvme_err("Allocate qpair completions failed\n");
908  		goto fail;
909  	}
910  	memset(qpair->cpl, 0, sizeof(struct nvme_cpl) * entries);
911  
912  	nvme_debug("Allocated qpair %d cpl at %p / 0x%llx\n",
913  		   qpair->id,
914  		   qpair->cpl,
915  		   qpair->cpl_bus_addr);
916  
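	/*
	 * Per the NVMe specification, queue y's submission tail doorbell lives
	 * at register offset 0x1000 + (2y * (4 << CAP.DSTRD)) and its completion
	 * head doorbell at 0x1000 + ((2y + 1) * (4 << CAP.DSTRD));
	 * doorbell_stride_u32 is assumed to be that stride in 32-bit words.
	 */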
917  	doorbell_base = &ctrlr->regs->doorbell[0].sq_tdbl;
918  	qpair->sq_tdbl = doorbell_base +
919  		(2 * qpair->id + 0) * ctrlr->doorbell_stride_u32;
920  	qpair->cq_hdbl = doorbell_base +
921  		(2 * qpair->id + 1) * ctrlr->doorbell_stride_u32;
922  
923  	LIST_INIT(&qpair->free_tr);
924  	LIST_INIT(&qpair->outstanding_tr);
925  	STAILQ_INIT(&qpair->free_req);
926  	STAILQ_INIT(&qpair->queued_req);
927  
928  	/* Request pool */
929  	if (nvme_request_pool_construct(qpair)) {
930  		nvme_err("Create request pool failed\n");
931  		goto fail;
932  	}
933  
934  	/*
935  	 * Reserve space for all of the trackers in a single allocation.
936  	 * struct nvme_tracker must be padded so that its size is already
937  	 * a power of 2. This ensures the PRP list embedded in the nvme_tracker
938  	 * object will not span a 4KB boundary, while allowing access to
939  	 * trackers in tr[] via normal array indexing.
940  	 */
941  	qpair->tr = nvme_mem_alloc_node(sizeof(struct nvme_tracker) * trackers,
942  					sizeof(struct nvme_tracker),
943  					NVME_NODE_ID_ANY, &phys_addr);
944  	if (!qpair->tr) {
945  		nvme_err("Allocate request trackers failed\n");
946  		goto fail;
947  	}
948  	memset(qpair->tr, 0, sizeof(struct nvme_tracker) * trackers);
949  
950  	nvme_debug("Allocated qpair %d trackers at %p / 0x%lx\n",
951  		   qpair->id, qpair->tr, phys_addr);
952  
953  	for (i = 0; i < trackers; i++) {
954  		tr = &qpair->tr[i];
955  		nvme_qpair_construct_tracker(tr, i, phys_addr);
956  		LIST_INSERT_HEAD(&qpair->free_tr, tr, list);
957  		phys_addr += sizeof(struct nvme_tracker);
958  	}
959  
960  	nvme_qpair_reset(qpair);
961  
962  	return 0;
963  
964  fail:
965  	nvme_qpair_destroy(qpair);
966  
967  	return -1;
968  }
969  
970  void nvme_qpair_destroy(struct nvme_qpair *qpair)
971  {
972  	if (!qpair->ctrlr)
973  		return; // Not initialized.
974  
975  	if (nvme_qpair_is_admin_queue(qpair))
976  		_nvme_qpair_admin_qpair_destroy(qpair);
977  
978  	if (qpair->cmd && !qpair->sq_in_cmb) {
979  		nvme_free(qpair->cmd);
980  		qpair->cmd = NULL;
981  	}
982  	if (qpair->cpl) {
983  		nvme_free(qpair->cpl);
984  		qpair->cpl = NULL;
985  	}
986  	if (qpair->tr) {
987  		nvme_free(qpair->tr);
988  		qpair->tr = NULL;
989  	}
990  	nvme_request_pool_destroy(qpair);
991  
992  	qpair->ctrlr = NULL;
993  
994  	pthread_mutex_destroy(&qpair->lock);
995  }
996  
997  static bool nvme_qpair_enabled(struct nvme_qpair *qpair)
998  {
999  	if (!qpair->enabled && !qpair->ctrlr->resetting)
1000  		nvme_qpair_enable(qpair);
1001  
1002  	return qpair->enabled;
1003  }
1004  
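/*
 * Submit a request on the given queue pair. A parent (split) request is
 * submitted child by child; otherwise a free tracker is claimed, the PRP or
 * SGL data pointer is built for the payload, and the command is copied into
 * the submission queue. When no tracker is free or the qpair is disabled,
 * the request is queued and resubmitted later.
 */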
1005  int nvme_qpair_submit_request(struct nvme_qpair *qpair,
1006  			      struct nvme_request *req)
1007  {
1008  	struct nvme_tracker *tr;
1009  	struct nvme_request *child_req, *tmp;
1010  	struct nvme_ctrlr *ctrlr = qpair->ctrlr;
1011  	bool child_req_failed = false;
1012  	int ret = 0;
1013  
1014  	if (ctrlr->failed) {
1015  		nvme_request_free(req);
1016  		return ENXIO;
1017  	}
1018  
1019  	nvme_qpair_enabled(qpair);
1020  
1021  	if (req->child_reqs) {
1022  
1023  		/*
1024  		 * This is a split (parent) request. Submit all of the
1025  		 * children but not the parent request itself, since the
1026  		 * parent is the original unsplit request.
1027  		 */
1028  		TAILQ_FOREACH_SAFE(child_req, &req->children, child_tailq, tmp) {
1029  			if (!child_req_failed) {
1030  				ret = nvme_qpair_submit_request(qpair, child_req);
1031  				if (ret != 0)
1032  					child_req_failed = true;
1033  			} else {
1034  				/* free remaining child_reqs since
1035  				 * one child_req fails */
1036  				nvme_request_remove_child(req, child_req);
1037  				nvme_request_free(child_req);
1038  			}
1039  		}
1040  
1041  		return ret;
1042  	}
1043  
1044  	pthread_mutex_lock(&qpair->lock);
1045  
1046  	tr = LIST_FIRST(&qpair->free_tr);
1047  	if (tr == NULL || !qpair->enabled || !STAILQ_EMPTY(&qpair->queued_req)) {
1048  		/*
1049  		 * No tracker is available, the qpair is disabled due
1050  		 * to an in-progress controller-level reset, or
1051  		 * there are already queued requests.
1052  		 *
1053  		 * Put the request on the qpair's request queue to be
1054  		 * processed when a tracker frees up via a command
1055  		 * completion or when the controller reset is completed.
1056  		 */
1057  		STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
1058  		pthread_mutex_unlock(&qpair->lock);
1059  
1060  		if (tr)
1061  			nvme_qpair_submit_queued_requests(qpair);
1062  		return 0;
1063  	}
1064  
1065  	/* remove tr from free_tr */
1066  	LIST_REMOVE(tr, list);
1067  	LIST_INSERT_HEAD(&qpair->outstanding_tr, tr, list);
1068  	tr->req = req;
1069  	req->cmd.cid = tr->cid;
1070  
1071  	if (req->payload_size == 0) {
1072  		/* Null payload - leave PRP fields zeroed */
1073  		ret = 0;
1074  	} else if (req->payload.type == NVME_PAYLOAD_TYPE_CONTIG) {
1075  		ret = _nvme_qpair_build_contig_request(qpair, req, tr);
1076  	} else if (req->payload.type == NVME_PAYLOAD_TYPE_SGL) {
1077  		if (ctrlr->flags & NVME_CTRLR_SGL_SUPPORTED)
1078  			ret = _nvme_qpair_build_hw_sgl_request(qpair, req, tr);
1079  		else
1080  			ret = _nvme_qpair_build_prps_sgl_request(qpair, req, tr);
1081  	} else {
1082  		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
1083  						   NVME_SC_INVALID_FIELD,
1084  						   1 /* do not retry */, true);
1085  		ret = -EINVAL;
1086  	}
1087  
1088  	if (ret == 0)
1089  		nvme_qpair_submit_tracker(qpair, tr);
1090  
1091  	pthread_mutex_unlock(&qpair->lock);
1092  
1093  	return ret;
1094  }
1095  
1096  /*
1097   * Poll for completion of NVMe commands submitted to the
1098   * specified I/O queue pair.
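 *
 * Completion entries are consumed while their phase bit matches the phase
 * expected by the host; the completion queue head doorbell is written once
 * for the whole batch. Returns the number of completions processed.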
1099   */
1100  unsigned int nvme_qpair_poll(struct nvme_qpair *qpair,
1101  			     unsigned int max_completions)
1102  {
1103  	struct nvme_tracker *tr;
1104  	struct nvme_cpl	*cpl;
1105  	uint32_t num_completions = 0;
1106  
1107  	if (!nvme_qpair_enabled(qpair))
1108  		/*
1109  		 * qpair is not enabled, likely because a controller reset
1110  		 * is in progress.  Ignore the interrupt - any I/O that was
1111  		 * associated with this interrupt will get retried when the
1112  		 * reset is complete.
1113  		 */
1114  		return 0;
1115  
1116  	if ((max_completions == 0) ||
1117  	    (max_completions > (qpair->entries - 1U)))
1118  		/*
1119  		 * max_completions == 0 means unlimited, but complete at most
1120  		 * one queue depth batch of I/O at a time so that the completion
1121  		 * queue doorbells don't wrap around.
1122  		 */
1123  		max_completions = qpair->entries - 1;
1124  
1125  	pthread_mutex_lock(&qpair->lock);
1126  
1127  	while (1) {
1128  
1129  		cpl = &qpair->cpl[qpair->cq_head];
1130  		if (cpl->status.p != qpair->phase)
1131  			break;
1132  
1133  		tr = &qpair->tr[cpl->cid];
1134  		if (tr->active) {
1135  			nvme_qpair_complete_tracker(qpair, tr, cpl, true);
1136  		} else {
1137  			nvme_info("cpl does not map to outstanding cmd\n");
1138  			nvme_qpair_print_completion(qpair, cpl);
1139  			nvme_panic("received completion for unknown cmd\n");
1140  		}
1141  
1142  		if (++qpair->cq_head == qpair->entries) {
1143  			qpair->cq_head = 0;
1144  			qpair->phase = !qpair->phase;
1145  		}
1146  
1147  		if (++num_completions == max_completions)
1148  			break;
1149  	}
1150  
1151  	if (num_completions > 0)
1152  		nvme_mmio_write_4(qpair->cq_hdbl, qpair->cq_head);
1153  
1154  	pthread_mutex_unlock(&qpair->lock);
1155  
1156  	if (!STAILQ_EMPTY(&qpair->queued_req))
1157  		nvme_qpair_submit_queued_requests(qpair);
1158  
1159  	return num_completions;
1160  }
1161  
1162  void nvme_qpair_reset(struct nvme_qpair *qpair)
1163  {
1164  	pthread_mutex_lock(&qpair->lock);
1165  
1166  	qpair->sq_tail = qpair->cq_head = 0;
1167  
1168  	/*
1169  	 * First time through the completion queue, HW will set phase
1170  	 * bit on completions to 1.  So set this to 1 here, indicating
1171  	 * we're looking for a 1 to know which entries have completed.
1172  	 * We'll toggle the bit each time the completion queue rolls over.
1173  	 */
1174  	qpair->phase = 1;
1175  
1176  	memset(qpair->cmd, 0, qpair->entries * sizeof(struct nvme_cmd));
1177  	memset(qpair->cpl, 0, qpair->entries * sizeof(struct nvme_cpl));
1178  
1179  	pthread_mutex_unlock(&qpair->lock);
1180  }
1181  
1182  void nvme_qpair_enable(struct nvme_qpair *qpair)
1183  {
1184  	pthread_mutex_lock(&qpair->lock);
1185  
1186  	if (nvme_qpair_is_io_queue(qpair))
1187  		_nvme_qpair_io_qpair_enable(qpair);
1188  	else
1189  		_nvme_qpair_admin_qpair_enable(qpair);
1190  
1191  	pthread_mutex_unlock(&qpair->lock);
1192  }
1193  
1194  void nvme_qpair_disable(struct nvme_qpair *qpair)
1195  {
1196  	pthread_mutex_lock(&qpair->lock);
1197  
1198  	if (nvme_qpair_is_io_queue(qpair))
1199  		_nvme_qpair_io_qpair_disable(qpair);
1200  	else
1201  		_nvme_qpair_admin_qpair_disable(qpair);
1202  
1203  	pthread_mutex_unlock(&qpair->lock);
1204  }
1205  
1206  void nvme_qpair_fail(struct nvme_qpair *qpair)
1207  {
1208  	struct nvme_tracker *tr;
1209  	struct nvme_request *req;
1210  
1211  	pthread_mutex_lock(&qpair->lock);
1212  
1213  	while (!STAILQ_EMPTY(&qpair->queued_req)) {
1214  
1215  		nvme_notice("Failing queued I/O command\n");
1216  		req = STAILQ_FIRST(&qpair->queued_req);
1217  		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
1218  		nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC,
1219  						   NVME_SC_ABORTED_BY_REQUEST,
1220  						   true);
1221  
1222  	}
1223  
1224  	/* Manually abort each outstanding I/O. */
1225  	while (!LIST_EMPTY(&qpair->outstanding_tr)) {
1226  
1227  		/*
1228  		 * Do not remove the tracker. The abort_tracker path
1229  		 * will do that for us.
1230  		 */
1231  		nvme_notice("Failing outstanding I/O command\n");
1232  		tr = LIST_FIRST(&qpair->outstanding_tr);
1233  		nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC,
1234  						   NVME_SC_ABORTED_BY_REQUEST,
1235  						   1, true);
1236  
1237  	}
1238  
1239  	pthread_mutex_unlock(&qpair->lock);
1240  }
1241  
1242