xref: /haiku/src/add-ons/kernel/drivers/disk/nvme/nvme_disk.cpp (revision d8ffdea39e122821c22bb610e965a2823bbd4480)
1 /*
2  * Copyright 2019-2020, Haiku, Inc. All rights reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Augustin Cavalier <waddlesplash>
7  */
8 
9 
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <new>
#include <condition_variable.h>
#include <AutoDeleter.h>
#include <kernel.h>
#include <util/AutoLock.h>

#include <fs/devfs.h>
#include <bus/PCI.h>
#include <PCI_x86.h>
#include <vm/vm.h>

#include "IORequest.h"
25 
26 extern "C" {
27 #include <libnvme/nvme.h>
28 #include <libnvme/nvme_internal.h>
29 }
30 
31 
32 //#define TRACE_NVME_DISK
33 #ifdef TRACE_NVME_DISK
34 #	define TRACE(x...) dprintf("nvme_disk: " x)
35 #else
36 #	define TRACE(x...) ;
37 #endif
38 #define TRACE_ALWAYS(x...)	dprintf("nvme_disk: " x)
39 #define TRACE_ERROR(x...)	dprintf("\33[33mnvme_disk:\33[0m " x)
40 #define CALLED() 			TRACE("CALLED %s\n", __PRETTY_FUNCTION__)
41 
42 
// Raw vector icon data (the leading bytes 0x6e 0x63 0x69 0x66 are the "ncif"
// HVIF magic) handed to userland by the B_GET_VECTOR_ICON ioctl below.
static const uint8 kDriveIcon[] = {
	0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16,
	0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39,
	0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02,
	0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01,
	0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47,
	0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f,
	0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0,
	0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38,
	0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48,
	0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2,
	0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80,
	0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a,
	0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39,
	0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a,
	0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27,
	0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a,
	0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08,
	0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17,
	0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02,
	0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01,
	0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99,
	0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2,
	0xe6, 0x01, 0x17, 0x82, 0x00, 0x04
};
68 
69 
70 #define NVME_DISK_DRIVER_MODULE_NAME 	"drivers/disk/nvme_disk/driver_v1"
71 #define NVME_DISK_DEVICE_MODULE_NAME 	"drivers/disk/nvme_disk/device_v1"
72 #define NVME_DISK_DEVICE_ID_GENERATOR	"nvme_disk/device_id"
73 
74 #define NVME_MAX_QPAIRS					(8)
75 
76 
77 static device_manager_info* sDeviceManager;
78 static pci_x86_module_info* sPCIx86Module;
79 
// Per-controller driver state, allocated in nvme_disk_init_driver() and
// populated in nvme_disk_init_device().
typedef struct {
	device_node*			node;		// our node in the device tree
	pci_info				info;		// PCI info of the controller

	struct nvme_ctrlr*		ctrlr;		// libnvme controller handle

	struct nvme_ns*			ns;			// first namespace (only one exported)
	uint64					capacity;	// namespace size in blocks
	uint32					block_size;	// block size in bytes
	uint32					max_io_blocks;	// max blocks per single transfer
	status_t				media_status;

	// I/O queue pairs, handed out round-robin by get_qpair().
	struct qpair_info {
		struct nvme_qpair*	qpair;
	}						qpairs[NVME_MAX_QPAIRS];
	uint32					qpair_count;	// number of valid entries above
	uint32					next_qpair;		// round-robin cursor (wraps)

	DMAResource				dma_resource;
	sem_id					dma_buffers_sem;	// counts available DMA buffers

	// Read-locked during direct (aligned) writes, write-locked during
	// bounced writes, so the two cannot interleave on overlapping blocks.
	rw_lock					rounded_write_lock;

	ConditionVariable		interrupt;	// notified on every device interrupt
} nvme_disk_driver_info;
typedef nvme_disk_driver_info::qpair_info qpair_info;
106 
107 
// Per-open cookie; currently only refers back to the driver info.
typedef struct {
	nvme_disk_driver_info*		info;
} nvme_disk_handle;
111 
112 
113 static status_t
114 get_geometry(nvme_disk_handle* handle, device_geometry* geometry)
115 {
116 	nvme_disk_driver_info* info = handle->info;
117 
118 	devfs_compute_geometry_size(geometry, info->capacity, info->block_size);
119 
120 	geometry->device_type = B_DISK;
121 	geometry->removable = false;
122 
123 	geometry->read_only = false;
124 	geometry->write_once = false;
125 
126 	TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %d, %d, %d, %d\n",
127 		geometry->bytes_per_sector, geometry->sectors_per_track,
128 		geometry->cylinder_count, geometry->head_count, geometry->device_type,
129 		geometry->removable, geometry->read_only, geometry->write_once);
130 
131 	return B_OK;
132 }
133 
134 
135 static int
136 log2(uint32 x)
137 {
138 	int y;
139 
140 	for (y = 31; y >= 0; --y) {
141 		if (x == ((uint32)1 << y))
142 			break;
143 	}
144 
145 	return y;
146 }
147 
148 
149 static void
150 nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity,
151 	uint32 blockSize)
152 {
153 	TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %" B_PRIu32 ")\n",
154 		info, capacity, blockSize);
155 
156 	// get log2, if possible
157 	uint32 blockShift = log2(blockSize);
158 
159 	if ((1UL << blockShift) != blockSize)
160 		blockShift = 0;
161 
162 	info->capacity = capacity;
163 	info->block_size = blockSize;
164 }
165 
166 
167 //	#pragma mark - device module API
168 
169 
170 static int32 nvme_interrupt_handler(void* _info);
171 
172 
/*!	Opens the NVMe controller behind our PCI parent node, opens its first
	namespace, allocates I/O queue pairs and DMA bounce buffers, and wires
	up the (MSI-X, MSI, or legacy) interrupt.
	Returns B_OK and stores \a _info as the device cookie on success.
*/
static status_t
nvme_disk_init_device(void* _info, void** _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;

	// Fetch the PCI module and info from our parent node.
	pci_device_module_info* pci;
	pci_device* pcidev;
	device_node* parent = sDeviceManager->get_parent_node(info->node);
	sDeviceManager->get_driver(parent, (driver_module_info**)&pci,
		(void**)&pcidev);
	pci->get_pci_info(pcidev, &info->info);
	sDeviceManager->put_node(parent);

	// construct the libnvme pci_device struct
	// NOTE(review): this object is never deleted anywhere in this file; it
	// leaks outright if nvme_ctrlr_open() fails below. Presumably libnvme
	// keeps the pointer for the controller's lifetime -- confirm ownership
	// and free it on the error paths / in uninit.
	pci_device* device = new pci_device;
	device->vendor_id = info->info.vendor_id;
	device->device_id = info->info.device_id;
	device->subvendor_id = 0;
	device->subdevice_id = 0;

	device->domain = 0;
	device->bus = info->info.bus;
	device->dev = info->info.device;
	device->func = info->info.function;

	device->pci_info = &info->info;

	// enable busmaster and memory mapped access
	uint16 command = pci->read_pci_config(pcidev, PCI_command, 2);
	command |= PCI_command_master | PCI_command_memory;
	pci->write_pci_config(pcidev, PCI_command, 2, command);

	// open the controller
	info->ctrlr = nvme_ctrlr_open(device, NULL);
	if (info->ctrlr == NULL) {
		TRACE_ERROR("failed to open the controller!\n");
		return B_ERROR;
	}

	struct nvme_ctrlr_stat cstat;
	int err = nvme_ctrlr_stat(info->ctrlr, &cstat);
	if (err != 0) {
		TRACE_ERROR("failed to get controller information!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn);
	TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size);
	TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs);

	// TODO: export more than just the first namespace!
	info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]);
	if (info->ns == NULL) {
		TRACE_ERROR("failed to open namespace!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_ERROR;
	}

	struct nvme_ns_stat nsstat;
	err = nvme_ns_stat(info->ns, &nsstat);
	if (err != 0) {
		TRACE_ERROR("failed to get namespace information!\n");
		// NOTE(review): the namespace opened above is not closed on this or
		// any of the following error paths (nvme_ns_close() is only called
		// in uninit); verify whether nvme_ctrlr_close() tears it down.
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	// store capacity information
	nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size);

	TRACE("capacity: %" B_PRIu64 ", block_size %" B_PRIu32 "\n",
		info->capacity, info->block_size);

	// allocate qpairs
	info->qpair_count = info->next_qpair = 0;
	for (uint32 i = 0; i < NVME_MAX_QPAIRS && i < cstat.io_qpairs; i++) {
		info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr,
			(enum nvme_qprio)0, 0);
		if (info->qpairs[i].qpair == NULL)
			break;

		info->qpair_count++;
	}
	if (info->qpair_count == 0) {
		TRACE_ERROR("failed to allocate qpairs!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_NO_MEMORY;
	}

	// allocate DMA buffers
	int buffers = info->qpair_count * 2;

	dma_restrictions restrictions = {};
	restrictions.alignment = B_PAGE_SIZE;
		// Technically, the first and last segments in a transfer can be
		// unaligned, and the rest only need to have sizes that are a multiple
		// of the block size.
	restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2);
	restrictions.max_transfer_size = cstat.max_xfer_size;
	info->max_io_blocks = cstat.max_xfer_size / nsstat.sector_size;

	err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers);
	if (err != 0) {
		TRACE_ERROR("failed to initialize DMA resource!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem");
	if (info->dma_buffers_sem < 0) {
		TRACE_ERROR("failed to create DMA buffers semaphore!\n");
		nvme_ctrlr_close(info->ctrlr);
		return info->dma_buffers_sem;
	}

	// set up rounded-write lock
	rw_lock_init(&info->rounded_write_lock, "nvme rounded writes");

	// set up interrupt
	if (get_module(B_PCI_X86_MODULE_NAME, (module_info**)&sPCIx86Module)
			!= B_OK) {
		sPCIx86Module = NULL;
	}

	// Prefer MSI-X, then MSI; fall back to the legacy interrupt line.
	// NOTE(review): the chosen vector is not stored in "info", but
	// nvme_disk_uninit_device() removes the handler using interrupt_line --
	// with MSI(-X) enabled that can be the wrong vector.
	uint8 irq = info->info.u.h0.interrupt_line;
	if (sPCIx86Module != NULL) {
		if (sPCIx86Module->get_msix_count(info->info.bus, info->info.device,
				info->info.function)) {
			uint8 msixVector = 0;
			if (sPCIx86Module->configure_msix(info->info.bus, info->info.device,
					info->info.function, 1, &msixVector) == B_OK
				&& sPCIx86Module->enable_msix(info->info.bus, info->info.device,
					info->info.function) == B_OK) {
				TRACE_ALWAYS("using MSI-X\n");
				irq = msixVector;
			}
		} else if (sPCIx86Module->get_msi_count(info->info.bus,
				info->info.device, info->info.function) >= 1) {
			uint8 msiVector = 0;
			if (sPCIx86Module->configure_msi(info->info.bus, info->info.device,
					info->info.function, 1, &msiVector) == B_OK
				&& sPCIx86Module->enable_msi(info->info.bus, info->info.device,
					info->info.function) == B_OK) {
				TRACE_ALWAYS("using message signaled interrupts\n");
				irq = msiVector;
			}
		}
	} else {
		// No MSI support: make sure legacy interrupts are not disabled.
		uint16 command = pci->read_pci_config(pcidev, PCI_command, 2);
		command &= ~(PCI_command_int_disable);
		pci->write_pci_config(pcidev, PCI_command, 2, command);
	}

	if (irq == 0 || irq == 0xFF) {
		TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n",
			info->info.bus, info->info.device, info->info.function);
		// NOTE(review): unlike the earlier error paths, this one does not
		// close the controller, destroy the rw-lock, or delete the
		// semaphore -- those resources leak here.
		return B_ERROR;
	}
	info->interrupt.Init(NULL, NULL);
	install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info, B_NO_HANDLED_INFO);

	// Reduce interrupt load: coalesce completions (threshold 32 entries,
	// time quantum 16us expressed in 100us units, i.e. 0).
	if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) {
		uint32 microseconds = 16, threshold = 32;
		nvme_admin_set_feature(info->ctrlr, false, NVME_FEAT_INTERRUPT_COALESCING,
			((microseconds / 100) << 8) | threshold, 0, NULL);
	}

	*_cookie = info;
	return B_OK;
}
344 
345 
/*!	Tears down interrupt handling and closes the namespace and controller. */
static void
nvme_disk_uninit_device(void* _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;

	// NOTE(review): if MSI(-X) was enabled in init_device(), the handler was
	// installed on the message vector, not on interrupt_line, so this may
	// remove the wrong handler; the actual IRQ should be stored in "info".
	remove_io_interrupt_handler(info->info.u.h0.interrupt_line,
		nvme_interrupt_handler, (void*)info);

	rw_lock_destroy(&info->rounded_write_lock);

	// NOTE(review): dma_buffers_sem created in init_device() is never
	// deleted here -- verify whether that is intentional.
	nvme_ns_close(info->ns);
	nvme_ctrlr_close(info->ctrlr);

	// TODO: Deallocate MSI(-X).
	// TODO: Deallocate PCI.
}
363 
364 
365 static status_t
366 nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie)
367 {
368 	CALLED();
369 
370 	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
371 	nvme_disk_handle* handle = (nvme_disk_handle*)malloc(
372 		sizeof(nvme_disk_handle));
373 	if (handle == NULL)
374 		return B_NO_MEMORY;
375 
376 	handle->info = info;
377 
378 	*_cookie = handle;
379 	return B_OK;
380 }
381 
382 
/*!	Nothing to do on close; the handle is released in nvme_disk_free(). */
static status_t
nvme_disk_close(void* cookie)
{
	CALLED();

	//nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	return B_OK;
}
391 
392 
393 static status_t
394 nvme_disk_free(void* cookie)
395 {
396 	CALLED();
397 
398 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
399 	free(handle);
400 	return B_OK;
401 }
402 
403 
404 // #pragma mark - I/O
405 
406 
/*!	Interrupt handler: wakes every thread blocked in await_status().
	Completions themselves are reaped by the waiters via nvme_qpair_poll();
	the handler only signals the shared condition variable.
*/
static int32
nvme_interrupt_handler(void* _info)
{
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	info->interrupt.NotifyAll();
	return 0;
}
414 
415 
416 static qpair_info*
417 get_qpair(nvme_disk_driver_info* info)
418 {
419 	return &info->qpairs[atomic_add((int32*)&info->next_qpair, 1)
420 		% info->qpair_count];
421 }
422 
423 
424 static void
425 io_finished_callback(status_t* status, const struct nvme_cpl* cpl)
426 {
427 	*status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK;
428 }
429 
430 
/*!	Blocks until \a status leaves EINPROGRESS, i.e. until the completion
	callback for the queued command fires.

	Alternates between polling \a qpair directly and sleeping on the shared
	interrupt condition variable. After about four consecutive 5-second
	waits without any interrupt the qpair is failed and the wait aborts
	with B_TIMED_OUT.
*/
static void
await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status)
{
	CALLED();

	ConditionVariableEntry entry;
	int timeouts = 0;
	while (status == EINPROGRESS) {
		// Register for notification *before* polling, so a completion that
		// arrives between the poll and the Wait() still wakes us.
		info->interrupt.Add(&entry);

		nvme_qpair_poll(qpair, 0);

		if (status != EINPROGRESS)
			return;

		if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) {
			// This should never happen, as we are woken up on every interrupt
			// no matter the qpair or transfer within; so if it does occur,
			// that probably means the controller stalled or something.

			TRACE_ERROR("timed out waiting for interrupt!\n");
			if (timeouts++ >= 3) {
				nvme_qpair_fail(qpair);
				status = B_TIMED_OUT;
				return;
			}
		}

		nvme_qpair_poll(qpair, 0);
	}
}
462 
463 
// State of one (possibly multi-segment) read or write handed to libnvme via
// the SGL callbacks below.
struct nvme_io_request {
	status_t status;		// EINPROGRESS until io_finished_callback() fires

	bool write;				// true: write; false: read

	off_t lba_start;		// first logical block of the transfer
	size_t lba_count;		// number of logical blocks

	physical_entry* iovecs;	// physical scatter/gather list
	int32 iovec_count;

	// Cursor used by ior_reset_sgl() / ior_next_sge().
	int32 iovec_i;			// index of the current vector
	uint32 iovec_offset;	// byte offset into the current vector
};
478 
479 
480 void ior_reset_sgl(nvme_io_request* request, uint32_t offset)
481 {
482 	TRACE("IOR Reset: %" B_PRIu32 "\n", offset);
483 
484 	int32 i = 0;
485 	while (offset > 0 && request->iovecs[i].size <= offset) {
486 		offset -= request->iovecs[i].size;
487 		i++;
488 	}
489 	request->iovec_i = i;
490 	request->iovec_offset = offset;
491 }
492 
493 
494 int ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length)
495 {
496 	int32 index = request->iovec_i;
497 	if (index < 0 || index > request->iovec_count)
498 		return -1;
499 
500 	*address = request->iovecs[index].address + request->iovec_offset;
501 	*length = request->iovecs[index].size - request->iovec_offset;
502 
503 	TRACE("IOV %d (+ " B_PRIu32 "): 0x%" B_PRIx64 ", %" B_PRIu32 "\n",
504 		request->iovec_i, request->iovec_offset, *address, *length);
505 
506 	request->iovec_i++;
507 	request->iovec_offset = 0;
508 	return 0;
509 }
510 
511 
/*!	Submits \a request to a queue pair and synchronously waits for its
	completion. On failure, \a request->lba_count is zeroed so the caller
	accumulates no transferred bytes for it.
	Returns the completion status (or the libnvme submission error).
*/
static status_t
do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request)
{
	// The completion callback flips this to the final status.
	request->status = EINPROGRESS;

	qpair_info* qpinfo = get_qpair(info);
	int ret = -1;
	if (request->write) {
		ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	} else {
		ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	}
	if (ret != 0) {
		TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
		return ret;
	}

	// Block until io_finished_callback() stores the completion status.
	await_status(info, qpinfo->qpair, request->status);

	if (request->status != B_OK) {
		TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
	}
	return request->status;
}
550 
551 
/*!	Carries out \a request through the driver's DMA bounce buffers.

	Used whenever the request is not block-aligned or its vectors do not
	satisfy the direct-I/O alignment rules (see nvme_disk_io()). Writes take
	the rounded-write lock exclusively so that read-modify-write cycles on
	partial blocks cannot race with direct writes.
*/
static status_t
nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request)
{
	CALLED();

	WriteLocker writeLocker;
	if (request->IsWrite())
		writeLocker.SetTo(handle->info->rounded_write_lock, false);

	// Wait for a DMA buffer to become available.
	status_t status = acquire_sem(handle->info->dma_buffers_sem);
	if (status != B_OK) {
		request->SetStatusAndNotify(status);
		return status;
	}

	const size_t block_size = handle->info->block_size;

	TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR
		"; Write %s\n", request, request->Offset(), request->Length(),
		request->IsWrite() ? "yes" : "no");

	// Translate the request into DMA-safe operations, one buffer at a time.
	nvme_io_request nvme_request;
	while (request->RemainingBytes() > 0) {
		IOOperation operation;
		status = handle->info->dma_resource.TranslateNext(request, &operation, 0);
		if (status != B_OK)
			break;

		size_t transferredBytes = 0;
		do {
			TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR
				", write: %s\n", request, operation.Offset(),
				operation.Length(), operation.IsWrite() ? "yes" : "no");

			nvme_request.write = operation.IsWrite();
			nvme_request.lba_start = operation.Offset() / block_size;
			nvme_request.lba_count = operation.Length() / block_size;
			nvme_request.iovecs = (physical_entry*)operation.Vecs();
			nvme_request.iovec_count = operation.VecCount();

			status = do_nvme_io_request(handle->info, &nvme_request);

			// Only count the bytes toward the caller's transfer if the phase
			// matches the request direction (a read-before-write phase of a
			// partial-block write does not).
			if (status == B_OK && nvme_request.write == request->IsWrite())
				transferredBytes += operation.OriginalLength();

			operation.SetStatus(status);
		} while (status == B_OK && !operation.Finish());

		if (status == B_OK && operation.Status() != B_OK) {
			TRACE_ERROR("I/O succeeded but IOOperation failed!\n");
			status = operation.Status();
		}

		operation.SetTransferredBytes(transferredBytes);
		request->OperationFinished(&operation, status, status != B_OK,
			operation.OriginalOffset() + transferredBytes);

		handle->info->dma_resource.RecycleBuffer(operation.Buffer());

		TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request,
			strerror(status), request->RemainingBytes());
		if (status != B_OK)
			break;
	}

	release_sem(handle->info->dma_buffers_sem);

	// Notify() also takes care of UnlockMemory().
	if (status != B_OK && request->Status() == B_OK)
		request->SetStatusAndNotify(status);
	else
		request->NotifyFinished();
	return status;
}
625 
626 
627 static status_t
628 nvme_disk_io(void* cookie, io_request* request)
629 {
630 	CALLED();
631 
632 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
633 
634 	nvme_io_request nvme_request;
635 	memset(&nvme_request, 0, sizeof(nvme_io_request));
636 
637 	nvme_request.write = request->IsWrite();
638 
639 	physical_entry* vtophys = NULL;
640 	MemoryDeleter vtophysDeleter;
641 
642 	IOBuffer* buffer = request->Buffer();
643 	status_t status = B_OK;
644 	if (!buffer->IsPhysical()) {
645 		status = buffer->LockMemory(request->TeamID(), request->IsWrite());
646 		if (status != B_OK) {
647 			TRACE_ERROR("failed to lock memory: %s\n", strerror(status));
648 			return status;
649 		}
650 		// SetStatusAndNotify() takes care of unlocking memory if necessary.
651 
652 		// This is slightly inefficient, as we could use a BStackOrHeapArray in
653 		// the optimal case (few physical entries required), but we would not
654 		// know whether or not that was possible until calling get_memory_map()
655 		// and then potentially reallocating, which would complicate the logic.
656 
657 		int32 vtophys_length = (request->Length() / B_PAGE_SIZE) + 2;
658 		nvme_request.iovecs = vtophys = (physical_entry*)malloc(sizeof(physical_entry)
659 			* vtophys_length);
660 		if (vtophys == NULL) {
661 			TRACE_ERROR("failed to allocate memory for iovecs\n");
662 			request->SetStatusAndNotify(B_NO_MEMORY);
663 			return B_NO_MEMORY;
664 		}
665 		vtophysDeleter.SetTo(vtophys);
666 
667 		for (size_t i = 0; i < buffer->VecCount(); i++) {
668 			generic_io_vec virt = buffer->VecAt(i);
669 			uint32 entries = vtophys_length - nvme_request.iovec_count;
670 
671 			// Avoid copies by going straight into the vtophys array.
672 			status = get_memory_map_etc(request->TeamID(), (void*)virt.base,
673 				virt.length, vtophys + nvme_request.iovec_count, &entries);
674 			if (status == B_BUFFER_OVERFLOW) {
675 				TRACE("vtophys array was too small, reallocating\n");
676 
677 				vtophysDeleter.Detach();
678 				vtophys_length *= 2;
679 				nvme_request.iovecs = vtophys = (physical_entry*)realloc(vtophys,
680 					sizeof(physical_entry) * vtophys_length);
681 				vtophysDeleter.SetTo(vtophys);
682 				if (vtophys == NULL) {
683 					status = B_NO_MEMORY;
684 				} else {
685 					// Try again, with the larger buffer this time.
686 					i--;
687 					continue;
688 				}
689 			}
690 			if (status != B_OK) {
691 				TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status));
692 				request->SetStatusAndNotify(status);
693 				return status;
694 			}
695 
696 			nvme_request.iovec_count += entries;
697 		}
698 	} else {
699 		nvme_request.iovecs = (physical_entry*)buffer->Vecs();
700 		nvme_request.iovec_count = buffer->VecCount();
701 	}
702 
703 	// See if we need to bounce anything other than the first or last vec.
704 	const size_t block_size = handle->info->block_size;
705 	bool bounceAll = false;
706 	for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) {
707 		if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0)
708 			bounceAll = true;
709 		if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0)
710 			bounceAll = true;
711 	}
712 
713 	// See if we need to bounce due to the first or last vec.
714 	if (nvme_request.iovec_count > 1) {
715 		physical_entry* entry = &nvme_request.iovecs[0];
716 		if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0
717 				|| (entry->size % block_size) != 0))
718 			bounceAll = true;
719 
720 		entry = &nvme_request.iovecs[nvme_request.iovec_count - 1];
721 		if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0
722 				|| (entry->size % block_size) != 0))
723 			bounceAll = true;
724 	}
725 
726 	// See if we need to bounce due to rounding.
727 	const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size);
728 	phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset()
729 		- rounded_pos), block_size);
730 	if (rounded_pos != request->Offset() || rounded_len != request->Length())
731 		bounceAll = true;
732 
733 	if (bounceAll) {
734 		// Let the bounced I/O routine take care of everything from here.
735 		return nvme_disk_bounced_io(handle, request);
736 	}
737 
738 	nvme_request.lba_start = rounded_pos / block_size;
739 	nvme_request.lba_count = rounded_len / block_size;
740 
741 	// No bouncing was required.
742 	ReadLocker readLocker;
743 	if (nvme_request.write)
744 		readLocker.SetTo(handle->info->rounded_write_lock, false);
745 
746 	// Error check before actually doing I/O.
747 	if (status != B_OK) {
748 		TRACE_ERROR("I/O failed early: %s\n", strerror(status));
749 		request->SetStatusAndNotify(status);
750 		return status;
751 	}
752 
753 	const uint32 max_io_blocks = handle->info->max_io_blocks;
754 	int32 remaining = nvme_request.iovec_count;
755 	while (remaining > 0) {
756 		nvme_request.iovec_count = min_c(remaining,
757 			NVME_MAX_SGL_DESCRIPTORS / 2);
758 
759 		nvme_request.lba_count = 0;
760 		for (int i = 0; i < nvme_request.iovec_count; i++) {
761 			int32 new_lba_count = nvme_request.lba_count
762 				+ (nvme_request.iovecs[i].size / block_size);
763 			if (nvme_request.lba_count > 0 && new_lba_count > max_io_blocks) {
764 				// We already have a nonzero length, and adding this vec would
765 				// make us go over (or we already are over.) Stop adding.
766 				nvme_request.iovec_count = i;
767 				break;
768 			}
769 
770 			nvme_request.lba_count = new_lba_count;
771 		}
772 
773 		status = do_nvme_io_request(handle->info, &nvme_request);
774 		if (status != B_OK)
775 			break;
776 
777 		nvme_request.iovecs += nvme_request.iovec_count;
778 		remaining -= nvme_request.iovec_count;
779 		nvme_request.lba_start += nvme_request.lba_count;
780 	}
781 
782 	if (status != B_OK)
783 		TRACE_ERROR("I/O failed: %s\n", strerror(status));
784 
785 	request->SetTransferredBytes(status != B_OK,
786 		(nvme_request.lba_start * block_size) - rounded_pos);
787 	request->SetStatusAndNotify(status);
788 	return status;
789 }
790 
791 
792 static status_t
793 nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length)
794 {
795 	CALLED();
796 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
797 
798 	const off_t end = (handle->info->capacity * handle->info->block_size);
799 	if (pos >= end)
800 		return B_BAD_VALUE;
801 	if (pos + (off_t)*length > end)
802 		*length = end - pos;
803 
804 	IORequest request;
805 	status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0);
806 	if (status != B_OK)
807 		return status;
808 
809 	status = nvme_disk_io(handle, &request);
810 	*length = request.TransferredBytes();
811 	return status;
812 }
813 
814 
815 static status_t
816 nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length)
817 {
818 	CALLED();
819 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
820 
821 	const off_t end = (handle->info->capacity * handle->info->block_size);
822 	if (pos >= end)
823 		return B_BAD_VALUE;
824 	if (pos + (off_t)*length > end)
825 		*length = end - pos;
826 
827 	IORequest request;
828 	status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0);
829 	if (status != B_OK)
830 		return status;
831 
832 	status = nvme_disk_io(handle, &request);
833 	*length = request.TransferredBytes();
834 	return status;
835 }
836 
837 
838 static status_t
839 nvme_disk_flush(nvme_disk_driver_info* info)
840 {
841 	status_t status = EINPROGRESS;
842 
843 	qpair_info* qpinfo = get_qpair(info);
844 	int ret = nvme_ns_flush(info->ns, qpinfo->qpair,
845 		(nvme_cmd_cb)io_finished_callback, &status);
846 	if (ret != 0)
847 		return ret;
848 
849 	await_status(info, qpinfo->qpair, status);
850 	return status;
851 }
852 
853 
854 static status_t
855 nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length)
856 {
857 	CALLED();
858 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
859 	nvme_disk_driver_info* info = handle->info;
860 
861 	TRACE("ioctl(op = %" B_PRId32 ")\n", op);
862 
863 	switch (op) {
864 		case B_GET_MEDIA_STATUS:
865 		{
866 			*(status_t *)buffer = info->media_status;
867 			info->media_status = B_OK;
868 			return B_OK;
869 			break;
870 		}
871 
872 		case B_GET_DEVICE_SIZE:
873 		{
874 			size_t size = info->capacity * info->block_size;
875 			return user_memcpy(buffer, &size, sizeof(size_t));
876 		}
877 
878 		case B_GET_GEOMETRY:
879 		{
880 			if (buffer == NULL /*|| length != sizeof(device_geometry)*/)
881 				return B_BAD_VALUE;
882 
883 		 	device_geometry geometry;
884 			status_t status = get_geometry(handle, &geometry);
885 			if (status != B_OK)
886 				return status;
887 
888 			return user_memcpy(buffer, &geometry, sizeof(device_geometry));
889 		}
890 
891 		case B_GET_ICON_NAME:
892 			return user_strlcpy((char*)buffer, "devices/drive-harddisk",
893 				B_FILE_NAME_LENGTH);
894 
895 		case B_GET_VECTOR_ICON:
896 		{
897 			device_icon iconData;
898 			if (length != sizeof(device_icon))
899 				return B_BAD_VALUE;
900 			if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK)
901 				return B_BAD_ADDRESS;
902 
903 			if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) {
904 				if (user_memcpy(iconData.icon_data, kDriveIcon,
905 						sizeof(kDriveIcon)) != B_OK)
906 					return B_BAD_ADDRESS;
907 			}
908 
909 			iconData.icon_size = sizeof(kDriveIcon);
910 			return user_memcpy(buffer, &iconData, sizeof(device_icon));
911 		}
912 
913 		case B_FLUSH_DRIVE_CACHE:
914 			return nvme_disk_flush(info);
915 	}
916 
917 	return B_DEV_INVALID_IOCTL;
918 }
919 
920 
921 //	#pragma mark - driver module API
922 
923 
924 static float
925 nvme_disk_supports_device(device_node *parent)
926 {
927 	CALLED();
928 
929 	const char* bus;
930 	uint16 baseClass, subClass;
931 
932 	if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false) != B_OK
933 		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass, false) != B_OK
934 		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE, &subClass, false) != B_OK)
935 		return -1.0f;
936 
937 	if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage)
938 		return 0.0f;
939 
940 	if (subClass != PCI_nvm)
941 		return 0.0f;
942 
943 	TRACE("NVMe device found!\n");
944 	return 1.0f;
945 }
946 
947 
/*!	Registers our driver node under the matched PCI \a parent. */
static status_t
nvme_disk_register_device(device_node* parent)
{
	CALLED();

	device_attr attrs[] = {
		{ B_DEVICE_PRETTY_NAME, B_STRING_TYPE, { string: "NVMe Disk" } },
		{ NULL }
	};

	return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME,
		attrs, NULL, NULL);
}
961 
962 
963 static status_t
964 nvme_disk_init_driver(device_node* node, void** cookie)
965 {
966 	CALLED();
967 
968 	int ret = nvme_lib_init((enum nvme_log_level)0, (enum nvme_log_facility)0, NULL);
969 	if (ret != 0) {
970 		TRACE_ERROR("libnvme initialization failed!\n");
971 		return ret;
972 	}
973 
974 	nvme_disk_driver_info* info = new nvme_disk_driver_info;
975 	if (info == NULL)
976 		return B_NO_MEMORY;
977 
978 	info->media_status = B_OK;
979 	info->node = node;
980 
981 	info->ctrlr = NULL;
982 
983 	*cookie = info;
984 	return B_OK;
985 }
986 
987 
988 static void
989 nvme_disk_uninit_driver(void* _cookie)
990 {
991 	CALLED();
992 
993 	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
994 	free(info);
995 }
996 
997 
998 static status_t
999 nvme_disk_register_child_devices(void* _cookie)
1000 {
1001 	CALLED();
1002 
1003 	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
1004 	status_t status;
1005 
1006 	int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR);
1007 	if (id < 0)
1008 		return id;
1009 
1010 	char name[64];
1011 	snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw",
1012 		id);
1013 
1014 	status = sDeviceManager->publish_device(info->node, name,
1015 		NVME_DISK_DEVICE_MODULE_NAME);
1016 
1017 	return status;
1018 }
1019 
1020 
1021 //	#pragma mark -
1022 
1023 
// Modules this driver depends on; resolved by the kernel module loader.
module_dependency module_dependencies[] = {
	{ B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager },
	{ NULL }
};

// Hooks for the published device ("disk/nvme/<id>/raw").
struct device_module_info sNvmeDiskDevice = {
	{
		NVME_DISK_DEVICE_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_init_device,
	nvme_disk_uninit_device,
	NULL, // remove,

	nvme_disk_open,
	nvme_disk_close,
	nvme_disk_free,
	nvme_disk_read,
	nvme_disk_write,
	nvme_disk_io,
	nvme_disk_ioctl,

	NULL,	// select
	NULL,	// deselect
};

// Hooks for the driver node attached to the PCI device.
struct driver_module_info sNvmeDiskDriver = {
	{
		NVME_DISK_DRIVER_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_supports_device,
	nvme_disk_register_device,
	nvme_disk_init_driver,
	nvme_disk_uninit_driver,
	nvme_disk_register_child_devices,
	NULL,	// rescan
	NULL,	// removed
};

// Modules exported by this add-on.
module_info* modules[] = {
	(module_info*)&sNvmeDiskDriver,
	(module_info*)&sNvmeDiskDevice,
	NULL
};
1073