/*
 * Copyright 2019-2022, Haiku, Inc. All rights reserved.
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 *		Augustin Cavalier <waddlesplash>
 */


#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <condition_variable.h>
#include <AutoDeleter.h>
#include <kernel.h>
#include <smp.h>
#include <util/AutoLock.h>

#include <fs/devfs.h>
#include <bus/PCI.h>
#include <PCI_x86.h>
#include <vm/vm.h>

#include "IORequest.h"

extern "C" {
#include <libnvme/nvme.h>
#include <libnvme/nvme_internal.h>
}


//#define TRACE_NVME_DISK
#ifdef TRACE_NVME_DISK
#	define TRACE(x...) dprintf("nvme_disk: " x)
#else
#	define TRACE(x...) ;
#endif
#define TRACE_ALWAYS(x...)	dprintf("nvme_disk: " x)
#define TRACE_ERROR(x...)	dprintf("\33[33mnvme_disk:\33[0m " x)
#define CALLED() 			TRACE("CALLED %s\n", __PRETTY_FUNCTION__)


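// Vector icon (HVIF data; note the "ncif" magic in the first four bytes)
// returned by the B_GET_VECTOR_ICON ioctl below.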
static const uint8 kDriveIcon[] = {
	0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16,
	0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39,
	0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02,
	0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01,
	0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47,
	0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f,
	0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0,
	0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38,
	0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48,
	0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2,
	0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80,
	0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a,
	0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39,
	0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a,
	0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27,
	0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a,
	0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08,
	0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17,
	0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02,
	0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01,
	0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99,
	0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2,
	0xe6, 0x01, 0x17, 0x82, 0x00, 0x04
};


#define NVME_DISK_DRIVER_MODULE_NAME 	"drivers/disk/nvme_disk/driver_v1"
#define NVME_DISK_DEVICE_MODULE_NAME 	"drivers/disk/nvme_disk/device_v1"
#define NVME_DISK_DEVICE_ID_GENERATOR	"nvme_disk/device_id"

#define NVME_MAX_QPAIRS					(16)


static device_manager_info* sDeviceManager;
static pci_x86_module_info* sPCIx86Module;

typedef struct {
	device_node*			node;
	pci_info				info;

	struct nvme_ctrlr*		ctrlr;

	struct nvme_ns*			ns;
	uint64					capacity;
	uint32					block_size;
	uint32					max_io_blocks;
	status_t				media_status;

	DMAResource				dma_resource;
	sem_id					dma_buffers_sem;

	rw_lock					rounded_write_lock;

	ConditionVariable		interrupt;
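	// 1 when operating in polling mode (no usable interrupt, or interrupt
	// delivery timed out); 0 while interrupts are assumed to work; set to -1
	// by the interrupt handler once an interrupt actually arrives.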
	int32					polling;

	struct qpair_info {
		struct nvme_qpair*	qpair;
	}						qpairs[NVME_MAX_QPAIRS];
	uint32					qpair_count;
} nvme_disk_driver_info;
typedef nvme_disk_driver_info::qpair_info qpair_info;


typedef struct {
	nvme_disk_driver_info*		info;
} nvme_disk_handle;


static status_t
get_geometry(nvme_disk_handle* handle, device_geometry* geometry)
{
	nvme_disk_driver_info* info = handle->info;

	devfs_compute_geometry_size(geometry, info->capacity, info->block_size);
	geometry->bytes_per_physical_sector = info->block_size;

	geometry->device_type = B_DISK;
	geometry->removable = false;

	geometry->read_only = false;
	geometry->write_once = false;

	TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %d, %d, %d, %d\n",
		geometry->bytes_per_sector, geometry->sectors_per_track,
		geometry->cylinder_count, geometry->head_count, geometry->device_type,
		geometry->removable, geometry->read_only, geometry->write_once);

	return B_OK;
}


static void
nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity,
	uint32 blockSize)
{
	TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %" B_PRIu32 ")\n",
		info, capacity, blockSize);

	info->capacity = capacity;
	info->block_size = blockSize;
}


//	#pragma mark - device module API


static int32 nvme_interrupt_handler(void* _info);


static status_t
nvme_disk_init_device(void* _info, void** _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	ASSERT(info->ctrlr == NULL);

	pci_device_module_info* pci;
	pci_device* pcidev;
	device_node* parent = sDeviceManager->get_parent_node(info->node);
	sDeviceManager->get_driver(parent, (driver_module_info**)&pci,
		(void**)&pcidev);
	pci->get_pci_info(pcidev, &info->info);
	sDeviceManager->put_node(parent);

	// construct the libnvme pci_device struct
	pci_device* device = new pci_device;
	device->vendor_id = info->info.vendor_id;
	device->device_id = info->info.device_id;
	device->subvendor_id = 0;
	device->subdevice_id = 0;

	device->domain = 0;
	device->bus = info->info.bus;
	device->dev = info->info.device;
	device->func = info->info.function;

	device->pci_info = &info->info;

	// enable busmaster and memory mapped access
	uint16 command = pci->read_pci_config(pcidev, PCI_command, 2);
	command |= PCI_command_master | PCI_command_memory;
	pci->write_pci_config(pcidev, PCI_command, 2, command);

	// open the controller
	info->ctrlr = nvme_ctrlr_open(device, NULL);
	if (info->ctrlr == NULL) {
		TRACE_ERROR("failed to open the controller!\n");
		return B_ERROR;
	}

	struct nvme_ctrlr_stat cstat;
	int err = nvme_ctrlr_stat(info->ctrlr, &cstat);
	if (err != 0) {
		TRACE_ERROR("failed to get controller information!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn);
	TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size);
	TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs);

	// TODO: export more than just the first namespace!
	info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]);
	if (info->ns == NULL) {
		TRACE_ERROR("failed to open namespace!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_ERROR;
	}
	TRACE_ALWAYS("namespace 0\n");

	struct nvme_ns_stat nsstat;
	err = nvme_ns_stat(info->ns, &nsstat);
	if (err != 0) {
		TRACE_ERROR("failed to get namespace information!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	// store capacity information
	TRACE_ALWAYS("\tblock size: %" B_PRIuSIZE ", stripe size: %u\n",
		nsstat.sector_size, info->ns->stripe_size);
	nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size);

	// set up interrupts
	if (get_module(B_PCI_X86_MODULE_NAME, (module_info**)&sPCIx86Module)
			!= B_OK) {
		sPCIx86Module = NULL;
	}

	command = pci->read_pci_config(pcidev, PCI_command, 2);
	command &= ~(PCI_command_int_disable);
	pci->write_pci_config(pcidev, PCI_command, 2, command);

	uint8 irq = info->info.u.h0.interrupt_line;
	if (sPCIx86Module != NULL) {
		if (sPCIx86Module->get_msix_count(info->info.bus, info->info.device,
				info->info.function)) {
			uint8 msixVector = 0;
			if (sPCIx86Module->configure_msix(info->info.bus, info->info.device,
					info->info.function, 1, &msixVector) == B_OK
				&& sPCIx86Module->enable_msix(info->info.bus, info->info.device,
					info->info.function) == B_OK) {
				TRACE_ALWAYS("using MSI-X\n");
				irq = msixVector;
			}
		} else if (sPCIx86Module->get_msi_count(info->info.bus,
				info->info.device, info->info.function) >= 1) {
			uint8 msiVector = 0;
			if (sPCIx86Module->configure_msi(info->info.bus, info->info.device,
					info->info.function, 1, &msiVector) == B_OK
				&& sPCIx86Module->enable_msi(info->info.bus, info->info.device,
					info->info.function) == B_OK) {
				TRACE_ALWAYS("using message signaled interrupts\n");
				irq = msiVector;
			}
		}
	}

	if (irq == 0 || irq == 0xFF) {
		TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n",
			info->info.bus, info->info.device, info->info.function);
		info->polling = 1;
	} else {
		info->polling = 0;
	}
	info->interrupt.Init(NULL, NULL);
	install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info,
		B_NO_HANDLED_INFO);

	if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) {
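		// Encode the feature value: per the NVMe spec's Interrupt Coalescing
		// feature, the aggregation time (in 100-microsecond increments) goes
		// in bits 15:8 and the aggregation threshold (in completions) in
		// bits 7:0.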
		uint32 microseconds = 16, threshold = 32;
		nvme_admin_set_feature(info->ctrlr, false, NVME_FEAT_INTERRUPT_COALESCING,
			((microseconds / 100) << 8) | threshold, 0, NULL);
	}

	// allocate qpairs
	uint32 try_qpairs = cstat.io_qpairs;
	try_qpairs = min_c(try_qpairs, NVME_MAX_QPAIRS);
	if (try_qpairs >= (uint32)smp_get_num_cpus()) {
		try_qpairs = smp_get_num_cpus();
	} else {
		// Find the highest number of qpairs that evenly divides the number of CPUs.
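		// (get_qpair() below picks a pair as current CPU % qpair_count, so an
		// even divisor maps every CPU onto exactly one qpair without skew.)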
		while ((smp_get_num_cpus() % try_qpairs) != 0)
			try_qpairs--;
	}
	info->qpair_count = 0;
	for (uint32 i = 0; i < try_qpairs; i++) {
		info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr,
			(enum nvme_qprio)0, 0);
		if (info->qpairs[i].qpair == NULL)
			break;

		info->qpair_count++;
	}
	if (info->qpair_count == 0) {
		TRACE_ERROR("failed to allocate qpairs!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_NO_MEMORY;
	}
	if (info->qpair_count != try_qpairs) {
		TRACE_ALWAYS("warning: did not get expected number of qpairs\n");
	}

	// allocate DMA buffers
	int buffers = info->qpair_count * 2;

	dma_restrictions restrictions = {};
	restrictions.alignment = B_PAGE_SIZE;
		// Technically, the first and last segments in a transfer need only be
		// 32-bit-aligned, and the rest need only have sizes that are a
		// multiple of the block size.
	restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2);
	restrictions.max_transfer_size = cstat.max_xfer_size;
	info->max_io_blocks = cstat.max_xfer_size / nsstat.sector_size;

	err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers);
	if (err != 0) {
		TRACE_ERROR("failed to initialize DMA resource!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

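	// The semaphore counts free bounce buffers: nvme_disk_bounced_io()
	// acquires it once per request and releases it when the request is done.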
	info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem");
	if (info->dma_buffers_sem < 0) {
		TRACE_ERROR("failed to create DMA buffers semaphore!\n");
		nvme_ctrlr_close(info->ctrlr);
		return info->dma_buffers_sem;
	}

	// set up rounded-write lock
	rw_lock_init(&info->rounded_write_lock, "nvme rounded writes");

	*_cookie = info;
	return B_OK;
}


static void
nvme_disk_uninit_device(void* _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;

	remove_io_interrupt_handler(info->info.u.h0.interrupt_line,
		nvme_interrupt_handler, (void*)info);

	rw_lock_destroy(&info->rounded_write_lock);

	nvme_ns_close(info->ns);
	nvme_ctrlr_close(info->ctrlr);

	// TODO: Deallocate MSI(-X).
	// TODO: Deallocate PCI.
}


static status_t
nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	nvme_disk_handle* handle = (nvme_disk_handle*)malloc(
		sizeof(nvme_disk_handle));
	if (handle == NULL)
		return B_NO_MEMORY;

	handle->info = info;

	*_cookie = handle;
	return B_OK;
}


static status_t
nvme_disk_close(void* cookie)
{
	CALLED();

	//nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	return B_OK;
}


static status_t
nvme_disk_free(void* cookie)
{
	CALLED();

	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	free(handle);
	return B_OK;
}


// #pragma mark - I/O


static int32
nvme_interrupt_handler(void* _info)
{
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	info->interrupt.NotifyAll();
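	// An interrupt arrived, so interrupt delivery evidently works: mark this
	// so await_status() does not (re-)enter polling mode.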
	info->polling = -1;
	return 0;
}


static qpair_info*
get_qpair(nvme_disk_driver_info* info)
{
	return &info->qpairs[smp_get_current_cpu() % info->qpair_count];
}


static void
io_finished_callback(status_t* status, const struct nvme_cpl* cpl)
{
	*status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK;
}


static void
await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status)
{
	CALLED();

	ConditionVariableEntry entry;
	int timeouts = 0;
	while (status == EINPROGRESS) {
		info->interrupt.Add(&entry);

		nvme_qpair_poll(qpair, 0);

		if (status != EINPROGRESS)
			return;

		if (info->polling > 0) {
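			// Poll with exponential backoff: sleep 1ms, 2ms, 4ms, ... between
			// completion-queue polls, capped at 5 seconds.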
			entry.Wait(B_RELATIVE_TIMEOUT, min_c(5 * 1000 * 1000,
				(1 << timeouts) * 1000));
			timeouts++;
		} else if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) {
			// This should never happen, as we are woken up on every interrupt
			// no matter the qpair or transfer within; so if it does occur,
			// that probably means the controller stalled, or maybe cannot
			// generate interrupts at all.

			TRACE_ERROR("timed out waiting for interrupt!\n");
			if (timeouts++ >= 3) {
				nvme_qpair_fail(qpair);
				status = B_TIMED_OUT;
				return;
			}

			info->polling++;
			if (info->polling > 0) {
				TRACE_ALWAYS("switching to polling mode, performance will be affected!\n");
			}
		}

		nvme_qpair_poll(qpair, 0);
	}
}


struct nvme_io_request {
	status_t status;

	bool write;

	off_t lba_start;
	size_t lba_count;

	physical_entry* iovecs;
	int32 iovec_count;

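	// Cursor into "iovecs", maintained by the reset_sgl/next_sge callbacks
	// below while libnvme walks the scatter/gather list.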
	int32 iovec_i;
	uint32 iovec_offset;
};


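// libnvme SGL callbacks: ior_reset_sgl() repositions the cursor at a byte
// offset into the request's payload, and ior_next_sge() hands back the next
// physical segment starting at that position.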
static void
ior_reset_sgl(nvme_io_request* request, uint32_t offset)
{
	TRACE("IOR Reset: %" B_PRIu32 "\n", offset);

	int32 i = 0;
	while (offset > 0 && request->iovecs[i].size <= offset) {
		offset -= request->iovecs[i].size;
		i++;
	}
	request->iovec_i = i;
	request->iovec_offset = offset;
}


static int
ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length)
{
	int32 index = request->iovec_i;
	if (index < 0 || index >= request->iovec_count)
		return -1;

	*address = request->iovecs[index].address + request->iovec_offset;
	*length = request->iovecs[index].size - request->iovec_offset;

	TRACE("IOV %d (+ %" B_PRIu32 "): 0x%" B_PRIx64 ", %" B_PRIu32 "\n",
		request->iovec_i, request->iovec_offset, *address, *length);

	request->iovec_i++;
	request->iovec_offset = 0;
	return 0;
}


static status_t
do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request)
{
	request->status = EINPROGRESS;

	qpair_info* qpinfo = get_qpair(info);
	int ret = -1;
	if (request->write) {
		ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	} else {
		ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	}
	if (ret != 0) {
		TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
		return ret;
	}

	await_status(info, qpinfo->qpair, request->status);

	if (request->status != B_OK) {
		TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
	}
	return request->status;
}


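// Fallback path for requests that violate the controller's alignment or
// rounding restrictions: the DMAResource copies data through page-aligned
// bounce buffers, one IOOperation at a time. Bounced writes are
// read-modify-write cycles on partial blocks, so they take the
// rounded_write_lock exclusively to keep plain writes (which take it shared
// in nvme_disk_io()) from racing with them.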
static status_t
nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request)
{
	CALLED();

	WriteLocker writeLocker;
	if (request->IsWrite())
		writeLocker.SetTo(handle->info->rounded_write_lock, false);

	status_t status = acquire_sem(handle->info->dma_buffers_sem);
	if (status != B_OK) {
		request->SetStatusAndNotify(status);
		return status;
	}

	const size_t block_size = handle->info->block_size;

	TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR
		"; Write %s\n", request, request->Offset(), request->Length(),
		request->IsWrite() ? "yes" : "no");

	nvme_io_request nvme_request;
	while (request->RemainingBytes() > 0) {
		IOOperation operation;
		status = handle->info->dma_resource.TranslateNext(request, &operation, 0);
		if (status != B_OK)
			break;

		size_t transferredBytes = 0;
		do {
			TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR
				", write: %s\n", request, operation.Offset(),
				operation.Length(), operation.IsWrite() ? "yes" : "no");

			nvme_request.write = operation.IsWrite();
			nvme_request.lba_start = operation.Offset() / block_size;
			nvme_request.lba_count = operation.Length() / block_size;
			nvme_request.iovecs = (physical_entry*)operation.Vecs();
			nvme_request.iovec_count = operation.VecCount();

			status = do_nvme_io_request(handle->info, &nvme_request);
			if (status == B_OK && nvme_request.write == request->IsWrite())
				transferredBytes += operation.OriginalLength();

			operation.SetStatus(status);
		} while (status == B_OK && !operation.Finish());

		if (status == B_OK && operation.Status() != B_OK) {
			TRACE_ERROR("I/O succeeded but IOOperation failed!\n");
			status = operation.Status();
		}

		operation.SetTransferredBytes(transferredBytes);
		request->OperationFinished(&operation, status, status != B_OK,
			operation.OriginalOffset() + transferredBytes);

		handle->info->dma_resource.RecycleBuffer(operation.Buffer());

		TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request,
			strerror(status), request->RemainingBytes());
		if (status != B_OK)
			break;
	}

	release_sem(handle->info->dma_buffers_sem);

	// Notify() also takes care of UnlockMemory().
	if (status != B_OK && request->Status() == B_OK)
		request->SetStatusAndNotify(status);
	else
		request->NotifyFinished();
	return status;
}


static status_t
nvme_disk_io(void* cookie, io_request* request)
{
	CALLED();

	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	nvme_io_request nvme_request;
	memset(&nvme_request, 0, sizeof(nvme_io_request));

	nvme_request.write = request->IsWrite();

	physical_entry* vtophys = NULL;
	MemoryDeleter vtophysDeleter;

	IOBuffer* buffer = request->Buffer();
	status_t status = B_OK;
	if (!buffer->IsPhysical()) {
		status = buffer->LockMemory(request->TeamID(), request->IsWrite());
		if (status != B_OK) {
			TRACE_ERROR("failed to lock memory: %s\n", strerror(status));
			return status;
		}
		// SetStatusAndNotify() takes care of unlocking memory if necessary.

		// This is slightly inefficient, as we could use a BStackOrHeapArray in
		// the optimal case (few physical entries required), but we would not
		// know whether or not that was possible until calling get_memory_map()
		// and then potentially reallocating, which would complicate the logic.

		int32 vtophys_length = (request->Length() / B_PAGE_SIZE) + 2;
		nvme_request.iovecs = vtophys = (physical_entry*)malloc(sizeof(physical_entry)
			* vtophys_length);
		if (vtophys == NULL) {
			TRACE_ERROR("failed to allocate memory for iovecs\n");
			request->SetStatusAndNotify(B_NO_MEMORY);
			return B_NO_MEMORY;
		}
		vtophysDeleter.SetTo(vtophys);

		for (size_t i = 0; i < buffer->VecCount(); i++) {
			generic_io_vec virt = buffer->VecAt(i);
			uint32 entries = vtophys_length - nvme_request.iovec_count;

			// Avoid copies by going straight into the vtophys array.
			status = get_memory_map_etc(request->TeamID(), (void*)virt.base,
				virt.length, vtophys + nvme_request.iovec_count, &entries);
			if (status == B_BUFFER_OVERFLOW) {
				TRACE("vtophys array was too small, reallocating\n");

				vtophysDeleter.Detach();
				vtophys_length *= 2;
				physical_entry* newVtophys = (physical_entry*)realloc(vtophys,
					sizeof(physical_entry) * vtophys_length);
				if (newVtophys == NULL) {
					// realloc() failed, so reattach the old array to the
					// deleter; it will be freed on the error return below.
					vtophysDeleter.SetTo(vtophys);
					status = B_NO_MEMORY;
				} else {
					nvme_request.iovecs = vtophys = newVtophys;
					vtophysDeleter.SetTo(vtophys);
					// Try again, with the larger buffer this time.
					i--;
					continue;
				}
			}
			if (status != B_OK) {
				TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status));
				request->SetStatusAndNotify(status);
				return status;
			}

			nvme_request.iovec_count += entries;
		}
	} else {
		nvme_request.iovecs = (physical_entry*)buffer->Vecs();
		nvme_request.iovec_count = buffer->VecCount();
	}

	// See if we need to bounce anything other than the first or last vec.
	const size_t block_size = handle->info->block_size;
	bool bounceAll = false;
	for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) {
		if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0)
			bounceAll = true;
		if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0)
			bounceAll = true;
	}

	// See if we need to bounce due to the first or last vecs.
	if (nvme_request.iovec_count > 1) {
		// There are middle vecs, so the first and last vecs have different restrictions: they
		// need only be a multiple of the block size, and must end and start on a page boundary,
		// respectively, though the start address must always be 32-bit-aligned.
		physical_entry* entry = &nvme_request.iovecs[0];
		if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0
				|| (entry->address & 0x3) != 0 || (entry->size % block_size) != 0))
			bounceAll = true;

		entry = &nvme_request.iovecs[nvme_request.iovec_count - 1];
		if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0
				|| (entry->size % block_size) != 0))
			bounceAll = true;
	} else {
		// There is only one vec. Check that it is a multiple of the block size,
		// and that its address is 32-bit-aligned.
		physical_entry* entry = &nvme_request.iovecs[0];
		if (!bounceAll && ((entry->address & 0x3) != 0 || (entry->size % block_size) != 0))
			bounceAll = true;
	}

	// See if we need to bounce due to rounding.
	const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size);
	phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset()
		- rounded_pos), block_size);
	if (rounded_pos != request->Offset() || rounded_len != request->Length())
		bounceAll = true;

	if (bounceAll) {
		// Let the bounced I/O routine take care of everything from here.
		return nvme_disk_bounced_io(handle, request);
	}

	nvme_request.lba_start = rounded_pos / block_size;
	nvme_request.lba_count = rounded_len / block_size;

	// No bouncing was required.
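	// Plain writes take the rounded-write lock shared: they may run
	// concurrently with each other, but not while a bounced read-modify-write
	// cycle holds the lock exclusively.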
	ReadLocker readLocker;
	if (nvme_request.write)
		readLocker.SetTo(handle->info->rounded_write_lock, false);

	// Error check before actually doing I/O.
	if (status != B_OK) {
		TRACE_ERROR("I/O failed early: %s\n", strerror(status));
		request->SetStatusAndNotify(status);
		return status;
	}

	const uint32 max_io_blocks = handle->info->max_io_blocks;
	int32 remaining = nvme_request.iovec_count;
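	// Submit the request in chunks: each pass queues at most half the
	// controller's SGL descriptors and at most max_io_blocks blocks.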
	while (remaining > 0) {
		nvme_request.iovec_count = min_c(remaining,
			NVME_MAX_SGL_DESCRIPTORS / 2);

		nvme_request.lba_count = 0;
		for (int i = 0; i < nvme_request.iovec_count; i++) {
			uint32 new_lba_count = nvme_request.lba_count
				+ (nvme_request.iovecs[i].size / block_size);
			if (nvme_request.lba_count > 0 && new_lba_count > max_io_blocks) {
				// We already have a nonzero length, and adding this vec would
				// make us go over (or we already are over.) Stop adding.
				nvme_request.iovec_count = i;
				break;
			}

			nvme_request.lba_count = new_lba_count;
		}

		status = do_nvme_io_request(handle->info, &nvme_request);
		if (status != B_OK)
			break;

		nvme_request.iovecs += nvme_request.iovec_count;
		remaining -= nvme_request.iovec_count;
		nvme_request.lba_start += nvme_request.lba_count;
	}

	if (status != B_OK)
		TRACE_ERROR("I/O failed: %s\n", strerror(status));

	request->SetTransferredBytes(status != B_OK,
		(nvme_request.lba_start * block_size) - rounded_pos);
	request->SetStatusAndNotify(status);
	return status;
}


static status_t
nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	const off_t end = (handle->info->capacity * handle->info->block_size);
	if (pos >= end)
		return B_BAD_VALUE;
	if (pos + (off_t)*length > end)
		*length = end - pos;

	IORequest request;
	status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0);
	if (status != B_OK)
		return status;

	status = nvme_disk_io(handle, &request);
	*length = request.TransferredBytes();
	return status;
}


static status_t
nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	const off_t end = (handle->info->capacity * handle->info->block_size);
	if (pos >= end)
		return B_BAD_VALUE;
	if (pos + (off_t)*length > end)
		*length = end - pos;

	IORequest request;
	status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0);
	if (status != B_OK)
		return status;

	status = nvme_disk_io(handle, &request);
	*length = request.TransferredBytes();
	return status;
}


static status_t
nvme_disk_flush(nvme_disk_driver_info* info)
{
	CALLED();
	status_t status = EINPROGRESS;

	qpair_info* qpinfo = get_qpair(info);
	int ret = nvme_ns_flush(info->ns, qpinfo->qpair,
		(nvme_cmd_cb)io_finished_callback, &status);
	if (ret != 0)
		return ret;

	await_status(info, qpinfo->qpair, status);
	return status;
}


static status_t
nvme_disk_trim(nvme_disk_driver_info* info, fs_trim_data* trimData)
{
	CALLED();
	trimData->trimmed_size = 0;

	const off_t deviceSize = info->capacity * info->block_size; // in bytes
	if (deviceSize < 0)
		return B_BAD_VALUE;

	STATIC_ASSERT(sizeof(deviceSize) <= sizeof(uint64));
	ASSERT(deviceSize >= 0);

	// Do not trim past device end.
	for (uint32 i = 0; i < trimData->range_count; i++) {
		uint64 offset = trimData->ranges[i].offset;
		uint64& size = trimData->ranges[i].size;

		if (offset >= (uint64)deviceSize)
			return B_BAD_VALUE;
		size = std::min(size, (uint64)deviceSize - offset);
	}

	// We need contiguous memory for the DSM ranges.
	nvme_dsm_range* dsmRanges = (nvme_dsm_range*)nvme_mem_alloc_node(
		trimData->range_count * sizeof(nvme_dsm_range), 0, 0, NULL);
	if (dsmRanges == NULL)
		return B_NO_MEMORY;
	CObjectDeleter<void, void, nvme_free> dsmRangesDeleter(dsmRanges);

	uint64 trimmingSize = 0;
	uint32 dsmRangeCount = 0;
	for (uint32 i = 0; i < trimData->range_count; i++) {
		uint64 offset = trimData->ranges[i].offset;
		uint64 length = trimData->ranges[i].size;

		// Round up offset and length to the block size.
		// (Some space at the beginning and end may thus not be trimmed.)
		offset = ROUNDUP(offset, info->block_size);
		length -= offset - trimData->ranges[i].offset;
		length = ROUNDDOWN(length, info->block_size);

		if (length == 0)
			continue;
		if ((length / info->block_size) > UINT32_MAX)
			length = uint64(UINT32_MAX) * info->block_size;
			// TODO: Break into smaller trim ranges!

		TRACE("trim %" B_PRIu64 " bytes from %" B_PRIu64 "\n", length, offset);

		// Only fill in ranges that survived the rounding above, so that no
		// uninitialized entries are submitted for skipped ranges.
		dsmRanges[dsmRangeCount].attributes = 0;
		dsmRanges[dsmRangeCount].length = length / info->block_size;
		dsmRanges[dsmRangeCount].starting_lba = offset / info->block_size;
		dsmRangeCount++;

		trimmingSize += length;
			// trimmed_size is reported in bytes, not blocks.
	}
	if (dsmRangeCount == 0)
		return B_OK;

	status_t status = EINPROGRESS;
	qpair_info* qpair = get_qpair(info);
	if (nvme_ns_deallocate(info->ns, qpair->qpair, dsmRanges, dsmRangeCount,
			(nvme_cmd_cb)io_finished_callback, &status) != 0)
		return B_IO_ERROR;

	await_status(info, qpair->qpair, status);
	if (status != B_OK)
		return status;

	trimData->trimmed_size = trimmingSize;
	return B_OK;
}


static status_t
nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	nvme_disk_driver_info* info = handle->info;

	TRACE("ioctl(op = %" B_PRId32 ")\n", op);

	switch (op) {
		case B_GET_MEDIA_STATUS:
		{
			// "buffer" may be a userland address, so copy rather than
			// dereference it directly (as the other cases below do).
			status_t mediaStatus = info->media_status;
			info->media_status = B_OK;
			return user_memcpy(buffer, &mediaStatus, sizeof(status_t));
		}

		case B_GET_DEVICE_SIZE:
		{
			size_t size = info->capacity * info->block_size;
			return user_memcpy(buffer, &size, sizeof(size_t));
		}

		case B_GET_GEOMETRY:
		{
			if (buffer == NULL || length > sizeof(device_geometry))
				return B_BAD_VALUE;

			device_geometry geometry;
			status_t status = get_geometry(handle, &geometry);
			if (status != B_OK)
				return status;

			return user_memcpy(buffer, &geometry, length);
		}

		case B_GET_ICON_NAME:
			return user_strlcpy((char*)buffer, "devices/drive-harddisk",
				B_FILE_NAME_LENGTH);

		case B_GET_VECTOR_ICON:
		{
			device_icon iconData;
			if (length != sizeof(device_icon))
				return B_BAD_VALUE;
			if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK)
				return B_BAD_ADDRESS;

			if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) {
				if (user_memcpy(iconData.icon_data, kDriveIcon,
						sizeof(kDriveIcon)) != B_OK)
					return B_BAD_ADDRESS;
			}

			iconData.icon_size = sizeof(kDriveIcon);
			return user_memcpy(buffer, &iconData, sizeof(device_icon));
		}

		case B_FLUSH_DRIVE_CACHE:
			return nvme_disk_flush(info);

		case B_TRIM_DEVICE:
			ASSERT(IS_KERNEL_ADDRESS(buffer));
			return nvme_disk_trim(info, (fs_trim_data*)buffer);
	}

	return B_DEV_INVALID_IOCTL;
}


//	#pragma mark - driver module API


static float
nvme_disk_supports_device(device_node *parent)
{
	CALLED();

	const char* bus;
	uint16 baseClass, subClass;

	if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false) != B_OK
		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass, false) != B_OK
		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE, &subClass, false) != B_OK)
		return -1.0f;

	if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage)
		return 0.0f;

	if (subClass != PCI_nvm)
		return 0.0f;

	TRACE("NVMe device found!\n");
	return 1.0f;
}


static status_t
nvme_disk_register_device(device_node* parent)
{
	CALLED();

	device_attr attrs[] = {
		{ B_DEVICE_PRETTY_NAME, B_STRING_TYPE, { .string = "NVMe Disk" } },
		{ NULL }
	};

	return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME,
		attrs, NULL, NULL);
}


static status_t
nvme_disk_init_driver(device_node* node, void** cookie)
{
	CALLED();

	int ret = nvme_lib_init((enum nvme_log_level)0, (enum nvme_log_facility)0, NULL);
	if (ret != 0) {
		TRACE_ERROR("libnvme initialization failed!\n");
		return ret;
	}

	nvme_disk_driver_info* info = new nvme_disk_driver_info;
	if (info == NULL)
		return B_NO_MEMORY;

	info->media_status = B_OK;
	info->node = node;

	info->ctrlr = NULL;

	*cookie = info;
	return B_OK;
}


static void
nvme_disk_uninit_driver(void* _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
	delete info;
		// not free(): "info" was allocated with new in nvme_disk_init_driver().
}


static status_t
nvme_disk_register_child_devices(void* _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
	status_t status;

	int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR);
	if (id < 0)
		return id;

	char name[64];
	snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw",
		id);

	status = sDeviceManager->publish_device(info->node, name,
		NVME_DISK_DEVICE_MODULE_NAME);

	return status;
}


//	#pragma mark -


module_dependency module_dependencies[] = {
	{ B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager },
	{ NULL }
};

struct device_module_info sNvmeDiskDevice = {
	{
		NVME_DISK_DEVICE_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_init_device,
	nvme_disk_uninit_device,
	NULL, // remove,

	nvme_disk_open,
	nvme_disk_close,
	nvme_disk_free,
	nvme_disk_read,
	nvme_disk_write,
	nvme_disk_io,
	nvme_disk_ioctl,

	NULL,	// select
	NULL,	// deselect
};

struct driver_module_info sNvmeDiskDriver = {
	{
		NVME_DISK_DRIVER_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_supports_device,
	nvme_disk_register_device,
	nvme_disk_init_driver,
	nvme_disk_uninit_driver,
	nvme_disk_register_child_devices,
	NULL,	// rescan
	NULL,	// removed
};

module_info* modules[] = {
	(module_info*)&sNvmeDiskDriver,
	(module_info*)&sNvmeDiskDevice,
	NULL
};