xref: /haiku/src/add-ons/kernel/drivers/disk/nvme/nvme_disk.cpp (revision cbe0a0c436162d78cc3f92a305b64918c839d079)
1 /*
2  * Copyright 2019-2020, Haiku, Inc. All rights reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Augustin Cavalier <waddlesplash>
7  */
8 
9 
10 #include <stdio.h>
11 #include <stdlib.h>
12 
13 #include <algorithm>
14 #include <condition_variable.h>
15 #include <AutoDeleter.h>
16 #include <kernel.h>
17 #include <util/AutoLock.h>
18 
19 #include <fs/devfs.h>
20 #include <bus/PCI.h>
21 #include <PCI_x86.h>
22 #include <vm/vm.h>
23 
24 #include "IORequest.h"
25 
26 extern "C" {
27 #include <libnvme/nvme.h>
28 #include <libnvme/nvme_internal.h>
29 }
30 
31 
// Tracing helpers: TRACE() is compiled out unless TRACE_NVME_DISK is
// defined; TRACE_ALWAYS()/TRACE_ERROR() always log via dprintf().
//#define TRACE_NVME_DISK
#ifdef TRACE_NVME_DISK
#	define TRACE(x...) dprintf("nvme_disk: " x)
#else
#	define TRACE(x...) ;
#endif
#define TRACE_ALWAYS(x...)	dprintf("nvme_disk: " x)
#define TRACE_ERROR(x...)	dprintf("\33[33mnvme_disk:\33[0m " x)
	// yellow prefix to make errors stand out in the syslog
#define CALLED() 			TRACE("CALLED %s\n", __PRETTY_FUNCTION__)
41 
42 
// Raw vector icon data for the disk device (the first four bytes spell
// "ncif", the Haiku vector icon magic); copied out to userland by the
// B_GET_VECTOR_ICON ioctl below.
static const uint8 kDriveIcon[] = {
	0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16,
	0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39,
	0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02,
	0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01,
	0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47,
	0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f,
	0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0,
	0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38,
	0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48,
	0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2,
	0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80,
	0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a,
	0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39,
	0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a,
	0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27,
	0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a,
	0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08,
	0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17,
	0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02,
	0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01,
	0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99,
	0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2,
	0xe6, 0x01, 0x17, 0x82, 0x00, 0x04
};
68 
69 
// Module names registered with the device manager, and the ID generator
// used to number published /dev/disk/nvme/N/raw entries.
#define NVME_DISK_DRIVER_MODULE_NAME 	"drivers/disk/nvme_disk/driver_v1"
#define NVME_DISK_DEVICE_MODULE_NAME 	"drivers/disk/nvme_disk/device_v1"
#define NVME_DISK_DEVICE_ID_GENERATOR	"nvme_disk/device_id"

// Upper bound on the number of I/O queue pairs used per controller.
#define NVME_MAX_QPAIRS					(8)


static device_manager_info* sDeviceManager;
	// resolved through module_dependencies at load time
static pci_x86_module_info* sPCIx86Module;
	// optional MSI support; left NULL when the module is unavailable
79 
// Driver state for one NVMe controller (currently exactly one published
// disk device, backed by the controller's first namespace).
typedef struct {
	device_node*			node;		// our node in the device manager tree
	pci_info				info;		// cached PCI configuration of the controller

	struct nvme_ctrlr*		ctrlr;		// libnvme controller handle

	struct nvme_ns*			ns;			// first namespace (see init_device TODO)
	uint64					capacity;	// namespace size in blocks
	uint32					block_size;	// block (sector) size in bytes
	uint32					max_io_blocks;	// max blocks per single command
	status_t				media_status;	// sticky status for B_GET_MEDIA_STATUS

	// I/O queue pairs, picked round-robin via next_qpair (see get_qpair())
	struct qpair_info {
		struct nvme_qpair*	qpair;
	}						qpairs[NVME_MAX_QPAIRS];
	uint32					qpair_count;	// how many entries of qpairs[] are valid
	uint32					next_qpair;		// round-robin cursor (atomic_add'ed)

	// bounce buffers for transfers that violate alignment/rounding rules;
	// the semaphore counts free buffers
	DMAResource				dma_resource;
	sem_id					dma_buffers_sem;

	// held for writing while a rounded (bounced) write is in flight, for
	// reading during direct writes, so the two cannot interleave
	rw_lock					rounded_write_lock;

	// notified by nvme_interrupt_handler() on every interrupt; slept on
	// in await_status()
	ConditionVariable		interrupt;
} nvme_disk_driver_info;
typedef nvme_disk_driver_info::qpair_info qpair_info;
106 
107 
// Per-open cookie; currently just a back-reference to the driver state.
typedef struct {
	nvme_disk_driver_info*		info;
} nvme_disk_handle;
112 
113 static status_t
114 get_geometry(nvme_disk_handle* handle, device_geometry* geometry)
115 {
116 	nvme_disk_driver_info* info = handle->info;
117 
118 	devfs_compute_geometry_size(geometry, info->capacity, info->block_size);
119 
120 	geometry->device_type = B_DISK;
121 	geometry->removable = false;
122 
123 	geometry->read_only = false;
124 	geometry->write_once = false;
125 
126 	TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %d, %d, %d, %d\n",
127 		geometry->bytes_per_sector, geometry->sectors_per_track,
128 		geometry->cylinder_count, geometry->head_count, geometry->device_type,
129 		geometry->removable, geometry->read_only, geometry->write_once);
130 
131 	return B_OK;
132 }
133 
134 
135 static int
136 log2(uint32 x)
137 {
138 	int y;
139 
140 	for (y = 31; y >= 0; --y) {
141 		if (x == ((uint32)1 << y))
142 			break;
143 	}
144 
145 	return y;
146 }
147 
148 
149 static void
150 nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity,
151 	uint32 blockSize)
152 {
153 	TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %" B_PRIu32 ")\n",
154 		info, capacity, blockSize);
155 
156 	// get log2, if possible
157 	uint32 blockShift = log2(blockSize);
158 
159 	if ((1UL << blockShift) != blockSize)
160 		blockShift = 0;
161 
162 	info->capacity = capacity;
163 	info->block_size = blockSize;
164 }
165 
166 
167 //	#pragma mark - device module API
168 
169 
170 static int32 nvme_interrupt_handler(void* _info);
171 
172 
/*!	Initializes a published NVMe disk device: fetches the PCI info, opens
	the controller and its first namespace through libnvme, allocates I/O
	queue pairs and the bounce-buffer DMA resource, and installs the (MSI
	or legacy) interrupt handler.
*/
static status_t
nvme_disk_init_device(void* _info, void** _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;

	pci_device_module_info* pci;
	pci_device* pcidev;
	device_node* parent = sDeviceManager->get_parent_node(info->node);
	sDeviceManager->get_driver(parent, (driver_module_info**)&pci,
		(void**)&pcidev);
	pci->get_pci_info(pcidev, &info->info);
	sDeviceManager->put_node(parent);

	// construct the libnvme pci_device struct
	// NOTE(review): this allocation is never stored in "info" and never
	// freed -- it leaks on every error path below and on uninit.
	// Presumably libnvme keeps referencing it while the controller is
	// open; verify and plug the leak (see also "TODO: Deallocate PCI").
	pci_device* device = new pci_device;
	device->vendor_id = info->info.vendor_id;
	device->device_id = info->info.device_id;
	device->subvendor_id = 0;
	device->subdevice_id = 0;

	device->domain = 0;
	device->bus = info->info.bus;
	device->dev = info->info.device;
	device->func = info->info.function;

	device->pci_info = &info->info;

	// enable busmaster and memory mapped access
	uint16 command = pci->read_pci_config(pcidev, PCI_command, 2);
	command |= PCI_command_master | PCI_command_memory;
	pci->write_pci_config(pcidev, PCI_command, 2, command);

	// open the controller
	info->ctrlr = nvme_ctrlr_open(device, NULL);
	if (info->ctrlr == NULL) {
		TRACE_ERROR("failed to open the controller!\n");
		return B_ERROR;
	}

	struct nvme_ctrlr_stat cstat;
	int err = nvme_ctrlr_stat(info->ctrlr, &cstat);
	if (err != 0) {
		TRACE_ERROR("failed to get controller information!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn);
	TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size);
	TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs);

	// TODO: export more than just the first namespace!
	info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]);
	if (info->ns == NULL) {
		TRACE_ERROR("failed to open namespace!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_ERROR;
	}

	struct nvme_ns_stat nsstat;
	err = nvme_ns_stat(info->ns, &nsstat);
	if (err != 0) {
		TRACE_ERROR("failed to get namespace information!\n");
		// NOTE(review): info->ns is not closed on this or any later error
		// path -- confirm whether nvme_ctrlr_close() tears namespaces down.
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	// store capacity information
	nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size);

	TRACE("capacity: %" B_PRIu64 ", block_size %" B_PRIu32 "\n",
		info->capacity, info->block_size);

	// allocate qpairs
	info->qpair_count = info->next_qpair = 0;
	for (uint32 i = 0; i < NVME_MAX_QPAIRS && i < cstat.io_qpairs; i++) {
		info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr,
			(enum nvme_qprio)0, 0);
		if (info->qpairs[i].qpair == NULL)
			break;

		info->qpair_count++;
	}
	if (info->qpair_count == 0) {
		TRACE_ERROR("failed to allocate qpairs!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_NO_MEMORY;
	}

	// allocate DMA buffers
	// Two bounce buffers per qpair, so bounced I/O can overlap somewhat.
	int buffers = info->qpair_count * 2;

	dma_restrictions restrictions = {};
	restrictions.alignment = B_PAGE_SIZE;
		// Technically, the first and last segments in a transfer can be
		// unaligned, and the rest only need to have sizes that are a multiple
		// of the block size.
	restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2);
	restrictions.max_transfer_size = cstat.max_xfer_size;
	info->max_io_blocks = cstat.max_xfer_size / nsstat.sector_size;

	err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers);
	if (err != 0) {
		TRACE_ERROR("failed to initialize DMA resource!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem");
	if (info->dma_buffers_sem < 0) {
		TRACE_ERROR("failed to create DMA buffers semaphore!\n");
		nvme_ctrlr_close(info->ctrlr);
		return info->dma_buffers_sem;
	}

	// set up rounded-write lock
	rw_lock_init(&info->rounded_write_lock, "nvme rounded writes");

	// set up interrupt
	if (get_module(B_PCI_X86_MODULE_NAME, (module_info**)&sPCIx86Module)
			!= B_OK) {
		sPCIx86Module = NULL;
	}

	// re-enable interrupts on the device (clear the INTx disable bit)
	command = pci->read_pci_config(pcidev, PCI_command, 2);
	command &= ~(PCI_command_int_disable);
	pci->write_pci_config(pcidev, PCI_command, 2, command);

	// Prefer MSI when available; fall back to the legacy interrupt line.
	uint8 irq = info->info.u.h0.interrupt_line;
	if (sPCIx86Module != NULL) {
#if 0
		if (sPCIx86Module->get_msix_count(info->info.bus, info->info.device,
				info->info.function)) {
			uint8 msixVector = 0;
			if (sPCIx86Module->configure_msix(info->info.bus, info->info.device,
					info->info.function, 1, &msixVector) == B_OK
				&& sPCIx86Module->enable_msix(info->info.bus, info->info.device,
					info->info.function) == B_OK) {
				TRACE_ALWAYS("using MSI-X\n");
				irq = msixVector;
			}
		} else
#endif
		if (sPCIx86Module->get_msi_count(info->info.bus,
				info->info.device, info->info.function) >= 1) {
			uint8 msiVector = 0;
			if (sPCIx86Module->configure_msi(info->info.bus, info->info.device,
					info->info.function, 1, &msiVector) == B_OK
				&& sPCIx86Module->enable_msi(info->info.bus, info->info.device,
					info->info.function) == B_OK) {
				TRACE_ALWAYS("using message signaled interrupts\n");
				irq = msiVector;
			}
		}
	}

	if (irq == 0 || irq == 0xFF) {
		TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n",
			info->info.bus, info->info.device, info->info.function);
		// NOTE(review): unlike earlier failure paths, this one leaves the
		// controller open and the DMA resource/semaphore/lock allocated.
		return B_ERROR;
	}
	info->interrupt.Init(NULL, NULL);
	install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info, B_NO_HANDLED_INFO);

	// Batch completions: up to 32 per interrupt, or every 100us.
	if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) {
		uint32 microseconds = 16, threshold = 32;
		nvme_admin_set_feature(info->ctrlr, false, NVME_FEAT_INTERRUPT_COALESCING,
			((microseconds / 100) << 8) | threshold, 0, NULL);
	}

	*_cookie = info;
	return B_OK;
}
347 
348 
/*!	Tears down what nvme_disk_init_device() set up: removes the interrupt
	handler, destroys the rounded-write lock, and closes the namespace and
	controller.
*/
static void
nvme_disk_uninit_device(void* _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;

	// NOTE(review): if MSI was enabled in init_device, the installed
	// vector may differ from interrupt_line, in which case this call does
	// not match the handler that was installed -- verify against
	// init_device's irq selection.
	remove_io_interrupt_handler(info->info.u.h0.interrupt_line,
		nvme_interrupt_handler, (void*)info);

	rw_lock_destroy(&info->rounded_write_lock);

	nvme_ns_close(info->ns);
	nvme_ctrlr_close(info->ctrlr);

	// TODO: Deallocate MSI(-X).
	// TODO: Deallocate PCI.
}
366 
367 
368 static status_t
369 nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie)
370 {
371 	CALLED();
372 
373 	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
374 	nvme_disk_handle* handle = (nvme_disk_handle*)malloc(
375 		sizeof(nvme_disk_handle));
376 	if (handle == NULL)
377 		return B_NO_MEMORY;
378 
379 	handle->info = info;
380 
381 	*_cookie = handle;
382 	return B_OK;
383 }
384 
385 
386 static status_t
387 nvme_disk_close(void* cookie)
388 {
389 	CALLED();
390 
391 	//nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
392 	return B_OK;
393 }
394 
395 
396 static status_t
397 nvme_disk_free(void* cookie)
398 {
399 	CALLED();
400 
401 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
402 	free(handle);
403 	return B_OK;
404 }
405 
406 
407 // #pragma mark - I/O
408 
409 
/*!	Interrupt handler: wakes every waiter in await_status(). The handler is
	installed with B_NO_HANDLED_INFO (see init_device), so the return value
	does not report whether the interrupt was ours.
*/
static int32
nvme_interrupt_handler(void* _info)
{
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	info->interrupt.NotifyAll();
	return 0;
}
417 
418 
419 static qpair_info*
420 get_qpair(nvme_disk_driver_info* info)
421 {
422 	return &info->qpairs[atomic_add((int32*)&info->next_qpair, 1)
423 		% info->qpair_count];
424 }
425 
426 
427 static void
428 io_finished_callback(status_t* status, const struct nvme_cpl* cpl)
429 {
430 	*status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK;
431 }
432 
433 
/*!	Blocks until \a status (written by io_finished_callback()) leaves
	EINPROGRESS, polling the qpair and sleeping on the shared interrupt
	condition variable in between. After four consecutive 5-second
	timeouts the qpair is failed and status forced to B_TIMED_OUT.
*/
static void
await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status)
{
	CALLED();

	ConditionVariableEntry entry;
	int timeouts = 0;
	while (status == EINPROGRESS) {
		// Register on the condition variable *before* polling, so a
		// completion interrupt arriving in between cannot be missed.
		info->interrupt.Add(&entry);

		nvme_qpair_poll(qpair, 0);

		if (status != EINPROGRESS)
			return;

		if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) {
			// This should never happen, as we are woken up on every interrupt
			// no matter the qpair or transfer within; so if it does occur,
			// that probably means the controller stalled or something.

			TRACE_ERROR("timed out waiting for interrupt!\n");
			if (timeouts++ >= 3) {
				nvme_qpair_fail(qpair);
				status = B_TIMED_OUT;
				return;
			}
		}

		nvme_qpair_poll(qpair, 0);
	}
}
465 
466 
// In-flight transfer description handed to libnvme's SGL callbacks
// (ior_reset_sgl()/ior_next_sge()).
struct nvme_io_request {
	status_t status;		// EINPROGRESS until io_finished_callback() runs

	bool write;				// true for a write, false for a read

	off_t lba_start;		// first logical block of the transfer
	size_t lba_count;		// number of logical blocks

	physical_entry* iovecs;	// physical segments backing the transfer
	int32 iovec_count;		// number of entries in iovecs

	int32 iovec_i;			// SGL cursor: current vec index
	uint32 iovec_offset;	// SGL cursor: byte offset within current vec
};
481 
482 
483 void ior_reset_sgl(nvme_io_request* request, uint32_t offset)
484 {
485 	TRACE("IOR Reset: %" B_PRIu32 "\n", offset);
486 
487 	int32 i = 0;
488 	while (offset > 0 && request->iovecs[i].size <= offset) {
489 		offset -= request->iovecs[i].size;
490 		i++;
491 	}
492 	request->iovec_i = i;
493 	request->iovec_offset = offset;
494 }
495 
496 
497 int ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length)
498 {
499 	int32 index = request->iovec_i;
500 	if (index < 0 || index > request->iovec_count)
501 		return -1;
502 
503 	*address = request->iovecs[index].address + request->iovec_offset;
504 	*length = request->iovecs[index].size - request->iovec_offset;
505 
506 	TRACE("IOV %d (+ " B_PRIu32 "): 0x%" B_PRIx64 ", %" B_PRIu32 "\n",
507 		request->iovec_i, request->iovec_offset, *address, *length);
508 
509 	request->iovec_i++;
510 	request->iovec_offset = 0;
511 	return 0;
512 }
513 
514 
/*!	Submits the read or write described by \a request on a round-robin
	qpair and waits for completion. Returns B_OK, B_IO_ERROR/B_TIMED_OUT
	from the completion path, or the raw libnvme submission error.
	On any failure, request->lba_count is zeroed so that callers can
	compute how many blocks actually transferred.
*/
static status_t
do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request)
{
	request->status = EINPROGRESS;

	qpair_info* qpinfo = get_qpair(info);
	int ret = -1;
	if (request->write) {
		ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	} else {
		ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	}
	if (ret != 0) {
		TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
		// NOTE(review): "ret" is a libnvme error code returned as a
		// status_t -- confirm its values map to sensible Haiku errors.
		return ret;
	}

	await_status(info, qpinfo->qpair, request->status);

	if (request->status != B_OK) {
		TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
	}
	return request->status;
}
553 
554 
/*!	Performs \a request through the DMA bounce-buffer resource, one
	IOOperation at a time. Used when the request's physical layout or
	offset/length rounding cannot be handled directly (see nvme_disk_io()).
	Writes take the rounded-write lock exclusively so that the
	read-modify-write cycles of partial blocks cannot interleave with
	direct writes.
*/
static status_t
nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request)
{
	CALLED();

	WriteLocker writeLocker;
	if (request->IsWrite())
		writeLocker.SetTo(handle->info->rounded_write_lock, false);

	// Wait for a free bounce buffer before translating anything.
	status_t status = acquire_sem(handle->info->dma_buffers_sem);
	if (status != B_OK) {
		request->SetStatusAndNotify(status);
		return status;
	}

	const size_t block_size = handle->info->block_size;

	TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR
		"; Write %s\n", request, request->Offset(), request->Length(),
		request->IsWrite() ? "yes" : "no");

	nvme_io_request nvme_request;
	while (request->RemainingBytes() > 0) {
		IOOperation operation;
		status = handle->info->dma_resource.TranslateNext(request, &operation, 0);
		if (status != B_OK)
			break;

		size_t transferredBytes = 0;
		do {
			TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR
				", write: %s\n", request, operation.Offset(),
				operation.Length(), operation.IsWrite() ? "yes" : "no");

			nvme_request.write = operation.IsWrite();
			nvme_request.lba_start = operation.Offset() / block_size;
			nvme_request.lba_count = operation.Length() / block_size;
			nvme_request.iovecs = (physical_entry*)operation.Vecs();
			nvme_request.iovec_count = operation.VecCount();

			status = do_nvme_io_request(handle->info, &nvme_request);

			// Only count bytes toward the caller's transfer when this pass
			// went in the request's own direction (a rounded write first
			// issues a read pass to fill the bounce buffer).
			if (status == B_OK && nvme_request.write == request->IsWrite())
				transferredBytes += operation.OriginalLength();

			operation.SetStatus(status);
		} while (status == B_OK && !operation.Finish());

		if (status == B_OK && operation.Status() != B_OK) {
			TRACE_ERROR("I/O succeeded but IOOperation failed!\n");
			status = operation.Status();
		}

		operation.SetTransferredBytes(transferredBytes);
		request->OperationFinished(&operation, status, status != B_OK,
			operation.OriginalOffset() + transferredBytes);

		handle->info->dma_resource.RecycleBuffer(operation.Buffer());

		TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request,
			strerror(status), request->RemainingBytes());
		if (status != B_OK)
			break;
	}

	release_sem(handle->info->dma_buffers_sem);

	// Notify() also takes care of UnlockMemory().
	if (status != B_OK && request->Status() == B_OK)
		request->SetStatusAndNotify(status);
	else
		request->NotifyFinished();
	return status;
}
628 
629 
/*!	Main I/O hook: performs \a request directly via scatter/gather when the
	buffer's physical layout and the request's offset/length satisfy the
	controller's alignment rules; otherwise falls back to bounce-buffered
	I/O (nvme_disk_bounced_io()). Direct transfers are chunked to at most
	NVME_MAX_SGL_DESCRIPTORS / 2 vecs and max_io_blocks blocks each.
*/
static status_t
nvme_disk_io(void* cookie, io_request* request)
{
	CALLED();

	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	nvme_io_request nvme_request;
	memset(&nvme_request, 0, sizeof(nvme_io_request));

	nvme_request.write = request->IsWrite();

	physical_entry* vtophys = NULL;
	MemoryDeleter vtophysDeleter;

	IOBuffer* buffer = request->Buffer();
	status_t status = B_OK;
	if (!buffer->IsPhysical()) {
		// Virtual buffer: lock it and translate to physical entries.
		status = buffer->LockMemory(request->TeamID(), request->IsWrite());
		if (status != B_OK) {
			TRACE_ERROR("failed to lock memory: %s\n", strerror(status));
			return status;
		}
		// SetStatusAndNotify() takes care of unlocking memory if necessary.

		// This is slightly inefficient, as we could use a BStackOrHeapArray in
		// the optimal case (few physical entries required), but we would not
		// know whether or not that was possible until calling get_memory_map()
		// and then potentially reallocating, which would complicate the logic.

		int32 vtophys_length = (request->Length() / B_PAGE_SIZE) + 2;
		nvme_request.iovecs = vtophys = (physical_entry*)malloc(sizeof(physical_entry)
			* vtophys_length);
		if (vtophys == NULL) {
			TRACE_ERROR("failed to allocate memory for iovecs\n");
			request->SetStatusAndNotify(B_NO_MEMORY);
			return B_NO_MEMORY;
		}
		vtophysDeleter.SetTo(vtophys);

		for (size_t i = 0; i < buffer->VecCount(); i++) {
			generic_io_vec virt = buffer->VecAt(i);
			uint32 entries = vtophys_length - nvme_request.iovec_count;

			// Avoid copies by going straight into the vtophys array.
			status = get_memory_map_etc(request->TeamID(), (void*)virt.base,
				virt.length, vtophys + nvme_request.iovec_count, &entries);
			if (status == B_BUFFER_OVERFLOW) {
				TRACE("vtophys array was too small, reallocating\n");

				// NOTE(review): if realloc() fails here, the old buffer was
				// already detached from the deleter and its pointer is
				// overwritten with NULL, so it leaks; keep the old pointer
				// in a temporary instead.
				vtophysDeleter.Detach();
				vtophys_length *= 2;
				nvme_request.iovecs = vtophys = (physical_entry*)realloc(vtophys,
					sizeof(physical_entry) * vtophys_length);
				vtophysDeleter.SetTo(vtophys);
				if (vtophys == NULL) {
					status = B_NO_MEMORY;
				} else {
					// Try again, with the larger buffer this time.
					i--;
					continue;
				}
			}
			if (status != B_OK) {
				TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status));
				request->SetStatusAndNotify(status);
				return status;
			}

			nvme_request.iovec_count += entries;
		}
	} else {
		nvme_request.iovecs = (physical_entry*)buffer->Vecs();
		nvme_request.iovec_count = buffer->VecCount();
	}

	// See if we need to bounce anything other than the first or last vec.
	const size_t block_size = handle->info->block_size;
	bool bounceAll = false;
	for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) {
		if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0)
			bounceAll = true;
		if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0)
			bounceAll = true;
	}

	// See if we need to bounce due to the first or last vec (which, unlike middle vecs,
	// need only be a multiple of the block size, and must end and start on a page boundary,
	// respectively, though the start address must always be 32-bit-aligned.)
	if (nvme_request.iovec_count > 1) {
		physical_entry* entry = &nvme_request.iovecs[0];
		if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0
				|| (entry->address & 0x3) != 0 || (entry->size % block_size) != 0))
			bounceAll = true;

		entry = &nvme_request.iovecs[nvme_request.iovec_count - 1];
		if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0
				|| (entry->address & 0x3) != 0 || (entry->size % block_size) != 0))
			bounceAll = true;
	}

	// See if we need to bounce due to rounding.
	const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size);
	phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset()
		- rounded_pos), block_size);
	if (rounded_pos != request->Offset() || rounded_len != request->Length())
		bounceAll = true;

	if (bounceAll) {
		// Let the bounced I/O routine take care of everything from here.
		return nvme_disk_bounced_io(handle, request);
	}

	nvme_request.lba_start = rounded_pos / block_size;
	nvme_request.lba_count = rounded_len / block_size;

	// No bouncing was required.
	// Take the rounded-write lock shared, so direct writes can run in
	// parallel with each other but not with a bounced (rounded) write.
	ReadLocker readLocker;
	if (nvme_request.write)
		readLocker.SetTo(handle->info->rounded_write_lock, false);

	// Error check before actually doing I/O.
	if (status != B_OK) {
		TRACE_ERROR("I/O failed early: %s\n", strerror(status));
		request->SetStatusAndNotify(status);
		return status;
	}

	const uint32 max_io_blocks = handle->info->max_io_blocks;
	int32 remaining = nvme_request.iovec_count;
	while (remaining > 0) {
		// Limit each submission to the controller's SGL descriptor budget.
		nvme_request.iovec_count = min_c(remaining,
			NVME_MAX_SGL_DESCRIPTORS / 2);

		// Trim the vec count further so the chunk stays within
		// max_io_blocks (always keeping at least one vec).
		nvme_request.lba_count = 0;
		for (int i = 0; i < nvme_request.iovec_count; i++) {
			int32 new_lba_count = nvme_request.lba_count
				+ (nvme_request.iovecs[i].size / block_size);
			if (nvme_request.lba_count > 0 && new_lba_count > max_io_blocks) {
				// We already have a nonzero length, and adding this vec would
				// make us go over (or we already are over.) Stop adding.
				nvme_request.iovec_count = i;
				break;
			}

			nvme_request.lba_count = new_lba_count;
		}

		status = do_nvme_io_request(handle->info, &nvme_request);
		if (status != B_OK)
			break;

		nvme_request.iovecs += nvme_request.iovec_count;
		remaining -= nvme_request.iovec_count;
		nvme_request.lba_start += nvme_request.lba_count;
	}

	if (status != B_OK)
		TRACE_ERROR("I/O failed: %s\n", strerror(status));

	// On failure, lba_start has only advanced past completed chunks, so
	// this computes the bytes actually transferred.
	request->SetTransferredBytes(status != B_OK,
		(nvme_request.lba_start * block_size) - rounded_pos);
	request->SetStatusAndNotify(status);
	return status;
}
795 
796 
797 static status_t
798 nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length)
799 {
800 	CALLED();
801 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
802 
803 	const off_t end = (handle->info->capacity * handle->info->block_size);
804 	if (pos >= end)
805 		return B_BAD_VALUE;
806 	if (pos + (off_t)*length > end)
807 		*length = end - pos;
808 
809 	IORequest request;
810 	status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0);
811 	if (status != B_OK)
812 		return status;
813 
814 	status = nvme_disk_io(handle, &request);
815 	*length = request.TransferredBytes();
816 	return status;
817 }
818 
819 
820 static status_t
821 nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length)
822 {
823 	CALLED();
824 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
825 
826 	const off_t end = (handle->info->capacity * handle->info->block_size);
827 	if (pos >= end)
828 		return B_BAD_VALUE;
829 	if (pos + (off_t)*length > end)
830 		*length = end - pos;
831 
832 	IORequest request;
833 	status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0);
834 	if (status != B_OK)
835 		return status;
836 
837 	status = nvme_disk_io(handle, &request);
838 	*length = request.TransferredBytes();
839 	return status;
840 }
841 
842 
843 static status_t
844 nvme_disk_flush(nvme_disk_driver_info* info)
845 {
846 	status_t status = EINPROGRESS;
847 
848 	qpair_info* qpinfo = get_qpair(info);
849 	int ret = nvme_ns_flush(info->ns, qpinfo->qpair,
850 		(nvme_cmd_cb)io_finished_callback, &status);
851 	if (ret != 0)
852 		return ret;
853 
854 	await_status(info, qpinfo->qpair, status);
855 	return status;
856 }
857 
858 
859 static status_t
860 nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length)
861 {
862 	CALLED();
863 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
864 	nvme_disk_driver_info* info = handle->info;
865 
866 	TRACE("ioctl(op = %" B_PRId32 ")\n", op);
867 
868 	switch (op) {
869 		case B_GET_MEDIA_STATUS:
870 		{
871 			*(status_t *)buffer = info->media_status;
872 			info->media_status = B_OK;
873 			return B_OK;
874 			break;
875 		}
876 
877 		case B_GET_DEVICE_SIZE:
878 		{
879 			size_t size = info->capacity * info->block_size;
880 			return user_memcpy(buffer, &size, sizeof(size_t));
881 		}
882 
883 		case B_GET_GEOMETRY:
884 		{
885 			if (buffer == NULL /*|| length != sizeof(device_geometry)*/)
886 				return B_BAD_VALUE;
887 
888 		 	device_geometry geometry;
889 			status_t status = get_geometry(handle, &geometry);
890 			if (status != B_OK)
891 				return status;
892 
893 			return user_memcpy(buffer, &geometry, sizeof(device_geometry));
894 		}
895 
896 		case B_GET_ICON_NAME:
897 			return user_strlcpy((char*)buffer, "devices/drive-harddisk",
898 				B_FILE_NAME_LENGTH);
899 
900 		case B_GET_VECTOR_ICON:
901 		{
902 			device_icon iconData;
903 			if (length != sizeof(device_icon))
904 				return B_BAD_VALUE;
905 			if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK)
906 				return B_BAD_ADDRESS;
907 
908 			if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) {
909 				if (user_memcpy(iconData.icon_data, kDriveIcon,
910 						sizeof(kDriveIcon)) != B_OK)
911 					return B_BAD_ADDRESS;
912 			}
913 
914 			iconData.icon_size = sizeof(kDriveIcon);
915 			return user_memcpy(buffer, &iconData, sizeof(device_icon));
916 		}
917 
918 		case B_FLUSH_DRIVE_CACHE:
919 			return nvme_disk_flush(info);
920 	}
921 
922 	return B_DEV_INVALID_IOCTL;
923 }
924 
925 
926 //	#pragma mark - driver module API
927 
928 
929 static float
930 nvme_disk_supports_device(device_node *parent)
931 {
932 	CALLED();
933 
934 	const char* bus;
935 	uint16 baseClass, subClass;
936 
937 	if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false) != B_OK
938 		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass, false) != B_OK
939 		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE, &subClass, false) != B_OK)
940 		return -1.0f;
941 
942 	if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage)
943 		return 0.0f;
944 
945 	if (subClass != PCI_nvm)
946 		return 0.0f;
947 
948 	TRACE("NVMe device found!\n");
949 	return 1.0f;
950 }
951 
952 
/*!	Registers this driver's node under the matched PCI device node. */
static status_t
nvme_disk_register_device(device_node* parent)
{
	CALLED();

	device_attr attrs[] = {
		{ B_DEVICE_PRETTY_NAME, B_STRING_TYPE, { string: "NVMe Disk" } },
		{ NULL }
	};

	return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME,
		attrs, NULL, NULL);
}
966 
967 
/*!	Driver init hook: initializes libnvme and allocates the per-controller
	driver state. The heavy lifting happens later in
	nvme_disk_init_device().
*/
static status_t
nvme_disk_init_driver(device_node* node, void** cookie)
{
	CALLED();

	int ret = nvme_lib_init((enum nvme_log_level)0, (enum nvme_log_facility)0, NULL);
	if (ret != 0) {
		TRACE_ERROR("libnvme initialization failed!\n");
		return ret;
	}

	// Allocated with operator new because the struct contains members with
	// C++ constructors/destructors (DMAResource, ConditionVariable, ...);
	// teardown must therefore use delete, not free().
	nvme_disk_driver_info* info = new nvme_disk_driver_info;
	if (info == NULL)
		return B_NO_MEMORY;

	info->media_status = B_OK;
	info->node = node;

	info->ctrlr = NULL;

	*cookie = info;
	return B_OK;
}
991 
992 
993 static void
994 nvme_disk_uninit_driver(void* _cookie)
995 {
996 	CALLED();
997 
998 	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
999 	free(info);
1000 }
1001 
1002 
1003 static status_t
1004 nvme_disk_register_child_devices(void* _cookie)
1005 {
1006 	CALLED();
1007 
1008 	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
1009 	status_t status;
1010 
1011 	int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR);
1012 	if (id < 0)
1013 		return id;
1014 
1015 	char name[64];
1016 	snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw",
1017 		id);
1018 
1019 	status = sDeviceManager->publish_device(info->node, name,
1020 		NVME_DISK_DEVICE_MODULE_NAME);
1021 
1022 	return status;
1023 }
1024 
1025 
1026 //	#pragma mark -
1027 
1028 
// Kernel modules this add-on depends on; the device manager pointer is
// filled in by the module loader.
module_dependency module_dependencies[] = {
	{ B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager },
	{ NULL }
};

// Device hooks for the published /dev entry.
struct device_module_info sNvmeDiskDevice = {
	{
		NVME_DISK_DEVICE_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_init_device,
	nvme_disk_uninit_device,
	NULL, // remove,

	nvme_disk_open,
	nvme_disk_close,
	nvme_disk_free,
	nvme_disk_read,
	nvme_disk_write,
	nvme_disk_io,
	nvme_disk_ioctl,

	NULL,	// select
	NULL,	// deselect
};

// Driver hooks: probing, node registration, and per-driver lifecycle.
struct driver_module_info sNvmeDiskDriver = {
	{
		NVME_DISK_DRIVER_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_supports_device,
	nvme_disk_register_device,
	nvme_disk_init_driver,
	nvme_disk_uninit_driver,
	nvme_disk_register_child_devices,
	NULL,	// rescan
	NULL,	// removed
};

// Modules exported by this add-on.
module_info* modules[] = {
	(module_info*)&sNvmeDiskDriver,
	(module_info*)&sNvmeDiskDevice,
	NULL
};
1078