xref: /haiku/src/add-ons/kernel/drivers/disk/nvme/nvme_disk.cpp (revision 7a617f59fd64449167bb190666bd44fae7efbe0b)
1 /*
2  * Copyright 2019-2020, Haiku, Inc. All rights reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Augustin Cavalier <waddlesplash>
7  */
8 
9 
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <new>

#include <condition_variable.h>
#include <AutoDeleter.h>
#include <kernel.h>
#include <util/AutoLock.h>
18 
19 #include <fs/devfs.h>
20 #include <bus/PCI.h>
21 #include <PCI_x86.h>
22 #include <vm/vm.h>
23 
24 #include "IORequest.h"
25 
26 extern "C" {
27 #include <libnvme/nvme.h>
28 #include <libnvme/nvme_internal.h>
29 }
30 
31 
32 //#define TRACE_NVME_DISK
33 #ifdef TRACE_NVME_DISK
34 #	define TRACE(x...) dprintf("nvme_disk: " x)
35 #else
36 #	define TRACE(x...) ;
37 #endif
38 #define TRACE_ALWAYS(x...)	dprintf("nvme_disk: " x)
39 #define TRACE_ERROR(x...)	dprintf("\33[33mnvme_disk:\33[0m " x)
40 #define CALLED() 			TRACE("CALLED %s\n", __PRETTY_FUNCTION__)
41 
42 
// HVIF vector icon data, returned verbatim for B_GET_VECTOR_ICON (see
// nvme_disk_ioctl() below).
static const uint8 kDriveIcon[] = {
	0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16,
	0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39,
	0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02,
	0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01,
	0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47,
	0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f,
	0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0,
	0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38,
	0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48,
	0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2,
	0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80,
	0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a,
	0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39,
	0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a,
	0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27,
	0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a,
	0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08,
	0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17,
	0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02,
	0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01,
	0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99,
	0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2,
	0xe6, 0x01, 0x17, 0x82, 0x00, 0x04
};
68 
69 
70 #define NVME_DISK_DRIVER_MODULE_NAME 	"drivers/disk/nvme_disk/driver_v1"
71 #define NVME_DISK_DEVICE_MODULE_NAME 	"drivers/disk/nvme_disk/device_v1"
72 #define NVME_DISK_DEVICE_ID_GENERATOR	"nvme_disk/device_id"
73 
74 #define NVME_MAX_QPAIRS					(8)
75 
76 
77 static device_manager_info* sDeviceManager;
78 static pci_x86_module_info* sPCIx86Module;
79 
// Per-controller driver state, allocated in nvme_disk_init_driver() and
// populated by nvme_disk_init_device().
typedef struct {
	device_node*			node;
	pci_info				info;

	struct nvme_ctrlr*		ctrlr;

	// Only the first namespace is exported (see the TODO in init_device).
	struct nvme_ns*			ns;
	uint64					capacity;		// in blocks
	uint32					block_size;		// in bytes
	status_t				media_status;

	struct qpair_info {
		struct nvme_qpair*	qpair;
	}						qpairs[NVME_MAX_QPAIRS];
	uint32					qpair_count;	// number of valid entries in qpairs[]
	uint32					next_qpair;		// round-robin cursor, see get_qpair()

	DMAResource				dma_resource;
	sem_id					dma_buffers_sem;	// counts free DMA bounce buffers

	// Held for reading by unrounded writes, for writing by bounced I/O,
	// serializing read-modify-write cycles against direct writes.
	rw_lock					rounded_write_lock;

	ConditionVariable		interrupt;		// notified on every device interrupt
} nvme_disk_driver_info;
typedef nvme_disk_driver_info::qpair_info qpair_info;
105 
106 
// Per-open cookie; currently just a back-pointer to the driver state.
typedef struct {
	nvme_disk_driver_info*		info;
} nvme_disk_handle;
110 
111 
112 static status_t
113 get_geometry(nvme_disk_handle* handle, device_geometry* geometry)
114 {
115 	nvme_disk_driver_info* info = handle->info;
116 
117 	devfs_compute_geometry_size(geometry, info->capacity, info->block_size);
118 
119 	geometry->device_type = B_DISK;
120 	geometry->removable = false;
121 
122 	geometry->read_only = false;
123 	geometry->write_once = false;
124 
125 	TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %d, %d, %d, %d\n",
126 		geometry->bytes_per_sector, geometry->sectors_per_track,
127 		geometry->cylinder_count, geometry->head_count, geometry->device_type,
128 		geometry->removable, geometry->read_only, geometry->write_once);
129 
130 	return B_OK;
131 }
132 
133 
// Return the exponent y such that x == 1 << y, or -1 when x is not an
// exact power of two (including x == 0).
static int
log2(uint32 x)
{
	for (int shift = 31; shift >= 0; shift--) {
		if (((uint32)1 << shift) == x)
			return shift;
	}
	return -1;
}
146 
147 
148 static void
149 nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity,
150 	uint32 blockSize)
151 {
152 	TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %" B_PRIu32 ")\n",
153 		info, capacity, blockSize);
154 
155 	// get log2, if possible
156 	uint32 blockShift = log2(blockSize);
157 
158 	if ((1UL << blockShift) != blockSize)
159 		blockShift = 0;
160 
161 	info->capacity = capacity;
162 	info->block_size = blockSize;
163 }
164 
165 
166 //	#pragma mark - device module API
167 
168 
169 static int32 nvme_interrupt_handler(void* _info);
170 
171 
// Bring up one NVMe controller: open it via libnvme, open its first
// namespace, allocate I/O qpairs and DMA bounce buffers, and wire up the
// interrupt (MSI-X > MSI > legacy line, in order of preference).
static status_t
nvme_disk_init_device(void* _info, void** _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;

	// Fetch our PCI configuration from the parent bus node.
	pci_device_module_info* pci;
	pci_device* pcidev;
	device_node* parent = sDeviceManager->get_parent_node(info->node);
	sDeviceManager->get_driver(parent, (driver_module_info**)&pci,
		(void**)&pcidev);
	pci->get_pci_info(pcidev, &info->info);
	sDeviceManager->put_node(parent);

	// construct the libnvme pci_device struct
	// NOTE(review): this allocation is never deleted, not even on the error
	// paths below or in uninit_device() — presumably a leak unless libnvme
	// takes ownership; confirm against the libnvme sources.
	pci_device* device = new pci_device;
	device->vendor_id = info->info.vendor_id;
	device->device_id = info->info.device_id;
	device->subvendor_id = 0;
	device->subdevice_id = 0;

	device->domain = 0;
	device->bus = info->info.bus;
	device->dev = info->info.device;
	device->func = info->info.function;

	device->pci_info = &info->info;

	// open the controller
	info->ctrlr = nvme_ctrlr_open(device, NULL);
	if (info->ctrlr == NULL) {
		TRACE_ERROR("failed to open the controller!\n");
		return B_ERROR;
	}

	struct nvme_ctrlr_stat cstat;
	int err = nvme_ctrlr_stat(info->ctrlr, &cstat);
	if (err != 0) {
		TRACE_ERROR("failed to get controller information!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn);
	TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size);
	TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs);

	// TODO: export more than just the first namespace!
	info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]);
	if (info->ns == NULL) {
		TRACE_ERROR("failed to open namespace!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_ERROR;
	}

	struct nvme_ns_stat nsstat;
	err = nvme_ns_stat(info->ns, &nsstat);
	if (err != 0) {
		TRACE_ERROR("failed to get namespace information!\n");
		// NOTE(review): info->ns is not closed on this and the following
		// error paths — confirm whether nvme_ctrlr_close() covers it.
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	// store capacity information
	nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size);

	TRACE("capacity: %" B_PRIu64 ", block_size %" B_PRIu32 "\n",
		info->capacity, info->block_size);

	// allocate qpairs
	info->qpair_count = info->next_qpair = 0;
	for (uint32 i = 0; i < NVME_MAX_QPAIRS && i < cstat.io_qpairs; i++) {
		info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr,
			(enum nvme_qprio)0, 0);
		if (info->qpairs[i].qpair == NULL)
			break;

		info->qpair_count++;
	}
	if (info->qpair_count == 0) {
		TRACE_ERROR("failed to allocate qpairs!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_NO_MEMORY;
	}

	// allocate DMA buffers
	// Two bounce buffers per qpair, so concurrent transfers rarely starve.
	int buffers = info->qpair_count * 2;

	dma_restrictions restrictions = {};
	restrictions.alignment = B_PAGE_SIZE;
		// Technically, the first and last segments in a transfer can be
		// unaligned, and the rest only need to have sizes that are a multiple
		// of the block size.
	restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2);
	restrictions.max_transfer_size = cstat.max_xfer_size;

	err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers);
	if (err != 0) {
		TRACE_ERROR("failed to initialize DMA resource!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem");
	if (info->dma_buffers_sem < 0) {
		TRACE_ERROR("failed to create DMA buffers semaphore!\n");
		nvme_ctrlr_close(info->ctrlr);
		return info->dma_buffers_sem;
	}

	// set up rounded-write lock
	rw_lock_init(&info->rounded_write_lock, "nvme rounded writes");

	// set up interrupt
	if (get_module(B_PCI_X86_MODULE_NAME, (module_info**)&sPCIx86Module)
			!= B_OK) {
		sPCIx86Module = NULL;
	}

	// Make sure legacy interrupts are not masked off in PCI config space.
	uint16 command = pci->read_pci_config(pcidev, PCI_command, 2);
	command &= ~(PCI_command_int_disable);
	pci->write_pci_config(pcidev, PCI_command, 2, command);

	// Prefer MSI-X, then MSI, falling back to the legacy interrupt line.
	uint8 irq = info->info.u.h0.interrupt_line;
	if (sPCIx86Module != NULL) {
		if (sPCIx86Module->get_msix_count(info->info.bus, info->info.device,
				info->info.function)) {
			uint8 msixVector = 0;
			if (sPCIx86Module->configure_msix(info->info.bus, info->info.device,
					info->info.function, 1, &msixVector) == B_OK
				&& sPCIx86Module->enable_msix(info->info.bus, info->info.device,
					info->info.function) == B_OK) {
				TRACE_ALWAYS("using MSI-X\n");
				irq = msixVector;
			}
		} else if (sPCIx86Module->get_msi_count(info->info.bus,
				info->info.device, info->info.function) >= 1) {
			uint8 msiVector = 0;
			if (sPCIx86Module->configure_msi(info->info.bus, info->info.device,
					info->info.function, 1, &msiVector) == B_OK
				&& sPCIx86Module->enable_msi(info->info.bus, info->info.device,
					info->info.function) == B_OK) {
				TRACE_ALWAYS("using message signaled interrupts\n");
				irq = msiVector;
			}
		}
	}

	if (irq == 0 || irq == 0xFF) {
		TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n",
			info->info.bus, info->info.device, info->info.function);
		// NOTE(review): this error path leaks the qpairs, DMA resource,
		// semaphore, and rw_lock set up above.
		return B_ERROR;
	}
	info->interrupt.Init(NULL, NULL);
	install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info, B_NO_HANDLED_INFO);

	// Coalesce interrupts when the controller supports it, to reduce the
	// wakeup rate under load.
	if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) {
		uint32 microseconds = 16, threshold = 32;
		nvme_admin_set_feature(info->ctrlr, false, NVME_FEAT_INTERRUPT_COALESCING,
			((microseconds / 100) << 8) | threshold, 0, NULL);
	}

	*_cookie = info;
	return B_OK;
}
337 
338 
339 static void
340 nvme_disk_uninit_device(void* _cookie)
341 {
342 	CALLED();
343 	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
344 
345 	remove_io_interrupt_handler(info->info.u.h0.interrupt_line,
346 		nvme_interrupt_handler, (void*)info);
347 
348 	rw_lock_destroy(&info->rounded_write_lock);
349 
350 	nvme_ns_close(info->ns);
351 	nvme_ctrlr_close(info->ctrlr);
352 
353 	// TODO: Deallocate MSI(-X).
354 	// TODO: Deallocate PCI.
355 }
356 
357 
358 static status_t
359 nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie)
360 {
361 	CALLED();
362 
363 	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
364 	nvme_disk_handle* handle = (nvme_disk_handle*)malloc(
365 		sizeof(nvme_disk_handle));
366 	if (handle == NULL)
367 		return B_NO_MEMORY;
368 
369 	handle->info = info;
370 
371 	*_cookie = handle;
372 	return B_OK;
373 }
374 
375 
376 static status_t
377 nvme_disk_close(void* cookie)
378 {
379 	CALLED();
380 
381 	//nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
382 	return B_OK;
383 }
384 
385 
386 static status_t
387 nvme_disk_free(void* cookie)
388 {
389 	CALLED();
390 
391 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
392 	free(handle);
393 	return B_OK;
394 }
395 
396 
397 // #pragma mark - I/O
398 
399 
400 static int32
401 nvme_interrupt_handler(void* _info)
402 {
403 	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
404 	info->interrupt.NotifyAll();
405 	return 0;
406 }
407 
408 
409 static qpair_info*
410 get_qpair(nvme_disk_driver_info* info)
411 {
412 	return &info->qpairs[atomic_add((int32*)&info->next_qpair, 1)
413 		% info->qpair_count];
414 }
415 
416 
417 static void
418 io_finished_callback(status_t* status, const struct nvme_cpl* cpl)
419 {
420 	*status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK;
421 }
422 
423 
// Block until the completion callback changes "status" away from
// EINPROGRESS. Completions are reaped by polling the qpair from this
// thread; the interrupt only serves as a wakeup.
static void
await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status)
{
	CALLED();

	ConditionVariableEntry entry;
	int timeouts = 0;
	while (status == EINPROGRESS) {
		// Register with the condition variable before polling so that an
		// interrupt firing in between is not lost.
		info->interrupt.Add(&entry);

		nvme_qpair_poll(qpair, 0);

		if (status != EINPROGRESS)
			return;

		if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) {
			// This should never happen, as we are woken up on every interrupt
			// no matter the qpair or transfer within; so if it does occur,
			// that probably means the controller stalled or something.

			TRACE_ERROR("timed out waiting for interrupt!\n");
			if (timeouts++ >= 3) {
				// Give up after four 5-second timeouts: fail the qpair and
				// report the transfer as timed out.
				nvme_qpair_fail(qpair);
				status = B_TIMED_OUT;
				return;
			}
		}

		nvme_qpair_poll(qpair, 0);
	}
}
455 
456 
// State for one NVMe read/write submission; filled in by the I/O paths
// below and walked by the ior_* SGL callbacks.
struct nvme_io_request {
	status_t status;		// EINPROGRESS until io_finished_callback() fires

	bool write;				// true for writes, false for reads

	off_t lba_start;		// first logical block of the transfer
	size_t lba_count;		// number of logical blocks

	physical_entry* iovecs;	// physical scatter/gather entries
	int32 iovec_count;

	int32 iovec_i;			// cursor used by ior_reset_sgl()/ior_next_sge()
};
470 
471 
472 void ior_reset_sgl(nvme_io_request* request, uint32_t offset)
473 {
474 	request->iovec_i = offset;
475 }
476 
477 
478 int ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length)
479 {
480 	int32 index = request->iovec_i;
481 	if (index < 0 || index > request->iovec_count)
482 		return -1;
483 
484 	*address = request->iovecs[index].address;
485 	*length = request->iovecs[index].size;
486 
487 	TRACE("IOV %d: 0x%" B_PRIx64 ", %" B_PRIu32 "\n", request->iovec_i, *address,
488 		  *length);
489 
490 	request->iovec_i++;
491 	return 0;
492 }
493 
494 
// Submit one read or write described by "request" on a round-robin qpair
// and block until it completes. Returns B_OK, an errno-style code from
// the submission, or the status reported by the completion callback.
// On failure, request->lba_count is zeroed so callers report 0 bytes.
static status_t
do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request)
{
	// io_finished_callback() will overwrite this when the command completes.
	request->status = EINPROGRESS;

	qpair_info* qpinfo = get_qpair(info);
	int ret = -1;
	if (request->write) {
		ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	} else {
		ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	}
	if (ret != 0) {
		TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
		return ret;
	}

	// Wait for io_finished_callback() to fire on this qpair.
	await_status(info, qpinfo->qpair, request->status);

	if (request->status != B_OK) {
		TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
	}
	return request->status;
}
533 
534 
// Perform "request" through DMA bounce buffers: used whenever the
// caller's buffer is misaligned or the request is not block-aligned.
// Writes take the rounded-write lock exclusively so that the
// read-modify-write cycles below cannot interleave with direct writes.
static status_t
nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request)
{
	CALLED();

	WriteLocker writeLocker;
	if (request->IsWrite())
		writeLocker.SetTo(handle->info->rounded_write_lock, false);

	// Each in-flight operation consumes one DMA buffer; block until one
	// is available.
	status_t status = acquire_sem(handle->info->dma_buffers_sem);
	if (status != B_OK) {
		request->SetStatusAndNotify(status);
		return status;
	}

	const size_t block_size = handle->info->block_size;

	TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR
		"; Write %s\n", request, request->Offset(), request->Length(),
		request->IsWrite() ? "yes" : "no");

	nvme_io_request nvme_request;
	while (request->RemainingBytes() > 0) {
		// Carve the next chunk of the request into an IOOperation backed
		// by a DMA bounce buffer.
		IOOperation operation;
		status = handle->info->dma_resource.TranslateNext(request, &operation, 0);
		if (status != B_OK)
			break;

		size_t transferredBytes = 0;
		do {
			TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR
				", write: %s\n", request, operation.Offset(),
				operation.Length(), operation.IsWrite() ? "yes" : "no");

			nvme_request.write = operation.IsWrite();
			nvme_request.lba_start = operation.Offset() / block_size;
			nvme_request.lba_count = operation.Length() / block_size;
			nvme_request.iovecs = (physical_entry*)operation.Vecs();
			nvme_request.iovec_count = operation.VecCount();

			status = do_nvme_io_request(handle->info, &nvme_request);
			// Only count bytes toward the caller when this pass matches the
			// request's direction (a write may need a read pass first).
			if (status == B_OK && nvme_request.write == request->IsWrite())
				transferredBytes += operation.OriginalLength();

			operation.SetStatus(status);
		} while (status == B_OK && !operation.Finish());
			// Finish() returns false while more passes (e.g. the write
			// half of a read-modify-write) remain.

		if (status == B_OK && operation.Status() != B_OK) {
			TRACE_ERROR("I/O succeeded but IOOperation failed!\n");
			status = operation.Status();
		}

		operation.SetTransferredBytes(transferredBytes);
		request->OperationFinished(&operation, status, status != B_OK,
			operation.OriginalOffset() + transferredBytes);

		handle->info->dma_resource.RecycleBuffer(operation.Buffer());

		TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request,
			strerror(status), request->RemainingBytes());
		if (status != B_OK)
			break;
	}

	release_sem(handle->info->dma_buffers_sem);

	// Notify() also takes care of UnlockMemory().
	if (status != B_OK && request->Status() == B_OK)
		request->SetStatusAndNotify(status);
	else
		request->NotifyFinished();
	return status;
}
608 
609 
// Main I/O entry point. Builds a physical scatter/gather list for the
// request's buffer and, when everything is suitably aligned, submits it
// directly to the controller in chunks; otherwise falls back to
// nvme_disk_bounced_io().
static status_t
nvme_disk_io(void* cookie, io_request* request)
{
	CALLED();

	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	nvme_io_request nvme_request;
	memset(&nvme_request, 0, sizeof(nvme_io_request));

	nvme_request.write = request->IsWrite();

	physical_entry* vtophys = NULL;
	MemoryDeleter vtophysDeleter;

	// Translate the (possibly virtual) buffer into physical entries.
	IOBuffer* buffer = request->Buffer();
	status_t status = B_OK;
	if (!buffer->IsPhysical()) {
		status = buffer->LockMemory(request->TeamID(), request->IsWrite());
		if (status != B_OK) {
			TRACE_ERROR("failed to lock memory: %s\n", strerror(status));
			return status;
		}
		// SetStatusAndNotify() takes care of unlocking memory if necessary.

		// This is slightly inefficient, as we could use a BStackOrHeapArray in
		// the optimal case (few physical entries required), but we would not
		// know whether or not that was possible until calling get_memory_map()
		// and then potentially reallocating, which would complicate the logic.

		int32 vtophys_length = (request->Length() / B_PAGE_SIZE) + 2;
		nvme_request.iovecs = vtophys = (physical_entry*)malloc(sizeof(physical_entry)
			* vtophys_length);
		if (vtophys == NULL) {
			TRACE_ERROR("failed to allocate memory for iovecs\n");
			request->SetStatusAndNotify(B_NO_MEMORY);
			return B_NO_MEMORY;
		}
		vtophysDeleter.SetTo(vtophys);

		for (size_t i = 0; i < buffer->VecCount(); i++) {
			generic_io_vec virt = buffer->VecAt(i);
			uint32 entries = vtophys_length - nvme_request.iovec_count;

			// Avoid copies by going straight into the vtophys array.
			status = get_memory_map_etc(request->TeamID(), (void*)virt.base,
				virt.length, vtophys + nvme_request.iovec_count, &entries);
			if (status == B_BUFFER_OVERFLOW) {
				TRACE("vtophys array was too small, reallocating\n");

				// NOTE(review): if realloc() fails here, the old block has
				// already been Detach()ed from the deleter and leaks.
				vtophysDeleter.Detach();
				vtophys_length *= 2;
				nvme_request.iovecs = vtophys = (physical_entry*)realloc(vtophys,
					sizeof(physical_entry) * vtophys_length);
				vtophysDeleter.SetTo(vtophys);
				if (vtophys == NULL) {
					status = B_NO_MEMORY;
				} else {
					// Try again, with the larger buffer this time.
					i--;
					continue;
				}
			}
			if (status != B_OK) {
				TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status));
				request->SetStatusAndNotify(status);
				return status;
			}

			nvme_request.iovec_count += entries;
		}
	} else {
		nvme_request.iovecs = (physical_entry*)buffer->Vecs();
		nvme_request.iovec_count = buffer->VecCount();
	}

	// See if we need to bounce anything other than the first or last vec.
	// Middle vecs must be page-aligned and a whole number of pages long.
	const size_t block_size = handle->info->block_size;
	bool bounceAll = false;
	for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) {
		if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0)
			bounceAll = true;
		if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0)
			bounceAll = true;
	}

	// See if we need to bounce due to the first or last vec.
	// The first vec must end on a page boundary, the last must start on
	// one, and both must be whole blocks.
	if (nvme_request.iovec_count > 1) {
		physical_entry* entry = &nvme_request.iovecs[0];
		if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0
				|| (entry->size % block_size) != 0))
			bounceAll = true;

		entry = &nvme_request.iovecs[nvme_request.iovec_count - 1];
		if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0
				|| (entry->size % block_size) != 0))
			bounceAll = true;
	}

	// See if we need to bounce due to rounding.
	const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size);
	phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset()
		- rounded_pos), block_size);
	if (rounded_pos != request->Offset() || rounded_len != request->Length())
		bounceAll = true;

	if (bounceAll) {
		// Let the bounced I/O routine take care of everything from here.
		return nvme_disk_bounced_io(handle, request);
	}

	nvme_request.lba_start = rounded_pos / block_size;
	nvme_request.lba_count = rounded_len / block_size;

	// No bouncing was required.
	// Direct writes only need the rounded-write lock shared: they cannot
	// overlap the partial blocks that bounced writes rewrite exclusively.
	ReadLocker readLocker;
	if (nvme_request.write)
		readLocker.SetTo(handle->info->rounded_write_lock, false);

	// Error check before actually doing I/O.
	if (status != B_OK) {
		TRACE_ERROR("I/O failed early: %s\n", strerror(status));
		request->SetStatusAndNotify(status);
		return status;
	}

	// Submit in chunks of at most NVME_MAX_SGL_DESCRIPTORS / 2 vecs,
	// matching the DMA restriction set up in init_device().
	int32 remaining = nvme_request.iovec_count;
	while (remaining > 0 && status == B_OK) {
		nvme_request.iovec_count = min_c(remaining,
			NVME_MAX_SGL_DESCRIPTORS / 2);

		nvme_request.lba_count = 0;
		for (int i = 0; i < nvme_request.iovec_count; i++)
			nvme_request.lba_count += (nvme_request.iovecs[i].size / block_size);

		status = do_nvme_io_request(handle->info, &nvme_request);

		// Advance to the next chunk (lba_count is 0 after a failure).
		nvme_request.iovecs += nvme_request.iovec_count;
		remaining -= nvme_request.iovec_count;
		nvme_request.lba_start += nvme_request.lba_count;
	}

	if (status != B_OK)
		TRACE_ERROR("I/O failed: %s\n", strerror(status));

	request->SetTransferredBytes(status != B_OK,
		(nvme_request.lba_start * block_size) - rounded_pos);
	request->SetStatusAndNotify(status);
	return status;
}
760 
761 
762 static status_t
763 nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length)
764 {
765 	CALLED();
766 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
767 
768 	const off_t end = (handle->info->capacity * handle->info->block_size);
769 	if (pos >= end)
770 		return B_BAD_VALUE;
771 	if (pos + (off_t)*length > end)
772 		*length = end - pos;
773 
774 	IORequest request;
775 	status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0);
776 	if (status != B_OK)
777 		return status;
778 
779 	status = nvme_disk_io(handle, &request);
780 	*length = request.TransferredBytes();
781 	return status;
782 }
783 
784 
785 static status_t
786 nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length)
787 {
788 	CALLED();
789 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
790 
791 	const off_t end = (handle->info->capacity * handle->info->block_size);
792 	if (pos >= end)
793 		return B_BAD_VALUE;
794 	if (pos + (off_t)*length > end)
795 		*length = end - pos;
796 
797 	IORequest request;
798 	status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0);
799 	if (status != B_OK)
800 		return status;
801 
802 	status = nvme_disk_io(handle, &request);
803 	*length = request.TransferredBytes();
804 	return status;
805 }
806 
807 
808 static status_t
809 nvme_disk_flush(nvme_disk_driver_info* info)
810 {
811 	status_t status = EINPROGRESS;
812 
813 	qpair_info* qpinfo = get_qpair(info);
814 	int ret = nvme_ns_flush(info->ns, qpinfo->qpair,
815 		(nvme_cmd_cb)io_finished_callback, &status);
816 	if (ret != 0)
817 		return ret;
818 
819 	await_status(info, qpinfo->qpair, status);
820 	return status;
821 }
822 
823 
824 static status_t
825 nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length)
826 {
827 	CALLED();
828 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
829 	nvme_disk_driver_info* info = handle->info;
830 
831 	TRACE("ioctl(op = %" B_PRId32 ")\n", op);
832 
833 	switch (op) {
834 		case B_GET_MEDIA_STATUS:
835 		{
836 			*(status_t *)buffer = info->media_status;
837 			info->media_status = B_OK;
838 			return B_OK;
839 			break;
840 		}
841 
842 		case B_GET_DEVICE_SIZE:
843 		{
844 			size_t size = info->capacity * info->block_size;
845 			return user_memcpy(buffer, &size, sizeof(size_t));
846 		}
847 
848 		case B_GET_GEOMETRY:
849 		{
850 			if (buffer == NULL /*|| length != sizeof(device_geometry)*/)
851 				return B_BAD_VALUE;
852 
853 		 	device_geometry geometry;
854 			status_t status = get_geometry(handle, &geometry);
855 			if (status != B_OK)
856 				return status;
857 
858 			return user_memcpy(buffer, &geometry, sizeof(device_geometry));
859 		}
860 
861 		case B_GET_ICON_NAME:
862 			return user_strlcpy((char*)buffer, "devices/drive-harddisk",
863 				B_FILE_NAME_LENGTH);
864 
865 		case B_GET_VECTOR_ICON:
866 		{
867 			device_icon iconData;
868 			if (length != sizeof(device_icon))
869 				return B_BAD_VALUE;
870 			if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK)
871 				return B_BAD_ADDRESS;
872 
873 			if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) {
874 				if (user_memcpy(iconData.icon_data, kDriveIcon,
875 						sizeof(kDriveIcon)) != B_OK)
876 					return B_BAD_ADDRESS;
877 			}
878 
879 			iconData.icon_size = sizeof(kDriveIcon);
880 			return user_memcpy(buffer, &iconData, sizeof(device_icon));
881 		}
882 
883 		case B_FLUSH_DRIVE_CACHE:
884 			return nvme_disk_flush(info);
885 	}
886 
887 	return B_DEV_INVALID_IOCTL;
888 }
889 
890 
891 //	#pragma mark - driver module API
892 
893 
894 static float
895 nvme_disk_supports_device(device_node *parent)
896 {
897 	CALLED();
898 
899 	const char* bus;
900 	uint16 baseClass, subClass;
901 
902 	if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false) != B_OK
903 		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass, false) != B_OK
904 		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE, &subClass, false) != B_OK)
905 		return -1.0f;
906 
907 	if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage)
908 		return 0.0f;
909 
910 	if (subClass != PCI_nvm)
911 		return 0.0f;
912 
913 	TRACE("NVMe device found!\n");
914 	return 1.0f;
915 }
916 
917 
918 static status_t
919 nvme_disk_register_device(device_node* parent)
920 {
921 	CALLED();
922 
923 	device_attr attrs[] = {
924 		{ NULL }
925 	};
926 
927 	return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME,
928 		attrs, NULL, NULL);
929 }
930 
931 
932 static status_t
933 nvme_disk_init_driver(device_node* node, void** cookie)
934 {
935 	CALLED();
936 
937 	int ret = nvme_lib_init((enum nvme_log_level)0, (enum nvme_log_facility)0, NULL);
938 	if (ret != 0) {
939 		TRACE_ERROR("libnvme initialization failed!\n");
940 		return ret;
941 	}
942 
943 	nvme_disk_driver_info* info = new nvme_disk_driver_info;
944 	if (info == NULL)
945 		return B_NO_MEMORY;
946 
947 	info->media_status = B_OK;
948 	info->node = node;
949 
950 	info->ctrlr = NULL;
951 
952 	*cookie = info;
953 	return B_OK;
954 }
955 
956 
957 static void
958 nvme_disk_uninit_driver(void* _cookie)
959 {
960 	CALLED();
961 
962 	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
963 	free(info);
964 }
965 
966 
967 static status_t
968 nvme_disk_register_child_devices(void* _cookie)
969 {
970 	CALLED();
971 
972 	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
973 	status_t status;
974 
975 	int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR);
976 	if (id < 0)
977 		return id;
978 
979 	char name[64];
980 	snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw",
981 		id);
982 
983 	status = sDeviceManager->publish_device(info->node, name,
984 		NVME_DISK_DEVICE_MODULE_NAME);
985 
986 	return status;
987 }
988 
989 
990 //	#pragma mark -
991 
992 
module_dependency module_dependencies[] = {
	{B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager},
	{}
};

// Hooks for the published disk/nvme/<id>/raw device.
struct device_module_info sNvmeDiskDevice = {
	{
		NVME_DISK_DEVICE_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_init_device,
	nvme_disk_uninit_device,
	NULL, // remove,

	nvme_disk_open,
	nvme_disk_close,
	nvme_disk_free,
	nvme_disk_read,
	nvme_disk_write,
	nvme_disk_io,
	nvme_disk_ioctl,

	NULL,	// select
	NULL,	// deselect
};

// Hooks for device discovery and driver lifecycle.
struct driver_module_info sNvmeDiskDriver = {
	{
		NVME_DISK_DRIVER_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_supports_device,
	nvme_disk_register_device,
	nvme_disk_init_driver,
	nvme_disk_uninit_driver,
	nvme_disk_register_child_devices,
	NULL,	// rescan
	NULL,	// removed
};

// Modules exported by this add-on.
module_info* modules[] = {
	(module_info*)&sNvmeDiskDriver,
	(module_info*)&sNvmeDiskDevice,
	NULL
};
1042