xref: /haiku/src/add-ons/kernel/drivers/disk/nvme/nvme_disk.cpp (revision 9295c1f645806eca5d7699c985f7b509528c9eaa)
1 /*
2  * Copyright 2019-2020, Haiku, Inc. All rights reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Augustin Cavalier <waddlesplash>
7  */
8 
9 
10 #include <stdio.h>
11 #include <stdlib.h>
12 
13 #include <algorithm>
14 #include <condition_variable.h>
15 #include <AutoDeleter.h>
16 #include <kernel.h>
17 #include <util/AutoLock.h>
18 
19 #include <fs/devfs.h>
20 #include <bus/PCI.h>
21 #include <PCI_x86.h>
22 #include <vm/vm.h>
23 
24 #include "IORequest.h"
25 
26 extern "C" {
27 #include <libnvme/nvme.h>
28 #include <libnvme/nvme_internal.h>
29 }
30 
31 
// Debug tracing: TRACE() compiles away unless TRACE_NVME_DISK is defined;
// TRACE_ALWAYS() and TRACE_ERROR() always go to the syslog (TRACE_ERROR
// with a yellow ANSI prefix). CALLED() logs function entry when tracing.
//#define TRACE_NVME_DISK
#ifdef TRACE_NVME_DISK
#	define TRACE(x...) dprintf("nvme_disk: " x)
#else
#	define TRACE(x...) ;
#endif
#define TRACE_ALWAYS(x...)	dprintf("nvme_disk: " x)
#define TRACE_ERROR(x...)	dprintf("\33[33mnvme_disk:\33[0m " x)
#define CALLED() 			TRACE("CALLED %s\n", __PRETTY_FUNCTION__)
41 
42 
// Vector icon returned by B_GET_VECTOR_ICON; the leading bytes 0x6e 0x63
// 0x69 0x66 spell "ncif", presumably the HVIF (Haiku Vector Icon Format)
// magic — the payload itself is opaque binary data.
static const uint8 kDriveIcon[] = {
	0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16,
	0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39,
	0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02,
	0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01,
	0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47,
	0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f,
	0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0,
	0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38,
	0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48,
	0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2,
	0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80,
	0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a,
	0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39,
	0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a,
	0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27,
	0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a,
	0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08,
	0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17,
	0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02,
	0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01,
	0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99,
	0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2,
	0xe6, 0x01, 0x17, 0x82, 0x00, 0x04
};
68 
69 
// Module names under which this driver/device pair is published, and the
// ID generator used to number the /dev/disk/nvme/N entries.
#define NVME_DISK_DRIVER_MODULE_NAME 	"drivers/disk/nvme_disk/driver_v1"
#define NVME_DISK_DEVICE_MODULE_NAME 	"drivers/disk/nvme_disk/device_v1"
#define NVME_DISK_DEVICE_ID_GENERATOR	"nvme_disk/device_id"

// Upper bound on the number of I/O queue pairs allocated per controller.
#define NVME_MAX_QPAIRS					(8)


static device_manager_info* sDeviceManager;
static pci_x86_module_info* sPCIx86Module;
	// NULL when the PCI x86 module (MSI/MSI-X support) is unavailable
79 
// Per-controller driver state; allocated in nvme_disk_init_driver() and
// fully initialized in nvme_disk_init_device().
typedef struct {
	device_node*			node;
	pci_info				info;

	struct nvme_ctrlr*		ctrlr;
		// libnvme controller handle

	struct nvme_ns*			ns;
		// first namespace only (see TODO in nvme_disk_init_device)
	uint64					capacity;
		// namespace size, in units of block_size bytes
	uint32					block_size;
	uint32					max_io_blocks;
		// largest single transfer the controller accepts, in blocks
	status_t				media_status;
		// returned (and then reset to B_OK) by B_GET_MEDIA_STATUS

	struct qpair_info {
		struct nvme_qpair*	qpair;
	}						qpairs[NVME_MAX_QPAIRS];
	uint32					qpair_count;
	uint32					next_qpair;
		// round-robin cursor for get_qpair()

	DMAResource				dma_resource;
	sem_id					dma_buffers_sem;
		// counts the bounce buffers available in dma_resource

	rw_lock					rounded_write_lock;
		// held exclusively by bounced writes, shared by direct writes,
		// so a read-modify-write cannot interleave with a direct write

	ConditionVariable		interrupt;
		// notified on every device interrupt; see await_status()
} nvme_disk_driver_info;
typedef nvme_disk_driver_info::qpair_info qpair_info;
106 
107 
// Per-open cookie; merely refers back to the shared driver state.
typedef struct {
	nvme_disk_driver_info*		info;
} nvme_disk_handle;
111 
112 
113 static status_t
114 get_geometry(nvme_disk_handle* handle, device_geometry* geometry)
115 {
116 	nvme_disk_driver_info* info = handle->info;
117 
118 	devfs_compute_geometry_size(geometry, info->capacity, info->block_size);
119 
120 	geometry->device_type = B_DISK;
121 	geometry->removable = false;
122 
123 	geometry->read_only = false;
124 	geometry->write_once = false;
125 
126 	TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %d, %d, %d, %d\n",
127 		geometry->bytes_per_sector, geometry->sectors_per_track,
128 		geometry->cylinder_count, geometry->head_count, geometry->device_type,
129 		geometry->removable, geometry->read_only, geometry->write_once);
130 
131 	return B_OK;
132 }
133 
134 
135 static int
136 log2(uint32 x)
137 {
138 	int y;
139 
140 	for (y = 31; y >= 0; --y) {
141 		if (x == ((uint32)1 << y))
142 			break;
143 	}
144 
145 	return y;
146 }
147 
148 
149 static void
150 nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity,
151 	uint32 blockSize)
152 {
153 	TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %" B_PRIu32 ")\n",
154 		info, capacity, blockSize);
155 
156 	// get log2, if possible
157 	uint32 blockShift = log2(blockSize);
158 
159 	if ((1UL << blockShift) != blockSize)
160 		blockShift = 0;
161 
162 	info->capacity = capacity;
163 	info->block_size = blockSize;
164 }
165 
166 
167 //	#pragma mark - device module API
168 
169 
// Forward declaration: the handler is installed in nvme_disk_init_device()
// but defined in the I/O section below.
static int32 nvme_interrupt_handler(void* _info);
171 
172 
173 static status_t
174 nvme_disk_init_device(void* _info, void** _cookie)
175 {
176 	CALLED();
177 	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
178 
179 	pci_device_module_info* pci;
180 	pci_device* pcidev;
181 	device_node* parent = sDeviceManager->get_parent_node(info->node);
182 	sDeviceManager->get_driver(parent, (driver_module_info**)&pci,
183 		(void**)&pcidev);
184 	pci->get_pci_info(pcidev, &info->info);
185 	sDeviceManager->put_node(parent);
186 
187 	// construct the libnvme pci_device struct
188 	pci_device* device = new pci_device;
189 	device->vendor_id = info->info.vendor_id;
190 	device->device_id = info->info.device_id;
191 	device->subvendor_id = 0;
192 	device->subdevice_id = 0;
193 
194 	device->domain = 0;
195 	device->bus = info->info.bus;
196 	device->dev = info->info.device;
197 	device->func = info->info.function;
198 
199 	device->pci_info = &info->info;
200 
201 	// open the controller
202 	info->ctrlr = nvme_ctrlr_open(device, NULL);
203 	if (info->ctrlr == NULL) {
204 		TRACE_ERROR("failed to open the controller!\n");
205 		return B_ERROR;
206 	}
207 
208 	struct nvme_ctrlr_stat cstat;
209 	int err = nvme_ctrlr_stat(info->ctrlr, &cstat);
210 	if (err != 0) {
211 		TRACE_ERROR("failed to get controller information!\n");
212 		nvme_ctrlr_close(info->ctrlr);
213 		return err;
214 	}
215 
216 	TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn);
217 	TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size);
218 	TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs);
219 
220 	// TODO: export more than just the first namespace!
221 	info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]);
222 	if (info->ns == NULL) {
223 		TRACE_ERROR("failed to open namespace!\n");
224 		nvme_ctrlr_close(info->ctrlr);
225 		return B_ERROR;
226 	}
227 
228 	struct nvme_ns_stat nsstat;
229 	err = nvme_ns_stat(info->ns, &nsstat);
230 	if (err != 0) {
231 		TRACE_ERROR("failed to get namespace information!\n");
232 		nvme_ctrlr_close(info->ctrlr);
233 		return err;
234 	}
235 
236 	// store capacity information
237 	nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size);
238 
239 	TRACE("capacity: %" B_PRIu64 ", block_size %" B_PRIu32 "\n",
240 		info->capacity, info->block_size);
241 
242 	// allocate qpairs
243 	info->qpair_count = info->next_qpair = 0;
244 	for (uint32 i = 0; i < NVME_MAX_QPAIRS && i < cstat.io_qpairs; i++) {
245 		info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr,
246 			(enum nvme_qprio)0, 0);
247 		if (info->qpairs[i].qpair == NULL)
248 			break;
249 
250 		info->qpair_count++;
251 	}
252 	if (info->qpair_count == 0) {
253 		TRACE_ERROR("failed to allocate qpairs!\n");
254 		nvme_ctrlr_close(info->ctrlr);
255 		return B_NO_MEMORY;
256 	}
257 
258 	// allocate DMA buffers
259 	int buffers = info->qpair_count * 2;
260 
261 	dma_restrictions restrictions = {};
262 	restrictions.alignment = B_PAGE_SIZE;
263 		// Technically, the first and last segments in a transfer can be
264 		// unaligned, and the rest only need to have sizes that are a multiple
265 		// of the block size.
266 	restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2);
267 	restrictions.max_transfer_size = cstat.max_xfer_size;
268 	info->max_io_blocks = cstat.max_xfer_size / nsstat.sector_size;
269 
270 	err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers);
271 	if (err != 0) {
272 		TRACE_ERROR("failed to initialize DMA resource!\n");
273 		nvme_ctrlr_close(info->ctrlr);
274 		return err;
275 	}
276 
277 	info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem");
278 	if (info->dma_buffers_sem < 0) {
279 		TRACE_ERROR("failed to create DMA buffers semaphore!\n");
280 		nvme_ctrlr_close(info->ctrlr);
281 		return info->dma_buffers_sem;
282 	}
283 
284 	// set up rounded-write lock
285 	rw_lock_init(&info->rounded_write_lock, "nvme rounded writes");
286 
287 	// set up interrupt
288 	if (get_module(B_PCI_X86_MODULE_NAME, (module_info**)&sPCIx86Module)
289 			!= B_OK) {
290 		sPCIx86Module = NULL;
291 	}
292 
293 	uint16 command = pci->read_pci_config(pcidev, PCI_command, 2);
294 	command &= ~(PCI_command_int_disable);
295 	pci->write_pci_config(pcidev, PCI_command, 2, command);
296 
297 	uint8 irq = info->info.u.h0.interrupt_line;
298 	if (sPCIx86Module != NULL) {
299 		if (sPCIx86Module->get_msix_count(info->info.bus, info->info.device,
300 				info->info.function)) {
301 			uint8 msixVector = 0;
302 			if (sPCIx86Module->configure_msix(info->info.bus, info->info.device,
303 					info->info.function, 1, &msixVector) == B_OK
304 				&& sPCIx86Module->enable_msix(info->info.bus, info->info.device,
305 					info->info.function) == B_OK) {
306 				TRACE_ALWAYS("using MSI-X\n");
307 				irq = msixVector;
308 			}
309 		} else if (sPCIx86Module->get_msi_count(info->info.bus,
310 				info->info.device, info->info.function) >= 1) {
311 			uint8 msiVector = 0;
312 			if (sPCIx86Module->configure_msi(info->info.bus, info->info.device,
313 					info->info.function, 1, &msiVector) == B_OK
314 				&& sPCIx86Module->enable_msi(info->info.bus, info->info.device,
315 					info->info.function) == B_OK) {
316 				TRACE_ALWAYS("using message signaled interrupts\n");
317 				irq = msiVector;
318 			}
319 		}
320 	}
321 
322 	if (irq == 0 || irq == 0xFF) {
323 		TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n",
324 			info->info.bus, info->info.device, info->info.function);
325 		return B_ERROR;
326 	}
327 	info->interrupt.Init(NULL, NULL);
328 	install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info, B_NO_HANDLED_INFO);
329 
330 	if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) {
331 		uint32 microseconds = 16, threshold = 32;
332 		nvme_admin_set_feature(info->ctrlr, false, NVME_FEAT_INTERRUPT_COALESCING,
333 			((microseconds / 100) << 8) | threshold, 0, NULL);
334 	}
335 
336 	*_cookie = info;
337 	return B_OK;
338 }
339 
340 
/*!	Tears down the per-device state set up in nvme_disk_init_device().

	NOTE(review): the handler is removed using the PCI interrupt_line,
	but init_device may have installed it on an MSI or MSI-X vector
	instead — presumably related to the deallocation TODOs below; verify
	before relying on repeated init/uninit cycles.
*/
static void
nvme_disk_uninit_device(void* _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;

	remove_io_interrupt_handler(info->info.u.h0.interrupt_line,
		nvme_interrupt_handler, (void*)info);

	rw_lock_destroy(&info->rounded_write_lock);

	nvme_ns_close(info->ns);
	nvme_ctrlr_close(info->ctrlr);

	// TODO: Deallocate MSI(-X).
	// TODO: Deallocate PCI.
}
358 
359 
360 static status_t
361 nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie)
362 {
363 	CALLED();
364 
365 	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
366 	nvme_disk_handle* handle = (nvme_disk_handle*)malloc(
367 		sizeof(nvme_disk_handle));
368 	if (handle == NULL)
369 		return B_NO_MEMORY;
370 
371 	handle->info = info;
372 
373 	*_cookie = handle;
374 	return B_OK;
375 }
376 
377 
378 static status_t
379 nvme_disk_close(void* cookie)
380 {
381 	CALLED();
382 
383 	//nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
384 	return B_OK;
385 }
386 
387 
388 static status_t
389 nvme_disk_free(void* cookie)
390 {
391 	CALLED();
392 
393 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
394 	free(handle);
395 	return B_OK;
396 }
397 
398 
399 // #pragma mark - I/O
400 
401 
402 static int32
403 nvme_interrupt_handler(void* _info)
404 {
405 	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
406 	info->interrupt.NotifyAll();
407 	return 0;
408 }
409 
410 
411 static qpair_info*
412 get_qpair(nvme_disk_driver_info* info)
413 {
414 	return &info->qpairs[atomic_add((int32*)&info->next_qpair, 1)
415 		% info->qpair_count];
416 }
417 
418 
419 static void
420 io_finished_callback(status_t* status, const struct nvme_cpl* cpl)
421 {
422 	*status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK;
423 }
424 
425 
/*!	Blocks until \a status leaves EINPROGRESS, i.e. until
	io_finished_callback() has run for the corresponding command.

	The qpair is polled directly in addition to waiting on the shared
	interrupt condition variable, since the completion may already be
	pending when we get here. After four consecutive 5-second timeouts
	the qpair is failed and \a status is forced to B_TIMED_OUT.
*/
static void
await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status)
{
	CALLED();

	ConditionVariableEntry entry;
	int timeouts = 0;
	while (status == EINPROGRESS) {
		// Register for notification first, so a completion arriving
		// between the poll and the wait cannot be missed.
		info->interrupt.Add(&entry);

		nvme_qpair_poll(qpair, 0);

		if (status != EINPROGRESS)
			return;

		if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) {
			// This should never happen, as we are woken up on every interrupt
			// no matter the qpair or transfer within; so if it does occur,
			// that probably means the controller stalled or something.

			TRACE_ERROR("timed out waiting for interrupt!\n");
			if (timeouts++ >= 3) {
				nvme_qpair_fail(qpair);
				status = B_TIMED_OUT;
				return;
			}
		}

		nvme_qpair_poll(qpair, 0);
	}
}
457 
458 
// Bookkeeping for one in-flight read or write, shared with the libnvme
// scatter/gather callbacks below.
struct nvme_io_request {
	status_t status;
		// EINPROGRESS while queued; set by io_finished_callback()

	bool write;
		// true for writes, false for reads

	off_t lba_start;
	size_t lba_count;
		// transfer window, in blocks

	physical_entry* iovecs;
	int32 iovec_count;
		// physical scatter/gather list backing the transfer

	int32 iovec_i;
	uint32 iovec_offset;
		// cursor maintained by ior_reset_sgl()/ior_next_sge()
};
473 
474 
475 void ior_reset_sgl(nvme_io_request* request, uint32_t offset)
476 {
477 	TRACE("IOR Reset: %" B_PRIu32 "\n", offset);
478 
479 	int32 i = 0;
480 	while (offset > 0 && request->iovecs[i].size <= offset) {
481 		offset -= request->iovecs[i].size;
482 		i++;
483 	}
484 	request->iovec_i = i;
485 	request->iovec_offset = offset;
486 }
487 
488 
489 int ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length)
490 {
491 	int32 index = request->iovec_i;
492 	if (index < 0 || index > request->iovec_count)
493 		return -1;
494 
495 	*address = request->iovecs[index].address + request->iovec_offset;
496 	*length = request->iovecs[index].size - request->iovec_offset;
497 
498 	TRACE("IOV %d (+ " B_PRIu32 "): 0x%" B_PRIx64 ", %" B_PRIu32 "\n",
499 		request->iovec_i, request->iovec_offset, *address, *length);
500 
501 	request->iovec_i++;
502 	request->iovec_offset = 0;
503 	return 0;
504 }
505 
506 
/*!	Queues \a request on one of the I/O qpairs and blocks until it
	completes. Returns the completion status (B_OK, B_IO_ERROR, or
	B_TIMED_OUT from await_status()), or the libnvme error if the command
	could not even be queued. On any failure, request->lba_count is reset
	to 0 so callers see no progress.
*/
static status_t
do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request)
{
	// io_finished_callback() flips this away from EINPROGRESS.
	request->status = EINPROGRESS;

	qpair_info* qpinfo = get_qpair(info);
	int ret = -1;
	if (request->write) {
		ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	} else {
		ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	}
	if (ret != 0) {
		TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
		return ret;
	}

	// Block until the completion callback has run.
	await_status(info, qpinfo->qpair, request->status);

	if (request->status != B_OK) {
		TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
	}
	return request->status;
}
545 
546 
/*!	Executes \a request through the DMA bounce-buffer path.

	Used whenever the request's buffers do not satisfy the controller's
	alignment/size restrictions, or the request itself is not
	block-aligned. Writes take rounded_write_lock exclusively, since a
	sub-block write turns into a read-modify-write of the enclosing
	blocks and must not interleave with direct writes.
*/
static status_t
nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request)
{
	CALLED();

	WriteLocker writeLocker;
	if (request->IsWrite())
		writeLocker.SetTo(handle->info->rounded_write_lock, false);

	// Wait for one of the bounce buffers to become available.
	status_t status = acquire_sem(handle->info->dma_buffers_sem);
	if (status != B_OK) {
		request->SetStatusAndNotify(status);
		return status;
	}

	const size_t block_size = handle->info->block_size;

	TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR
		"; Write %s\n", request, request->Offset(), request->Length(),
		request->IsWrite() ? "yes" : "no");

	nvme_io_request nvme_request;
	while (request->RemainingBytes() > 0) {
		// Translate the next chunk of the request into a DMA-safe
		// IOOperation backed by a bounce buffer.
		IOOperation operation;
		status = handle->info->dma_resource.TranslateNext(request, &operation, 0);
		if (status != B_OK)
			break;

		size_t transferredBytes = 0;
		do {
			TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR
				", write: %s\n", request, operation.Offset(),
				operation.Length(), operation.IsWrite() ? "yes" : "no");

			nvme_request.write = operation.IsWrite();
			nvme_request.lba_start = operation.Offset() / block_size;
			nvme_request.lba_count = operation.Length() / block_size;
			nvme_request.iovecs = (physical_entry*)operation.Vecs();
			nvme_request.iovec_count = operation.VecCount();

			status = do_nvme_io_request(handle->info, &nvme_request);
			// Only count bytes for phases whose direction matches the
			// request's (a write's read-modify-write read phase has
			// write == false and must not be counted).
			if (status == B_OK && nvme_request.write == request->IsWrite())
				transferredBytes += operation.OriginalLength();

			operation.SetStatus(status);
		} while (status == B_OK && !operation.Finish());

		if (status == B_OK && operation.Status() != B_OK) {
			TRACE_ERROR("I/O succeeded but IOOperation failed!\n");
			status = operation.Status();
		}

		operation.SetTransferredBytes(transferredBytes);
		request->OperationFinished(&operation, status, status != B_OK,
			operation.OriginalOffset() + transferredBytes);

		handle->info->dma_resource.RecycleBuffer(operation.Buffer());

		TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request,
			strerror(status), request->RemainingBytes());
		if (status != B_OK)
			break;
	}

	release_sem(handle->info->dma_buffers_sem);

	// Notify() also takes care of UnlockMemory().
	if (status != B_OK && request->Status() == B_OK)
		request->SetStatusAndNotify(status);
	else
		request->NotifyFinished();
	return status;
}
620 
621 
622 static status_t
623 nvme_disk_io(void* cookie, io_request* request)
624 {
625 	CALLED();
626 
627 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
628 
629 	nvme_io_request nvme_request;
630 	memset(&nvme_request, 0, sizeof(nvme_io_request));
631 
632 	nvme_request.write = request->IsWrite();
633 
634 	physical_entry* vtophys = NULL;
635 	MemoryDeleter vtophysDeleter;
636 
637 	IOBuffer* buffer = request->Buffer();
638 	status_t status = B_OK;
639 	if (!buffer->IsPhysical()) {
640 		status = buffer->LockMemory(request->TeamID(), request->IsWrite());
641 		if (status != B_OK) {
642 			TRACE_ERROR("failed to lock memory: %s\n", strerror(status));
643 			return status;
644 		}
645 		// SetStatusAndNotify() takes care of unlocking memory if necessary.
646 
647 		// This is slightly inefficient, as we could use a BStackOrHeapArray in
648 		// the optimal case (few physical entries required), but we would not
649 		// know whether or not that was possible until calling get_memory_map()
650 		// and then potentially reallocating, which would complicate the logic.
651 
652 		int32 vtophys_length = (request->Length() / B_PAGE_SIZE) + 2;
653 		nvme_request.iovecs = vtophys = (physical_entry*)malloc(sizeof(physical_entry)
654 			* vtophys_length);
655 		if (vtophys == NULL) {
656 			TRACE_ERROR("failed to allocate memory for iovecs\n");
657 			request->SetStatusAndNotify(B_NO_MEMORY);
658 			return B_NO_MEMORY;
659 		}
660 		vtophysDeleter.SetTo(vtophys);
661 
662 		for (size_t i = 0; i < buffer->VecCount(); i++) {
663 			generic_io_vec virt = buffer->VecAt(i);
664 			uint32 entries = vtophys_length - nvme_request.iovec_count;
665 
666 			// Avoid copies by going straight into the vtophys array.
667 			status = get_memory_map_etc(request->TeamID(), (void*)virt.base,
668 				virt.length, vtophys + nvme_request.iovec_count, &entries);
669 			if (status == B_BUFFER_OVERFLOW) {
670 				TRACE("vtophys array was too small, reallocating\n");
671 
672 				vtophysDeleter.Detach();
673 				vtophys_length *= 2;
674 				nvme_request.iovecs = vtophys = (physical_entry*)realloc(vtophys,
675 					sizeof(physical_entry) * vtophys_length);
676 				vtophysDeleter.SetTo(vtophys);
677 				if (vtophys == NULL) {
678 					status = B_NO_MEMORY;
679 				} else {
680 					// Try again, with the larger buffer this time.
681 					i--;
682 					continue;
683 				}
684 			}
685 			if (status != B_OK) {
686 				TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status));
687 				request->SetStatusAndNotify(status);
688 				return status;
689 			}
690 
691 			nvme_request.iovec_count += entries;
692 		}
693 	} else {
694 		nvme_request.iovecs = (physical_entry*)buffer->Vecs();
695 		nvme_request.iovec_count = buffer->VecCount();
696 	}
697 
698 	// See if we need to bounce anything other than the first or last vec.
699 	const size_t block_size = handle->info->block_size;
700 	bool bounceAll = false;
701 	for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) {
702 		if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0)
703 			bounceAll = true;
704 		if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0)
705 			bounceAll = true;
706 	}
707 
708 	// See if we need to bounce due to the first or last vec.
709 	if (nvme_request.iovec_count > 1) {
710 		physical_entry* entry = &nvme_request.iovecs[0];
711 		if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0
712 				|| (entry->size % block_size) != 0))
713 			bounceAll = true;
714 
715 		entry = &nvme_request.iovecs[nvme_request.iovec_count - 1];
716 		if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0
717 				|| (entry->size % block_size) != 0))
718 			bounceAll = true;
719 	}
720 
721 	// See if we need to bounce due to rounding.
722 	const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size);
723 	phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset()
724 		- rounded_pos), block_size);
725 	if (rounded_pos != request->Offset() || rounded_len != request->Length())
726 		bounceAll = true;
727 
728 	if (bounceAll) {
729 		// Let the bounced I/O routine take care of everything from here.
730 		return nvme_disk_bounced_io(handle, request);
731 	}
732 
733 	nvme_request.lba_start = rounded_pos / block_size;
734 	nvme_request.lba_count = rounded_len / block_size;
735 
736 	// No bouncing was required.
737 	ReadLocker readLocker;
738 	if (nvme_request.write)
739 		readLocker.SetTo(handle->info->rounded_write_lock, false);
740 
741 	// Error check before actually doing I/O.
742 	if (status != B_OK) {
743 		TRACE_ERROR("I/O failed early: %s\n", strerror(status));
744 		request->SetStatusAndNotify(status);
745 		return status;
746 	}
747 
748 	const uint32 max_io_blocks = handle->info->max_io_blocks;
749 	int32 remaining = nvme_request.iovec_count;
750 	while (remaining > 0) {
751 		nvme_request.iovec_count = min_c(remaining,
752 			NVME_MAX_SGL_DESCRIPTORS / 2);
753 
754 		nvme_request.lba_count = 0;
755 		for (int i = 0; i < nvme_request.iovec_count; i++) {
756 			int32 new_lba_count = nvme_request.lba_count
757 				+ (nvme_request.iovecs[i].size / block_size);
758 			if (nvme_request.lba_count > 0 && new_lba_count > max_io_blocks) {
759 				// We already have a nonzero length, and adding this vec would
760 				// make us go over (or we already are over.) Stop adding.
761 				nvme_request.iovec_count = i;
762 				break;
763 			}
764 
765 			nvme_request.lba_count = new_lba_count;
766 		}
767 
768 		status = do_nvme_io_request(handle->info, &nvme_request);
769 		if (status != B_OK)
770 			break;
771 
772 		nvme_request.iovecs += nvme_request.iovec_count;
773 		remaining -= nvme_request.iovec_count;
774 		nvme_request.lba_start += nvme_request.lba_count;
775 	}
776 
777 	if (status != B_OK)
778 		TRACE_ERROR("I/O failed: %s\n", strerror(status));
779 
780 	request->SetTransferredBytes(status != B_OK,
781 		(nvme_request.lba_start * block_size) - rounded_pos);
782 	request->SetStatusAndNotify(status);
783 	return status;
784 }
785 
786 
787 static status_t
788 nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length)
789 {
790 	CALLED();
791 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
792 
793 	const off_t end = (handle->info->capacity * handle->info->block_size);
794 	if (pos >= end)
795 		return B_BAD_VALUE;
796 	if (pos + (off_t)*length > end)
797 		*length = end - pos;
798 
799 	IORequest request;
800 	status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0);
801 	if (status != B_OK)
802 		return status;
803 
804 	status = nvme_disk_io(handle, &request);
805 	*length = request.TransferredBytes();
806 	return status;
807 }
808 
809 
810 static status_t
811 nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length)
812 {
813 	CALLED();
814 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
815 
816 	const off_t end = (handle->info->capacity * handle->info->block_size);
817 	if (pos >= end)
818 		return B_BAD_VALUE;
819 	if (pos + (off_t)*length > end)
820 		*length = end - pos;
821 
822 	IORequest request;
823 	status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0);
824 	if (status != B_OK)
825 		return status;
826 
827 	status = nvme_disk_io(handle, &request);
828 	*length = request.TransferredBytes();
829 	return status;
830 }
831 
832 
833 static status_t
834 nvme_disk_flush(nvme_disk_driver_info* info)
835 {
836 	status_t status = EINPROGRESS;
837 
838 	qpair_info* qpinfo = get_qpair(info);
839 	int ret = nvme_ns_flush(info->ns, qpinfo->qpair,
840 		(nvme_cmd_cb)io_finished_callback, &status);
841 	if (ret != 0)
842 		return ret;
843 
844 	await_status(info, qpinfo->qpair, status);
845 	return status;
846 }
847 
848 
849 static status_t
850 nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length)
851 {
852 	CALLED();
853 	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
854 	nvme_disk_driver_info* info = handle->info;
855 
856 	TRACE("ioctl(op = %" B_PRId32 ")\n", op);
857 
858 	switch (op) {
859 		case B_GET_MEDIA_STATUS:
860 		{
861 			*(status_t *)buffer = info->media_status;
862 			info->media_status = B_OK;
863 			return B_OK;
864 			break;
865 		}
866 
867 		case B_GET_DEVICE_SIZE:
868 		{
869 			size_t size = info->capacity * info->block_size;
870 			return user_memcpy(buffer, &size, sizeof(size_t));
871 		}
872 
873 		case B_GET_GEOMETRY:
874 		{
875 			if (buffer == NULL /*|| length != sizeof(device_geometry)*/)
876 				return B_BAD_VALUE;
877 
878 		 	device_geometry geometry;
879 			status_t status = get_geometry(handle, &geometry);
880 			if (status != B_OK)
881 				return status;
882 
883 			return user_memcpy(buffer, &geometry, sizeof(device_geometry));
884 		}
885 
886 		case B_GET_ICON_NAME:
887 			return user_strlcpy((char*)buffer, "devices/drive-harddisk",
888 				B_FILE_NAME_LENGTH);
889 
890 		case B_GET_VECTOR_ICON:
891 		{
892 			device_icon iconData;
893 			if (length != sizeof(device_icon))
894 				return B_BAD_VALUE;
895 			if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK)
896 				return B_BAD_ADDRESS;
897 
898 			if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) {
899 				if (user_memcpy(iconData.icon_data, kDriveIcon,
900 						sizeof(kDriveIcon)) != B_OK)
901 					return B_BAD_ADDRESS;
902 			}
903 
904 			iconData.icon_size = sizeof(kDriveIcon);
905 			return user_memcpy(buffer, &iconData, sizeof(device_icon));
906 		}
907 
908 		case B_FLUSH_DRIVE_CACHE:
909 			return nvme_disk_flush(info);
910 	}
911 
912 	return B_DEV_INVALID_IOCTL;
913 }
914 
915 
916 //	#pragma mark - driver module API
917 
918 
919 static float
920 nvme_disk_supports_device(device_node *parent)
921 {
922 	CALLED();
923 
924 	const char* bus;
925 	uint16 baseClass, subClass;
926 
927 	if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false) != B_OK
928 		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass, false) != B_OK
929 		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE, &subClass, false) != B_OK)
930 		return -1.0f;
931 
932 	if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage)
933 		return 0.0f;
934 
935 	if (subClass != PCI_nvm)
936 		return 0.0f;
937 
938 	TRACE("NVMe device found!\n");
939 	return 1.0f;
940 }
941 
942 
/*!	Registers the nvme_disk driver node below \a parent. No extra node
	attributes are needed — matching already happened in
	nvme_disk_supports_device().
*/
static status_t
nvme_disk_register_device(device_node* parent)
{
	CALLED();

	device_attr attrs[] = {
		{ NULL }
	};

	return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME,
		attrs, NULL, NULL);
}
955 
956 
957 static status_t
958 nvme_disk_init_driver(device_node* node, void** cookie)
959 {
960 	CALLED();
961 
962 	int ret = nvme_lib_init((enum nvme_log_level)0, (enum nvme_log_facility)0, NULL);
963 	if (ret != 0) {
964 		TRACE_ERROR("libnvme initialization failed!\n");
965 		return ret;
966 	}
967 
968 	nvme_disk_driver_info* info = new nvme_disk_driver_info;
969 	if (info == NULL)
970 		return B_NO_MEMORY;
971 
972 	info->media_status = B_OK;
973 	info->node = node;
974 
975 	info->ctrlr = NULL;
976 
977 	*cookie = info;
978 	return B_OK;
979 }
980 
981 
982 static void
983 nvme_disk_uninit_driver(void* _cookie)
984 {
985 	CALLED();
986 
987 	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
988 	free(info);
989 }
990 
991 
992 static status_t
993 nvme_disk_register_child_devices(void* _cookie)
994 {
995 	CALLED();
996 
997 	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
998 	status_t status;
999 
1000 	int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR);
1001 	if (id < 0)
1002 		return id;
1003 
1004 	char name[64];
1005 	snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw",
1006 		id);
1007 
1008 	status = sDeviceManager->publish_device(info->node, name,
1009 		NVME_DISK_DEVICE_MODULE_NAME);
1010 
1011 	return status;
1012 }
1013 
1014 
1015 //	#pragma mark -
1016 
1017 
// The device manager fills in sDeviceManager before any hook runs.
module_dependency module_dependencies[] = {
	{B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager},
	{}
};

// Hooks for the published /dev entry.
struct device_module_info sNvmeDiskDevice = {
	{
		NVME_DISK_DEVICE_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_init_device,
	nvme_disk_uninit_device,
	NULL, // remove,

	nvme_disk_open,
	nvme_disk_close,
	nvme_disk_free,
	nvme_disk_read,
	nvme_disk_write,
	nvme_disk_io,
	nvme_disk_ioctl,

	NULL,	// select
	NULL,	// deselect
};

// Hooks for device discovery and driver lifecycle.
struct driver_module_info sNvmeDiskDriver = {
	{
		NVME_DISK_DRIVER_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_supports_device,
	nvme_disk_register_device,
	nvme_disk_init_driver,
	nvme_disk_uninit_driver,
	nvme_disk_register_child_devices,
	NULL,	// rescan
	NULL,	// removed
};

// Exported module list for the kernel module loader.
module_info* modules[] = {
	(module_info*)&sNvmeDiskDriver,
	(module_info*)&sNvmeDiskDevice,
	NULL
};
1067