/*
 * Copyright 2019-2022, Haiku, Inc. All rights reserved.
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 *		Augustin Cavalier <waddlesplash>
 */


#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <condition_variable.h>
#include <AutoDeleter.h>
#include <kernel.h>
#include <smp.h>
#include <util/AutoLock.h>

#include <fs/devfs.h>
#include <bus/PCI.h>
#include <vm/vm.h>

#include "IORequest.h"

extern "C" {
#include <libnvme/nvme.h>
#include <libnvme/nvme_internal.h>
}


//#define TRACE_NVME_DISK
#ifdef TRACE_NVME_DISK
#	define TRACE(x...) dprintf("nvme_disk: " x)
#else
#	define TRACE(x...) ;
#endif
#define TRACE_ALWAYS(x...)	dprintf("nvme_disk: " x)
#define TRACE_ERROR(x...)	dprintf("\33[33mnvme_disk:\33[0m " x)
#define CALLED() 			TRACE("CALLED %s\n", __PRETTY_FUNCTION__)


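// HVIF vector icon data (note the "ncif" magic in the first four bytes),
// returned by the B_GET_VECTOR_ICON ioctl for the published disk devices.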
static const uint8 kDriveIcon[] = {
	0x6e, 0x63, 0x69, 0x66, 0x08, 0x03, 0x01, 0x00, 0x00, 0x02, 0x00, 0x16,
	0x02, 0x3c, 0xc7, 0xee, 0x38, 0x9b, 0xc0, 0xba, 0x16, 0x57, 0x3e, 0x39,
	0xb0, 0x49, 0x77, 0xc8, 0x42, 0xad, 0xc7, 0x00, 0xff, 0xff, 0xd3, 0x02,
	0x00, 0x06, 0x02, 0x3c, 0x96, 0x32, 0x3a, 0x4d, 0x3f, 0xba, 0xfc, 0x01,
	0x3d, 0x5a, 0x97, 0x4b, 0x57, 0xa5, 0x49, 0x84, 0x4d, 0x00, 0x47, 0x47,
	0x47, 0xff, 0xa5, 0xa0, 0xa0, 0x02, 0x00, 0x16, 0x02, 0xbc, 0x59, 0x2f,
	0xbb, 0x29, 0xa7, 0x3c, 0x0c, 0xe4, 0xbd, 0x0b, 0x7c, 0x48, 0x92, 0xc0,
	0x4b, 0x79, 0x66, 0x00, 0x7d, 0xff, 0xd4, 0x02, 0x00, 0x06, 0x02, 0x38,
	0xdb, 0xb4, 0x39, 0x97, 0x33, 0xbc, 0x4a, 0x33, 0x3b, 0xa5, 0x42, 0x48,
	0x6e, 0x66, 0x49, 0xee, 0x7b, 0x00, 0x59, 0x67, 0x56, 0xff, 0xeb, 0xb2,
	0xb2, 0x03, 0xa7, 0xff, 0x00, 0x03, 0xff, 0x00, 0x00, 0x04, 0x01, 0x80,
	0x07, 0x0a, 0x06, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x5a, 0x3e, 0x5a,
	0x31, 0x39, 0x25, 0x0a, 0x04, 0x22, 0x3c, 0x44, 0x4b, 0x5a, 0x31, 0x39,
	0x25, 0x0a, 0x04, 0x44, 0x4b, 0x44, 0x5b, 0x5a, 0x3e, 0x5a, 0x31, 0x0a,
	0x04, 0x22, 0x3c, 0x22, 0x49, 0x44, 0x5b, 0x44, 0x4b, 0x08, 0x02, 0x27,
	0x43, 0xb8, 0x14, 0xc1, 0xf1, 0x08, 0x02, 0x26, 0x43, 0x29, 0x44, 0x0a,
	0x05, 0x44, 0x5d, 0x49, 0x5d, 0x60, 0x3e, 0x5a, 0x3b, 0x5b, 0x3f, 0x08,
	0x0a, 0x07, 0x01, 0x06, 0x00, 0x0a, 0x00, 0x01, 0x00, 0x10, 0x01, 0x17,
	0x84, 0x00, 0x04, 0x0a, 0x01, 0x01, 0x01, 0x00, 0x0a, 0x02, 0x01, 0x02,
	0x00, 0x0a, 0x03, 0x01, 0x03, 0x00, 0x0a, 0x04, 0x01, 0x04, 0x10, 0x01,
	0x17, 0x85, 0x20, 0x04, 0x0a, 0x06, 0x01, 0x05, 0x30, 0x24, 0xb3, 0x99,
	0x01, 0x17, 0x82, 0x00, 0x04, 0x0a, 0x05, 0x01, 0x05, 0x30, 0x20, 0xb2,
	0xe6, 0x01, 0x17, 0x82, 0x00, 0x04
};


#define NVME_DISK_DRIVER_MODULE_NAME 	"drivers/disk/nvme_disk/driver_v1"
#define NVME_DISK_DEVICE_MODULE_NAME 	"drivers/disk/nvme_disk/device_v1"
#define NVME_DISK_DEVICE_ID_GENERATOR	"nvme_disk/device_id"

#define NVME_MAX_QPAIRS					(16)


static device_manager_info* sDeviceManager;

typedef struct {
	device_node*			node;
	pci_info				info;

	struct nvme_ctrlr*		ctrlr;

	struct nvme_ns*			ns;
	uint64					capacity;
	uint32					block_size;
	uint32					max_io_blocks;
	status_t				media_status;

	DMAResource				dma_resource;
	sem_id					dma_buffers_sem;

	rw_lock					rounded_write_lock;

	ConditionVariable		interrupt;
	int32					polling;

	struct qpair_info {
		struct nvme_qpair*	qpair;
	}						qpairs[NVME_MAX_QPAIRS];
	uint32					qpair_count;
} nvme_disk_driver_info;
typedef nvme_disk_driver_info::qpair_info qpair_info;


typedef struct {
	nvme_disk_driver_info*		info;
} nvme_disk_handle;


static status_t
get_geometry(nvme_disk_handle* handle, device_geometry* geometry)
{
	nvme_disk_driver_info* info = handle->info;

	devfs_compute_geometry_size(geometry, info->capacity, info->block_size);
	geometry->bytes_per_physical_sector = info->block_size;

	geometry->device_type = B_DISK;
	geometry->removable = false;

	geometry->read_only = false;
	geometry->write_once = false;

	TRACE("get_geometry(): %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %" B_PRId32 ", %d, %d, %d, %d\n",
		geometry->bytes_per_sector, geometry->sectors_per_track,
		geometry->cylinder_count, geometry->head_count, geometry->device_type,
		geometry->removable, geometry->read_only, geometry->write_once);

	return B_OK;
}


static void
nvme_disk_set_capacity(nvme_disk_driver_info* info, uint64 capacity,
	uint32 blockSize)
{
	TRACE("set_capacity(device = %p, capacity = %" B_PRIu64 ", blockSize = %" B_PRIu32 ")\n",
		info, capacity, blockSize);

	info->capacity = capacity;
	info->block_size = blockSize;
}


//	#pragma mark - device module API


static int32 nvme_interrupt_handler(void* _info);


static status_t
nvme_disk_init_device(void* _info, void** _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	ASSERT(info->ctrlr == NULL);

	pci_device_module_info* pci;
	pci_device* pcidev;
	device_node* parent = sDeviceManager->get_parent_node(info->node);
	sDeviceManager->get_driver(parent, (driver_module_info**)&pci,
		(void**)&pcidev);
	pci->get_pci_info(pcidev, &info->info);
	sDeviceManager->put_node(parent);

	// construct the libnvme pci_device struct
	pci_device* device = new pci_device;
	device->vendor_id = info->info.vendor_id;
	device->device_id = info->info.device_id;
	device->subvendor_id = 0;
	device->subdevice_id = 0;

	device->domain = 0;
	device->bus = info->info.bus;
	device->dev = info->info.device;
	device->func = info->info.function;

	device->pci_info = &info->info;

	// enable busmaster and memory mapped access
	uint16 command = pci->read_pci_config(pcidev, PCI_command, 2);
	command |= PCI_command_master | PCI_command_memory;
	pci->write_pci_config(pcidev, PCI_command, 2, command);

	// open the controller
	info->ctrlr = nvme_ctrlr_open(device, NULL);
	if (info->ctrlr == NULL) {
		TRACE_ERROR("failed to open the controller!\n");
		return B_ERROR;
	}

	struct nvme_ctrlr_stat cstat;
	int err = nvme_ctrlr_stat(info->ctrlr, &cstat);
	if (err != 0) {
		TRACE_ERROR("failed to get controller information!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	TRACE_ALWAYS("attached to NVMe device \"%s (%s)\"\n", cstat.mn, cstat.sn);
	TRACE_ALWAYS("\tmaximum transfer size: %" B_PRIuSIZE "\n", cstat.max_xfer_size);
	TRACE_ALWAYS("\tqpair count: %d\n", cstat.io_qpairs);

	// TODO: export more than just the first namespace!
	info->ns = nvme_ns_open(info->ctrlr, cstat.ns_ids[0]);
	if (info->ns == NULL) {
		TRACE_ERROR("failed to open namespace!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_ERROR;
	}
	TRACE_ALWAYS("namespace 0\n");

	struct nvme_ns_stat nsstat;
	err = nvme_ns_stat(info->ns, &nsstat);
	if (err != 0) {
		TRACE_ERROR("failed to get namespace information!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	// store capacity information
	TRACE_ALWAYS("\tblock size: %" B_PRIuSIZE ", stripe size: %u\n",
		nsstat.sector_size, info->ns->stripe_size);
	nvme_disk_set_capacity(info, nsstat.sectors, nsstat.sector_size);

	command = pci->read_pci_config(pcidev, PCI_command, 2);
	command &= ~(PCI_command_int_disable);
	pci->write_pci_config(pcidev, PCI_command, 2, command);

	uint32 irq = info->info.u.h0.interrupt_line;
	if (irq == 0xFF)
		irq = 0;

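	// Prefer MSI-X if available, then plain MSI; otherwise fall back to the
	// legacy interrupt line read above.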
	if (pci->get_msix_count(pcidev)) {
		uint32 msixVector = 0;
		if (pci->configure_msix(pcidev, 1, &msixVector) == B_OK
			&& pci->enable_msix(pcidev) == B_OK) {
			TRACE_ALWAYS("using MSI-X\n");
			irq = msixVector;
		}
	} else if (pci->get_msi_count(pcidev) >= 1) {
		uint32 msiVector = 0;
		if (pci->configure_msi(pcidev, 1, &msiVector) == B_OK
			&& pci->enable_msi(pcidev) == B_OK) {
			TRACE_ALWAYS("using message signaled interrupts\n");
			irq = msiVector;
		}
	}

	if (irq == 0) {
		TRACE_ERROR("device PCI:%d:%d:%d was assigned an invalid IRQ\n",
			info->info.bus, info->info.device, info->info.function);
		info->polling = 1;
	} else {
		info->polling = 0;
	}
	info->interrupt.Init(NULL, NULL);
	install_io_interrupt_handler(irq, nvme_interrupt_handler, (void*)info,
		B_NO_HANDLED_INFO);

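	// Enable interrupt coalescing, if supported. Note that the NVMe spec
	// defines the aggregation time in 100-microsecond increments, so the
	// 16-microsecond value below rounds down to 0 and only the threshold
	// takes effect.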
	if (info->ctrlr->feature_supported[NVME_FEAT_INTERRUPT_COALESCING]) {
		uint32 microseconds = 16, threshold = 32;
		nvme_admin_set_feature(info->ctrlr, false, NVME_FEAT_INTERRUPT_COALESCING,
			((microseconds / 100) << 8) | threshold, 0, NULL);
	}

	// allocate qpairs
	uint32 try_qpairs = cstat.io_qpairs;
	try_qpairs = min_c(try_qpairs, NVME_MAX_QPAIRS);
	if (try_qpairs >= (uint32)smp_get_num_cpus()) {
		try_qpairs = smp_get_num_cpus();
	} else {
		// Find the highest number of qpairs that evenly divides the number of CPUs.
		while ((smp_get_num_cpus() % try_qpairs) != 0)
			try_qpairs--;
	}
	info->qpair_count = 0;
	for (uint32 i = 0; i < try_qpairs; i++) {
		info->qpairs[i].qpair = nvme_ioqp_get(info->ctrlr,
			(enum nvme_qprio)0, 0);
		if (info->qpairs[i].qpair == NULL)
			break;

		info->qpair_count++;
	}
	if (info->qpair_count == 0) {
		TRACE_ERROR("failed to allocate qpairs!\n");
		nvme_ctrlr_close(info->ctrlr);
		return B_NO_MEMORY;
	}
	if (info->qpair_count != try_qpairs) {
		TRACE_ALWAYS("warning: did not get expected number of qpairs\n");
	}

	// allocate DMA buffers
	int buffers = info->qpair_count * 2;

	dma_restrictions restrictions = {};
	restrictions.alignment = B_PAGE_SIZE;
		// Technically, the first and last segments of a transfer need only
		// be 32-bit aligned, and the middle segments only need sizes that
		// are a multiple of the block size.
	restrictions.max_segment_count = (NVME_MAX_SGL_DESCRIPTORS / 2);
	restrictions.max_transfer_size = cstat.max_xfer_size;
	info->max_io_blocks = cstat.max_xfer_size / nsstat.sector_size;

	err = info->dma_resource.Init(restrictions, B_PAGE_SIZE, buffers, buffers);
	if (err != 0) {
		TRACE_ERROR("failed to initialize DMA resource!\n");
		nvme_ctrlr_close(info->ctrlr);
		return err;
	}

	info->dma_buffers_sem = create_sem(buffers, "nvme buffers sem");
	if (info->dma_buffers_sem < 0) {
		TRACE_ERROR("failed to create DMA buffers semaphore!\n");
		nvme_ctrlr_close(info->ctrlr);
		return info->dma_buffers_sem;
	}

	// set up rounded-write lock
	rw_lock_init(&info->rounded_write_lock, "nvme rounded writes");

	*_cookie = info;
	return B_OK;
}


static void
nvme_disk_uninit_device(void* _cookie)
{
	CALLED();
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;

	remove_io_interrupt_handler(info->info.u.h0.interrupt_line,
		nvme_interrupt_handler, (void*)info);

	rw_lock_destroy(&info->rounded_write_lock);

	nvme_ns_close(info->ns);
	nvme_ctrlr_close(info->ctrlr);

	// TODO: Deallocate MSI(-X).
	// TODO: Deallocate PCI.
}


static status_t
nvme_disk_open(void* _info, const char* path, int openMode, void** _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	nvme_disk_handle* handle = (nvme_disk_handle*)malloc(
		sizeof(nvme_disk_handle));
	if (handle == NULL)
		return B_NO_MEMORY;

	handle->info = info;

	*_cookie = handle;
	return B_OK;
}


static status_t
nvme_disk_close(void* cookie)
{
	CALLED();

	//nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	return B_OK;
}


static status_t
nvme_disk_free(void* cookie)
{
	CALLED();

	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	free(handle);
	return B_OK;
}


// #pragma mark - I/O


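// Notifies all waiters: we cannot tell which qpair (or transfer) the
// interrupt was for. Also resets the polling-mode fallback, since the
// controller is evidently able to generate interrupts.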
static int32
nvme_interrupt_handler(void* _info)
{
	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_info;
	info->interrupt.NotifyAll();
	info->polling = -1;
	return 0;
}


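// Selects a qpair based on the current CPU. Allocation aims for a qpair
// count that evenly divides the number of CPUs (see nvme_disk_init_device),
// so I/O from different CPUs is spread across all qpairs.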
static qpair_info*
get_qpair(nvme_disk_driver_info* info)
{
	return &info->qpairs[smp_get_current_cpu() % info->qpair_count];
}


static void
io_finished_callback(status_t* status, const struct nvme_cpl* cpl)
{
	*status = nvme_cpl_is_error(cpl) ? B_IO_ERROR : B_OK;
}


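// Waits for "status" to leave EINPROGRESS (it is set by
// io_finished_callback) by polling the qpair and sleeping on the interrupt
// condition variable. In polling mode, waits back off exponentially; in
// interrupt mode, repeated timeouts first switch the device to polling mode
// and eventually fail the qpair with B_TIMED_OUT.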
static void
await_status(nvme_disk_driver_info* info, struct nvme_qpair* qpair, status_t& status)
{
	CALLED();

	ConditionVariableEntry entry;
	int timeouts = 0;
	while (status == EINPROGRESS) {
		info->interrupt.Add(&entry);

		nvme_qpair_poll(qpair, 0);

		if (status != EINPROGRESS)
			return;

		if (info->polling > 0) {
			entry.Wait(B_RELATIVE_TIMEOUT, min_c(5 * 1000 * 1000,
				(1 << timeouts) * 1000));
			timeouts++;
		} else if (entry.Wait(B_RELATIVE_TIMEOUT, 5 * 1000 * 1000) != B_OK) {
			// This should never happen, as we are woken up on every interrupt
			// no matter the qpair or transfer within; so if it does occur,
			// that probably means the controller stalled, or maybe cannot
			// generate interrupts at all.

			TRACE_ERROR("timed out waiting for interrupt!\n");
			if (timeouts++ >= 3) {
				nvme_qpair_fail(qpair);
				status = B_TIMED_OUT;
				return;
			}

			info->polling++;
			if (info->polling > 0) {
				TRACE_ALWAYS("switching to polling mode, performance will be affected!\n");
			}
		}

		nvme_qpair_poll(qpair, 0);
	}
}


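// State for one in-flight read or write: the LBA range, the physical vectors
// to transfer, and a cursor (iovec_i/iovec_offset) advanced by the SGL
// callbacks below.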
struct nvme_io_request {
	status_t status;

	bool write;

	off_t lba_start;
	size_t lba_count;

	physical_entry* iovecs;
	int32 iovec_count;

	int32 iovec_i;
	uint32 iovec_offset;
};


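// libnvme scatter-gather callbacks: ior_reset_sgl() seeks the cursor to a
// byte offset within the request, and ior_next_sge() yields the next
// physical segment from there.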
static void
ior_reset_sgl(nvme_io_request* request, uint32_t offset)
{
	TRACE("IOR Reset: %" B_PRIu32 "\n", offset);

	int32 i = 0;
	while (offset > 0 && request->iovecs[i].size <= offset) {
		offset -= request->iovecs[i].size;
		i++;
	}
	request->iovec_i = i;
	request->iovec_offset = offset;
}


static int
ior_next_sge(nvme_io_request* request, uint64_t* address, uint32_t* length)
{
	int32 index = request->iovec_i;
	if (index < 0 || index >= request->iovec_count)
		return -1;

	*address = request->iovecs[index].address + request->iovec_offset;
	*length = request->iovecs[index].size - request->iovec_offset;

	TRACE("IOV %" B_PRId32 " (+ %" B_PRIu32 "): 0x%" B_PRIx64 ", %" B_PRIu32 "\n",
		request->iovec_i, request->iovec_offset, *address, *length);

	request->iovec_i++;
	request->iovec_offset = 0;
	return 0;
}


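// Submits the request on one of this device's qpairs and waits synchronously
// for completion. On failure, lba_count is zeroed to indicate that nothing
// was transferred.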
static status_t
do_nvme_io_request(nvme_disk_driver_info* info, nvme_io_request* request)
{
	request->status = EINPROGRESS;

	qpair_info* qpinfo = get_qpair(info);
	int ret = -1;
	if (request->write) {
		ret = nvme_ns_writev(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	} else {
		ret = nvme_ns_readv(info->ns, qpinfo->qpair, request->lba_start,
			request->lba_count, (nvme_cmd_cb)io_finished_callback, request,
			0, (nvme_req_reset_sgl_cb)ior_reset_sgl,
			(nvme_req_next_sge_cb)ior_next_sge);
	}
	if (ret != 0) {
		TRACE_ERROR("attempt to queue %s I/O at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
		return ret;
	}

	await_status(info, qpinfo->qpair, request->status);

	if (request->status != B_OK) {
		TRACE_ERROR("%s at LBA %" B_PRIdOFF " of %" B_PRIuSIZE
			" blocks failed!\n", request->write ? "write" : "read",
			request->lba_start, request->lba_count);

		request->lba_count = 0;
	}
	return request->status;
}


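// Fallback path for requests that violate the controller's alignment or
// size restrictions: the DMA resource translates them through bounce
// buffers. Writes take the rounded-write lock exclusively so that their
// read-modify-write of partial blocks cannot race with direct writes.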
static status_t
nvme_disk_bounced_io(nvme_disk_handle* handle, io_request* request)
{
	CALLED();

	WriteLocker writeLocker;
	if (request->IsWrite())
		writeLocker.SetTo(handle->info->rounded_write_lock, false);

	status_t status = acquire_sem(handle->info->dma_buffers_sem);
	if (status != B_OK) {
		request->SetStatusAndNotify(status);
		return status;
	}

	const size_t block_size = handle->info->block_size;

	TRACE("%p: IOR Offset: %" B_PRIdOFF "; Length %" B_PRIuGENADDR
		"; Write %s\n", request, request->Offset(), request->Length(),
		request->IsWrite() ? "yes" : "no");

	nvme_io_request nvme_request;
	while (request->RemainingBytes() > 0) {
		IOOperation operation;
		status = handle->info->dma_resource.TranslateNext(request, &operation, 0);
		if (status != B_OK)
			break;

		do {
			TRACE("%p: IOO offset: %" B_PRIdOFF ", length: %" B_PRIuGENADDR
				", write: %s\n", request, operation.Offset(),
				operation.Length(), operation.IsWrite() ? "yes" : "no");

			nvme_request.write = operation.IsWrite();
			nvme_request.lba_start = operation.Offset() / block_size;
			nvme_request.lba_count = operation.Length() / block_size;
			nvme_request.iovecs = (physical_entry*)operation.Vecs();
			nvme_request.iovec_count = operation.VecCount();

			status = do_nvme_io_request(handle->info, &nvme_request);

			operation.SetStatus(status,
				status == B_OK ? operation.Length() : 0);
		} while (status == B_OK && !operation.Finish());

		if (status == B_OK && operation.Status() != B_OK) {
			TRACE_ERROR("I/O succeeded but IOOperation failed!\n");
			status = operation.Status();
		}

		request->OperationFinished(&operation);

		handle->info->dma_resource.RecycleBuffer(operation.Buffer());

		TRACE("%p: status %s, remaining bytes %" B_PRIuGENADDR "\n", request,
			strerror(status), request->RemainingBytes());
		if (status != B_OK)
			break;
	}

	release_sem(handle->info->dma_buffers_sem);

	// Notify() also takes care of UnlockMemory().
	if (status != B_OK && request->Status() == B_OK)
		request->SetStatusAndNotify(status);
	else
		request->NotifyFinished();
	return status;
}


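// Main I/O entry point. Maps the request's buffers to physical vectors,
// checks them against the controller's restrictions, and either submits the
// I/O directly (split into multiple commands if needed) or falls back to
// nvme_disk_bounced_io().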
static status_t
nvme_disk_io(void* cookie, io_request* request)
{
	CALLED();

	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	const off_t ns_end = (handle->info->capacity * handle->info->block_size);
	if ((request->Offset() + (off_t)request->Length()) > ns_end)
		return ERANGE;

	nvme_io_request nvme_request;
	memset(&nvme_request, 0, sizeof(nvme_io_request));

	nvme_request.write = request->IsWrite();

	physical_entry* vtophys = NULL;
	MemoryDeleter vtophysDeleter;

	IOBuffer* buffer = request->Buffer();
	status_t status = B_OK;
	if (!buffer->IsPhysical()) {
		status = buffer->LockMemory(request->TeamID(), request->IsWrite());
		if (status != B_OK) {
			TRACE_ERROR("failed to lock memory: %s\n", strerror(status));
			return status;
		}
		// SetStatusAndNotify() takes care of unlocking memory if necessary.

		// This is slightly inefficient, as we could use a BStackOrHeapArray in
		// the optimal case (few physical entries required), but we would not
		// know whether or not that was possible until calling get_memory_map()
		// and then potentially reallocating, which would complicate the logic.

		int32 vtophys_length = (request->Length() / B_PAGE_SIZE) + 2;
		nvme_request.iovecs = vtophys = (physical_entry*)malloc(sizeof(physical_entry)
			* vtophys_length);
		if (vtophys == NULL) {
			TRACE_ERROR("failed to allocate memory for iovecs\n");
			request->SetStatusAndNotify(B_NO_MEMORY);
			return B_NO_MEMORY;
		}
		vtophysDeleter.SetTo(vtophys);

		for (size_t i = 0; i < buffer->VecCount(); i++) {
			generic_io_vec virt = buffer->VecAt(i);
			uint32 entries = vtophys_length - nvme_request.iovec_count;

			// Avoid copies by going straight into the vtophys array.
			status = get_memory_map_etc(request->TeamID(), (void*)virt.base,
				virt.length, vtophys + nvme_request.iovec_count, &entries);

			if (status == B_BAD_VALUE && entries == 0)
				status = B_BUFFER_OVERFLOW;
			if (status == B_BUFFER_OVERFLOW) {
				TRACE("vtophys array was too small, reallocating\n");

				vtophys_length *= 2;
				nvme_request.iovecs = vtophys = (physical_entry*)realloc(vtophys,
					sizeof(physical_entry) * vtophys_length);
				if (vtophys != NULL) {
					vtophysDeleter.Detach();
					vtophysDeleter.SetTo(vtophys);

					// Try again, with the larger buffer this time.
					i--;
					continue;
				} else {
					status = B_NO_MEMORY;
				}
			}
			if (status != B_OK) {
				TRACE_ERROR("I/O get_memory_map failed: %s\n", strerror(status));
				request->SetStatusAndNotify(status);
				return status;
			}

			nvme_request.iovec_count += entries;
		}
	} else {
		nvme_request.iovecs = (physical_entry*)buffer->Vecs();
		nvme_request.iovec_count = buffer->VecCount();
	}

	// See if we need to bounce anything other than the first or last vec.
	const size_t block_size = handle->info->block_size;
	bool bounceAll = false;
	for (int32 i = 1; !bounceAll && i < (nvme_request.iovec_count - 1); i++) {
		if ((nvme_request.iovecs[i].address % B_PAGE_SIZE) != 0)
			bounceAll = true;
		if ((nvme_request.iovecs[i].size % B_PAGE_SIZE) != 0)
			bounceAll = true;
	}

	// See if we need to bounce due to the first or last vecs.
	if (nvme_request.iovec_count > 1) {
		// There are middle vecs, so the first and last vecs have different
		// restrictions: their sizes need only be a multiple of the block
		// size, but the first must end (and the last begin) on a page
		// boundary, and the first vec's start address must be
		// 32-bit-aligned.
		physical_entry* entry = &nvme_request.iovecs[0];
		if (!bounceAll && (((entry->address + entry->size) % B_PAGE_SIZE) != 0
				|| (entry->address & 0x3) != 0 || (entry->size % block_size) != 0))
			bounceAll = true;

		entry = &nvme_request.iovecs[nvme_request.iovec_count - 1];
		if (!bounceAll && ((entry->address % B_PAGE_SIZE) != 0
				|| (entry->size % block_size) != 0))
			bounceAll = true;
	} else {
		// There is only one vec. Check that it is a multiple of the block size,
		// and that its address is 32-bit-aligned.
		physical_entry* entry = &nvme_request.iovecs[0];
		if (!bounceAll && ((entry->address & 0x3) != 0 || (entry->size % block_size) != 0))
			bounceAll = true;
	}

	// See if we need to bounce due to rounding.
	const off_t rounded_pos = ROUNDDOWN(request->Offset(), block_size);
	phys_size_t rounded_len = ROUNDUP(request->Length() + (request->Offset()
		- rounded_pos), block_size);
	if (rounded_pos != request->Offset() || rounded_len != request->Length())
		bounceAll = true;

	if (bounceAll) {
		// Let the bounced I/O routine take care of everything from here.
		return nvme_disk_bounced_io(handle, request);
	}

	nvme_request.lba_start = rounded_pos / block_size;
	nvme_request.lba_count = rounded_len / block_size;

	// No bouncing was required.
	ReadLocker readLocker;
	if (nvme_request.write)
		readLocker.SetTo(handle->info->rounded_write_lock, false);

	// Error check before actually doing I/O.
	if (status != B_OK) {
		TRACE_ERROR("I/O failed early: %s\n", strerror(status));
		request->SetStatusAndNotify(status);
		return status;
	}

	const uint32 max_io_blocks = handle->info->max_io_blocks;
	int32 remaining = nvme_request.iovec_count;
	while (remaining > 0) {
		nvme_request.iovec_count = min_c(remaining,
			NVME_MAX_SGL_DESCRIPTORS / 2);

		nvme_request.lba_count = 0;
		for (int i = 0; i < nvme_request.iovec_count; i++) {
			uint32 new_lba_count = nvme_request.lba_count
				+ (nvme_request.iovecs[i].size / block_size);
			if (nvme_request.lba_count > 0 && new_lba_count > max_io_blocks) {
				// We already have a nonzero length, and adding this vec would
				// make us go over (or we already are over.) Stop adding.
				nvme_request.iovec_count = i;
				break;
			}

			nvme_request.lba_count = new_lba_count;
		}

		status = do_nvme_io_request(handle->info, &nvme_request);
		if (status != B_OK)
			break;

		nvme_request.iovecs += nvme_request.iovec_count;
		remaining -= nvme_request.iovec_count;
		nvme_request.lba_start += nvme_request.lba_count;
	}

	if (status != B_OK)
		TRACE_ERROR("I/O failed: %s\n", strerror(status));

	request->SetTransferredBytes(status != B_OK,
		(nvme_request.lba_start * block_size) - rounded_pos);
	request->SetStatusAndNotify(status);
	return status;
}


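// The read() and write() hooks wrap the call in a synchronous IORequest and
// funnel it through nvme_disk_io(), clamping the length at the namespace
// end.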
static status_t
nvme_disk_read(void* cookie, off_t pos, void* buffer, size_t* length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	const off_t ns_end = (handle->info->capacity * handle->info->block_size);
	if (pos >= ns_end)
		return B_BAD_VALUE;
	if ((pos + (off_t)*length) > ns_end)
		*length = ns_end - pos;

	IORequest request;
	status_t status = request.Init(pos, (addr_t)buffer, *length, false, 0);
	if (status != B_OK)
		return status;

	status = nvme_disk_io(handle, &request);
	*length = request.TransferredBytes();
	return status;
}


static status_t
nvme_disk_write(void* cookie, off_t pos, const void* buffer, size_t* length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;

	const off_t ns_end = (handle->info->capacity * handle->info->block_size);
	if (pos >= ns_end)
		return B_BAD_VALUE;
	if ((pos + (off_t)*length) > ns_end)
		*length = ns_end - pos;

	IORequest request;
	status_t status = request.Init(pos, (addr_t)buffer, *length, true, 0);
	if (status != B_OK)
		return status;

	status = nvme_disk_io(handle, &request);
	*length = request.TransferredBytes();
	return status;
}


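// Issues an NVMe FLUSH command and waits for it to complete.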
static status_t
nvme_disk_flush(nvme_disk_driver_info* info)
{
	CALLED();
	status_t status = EINPROGRESS;

	qpair_info* qpinfo = get_qpair(info);
	int ret = nvme_ns_flush(info->ns, qpinfo->qpair,
		(nvme_cmd_cb)io_finished_callback, &status);
	if (ret != 0)
		return ret;

	await_status(info, qpinfo->qpair, status);
	return status;
}


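// Translates B_TRIM_DEVICE data into an NVMe Dataset Management "deallocate"
// command. Range offsets are rounded inward to block boundaries, so some
// bytes at the edges of each range may not be trimmed.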
static status_t
nvme_disk_trim(nvme_disk_driver_info* info, fs_trim_data* trimData)
{
	CALLED();
	trimData->trimmed_size = 0;

	const off_t deviceSize = info->capacity * info->block_size; // in bytes
	if (deviceSize < 0)
		return B_BAD_VALUE;

	STATIC_ASSERT(sizeof(deviceSize) <= sizeof(uint64));
	ASSERT(deviceSize >= 0);

	// Do not trim past device end.
	for (uint32 i = 0; i < trimData->range_count; i++) {
		uint64 offset = trimData->ranges[i].offset;
		uint64& size = trimData->ranges[i].size;

		if (offset >= (uint64)deviceSize)
			return B_BAD_VALUE;
		size = std::min(size, (uint64)deviceSize - offset);
	}

	// We need contiguous memory for the DSM ranges.
	nvme_dsm_range* dsmRanges = (nvme_dsm_range*)nvme_mem_alloc_node(
		trimData->range_count * sizeof(nvme_dsm_range), 0, 0, NULL);
	if (dsmRanges == NULL)
		return B_NO_MEMORY;
	CObjectDeleter<void, void, nvme_free> dsmRangesDeleter(dsmRanges);

	uint64 trimmingSize = 0;
	uint32 dsmRangeCount = 0;
	for (uint32 i = 0; i < trimData->range_count; i++) {
		uint64 offset = trimData->ranges[i].offset;
		uint64 length = trimData->ranges[i].size;

		// Round the offset up and the length down to the block size.
		// (Some space at the beginning and end may thus not be trimmed.)
		offset = ROUNDUP(offset, info->block_size);
		if ((offset - trimData->ranges[i].offset) >= length)
			continue;
		length -= offset - trimData->ranges[i].offset;
		length = ROUNDDOWN(length, info->block_size);

		if (length == 0)
			continue;
		if ((length / info->block_size) > UINT32_MAX)
			length = uint64(UINT32_MAX) * info->block_size;
			// TODO: Break into smaller trim ranges!

		TRACE("trim %" B_PRIu64 " bytes from %" B_PRIu64 "\n", length, offset);

		// Compact the array as we go, so that skipped ranges do not leave
		// uninitialized entries behind.
		dsmRanges[dsmRangeCount].attributes = 0;
		dsmRanges[dsmRangeCount].length = length / info->block_size;
		dsmRanges[dsmRangeCount].starting_lba = offset / info->block_size;
		dsmRangeCount++;

		trimmingSize += length;
	}

	if (dsmRangeCount > 0) {
		status_t status = EINPROGRESS;
		qpair_info* qpair = get_qpair(info);
		if (nvme_ns_deallocate(info->ns, qpair->qpair, dsmRanges, dsmRangeCount,
				(nvme_cmd_cb)io_finished_callback, &status) != 0)
			return B_IO_ERROR;

		await_status(info, qpair->qpair, status);
		if (status != B_OK)
			return status;
	}

	trimData->trimmed_size = trimmingSize;
	return B_OK;
}


static status_t
nvme_disk_ioctl(void* cookie, uint32 op, void* buffer, size_t length)
{
	CALLED();
	nvme_disk_handle* handle = (nvme_disk_handle*)cookie;
	nvme_disk_driver_info* info = handle->info;

	TRACE("ioctl(op = %" B_PRId32 ")\n", op);

	switch (op) {
		case B_GET_MEDIA_STATUS:
		{
			return user_memcpy(buffer, &info->media_status, sizeof(status_t));
		}

		case B_GET_DEVICE_SIZE:
		{
			size_t size = info->capacity * info->block_size;
			return user_memcpy(buffer, &size, sizeof(size_t));
		}

		case B_GET_GEOMETRY:
		{
			if (buffer == NULL || length > sizeof(device_geometry))
				return B_BAD_VALUE;

			device_geometry geometry;
			status_t status = get_geometry(handle, &geometry);
			if (status != B_OK)
				return status;

			return user_memcpy(buffer, &geometry, length);
		}

		case B_GET_ICON_NAME:
			return user_strlcpy((char*)buffer, "devices/drive-harddisk",
				B_FILE_NAME_LENGTH);

		case B_GET_VECTOR_ICON:
		{
			device_icon iconData;
			if (length != sizeof(device_icon))
				return B_BAD_VALUE;
			if (user_memcpy(&iconData, buffer, sizeof(device_icon)) != B_OK)
				return B_BAD_ADDRESS;

			if (iconData.icon_size >= (int32)sizeof(kDriveIcon)) {
				if (user_memcpy(iconData.icon_data, kDriveIcon,
						sizeof(kDriveIcon)) != B_OK)
					return B_BAD_ADDRESS;
			}

			iconData.icon_size = sizeof(kDriveIcon);
			return user_memcpy(buffer, &iconData, sizeof(device_icon));
		}

		case B_FLUSH_DRIVE_CACHE:
			return nvme_disk_flush(info);

		case B_TRIM_DEVICE:
			ASSERT(IS_KERNEL_ADDRESS(buffer));
			return nvme_disk_trim(info, (fs_trim_data*)buffer);
	}

	return B_DEV_INVALID_IOCTL;
}


//	#pragma mark - driver module API


static float
nvme_disk_supports_device(device_node *parent)
{
	CALLED();

	const char* bus;
	uint16 baseClass, subClass;

	if (sDeviceManager->get_attr_string(parent, B_DEVICE_BUS, &bus, false) != B_OK
		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_TYPE, &baseClass, false) != B_OK
		|| sDeviceManager->get_attr_uint16(parent, B_DEVICE_SUB_TYPE, &subClass, false) != B_OK)
		return -1.0f;

	if (strcmp(bus, "pci") != 0 || baseClass != PCI_mass_storage)
		return 0.0f;

	if (subClass != PCI_nvm)
		return 0.0f;

	TRACE("NVMe device found!\n");
	return 1.0f;
}


static status_t
nvme_disk_register_device(device_node* parent)
{
	CALLED();

	device_attr attrs[] = {
		{ B_DEVICE_PRETTY_NAME, B_STRING_TYPE, { .string = "NVMe Disk" } },
		{ NULL }
	};

	return sDeviceManager->register_node(parent, NVME_DISK_DRIVER_MODULE_NAME,
		attrs, NULL, NULL);
}


static status_t
nvme_disk_init_driver(device_node* node, void** cookie)
{
	CALLED();

	int ret = nvme_lib_init((enum nvme_log_level)0, (enum nvme_log_facility)0, NULL);
	if (ret != 0) {
		TRACE_ERROR("libnvme initialization failed!\n");
		return ret;
	}

	nvme_disk_driver_info* info = new nvme_disk_driver_info;
	if (info == NULL)
		return B_NO_MEMORY;

	info->media_status = B_OK;
	info->node = node;

	info->ctrlr = NULL;

	*cookie = info;
	return B_OK;
}


static void
nvme_disk_uninit_driver(void* _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
	delete info;
}


static status_t
nvme_disk_register_child_devices(void* _cookie)
{
	CALLED();

	nvme_disk_driver_info* info = (nvme_disk_driver_info*)_cookie;
	status_t status;

	int32 id = sDeviceManager->create_id(NVME_DISK_DEVICE_ID_GENERATOR);
	if (id < 0)
		return id;

	char name[64];
	snprintf(name, sizeof(name), "disk/nvme/%" B_PRId32 "/raw",
		id);

	status = sDeviceManager->publish_device(info->node, name,
		NVME_DISK_DEVICE_MODULE_NAME);

	return status;
}


//	#pragma mark -


module_dependency module_dependencies[] = {
	{ B_DEVICE_MANAGER_MODULE_NAME, (module_info**)&sDeviceManager },
	{ NULL }
};

struct device_module_info sNvmeDiskDevice = {
	{
		NVME_DISK_DEVICE_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_init_device,
	nvme_disk_uninit_device,
	NULL, // remove,

	nvme_disk_open,
	nvme_disk_close,
	nvme_disk_free,
	nvme_disk_read,
	nvme_disk_write,
	nvme_disk_io,
	nvme_disk_ioctl,

	NULL,	// select
	NULL,	// deselect
};

struct driver_module_info sNvmeDiskDriver = {
	{
		NVME_DISK_DRIVER_MODULE_NAME,
		0,
		NULL
	},

	nvme_disk_supports_device,
	nvme_disk_register_device,
	nvme_disk_init_driver,
	nvme_disk_uninit_driver,
	nvme_disk_register_child_devices,
	NULL,	// rescan
	NULL,	// removed
};

module_info* modules[] = {
	(module_info*)&sNvmeDiskDriver,
	(module_info*)&sNvmeDiskDevice,
	NULL
};