// Low level NVMe disk access
//
// Copyright 2017 Amazon.com, Inc. or its affiliates.
//
// This file may be distributed under the terms of the GNU LGPLv3 license.

#include "blockcmd.h"
#include "fw/paravirt.h" // runningOnQEMU
#include "malloc.h" // malloc_high
#include "output.h" // dprintf
#include "pci.h"
#include "pci_ids.h" // PCI_CLASS_STORAGE_NVME
#include "pci_regs.h" // PCI_BASE_ADDRESS_0
#include "pcidevice.h" // foreachpci
#include "stacks.h" // yield
#include "std/disk.h" // DISK_RET_
#include "string.h" // memset
#include "util.h" // boot_add_hd
#include "x86.h" // readl

#include "nvme.h"
#include "nvme-int.h"

static void *
zalloc_page_aligned(struct zone_s *zone, u32 size)
{
    void *res = _malloc(zone, size, NVME_PAGE_SIZE);
    if (res) memset(res, 0, size);
    return res;
}

static void
nvme_init_queue_common(struct nvme_ctrl *ctrl, struct nvme_queue *q, u16 q_idx,
                       u16 length)
{
    memset(q, 0, sizeof(*q));
    q->dbl = (u32 *)((char *)ctrl->reg + 0x1000 + q_idx * ctrl->doorbell_stride);
    dprintf(3, " q %p q_idx %u dbl %p\n", q, q_idx, q->dbl);
    q->mask = length - 1;
}

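/* Doorbell layout, per the NVMe spec: doorbell registers start at controller
   register offset 0x1000 and are 4 << CAP.DSTRD bytes apart. For queue ID y,
   the submission queue tail doorbell has doorbell index 2*y and the
   completion queue head doorbell has index 2*y + 1. The q_idx parameter used
   throughout this driver is that doorbell index, so the NVMe queue ID is
   q_idx >> 1. */
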
static void
nvme_init_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, u16 length,
             struct nvme_cq *cq)
{
    nvme_init_queue_common(ctrl, &sq->common, q_idx, length);
    sq->sqe = zalloc_page_aligned(&ZoneHigh, sizeof(*sq->sqe) * length);
    dprintf(3, "sq %p q_idx %u sqe %p\n", sq, q_idx, sq->sqe);
    sq->cq = cq;
    sq->head = 0;
    sq->tail = 0;
}

static void
nvme_init_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx, u16 length)
{
    nvme_init_queue_common(ctrl, &cq->common, q_idx, length);
    cq->cqe = zalloc_page_aligned(&ZoneHigh, sizeof(*cq->cqe) * length);

    cq->head = 0;

    /* All CQE phase bits are initialized to zero. This means we initially
       wait for the controller to set these to 1. */
    cq->phase = 1;
}

static int
nvme_poll_cq(struct nvme_cq *cq)
{
    u32 dw3 = readl(&cq->cqe[cq->head].dword[3]);
    return (!!(dw3 & NVME_CQE_DW3_P) == cq->phase);
}

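/* Per the NVMe spec, bit 16 of completion queue entry dword 3 is the Phase
   Tag. The controller inverts it on every pass through the completion queue,
   so an entry whose phase matches the phase we currently expect is a new,
   valid completion; once our head index wraps we flip the expected phase. */
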
static int
nvme_is_cqe_success(struct nvme_cqe const *cqe)
{
    return (cqe->status & 0xFF) >> 1 == 0;
}

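/* Per the NVMe spec, the 16-byte completion entry carries the submission
   queue head pointer in the low half of dword 2, and dword 3 holds the
   command ID in bits 15:0, the phase tag in bit 16 and the status field in
   bits 31:17. A status code of zero means the command succeeded, which is
   what nvme_is_cqe_success() above boils down to testing. */
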

static struct nvme_cqe
nvme_error_cqe(void)
{
    struct nvme_cqe r;

    /* 0xFF is a vendor specific status code != success. Should be okay for
       indicating failure. */
    memset(&r, 0xFF, sizeof(r));
    return r;
}

static struct nvme_cqe
nvme_consume_cqe(struct nvme_sq *sq)
{
    struct nvme_cq *cq = sq->cq;

    if (!nvme_poll_cq(cq)) {
        /* Cannot consume a completion queue entry if there is none ready. */
        return nvme_error_cqe();
    }

    struct nvme_cqe *cqe = &cq->cqe[cq->head];
    u16 cq_next_head = (cq->head + 1) & cq->common.mask;
    dprintf(4, "cq %p head %u -> %u\n", cq, cq->head, cq_next_head);
    if (cq_next_head < cq->head) {
        dprintf(3, "cq %p wrap\n", cq);
        cq->phase = ~cq->phase;
    }
    cq->head = cq_next_head;

    /* Update the submission queue head. */
    if (cqe->sq_head != sq->head) {
        sq->head = cqe->sq_head;
        dprintf(4, "sq %p advanced to %u\n", sq, cqe->sq_head);
    }

    /* Tell the controller that we consumed the completion. */
    writel(cq->common.dbl, cq->head);

    return *cqe;
}

static struct nvme_cqe
nvme_wait(struct nvme_sq *sq)
{
    static const unsigned nvme_timeout = 500 /* ms */;
    u32 to = timer_calc(nvme_timeout);
    while (!nvme_poll_cq(sq->cq)) {
        yield();

        if (timer_check(to)) {
            warn_timeout();
            return nvme_error_cqe();
        }
    }

    return nvme_consume_cqe(sq);
}

/* Returns the next submission queue entry (or NULL if the queue is full). It
   also fills out Command Dword 0 and clears the rest. */
static struct nvme_sqe *
nvme_get_next_sqe(struct nvme_sq *sq, u8 opc, void *metadata, void *data)
{
    if (((sq->head + 1) & sq->common.mask) == sq->tail) {
        dprintf(3, "submission queue is full\n");
        return NULL;
    }

    struct nvme_sqe *sqe = &sq->sqe[sq->tail];
    dprintf(4, "sq %p next_sqe %u\n", sq, sq->tail);

    memset(sqe, 0, sizeof(*sqe));
    sqe->cdw0 = opc | (sq->tail << 16 /* CID */);
    sqe->mptr = (u32)metadata;
    sqe->dptr_prp1 = (u32)data;

    if (sqe->dptr_prp1 & (NVME_PAGE_SIZE - 1)) {
        /* Data buffer not page aligned. */
        warn_internalerror();
    }

    return sqe;
}

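/* Per the NVMe spec, Command Dword 0 holds the opcode in bits 7:0 and the
   command identifier (CID) in bits 31:16; reusing the slot index as CID is
   enough here because this driver issues and completes commands one at a
   time. Only PRP entry 1 is filled in, so a single command can address at
   most one page of data and the buffer must be page aligned, hence the check
   above. */
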
/* Call this after you've filled out an sqe that you've got from nvme_get_next_sqe. */
static void
nvme_commit_sqe(struct nvme_sq *sq)
{
    dprintf(4, "sq %p commit_sqe %u\n", sq, sq->tail);
    sq->tail = (sq->tail + 1) & sq->common.mask;
    writel(sq->common.dbl, sq->tail);
}

/* Perform an identify command on the admin queue and return the resulting
   buffer. This may be a NULL pointer if something failed. This function
   cannot be used after initialization, because it uses buffers in the tmp
   zone. */
static union nvme_identify *
nvme_admin_identify(struct nvme_ctrl *ctrl, u8 cns, u32 nsid)
{
    union nvme_identify *identify_buf = zalloc_page_aligned(&ZoneTmpHigh, 4096);
    if (!identify_buf) {
        /* Could not allocate identify buffer. */
        warn_internalerror();
        return NULL;
    }

    struct nvme_sqe *cmd_identify;
    cmd_identify = nvme_get_next_sqe(&ctrl->admin_sq,
                                     NVME_SQE_OPC_ADMIN_IDENTIFY, NULL,
                                     identify_buf);

    if (!cmd_identify) {
        warn_internalerror();
        goto error;
    }

    cmd_identify->nsid = nsid;
    cmd_identify->dword[10] = cns;

    nvme_commit_sqe(&ctrl->admin_sq);

    struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);

    if (!nvme_is_cqe_success(&cqe)) {
        goto error;
    }

    return identify_buf;
 error:
    free(identify_buf);
    return NULL;
}

static struct nvme_identify_ctrl *
nvme_admin_identify_ctrl(struct nvme_ctrl *ctrl)
{
    return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_CTRL, 0)->ctrl;
}

static struct nvme_identify_ns_list *
nvme_admin_identify_get_ns_list(struct nvme_ctrl *ctrl)
{
    return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_GET_NS_LIST,
                                0)->ns_list;
}

static struct nvme_identify_ns *
nvme_admin_identify_ns(struct nvme_ctrl *ctrl, u32 ns_id)
{
    return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_NS,
                                ns_id)->ns;
}

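/* The CNS value placed in Identify's CDW10 selects what the controller
   returns: per the NVMe spec, 00h identifies the namespace given by NSID,
   01h identifies the controller, and 02h returns the list of active
   namespace IDs. The three NVME_ADMIN_IDENTIFY_CNS_* constants used above
   correspond to these cases. */
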
static void
nvme_probe_ns(struct nvme_ctrl *ctrl, struct nvme_namespace *ns, u32 ns_id)
{
    ns->ctrl = ctrl;
    ns->ns_id = ns_id;

    struct nvme_identify_ns *id = nvme_admin_identify_ns(ctrl, ns_id);
    if (!id) {
        dprintf(2, "NVMe couldn't identify namespace %u.\n", ns_id);
        goto free_buffer;
    }

    u8 current_lba_format = id->flbas & 0xF;
    if (current_lba_format > id->nlbaf) {
        dprintf(2, "NVMe NS %u: current LBA format %u is beyond what the"
                " namespace supports (%u)?\n",
                ns_id, current_lba_format, id->nlbaf + 1);
        goto free_buffer;
    }

    ns->lba_count = id->nsze;

    struct nvme_lba_format *fmt = &id->lbaf[current_lba_format];

    ns->block_size = 1U << fmt->lbads;
    ns->metadata_size = fmt->ms;

    if (ns->block_size > NVME_PAGE_SIZE) {
        /* If we see devices that trigger this path, we need to increase our
           buffer size. */
        warn_internalerror();
        goto free_buffer;
    }

    ns->drive.cntl_id = ns - ctrl->ns;
    ns->drive.removable = 0;
    ns->drive.type = DTYPE_NVME;
    ns->drive.blksize = ns->block_size;
    ns->drive.sectors = ns->lba_count;

    ns->dma_buffer = zalloc_page_aligned(&ZoneHigh, NVME_PAGE_SIZE);

    char *desc = znprintf(MAXDESCSIZE, "NVMe NS %u: %llu MiB (%llu %u-byte "
                          "blocks + %u-byte metadata)\n",
                          ns_id, (ns->lba_count * ns->block_size) >> 20,
                          ns->lba_count, ns->block_size, ns->metadata_size);

    dprintf(3, "%s", desc);
    boot_add_hd(&ns->drive, desc, bootprio_find_pci_device(ctrl->pci));

 free_buffer:
    free(id);
}

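/* Per the NVMe spec, FLBAS bits 3:0 select the LBA format currently in use,
   NLBAF is the 0-based count of formats the namespace supports, and each LBA
   format descriptor gives LBADS (log2 of the data block size) plus the
   per-block metadata size. The single-page dma_buffer allocated above is why
   block sizes larger than NVME_PAGE_SIZE are rejected. */
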
/* Returns 0 on success. */
static int
nvme_create_io_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx)
{
    struct nvme_sqe *cmd_create_cq;

    nvme_init_cq(ctrl, cq, q_idx, NVME_PAGE_SIZE / sizeof(struct nvme_cqe));
    cmd_create_cq = nvme_get_next_sqe(&ctrl->admin_sq,
                                      NVME_SQE_OPC_ADMIN_CREATE_IO_CQ, NULL,
                                      cq->cqe);
    if (!cmd_create_cq) {
        return -1;
    }

    cmd_create_cq->dword[10] = (cq->common.mask << 16) | (q_idx >> 1);
    cmd_create_cq->dword[11] = 1 /* physically contiguous */;

    nvme_commit_sqe(&ctrl->admin_sq);

    struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);

    if (!nvme_is_cqe_success(&cqe)) {
        dprintf(2, "create io cq failed: %08x %08x %08x %08x\n",
                cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);

        return -1;
    }

    return 0;
}

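/* Create I/O Completion Queue, per the NVMe spec: CDW10 carries the queue
   size (0-based, hence the mask) in bits 31:16 and the queue ID in bits
   15:0, while CDW11 bit 0 marks the queue as physically contiguous.
   Interrupts stay disabled, which suits this polled driver. */
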
/* Returns 0 on success. */
static int
nvme_create_io_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, struct nvme_cq *cq)
{
    struct nvme_sqe *cmd_create_sq;

    nvme_init_sq(ctrl, sq, q_idx, NVME_PAGE_SIZE / sizeof(struct nvme_cqe), cq);
    cmd_create_sq = nvme_get_next_sqe(&ctrl->admin_sq,
                                      NVME_SQE_OPC_ADMIN_CREATE_IO_SQ, NULL,
                                      sq->sqe);
    if (!cmd_create_sq) {
        return -1;
    }

    cmd_create_sq->dword[10] = (sq->common.mask << 16) | (q_idx >> 1);
    cmd_create_sq->dword[11] = (q_idx >> 1) << 16 | 1 /* contiguous */;
    dprintf(3, "sq %p create dword10 %08x dword11 %08x\n", sq,
            cmd_create_sq->dword[10], cmd_create_sq->dword[11]);

    nvme_commit_sqe(&ctrl->admin_sq);

    struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);

    if (!nvme_is_cqe_success(&cqe)) {
        dprintf(2, "create io sq failed: %08x %08x %08x %08x\n",
                cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);
        return -1;
    }

    return 0;
}

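/* Create I/O Submission Queue, per the NVMe spec: CDW10 has the same
   size/queue-ID layout as for the completion queue, and CDW11 names the
   completion queue this submission queue posts to in bits 31:16, with bit 0
   again meaning physically contiguous. The target completion queue must
   already exist when this command is issued. */
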
/* Reads or writes count sectors to/from buf. Returns DISK_RET_*. The buffer
   cannot cross page boundaries. */
static int
nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count,
                  int write)
{
    u32 buf_addr = (u32)buf;

    if ((buf_addr & 0x3) ||
        ((buf_addr & ~(NVME_PAGE_SIZE - 1)) !=
         ((buf_addr + ns->block_size * count - 1) & ~(NVME_PAGE_SIZE - 1)))) {
        /* Buffer is misaligned or crosses page boundary */
        warn_internalerror();
        return DISK_RET_EBADTRACK;
    }

    struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq,
                                                 write ? NVME_SQE_OPC_IO_WRITE
                                                       : NVME_SQE_OPC_IO_READ,
                                                 NULL, buf);
    io_read->nsid = ns->ns_id;
    io_read->dword[10] = (u32)lba;
    io_read->dword[11] = (u32)(lba >> 32);
    io_read->dword[12] = (1U << 31 /* limited retry */) | (count - 1);

    nvme_commit_sqe(&ns->ctrl->io_sq);

    struct nvme_cqe cqe = nvme_wait(&ns->ctrl->io_sq);

    if (!nvme_is_cqe_success(&cqe)) {
        dprintf(2, "read io: %08x %08x %08x %08x\n",
                cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);

        return DISK_RET_EBADTRACK;
    }

    return DISK_RET_SUCCESS;
}

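/* NVM command set read/write, per the NVMe spec: CDW10 and CDW11 hold the
   64-bit starting LBA, and CDW12 carries the number of logical blocks minus
   one in bits 15:0, with bit 31 requesting limited retry. Since only PRP1 is
   filled in by nvme_get_next_sqe, a single command never moves more than one
   page of data; the page-crossing check above and the chunking in
   nvme_cmd_readwrite keep requests within that limit. */
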

static int
nvme_create_io_queues(struct nvme_ctrl *ctrl)
{
    if (nvme_create_io_cq(ctrl, &ctrl->io_cq, 3))
        return -1;

    if (nvme_create_io_sq(ctrl, &ctrl->io_sq, 2, &ctrl->io_cq))
        return -1;

    return 0;
}

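/* Doorbell indices 3 and 2 both map to NVMe queue ID 1 (see the doorbell
   layout note near nvme_init_queue_common); the completion queue is created
   first because the submission queue has to reference an existing CQ. */
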
/* Waits for CSTS.RDY to match rdy. Returns 0 on success. */
static int
nvme_wait_csts_rdy(struct nvme_ctrl *ctrl, unsigned rdy)
{
    u32 const max_to = 500 /* ms */ * ((ctrl->reg->cap >> 24) & 0xFFU);
    u32 to = timer_calc(max_to);
    u32 csts;

    while (rdy != ((csts = ctrl->reg->csts) & NVME_CSTS_RDY)) {
        yield();

        if (csts & NVME_CSTS_FATAL) {
            dprintf(3, "NVMe fatal error during controller shutdown\n");
            return -1;
        }

        if (timer_check(to)) {
            warn_timeout();
            return -1;
        }
    }

    return 0;
}

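/* Register fields this driver relies on, per the NVMe spec: CAP.TO (bits
   31:24) is the worst-case ready transition time in 500 ms units, and
   CAP.DSTRD (bits 35:32) gives the doorbell stride. In CC, bit 0 is EN,
   while IOSQES and IOCQES (bits 19:16 and 23:20) carry the log2 of the
   submission and completion entry sizes. AQA holds the 0-based admin
   submission and completion queue sizes in bits 11:0 and 27:16, and ASQ/ACQ
   take the page-aligned base addresses of the admin queues. */
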
/* Returns 0 on success. */
static int
nvme_controller_enable(struct nvme_ctrl *ctrl)
{
    pci_enable_busmaster(ctrl->pci);

    /* Turn the controller off. */
    ctrl->reg->cc = 0;
    if (nvme_wait_csts_rdy(ctrl, 0)) {
        dprintf(2, "NVMe fatal error during controller shutdown\n");
        return -1;
    }

    ctrl->doorbell_stride = 4U << ((ctrl->reg->cap >> 32) & 0xF);

    nvme_init_cq(ctrl, &ctrl->admin_cq, 1,
                 NVME_PAGE_SIZE / sizeof(struct nvme_cqe));

    nvme_init_sq(ctrl, &ctrl->admin_sq, 0,
                 NVME_PAGE_SIZE / sizeof(struct nvme_sqe), &ctrl->admin_cq);

    ctrl->reg->aqa = ctrl->admin_cq.common.mask << 16
        | ctrl->admin_sq.common.mask;

    /* Create the admin queue pair */
    if (!ctrl->admin_sq.sqe || !ctrl->admin_cq.cqe) goto out_of_memory;

    ctrl->reg->asq = (u32)ctrl->admin_sq.sqe;
    ctrl->reg->acq = (u32)ctrl->admin_cq.cqe;

    dprintf(3, " admin submission queue: %p\n", ctrl->admin_sq.sqe);
    dprintf(3, " admin completion queue: %p\n", ctrl->admin_cq.cqe);

    ctrl->reg->cc = NVME_CC_EN | (NVME_CQE_SIZE_LOG << 20)
        | (NVME_SQE_SIZE_LOG << 16 /* IOSQES */);

    if (nvme_wait_csts_rdy(ctrl, 1)) {
        dprintf(2, "NVMe fatal error while enabling controller\n");
        goto failed;
    }

    /* The admin queue is set up and the controller is ready. Let's figure out
       what namespaces we have. */

    struct nvme_identify_ctrl *identify = nvme_admin_identify_ctrl(ctrl);

    if (!identify) {
        dprintf(2, "NVMe couldn't identify controller.\n");
        goto failed;
    }

    /* TODO Print model/serial info. */
    dprintf(3, "NVMe has %u namespace%s.\n",
            identify->nn, (identify->nn == 1) ? "" : "s");

    ctrl->ns_count = identify->nn;
    free(identify);

    if ((ctrl->ns_count == 0) || nvme_create_io_queues(ctrl)) {
        /* There is no point in continuing if the controller says it has no
           namespaces or we couldn't create I/O queues. */
        goto failed;
    }

    ctrl->ns = malloc_fseg(sizeof(*ctrl->ns) * ctrl->ns_count);
    if (!ctrl->ns) goto out_of_memory;
    memset(ctrl->ns, 0, sizeof(*ctrl->ns) * ctrl->ns_count);

    struct nvme_identify_ns_list *ns_list = nvme_admin_identify_get_ns_list(ctrl);
    if (!ns_list) {
        dprintf(2, "NVMe couldn't get namespace list.\n");
        goto failed;
    }

    /* Populate namespace IDs */
    int ns_idx;
    for (ns_idx = 0;
         ns_idx < ARRAY_SIZE(ns_list->ns_id)
             && ns_idx < ctrl->ns_count
             && ns_list->ns_id[ns_idx];
         ns_idx++) {
        nvme_probe_ns(ctrl, &ctrl->ns[ns_idx], ns_list->ns_id[ns_idx]);
    }

    free(ns_list);

    /* If for some reason the namespace list gives us fewer namespaces, we
       just go along with it. */
    if (ns_idx != ctrl->ns_count) {
        dprintf(2, "NVMe namespace list has only %u namespaces?\n", ns_idx);
        ctrl->ns_count = ns_idx;
    }

    dprintf(3, "NVMe initialization complete!\n");
    return 0;

 out_of_memory:
    warn_noalloc();
 failed:
    free(ctrl->admin_sq.sqe);
    free(ctrl->admin_cq.cqe);
    free(ctrl->ns);
    return -1;
}

/* Initialize an NVMe controller and detect its drives. */
static void
nvme_controller_setup(void *opaque)
{
    struct pci_device *pci = opaque;

    struct nvme_reg volatile *reg = pci_enable_membar(pci, PCI_BASE_ADDRESS_0);
    if (!reg)
        return;

    u32 version = reg->vs;
    dprintf(3, "Found NVMe controller with version %u.%u.%u.\n",
            version >> 16, (version >> 8) & 0xFF, version & 0xFF);
    dprintf(3, " Capabilities %016llx\n", reg->cap);

    if (version < 0x00010100U) {
        dprintf(3, "Need at least 1.1.0! Skipping.\n");
        return;
    }

    if (~reg->cap & NVME_CAP_CSS_NVME) {
        dprintf(3, "Controller doesn't speak NVMe command set. Skipping.\n");
        return;
    }

    struct nvme_ctrl *ctrl = malloc_high(sizeof(*ctrl));
    if (!ctrl) {
        warn_noalloc();
        return;
    }

    memset(ctrl, 0, sizeof(*ctrl));

    ctrl->reg = reg;
    ctrl->pci = pci;

    if (nvme_controller_enable(ctrl)) {
        /* Initialization failed */
        free(ctrl);
    }
}

// Locate and init NVMe controllers
static void
nvme_scan(void)
{
    // Scan PCI bus for NVMe adapters
    struct pci_device *pci;

    foreachpci(pci) {
        if (pci->class != PCI_CLASS_STORAGE_NVME)
            continue;
        if (pci->prog_if != 2 /* as of NVM 1.0e */) {
            dprintf(3, "Found incompatible NVMe: prog-if=%02x\n", pci->prog_if);
            continue;
        }

        run_thread(nvme_controller_setup, pci);
    }
}

static int
nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write)
{
    int res = DISK_RET_SUCCESS;
    u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size;
    u16 i;

    for (i = 0; i < op->count && res == DISK_RET_SUCCESS;) {
        u16 blocks_remaining = op->count - i;
        u16 blocks = blocks_remaining < max_blocks ? blocks_remaining
                                                   : max_blocks;
        char *op_buf = op->buf_fl + i * ns->block_size;

        if (write) {
            memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size);
        }

        res = nvme_io_readwrite(ns, op->lba + i, ns->dma_buffer, blocks, write);
        dprintf(3, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write"
                                                                  : "read",
                op->lba + i, blocks, res);

        if (!write && res == DISK_RET_SUCCESS) {
            memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size);
        }

        i += blocks;
    }

    return res;
}

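/* The guest buffer (op->buf_fl) may be misaligned or cross page boundaries,
   so each chunk of at most one page worth of blocks is bounced through the
   namespace's page-aligned dma_buffer before or after the actual NVMe
   transfer; on any error the loop above stops and the error is returned. */
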
int
nvme_process_op(struct disk_op_s *op)
{
    if (!CONFIG_NVME || !runningOnQEMU())
        return DISK_RET_SUCCESS;

    struct nvme_namespace *ns = container_of(op->drive_gf, struct nvme_namespace,
                                             drive);

    switch (op->command) {
    case CMD_READ:
    case CMD_WRITE:
        return nvme_cmd_readwrite(ns, op, op->command == CMD_WRITE);
    default:
        return default_process_op(op);
    }
}

void
nvme_setup(void)
{
    ASSERT32FLAT();
    if (!CONFIG_NVME || !runningOnQEMU())
        return;

    dprintf(3, "init nvme\n");
    nvme_scan();
}

/* EOF */