// Low level NVMe disk access
//
// Copyright 2017 Amazon.com, Inc. or its affiliates.
//
// This file may be distributed under the terms of the GNU LGPLv3 license.

#include "blockcmd.h"
#include "malloc.h" // malloc_high
#include "output.h" // dprintf
#include "pci.h"
#include "pci_ids.h" // PCI_CLASS_STORAGE_NVME
#include "pci_regs.h" // PCI_BASE_ADDRESS_0
#include "pcidevice.h" // foreachpci
#include "stacks.h" // yield
#include "std/disk.h" // DISK_RET_
#include "string.h" // memset
#include "util.h" // boot_add_hd
#include "x86.h" // readl

#include "nvme.h"
#include "nvme-int.h"

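/* Allocate a zeroed buffer from the given zone, aligned to NVME_PAGE_SIZE. */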
static void *
zalloc_page_aligned(struct zone_s *zone, u32 size)
{
    void *res = _malloc(zone, size, NVME_PAGE_SIZE);
    if (res) memset(res, 0, size);
    return res;
}

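/* Set up the state shared by submission and completion queues. Doorbell
   registers start at offset 0x1000 into the register space and are
   doorbell_stride bytes apart; length must be a power of two so that the
   index mask works. */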
static void
nvme_init_queue_common(struct nvme_ctrl *ctrl, struct nvme_queue *q, u16 q_idx,
                       u16 length)
{
    memset(q, 0, sizeof(*q));
    q->dbl = (u32 *)((char *)ctrl->reg + 0x1000 + q_idx * ctrl->doorbell_stride);
    dprintf(3, " q %p q_idx %u dbl %p\n", q, q_idx, q->dbl);
    q->mask = length - 1;
}

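/* Initialize a submission queue and allocate its entry array. Commands
   submitted here complete on the paired completion queue cq. */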
static void
nvme_init_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, u16 length,
             struct nvme_cq *cq)
{
    nvme_init_queue_common(ctrl, &sq->common, q_idx, length);
    sq->sqe = zalloc_page_aligned(&ZoneHigh, sizeof(*sq->sqe) * length);
    dprintf(3, "sq %p q_idx %u sqe %p\n", sq, q_idx, sq->sqe);
    sq->cq = cq;
    sq->head = 0;
    sq->tail = 0;
}

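/* Initialize a completion queue and allocate its entry array. */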
static void
nvme_init_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx, u16 length)
{
    nvme_init_queue_common(ctrl, &cq->common, q_idx, length);
    cq->cqe = zalloc_page_aligned(&ZoneHigh, sizeof(*cq->cqe) * length);

    cq->head = 0;

    /* All CQE phase bits are initialized to zero. This means initially we wait
       for the host controller to set these to 1. */
    cq->phase = 1;
}

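/* Returns true if the CQE at the queue head has been posted by the
   controller, i.e. its phase bit matches the phase we currently expect. */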
static int
nvme_poll_cq(struct nvme_cq *cq)
{
    u32 dw3 = readl(&cq->cqe[cq->head].dword[3]);
    return (!!(dw3 & NVME_CQE_DW3_P) == cq->phase);
}

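/* The Status Code (SC) field occupies bits 8:1 of the completion status
   word; zero means the command completed successfully. */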
static int
nvme_is_cqe_success(struct nvme_cqe const *cqe)
{
    return ((cqe->status >> 1) & 0xFF) == 0;
}


static struct nvme_cqe
nvme_error_cqe(void)
{
    struct nvme_cqe r;

    /* 0xFF is a vendor specific status code != success. Should be okay for
       indicating failure. */
    memset(&r, 0xFF, sizeof(r));
    return r;
}

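/* Pop the completion at the CQ head: advance the head (flipping the
   expected phase on wrap-around), sync our copy of the submission queue
   head from the CQE, and ring the completion doorbell. */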
static struct nvme_cqe
nvme_consume_cqe(struct nvme_sq *sq)
{
    struct nvme_cq *cq = sq->cq;

    if (!nvme_poll_cq(cq)) {
        /* Cannot consume a completion queue entry if there is none ready. */
        return nvme_error_cqe();
    }

    struct nvme_cqe *cqe = &cq->cqe[cq->head];
    u16 cq_next_head = (cq->head + 1) & cq->common.mask;
    dprintf(4, "cq %p head %u -> %u\n", cq, cq->head, cq_next_head);
    if (cq_next_head < cq->head) {
        dprintf(3, "cq %p wrap\n", cq);
        cq->phase = ~cq->phase;
    }
    cq->head = cq_next_head;

    /* Update the submission queue head. */
    if (cqe->sq_head != sq->head) {
        sq->head = cqe->sq_head;
        dprintf(4, "sq %p advanced to %u\n", sq, cqe->sq_head);
    }

    /* Tell the controller that we consumed the completion. */
    writel(cq->common.dbl, cq->head);

    return *cqe;
}

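/* Wait for the next completion on sq's completion queue, yielding to other
   threads while polling. Returns a synthetic error CQE if the timeout
   expires. */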
static struct nvme_cqe
nvme_wait(struct nvme_sq *sq)
{
    static const unsigned nvme_timeout = 5000 /* ms */;
    u32 to = timer_calc(nvme_timeout);
    while (!nvme_poll_cq(sq->cq)) {
        yield();

        if (timer_check(to)) {
            warn_timeout();
            return nvme_error_cqe();
        }
    }

    return nvme_consume_cqe(sq);
}

/* Returns the next submission queue entry (or NULL if the queue is full). It
   also fills out Command Dword 0 and clears the rest. */
static struct nvme_sqe *
nvme_get_next_sqe(struct nvme_sq *sq, u8 opc, void *metadata, void *data)
{
    if (((sq->head + 1) & sq->common.mask) == sq->tail) {
        dprintf(3, "submission queue is full\n");
        return NULL;
    }

    struct nvme_sqe *sqe = &sq->sqe[sq->tail];
    dprintf(4, "sq %p next_sqe %u\n", sq, sq->tail);

    memset(sqe, 0, sizeof(*sqe));
    sqe->cdw0 = opc | (sq->tail << 16 /* CID */);
    sqe->mptr = (u32)metadata;
    sqe->dptr_prp1 = (u32)data;

    if (sqe->dptr_prp1 & (NVME_PAGE_SIZE - 1)) {
        /* Data buffer not page aligned. */
        warn_internalerror();
    }

    return sqe;
}

/* Call this after you've filled out an sqe that you've got from nvme_get_next_sqe. */
static void
nvme_commit_sqe(struct nvme_sq *sq)
{
    dprintf(4, "sq %p commit_sqe %u\n", sq, sq->tail);
    sq->tail = (sq->tail + 1) & sq->common.mask;
    writel(sq->common.dbl, sq->tail);
}

/* Perform an identify command on the admin queue and return the resulting
   buffer. This may be a NULL pointer if something failed. This function
   cannot be used after initialization, because it uses buffers in the tmp
   zone. */
static union nvme_identify *
nvme_admin_identify(struct nvme_ctrl *ctrl, u8 cns, u32 nsid)
{
    union nvme_identify *identify_buf = zalloc_page_aligned(&ZoneTmpHigh, 4096);
    if (!identify_buf) {
        /* Could not allocate identify buffer. */
        warn_internalerror();
        return NULL;
    }

    struct nvme_sqe *cmd_identify;
    cmd_identify = nvme_get_next_sqe(&ctrl->admin_sq,
                                     NVME_SQE_OPC_ADMIN_IDENTIFY, NULL,
                                     identify_buf);

    if (!cmd_identify) {
        warn_internalerror();
        goto error;
    }

    cmd_identify->nsid = nsid;
    cmd_identify->dword[10] = cns;

    nvme_commit_sqe(&ctrl->admin_sq);

    struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);

    if (!nvme_is_cqe_success(&cqe)) {
        goto error;
    }

    return identify_buf;
 error:
    free(identify_buf);
    return NULL;
}

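/* Convenience wrappers for the controller and namespace variants of the
   Identify command (CNS selects which data structure is returned). A NULL
   result from nvme_admin_identify propagates through, since the union
   members are at offset zero. */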
static struct nvme_identify_ctrl *
nvme_admin_identify_ctrl(struct nvme_ctrl *ctrl)
{
    return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_CTRL, 0)->ctrl;
}

static struct nvme_identify_ns *
nvme_admin_identify_ns(struct nvme_ctrl *ctrl, u32 ns_id)
{
    return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_NS,
                                ns_id)->ns;
}

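/* Identify a namespace and, if it is active and its LBA format is usable,
   register it as a boot drive. */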
static void
nvme_probe_ns(struct nvme_ctrl *ctrl, struct nvme_namespace *ns, u32 ns_id)
{
    ns->ctrl = ctrl;
    ns->ns_id = ns_id;

    struct nvme_identify_ns *id = nvme_admin_identify_ns(ctrl, ns_id);
    if (!id) {
        dprintf(2, "NVMe couldn't identify namespace %u.\n", ns_id);
        goto free_buffer;
    }

    u8 current_lba_format = id->flbas & 0xF;
    if (current_lba_format > id->nlbaf) {
        dprintf(2, "NVMe NS %u: current LBA format %u is beyond what the "
                "namespace supports (%u)?\n",
                ns_id, current_lba_format, id->nlbaf + 1);
        goto free_buffer;
    }

    ns->lba_count = id->nsze;
    if (!ns->lba_count) {
        dprintf(2, "NVMe NS %u is inactive.\n", ns_id);
        goto free_buffer;
    }

    struct nvme_lba_format *fmt = &id->lbaf[current_lba_format];

    ns->block_size = 1U << fmt->lbads;
    ns->metadata_size = fmt->ms;

    if (ns->block_size > NVME_PAGE_SIZE) {
        /* If we see devices that trigger this path, we need to increase our
           buffer size. */
        warn_internalerror();
        goto free_buffer;
    }

    ns->drive.cntl_id = ns - ctrl->ns;
    ns->drive.removable = 0;
    ns->drive.type = DTYPE_NVME;
    ns->drive.blksize = ns->block_size;
    ns->drive.sectors = ns->lba_count;

    ns->dma_buffer = zalloc_page_aligned(&ZoneHigh, NVME_PAGE_SIZE);

    char *desc = znprintf(MAXDESCSIZE, "NVMe NS %u: %llu MiB (%llu %u-byte "
                          "blocks + %u-byte metadata)\n",
                          ns_id, (ns->lba_count * ns->block_size) >> 20,
                          ns->lba_count, ns->block_size, ns->metadata_size);

    dprintf(3, "%s", desc);
    boot_add_hd(&ns->drive, desc, bootprio_find_pci_device(ctrl->pci));

 free_buffer:
    free(id);
}

/* Returns 0 on success. */
static int
nvme_create_io_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx)
{
    struct nvme_sqe *cmd_create_cq;

    nvme_init_cq(ctrl, cq, q_idx, NVME_PAGE_SIZE / sizeof(struct nvme_cqe));
    cmd_create_cq = nvme_get_next_sqe(&ctrl->admin_sq,
                                      NVME_SQE_OPC_ADMIN_CREATE_IO_CQ, NULL,
                                      cq->cqe);
    if (!cmd_create_cq) {
        return -1;
    }

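    /* CDW10: queue size (0-based) in the high word, queue ID in the low
       word. Doorbell index q_idx corresponds to queue ID q_idx / 2, since
       each queue pair occupies two doorbell slots. CDW11 bit 0 requests a
       physically contiguous queue. */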
    cmd_create_cq->dword[10] = (cq->common.mask << 16) | (q_idx >> 1);
    cmd_create_cq->dword[11] = 1 /* physically contiguous */;

    nvme_commit_sqe(&ctrl->admin_sq);

    struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);

    if (!nvme_is_cqe_success(&cqe)) {
        dprintf(2, "create io cq failed: %08x %08x %08x %08x\n",
                cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);

        return -1;
    }

    return 0;
}

/* Returns 0 on success. */
static int
nvme_create_io_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, struct nvme_cq *cq)
{
    struct nvme_sqe *cmd_create_sq;

    nvme_init_sq(ctrl, sq, q_idx, NVME_PAGE_SIZE / sizeof(struct nvme_cqe), cq);
    cmd_create_sq = nvme_get_next_sqe(&ctrl->admin_sq,
                                      NVME_SQE_OPC_ADMIN_CREATE_IO_SQ, NULL,
                                      sq->sqe);
    if (!cmd_create_sq) {
        return -1;
    }

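    /* CDW10 as for CQ creation: queue size and queue ID. CDW11: the ID of
       the completion queue to attach in the high word, bit 0 = physically
       contiguous. */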
    cmd_create_sq->dword[10] = (sq->common.mask << 16) | (q_idx >> 1);
    cmd_create_sq->dword[11] = (q_idx >> 1) << 16 | 1 /* contiguous */;
    dprintf(3, "sq %p create dword10 %08x dword11 %08x\n", sq,
            cmd_create_sq->dword[10], cmd_create_sq->dword[11]);

    nvme_commit_sqe(&ctrl->admin_sq);

    struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);

    if (!nvme_is_cqe_success(&cqe)) {
        dprintf(2, "create io sq failed: %08x %08x %08x %08x\n",
                cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);
        return -1;
    }

    return 0;
}

/* Reads or writes count sectors from/to buf. Returns DISK_RET_*. The buffer
   cannot cross page boundaries. */
static int
nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count,
                  int write)
{
    u32 buf_addr = (u32)buf;

    if ((buf_addr & 0x3) ||
        ((buf_addr & ~(NVME_PAGE_SIZE - 1)) !=
         ((buf_addr + ns->block_size * count - 1) & ~(NVME_PAGE_SIZE - 1)))) {
        /* Buffer is misaligned or crosses page boundary */
        warn_internalerror();
        return DISK_RET_EBADTRACK;
    }

    struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq,
                                                 write ? NVME_SQE_OPC_IO_WRITE
                                                       : NVME_SQE_OPC_IO_READ,
                                                 NULL, buf);
    io_read->nsid = ns->ns_id;
    io_read->dword[10] = (u32)lba;
    io_read->dword[11] = (u32)(lba >> 32);
    io_read->dword[12] = (1U << 31 /* limited retry */) | (count - 1);

    nvme_commit_sqe(&ns->ctrl->io_sq);

    struct nvme_cqe cqe = nvme_wait(&ns->ctrl->io_sq);

    if (!nvme_is_cqe_success(&cqe)) {
        dprintf(2, "read io: %08x %08x %08x %08x\n",
                cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);

        return DISK_RET_EBADTRACK;
    }

    return DISK_RET_SUCCESS;
}


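/* Create the I/O queue pair (queue ID 1). Doorbell indices 2 and 3 are that
   queue's submission and completion doorbells, respectively. */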
static int
nvme_create_io_queues(struct nvme_ctrl *ctrl)
{
    if (nvme_create_io_cq(ctrl, &ctrl->io_cq, 3))
        return -1;

    if (nvme_create_io_sq(ctrl, &ctrl->io_sq, 2, &ctrl->io_cq))
        return -1;

    return 0;
}

/* Waits for CSTS.RDY to match rdy. Returns 0 on success. */
static int
nvme_wait_csts_rdy(struct nvme_ctrl *ctrl, unsigned rdy)
{
    u32 const max_to = 500 /* ms */ * ((ctrl->reg->cap >> 24) & 0xFFU);
    u32 to = timer_calc(max_to);
    u32 csts;

    while (rdy != ((csts = ctrl->reg->csts) & NVME_CSTS_RDY)) {
        yield();

        if (csts & NVME_CSTS_FATAL) {
            dprintf(3, "NVMe fatal error during controller shutdown\n");
            return -1;
        }

        if (timer_check(to)) {
            warn_timeout();
            return -1;
        }
    }

    return 0;
}

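/* Bring the controller out of reset, set up the admin queue pair and the
   I/O queue pair, identify the controller, and register all namespaces
   that are found. */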
/* Returns 0 on success. */
static int
nvme_controller_enable(struct nvme_ctrl *ctrl)
{
    pci_enable_busmaster(ctrl->pci);

    /* Turn the controller off. */
    ctrl->reg->cc = 0;
    if (nvme_wait_csts_rdy(ctrl, 0)) {
        dprintf(2, "NVMe fatal error during controller shutdown\n");
        return -1;
    }

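    /* CAP.DSTRD: the doorbell stride is (4 << DSTRD) bytes. */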
    ctrl->doorbell_stride = 4U << ((ctrl->reg->cap >> 32) & 0xF);

    nvme_init_cq(ctrl, &ctrl->admin_cq, 1,
                 NVME_PAGE_SIZE / sizeof(struct nvme_cqe));

    nvme_init_sq(ctrl, &ctrl->admin_sq, 0,
                 NVME_PAGE_SIZE / sizeof(struct nvme_sqe), &ctrl->admin_cq);

    ctrl->reg->aqa = ctrl->admin_cq.common.mask << 16
                     | ctrl->admin_sq.common.mask;

    /* Create the admin queue pair */
    if (!ctrl->admin_sq.sqe || !ctrl->admin_cq.cqe) goto out_of_memory;

    ctrl->reg->asq = (u32)ctrl->admin_sq.sqe;
    ctrl->reg->acq = (u32)ctrl->admin_cq.cqe;

    dprintf(3, " admin submission queue: %p\n", ctrl->admin_sq.sqe);
    dprintf(3, " admin completion queue: %p\n", ctrl->admin_cq.cqe);

    ctrl->reg->cc = NVME_CC_EN | (NVME_CQE_SIZE_LOG << 20)
        | (NVME_SQE_SIZE_LOG << 16 /* IOSQES */);

    if (nvme_wait_csts_rdy(ctrl, 1)) {
        dprintf(2, "NVMe fatal error while enabling controller\n");
        goto failed;
    }
    /* The admin queue is set up and the controller is ready. Let's figure out
       what namespaces we have. */

    struct nvme_identify_ctrl *identify = nvme_admin_identify_ctrl(ctrl);

    if (!identify) {
        dprintf(2, "NVMe couldn't identify controller.\n");
        goto failed;
    }

    /* TODO Print model/serial info. */
    dprintf(3, "NVMe has %u namespace%s.\n",
            identify->nn, (identify->nn == 1) ? "" : "s");

    ctrl->ns_count = identify->nn;
    free(identify);

    if ((ctrl->ns_count == 0) || nvme_create_io_queues(ctrl)) {
        /* No point in continuing if the controller says it doesn't have
           namespaces or we couldn't create I/O queues. */
        goto failed;
    }

    ctrl->ns = malloc_fseg(sizeof(*ctrl->ns) * ctrl->ns_count);
    if (!ctrl->ns) goto out_of_memory;
    memset(ctrl->ns, 0, sizeof(*ctrl->ns) * ctrl->ns_count);

    /* Populate namespace IDs */
    int ns_idx;
    for (ns_idx = 0; ns_idx < ctrl->ns_count; ns_idx++) {
        nvme_probe_ns(ctrl, &ctrl->ns[ns_idx], ns_idx + 1);
    }

    dprintf(3, "NVMe initialization complete!\n");
    return 0;

 out_of_memory:
    warn_noalloc();
 failed:
    free(ctrl->admin_sq.sqe);
    free(ctrl->admin_cq.cqe);
    free(ctrl->ns);
    return -1;
}

/* Initialize an NVMe controller and detect its drives. */
static void
nvme_controller_setup(void *opaque)
{
    struct pci_device *pci = opaque;

    struct nvme_reg volatile *reg = pci_enable_membar(pci, PCI_BASE_ADDRESS_0);
    if (!reg)
        return;

    u32 version = reg->vs;
    dprintf(3, "Found NVMe controller with version %u.%u.%u.\n",
            version >> 16, (version >> 8) & 0xFF, version & 0xFF);
    dprintf(3, " Capabilities %016llx\n", reg->cap);

    if (~reg->cap & NVME_CAP_CSS_NVME) {
        dprintf(3, "Controller doesn't speak NVMe command set. Skipping.\n");
        return;
    }

    struct nvme_ctrl *ctrl = malloc_high(sizeof(*ctrl));
    if (!ctrl) {
        warn_noalloc();
        return;
    }

    memset(ctrl, 0, sizeof(*ctrl));

    ctrl->reg = reg;
    ctrl->pci = pci;

    if (nvme_controller_enable(ctrl)) {
        /* Initialization failed */
        free(ctrl);
    }
}

// Locate and init NVMe controllers
static void
nvme_scan(void)
{
    // Scan PCI bus for NVMe adapters
    struct pci_device *pci;

    foreachpci(pci) {
        if (pci->class != PCI_CLASS_STORAGE_NVME)
            continue;
        if (pci->prog_if != 2 /* as of NVM 1.0e */) {
            dprintf(3, "Found incompatible NVMe: prog-if=%02x\n", pci->prog_if);
            continue;
        }

        run_thread(nvme_controller_setup, pci);
    }
}

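/* Split a request into page-sized chunks and bounce each chunk through the
   namespace's page-aligned DMA buffer, which satisfies the alignment and
   page-crossing constraints of nvme_io_readwrite. */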
static int
nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write)
{
    int res = DISK_RET_SUCCESS;
    u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size;
    u16 i;

    for (i = 0; i < op->count && res == DISK_RET_SUCCESS;) {
        u16 blocks_remaining = op->count - i;
        u16 blocks = blocks_remaining < max_blocks ? blocks_remaining
                                                   : max_blocks;
        char *op_buf = op->buf_fl + i * ns->block_size;

        if (write) {
            memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size);
        }

        res = nvme_io_readwrite(ns, op->lba + i, ns->dma_buffer, blocks, write);
        dprintf(3, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write"
                                                                  : "read",
                op->lba + i, blocks, res);

        if (!write && res == DISK_RET_SUCCESS) {
            memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size);
        }

        i += blocks;
    }

    return res;
}

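/* Dispatch a disk operation to the NVMe namespace it targets. */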
int
nvme_process_op(struct disk_op_s *op)
{
    if (!CONFIG_NVME)
        return DISK_RET_SUCCESS;

    struct nvme_namespace *ns = container_of(op->drive_gf, struct nvme_namespace,
                                             drive);

    switch (op->command) {
    case CMD_READ:
    case CMD_WRITE:
        return nvme_cmd_readwrite(ns, op, op->command == CMD_WRITE);
    default:
        return default_process_op(op);
    }
}

void
nvme_setup(void)
{
    ASSERT32FLAT();
    if (!CONFIG_NVME)
        return;

    dprintf(3, "init nvme\n");
    nvme_scan();
}

/* EOF */