// Low level NVMe disk access
//
// Copyright 2017 Amazon.com, Inc. or its affiliates.
//
// This file may be distributed under the terms of the GNU LGPLv3 license.

#include "blockcmd.h"
#include "malloc.h" // malloc_high
#include "output.h" // dprintf
#include "pci.h"
#include "pci_ids.h" // PCI_CLASS_STORAGE_NVME
#include "pci_regs.h" // PCI_BASE_ADDRESS_0
#include "pcidevice.h" // foreachpci
#include "stacks.h" // yield
#include "std/disk.h" // DISK_RET_
#include "string.h" // memset
#include "util.h" // boot_add_hd
#include "x86.h" // readl

#include "nvme.h"
#include "nvme-int.h"

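/* Allocate a zeroed, page-aligned buffer from the given memory zone. */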
static void *
zalloc_page_aligned(struct zone_s *zone, u32 size)
{
    void *res = _malloc(zone, size, NVME_PAGE_SIZE);
    if (res) memset(res, 0, size);
    return res;
}

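/* Initialize the fields shared by submission and completion queues. The
   queue's doorbell register lives at offset 0x1000 from the controller
   registers, spaced by the controller's doorbell stride; mask is used to
   wrap queue indices. */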
static void
nvme_init_queue_common(struct nvme_ctrl *ctrl, struct nvme_queue *q, u16 q_idx,
                       u16 length)
{
    memset(q, 0, sizeof(*q));
    q->dbl = (u32 *)((char *)ctrl->reg + 0x1000 + q_idx * ctrl->doorbell_stride);
    dprintf(3, " q %p q_idx %u dbl %p\n", q, q_idx, q->dbl);
    q->mask = length - 1;
}

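/* Initialize a submission queue: allocate its SQE array in high memory and
   attach it to the completion queue that will receive its completions.
   Returns 0 on success, -1 on allocation failure. */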
static int
nvme_init_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, u16 length,
             struct nvme_cq *cq)
{
    nvme_init_queue_common(ctrl, &sq->common, q_idx, length);
    sq->sqe = zalloc_page_aligned(&ZoneHigh, sizeof(*sq->sqe) * length);

    if (!sq->sqe) {
        warn_noalloc();
        return -1;
    }

    dprintf(3, "sq %p q_idx %u sqe %p\n", sq, q_idx, sq->sqe);
    sq->cq = cq;
    sq->head = 0;
    sq->tail = 0;

    return 0;
}

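/* Initialize a completion queue: allocate its CQE array in high memory and
   set the expected phase bit. Returns 0 on success, -1 on allocation
   failure. */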
static int
nvme_init_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx, u16 length)
{
    nvme_init_queue_common(ctrl, &cq->common, q_idx, length);
    cq->cqe = zalloc_page_aligned(&ZoneHigh, sizeof(*cq->cqe) * length);
    if (!cq->cqe) {
        warn_noalloc();
        return -1;
    }

    cq->head = 0;

    /* All CQE phase bits are initialized to zero. This means initially we
       wait for the controller to set these to 1. */
    cq->phase = 1;

    return 0;
}

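/* Check whether the CQE at the queue head has been posted by the controller,
   i.e. whether its phase bit matches the phase we currently expect. */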
static int
nvme_poll_cq(struct nvme_cq *cq)
{
    u32 dw3 = readl(&cq->cqe[cq->head].dword[3]);
    return (!!(dw3 & NVME_CQE_DW3_P) == cq->phase);
}

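/* A command completed successfully if the Status Code field (bits 8:1 of the
   CQE status) is zero. */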
static int
nvme_is_cqe_success(struct nvme_cqe const *cqe)
{
    return ((cqe->status >> 1) & 0xFF) == 0;
}

static struct nvme_cqe
nvme_error_cqe(void)
{
    struct nvme_cqe r;

    /* 0xFF is a vendor specific status code != success. Should be okay for
       indicating failure. */
    memset(&r, 0xFF, sizeof(r));
    return r;
}

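/* Consume the CQE at the head of the completion queue: advance the head
   (inverting the expected phase on wrap-around), sync the submission queue
   head from the CQE, and ring the completion doorbell. Returns an error CQE
   if no completion is ready. */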
static struct nvme_cqe
nvme_consume_cqe(struct nvme_sq *sq)
{
    struct nvme_cq *cq = sq->cq;

    if (!nvme_poll_cq(cq)) {
        /* Cannot consume a completion queue entry, if there is none ready. */
        return nvme_error_cqe();
    }

    struct nvme_cqe *cqe = &cq->cqe[cq->head];
    u16 cq_next_head = (cq->head + 1) & cq->common.mask;
    dprintf(4, "cq %p head %u -> %u\n", cq, cq->head, cq_next_head);
    if (cq_next_head < cq->head) {
        dprintf(3, "cq %p wrap\n", cq);
        cq->phase = ~cq->phase;
    }
    cq->head = cq_next_head;

    /* Update the submission queue head. */
    if (cqe->sq_head != sq->head) {
        sq->head = cqe->sq_head;
        dprintf(4, "sq %p advanced to %u\n", sq, cqe->sq_head);
    }

    /* Tell the controller that we consumed the completion. */
    writel(cq->common.dbl, cq->head);

    return *cqe;
}

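/* Wait for a completion on the completion queue attached to sq and consume
   it. Returns an error CQE if the controller does not respond within the
   timeout. */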
static struct nvme_cqe
nvme_wait(struct nvme_sq *sq)
{
    static const unsigned nvme_timeout = 5000 /* ms */;
    u32 to = timer_calc(nvme_timeout);
    while (!nvme_poll_cq(sq->cq)) {
        yield();

        if (timer_check(to)) {
            warn_timeout();
            return nvme_error_cqe();
        }
    }

    return nvme_consume_cqe(sq);
}

/* Returns the next submission queue entry (or NULL if the queue is full). It
   also fills out Command Dword 0 and clears the rest. */
static struct nvme_sqe *
nvme_get_next_sqe(struct nvme_sq *sq, u8 opc, void *metadata, void *data, void *data2)
{
    if (((sq->head + 1) & sq->common.mask) == sq->tail) {
        dprintf(3, "submission queue is full\n");
        return NULL;
    }

    struct nvme_sqe *sqe = &sq->sqe[sq->tail];
    dprintf(4, "sq %p next_sqe %u\n", sq, sq->tail);

    memset(sqe, 0, sizeof(*sqe));
    sqe->cdw0 = opc | (sq->tail << 16 /* CID */);
    sqe->mptr = (u32)metadata;
    sqe->dptr_prp1 = (u32)data;
    sqe->dptr_prp2 = (u32)data2;

    return sqe;
}

/* Call this after you've filled out an sqe that you've got from nvme_get_next_sqe. */
static void
nvme_commit_sqe(struct nvme_sq *sq)
{
    dprintf(4, "sq %p commit_sqe %u\n", sq, sq->tail);
    sq->tail = (sq->tail + 1) & sq->common.mask;
    writel(sq->common.dbl, sq->tail);
}

/* Perform an identify command on the admin queue and return the resulting
   buffer. This may be NULL if something failed. This function cannot be used
   after initialization, because it uses buffers in the temporary zone. */
static union nvme_identify *
nvme_admin_identify(struct nvme_ctrl *ctrl, u8 cns, u32 nsid)
{
    union nvme_identify *identify_buf = zalloc_page_aligned(&ZoneTmpHigh, 4096);
    if (!identify_buf) {
        /* Could not allocate identify buffer. */
        warn_internalerror();
        return NULL;
    }

    struct nvme_sqe *cmd_identify;
    cmd_identify = nvme_get_next_sqe(&ctrl->admin_sq,
                                     NVME_SQE_OPC_ADMIN_IDENTIFY, NULL,
                                     identify_buf, NULL);

    if (!cmd_identify) {
        warn_internalerror();
        goto error;
    }

    cmd_identify->nsid = nsid;
    cmd_identify->dword[10] = cns;

    nvme_commit_sqe(&ctrl->admin_sq);

    struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);

    if (!nvme_is_cqe_success(&cqe)) {
        goto error;
    }

    return identify_buf;
 error:
    free(identify_buf);
    return NULL;
}

static struct nvme_identify_ctrl *
nvme_admin_identify_ctrl(struct nvme_ctrl *ctrl)
{
    return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_CTRL, 0)->ctrl;
}

static struct nvme_identify_ns *
nvme_admin_identify_ns(struct nvme_ctrl *ctrl, u32 ns_id)
{
    return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_NS,
                                ns_id)->ns;
}

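/* Identify namespace ns_idx + 1 and, if it is active and usable, register it
   as a boot drive. mdts is the controller's Maximum Data Transfer Size as a
   power of two in units of the controller page size (0 means unlimited). */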
static void
nvme_probe_ns(struct nvme_ctrl *ctrl, u32 ns_idx, u8 mdts)
{
    u32 ns_id = ns_idx + 1;

    struct nvme_identify_ns *id = nvme_admin_identify_ns(ctrl, ns_id);
    if (!id) {
        dprintf(2, "NVMe couldn't identify namespace %u.\n", ns_id);
        goto free_buffer;
    }

    u8 current_lba_format = id->flbas & 0xF;
    if (current_lba_format > id->nlbaf) {
        dprintf(2, "NVMe NS %u: current LBA format %u is beyond what the "
                "namespace supports (%u)?\n",
                ns_id, current_lba_format, id->nlbaf + 1);
        goto free_buffer;
    }

    if (!id->nsze) {
        dprintf(2, "NVMe NS %u is inactive.\n", ns_id);
        goto free_buffer;
    }

    struct nvme_namespace *ns = malloc_fseg(sizeof(*ns));
    if (!ns) {
        warn_noalloc();
        goto free_buffer;
    }
    memset(ns, 0, sizeof(*ns));
    ns->ctrl  = ctrl;
    ns->ns_id = ns_id;
    ns->lba_count = id->nsze;

    struct nvme_lba_format *fmt = &id->lbaf[current_lba_format];

    ns->block_size    = 1U << fmt->lbads;
    ns->metadata_size = fmt->ms;

    if (ns->block_size > NVME_PAGE_SIZE) {
        /* If we see devices that trigger this path, we need to increase our
           buffer size. */
        warn_internalerror();
        free(ns);
        goto free_buffer;
    }

    ns->drive.cntl_id   = ns_idx;
    ns->drive.removable = 0;
    ns->drive.type      = DTYPE_NVME;
    ns->drive.blksize   = ns->block_size;
    ns->drive.sectors   = ns->lba_count;

    if (mdts) {
        ns->max_req_size = ((1U << mdts) * NVME_PAGE_SIZE) / ns->block_size;
        dprintf(3, "NVMe NS %u max request size: %d sectors\n",
                ns_id, ns->max_req_size);
    } else {
        ns->max_req_size = -1U;
    }

    ns->dma_buffer = zalloc_page_aligned(&ZoneHigh, NVME_PAGE_SIZE);

    char *desc = znprintf(MAXDESCSIZE, "NVMe NS %u: %llu MiB (%llu %u-byte "
                          "blocks + %u-byte metadata)",
                          ns_id, (ns->lba_count * ns->block_size) >> 20,
                          ns->lba_count, ns->block_size, ns->metadata_size);

    dprintf(3, "%s\n", desc);
    boot_add_hd(&ns->drive, desc, bootprio_find_pci_device(ctrl->pci));

free_buffer:
    free(id);
}

/* Release memory allocated for a completion queue */
static void
nvme_destroy_cq(struct nvme_cq *cq)
{
    free(cq->cqe);
    cq->cqe = NULL;
}

/* Release memory allocated for a submission queue */
static void
nvme_destroy_sq(struct nvme_sq *sq)
{
    free(sq->sqe);
    sq->sqe = NULL;
}

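/* Create an I/O completion queue. q_idx is the queue's doorbell index; the
   NVMe queue identifier is q_idx / 2, since completion doorbells sit at odd
   indices and submission doorbells at even ones. */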
/* Returns 0 on success. */
static int
nvme_create_io_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, u16 q_idx)
{
    int rc;
    struct nvme_sqe *cmd_create_cq;
    u32 length = 1 + (ctrl->reg->cap & 0xffff);
    if (length > NVME_PAGE_SIZE / sizeof(struct nvme_cqe))
        length = NVME_PAGE_SIZE / sizeof(struct nvme_cqe);

    rc = nvme_init_cq(ctrl, cq, q_idx, length);
    if (rc) {
        goto err;
    }

    cmd_create_cq = nvme_get_next_sqe(&ctrl->admin_sq,
                                      NVME_SQE_OPC_ADMIN_CREATE_IO_CQ, NULL,
                                      cq->cqe, NULL);
    if (!cmd_create_cq) {
        goto err_destroy_cq;
    }

    cmd_create_cq->dword[10] = (cq->common.mask << 16) | (q_idx >> 1);
    cmd_create_cq->dword[11] = 1 /* physically contiguous */;

    nvme_commit_sqe(&ctrl->admin_sq);

    struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);

    if (!nvme_is_cqe_success(&cqe)) {
        dprintf(2, "create io cq failed: %08x %08x %08x %08x\n",
                cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);

        goto err_destroy_cq;
    }

    return 0;

err_destroy_cq:
    nvme_destroy_cq(cq);
err:
    return -1;
}

/* Returns 0 on success. */
static int
nvme_create_io_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, u16 q_idx, struct nvme_cq *cq)
{
    int rc;
    struct nvme_sqe *cmd_create_sq;
    u32 length = 1 + (ctrl->reg->cap & 0xffff);
    if (length > NVME_PAGE_SIZE / sizeof(struct nvme_cqe))
        length = NVME_PAGE_SIZE / sizeof(struct nvme_cqe);

    rc = nvme_init_sq(ctrl, sq, q_idx, length, cq);
    if (rc) {
        goto err;
    }

    cmd_create_sq = nvme_get_next_sqe(&ctrl->admin_sq,
                                      NVME_SQE_OPC_ADMIN_CREATE_IO_SQ, NULL,
                                      sq->sqe, NULL);
    if (!cmd_create_sq) {
        goto err_destroy_sq;
    }

    cmd_create_sq->dword[10] = (sq->common.mask << 16) | (q_idx >> 1);
    cmd_create_sq->dword[11] = (q_idx >> 1) << 16 | 1 /* contiguous */;
    dprintf(3, "sq %p create dword10 %08x dword11 %08x\n", sq,
            cmd_create_sq->dword[10], cmd_create_sq->dword[11]);

    nvme_commit_sqe(&ctrl->admin_sq);

    struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq);

    if (!nvme_is_cqe_success(&cqe)) {
        dprintf(2, "create io sq failed: %08x %08x %08x %08x\n",
                cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);
        goto err_destroy_sq;
    }

    return 0;

err_destroy_sq:
    nvme_destroy_sq(sq);
err:
    return -1;
}

/* Submit a read or write of count sectors starting at lba. buf is the first
   data page; transfers that span more than one page use the namespace's
   previously built PRP list. Returns DISK_RET_*. */
static int
nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count,
                  int write)
{
    u32 buf_addr = (u32)buf;
    void *prp2;

    if (buf_addr & 0x3) {
        /* Buffer is misaligned */
        warn_internalerror();
        return DISK_RET_EBADTRACK;
    }

    if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) {
        /* We need to describe more than 2 pages, rely on PRP List */
        prp2 = ns->prpl;
    } else if ((ns->block_size * count) > NVME_PAGE_SIZE) {
        /* Directly embed the 2nd page if we only need 2 pages */
        prp2 = (void *)(long)ns->prpl[0];
    } else {
        /* One page is enough, don't expose anything else */
        prp2 = NULL;
    }

    struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq,
                                                 write ? NVME_SQE_OPC_IO_WRITE
                                                       : NVME_SQE_OPC_IO_READ,
                                                 NULL, buf, prp2);
    io_read->nsid = ns->ns_id;
    io_read->dword[10] = (u32)lba;
    io_read->dword[11] = (u32)(lba >> 32);
    io_read->dword[12] = (1U << 31 /* limited retry */) | (count - 1);

    nvme_commit_sqe(&ns->ctrl->io_sq);

    struct nvme_cqe cqe = nvme_wait(&ns->ctrl->io_sq);

    if (!nvme_is_cqe_success(&cqe)) {
        dprintf(2, "read io: %08x %08x %08x %08x\n",
                cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]);

        return DISK_RET_EBADTRACK;
    }

    return DISK_RET_SUCCESS;
}

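/* Helpers for building a PRP (Physical Region Page) list describing transfers
   that span more than two pages. */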
static void nvme_reset_prpl(struct nvme_namespace *ns)
{
    ns->prpl_len = 0;
}

static int nvme_add_prpl(struct nvme_namespace *ns, u64 base)
{
    if (ns->prpl_len >= NVME_MAX_PRPL_ENTRIES)
        return -1;

    ns->prpl[ns->prpl_len++] = base;

    return 0;
}

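/* Prepare prp1 and the PRP list for a transfer of count blocks starting at
   op_buf. Returns the number of blocks the prepared descriptors cover, or 0
   if the buffer layout cannot be described (the caller then falls back to
   the bounce buffer). */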
static int nvme_build_prpl(struct nvme_namespace *ns, void *op_buf, u16 count)
{
    int first_page = 1;
    u32 base = (long)op_buf;
    s32 size;

    if (count > ns->max_req_size)
        count = ns->max_req_size;

    nvme_reset_prpl(ns);

    size = count * ns->block_size;
    /* Special case for transfers that fit into PRP1, but are unaligned */
    if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE)) {
        ns->prp1 = op_buf;
        return count;
    }

    /* Every request has to be page aligned */
    if (base & ~NVME_PAGE_MASK)
        return 0;

    /* Make sure a full block fits into the last chunk */
    if (size & (ns->block_size - 1ULL))
        return 0;

    for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) {
        if (first_page) {
            /* First page is special */
            ns->prp1 = (void*)base;
            first_page = 0;
            continue;
        }
        if (nvme_add_prpl(ns, base))
            return 0;
    }

    return count;
}

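/* Create the I/O submission/completion queue pair (queue ID 1) used for all
   disk I/O. Returns 0 on success. */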
static int
nvme_create_io_queues(struct nvme_ctrl *ctrl)
{
    if (nvme_create_io_cq(ctrl, &ctrl->io_cq, 3))
        goto err;

    if (nvme_create_io_sq(ctrl, &ctrl->io_sq, 2, &ctrl->io_cq))
        goto err_free_cq;

    return 0;

 err_free_cq:
    nvme_destroy_cq(&ctrl->io_cq);
 err:
    return -1;
}

/* Waits for CSTS.RDY to match rdy. Returns 0 on success. */
static int
nvme_wait_csts_rdy(struct nvme_ctrl *ctrl, unsigned rdy)
{
    u32 const max_to = 500 /* ms */ * ((ctrl->reg->cap >> 24) & 0xFFU);
    u32 to = timer_calc(max_to);
    u32 csts;

    while (rdy != ((csts = ctrl->reg->csts) & NVME_CSTS_RDY)) {
        yield();

        if (csts & NVME_CSTS_FATAL) {
            dprintf(3, "NVMe fatal error during controller shutdown\n");
            return -1;
        }

        if (timer_check(to)) {
            warn_timeout();
            return -1;
        }
    }

    return 0;
}

/* Returns 0 on success. */
static int
nvme_controller_enable(struct nvme_ctrl *ctrl)
{
    int rc;

    pci_enable_busmaster(ctrl->pci);

    /* Turn the controller off. */
    ctrl->reg->cc = 0;
    if (nvme_wait_csts_rdy(ctrl, 0)) {
        dprintf(2, "NVMe fatal error during controller shutdown\n");
        return -1;
    }

    ctrl->doorbell_stride = 4U << ((ctrl->reg->cap >> 32) & 0xF);

    rc = nvme_init_cq(ctrl, &ctrl->admin_cq, 1,
                      NVME_PAGE_SIZE / sizeof(struct nvme_cqe));
    if (rc) {
        return -1;
    }

    rc = nvme_init_sq(ctrl, &ctrl->admin_sq, 0,
                      NVME_PAGE_SIZE / sizeof(struct nvme_sqe), &ctrl->admin_cq);
    if (rc) {
        goto err_destroy_admin_cq;
    }

    ctrl->reg->aqa = ctrl->admin_cq.common.mask << 16
        | ctrl->admin_sq.common.mask;

    ctrl->reg->asq = (u32)ctrl->admin_sq.sqe;
    ctrl->reg->acq = (u32)ctrl->admin_cq.cqe;

    dprintf(3, " admin submission queue: %p\n", ctrl->admin_sq.sqe);
    dprintf(3, " admin completion queue: %p\n", ctrl->admin_cq.cqe);

    ctrl->reg->cc = NVME_CC_EN | (NVME_CQE_SIZE_LOG << 20)
        | (NVME_SQE_SIZE_LOG << 16 /* IOSQES */);

    if (nvme_wait_csts_rdy(ctrl, 1)) {
        dprintf(2, "NVMe fatal error while enabling controller\n");
        goto err_destroy_admin_sq;
    }

    /* The admin queue is set up and the controller is ready. Let's figure out
       what namespaces we have. */

    struct nvme_identify_ctrl *identify = nvme_admin_identify_ctrl(ctrl);

    if (!identify) {
        dprintf(2, "NVMe couldn't identify controller.\n");
        goto err_destroy_admin_sq;
    }

    dprintf(3, "NVMe has %u namespace%s.\n",
            identify->nn, (identify->nn == 1) ? "" : "s");

    ctrl->ns_count = identify->nn;
    /* Save the max transfer size before the identify buffer is freed. */
    u8 mdts = identify->mdts;
    free(identify);

    if ((ctrl->ns_count == 0) || nvme_create_io_queues(ctrl)) {
        /* No point to continue, if the controller says it doesn't have
           namespaces or we couldn't create I/O queues. */
        goto err_destroy_admin_sq;
    }

    /* Populate namespace IDs */
    int ns_idx;
    for (ns_idx = 0; ns_idx < ctrl->ns_count; ns_idx++) {
        nvme_probe_ns(ctrl, ns_idx, mdts);
    }

    dprintf(3, "NVMe initialization complete!\n");
    return 0;

 err_destroy_admin_sq:
    nvme_destroy_sq(&ctrl->admin_sq);
 err_destroy_admin_cq:
    nvme_destroy_cq(&ctrl->admin_cq);
    return -1;
}

/* Initialize an NVMe controller and detect its drives. */
static void
nvme_controller_setup(void *opaque)
{
    u8 skip_nonbootable = is_bootprio_strict();
    struct pci_device *pci = opaque;

    if (skip_nonbootable && bootprio_find_pci_device(pci) < 0) {
        dprintf(1, "skipping init of a non-bootable NVMe at %pP\n",
                pci);
        goto err;
    }

    struct nvme_reg volatile *reg = pci_enable_membar(pci, PCI_BASE_ADDRESS_0);
    if (!reg)
        return;

    u32 version = reg->vs;
    dprintf(3, "Found NVMe controller with version %u.%u.%u.\n",
            version >> 16, (version >> 8) & 0xFF, version & 0xFF);
    dprintf(3, " Capabilities %016llx\n", reg->cap);

    if (~reg->cap & NVME_CAP_CSS_NVME) {
        dprintf(3, "Controller doesn't speak NVMe command set. Skipping.\n");
        goto err;
    }

    struct nvme_ctrl *ctrl = malloc_high(sizeof(*ctrl));
    if (!ctrl) {
        warn_noalloc();
        goto err;
    }

    memset(ctrl, 0, sizeof(*ctrl));

    ctrl->reg = reg;
    ctrl->pci = pci;

    if (nvme_controller_enable(ctrl)) {
        goto err_free_ctrl;
    }

    return;

 err_free_ctrl:
    free(ctrl);
 err:
    dprintf(2, "Failed to enable NVMe controller.\n");
}

// Locate and init NVMe controllers
static void
nvme_scan(void)
{
    // Scan PCI bus for NVMe adapters
    struct pci_device *pci;

    foreachpci(pci) {
        if (pci->class != PCI_CLASS_STORAGE_NVME)
            continue;
        if (pci->prog_if != 2 /* as of NVM 1.0e */) {
            dprintf(3, "Found incompatible NVMe: prog-if=%02x\n", pci->prog_if);
            continue;
        }

        run_thread(nvme_controller_setup, pci);
    }
}

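/* Split a disk read/write request into NVMe commands. Chunks whose buffer can
   be described by a PRP list are submitted directly; everything else goes
   through the one-page DMA bounce buffer. */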
static int
nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write)
{
    int res = DISK_RET_SUCCESS;
    u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size;
    u16 i, blocks;

    for (i = 0; i < op->count && res == DISK_RET_SUCCESS;) {
        u16 blocks_remaining = op->count - i;
        char *op_buf = op->buf_fl + i * ns->block_size;

        blocks = nvme_build_prpl(ns, op_buf, blocks_remaining);
        if (blocks) {
            res = nvme_io_readwrite(ns, op->lba + i, ns->prp1, blocks, write);
            dprintf(5, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write"
                                                                      : "read",
                    op->lba + i, blocks, res);
        } else {
            blocks = blocks_remaining < max_blocks ? blocks_remaining
                                                   : max_blocks;

            if (write) {
                memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size);
            }

            res = nvme_io_readwrite(ns, op->lba + i, ns->dma_buffer, blocks, write);
            dprintf(5, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write"
                                                                      : "read",
                    op->lba + i, blocks, res);

            if (!write && res == DISK_RET_SUCCESS) {
                memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size);
            }
        }

        i += blocks;
    }

    return res;
}

int
nvme_process_op(struct disk_op_s *op)
{
    if (!CONFIG_NVME)
        return DISK_RET_SUCCESS;

    struct nvme_namespace *ns = container_of(op->drive_fl, struct nvme_namespace,
                                             drive);

    switch (op->command) {
    case CMD_READ:
    case CMD_WRITE:
        return nvme_cmd_readwrite(ns, op, op->command == CMD_WRITE);
    default:
        return default_process_op(op);
    }
}

void
nvme_setup(void)
{
    ASSERT32FLAT();
    if (!CONFIG_NVME)
        return;

    dprintf(3, "init nvme\n");
    nvme_scan();
}

/* EOF */