/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2019 Intel Corporation.
 * All rights reserved.
 */

#include "bdev_uring.h"

#include "spdk/stdinc.h"
#include "spdk/config.h"
#include "spdk/barrier.h"
#include "spdk/bdev.h"
#include "spdk/env.h"
#include "spdk/fd.h"
#include "spdk/likely.h"
#include "spdk/thread.h"
#include "spdk/json.h"
#include "spdk/util.h"
#include "spdk/string.h"
#include "spdk/file.h"

#include "spdk/log.h"
#include "spdk_internal/uring.h"

#ifdef SPDK_CONFIG_URING_ZNS
#include <linux/blkzoned.h>
#define SECTOR_SHIFT 9
#endif

struct bdev_uring_zoned_dev {
        uint64_t num_zones;
        uint32_t zone_shift;
        uint32_t lba_shift;
};

struct bdev_uring_io_channel {
        struct bdev_uring_group_channel *group_ch;
};

struct bdev_uring_group_channel {
        uint64_t io_inflight;
        uint64_t io_pending;
        struct spdk_poller *poller;
        struct io_uring uring;
};

struct bdev_uring_task {
        uint64_t len;
        struct bdev_uring_io_channel *ch;
        TAILQ_ENTRY(bdev_uring_task) link;
};

struct bdev_uring {
        struct spdk_bdev bdev;
        struct bdev_uring_zoned_dev zd;
        char *filename;
        int fd;
        TAILQ_ENTRY(bdev_uring) link;
};

static int bdev_uring_init(void);
static void bdev_uring_fini(void);
static void uring_free_bdev(struct bdev_uring *uring);
static TAILQ_HEAD(, bdev_uring) g_uring_bdev_head = TAILQ_HEAD_INITIALIZER(g_uring_bdev_head);

#define SPDK_URING_QUEUE_DEPTH 512
#define MAX_EVENTS_PER_POLL 32

static int
bdev_uring_get_ctx_size(void)
{
        return sizeof(struct bdev_uring_task);
}

static struct spdk_bdev_module uring_if = {
        .name = "uring",
        .module_init = bdev_uring_init,
        .module_fini = bdev_uring_fini,
        .get_ctx_size = bdev_uring_get_ctx_size,
};

SPDK_BDEV_MODULE_REGISTER(uring, &uring_if)

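/* Open the backing file or block device. O_DIRECT is tried first and dropped
 * as a fallback so that regular (non-disk) files can still be used. */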
static int
bdev_uring_open(struct bdev_uring *bdev)
{
        int fd;

        fd = open(bdev->filename, O_RDWR | O_DIRECT | O_NOATIME);
        if (fd < 0) {
                /* Try without O_DIRECT for non-disk files */
                fd = open(bdev->filename, O_RDWR | O_NOATIME);
                if (fd < 0) {
                        SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n",
                                    bdev->filename, errno, spdk_strerror(errno));
                        bdev->fd = -1;
                        return -1;
                }
        }

        bdev->fd = fd;

        return 0;
}

static void
dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
{
}

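/* Re-read the size of the backing file and, if it changed, notify the bdev
 * layer of the new block count. */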
int
bdev_uring_rescan(const char *name)
{
        struct spdk_bdev_desc *desc;
        struct spdk_bdev *bdev;
        struct bdev_uring *uring;
        uint64_t uring_size, blockcnt;
        int rc;

        rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &desc);
        if (rc != 0) {
                return rc;
        }

        bdev = spdk_bdev_desc_get_bdev(desc);
        if (bdev->module != &uring_if) {
                rc = -ENODEV;
                goto exit;
        }

        uring = SPDK_CONTAINEROF(bdev, struct bdev_uring, bdev);
        uring_size = spdk_fd_get_size(uring->fd);
        blockcnt = uring_size / bdev->blocklen;

        if (bdev->blockcnt != blockcnt) {
                SPDK_NOTICELOG("URING device is resized: bdev name %s, old block count %" PRIu64
                               ", new block count %" PRIu64 "\n",
                               uring->filename,
                               bdev->blockcnt,
                               blockcnt);
                rc = spdk_bdev_notify_blockcnt_change(bdev, blockcnt);
                if (rc != 0) {
                        SPDK_ERRLOG("Could not change num blocks for uring bdev: name %s, errno: %d.\n",
                                    uring->filename, rc);
                        goto exit;
                }
        }

exit:
        spdk_bdev_close(desc);
        return rc;
}

static int
bdev_uring_close(struct bdev_uring *bdev)
{
        int rc;

        if (bdev->fd == -1) {
                return 0;
        }

        rc = close(bdev->fd);
        if (rc < 0) {
                SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n",
                            bdev->fd, errno, spdk_strerror(errno));
                return -1;
        }

        bdev->fd = -1;

        return 0;
}

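/* Queue a vectored read on the shared per-group io_uring. Submission itself is
 * deferred to the group poller; here the SQE is only prepared and accounted
 * for in io_pending. */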
static int64_t
bdev_uring_readv(struct bdev_uring *uring, struct spdk_io_channel *ch,
                 struct bdev_uring_task *uring_task,
                 struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset)
{
        struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
        struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(&group_ch->uring);
        if (!sqe) {
                SPDK_DEBUGLOG(uring, "get sqe failed as out of resource\n");
                return -ENOMEM;
        }

        io_uring_prep_readv(sqe, uring->fd, iov, iovcnt, offset);
        io_uring_sqe_set_data(sqe, uring_task);
        uring_task->len = nbytes;
        uring_task->ch = uring_ch;

        SPDK_DEBUGLOG(uring, "read %d iovs size %lu to off: %#lx\n",
                      iovcnt, nbytes, offset);

        group_ch->io_pending++;
        return nbytes;
}

static int64_t
bdev_uring_writev(struct bdev_uring *uring, struct spdk_io_channel *ch,
                  struct bdev_uring_task *uring_task,
                  struct iovec *iov, int iovcnt, size_t nbytes, uint64_t offset)
{
        struct bdev_uring_io_channel *uring_ch = spdk_io_channel_get_ctx(ch);
        struct bdev_uring_group_channel *group_ch = uring_ch->group_ch;
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(&group_ch->uring);
        if (!sqe) {
                SPDK_DEBUGLOG(uring, "get sqe failed as out of resource\n");
                return -ENOMEM;
        }

        io_uring_prep_writev(sqe, uring->fd, iov, iovcnt, offset);
        io_uring_sqe_set_data(sqe, uring_task);
        uring_task->len = nbytes;
        uring_task->ch = uring_ch;

        SPDK_DEBUGLOG(uring, "write %d iovs size %lu from off: %#lx\n",
                      iovcnt, nbytes, offset);

        group_ch->io_pending++;
        return nbytes;
}

static int
bdev_uring_destruct(void *ctx)
{
        struct bdev_uring *uring = ctx;
        int rc = 0;

        TAILQ_REMOVE(&g_uring_bdev_head, uring, link);
        rc = bdev_uring_close(uring);
        if (rc < 0) {
                SPDK_ERRLOG("bdev_uring_close() failed\n");
        }
        spdk_io_device_unregister(uring, NULL);
        uring_free_bdev(uring);
        return rc;
}

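/* Drain up to 'max' completions from the ring. An I/O is considered successful
 * only if the CQE result matches the byte count recorded at submission time. */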
static int
bdev_uring_reap(struct io_uring *ring, int max)
{
        int i, count, ret;
        struct io_uring_cqe *cqe;
        struct bdev_uring_task *uring_task;
        enum spdk_bdev_io_status status;

        count = 0;
        for (i = 0; i < max; i++) {
                ret = io_uring_peek_cqe(ring, &cqe);
                if (ret != 0) {
                        return ret;
                }

                if (cqe == NULL) {
                        return count;
                }

                uring_task = (struct bdev_uring_task *)cqe->user_data;
                if (cqe->res != (signed)uring_task->len) {
                        status = SPDK_BDEV_IO_STATUS_FAILED;
                } else {
                        status = SPDK_BDEV_IO_STATUS_SUCCESS;
                }

                uring_task->ch->group_ch->io_inflight--;
                io_uring_cqe_seen(ring, cqe);
                spdk_bdev_io_complete(spdk_bdev_io_from_ctx(uring_task), status);
                count++;
        }

        return count;
}

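/* Group-channel poller: flush any SQEs queued since the last poll with a single
 * io_uring_submit() call, then reap completions for everything still in flight. */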
static int
bdev_uring_group_poll(void *arg)
{
        struct bdev_uring_group_channel *group_ch = arg;
        int to_complete, to_submit;
        int count, ret;

        to_submit = group_ch->io_pending;

        if (to_submit > 0) {
                /* If there are I/O to submit, use io_uring_submit here.
                 * It will automatically call spdk_io_uring_enter appropriately. */
                ret = io_uring_submit(&group_ch->uring);
                if (ret < 0) {
                        return SPDK_POLLER_BUSY;
                }

                group_ch->io_pending = 0;
                group_ch->io_inflight += to_submit;
        }

        to_complete = group_ch->io_inflight;
        count = 0;
        if (to_complete > 0) {
                count = bdev_uring_reap(&group_ch->uring, to_complete);
        }

        if (count + to_submit > 0) {
                return SPDK_POLLER_BUSY;
        } else {
                return SPDK_POLLER_IDLE;
        }
}

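/* Buffer-ready callback from spdk_bdev_io_get_buf(): dispatch the aligned
 * buffer to the readv/writev path and translate -ENOMEM into
 * SPDK_BDEV_IO_STATUS_NOMEM so the bdev layer can retry the I/O later. */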
static void
bdev_uring_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
                      bool success)
{
        int64_t ret = 0;

        if (!success) {
                spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
                return;
        }

        switch (bdev_io->type) {
        case SPDK_BDEV_IO_TYPE_READ:
                ret = bdev_uring_readv((struct bdev_uring *)bdev_io->bdev->ctxt,
                                       ch,
                                       (struct bdev_uring_task *)bdev_io->driver_ctx,
                                       bdev_io->u.bdev.iovs,
                                       bdev_io->u.bdev.iovcnt,
                                       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
                                       bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
                break;
        case SPDK_BDEV_IO_TYPE_WRITE:
                ret = bdev_uring_writev((struct bdev_uring *)bdev_io->bdev->ctxt,
                                        ch,
                                        (struct bdev_uring_task *)bdev_io->driver_ctx,
                                        bdev_io->u.bdev.iovs,
                                        bdev_io->u.bdev.iovcnt,
                                        bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen,
                                        bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen);
                break;
        default:
                SPDK_ERRLOG("Wrong io type\n");
                break;
        }

        if (ret == -ENOMEM) {
                spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
        }
}

#ifdef SPDK_CONFIG_URING_ZNS
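/* Helpers that translate the kernel's blk_zone report entries (BLKREPORTZONE)
 * into SPDK zone types and states. */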
static int
bdev_uring_fill_zone_type(struct spdk_bdev_zone_info *zone_info, struct blk_zone *zones_rep)
{
        switch (zones_rep->type) {
        case BLK_ZONE_TYPE_CONVENTIONAL:
                zone_info->type = SPDK_BDEV_ZONE_TYPE_CNV;
                break;
        case BLK_ZONE_TYPE_SEQWRITE_REQ:
                zone_info->type = SPDK_BDEV_ZONE_TYPE_SEQWR;
                break;
        case BLK_ZONE_TYPE_SEQWRITE_PREF:
                zone_info->type = SPDK_BDEV_ZONE_TYPE_SEQWP;
                break;
        default:
                SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", zones_rep->type);
                return -EIO;
        }
        return 0;
}

static int
bdev_uring_fill_zone_state(struct spdk_bdev_zone_info *zone_info, struct blk_zone *zones_rep)
{
        switch (zones_rep->cond) {
        case BLK_ZONE_COND_EMPTY:
                zone_info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
                break;
        case BLK_ZONE_COND_IMP_OPEN:
                zone_info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
                break;
        case BLK_ZONE_COND_EXP_OPEN:
                zone_info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
                break;
        case BLK_ZONE_COND_CLOSED:
                zone_info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
                break;
        case BLK_ZONE_COND_READONLY:
                zone_info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
                break;
        case BLK_ZONE_COND_FULL:
                zone_info->state = SPDK_BDEV_ZONE_STATE_FULL;
                break;
        case BLK_ZONE_COND_OFFLINE:
                zone_info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
                break;
        case BLK_ZONE_COND_NOT_WP:
                zone_info->state = SPDK_BDEV_ZONE_STATE_NOT_WP;
                break;
        default:
                SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", zones_rep->cond);
                return -EIO;
        }
        return 0;
}

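/* Map an SPDK zone action onto the corresponding BLK*ZONE ioctl and issue it
 * synchronously on the device fd. Sector offsets are derived from the zone ID
 * using the cached lba_shift. */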
static int
bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io)
{
        struct bdev_uring *uring;
        struct blk_zone_range range;
        unsigned long zone_mgmt_op;
        uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;

        uring = (struct bdev_uring *)bdev_io->bdev->ctxt;

        switch (bdev_io->u.zone_mgmt.zone_action) {
        case SPDK_BDEV_ZONE_RESET:
                zone_mgmt_op = BLKRESETZONE;
                break;
        case SPDK_BDEV_ZONE_OPEN:
                zone_mgmt_op = BLKOPENZONE;
                break;
        case SPDK_BDEV_ZONE_CLOSE:
                zone_mgmt_op = BLKCLOSEZONE;
                break;
        case SPDK_BDEV_ZONE_FINISH:
                zone_mgmt_op = BLKFINISHZONE;
                break;
        default:
                return -EINVAL;
        }

        range.sector = (zone_id << uring->zd.lba_shift);
        range.nr_sectors = (uring->bdev.zone_size << uring->zd.lba_shift);

        if (ioctl(uring->fd, zone_mgmt_op, &range)) {
                SPDK_ERRLOG("Ioctl BLKXXXZONE(%#x) failed errno: %d(%s)\n",
                            bdev_io->u.zone_mgmt.zone_action, errno, strerror(errno));
                return -EINVAL;
        }

        spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);

        return 0;
}

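/* Implement SPDK_BDEV_IO_TYPE_GET_ZONE_INFO by issuing BLKREPORTZONE requests
 * and converting each reported zone into an spdk_bdev_zone_info entry. */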
static int
bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io)
{
        struct bdev_uring *uring;
        struct blk_zone *zones;
        struct blk_zone_report *rep;
        struct spdk_bdev_zone_info *zone_info = bdev_io->u.zone_mgmt.buf;
        size_t repsize;
        uint32_t i, shift;
        uint32_t num_zones = bdev_io->u.zone_mgmt.num_zones;
        uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;

        uring = (struct bdev_uring *)bdev_io->bdev->ctxt;
        shift = uring->zd.lba_shift;

        if ((num_zones > uring->zd.num_zones) || !num_zones) {
                return -EINVAL;
        }

        repsize = sizeof(struct blk_zone_report) + (sizeof(struct blk_zone) * num_zones);
        rep = (struct blk_zone_report *)malloc(repsize);
        if (!rep) {
                return -ENOMEM;
        }

        zones = (struct blk_zone *)(rep + 1);

        while (num_zones && ((zone_id >> uring->zd.zone_shift) <= num_zones)) {
                memset(rep, 0, repsize);
                rep->sector = zone_id;
                rep->nr_zones = num_zones;

                if (ioctl(uring->fd, BLKREPORTZONE, rep)) {
                        SPDK_ERRLOG("Ioctl BLKREPORTZONE failed errno: %d(%s)\n",
                                    errno, strerror(errno));
                        free(rep);
                        return -EINVAL;
                }

                if (!rep->nr_zones) {
                        break;
                }

                for (i = 0; i < rep->nr_zones; i++) {
                        zone_info->zone_id = ((zones + i)->start >> shift);
                        zone_info->write_pointer = ((zones + i)->wp >> shift);
                        zone_info->capacity = ((zones + i)->capacity >> shift);

                        bdev_uring_fill_zone_state(zone_info, zones + i);
                        bdev_uring_fill_zone_type(zone_info, zones + i);

                        zone_id = ((zones + i)->start + (zones + i)->len) >> shift;
                        zone_info++;
                        num_zones--;
                }
        }

        spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
        free(rep);
        return 0;
}

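/* Detect whether the backing device is zoned by reading
 * /sys/block/<dev>/queue/zoned. For host-aware and host-managed devices the
 * zone geometry and open/active zone limits are queried and cached. */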
static int
bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename)
{
        char *filename_dup = NULL, *base;
        char *str = NULL;
        uint32_t val;
        uint32_t zinfo;
        int retval = -1;
        struct stat sb;
        char resolved_path[PATH_MAX], *rp;
        char *sysfs_path = NULL;

        uring->bdev.zoned = false;

        /* Follow symlink */
        if ((rp = realpath(filename, resolved_path))) {
                filename = rp;
        }

        /* Perform check on block devices only */
        if (stat(filename, &sb) == 0 && !S_ISBLK(sb.st_mode)) {
                return 0;
        }

        /* strdup() because basename() may modify the passed parameter */
        filename_dup = strdup(filename);
        if (filename_dup == NULL) {
                SPDK_ERRLOG("Could not duplicate string %s\n", filename);
                return -1;
        }

        /* basename() may return a pointer into filename_dup, so keep the copy
         * alive until we are done with 'base'. */
        base = basename(filename_dup);
        sysfs_path = spdk_sprintf_alloc("/sys/block/%s/queue/zoned", base);
        retval = spdk_read_sysfs_attribute(&str, "%s", sysfs_path);
        /* Check if this is a zoned block device */
        if (retval < 0) {
                SPDK_ERRLOG("Unable to open file %s. errno: %d\n", sysfs_path, retval);
        } else if (strcmp(str, "host-aware") == 0 || strcmp(str, "host-managed") == 0) {
                /* Only host-aware & host-managed ZNS devices */
                uring->bdev.zoned = true;

                if (ioctl(uring->fd, BLKGETNRZONES, &zinfo)) {
                        SPDK_ERRLOG("ioctl BLKGETNRZONES failed %d (%s)\n", errno, strerror(errno));
                        goto err_ret;
                }
                uring->zd.num_zones = zinfo;

                if (ioctl(uring->fd, BLKGETZONESZ, &zinfo)) {
                        SPDK_ERRLOG("ioctl BLKGETZONESZ failed %d (%s)\n", errno, strerror(errno));
                        goto err_ret;
                }

                uring->zd.lba_shift = uring->bdev.required_alignment - SECTOR_SHIFT;
                uring->bdev.zone_size = (zinfo >> uring->zd.lba_shift);
                uring->zd.zone_shift = spdk_u32log2(zinfo >> uring->zd.lba_shift);

                retval = spdk_read_sysfs_attribute_uint32(&val, "/sys/block/%s/queue/max_open_zones", base);
                if (retval < 0) {
                        SPDK_ERRLOG("Failed to get max open zones %d (%s)\n", retval, strerror(-retval));
                        goto err_ret;
                }
                uring->bdev.max_open_zones = uring->bdev.optimal_open_zones = val;

                retval = spdk_read_sysfs_attribute_uint32(&val, "/sys/block/%s/queue/max_active_zones", base);
                if (retval < 0) {
                        SPDK_ERRLOG("Failed to get max active zones %d (%s)\n", retval, strerror(-retval));
                        goto err_ret;
                }
                uring->bdev.max_active_zones = val;
                retval = 0;
        } else {
                retval = 0; /* queue/zoned=none */
        }
err_ret:
        free(str);
        free(sysfs_path);
        free(filename_dup);
        return retval;
}
#else
/* No support for zoned devices */
static int
bdev_uring_zone_management_op(struct spdk_bdev_io *bdev_io)
{
        return -1;
}

static int
bdev_uring_zone_get_info(struct spdk_bdev_io *bdev_io)
{
        return -1;
}

static int
bdev_uring_check_zoned_support(struct bdev_uring *uring, const char *name, const char *filename)
{
        return 0;
}
#endif

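/* Central request dispatcher for the bdev fn_table. Zone requests are handled
 * inline; reads and writes first obtain an aligned buffer via
 * spdk_bdev_io_get_buf() and continue in bdev_uring_get_buf_cb(). */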
static int
_bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
        switch (bdev_io->type) {
        case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
                return bdev_uring_zone_get_info(bdev_io);
        case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
                return bdev_uring_zone_management_op(bdev_io);
        /* Read and write operations must be performed on buffers aligned to
         * bdev->required_alignment. If user specified unaligned buffers,
         * get the aligned buffer from the pool by calling spdk_bdev_io_get_buf. */
        case SPDK_BDEV_IO_TYPE_READ:
        case SPDK_BDEV_IO_TYPE_WRITE:
                spdk_bdev_io_get_buf(bdev_io, bdev_uring_get_buf_cb,
                                     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
                return 0;
        default:
                return -1;
        }
}

static void
bdev_uring_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
        if (_bdev_uring_submit_request(ch, bdev_io) < 0) {
                spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
        }
}

static bool
bdev_uring_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
{
        switch (io_type) {
#ifdef SPDK_CONFIG_URING_ZNS
        case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
        case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
#endif
        case SPDK_BDEV_IO_TYPE_READ:
        case SPDK_BDEV_IO_TYPE_WRITE:
                return true;
        default:
                return false;
        }
}

static int
bdev_uring_create_cb(void *io_device, void *ctx_buf)
{
        struct bdev_uring_io_channel *ch = ctx_buf;

        ch->group_ch = spdk_io_channel_get_ctx(spdk_get_io_channel(&uring_if));

        return 0;
}

static void
bdev_uring_destroy_cb(void *io_device, void *ctx_buf)
{
        struct bdev_uring_io_channel *ch = ctx_buf;

        spdk_put_io_channel(spdk_io_channel_from_ctx(ch->group_ch));
}

static struct spdk_io_channel *
bdev_uring_get_io_channel(void *ctx)
{
        struct bdev_uring *uring = ctx;

        return spdk_get_io_channel(uring);
}

static int
bdev_uring_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
        struct bdev_uring *uring = ctx;

        spdk_json_write_named_object_begin(w, "uring");

        spdk_json_write_named_string(w, "filename", uring->filename);

        spdk_json_write_object_end(w);

        return 0;
}

static void
bdev_uring_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
        struct bdev_uring *uring = bdev->ctxt;
        char uuid_str[SPDK_UUID_STRING_LEN];

        spdk_json_write_object_begin(w);

        spdk_json_write_named_string(w, "method", "bdev_uring_create");

        spdk_json_write_named_object_begin(w, "params");
        spdk_json_write_named_string(w, "name", bdev->name);
        spdk_json_write_named_uint32(w, "block_size", bdev->blocklen);
        spdk_json_write_named_string(w, "filename", uring->filename);
        spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid);
        spdk_json_write_named_string(w, "uuid", uuid_str);
        spdk_json_write_object_end(w);

        spdk_json_write_object_end(w);
}

static const struct spdk_bdev_fn_table uring_fn_table = {
        .destruct = bdev_uring_destruct,
        .submit_request = bdev_uring_submit_request,
        .io_type_supported = bdev_uring_io_type_supported,
        .get_io_channel = bdev_uring_get_io_channel,
        .dump_info_json = bdev_uring_dump_info_json,
        .write_config_json = bdev_uring_write_json_config,
};

static void
uring_free_bdev(struct bdev_uring *uring)
{
        if (uring == NULL) {
                return;
        }
        free(uring->filename);
        free(uring->bdev.name);
        free(uring);
}

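/* Group (per-thread) channel setup: create one io_uring of depth
 * SPDK_URING_QUEUE_DEPTH that is shared by all uring bdevs on this thread, and
 * register the poller that drives submissions and completions. */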
static int
bdev_uring_group_create_cb(void *io_device, void *ctx_buf)
{
        struct bdev_uring_group_channel *ch = ctx_buf;

        /* Do not use IORING_SETUP_IOPOLL until the Linux kernel can support not only
         * local devices but also devices attached from a remote target */
        if (io_uring_queue_init(SPDK_URING_QUEUE_DEPTH, &ch->uring, 0) < 0) {
                SPDK_ERRLOG("uring I/O context setup failure\n");
                return -1;
        }

        ch->poller = SPDK_POLLER_REGISTER(bdev_uring_group_poll, ch, 0);
        return 0;
}

static void
bdev_uring_group_destroy_cb(void *io_device, void *ctx_buf)
{
        struct bdev_uring_group_channel *ch = ctx_buf;

        io_uring_queue_exit(&ch->uring);

        spdk_poller_unregister(&ch->poller);
}

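/* Construct and register a uring bdev: open the file, validate and apply the
 * block size, probe for zoned support, size the bdev, and register it with the
 * bdev layer. On any failure the partially built bdev is torn down. */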
struct spdk_bdev *
create_uring_bdev(const struct bdev_uring_opts *opts)
{
        struct bdev_uring *uring;
        uint32_t detected_block_size;
        uint64_t bdev_size;
        int rc;
        uint32_t block_size = opts->block_size;

        uring = calloc(1, sizeof(*uring));
        if (!uring) {
                SPDK_ERRLOG("Unable to allocate enough memory for uring backend\n");
                return NULL;
        }

        uring->filename = strdup(opts->filename);
        if (!uring->filename) {
                goto error_return;
        }

        if (bdev_uring_open(uring)) {
                SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", opts->filename, uring->fd, errno);
                goto error_return;
        }

        bdev_size = spdk_fd_get_size(uring->fd);

        uring->bdev.name = strdup(opts->name);
        if (!uring->bdev.name) {
                goto error_return;
        }
        uring->bdev.product_name = "URING bdev";
        uring->bdev.module = &uring_if;

        uring->bdev.write_cache = 0;

        detected_block_size = spdk_fd_get_blocklen(uring->fd);
        if (block_size == 0) {
                /* User did not specify block size - use autodetected block size. */
                if (detected_block_size == 0) {
                        SPDK_ERRLOG("Block size could not be auto-detected\n");
                        goto error_return;
                }
                block_size = detected_block_size;
        } else {
                if (block_size < detected_block_size) {
                        SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than "
                                    "auto-detected block size %" PRIu32 "\n",
                                    block_size, detected_block_size);
                        goto error_return;
                } else if (detected_block_size != 0 && block_size != detected_block_size) {
                        SPDK_WARNLOG("Specified block size %" PRIu32 " does not match "
                                     "auto-detected block size %" PRIu32 "\n",
                                     block_size, detected_block_size);
                }
        }

        if (block_size < 512) {
                SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size);
                goto error_return;
        }

        if (!spdk_u32_is_pow2(block_size)) {
                SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size);
                goto error_return;
        }

        uring->bdev.blocklen = block_size;
        uring->bdev.required_alignment = spdk_u32log2(block_size);

        rc = bdev_uring_check_zoned_support(uring, opts->name, opts->filename);
        if (rc) {
                goto error_return;
        }

        if (bdev_size % uring->bdev.blocklen != 0) {
                SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n",
                            bdev_size, uring->bdev.blocklen);
                goto error_return;
        }

        uring->bdev.blockcnt = bdev_size / uring->bdev.blocklen;
        uring->bdev.ctxt = uring;

        uring->bdev.fn_table = &uring_fn_table;

        if (!spdk_mem_all_zero(&opts->uuid, sizeof(opts->uuid))) {
                spdk_uuid_copy(&uring->bdev.uuid, &opts->uuid);
        }

        spdk_io_device_register(uring, bdev_uring_create_cb, bdev_uring_destroy_cb,
                                sizeof(struct bdev_uring_io_channel),
                                uring->bdev.name);
        rc = spdk_bdev_register(&uring->bdev);
        if (rc) {
                spdk_io_device_unregister(uring, NULL);
                goto error_return;
        }

        TAILQ_INSERT_TAIL(&g_uring_bdev_head, uring, link);
        return &uring->bdev;

error_return:
        bdev_uring_close(uring);
        uring_free_bdev(uring);
        return NULL;
}

struct delete_uring_bdev_ctx {
        spdk_delete_uring_complete cb_fn;
        void *cb_arg;
};

static void
uring_bdev_unregister_cb(void *arg, int bdeverrno)
{
        struct delete_uring_bdev_ctx *ctx = arg;

        ctx->cb_fn(ctx->cb_arg, bdeverrno);
        free(ctx);
}

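/* Asynchronously unregister a uring bdev by name. The caller's completion
 * callback receives the unregister status (or -ENOMEM if the context could not
 * be allocated). */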
889 0 : delete_uring_bdev(const char *name, spdk_delete_uring_complete cb_fn, void *cb_arg)
890 : {
891 : struct delete_uring_bdev_ctx *ctx;
892 : int rc;
893 :
894 0 : ctx = calloc(1, sizeof(*ctx));
895 0 : if (ctx == NULL) {
896 0 : cb_fn(cb_arg, -ENOMEM);
897 0 : return;
898 : }
899 :
900 0 : ctx->cb_fn = cb_fn;
901 0 : ctx->cb_arg = cb_arg;
902 0 : rc = spdk_bdev_unregister_by_name(name, &uring_if, uring_bdev_unregister_cb, ctx);
903 0 : if (rc != 0) {
904 0 : uring_bdev_unregister_cb(ctx, rc);
905 : }
906 : }
907 :
908 : static int
909 0 : bdev_uring_init(void)
910 : {
911 0 : spdk_io_device_register(&uring_if, bdev_uring_group_create_cb, bdev_uring_group_destroy_cb,
912 : sizeof(struct bdev_uring_group_channel), "uring_module");
913 :
914 0 : return 0;
915 : }
916 :
917 : static void
918 0 : bdev_uring_fini(void)
919 : {
920 0 : spdk_io_device_unregister(&uring_if, NULL);
921 0 : }
922 :
923 0 : SPDK_LOG_REGISTER_COMPONENT(uring)