Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2022 Intel Corporation.
3 : * All rights reserved.
4 : */
5 :
6 : #include <liburing.h>
7 :
8 : #include "spdk/stdinc.h"
9 : #include "spdk/string.h"
10 : #include "spdk/bdev.h"
11 : #include "spdk/endian.h"
12 : #include "spdk/env.h"
13 : #include "spdk/likely.h"
14 : #include "spdk/log.h"
15 : #include "spdk/util.h"
16 : #include "spdk/queue.h"
17 : #include "spdk/json.h"
18 : #include "spdk/ublk.h"
19 : #include "spdk/thread.h"
20 : #include "spdk/file.h"
21 :
22 : #include "ublk_internal.h"
23 :
/* Paths of the kernel ublk driver's control device and per-device char devices */
#define UBLK_CTRL_DEV "/dev/ublk-control"
#define UBLK_BLK_CDEV "/dev/ublkc"

#define LINUX_SECTOR_SHIFT 9
/* Upper bound on a single ublk I/O payload; matches the bdev layer's large buffer size */
#define UBLK_IO_MAX_BYTES SPDK_BDEV_LARGE_BUF_MAX_SIZE
#define UBLK_DEV_MAX_QUEUES 32
#define UBLK_DEV_MAX_QUEUE_DEPTH 1024
#define UBLK_QUEUE_REQUEST 32
/* Budget for waiting on outstanding ctrl commands while stopping a device */
#define UBLK_STOP_BUSY_WAITING_MS 10000
#define UBLK_BUSY_POLLING_INTERVAL_US 20000
#define UBLK_DEFAULT_CTRL_URING_POLLING_INTERVAL_US 1000
/* By default, kernel ublk_drv driver can support up to 64 block devices */
#define UBLK_DEFAULT_MAX_SUPPORTED_DEVS 64

/* Per-poll-group iobuf channel cache sizes */
#define UBLK_IOBUF_SMALL_CACHE_SIZE 128
#define UBLK_IOBUF_LARGE_CACHE_SIZE 32

/* Debug log helper that prefixes messages with the ublk device id */
#define UBLK_DEBUGLOG(ublk, format, ...) \
	SPDK_DEBUGLOG(ublk, "ublk%d: " format, ublk->ublk_id, ##__VA_ARGS__);

/* Number of poll groups actually created (one per core in g_core_mask) */
static uint32_t g_num_ublk_poll_groups = 0;
/* Round-robin cursor for assigning queues to poll groups */
static uint32_t g_next_ublk_poll_group = 0;
/* Max devices; may be raised from /sys/module/ublk_drv/parameters/ublks_max */
static uint32_t g_ublks_max = UBLK_DEFAULT_MAX_SUPPORTED_DEVS;
/* CPU set on which ublk poll-group threads are allowed to run */
static struct spdk_cpuset g_core_mask;
/* RPC option to force-disable user-copy even when the kernel supports it */
static bool g_disable_user_copy = false;
49 :
struct ublk_queue;
struct ublk_poll_group;
struct ublk_io;
/* Forward declarations for routines referenced before their definitions */
static void _ublk_submit_bdev_io(struct ublk_queue *q, struct ublk_io *io);
static void ublk_dev_queue_fini(struct ublk_queue *q);
static int ublk_poll(void *arg);

static int ublk_set_params(struct spdk_ublk_dev *ublk);
static int ublk_start_dev(struct spdk_ublk_dev *ublk, bool is_recovering);
static void ublk_free_dev(struct spdk_ublk_dev *ublk);
static void ublk_delete_dev(void *arg);
static int ublk_close_dev(struct spdk_ublk_dev *ublk);
static int ublk_ctrl_start_recovery(struct spdk_ublk_dev *ublk);

static int ublk_ctrl_cmd_submit(struct spdk_ublk_dev *ublk, uint32_t cmd_op);

/* Human-readable names for ctrl command opcodes, indexed by opcode value */
static const char *ublk_op_name[64] = {
	[UBLK_CMD_GET_DEV_INFO] = "UBLK_CMD_GET_DEV_INFO",
	[UBLK_CMD_ADD_DEV] = "UBLK_CMD_ADD_DEV",
	[UBLK_CMD_DEL_DEV] = "UBLK_CMD_DEL_DEV",
	[UBLK_CMD_START_DEV] = "UBLK_CMD_START_DEV",
	[UBLK_CMD_STOP_DEV] = "UBLK_CMD_STOP_DEV",
	[UBLK_CMD_SET_PARAMS] = "UBLK_CMD_SET_PARAMS",
	[UBLK_CMD_START_USER_RECOVERY] = "UBLK_CMD_START_USER_RECOVERY",
	[UBLK_CMD_END_USER_RECOVERY] = "UBLK_CMD_END_USER_RECOVERY",
};
76 :
/* Callback invoked once a data buffer has been obtained for an io */
typedef void (*ublk_get_buf_cb)(struct ublk_io *io);

/* Per-request state, one per (queue, tag) slot */
struct ublk_io {
	void			*payload;	/* data buffer for this request */
	void			*mpool_entry;	/* iobuf pool entry backing payload */
	bool			need_data;	/* write needs a NEED_GET_DATA round trip */
	bool			user_copy;	/* data moved via the user-copy pread/pwrite path */
	uint16_t		tag;		/* slot index within the queue */
	uint64_t		payload_size;
	uint32_t		cmd_op;		/* next uring cmd to issue (FETCH / COMMIT_AND_FETCH / ...) */
	int32_t			result;		/* completion status reported back to the kernel */
	struct spdk_bdev_desc	*bdev_desc;
	struct spdk_io_channel	*bdev_ch;
	const struct ublksrv_io_desc	*iod;	/* kernel-filled request descriptor */
	ublk_get_buf_cb		get_buf_cb;
	struct ublk_queue	*q;		/* owning queue */
	/* for bdev io_wait */
	struct spdk_bdev_io_wait_entry bdev_io_wait;
	struct spdk_iobuf_entry	iobuf;

	TAILQ_ENTRY(ublk_io)	tailq;
};
99 :
/* One hardware queue of a ublk device; serviced by exactly one poll group */
struct ublk_queue {
	uint32_t		q_id;		/* queue index within the device */
	uint32_t		q_depth;
	struct ublk_io		*ios;		/* q_depth request slots */
	TAILQ_HEAD(, ublk_io)	completed_io_list;	/* done, waiting to commit results */
	TAILQ_HEAD(, ublk_io)	inflight_io_list;	/* submitted to the bdev layer */
	uint32_t		cmd_inflight;	/* uring cmds outstanding in the kernel */
	bool			is_stopping;
	struct ublksrv_io_desc	*io_cmd_buf;	/* shared request-descriptor array */
	/* ring depth == dev_info->queue_depth. */
	struct io_uring		ring;
	struct spdk_ublk_dev	*dev;		/* owning device */
	struct ublk_poll_group	*poll_group;
	struct spdk_io_channel	*bdev_ch;

	TAILQ_ENTRY(ublk_queue)	tailq;
};
117 :
/* One exported ublk block device backed by an SPDK bdev */
struct spdk_ublk_dev {
	struct spdk_bdev	*bdev;
	struct spdk_bdev_desc	*bdev_desc;

	int			cdev_fd;	/* fd of /dev/ublkcN char device, -1 when closed */
	struct ublk_params	dev_params;	/* parameters pushed via UBLK_CMD_SET_PARAMS */
	struct ublksrv_ctrl_dev_info	dev_info;	/* mirror of kernel device info */

	uint32_t		ublk_id;	/* N in /dev/ublkbN */
	uint32_t		num_queues;
	uint32_t		queue_depth;
	uint32_t		online_num_queues;	/* queues successfully started */
	uint32_t		sector_per_block_shift;	/* bdev block size in 512B sectors, log2 */
	struct ublk_queue	queues[UBLK_DEV_MAX_QUEUES];

	struct spdk_poller	*retry_poller;
	int			retry_count;
	uint32_t		queues_closed;	/* counts queue teardown completions */
	ublk_ctrl_cb		ctrl_cb;	/* RPC completion callback, cleared after one call */
	void			*cb_arg;
	uint32_t		current_cmd_op;	/* ctrl cmd currently in flight */
	uint32_t		ctrl_ops_in_progress;
	bool			is_closing;
	bool			is_recovering;

	TAILQ_ENTRY(spdk_ublk_dev) tailq;
	TAILQ_ENTRY(spdk_ublk_dev) wait_tailq;
};
146 :
/* Per-core worker: a dedicated SPDK thread polling a set of ublk queues */
struct ublk_poll_group {
	struct spdk_thread		*ublk_thread;	/* bound to one CPU core */
	struct spdk_poller		*ublk_poller;
	struct spdk_iobuf_channel	iobuf_ch;	/* buffer source for io payloads */
	TAILQ_HEAD(, ublk_queue)	queue_list;	/* queues serviced by this group */
};
153 :
/* Global ublk target state (singleton g_ublk_tgt) */
struct ublk_tgt {
	int			ctrl_fd;	/* fd of /dev/ublk-control, -1 when closed */
	bool			active;		/* target created and usable */
	bool			is_destroying;
	spdk_ublk_fini_cb	cb_fn;		/* shutdown-complete callback */
	void			*cb_arg;
	struct io_uring		ctrl_ring;	/* uring used for all ctrl commands */
	struct spdk_poller	*ctrl_poller;	/* reaps ctrl_ring completions */
	uint32_t		ctrl_ops_in_progress;
	struct ublk_poll_group	*poll_groups;
	uint32_t		num_ublk_devs;
	uint64_t		features;	/* UBLK_F_* feature bits reported by the kernel */
	/* `ublk_drv` supports UBLK_F_CMD_IOCTL_ENCODE */
	bool			ioctl_encode;
	/* `ublk_drv` supports UBLK_F_USER_COPY */
	bool			user_copy;
	/* `ublk_drv` supports UBLK_F_USER_RECOVERY */
	bool			user_recovery;
};

/* All registered ublk devices and the target singleton */
static TAILQ_HEAD(, spdk_ublk_dev) g_ublk_devs = TAILQ_HEAD_INITIALIZER(g_ublk_devs);
static struct ublk_tgt g_ublk_tgt;
176 :
177 : /* helpers for using io_uring */
178 : static inline int
179 0 : ublk_setup_ring(uint32_t depth, struct io_uring *r, unsigned flags)
180 : {
181 0 : struct io_uring_params p = {};
182 :
183 0 : p.flags = flags | IORING_SETUP_CQSIZE;
184 0 : p.cq_entries = depth;
185 :
186 0 : return io_uring_queue_init_params(depth, r, &p);
187 : }
188 :
189 : static inline struct io_uring_sqe *
190 0 : ublk_uring_get_sqe(struct io_uring *r, uint32_t idx)
191 : {
192 : /* Need to update the idx since we set IORING_SETUP_SQE128 parameter in ublk_setup_ring */
193 0 : return &r->sq.sqes[idx << 1];
194 : }
195 :
/* Return the 80-byte inline command area of a 128-byte SQE; with
 * IORING_SETUP_SQE128 the uring-cmd payload starts at the addr3 field. */
static inline void *
ublk_get_sqe_cmd(struct io_uring_sqe *sqe)
{
	return (void *)&sqe->addr3;
}
201 :
/* Store the command opcode into the SQE. Newer kernels expect the opcode in
 * _IO* ioctl encoding (UBLK_F_CMD_IOCTL_ENCODE); older ones take the raw
 * UBLK_CMD_*/UBLK_IO_* value. The opcode is carried in sqe->off for
 * IORING_OP_URING_CMD. */
static inline void
ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, uint32_t cmd_op)
{
	uint32_t opc = cmd_op;

	if (g_ublk_tgt.ioctl_encode) {
		switch (cmd_op) {
		/* ctrl uring */
		case UBLK_CMD_GET_DEV_INFO:
			opc = _IOR('u', UBLK_CMD_GET_DEV_INFO, struct ublksrv_ctrl_cmd);
			break;
		case UBLK_CMD_ADD_DEV:
			opc = _IOWR('u', UBLK_CMD_ADD_DEV, struct ublksrv_ctrl_cmd);
			break;
		case UBLK_CMD_DEL_DEV:
			opc = _IOWR('u', UBLK_CMD_DEL_DEV, struct ublksrv_ctrl_cmd);
			break;
		case UBLK_CMD_START_DEV:
			opc = _IOWR('u', UBLK_CMD_START_DEV, struct ublksrv_ctrl_cmd);
			break;
		case UBLK_CMD_STOP_DEV:
			opc = _IOWR('u', UBLK_CMD_STOP_DEV, struct ublksrv_ctrl_cmd);
			break;
		case UBLK_CMD_SET_PARAMS:
			opc = _IOWR('u', UBLK_CMD_SET_PARAMS, struct ublksrv_ctrl_cmd);
			break;
		case UBLK_CMD_START_USER_RECOVERY:
			opc = _IOWR('u', UBLK_CMD_START_USER_RECOVERY, struct ublksrv_ctrl_cmd);
			break;
		case UBLK_CMD_END_USER_RECOVERY:
			opc = _IOWR('u', UBLK_CMD_END_USER_RECOVERY, struct ublksrv_ctrl_cmd);
			break;

		/* io uring */
		case UBLK_IO_FETCH_REQ:
			opc = _IOWR('u', UBLK_IO_FETCH_REQ, struct ublksrv_io_cmd);
			break;
		case UBLK_IO_COMMIT_AND_FETCH_REQ:
			opc = _IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd);
			break;
		case UBLK_IO_NEED_GET_DATA:
			opc = _IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd);
			break;
		default:
			/* unknown op: fall through with the raw value */
			break;
		}
	}

	sqe->off = opc;
}
252 :
static inline uint64_t
build_user_data(uint16_t tag, uint8_t op)
{
	/* Pack the request tag into bits 0-15 and the command opcode into
	 * bits 16-23 of the uring user_data word. */
	assert(!(tag >> 16) && !(op >> 8));

	return (uint64_t)tag | ((uint64_t)op << 16);
}
260 :
static inline uint16_t
user_data_to_tag(uint64_t user_data)
{
	/* The tag occupies the low 16 bits; see build_user_data(). */
	return (uint16_t)user_data;
}
266 :
static inline uint8_t
user_data_to_op(uint64_t user_data)
{
	/* The opcode occupies bits 16-23; see build_user_data(). */
	return (uint8_t)(user_data >> 16);
}
272 :
273 : static inline uint64_t
274 0 : ublk_user_copy_pos(uint16_t q_id, uint16_t tag)
275 : {
276 0 : return (uint64_t)UBLKSRV_IO_BUF_OFFSET + ((((uint64_t)q_id) << UBLK_QID_OFF) | (((
277 0 : uint64_t)tag) << UBLK_TAG_OFF));
278 : }
279 :
/* One-time subsystem init: mark the control fd and ctrl ring as not yet
 * opened so cleanup paths can safely test them. Must run on the app thread. */
void
spdk_ublk_init(void)
{
	assert(spdk_thread_is_app_thread(NULL));

	g_ublk_tgt.ctrl_fd = -1;
	g_ublk_tgt.ctrl_ring.ring_fd = -1;
}
288 :
/* Handle a failed ctrl command completion: report the error to the pending
 * RPC callback (if any), then unwind the device to a state appropriate for
 * the command that failed. */
static void
ublk_ctrl_cmd_error(struct spdk_ublk_dev *ublk, int32_t res)
{
	assert(res != 0);

	SPDK_ERRLOG("ctrlr cmd %s failed, %s\n", ublk_op_name[ublk->current_cmd_op], spdk_strerror(-res));
	if (ublk->ctrl_cb) {
		ublk->ctrl_cb(ublk->cb_arg, res);
		ublk->ctrl_cb = NULL;
	}

	switch (ublk->current_cmd_op) {
	case UBLK_CMD_ADD_DEV:
	case UBLK_CMD_SET_PARAMS:
	case UBLK_CMD_START_USER_RECOVERY:
	case UBLK_CMD_END_USER_RECOVERY:
		/* device exists in the kernel but never started: delete it */
		ublk_delete_dev(ublk);
		break;
	case UBLK_CMD_START_DEV:
		/* start failed: run the full stop/close sequence */
		ublk_close_dev(ublk);
		break;
	case UBLK_CMD_GET_DEV_INFO:
		/* recovery probe failed: only local state exists, free it */
		ublk_free_dev(ublk);
		break;
	case UBLK_CMD_STOP_DEV:
	case UBLK_CMD_DEL_DEV:
		/* teardown already in progress; nothing further to unwind */
		break;
	default:
		SPDK_ERRLOG("No match cmd operation,cmd_op = %d\n", ublk->current_cmd_op);
		break;
	}
}
321 :
/* One-shot poller callback that re-issues GET_DEV_INFO while waiting for a
 * recovering device to reach the quiesced state; registered by
 * ublk_ctrl_process_cqe() and unregistered on entry so it fires only once. */
static int
_ublk_get_device_state_retry(void *arg)
{
	struct spdk_ublk_dev *ublk = arg;
	int rc;

	spdk_poller_unregister(&ublk->retry_poller);

	rc = ublk_ctrl_cmd_submit(ublk, UBLK_CMD_GET_DEV_INFO);
	if (rc < 0) {
		/* could not even submit: give up on recovery and report the error */
		ublk_delete_dev(ublk);
		if (ublk->ctrl_cb) {
			ublk->ctrl_cb(ublk->cb_arg, rc);
			ublk->ctrl_cb = NULL;
		}
	}

	return SPDK_POLLER_BUSY;
}
341 :
/* Advance the per-device ctrl-command state machine on each ctrl-ring
 * completion. Successful commands chain into the next step of device
 * creation (ADD -> SET_PARAMS -> START) or recovery (GET_DEV_INFO ->
 * START_USER_RECOVERY -> start queues -> END_USER_RECOVERY); terminal steps
 * fire the pending RPC callback via cb_done. */
static void
ublk_ctrl_process_cqe(struct io_uring_cqe *cqe)
{
	struct spdk_ublk_dev *ublk;
	int rc = 0;

	/* user_data carries the device pointer set at submit time */
	ublk = (struct spdk_ublk_dev *)cqe->user_data;
	UBLK_DEBUGLOG(ublk, "ctrl cmd %s completed\n", ublk_op_name[ublk->current_cmd_op]);
	ublk->ctrl_ops_in_progress--;

	if (spdk_unlikely(cqe->res != 0)) {
		ublk_ctrl_cmd_error(ublk, cqe->res);
		return;
	}

	switch (ublk->current_cmd_op) {
	case UBLK_CMD_ADD_DEV:
		/* device registered; push its parameters next */
		rc = ublk_set_params(ublk);
		if (rc < 0) {
			ublk_delete_dev(ublk);
			goto cb_done;
		}
		break;
	case UBLK_CMD_SET_PARAMS:
		/* parameters accepted; start queues and the device */
		rc = ublk_start_dev(ublk, false);
		if (rc < 0) {
			ublk_delete_dev(ublk);
			goto cb_done;
		}
		break;
	case UBLK_CMD_START_DEV:
		goto cb_done;
		break;
	case UBLK_CMD_STOP_DEV:
		/* queues drain asynchronously; DEL_DEV follows later */
		break;
	case UBLK_CMD_DEL_DEV:
		if (ublk->ctrl_cb) {
			ublk->ctrl_cb(ublk->cb_arg, 0);
			ublk->ctrl_cb = NULL;
		}
		ublk_free_dev(ublk);
		break;
	case UBLK_CMD_GET_DEV_INFO:
		if (ublk->ublk_id != ublk->dev_info.dev_id) {
			SPDK_ERRLOG("Invalid ublk ID\n");
			rc = -EINVAL;
			goto cb_done;
		}

		UBLK_DEBUGLOG(ublk, "Ublk %u device state %u\n", ublk->ublk_id, ublk->dev_info.state);
		/* kernel ublk_drv driver returns -EBUSY if device state isn't UBLK_S_DEV_QUIESCED */
		if ((ublk->dev_info.state != UBLK_S_DEV_QUIESCED) && (ublk->retry_count < 3)) {
			/* poll again in 1s, up to 3 attempts */
			ublk->retry_count++;
			ublk->retry_poller = SPDK_POLLER_REGISTER(_ublk_get_device_state_retry, ublk, 1000000);
			return;
		}

		rc = ublk_ctrl_start_recovery(ublk);
		if (rc < 0) {
			ublk_delete_dev(ublk);
			goto cb_done;
		}
		break;
	case UBLK_CMD_START_USER_RECOVERY:
		/* kernel accepted recovery; restart queues in recovery mode */
		rc = ublk_start_dev(ublk, true);
		if (rc < 0) {
			ublk_delete_dev(ublk);
			goto cb_done;
		}
		break;
	case UBLK_CMD_END_USER_RECOVERY:
		SPDK_NOTICELOG("Ublk %u recover done successfully\n", ublk->ublk_id);
		ublk->is_recovering = false;
		goto cb_done;
		break;
	default:
		SPDK_ERRLOG("No match cmd operation,cmd_op = %d\n", ublk->current_cmd_op);
		break;
	}

	return;

cb_done:
	/* terminal step (or error): fire the one-shot RPC callback */
	if (ublk->ctrl_cb) {
		ublk->ctrl_cb(ublk->cb_arg, rc);
		ublk->ctrl_cb = NULL;
	}
}
430 :
/* App-thread poller that reaps up to 8 completions per invocation from the
 * shared ctrl ring and dispatches each to ublk_ctrl_process_cqe(). */
static int
ublk_ctrl_poller(void *arg)
{
	struct io_uring *ring = &g_ublk_tgt.ctrl_ring;
	struct io_uring_cqe *cqe;
	const int max = 8;	/* completion budget per poll to bound latency */
	int i, count = 0, rc;

	/* fast path: nothing outstanding, skip the ring peek entirely */
	if (!g_ublk_tgt.ctrl_ops_in_progress) {
		return SPDK_POLLER_IDLE;
	}

	for (i = 0; i < max; i++) {
		rc = io_uring_peek_cqe(ring, &cqe);
		if (rc == -EAGAIN) {
			break;
		}

		assert(cqe != NULL);
		g_ublk_tgt.ctrl_ops_in_progress--;

		ublk_ctrl_process_cqe(cqe);

		io_uring_cqe_seen(ring, cqe);
		count++;
	}

	return count > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
}
460 :
/* Build and submit one ctrl command for @ublk on the shared ctrl ring.
 * Records cmd_op in ublk->current_cmd_op so the completion handler knows
 * which state-machine step finished. Returns 0 on successful submission,
 * negative errno otherwise; the actual command result arrives via
 * ublk_ctrl_poller(). */
static int
ublk_ctrl_cmd_submit(struct spdk_ublk_dev *ublk, uint32_t cmd_op)
{
	uint32_t dev_id = ublk->ublk_id;
	int rc = -EINVAL;
	struct io_uring_sqe *sqe;
	struct ublksrv_ctrl_cmd *cmd;

	UBLK_DEBUGLOG(ublk, "ctrl cmd %s\n", ublk_op_name[cmd_op]);

	sqe = io_uring_get_sqe(&g_ublk_tgt.ctrl_ring);
	if (!sqe) {
		/* ring sized at g_ublks_max * 2 entries; exhaustion indicates a bug */
		SPDK_ERRLOG("No available sqe in ctrl ring\n");
		assert(false);
		return -ENOENT;
	}

	/* the ublksrv_ctrl_cmd payload lives inline in the 128-byte SQE */
	cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe);
	sqe->fd = g_ublk_tgt.ctrl_fd;
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->ioprio = 0;
	cmd->dev_id = dev_id;
	cmd->queue_id = -1;	/* ctrl commands are device-wide, not per-queue */
	ublk->current_cmd_op = cmd_op;

	switch (cmd_op) {
	case UBLK_CMD_ADD_DEV:
	case UBLK_CMD_GET_DEV_INFO:
		cmd->addr = (__u64)(uintptr_t)&ublk->dev_info;
		cmd->len = sizeof(ublk->dev_info);
		break;
	case UBLK_CMD_SET_PARAMS:
		cmd->addr = (__u64)(uintptr_t)&ublk->dev_params;
		cmd->len = sizeof(ublk->dev_params);
		break;
	case UBLK_CMD_START_DEV:
		/* kernel records the daemon pid serving this device */
		cmd->data[0] = getpid();
		break;
	case UBLK_CMD_STOP_DEV:
		break;
	case UBLK_CMD_DEL_DEV:
		break;
	case UBLK_CMD_START_USER_RECOVERY:
		break;
	case UBLK_CMD_END_USER_RECOVERY:
		/* kernel records the new daemon pid after recovery */
		cmd->data[0] = getpid();
		break;
	default:
		SPDK_ERRLOG("No match cmd operation,cmd_op = %d\n", cmd_op);
		return -EINVAL;
	}
	ublk_set_sqe_cmd_op(sqe, cmd_op);
	io_uring_sqe_set_data(sqe, ublk);

	rc = io_uring_submit(&g_ublk_tgt.ctrl_ring);
	if (rc < 0) {
		SPDK_ERRLOG("uring submit rc %d\n", rc);
		assert(false);
		return rc;
	}
	g_ublk_tgt.ctrl_ops_in_progress++;
	ublk->ctrl_ops_in_progress++;

	return 0;
}
526 :
/* Synchronously query UBLK_U_CMD_GET_FEATURES from the kernel driver and
 * cache the ioctl-encode / user-copy / user-recovery capability flags.
 * Called once at target creation, before the ctrl poller exists, hence the
 * blocking io_uring_wait_cqe(). */
static int
ublk_ctrl_cmd_get_features(void)
{
	int rc;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct ublksrv_ctrl_cmd *cmd;
	uint32_t cmd_op;

	sqe = io_uring_get_sqe(&g_ublk_tgt.ctrl_ring);
	if (!sqe) {
		SPDK_ERRLOG("No available sqe in ctrl ring\n");
		assert(false);
		return -ENOENT;
	}

	cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe);
	sqe->fd = g_ublk_tgt.ctrl_fd;
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->ioprio = 0;
	cmd->dev_id = -1;	/* feature query is target-wide, no device */
	cmd->queue_id = -1;
	cmd->addr = (__u64)(uintptr_t)&g_ublk_tgt.features;
	cmd->len = sizeof(g_ublk_tgt.features);

	cmd_op = UBLK_U_CMD_GET_FEATURES;
	ublk_set_sqe_cmd_op(sqe, cmd_op);

	rc = io_uring_submit(&g_ublk_tgt.ctrl_ring);
	if (rc < 0) {
		SPDK_ERRLOG("uring submit rc %d\n", rc);
		return rc;
	}

	rc = io_uring_wait_cqe(&g_ublk_tgt.ctrl_ring, &cqe);
	if (rc < 0) {
		SPDK_ERRLOG("wait cqe rc %d\n", rc);
		return rc;
	}

	/* NOTE(review): a non-zero cqe->res is deliberately not treated as
	 * fatal — presumably older kernels lack GET_FEATURES and all feature
	 * flags then stay false; confirm against ublk_drv behavior. */
	if (cqe->res == 0) {
		g_ublk_tgt.ioctl_encode = !!(g_ublk_tgt.features & UBLK_F_CMD_IOCTL_ENCODE);
		g_ublk_tgt.user_copy = !!(g_ublk_tgt.features & UBLK_F_USER_COPY);
		g_ublk_tgt.user_copy &= !g_disable_user_copy;
		g_ublk_tgt.user_recovery = !!(g_ublk_tgt.features & UBLK_F_USER_RECOVERY);
		SPDK_NOTICELOG("User Copy %s\n", g_ublk_tgt.user_copy ? "enabled" : "disabled");
	}
	io_uring_cqe_seen(&g_ublk_tgt.ctrl_ring, cqe);

	return 0;
}
578 :
579 : static int
580 0 : ublk_queue_cmd_buf_sz(uint32_t q_depth)
581 : {
582 0 : uint32_t size = q_depth * sizeof(struct ublksrv_io_desc);
583 0 : uint32_t page_sz = getpagesize();
584 :
585 : /* round up size */
586 0 : return (size + page_sz - 1) & ~(page_sz - 1);
587 : }
588 :
/* Open /dev/ublk-control, read the driver's ublks_max limit, create the
 * shared ctrl ring, and probe driver features. Returns 0 on success or a
 * negative errno; on error the control fd is closed again. */
static int
ublk_open(void)
{
	uint32_t ublks_max;
	int rc;

	g_ublk_tgt.ctrl_fd = open(UBLK_CTRL_DEV, O_RDWR);
	if (g_ublk_tgt.ctrl_fd < 0) {
		rc = errno;
		SPDK_ERRLOG("UBLK control dev %s can't be opened, error=%s\n", UBLK_CTRL_DEV, spdk_strerror(errno));
		return -rc;
	}

	/* honor the module parameter if readable; otherwise keep the default */
	rc = spdk_read_sysfs_attribute_uint32(&ublks_max, "%s",
					      "/sys/module/ublk_drv/parameters/ublks_max");
	if (rc == 0 && ublks_max > 0) {
		g_ublks_max = ublks_max;
	}

	/* We need to set SQPOLL for kernels 6.1 and earlier, since they would not defer ublk ctrl
	 * ring processing to a workqueue. Ctrl ring processing is minimal, so SQPOLL is fine.
	 * All the commands sent via control uring for a ublk device is executed one by one, so use
	 * ublks_max * 2 as the number of uring entries is enough.
	 */
	rc = ublk_setup_ring(g_ublks_max * 2, &g_ublk_tgt.ctrl_ring,
			     IORING_SETUP_SQE128 | IORING_SETUP_SQPOLL);
	if (rc < 0) {
		SPDK_ERRLOG("UBLK ctrl queue_init: %s\n", spdk_strerror(-rc));
		goto err;
	}

	rc = ublk_ctrl_cmd_get_features();
	if (rc) {
		goto err;
	}

	return 0;

err:
	close(g_ublk_tgt.ctrl_fd);
	g_ublk_tgt.ctrl_fd = -1;
	return rc;
}
632 :
/* Parse and validate the RPC cpumask into g_core_mask. A NULL mask means
 * "all cores available to the app". Every selected CPU must be a subset of
 * the app's own core mask. Returns 0 or -EINVAL. */
static int
ublk_parse_core_mask(const char *mask)
{
	struct spdk_cpuset tmp_mask;
	int rc;

	if (mask == NULL) {
		spdk_env_get_cpuset(&g_core_mask);
		return 0;
	}

	rc = spdk_cpuset_parse(&g_core_mask, mask);
	if (rc < 0) {
		SPDK_ERRLOG("invalid cpumask %s\n", mask);
		return -EINVAL;
	}

	if (spdk_cpuset_count(&g_core_mask) == 0) {
		SPDK_ERRLOG("no cpus specified\n");
		return -EINVAL;
	}

	/* the requested set must survive intersection with the app's cores */
	spdk_env_get_cpuset(&tmp_mask);
	spdk_cpuset_and(&tmp_mask, &g_core_mask);

	if (!spdk_cpuset_equal(&tmp_mask, &g_core_mask)) {
		SPDK_ERRLOG("one of selected cpu is outside of core mask(=%s)\n",
			    spdk_cpuset_fmt(&g_core_mask));
		return -EINVAL;
	}

	return 0;
}
666 :
/* Runs on each newly created poll-group thread: pin the thread to its core,
 * start the queue poller, and set up the group's iobuf channel. */
static void
ublk_poller_register(void *args)
{
	struct ublk_poll_group *poll_group = args;
	int rc;

	assert(spdk_get_thread() == poll_group->ublk_thread);
	/* Bind ublk spdk_thread to current CPU core in order to avoid thread context switch
	 * during uring processing as required by ublk kernel.
	 */
	spdk_thread_bind(spdk_get_thread(), true);

	TAILQ_INIT(&poll_group->queue_list);
	poll_group->ublk_poller = SPDK_POLLER_REGISTER(ublk_poll, poll_group, 0);
	rc = spdk_iobuf_channel_init(&poll_group->iobuf_ch, "ublk",
				     UBLK_IOBUF_SMALL_CACHE_SIZE, UBLK_IOBUF_LARGE_CACHE_SIZE);
	if (rc != 0) {
		/* no error propagation path from a thread message; treat as fatal */
		assert(false);
	}
}
687 :
/* Optional parameters accepted by the ublk_create_target RPC */
struct rpc_create_target {
	bool disable_user_copy;
};

/* JSON decoder table; disable_user_copy is optional (last field 'true') */
static const struct spdk_json_object_decoder rpc_ublk_create_target[] = {
	{"disable_user_copy", offsetof(struct rpc_create_target, disable_user_copy), spdk_json_decode_bool, true},
};
695 :
/* Create the ublk target: validate the cpumask, open the kernel control
 * device, and spawn one bound poll-group thread per selected core. Must run
 * on the app thread. Returns 0, -EBUSY if already created, -EINVAL on bad
 * parameters, -ENOMEM, or the error from ublk_open(). */
int
ublk_create_target(const char *cpumask_str, const struct spdk_json_val *params)
{
	int rc;
	uint32_t i;
	char thread_name[32];
	struct rpc_create_target req = {};
	struct ublk_poll_group *poll_group;

	if (g_ublk_tgt.active == true) {
		SPDK_ERRLOG("UBLK target has been created\n");
		return -EBUSY;
	}

	rc = ublk_parse_core_mask(cpumask_str);
	if (rc != 0) {
		return rc;
	}

	if (params) {
		if (spdk_json_decode_object_relaxed(params, rpc_ublk_create_target,
						    SPDK_COUNTOF(rpc_ublk_create_target),
						    &req)) {
			SPDK_ERRLOG("spdk_json_decode_object failed\n");
			return -EINVAL;
		}
		g_disable_user_copy = req.disable_user_copy;
	}

	assert(g_ublk_tgt.poll_groups == NULL);
	/* allocate for all cores; only the masked subset is initialized below */
	g_ublk_tgt.poll_groups = calloc(spdk_env_get_core_count(), sizeof(*poll_group));
	if (!g_ublk_tgt.poll_groups) {
		return -ENOMEM;
	}

	rc = ublk_open();
	if (rc != 0) {
		SPDK_ERRLOG("Fail to open UBLK, error=%s\n", spdk_strerror(-rc));
		free(g_ublk_tgt.poll_groups);
		g_ublk_tgt.poll_groups = NULL;
		return rc;
	}

	spdk_iobuf_register_module("ublk");

	SPDK_ENV_FOREACH_CORE(i) {
		if (!spdk_cpuset_get_cpu(&g_core_mask, i)) {
			continue;
		}
		snprintf(thread_name, sizeof(thread_name), "ublk_thread%u", i);
		poll_group = &g_ublk_tgt.poll_groups[g_num_ublk_poll_groups];
		poll_group->ublk_thread = spdk_thread_create(thread_name, &g_core_mask);
		/* finish per-thread setup (pinning, poller, iobuf) on the thread itself */
		spdk_thread_send_msg(poll_group->ublk_thread, ublk_poller_register, poll_group);
		g_num_ublk_poll_groups++;
	}

	assert(spdk_thread_is_app_thread(NULL));
	g_ublk_tgt.active = true;
	g_ublk_tgt.ctrl_ops_in_progress = 0;
	g_ublk_tgt.ctrl_poller = SPDK_POLLER_REGISTER(ublk_ctrl_poller, NULL,
				 UBLK_DEFAULT_CTRL_URING_POLLING_INTERVAL_US);

	SPDK_NOTICELOG("UBLK target created successfully\n");

	return 0;
}
762 :
763 : static void
764 0 : _ublk_fini_done(void *args)
765 : {
766 0 : SPDK_DEBUGLOG(ublk, "\n");
767 :
768 0 : g_num_ublk_poll_groups = 0;
769 0 : g_next_ublk_poll_group = 0;
770 0 : g_ublk_tgt.is_destroying = false;
771 0 : g_ublk_tgt.active = false;
772 0 : g_ublk_tgt.features = 0;
773 0 : g_ublk_tgt.ioctl_encode = false;
774 0 : g_ublk_tgt.user_copy = false;
775 0 : g_ublk_tgt.user_recovery = false;
776 :
777 0 : if (g_ublk_tgt.cb_fn) {
778 0 : g_ublk_tgt.cb_fn(g_ublk_tgt.cb_arg);
779 0 : g_ublk_tgt.cb_fn = NULL;
780 0 : g_ublk_tgt.cb_arg = NULL;
781 : }
782 :
783 0 : if (g_ublk_tgt.poll_groups) {
784 0 : free(g_ublk_tgt.poll_groups);
785 0 : g_ublk_tgt.poll_groups = NULL;
786 : }
787 :
788 0 : }
789 :
/* Sent to every poll-group thread during shutdown: find this thread's own
 * group, tear down its poller and iobuf channel, unbind it from its core,
 * and exit the thread. Each thread matches at most one group entry. */
static void
ublk_thread_exit(void *args)
{
	struct spdk_thread *ublk_thread = spdk_get_thread();
	uint32_t i;

	for (i = 0; i < g_num_ublk_poll_groups; i++) {
		if (g_ublk_tgt.poll_groups[i].ublk_thread == ublk_thread) {
			spdk_poller_unregister(&g_ublk_tgt.poll_groups[i].ublk_poller);
			spdk_iobuf_channel_fini(&g_ublk_tgt.poll_groups[i].iobuf_ch);
			spdk_thread_bind(ublk_thread, false);
			spdk_thread_exit(ublk_thread);
		}
	}
}
805 :
/* Begin device shutdown by sending STOP_DEV to the kernel. Idempotent via
 * the is_closing flag: a second caller gets -EBUSY. The remaining teardown
 * (queue drain, DEL_DEV, free) proceeds asynchronously from completions. */
static int
ublk_close_dev(struct spdk_ublk_dev *ublk)
{
	int rc;

	/* set is_closing */
	if (ublk->is_closing) {
		return -EBUSY;
	}
	ublk->is_closing = true;

	rc = ublk_ctrl_cmd_submit(ublk, UBLK_CMD_STOP_DEV);
	if (rc < 0) {
		SPDK_ERRLOG("stop dev %d failed\n", ublk->ublk_id);
	}
	return rc;
}
823 :
/* Shutdown driver loop: request close on every remaining device, then
 * re-post itself until the device list drains; finally tear down the ctrl
 * ring/fd and fan out thread-exit messages, completing in _ublk_fini_done. */
static void
_ublk_fini(void *args)
{
	struct spdk_ublk_dev *ublk, *ublk_tmp;

	/* ublk_close_dev is idempotent, so re-issuing on each pass is safe */
	TAILQ_FOREACH_SAFE(ublk, &g_ublk_devs, tailq, ublk_tmp) {
		ublk_close_dev(ublk);
	}

	/* Check if all ublks closed */
	if (TAILQ_EMPTY(&g_ublk_devs)) {
		SPDK_DEBUGLOG(ublk, "finish shutdown\n");
		spdk_poller_unregister(&g_ublk_tgt.ctrl_poller);
		if (g_ublk_tgt.ctrl_ring.ring_fd >= 0) {
			io_uring_queue_exit(&g_ublk_tgt.ctrl_ring);
			g_ublk_tgt.ctrl_ring.ring_fd = -1;
		}
		if (g_ublk_tgt.ctrl_fd >= 0) {
			close(g_ublk_tgt.ctrl_fd);
			g_ublk_tgt.ctrl_fd = -1;
		}
		spdk_for_each_thread(ublk_thread_exit, NULL, _ublk_fini_done);
	} else {
		/* devices still draining: retry on the next message-loop pass */
		spdk_thread_send_msg(spdk_get_thread(), _ublk_fini, NULL);
	}
}
850 :
851 : int
852 0 : spdk_ublk_fini(spdk_ublk_fini_cb cb_fn, void *cb_arg)
853 : {
854 0 : assert(spdk_thread_is_app_thread(NULL));
855 :
856 0 : if (g_ublk_tgt.is_destroying == true) {
857 : /* UBLK target is being destroying */
858 0 : return -EBUSY;
859 : }
860 0 : g_ublk_tgt.cb_fn = cb_fn;
861 0 : g_ublk_tgt.cb_arg = cb_arg;
862 0 : g_ublk_tgt.is_destroying = true;
863 0 : _ublk_fini(NULL);
864 :
865 0 : return 0;
866 : }
867 :
868 : int
869 0 : ublk_destroy_target(spdk_ublk_fini_cb cb_fn, void *cb_arg)
870 : {
871 : int rc;
872 :
873 0 : if (g_ublk_tgt.active == false) {
874 : /* UBLK target has not been created */
875 0 : return -ENOENT;
876 : }
877 :
878 0 : rc = spdk_ublk_fini(cb_fn, cb_arg);
879 :
880 0 : return rc;
881 : }
882 :
883 : struct spdk_ublk_dev *
884 0 : ublk_dev_find_by_id(uint32_t ublk_id)
885 : {
886 : struct spdk_ublk_dev *ublk;
887 :
888 : /* check whether ublk has already been registered by ublk path. */
889 0 : TAILQ_FOREACH(ublk, &g_ublk_devs, tailq) {
890 0 : if (ublk->ublk_id == ublk_id) {
891 0 : return ublk;
892 : }
893 : }
894 :
895 0 : return NULL;
896 : }
897 :
/* Return the device's ublk id (N in /dev/ublkbN). */
uint32_t
ublk_dev_get_id(struct spdk_ublk_dev *ublk)
{
	return ublk->ublk_id;
}

/* Iteration helpers over the registered-device list. */
struct spdk_ublk_dev *ublk_dev_first(void)
{
	return TAILQ_FIRST(&g_ublk_devs);
}

struct spdk_ublk_dev *ublk_dev_next(struct spdk_ublk_dev *prev)
{
	return TAILQ_NEXT(prev, tailq);
}

/* Return the configured per-queue depth. */
uint32_t
ublk_dev_get_queue_depth(struct spdk_ublk_dev *ublk)
{
	return ublk->queue_depth;
}

/* Return the configured number of queues. */
uint32_t
ublk_dev_get_num_queues(struct spdk_ublk_dev *ublk)
{
	return ublk->num_queues;
}

/* Return the name of the backing SPDK bdev. */
const char *
ublk_dev_get_bdev_name(struct spdk_ublk_dev *ublk)
{
	return spdk_bdev_get_name(ublk->bdev);
}
931 :
/* Emit the RPC calls (ublk_create_target + one ublk_start_disk per device)
 * needed to recreate the current configuration from a saved JSON config. */
void
spdk_ublk_write_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_ublk_dev *ublk;

	spdk_json_write_array_begin(w);

	if (g_ublk_tgt.active) {
		spdk_json_write_object_begin(w);

		spdk_json_write_named_string(w, "method", "ublk_create_target");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(&g_core_mask));
		spdk_json_write_object_end(w);

		spdk_json_write_object_end(w);
	}

	TAILQ_FOREACH(ublk, &g_ublk_devs, tailq) {
		spdk_json_write_object_begin(w);

		spdk_json_write_named_string(w, "method", "ublk_start_disk");

		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "bdev_name", ublk_dev_get_bdev_name(ublk));
		spdk_json_write_named_uint32(w, "ublk_id", ublk->ublk_id);
		spdk_json_write_named_uint32(w, "num_queues", ublk->num_queues);
		spdk_json_write_named_uint32(w, "queue_depth", ublk->queue_depth);
		spdk_json_write_object_end(w);

		spdk_json_write_object_end(w);
	}

	spdk_json_write_array_end(w);
}
967 :
/* Add a device to the global registration list and bump the device count. */
static void
ublk_dev_list_register(struct spdk_ublk_dev *ublk)
{
	UBLK_DEBUGLOG(ublk, "add to tailq\n");
	TAILQ_INSERT_TAIL(&g_ublk_devs, ublk, tailq);
	g_ublk_tgt.num_ublk_devs++;
}
975 :
/* Remove a device from the global list. Verifies the device is actually
 * registered first; hitting the unregistered path is a programming error. */
static void
ublk_dev_list_unregister(struct spdk_ublk_dev *ublk)
{
	/*
	 * ublk device may be stopped before registered.
	 * check whether it was registered.
	 */

	if (ublk_dev_find_by_id(ublk->ublk_id)) {
		UBLK_DEBUGLOG(ublk, "remove from tailq\n");
		TAILQ_REMOVE(&g_ublk_devs, ublk, tailq);
		assert(g_ublk_tgt.num_ublk_devs);
		g_ublk_tgt.num_ublk_devs--;
		return;
	}

	UBLK_DEBUGLOG(ublk, "not found in tailq\n");
	assert(false);
}
995 :
996 : static void
997 0 : ublk_delete_dev(void *arg)
998 : {
999 0 : struct spdk_ublk_dev *ublk = arg;
1000 0 : int rc = 0;
1001 : uint32_t q_idx;
1002 :
1003 0 : assert(spdk_thread_is_app_thread(NULL));
1004 0 : for (q_idx = 0; q_idx < ublk->num_queues; q_idx++) {
1005 0 : ublk_dev_queue_fini(&ublk->queues[q_idx]);
1006 : }
1007 :
1008 0 : if (ublk->cdev_fd >= 0) {
1009 0 : close(ublk->cdev_fd);
1010 : }
1011 :
1012 0 : rc = ublk_ctrl_cmd_submit(ublk, UBLK_CMD_DEL_DEV);
1013 0 : if (rc < 0) {
1014 0 : SPDK_ERRLOG("delete dev %d failed\n", ublk->ublk_id);
1015 : }
1016 0 : }
1017 :
/* Poller that waits for outstanding ctrl ops to finish before deleting the
 * device; gives up and deletes anyway after retry_count expirations (set to
 * a UBLK_STOP_BUSY_WAITING_MS budget by ublk_try_close_dev). */
static int
_ublk_close_dev_retry(void *arg)
{
	struct spdk_ublk_dev *ublk = arg;

	if (ublk->ctrl_ops_in_progress > 0) {
		if (ublk->retry_count-- > 0) {
			return SPDK_POLLER_BUSY;
		}
		SPDK_ERRLOG("Timeout on ctrl op completion.\n");
	}
	spdk_poller_unregister(&ublk->retry_poller);
	ublk_delete_dev(ublk);
	return SPDK_POLLER_BUSY;
}
1033 :
/* Called on the app thread once per queue as queues finish closing; when the
 * last queue reports in, delete the device (waiting first for any in-flight
 * ctrl command via the retry poller). */
static void
ublk_try_close_dev(void *arg)
{
	struct spdk_ublk_dev *ublk = arg;

	assert(spdk_thread_is_app_thread(NULL));

	ublk->queues_closed += 1;
	SPDK_DEBUGLOG(ublk_io, "ublkb%u closed queues %u\n", ublk->ublk_id, ublk->queues_closed);

	if (ublk->queues_closed < ublk->num_queues) {
		/* still waiting for other queues to drain */
		return;
	}

	if (ublk->ctrl_ops_in_progress > 0) {
		assert(ublk->retry_poller == NULL);
		/* poll until pending ctrl ops finish, bounded by the stop budget */
		ublk->retry_count = UBLK_STOP_BUSY_WAITING_MS * 1000ULL / UBLK_BUSY_POLLING_INTERVAL_US;
		ublk->retry_poller = SPDK_POLLER_REGISTER(_ublk_close_dev_retry, ublk,
				     UBLK_BUSY_POLLING_INTERVAL_US);
	} else {
		ublk_delete_dev(ublk);
	}
}
1057 :
/* Attempt to close one queue during device shutdown; only proceeds when the
 * queue has fully drained, then detaches it from its poll group and notifies
 * the app thread via ublk_try_close_dev(). */
static void
ublk_try_close_queue(struct ublk_queue *q)
{
	struct spdk_ublk_dev *ublk = q->dev;

	/* Close queue until no I/O is submitted to bdev in flight,
	 * no I/O is waiting to commit result, and all I/Os are aborted back.
	 */
	if (!TAILQ_EMPTY(&q->inflight_io_list) || !TAILQ_EMPTY(&q->completed_io_list) || q->cmd_inflight) {
		/* wait for next retry */
		return;
	}

	TAILQ_REMOVE(&q->poll_group->queue_list, q, tailq);
	spdk_put_io_channel(q->bdev_ch);
	q->bdev_ch = NULL;

	/* device-level accounting happens on the app thread */
	spdk_thread_send_msg(spdk_thread_get_app_thread(), ublk_try_close_dev, ublk);
}
1077 :
/* Public entry point: begin stopping the ublk device identified by ublk_id.
 * ctrl_cb/cb_arg are stored and invoked when the control-plane operation
 * completes.  Must be called on the app thread.
 * Returns 0 on successful initiation, -ENODEV if no such device exists,
 * or -EBUSY if the device is already closing or busy with another RPC.
 */
int
ublk_stop_disk(uint32_t ublk_id, ublk_ctrl_cb ctrl_cb, void *cb_arg)
{
	struct spdk_ublk_dev *ublk;

	assert(spdk_thread_is_app_thread(NULL));

	ublk = ublk_dev_find_by_id(ublk_id);
	if (ublk == NULL) {
		SPDK_ERRLOG("no ublk dev with ublk_id=%u\n", ublk_id);
		return -ENODEV;
	}
	if (ublk->is_closing) {
		SPDK_WARNLOG("ublk %d is closing\n", ublk->ublk_id);
		return -EBUSY;
	}
	if (ublk->ctrl_cb) {
		/* A previous RPC's callback has not fired yet. */
		SPDK_WARNLOG("ublk %d is busy with RPC call\n", ublk->ublk_id);
		return -EBUSY;
	}

	ublk->ctrl_cb = ctrl_cb;
	ublk->cb_arg = cb_arg;
	return ublk_close_dev(ublk);
}
1103 :
/* Record the final result of an IO and arm it for the commit-and-fetch
 * cycle back to the kernel driver.
 */
static inline void
ublk_mark_io_done(struct ublk_io *io, int res)
{
	/*
	 * mark io done by target, so that SPDK can commit its
	 * result and fetch new request via io_uring command.
	 */
	io->cmd_op = UBLK_IO_COMMIT_AND_FETCH_REQ;
	io->result = res;
	io->need_data = false;
}
1115 :
1116 : static void
1117 0 : ublk_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
1118 : {
1119 0 : struct ublk_io *io = cb_arg;
1120 0 : struct ublk_queue *q = io->q;
1121 : int res;
1122 :
1123 0 : if (success) {
1124 0 : res = io->result;
1125 : } else {
1126 0 : res = -EIO;
1127 : }
1128 :
1129 0 : ublk_mark_io_done(io, res);
1130 :
1131 0 : SPDK_DEBUGLOG(ublk_io, "(qid %d tag %d res %d)\n",
1132 : q->q_id, io->tag, res);
1133 0 : TAILQ_REMOVE(&q->inflight_io_list, io, tailq);
1134 0 : TAILQ_INSERT_TAIL(&q->completed_io_list, io, tailq);
1135 :
1136 0 : if (bdev_io != NULL) {
1137 0 : spdk_bdev_free_io(bdev_io);
1138 : }
1139 0 : }
1140 :
/* Queue a user-copy data transfer between the IO's local buffer and the
 * per-IO region of the ublk char device.  Note the direction inversion:
 * for a ublk WRITE we read the payload out of the kernel into our buffer,
 * for a ublk READ we write our buffer into the kernel.  The IO is moved to
 * the completed list so the sqe is submitted on the next xmit pass.
 */
static void
ublk_queue_user_copy(struct ublk_io *io, bool is_write)
{
	struct ublk_queue *q = io->q;
	const struct ublksrv_io_desc *iod = io->iod;
	struct io_uring_sqe *sqe;
	uint64_t pos;
	uint32_t nbytes;

	nbytes = iod->nr_sectors * (1ULL << LINUX_SECTOR_SHIFT);
	/* Offset encoding the kernel decodes back to (q_id, tag). */
	pos = ublk_user_copy_pos(q->q_id, io->tag);
	sqe = io_uring_get_sqe(&q->ring);
	assert(sqe);

	if (is_write) {
		io_uring_prep_read(sqe, 0, io->payload, nbytes, pos);
	} else {
		io_uring_prep_write(sqe, 0, io->payload, nbytes, pos);
	}
	io_uring_sqe_set_flags(sqe, IOSQE_FIXED_FILE);
	/* cmd_op 0 in the user_data: the completion path distinguishes this
	 * cqe via io->user_copy instead. */
	io_uring_sqe_set_data64(sqe, build_user_data(io->tag, 0));

	io->user_copy = true;
	TAILQ_REMOVE(&q->inflight_io_list, io, tailq);
	TAILQ_INSERT_TAIL(&q->completed_io_list, io, tailq);
}
1167 :
1168 : static void
1169 0 : ublk_user_copy_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
1170 : {
1171 0 : struct ublk_io *io = cb_arg;
1172 :
1173 0 : spdk_bdev_free_io(bdev_io);
1174 :
1175 0 : if (success) {
1176 0 : ublk_queue_user_copy(io, false);
1177 0 : return;
1178 : }
1179 : /* READ IO Error */
1180 0 : ublk_io_done(NULL, false, cb_arg);
1181 : }
1182 :
1183 : static void
1184 0 : ublk_resubmit_io(void *arg)
1185 : {
1186 0 : struct ublk_io *io = (struct ublk_io *)arg;
1187 :
1188 0 : _ublk_submit_bdev_io(io->q, io);
1189 0 : }
1190 :
1191 : static void
1192 0 : ublk_queue_io(struct ublk_io *io)
1193 : {
1194 : int rc;
1195 0 : struct spdk_bdev *bdev = io->q->dev->bdev;
1196 0 : struct ublk_queue *q = io->q;
1197 :
1198 0 : io->bdev_io_wait.bdev = bdev;
1199 0 : io->bdev_io_wait.cb_fn = ublk_resubmit_io;
1200 0 : io->bdev_io_wait.cb_arg = io;
1201 :
1202 0 : rc = spdk_bdev_queue_io_wait(bdev, q->bdev_ch, &io->bdev_io_wait);
1203 0 : if (rc != 0) {
1204 0 : SPDK_ERRLOG("Queue io failed in ublk_queue_io, rc=%d.\n", rc);
1205 0 : ublk_io_done(NULL, false, io);
1206 : }
1207 0 : }
1208 :
/* iobuf allocation callback: record the raw pool entry, expose a 4 KiB
 * aligned view of it as the IO payload, and continue the caller's flow.
 */
static void
ublk_io_get_buffer_cb(struct spdk_iobuf_entry *iobuf, void *buf)
{
	struct ublk_io *io = SPDK_CONTAINEROF(iobuf, struct ublk_io, iobuf);

	/* Keep the original pointer for spdk_iobuf_put(); payload is the
	 * aligned alias handed to the bdev / kernel. */
	io->mpool_entry = buf;
	assert(io->payload == NULL);
	io->payload = (void *)(uintptr_t)SPDK_ALIGN_CEIL((uintptr_t)buf, 4096ULL);
	io->get_buf_cb(io);
}
1219 :
/* Acquire a data buffer for the IO from the poll group's iobuf channel.
 * get_buf_cb runs either synchronously (buffer immediately available) or
 * later from the iobuf wait queue.
 * NOTE(review): payload_size equals the raw transfer size, yet the callback
 * rounds the pointer up to a 4 KiB boundary -- presumably iobuf pools hand
 * out sufficiently padded buffers; confirm against the iobuf pool config.
 */
static void
ublk_io_get_buffer(struct ublk_io *io, struct spdk_iobuf_channel *iobuf_ch,
		   ublk_get_buf_cb get_buf_cb)
{
	void *buf;

	io->payload_size = io->iod->nr_sectors * (1ULL << LINUX_SECTOR_SHIFT);
	io->get_buf_cb = get_buf_cb;
	buf = spdk_iobuf_get(iobuf_ch, io->payload_size, &io->iobuf, ublk_io_get_buffer_cb);

	if (buf != NULL) {
		/* Got a buffer right away; invoke the callback inline. */
		ublk_io_get_buffer_cb(&io->iobuf, buf);
	}
}
1234 :
1235 : static void
1236 0 : ublk_io_put_buffer(struct ublk_io *io, struct spdk_iobuf_channel *iobuf_ch)
1237 : {
1238 0 : if (io->payload) {
1239 0 : spdk_iobuf_put(iobuf_ch, io->mpool_entry, io->payload_size);
1240 0 : io->mpool_entry = NULL;
1241 0 : io->payload = NULL;
1242 : }
1243 0 : }
1244 :
/* Translate one ublk request into the corresponding bdev operation and
 * submit it.  -ENOMEM submissions are parked on the bdev io_wait queue for
 * retry; any other failure completes the IO with an error.
 */
static void
_ublk_submit_bdev_io(struct ublk_queue *q, struct ublk_io *io)
{
	struct spdk_ublk_dev *ublk = q->dev;
	struct spdk_bdev_desc *desc = io->bdev_desc;
	struct spdk_io_channel *ch = io->bdev_ch;
	uint64_t offset_blocks, num_blocks;
	spdk_bdev_io_completion_cb read_cb;
	uint8_t ublk_op;
	int rc = 0;
	const struct ublksrv_io_desc *iod = io->iod;

	ublk_op = ublksrv_get_op(iod);
	/* Convert 512-byte Linux sectors to bdev blocks. */
	offset_blocks = iod->start_sector >> ublk->sector_per_block_shift;
	num_blocks = iod->nr_sectors >> ublk->sector_per_block_shift;

	switch (ublk_op) {
	case UBLK_IO_OP_READ:
		/* In user-copy mode the read data must be pushed to the
		 * kernel explicitly after the bdev read completes. */
		if (g_ublk_tgt.user_copy) {
			read_cb = ublk_user_copy_read_done;
		} else {
			read_cb = ublk_io_done;
		}
		rc = spdk_bdev_read_blocks(desc, ch, io->payload, offset_blocks, num_blocks, read_cb, io);
		break;
	case UBLK_IO_OP_WRITE:
		rc = spdk_bdev_write_blocks(desc, ch, io->payload, offset_blocks, num_blocks, ublk_io_done, io);
		break;
	case UBLK_IO_OP_FLUSH:
		/* Flush the whole bdev regardless of the request's range. */
		rc = spdk_bdev_flush_blocks(desc, ch, 0, spdk_bdev_get_num_blocks(ublk->bdev), ublk_io_done, io);
		break;
	case UBLK_IO_OP_DISCARD:
		rc = spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, ublk_io_done, io);
		break;
	case UBLK_IO_OP_WRITE_ZEROES:
		rc = spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, ublk_io_done, io);
		break;
	default:
		/* Unsupported opcode; fail below. */
		rc = -1;
	}

	if (rc < 0) {
		if (rc == -ENOMEM) {
			SPDK_INFOLOG(ublk, "No memory, start to queue io.\n");
			ublk_queue_io(io);
		} else {
			SPDK_ERRLOG("ublk io failed in ublk_queue_io, rc=%d, ublk_op=%u\n", rc, ublk_op);
			ublk_io_done(NULL, false, io);
		}
	}
}
1296 :
1297 : static void
1298 0 : read_get_buffer_done(struct ublk_io *io)
1299 : {
1300 0 : _ublk_submit_bdev_io(io->q, io);
1301 0 : }
1302 :
/* Buffer acquired for a WRITE in user-copy mode; pull the payload from the
 * kernel before the bdev write is issued.
 */
static void
user_copy_write_get_buffer_done(struct ublk_io *io)
{
	ublk_queue_user_copy(io, true);
}
1308 :
1309 : static void
1310 0 : ublk_submit_bdev_io(struct ublk_queue *q, struct ublk_io *io)
1311 : {
1312 0 : struct spdk_iobuf_channel *iobuf_ch = &q->poll_group->iobuf_ch;
1313 0 : const struct ublksrv_io_desc *iod = io->iod;
1314 : uint8_t ublk_op;
1315 :
1316 0 : io->result = iod->nr_sectors * (1ULL << LINUX_SECTOR_SHIFT);
1317 0 : ublk_op = ublksrv_get_op(iod);
1318 0 : switch (ublk_op) {
1319 0 : case UBLK_IO_OP_READ:
1320 0 : ublk_io_get_buffer(io, iobuf_ch, read_get_buffer_done);
1321 0 : break;
1322 0 : case UBLK_IO_OP_WRITE:
1323 0 : if (g_ublk_tgt.user_copy) {
1324 0 : ublk_io_get_buffer(io, iobuf_ch, user_copy_write_get_buffer_done);
1325 : } else {
1326 0 : _ublk_submit_bdev_io(q, io);
1327 : }
1328 0 : break;
1329 0 : default:
1330 0 : _ublk_submit_bdev_io(q, io);
1331 0 : break;
1332 : }
1333 0 : }
1334 :
1335 : static inline void
1336 0 : ublksrv_queue_io_cmd(struct ublk_queue *q,
1337 : struct ublk_io *io, unsigned tag)
1338 : {
1339 : struct ublksrv_io_cmd *cmd;
1340 : struct io_uring_sqe *sqe;
1341 0 : unsigned int cmd_op = 0;;
1342 : uint64_t user_data;
1343 :
1344 : /* each io should have operation of fetching or committing */
1345 0 : assert((io->cmd_op == UBLK_IO_FETCH_REQ) || (io->cmd_op == UBLK_IO_NEED_GET_DATA) ||
1346 : (io->cmd_op == UBLK_IO_COMMIT_AND_FETCH_REQ));
1347 0 : cmd_op = io->cmd_op;
1348 :
1349 0 : sqe = io_uring_get_sqe(&q->ring);
1350 0 : assert(sqe);
1351 :
1352 0 : cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe);
1353 0 : if (cmd_op == UBLK_IO_COMMIT_AND_FETCH_REQ) {
1354 0 : cmd->result = io->result;
1355 : }
1356 :
1357 : /* These fields should be written once, never change */
1358 0 : ublk_set_sqe_cmd_op(sqe, cmd_op);
1359 : /* dev->cdev_fd */
1360 0 : sqe->fd = 0;
1361 0 : sqe->opcode = IORING_OP_URING_CMD;
1362 0 : sqe->flags = IOSQE_FIXED_FILE;
1363 0 : sqe->rw_flags = 0;
1364 0 : cmd->tag = tag;
1365 0 : cmd->addr = g_ublk_tgt.user_copy ? 0 : (__u64)(uintptr_t)(io->payload);
1366 0 : cmd->q_id = q->q_id;
1367 :
1368 0 : user_data = build_user_data(tag, cmd_op);
1369 0 : io_uring_sqe_set_data64(sqe, user_data);
1370 :
1371 0 : io->cmd_op = 0;
1372 :
1373 0 : SPDK_DEBUGLOG(ublk_io, "(qid %d tag %u cmd_op %u) iof %x stopping %d\n",
1374 : q->q_id, tag, cmd_op,
1375 : io->cmd_op, q->is_stopping);
1376 0 : }
1377 :
/* Drain the queue's completed list: post one io_uring command per IO (commit
 * result and/or fetch the next request) and submit them in a single batch.
 * Returns the number of commands submitted (0 if the list was empty).
 */
static int
ublk_io_xmit(struct ublk_queue *q)
{
	TAILQ_HEAD(, ublk_io) buffer_free_list;
	struct spdk_iobuf_channel *iobuf_ch;
	int rc = 0, count = 0;
	struct ublk_io *io;

	if (TAILQ_EMPTY(&q->completed_io_list)) {
		return 0;
	}

	TAILQ_INIT(&buffer_free_list);
	while (!TAILQ_EMPTY(&q->completed_io_list)) {
		io = TAILQ_FIRST(&q->completed_io_list);
		assert(io != NULL);
		/*
		 * Remove IO from list now assuming it will be completed. It will be inserted
		 * back to the head if it cannot be completed. This approach is specifically
		 * taken to work around a scan-build use-after-free mischaracterization.
		 */
		TAILQ_REMOVE(&q->completed_io_list, io, tailq);
		if (!io->user_copy) {
			/* IOs still awaiting NEED_GET_DATA keep their buffer. */
			if (!io->need_data) {
				TAILQ_INSERT_TAIL(&buffer_free_list, io, tailq);
			}
			ublksrv_queue_io_cmd(q, io, io->tag);
		}
		count++;
	}

	q->cmd_inflight += count;
	rc = io_uring_submit(&q->ring);
	if (rc != count) {
		SPDK_ERRLOG("could not submit all commands\n");
		assert(false);
	}

	/* Note: for READ io, ublk will always copy the data out of
	 * the buffers in the io_uring_submit context. Since we
	 * are not using SQPOLL for IO rings, we can safely free
	 * those IO buffers here. This design doesn't seem ideal,
	 * but it's what's possible since there is no discrete
	 * COMMIT_REQ operation. That will need to change in the
	 * future should we ever want to support async copy
	 * operations.
	 */
	iobuf_ch = &q->poll_group->iobuf_ch;
	while (!TAILQ_EMPTY(&buffer_free_list)) {
		io = TAILQ_FIRST(&buffer_free_list);
		TAILQ_REMOVE(&buffer_free_list, io, tailq);
		ublk_io_put_buffer(io, iobuf_ch);
	}
	return rc;
}
1433 :
1434 : static void
1435 0 : write_get_buffer_done(struct ublk_io *io)
1436 : {
1437 0 : io->need_data = true;
1438 0 : io->cmd_op = UBLK_IO_NEED_GET_DATA;
1439 0 : io->result = 0;
1440 :
1441 0 : TAILQ_REMOVE(&io->q->inflight_io_list, io, tailq);
1442 0 : TAILQ_INSERT_TAIL(&io->q->completed_io_list, io, tailq);
1443 0 : }
1444 :
/* Reap up to UBLK_QUEUE_REQUEST completions from the queue's io_uring.
 * Each cqe is either a ublk command completion (new request, NEED_GET_DATA
 * grant, or abort) or, when io->user_copy is set, the completion of a
 * user-copy data transfer.  Returns the number of cqes processed.
 */
static int
ublk_io_recv(struct ublk_queue *q)
{
	struct io_uring_cqe *cqe;
	unsigned head, tag;
	int fetch, count = 0;
	struct ublk_io *io;
	struct spdk_iobuf_channel *iobuf_ch;

	if (q->cmd_inflight == 0) {
		/* Nothing outstanding, so nothing to reap. */
		return 0;
	}

	iobuf_ch = &q->poll_group->iobuf_ch;
	io_uring_for_each_cqe(&q->ring, head, cqe) {
		tag = user_data_to_tag(cqe->user_data);
		io = &q->ios[tag];

		SPDK_DEBUGLOG(ublk_io, "res %d qid %d tag %u, user copy %u, cmd_op %u\n",
			      cqe->res, q->q_id, tag, io->user_copy, user_data_to_op(cqe->user_data));

		q->cmd_inflight--;
		/* Tentatively treat the IO as in flight; abort paths below
		 * remove it again. */
		TAILQ_INSERT_TAIL(&q->inflight_io_list, io, tailq);

		if (!io->user_copy) {
			/* An abort result means the kernel is stopping the queue. */
			fetch = (cqe->res != UBLK_IO_RES_ABORT) && !q->is_stopping;
			if (!fetch) {
				q->is_stopping = true;
				if (io->cmd_op == UBLK_IO_FETCH_REQ) {
					io->cmd_op = 0;
				}
			}

			if (cqe->res == UBLK_IO_RES_OK) {
				/* A new request arrived; start servicing it. */
				ublk_submit_bdev_io(q, io);
			} else if (cqe->res == UBLK_IO_RES_NEED_GET_DATA) {
				/* Kernel granted NEED_GET_DATA; fetch a buffer
				 * for the write payload. */
				ublk_io_get_buffer(io, iobuf_ch, write_get_buffer_done);
			} else {
				if (cqe->res != UBLK_IO_RES_ABORT) {
					SPDK_ERRLOG("ublk received error io: res %d qid %d tag %u cmd_op %u\n",
						    cqe->res, q->q_id, tag, user_data_to_op(cqe->user_data));
				}
				TAILQ_REMOVE(&q->inflight_io_list, io, tailq);
			}
		} else {

			/* clear `user_copy` for next use of this IO structure */
			io->user_copy = false;

			assert((ublksrv_get_op(io->iod) == UBLK_IO_OP_READ) ||
			       (ublksrv_get_op(io->iod) == UBLK_IO_OP_WRITE));
			/* A short transfer means the data copy failed. */
			if (cqe->res != io->result) {
				/* EIO */
				ublk_io_done(NULL, false, io);
			} else {
				if (ublksrv_get_op(io->iod) == UBLK_IO_OP_READ) {
					/* bdev_io is already freed in first READ cycle */
					ublk_io_done(NULL, true, io);
				} else {
					/* Write payload is now local; issue the
					 * bdev write. */
					_ublk_submit_bdev_io(q, io);
				}
			}
		}
		count += 1;
		if (count == UBLK_QUEUE_REQUEST) {
			break;
		}
	}
	io_uring_cq_advance(&q->ring, count);

	return count;
}
1517 :
1518 : static int
1519 0 : ublk_poll(void *arg)
1520 : {
1521 0 : struct ublk_poll_group *poll_group = arg;
1522 : struct ublk_queue *q, *q_tmp;
1523 0 : int sent, received, count = 0;
1524 :
1525 0 : TAILQ_FOREACH_SAFE(q, &poll_group->queue_list, tailq, q_tmp) {
1526 0 : sent = ublk_io_xmit(q);
1527 0 : received = ublk_io_recv(q);
1528 0 : if (spdk_unlikely(q->is_stopping)) {
1529 0 : ublk_try_close_queue(q);
1530 : }
1531 0 : count += sent + received;
1532 : }
1533 0 : if (count > 0) {
1534 0 : return SPDK_POLLER_BUSY;
1535 : } else {
1536 0 : return SPDK_POLLER_IDLE;
1537 : }
1538 : }
1539 :
/* The backing bdev is going away; initiate a full device shutdown. */
static void
ublk_bdev_hot_remove(struct spdk_ublk_dev *ublk)
{
	ublk_close_dev(ublk);
}
1545 :
1546 : static void
1547 0 : ublk_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
1548 : void *event_ctx)
1549 : {
1550 0 : switch (type) {
1551 0 : case SPDK_BDEV_EVENT_REMOVE:
1552 0 : ublk_bdev_hot_remove(event_ctx);
1553 0 : break;
1554 0 : default:
1555 0 : SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
1556 0 : break;
1557 : }
1558 0 : }
1559 :
/* Pre-populate the invariant fields of every sqe in the ring.  Later code
 * only fills in the per-command fields (opcode, cmd, user_data).
 */
static void
ublk_dev_init_io_cmds(struct io_uring *r, uint32_t q_depth)
{
	struct io_uring_sqe *sqe;
	uint32_t i;

	for (i = 0; i < q_depth; i++) {
		sqe = ublk_uring_get_sqe(r, i);

		/* These fields should be written once, never change */
		sqe->flags = IOSQE_FIXED_FILE;
		sqe->rw_flags = 0;
		sqe->ioprio = 0;
		sqe->off = 0;
	}
}
1576 :
/* Per-queue kernel plumbing: mmap the shared IO-descriptor array from the
 * char device, create the SQE128 io_uring, and register the cdev fd as
 * fixed file 0.  On any failure, everything set up so far is torn down and
 * a negative errno is returned.
 */
static int
ublk_dev_queue_init(struct ublk_queue *q)
{
	int rc = 0, cmd_buf_size;
	uint32_t j;
	struct spdk_ublk_dev *ublk = q->dev;
	unsigned long off;

	cmd_buf_size = ublk_queue_cmd_buf_sz(q->q_depth);
	/* Each queue's descriptor array lives at a fixed per-queue offset. */
	off = UBLKSRV_CMD_BUF_OFFSET +
	      q->q_id * (UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc));
	q->io_cmd_buf = (struct ublksrv_io_desc *)mmap(0, cmd_buf_size, PROT_READ,
			MAP_SHARED | MAP_POPULATE, ublk->cdev_fd, off);
	if (q->io_cmd_buf == MAP_FAILED) {
		q->io_cmd_buf = NULL;
		rc = -errno;
		SPDK_ERRLOG("Failed at mmap: %s\n", spdk_strerror(-rc));
		return rc;
	}

	/* Every slot starts its life cycle with a FETCH_REQ command. */
	for (j = 0; j < q->q_depth; j++) {
		q->ios[j].cmd_op = UBLK_IO_FETCH_REQ;
		q->ios[j].iod = &q->io_cmd_buf[j];
	}

	rc = ublk_setup_ring(q->q_depth, &q->ring, IORING_SETUP_SQE128);
	if (rc < 0) {
		SPDK_ERRLOG("Failed at setup uring: %s\n", spdk_strerror(-rc));
		munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q->q_depth));
		q->io_cmd_buf = NULL;
		return rc;
	}

	/* Register the cdev as fixed file index 0 (see IOSQE_FIXED_FILE use). */
	rc = io_uring_register_files(&q->ring, &ublk->cdev_fd, 1);
	if (rc != 0) {
		SPDK_ERRLOG("Failed at uring register files: %s\n", spdk_strerror(-rc));
		io_uring_queue_exit(&q->ring);
		q->ring.ring_fd = -1;
		munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q->q_depth));
		q->io_cmd_buf = NULL;
		return rc;
	}

	ublk_dev_init_io_cmds(&q->ring, q->q_depth);

	return 0;
}
1624 :
/* Undo ublk_dev_queue_init(): release the io_uring (if created) and unmap
 * the shared descriptor buffer.  Safe to call on a partially set up queue.
 */
static void
ublk_dev_queue_fini(struct ublk_queue *q)
{
	if (q->ring.ring_fd >= 0) {
		io_uring_unregister_files(&q->ring);
		io_uring_queue_exit(&q->ring);
		q->ring.ring_fd = -1;
	}
	if (q->io_cmd_buf) {
		munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q->q_depth));
	}
}
1637 :
/* Post the initial batch of FETCH_REQ commands for every slot in the queue.
 * Must run on the queue's poll-group thread.
 */
static void
ublk_dev_queue_io_init(struct ublk_queue *q)
{
	struct ublk_io *io;
	uint32_t i;
	int rc __attribute__((unused));
	void *buf;

	/* Some older kernels require a buffer to get posted, even
	 * when NEED_GET_DATA has been specified. So allocate a
	 * temporary buffer, only for purposes of this workaround.
	 * It never actually gets used, so we will free it immediately
	 * after all of the commands are posted.
	 */
	buf = malloc(64);
	/* NOTE(review): malloc result is not checked; on failure a NULL
	 * payload would be posted, defeating the workaround above. */

	assert(q->bdev_ch != NULL);

	/* Initialize and submit all io commands to ublk driver */
	for (i = 0; i < q->q_depth; i++) {
		io = &q->ios[i];
		io->tag = (uint16_t)i;
		io->payload = buf;
		io->bdev_ch = q->bdev_ch;
		io->bdev_desc = q->dev->bdev_desc;
		ublksrv_queue_io_cmd(q, io, i);
	}

	q->cmd_inflight += q->q_depth;
	rc = io_uring_submit(&q->ring);
	assert(rc == (int)q->q_depth);
	/* The temporary buffer must not remain referenced by any IO. */
	for (i = 0; i < q->q_depth; i++) {
		io = &q->ios[i];
		io->payload = NULL;
	}
	free(buf);
}
1675 :
1676 : static int
1677 0 : ublk_set_params(struct spdk_ublk_dev *ublk)
1678 : {
1679 : int rc;
1680 :
1681 0 : rc = ublk_ctrl_cmd_submit(ublk, UBLK_CMD_SET_PARAMS);
1682 0 : if (rc < 0) {
1683 0 : SPDK_ERRLOG("UBLK can't set params for dev %d, rc %s\n", ublk->ublk_id, spdk_strerror(-rc));
1684 : }
1685 :
1686 0 : return rc;
1687 : }
1688 :
1689 : static void
1690 0 : ublk_dev_info_init(struct spdk_ublk_dev *ublk)
1691 : {
1692 0 : struct ublksrv_ctrl_dev_info uinfo = {
1693 0 : .queue_depth = ublk->queue_depth,
1694 0 : .nr_hw_queues = ublk->num_queues,
1695 0 : .dev_id = ublk->ublk_id,
1696 : .max_io_buf_bytes = UBLK_IO_MAX_BYTES,
1697 0 : .ublksrv_pid = getpid(),
1698 : .flags = UBLK_F_URING_CMD_COMP_IN_TASK,
1699 : };
1700 :
1701 0 : if (g_ublk_tgt.user_copy) {
1702 0 : uinfo.flags |= UBLK_F_USER_COPY;
1703 : } else {
1704 0 : uinfo.flags |= UBLK_F_NEED_GET_DATA;
1705 : }
1706 :
1707 0 : if (g_ublk_tgt.user_recovery) {
1708 0 : uinfo.flags |= UBLK_F_USER_RECOVERY;
1709 0 : uinfo.flags |= UBLK_F_USER_RECOVERY_REISSUE;
1710 : }
1711 :
1712 0 : ublk->dev_info = uinfo;
1713 0 : }
1714 :
1715 : /* Set ublk device parameters based on bdev */
1716 : static void
1717 0 : ublk_info_param_init(struct spdk_ublk_dev *ublk)
1718 : {
1719 0 : struct spdk_bdev *bdev = ublk->bdev;
1720 0 : uint32_t blk_size = spdk_bdev_get_data_block_size(bdev);
1721 0 : uint32_t pblk_size = spdk_bdev_get_physical_block_size(bdev);
1722 0 : uint32_t io_opt_blocks = spdk_bdev_get_optimal_io_boundary(bdev);
1723 0 : uint64_t num_blocks = spdk_bdev_get_num_blocks(bdev);
1724 0 : uint8_t sectors_per_block = blk_size >> LINUX_SECTOR_SHIFT;
1725 0 : uint32_t io_min_size = blk_size;
1726 0 : uint32_t io_opt_size = spdk_max(io_opt_blocks * blk_size, io_min_size);
1727 :
1728 0 : struct ublk_params uparams = {
1729 : .types = UBLK_PARAM_TYPE_BASIC,
1730 : .len = sizeof(struct ublk_params),
1731 : .basic = {
1732 0 : .logical_bs_shift = spdk_u32log2(blk_size),
1733 0 : .physical_bs_shift = spdk_u32log2(pblk_size),
1734 0 : .io_min_shift = spdk_u32log2(io_min_size),
1735 0 : .io_opt_shift = spdk_u32log2(io_opt_size),
1736 0 : .dev_sectors = num_blocks * sectors_per_block,
1737 : .max_sectors = UBLK_IO_MAX_BYTES >> LINUX_SECTOR_SHIFT,
1738 : }
1739 : };
1740 :
1741 0 : if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) {
1742 0 : uparams.basic.attrs = UBLK_ATTR_VOLATILE_CACHE;
1743 : }
1744 :
1745 0 : if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
1746 0 : uparams.types |= UBLK_PARAM_TYPE_DISCARD;
1747 0 : uparams.discard.discard_alignment = sectors_per_block;
1748 0 : uparams.discard.max_discard_sectors = num_blocks * sectors_per_block;
1749 0 : uparams.discard.max_discard_segments = 1;
1750 0 : uparams.discard.discard_granularity = blk_size;
1751 0 : if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) {
1752 0 : uparams.discard.max_write_zeroes_sectors = num_blocks * sectors_per_block;
1753 : }
1754 : }
1755 :
1756 0 : ublk->dev_params = uparams;
1757 0 : }
1758 :
/* Thread-message trampoline so ublk_free_dev() runs on the app thread. */
static void
_ublk_free_dev(void *arg)
{
	ublk_free_dev((struct spdk_ublk_dev *)arg);
}
1766 :
/* Runs on the queue's poll-group thread: return every IO buffer to that
 * thread's iobuf channel, free the ios array, then bounce back to the app
 * thread to continue device teardown.
 */
static void
free_buffers(void *arg)
{
	struct ublk_queue *q = arg;
	uint32_t i;

	for (i = 0; i < q->q_depth; i++) {
		ublk_io_put_buffer(&q->ios[i], &q->poll_group->iobuf_ch);
	}
	free(q->ios);
	/* Signals ublk_free_dev() that this queue is fully cleaned up. */
	q->ios = NULL;
	spdk_thread_send_msg(spdk_thread_get_app_thread(), _ublk_free_dev, q->dev);
}
1780 :
/* Final teardown of a ublk device.  Re-entrant by design: when a queue's
 * buffers must be freed on its poll-group thread, a message is sent there
 * and this function returns; free_buffers() calls back here (via the app
 * thread) until no queue has a live ios array, then the bdev descriptor and
 * the device itself are released.
 */
static void
ublk_free_dev(struct spdk_ublk_dev *ublk)
{
	struct ublk_queue *q;
	uint32_t q_idx;

	for (q_idx = 0; q_idx < ublk->num_queues; q_idx++) {
		q = &ublk->queues[q_idx];

		/* The ublk_io of this queue are not initialized. */
		if (q->ios == NULL) {
			continue;
		}

		/* We found a queue that has an ios array that may have buffers
		 * that need to be freed. Send a message to the queue's thread
		 * so it can free the buffers back to that thread's iobuf channel.
		 * When it's done, it will set q->ios to NULL and send a message
		 * back to this function to continue.
		 */
		if (q->poll_group) {
			spdk_thread_send_msg(q->poll_group->ublk_thread, free_buffers, q);
			return;
		} else {
			/* Queue never reached a poll group; no buffers to return. */
			free(q->ios);
			q->ios = NULL;
		}
	}

	/* All of the buffers associated with the queues have been freed, so now
	 * continue with releasing resources for the rest of the ublk device.
	 */
	if (ublk->bdev_desc) {
		spdk_bdev_close(ublk->bdev_desc);
		ublk->bdev_desc = NULL;
	}

	ublk_dev_list_unregister(ublk);
	SPDK_NOTICELOG("ublk dev %d stopped\n", ublk->ublk_id);

	free(ublk);
}
1823 :
1824 : static int
1825 0 : ublk_ios_init(struct spdk_ublk_dev *ublk)
1826 : {
1827 : int rc;
1828 : uint32_t i, j;
1829 : struct ublk_queue *q;
1830 :
1831 0 : for (i = 0; i < ublk->num_queues; i++) {
1832 0 : q = &ublk->queues[i];
1833 :
1834 0 : TAILQ_INIT(&q->completed_io_list);
1835 0 : TAILQ_INIT(&q->inflight_io_list);
1836 0 : q->dev = ublk;
1837 0 : q->q_id = i;
1838 0 : q->q_depth = ublk->queue_depth;
1839 0 : q->ios = calloc(q->q_depth, sizeof(struct ublk_io));
1840 0 : if (!q->ios) {
1841 0 : rc = -ENOMEM;
1842 0 : SPDK_ERRLOG("could not allocate queue ios\n");
1843 0 : goto err;
1844 : }
1845 0 : for (j = 0; j < q->q_depth; j++) {
1846 0 : q->ios[j].q = q;
1847 : }
1848 : }
1849 :
1850 0 : return 0;
1851 :
1852 0 : err:
1853 0 : for (i = 0; i < ublk->num_queues; i++) {
1854 0 : free(q->ios);
1855 0 : q->ios = NULL;
1856 : }
1857 0 : return rc;
1858 : }
1859 :
1860 : static void
1861 0 : ublk_queue_recovery_done(void *arg)
1862 : {
1863 0 : struct spdk_ublk_dev *ublk = arg;
1864 :
1865 0 : ublk->online_num_queues++;
1866 0 : if (ublk->is_recovering && (ublk->online_num_queues == ublk->num_queues)) {
1867 0 : ublk_ctrl_cmd_submit(ublk, UBLK_CMD_END_USER_RECOVERY);
1868 : }
1869 0 : }
1870 :
/* Runs on the queue's assigned poll-group thread: acquire the bdev channel,
 * post the initial FETCH_REQ commands, attach the queue to the poll group,
 * and notify the app thread that this queue is online.
 */
static void
ublk_queue_run(void *arg1)
{
	struct ublk_queue *q = arg1;
	struct spdk_ublk_dev *ublk = q->dev;
	struct ublk_poll_group *poll_group = q->poll_group;

	assert(spdk_get_thread() == poll_group->ublk_thread);
	q->bdev_ch = spdk_bdev_get_io_channel(ublk->bdev_desc);
	/* Queues must be filled with IO in the io pthread */
	ublk_dev_queue_io_init(q);

	TAILQ_INSERT_TAIL(&poll_group->queue_list, q, tailq);
	spdk_thread_send_msg(spdk_thread_get_app_thread(), ublk_queue_recovery_done, ublk);
}
1886 :
/* Public entry point: expose bdev_name as kernel block device ublkb<ublk_id>
 * with the requested queue count/depth (both clamped to driver maximums).
 * ctrl_cb fires when the ADD_DEV control command completes; the remaining
 * start sequence continues from there.  Must run on the app thread.
 * Returns 0 on successful initiation or a negative errno.
 */
int
ublk_start_disk(const char *bdev_name, uint32_t ublk_id,
		uint32_t num_queues, uint32_t queue_depth,
		ublk_ctrl_cb ctrl_cb, void *cb_arg)
{
	int rc;
	uint32_t i;
	struct spdk_bdev *bdev;
	struct spdk_ublk_dev *ublk = NULL;
	uint32_t sector_per_block;

	assert(spdk_thread_is_app_thread(NULL));

	if (g_ublk_tgt.active == false) {
		SPDK_ERRLOG("NO ublk target exist\n");
		return -ENODEV;
	}

	ublk = ublk_dev_find_by_id(ublk_id);
	if (ublk != NULL) {
		SPDK_DEBUGLOG(ublk, "ublk id %d is in use.\n", ublk_id);
		return -EBUSY;
	}

	if (g_ublk_tgt.num_ublk_devs >= g_ublks_max) {
		SPDK_DEBUGLOG(ublk, "Reached maximum number of supported devices: %u\n", g_ublks_max);
		return -ENOTSUP;
	}

	ublk = calloc(1, sizeof(*ublk));
	if (ublk == NULL) {
		return -ENOMEM;
	}
	ublk->ctrl_cb = ctrl_cb;
	ublk->cb_arg = cb_arg;
	/* The char device is opened later, in ublk_start_dev(). */
	ublk->cdev_fd = -1;
	ublk->ublk_id = ublk_id;
	UBLK_DEBUGLOG(ublk, "bdev %s num_queues %d queue_depth %d\n",
		      bdev_name, num_queues, queue_depth);

	rc = spdk_bdev_open_ext(bdev_name, true, ublk_bdev_event_cb, ublk, &ublk->bdev_desc);
	if (rc != 0) {
		SPDK_ERRLOG("could not open bdev %s, error=%d\n", bdev_name, rc);
		free(ublk);
		return rc;
	}

	bdev = spdk_bdev_desc_get_bdev(ublk->bdev_desc);
	ublk->bdev = bdev;
	sector_per_block = spdk_bdev_get_data_block_size(ublk->bdev) >> LINUX_SECTOR_SHIFT;
	ublk->sector_per_block_shift = spdk_u32log2(sector_per_block);

	ublk->queues_closed = 0;
	ublk->num_queues = num_queues;
	ublk->queue_depth = queue_depth;
	/* Clamp user-supplied sizing to the driver's limits. */
	if (ublk->queue_depth > UBLK_DEV_MAX_QUEUE_DEPTH) {
		SPDK_WARNLOG("Set Queue depth %d of UBLK %d to maximum %d\n",
			     ublk->queue_depth, ublk->ublk_id, UBLK_DEV_MAX_QUEUE_DEPTH);
		ublk->queue_depth = UBLK_DEV_MAX_QUEUE_DEPTH;
	}
	if (ublk->num_queues > UBLK_DEV_MAX_QUEUES) {
		SPDK_WARNLOG("Set Queue num %d of UBLK %d to maximum %d\n",
			     ublk->num_queues, ublk->ublk_id, UBLK_DEV_MAX_QUEUES);
		ublk->num_queues = UBLK_DEV_MAX_QUEUES;
	}
	for (i = 0; i < ublk->num_queues; i++) {
		ublk->queues[i].ring.ring_fd = -1;
	}

	ublk_dev_info_init(ublk);
	ublk_info_param_init(ublk);
	rc = ublk_ios_init(ublk);
	if (rc != 0) {
		spdk_bdev_close(ublk->bdev_desc);
		free(ublk);
		return rc;
	}

	SPDK_INFOLOG(ublk, "Enabling kernel access to bdev %s via ublk %d\n",
		     bdev_name, ublk_id);

	/* Add ublk_dev to the end of disk list */
	ublk_dev_list_register(ublk);
	rc = ublk_ctrl_cmd_submit(ublk, UBLK_CMD_ADD_DEV);
	if (rc < 0) {
		SPDK_ERRLOG("UBLK can't add dev %d, rc %s\n", ublk->ublk_id, spdk_strerror(-rc));
		ublk_free_dev(ublk);
	}

	return rc;
}
1978 :
1979 : static int
1980 0 : ublk_start_dev(struct spdk_ublk_dev *ublk, bool is_recovering)
1981 : {
1982 : int rc;
1983 : uint32_t q_id;
1984 : struct spdk_thread *ublk_thread;
1985 0 : char buf[64];
1986 :
1987 0 : snprintf(buf, 64, "%s%d", UBLK_BLK_CDEV, ublk->ublk_id);
1988 0 : ublk->cdev_fd = open(buf, O_RDWR);
1989 0 : if (ublk->cdev_fd < 0) {
1990 0 : rc = ublk->cdev_fd;
1991 0 : SPDK_ERRLOG("can't open %s, rc %d\n", buf, rc);
1992 0 : return rc;
1993 : }
1994 :
1995 0 : for (q_id = 0; q_id < ublk->num_queues; q_id++) {
1996 0 : rc = ublk_dev_queue_init(&ublk->queues[q_id]);
1997 0 : if (rc) {
1998 0 : return rc;
1999 : }
2000 : }
2001 :
2002 0 : if (!is_recovering) {
2003 0 : rc = ublk_ctrl_cmd_submit(ublk, UBLK_CMD_START_DEV);
2004 0 : if (rc < 0) {
2005 0 : SPDK_ERRLOG("start dev %d failed, rc %s\n", ublk->ublk_id,
2006 : spdk_strerror(-rc));
2007 0 : return rc;
2008 : }
2009 : }
2010 :
2011 : /* Send queue to different spdk_threads for load balance */
2012 0 : for (q_id = 0; q_id < ublk->num_queues; q_id++) {
2013 0 : ublk->queues[q_id].poll_group = &g_ublk_tgt.poll_groups[g_next_ublk_poll_group];
2014 0 : ublk_thread = g_ublk_tgt.poll_groups[g_next_ublk_poll_group].ublk_thread;
2015 0 : spdk_thread_send_msg(ublk_thread, ublk_queue_run, &ublk->queues[q_id]);
2016 0 : g_next_ublk_poll_group++;
2017 0 : if (g_next_ublk_poll_group == g_num_ublk_poll_groups) {
2018 0 : g_next_ublk_poll_group = 0;
2019 : }
2020 : }
2021 :
2022 0 : return 0;
2023 : }
2024 :
/* Continue recovery after GET_DEV_INFO: adopt the kernel's queue geometry,
 * rebuild the per-queue state, and issue START_USER_RECOVERY.
 * Returns 0 on successful submission or a negative errno.
 */
static int
ublk_ctrl_start_recovery(struct spdk_ublk_dev *ublk)
{
	int rc;
	uint32_t i;

	/* Geometry comes from the kernel's dev_info, not RPC parameters. */
	ublk->num_queues = ublk->dev_info.nr_hw_queues;
	ublk->queue_depth = ublk->dev_info.queue_depth;
	ublk->dev_info.ublksrv_pid = getpid();

	SPDK_DEBUGLOG(ublk, "Recovering ublk %d, num queues %u, queue depth %u, flags 0x%llx\n",
		      ublk->ublk_id,
		      ublk->num_queues, ublk->queue_depth, ublk->dev_info.flags);

	for (i = 0; i < ublk->num_queues; i++) {
		ublk->queues[i].ring.ring_fd = -1;
	}

	ublk_info_param_init(ublk);
	rc = ublk_ios_init(ublk);
	if (rc != 0) {
		return rc;
	}

	ublk->is_recovering = true;
	return ublk_ctrl_cmd_submit(ublk, UBLK_CMD_START_USER_RECOVERY);
}
2052 :
/* Public entry point: re-attach a previously running kernel ublk device
 * (user recovery) to bdev_name.  Kicks off with GET_DEV_INFO; the rest of
 * the recovery sequence continues from its completion.  Must run on the app
 * thread.  Returns 0 on successful initiation or a negative errno.
 */
int
ublk_start_disk_recovery(const char *bdev_name, uint32_t ublk_id, ublk_ctrl_cb ctrl_cb,
			 void *cb_arg)
{
	int rc;
	struct spdk_bdev *bdev;
	struct spdk_ublk_dev *ublk = NULL;
	uint32_t sector_per_block;

	assert(spdk_thread_is_app_thread(NULL));

	if (g_ublk_tgt.active == false) {
		SPDK_ERRLOG("NO ublk target exist\n");
		return -ENODEV;
	}

	if (!g_ublk_tgt.user_recovery) {
		SPDK_ERRLOG("User recovery is enabled with kernel version >= 6.4\n");
		return -ENOTSUP;
	}

	ublk = ublk_dev_find_by_id(ublk_id);
	if (ublk != NULL) {
		SPDK_DEBUGLOG(ublk, "ublk id %d is in use.\n", ublk_id);
		return -EBUSY;
	}

	if (g_ublk_tgt.num_ublk_devs >= g_ublks_max) {
		SPDK_DEBUGLOG(ublk, "Reached maximum number of supported devices: %u\n", g_ublks_max);
		return -ENOTSUP;
	}

	ublk = calloc(1, sizeof(*ublk));
	if (ublk == NULL) {
		return -ENOMEM;
	}
	ublk->ctrl_cb = ctrl_cb;
	ublk->cb_arg = cb_arg;
	ublk->cdev_fd = -1;
	ublk->ublk_id = ublk_id;

	rc = spdk_bdev_open_ext(bdev_name, true, ublk_bdev_event_cb, ublk, &ublk->bdev_desc);
	if (rc != 0) {
		SPDK_ERRLOG("could not open bdev %s, error=%d\n", bdev_name, rc);
		free(ublk);
		return rc;
	}

	bdev = spdk_bdev_desc_get_bdev(ublk->bdev_desc);
	ublk->bdev = bdev;
	sector_per_block = spdk_bdev_get_data_block_size(ublk->bdev) >> LINUX_SECTOR_SHIFT;
	ublk->sector_per_block_shift = spdk_u32log2(sector_per_block);

	SPDK_NOTICELOG("Recovering ublk %d with bdev %s\n", ublk->ublk_id, bdev_name);

	ublk_dev_list_register(ublk);
	/* Queue geometry is unknown until GET_DEV_INFO completes; recovery
	 * continues in its completion handler. */
	rc = ublk_ctrl_cmd_submit(ublk, UBLK_CMD_GET_DEV_INFO);
	if (rc < 0) {
		ublk_free_dev(ublk);
	}

	return rc;
}
2116 :
2117 0 : SPDK_LOG_REGISTER_COMPONENT(ublk)
2118 0 : SPDK_LOG_REGISTER_COMPONENT(ublk_io)
|