Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2019 Intel Corporation.
3 : * All rights reserved.
4 : * Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 : */
6 :
7 : #include "spdk/stdinc.h"
8 :
9 : #include "vbdev_delay.h"
10 : #include "spdk/rpc.h"
11 : #include "spdk/env.h"
12 : #include "spdk/endian.h"
13 : #include "spdk/string.h"
14 : #include "spdk/thread.h"
15 : #include "spdk/util.h"
16 :
17 : #include "spdk/bdev_module.h"
18 : #include "spdk/log.h"
19 :
20 : /* This namespace UUID was generated using uuid_generate() method. */
21 : #define BDEV_DELAY_NAMESPACE_UUID "4009b574-6430-4f1b-bc40-ace811091027"
22 :
23 : static int vbdev_delay_init(void);
24 : static int vbdev_delay_get_ctx_size(void);
25 : static void vbdev_delay_examine(struct spdk_bdev *bdev);
26 : static void vbdev_delay_finish(void);
27 : static int vbdev_delay_config_json(struct spdk_json_write_ctx *w);
28 :
29 : static struct spdk_bdev_module delay_if = {
30 : .name = "delay",
31 : .module_init = vbdev_delay_init,
32 : .get_ctx_size = vbdev_delay_get_ctx_size,
33 : .examine_config = vbdev_delay_examine,
34 : .module_fini = vbdev_delay_finish,
35 : .config_json = vbdev_delay_config_json
36 : };
37 :
38 0 : SPDK_BDEV_MODULE_REGISTER(delay, &delay_if)
39 :
40 : /* Associative list to be used in examine */
41 : struct bdev_association {
42 : char *vbdev_name;
43 : char *bdev_name;
44 : struct spdk_uuid uuid;
45 : uint64_t avg_read_latency;
46 : uint64_t p99_read_latency;
47 : uint64_t avg_write_latency;
48 : uint64_t p99_write_latency;
49 : TAILQ_ENTRY(bdev_association) link;
50 : };
51 : static TAILQ_HEAD(, bdev_association) g_bdev_associations = TAILQ_HEAD_INITIALIZER(
52 : g_bdev_associations);
53 :
54 : /* List of virtual bdevs and associated info for each. */
55 : struct vbdev_delay {
56 : struct spdk_bdev *base_bdev; /* the thing we're attaching to */
57 : struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */
58 : struct spdk_bdev delay_bdev; /* the delay virtual bdev */
59 : uint64_t average_read_latency_ticks; /* the average read delay */
60 : uint64_t p99_read_latency_ticks; /* the p99 read delay */
61 : uint64_t average_write_latency_ticks; /* the average write delay */
62 : uint64_t p99_write_latency_ticks; /* the p99 write delay */
63 : TAILQ_ENTRY(vbdev_delay) link;
64 : struct spdk_thread *thread; /* thread where base device is opened */
65 : };
66 : static TAILQ_HEAD(, vbdev_delay) g_delay_nodes = TAILQ_HEAD_INITIALIZER(g_delay_nodes);
67 :
68 : struct delay_bdev_io {
69 : int status;
70 :
71 : uint64_t completion_tick;
72 :
73 : enum delay_io_type type;
74 :
75 : struct spdk_io_channel *ch;
76 :
77 : struct spdk_bdev_io_wait_entry bdev_io_wait;
78 :
79 : struct spdk_bdev_io *zcopy_bdev_io;
80 :
81 : STAILQ_ENTRY(delay_bdev_io) link;
82 : };
83 :
/* Per-channel context of a delay vbdev. */
struct delay_io_channel {
	/* I/O channel of the base bdev. */
	struct spdk_io_channel *base_ch;
	/* I/O waiting for its delay to elapse, one queue per latency class. */
	STAILQ_HEAD(, delay_bdev_io) avg_read_io;
	STAILQ_HEAD(, delay_bdev_io) p99_read_io;
	STAILQ_HEAD(, delay_bdev_io) avg_write_io;
	STAILQ_HEAD(, delay_bdev_io) p99_write_io;
	/* Poller that completes I/O from the queues above. */
	struct spdk_poller *io_poller;
	/* Seed for rand_r() when choosing between average and p99 latency. */
	unsigned int rand_seed;
};

/* Declared ahead of its definition because the resubmit path calls it. */
static void vbdev_delay_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io);
95 :
96 :
97 : /* Callback for unregistering the IO device. */
98 : static void
99 0 : _device_unregister_cb(void *io_device)
100 : {
101 0 : struct vbdev_delay *delay_node = io_device;
102 :
103 : /* Done with this delay_node. */
104 0 : free(delay_node->delay_bdev.name);
105 0 : free(delay_node);
106 0 : }
107 :
/* Thread message handler: closes the base bdev descriptor handed over in ctx. */
static void
_vbdev_delay_destruct(void *ctx)
{
	spdk_bdev_close((struct spdk_bdev_desc *)ctx);
}
115 :
116 : static int
117 0 : vbdev_delay_destruct(void *ctx)
118 : {
119 0 : struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;
120 :
121 : /* It is important to follow this exact sequence of steps for destroying
122 : * a vbdev...
123 : */
124 :
125 0 : TAILQ_REMOVE(&g_delay_nodes, delay_node, link);
126 :
127 : /* Unclaim the underlying bdev. */
128 0 : spdk_bdev_module_release_bdev(delay_node->base_bdev);
129 :
130 : /* Close the underlying bdev on its same opened thread. */
131 0 : if (delay_node->thread && delay_node->thread != spdk_get_thread()) {
132 0 : spdk_thread_send_msg(delay_node->thread, _vbdev_delay_destruct, delay_node->base_desc);
133 : } else {
134 0 : spdk_bdev_close(delay_node->base_desc);
135 : }
136 :
137 : /* Unregister the io_device. */
138 0 : spdk_io_device_unregister(delay_node, _device_unregister_cb);
139 :
140 0 : return 0;
141 : }
142 :
143 : static int
144 0 : _process_io_stailq(void *arg, uint64_t ticks)
145 : {
146 0 : STAILQ_HEAD(, delay_bdev_io) *head = arg;
147 : struct delay_bdev_io *io_ctx, *tmp;
148 0 : int completions = 0;
149 :
150 0 : STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) {
151 0 : if (io_ctx->completion_tick <= ticks) {
152 0 : STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link);
153 0 : spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), io_ctx->status);
154 0 : completions++;
155 : } else {
156 : /* In the general case, I/O will become ready in an fifo order. When timeouts are dynamically
157 : * changed, this is not necessarily the case. However, the normal behavior will be restored
158 : * after the outstanding I/O at the time of the change have been completed.
159 : * This essentially means that moving from a high to low latency creates a dam for the new I/O
160 : * submitted after the latency change. This is considered desirable behavior for the use case where
161 : * we are trying to trigger a pre-defined timeout on an initiator.
162 : */
163 0 : break;
164 : }
165 : }
166 :
167 0 : return completions;
168 : }
169 :
170 : static int
171 0 : _delay_finish_io(void *arg)
172 : {
173 0 : struct delay_io_channel *delay_ch = arg;
174 0 : uint64_t ticks = spdk_get_ticks();
175 0 : int completions = 0;
176 :
177 0 : completions += _process_io_stailq(&delay_ch->avg_read_io, ticks);
178 0 : completions += _process_io_stailq(&delay_ch->avg_write_io, ticks);
179 0 : completions += _process_io_stailq(&delay_ch->p99_read_io, ticks);
180 0 : completions += _process_io_stailq(&delay_ch->p99_write_io, ticks);
181 :
182 0 : return completions == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
183 : }
184 :
185 : /* Completion callback for IO that were issued from this bdev. The original bdev_io
186 : * is passed in as an arg so we'll complete that one with the appropriate status
187 : * and then free the one that this module issued.
188 : */
189 : static void
190 0 : _delay_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
191 : {
192 0 : struct spdk_bdev_io *orig_io = cb_arg;
193 0 : struct vbdev_delay *delay_node = SPDK_CONTAINEROF(orig_io->bdev, struct vbdev_delay, delay_bdev);
194 0 : struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)orig_io->driver_ctx;
195 0 : struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
196 :
197 0 : io_ctx->status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
198 :
199 0 : if (bdev_io->type == SPDK_BDEV_IO_TYPE_ZCOPY && bdev_io->u.bdev.zcopy.start && success) {
200 0 : io_ctx->zcopy_bdev_io = bdev_io;
201 : } else {
202 0 : assert(io_ctx->zcopy_bdev_io == NULL || io_ctx->zcopy_bdev_io == bdev_io);
203 0 : io_ctx->zcopy_bdev_io = NULL;
204 0 : spdk_bdev_free_io(bdev_io);
205 : }
206 :
207 : /* Put the I/O into the proper list for processing by the channel poller. */
208 0 : switch (io_ctx->type) {
209 0 : case DELAY_AVG_READ:
210 0 : io_ctx->completion_tick = spdk_get_ticks() + delay_node->average_read_latency_ticks;
211 0 : STAILQ_INSERT_TAIL(&delay_ch->avg_read_io, io_ctx, link);
212 0 : break;
213 0 : case DELAY_AVG_WRITE:
214 0 : io_ctx->completion_tick = spdk_get_ticks() + delay_node->average_write_latency_ticks;
215 0 : STAILQ_INSERT_TAIL(&delay_ch->avg_write_io, io_ctx, link);
216 0 : break;
217 0 : case DELAY_P99_READ:
218 0 : io_ctx->completion_tick = spdk_get_ticks() + delay_node->p99_read_latency_ticks;
219 0 : STAILQ_INSERT_TAIL(&delay_ch->p99_read_io, io_ctx, link);
220 0 : break;
221 0 : case DELAY_P99_WRITE:
222 0 : io_ctx->completion_tick = spdk_get_ticks() + delay_node->p99_write_latency_ticks;
223 0 : STAILQ_INSERT_TAIL(&delay_ch->p99_write_io, io_ctx, link);
224 0 : break;
225 0 : case DELAY_NONE:
226 : default:
227 0 : spdk_bdev_io_complete(orig_io, io_ctx->status);
228 0 : break;
229 : }
230 0 : }
231 :
232 : static void
233 0 : vbdev_delay_resubmit_io(void *arg)
234 : {
235 0 : struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg;
236 0 : struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
237 :
238 0 : vbdev_delay_submit_request(io_ctx->ch, bdev_io);
239 0 : }
240 :
241 : static void
242 0 : vbdev_delay_queue_io(struct spdk_bdev_io *bdev_io)
243 : {
244 0 : struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
245 0 : struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
246 : int rc;
247 :
248 0 : io_ctx->bdev_io_wait.bdev = bdev_io->bdev;
249 0 : io_ctx->bdev_io_wait.cb_fn = vbdev_delay_resubmit_io;
250 0 : io_ctx->bdev_io_wait.cb_arg = bdev_io;
251 :
252 0 : rc = spdk_bdev_queue_io_wait(bdev_io->bdev, delay_ch->base_ch, &io_ctx->bdev_io_wait);
253 0 : if (rc != 0) {
254 0 : SPDK_ERRLOG("Queue io failed in vbdev_delay_queue_io, rc=%d.\n", rc);
255 0 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
256 : }
257 0 : }
258 :
259 : static void
260 0 : delay_init_ext_io_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts)
261 : {
262 0 : memset(opts, 0, sizeof(*opts));
263 0 : opts->size = sizeof(*opts);
264 0 : opts->memory_domain = bdev_io->u.bdev.memory_domain;
265 0 : opts->memory_domain_ctx = bdev_io->u.bdev.memory_domain_ctx;
266 0 : opts->metadata = bdev_io->u.bdev.md_buf;
267 0 : }
268 :
269 : static void
270 0 : delay_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
271 : {
272 0 : struct vbdev_delay *delay_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_delay,
273 : delay_bdev);
274 0 : struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
275 0 : struct spdk_bdev_ext_io_opts io_opts;
276 : int rc;
277 :
278 0 : if (!success) {
279 0 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
280 0 : return;
281 : }
282 :
283 0 : delay_init_ext_io_opts(bdev_io, &io_opts);
284 0 : rc = spdk_bdev_readv_blocks_ext(delay_node->base_desc, delay_ch->base_ch, bdev_io->u.bdev.iovs,
285 : bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
286 : bdev_io->u.bdev.num_blocks, _delay_complete_io,
287 : bdev_io, &io_opts);
288 :
289 0 : if (rc == -ENOMEM) {
290 0 : SPDK_ERRLOG("No memory, start to queue io for delay.\n");
291 0 : vbdev_delay_queue_io(bdev_io);
292 0 : } else if (rc != 0) {
293 0 : SPDK_ERRLOG("ERROR on bdev_io submission!\n");
294 0 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
295 : }
296 : }
297 :
298 : static void
299 0 : vbdev_delay_reset_dev(struct spdk_io_channel_iter *i, int status)
300 : {
301 0 : struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i);
302 0 : struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
303 0 : struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(io_ctx->ch);
304 0 : struct vbdev_delay *delay_node = spdk_io_channel_iter_get_io_device(i);
305 : int rc;
306 :
307 0 : rc = spdk_bdev_reset(delay_node->base_desc, delay_ch->base_ch,
308 : _delay_complete_io, bdev_io);
309 :
310 0 : if (rc == -ENOMEM) {
311 0 : SPDK_ERRLOG("No memory, start to queue io for delay.\n");
312 0 : vbdev_delay_queue_io(bdev_io);
313 0 : } else if (rc != 0) {
314 0 : SPDK_ERRLOG("ERROR on bdev_io submission!\n");
315 0 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
316 : }
317 0 : }
318 :
/* Completion callback of a zcopy end issued while aborting: only frees the
 * base bdev_io; the outcome is ignored.
 */
static void
abort_zcopy_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
}
324 :
325 : static void
326 0 : _abort_all_delayed_io(void *arg)
327 : {
328 0 : STAILQ_HEAD(, delay_bdev_io) *head = arg;
329 : struct delay_bdev_io *io_ctx, *tmp;
330 :
331 0 : STAILQ_FOREACH_SAFE(io_ctx, head, link, tmp) {
332 0 : STAILQ_REMOVE(head, io_ctx, delay_bdev_io, link);
333 0 : if (io_ctx->zcopy_bdev_io != NULL) {
334 0 : spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, false, abort_zcopy_io, NULL);
335 : }
336 0 : spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io_ctx), SPDK_BDEV_IO_STATUS_ABORTED);
337 : }
338 0 : }
339 :
340 : static void
341 0 : vbdev_delay_reset_channel(struct spdk_io_channel_iter *i)
342 : {
343 0 : struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
344 0 : struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
345 :
346 0 : _abort_all_delayed_io(&delay_ch->avg_read_io);
347 0 : _abort_all_delayed_io(&delay_ch->avg_write_io);
348 0 : _abort_all_delayed_io(&delay_ch->p99_read_io);
349 0 : _abort_all_delayed_io(&delay_ch->p99_write_io);
350 :
351 0 : spdk_for_each_channel_continue(i, 0);
352 0 : }
353 :
354 : static bool
355 0 : abort_delayed_io(void *_head, struct spdk_bdev_io *bio_to_abort)
356 : {
357 0 : STAILQ_HEAD(, delay_bdev_io) *head = _head;
358 0 : struct delay_bdev_io *io_ctx_to_abort = (struct delay_bdev_io *)bio_to_abort->driver_ctx;
359 : struct delay_bdev_io *io_ctx;
360 :
361 0 : STAILQ_FOREACH(io_ctx, head, link) {
362 0 : if (io_ctx == io_ctx_to_abort) {
363 0 : STAILQ_REMOVE(head, io_ctx_to_abort, delay_bdev_io, link);
364 0 : if (io_ctx->zcopy_bdev_io != NULL) {
365 0 : spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, false, abort_zcopy_io, NULL);
366 : }
367 0 : spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED);
368 0 : return true;
369 : }
370 : }
371 :
372 0 : return false;
373 : }
374 :
375 : static int
376 0 : vbdev_delay_abort(struct vbdev_delay *delay_node, struct delay_io_channel *delay_ch,
377 : struct spdk_bdev_io *bdev_io)
378 : {
379 0 : struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort;
380 :
381 0 : if (abort_delayed_io(&delay_ch->avg_read_io, bio_to_abort) ||
382 0 : abort_delayed_io(&delay_ch->avg_write_io, bio_to_abort) ||
383 0 : abort_delayed_io(&delay_ch->p99_read_io, bio_to_abort) ||
384 0 : abort_delayed_io(&delay_ch->p99_write_io, bio_to_abort)) {
385 0 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
386 0 : return 0;
387 : }
388 :
389 0 : return spdk_bdev_abort(delay_node->base_desc, delay_ch->base_ch, bio_to_abort,
390 : _delay_complete_io, bdev_io);
391 : }
392 :
393 : static void
394 0 : vbdev_delay_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
395 : {
396 0 : struct vbdev_delay *delay_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_delay, delay_bdev);
397 0 : struct delay_io_channel *delay_ch = spdk_io_channel_get_ctx(ch);
398 0 : struct delay_bdev_io *io_ctx = (struct delay_bdev_io *)bdev_io->driver_ctx;
399 0 : struct spdk_bdev_ext_io_opts io_opts;
400 0 : int rc = 0;
401 : bool is_p99;
402 :
403 0 : is_p99 = rand_r(&delay_ch->rand_seed) % 100 == 0 ? true : false;
404 :
405 0 : io_ctx->ch = ch;
406 0 : io_ctx->type = DELAY_NONE;
407 0 : if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY || bdev_io->u.bdev.zcopy.start) {
408 0 : io_ctx->zcopy_bdev_io = NULL;
409 : }
410 :
411 0 : switch (bdev_io->type) {
412 0 : case SPDK_BDEV_IO_TYPE_READ:
413 0 : io_ctx->type = is_p99 ? DELAY_P99_READ : DELAY_AVG_READ;
414 0 : spdk_bdev_io_get_buf(bdev_io, delay_read_get_buf_cb,
415 0 : bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
416 0 : break;
417 0 : case SPDK_BDEV_IO_TYPE_WRITE:
418 0 : io_ctx->type = is_p99 ? DELAY_P99_WRITE : DELAY_AVG_WRITE;
419 0 : delay_init_ext_io_opts(bdev_io, &io_opts);
420 0 : rc = spdk_bdev_writev_blocks_ext(delay_node->base_desc, delay_ch->base_ch, bdev_io->u.bdev.iovs,
421 : bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks,
422 : bdev_io->u.bdev.num_blocks, _delay_complete_io,
423 : bdev_io, &io_opts);
424 0 : break;
425 0 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
426 0 : rc = spdk_bdev_write_zeroes_blocks(delay_node->base_desc, delay_ch->base_ch,
427 : bdev_io->u.bdev.offset_blocks,
428 : bdev_io->u.bdev.num_blocks,
429 : _delay_complete_io, bdev_io);
430 0 : break;
431 0 : case SPDK_BDEV_IO_TYPE_UNMAP:
432 0 : rc = spdk_bdev_unmap_blocks(delay_node->base_desc, delay_ch->base_ch,
433 : bdev_io->u.bdev.offset_blocks,
434 : bdev_io->u.bdev.num_blocks,
435 : _delay_complete_io, bdev_io);
436 0 : break;
437 0 : case SPDK_BDEV_IO_TYPE_FLUSH:
438 0 : rc = spdk_bdev_flush_blocks(delay_node->base_desc, delay_ch->base_ch,
439 : bdev_io->u.bdev.offset_blocks,
440 : bdev_io->u.bdev.num_blocks,
441 : _delay_complete_io, bdev_io);
442 0 : break;
443 0 : case SPDK_BDEV_IO_TYPE_RESET:
444 : /* During reset, the generic bdev layer aborts all new I/Os and queues all new resets.
445 : * Hence we can simply abort all I/Os delayed to complete.
446 : */
447 0 : spdk_for_each_channel(delay_node, vbdev_delay_reset_channel, bdev_io,
448 : vbdev_delay_reset_dev);
449 0 : break;
450 0 : case SPDK_BDEV_IO_TYPE_ABORT:
451 0 : rc = vbdev_delay_abort(delay_node, delay_ch, bdev_io);
452 0 : break;
453 0 : case SPDK_BDEV_IO_TYPE_ZCOPY:
454 0 : if (bdev_io->u.bdev.zcopy.commit) {
455 0 : io_ctx->type = is_p99 ? DELAY_P99_WRITE : DELAY_AVG_WRITE;
456 0 : } else if (bdev_io->u.bdev.zcopy.populate) {
457 0 : io_ctx->type = is_p99 ? DELAY_P99_READ : DELAY_AVG_READ;
458 : }
459 0 : if (bdev_io->u.bdev.zcopy.start) {
460 0 : rc = spdk_bdev_zcopy_start(delay_node->base_desc, delay_ch->base_ch,
461 : bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
462 : bdev_io->u.bdev.offset_blocks,
463 : bdev_io->u.bdev.num_blocks,
464 0 : bdev_io->u.bdev.zcopy.populate,
465 : _delay_complete_io, bdev_io);
466 : } else {
467 0 : rc = spdk_bdev_zcopy_end(io_ctx->zcopy_bdev_io, bdev_io->u.bdev.zcopy.commit,
468 : _delay_complete_io, bdev_io);
469 : }
470 0 : break;
471 0 : default:
472 0 : SPDK_ERRLOG("delay: unknown I/O type %d\n", bdev_io->type);
473 0 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
474 0 : return;
475 : }
476 :
477 0 : if (rc == -ENOMEM) {
478 0 : SPDK_ERRLOG("No memory, start to queue io for delay.\n");
479 0 : vbdev_delay_queue_io(bdev_io);
480 0 : } else if (rc != 0) {
481 0 : SPDK_ERRLOG("ERROR on bdev_io submission!\n");
482 0 : spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
483 : }
484 : }
485 :
486 : static bool
487 0 : vbdev_delay_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
488 : {
489 0 : struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;
490 :
491 0 : return spdk_bdev_io_type_supported(delay_node->base_bdev, io_type);
492 : }
493 :
/* get_io_channel entry point: returns a channel for the io_device keyed by the delay node. */
static struct spdk_io_channel *
vbdev_delay_get_io_channel(void *ctx)
{
	struct vbdev_delay *node = ctx;

	return spdk_get_io_channel(node);
}
504 :
505 : static void
506 0 : _delay_write_conf_values(struct vbdev_delay *delay_node, struct spdk_json_write_ctx *w)
507 : {
508 0 : struct spdk_uuid *uuid = &delay_node->delay_bdev.uuid;
509 :
510 0 : spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&delay_node->delay_bdev));
511 0 : spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(delay_node->base_bdev));
512 0 : if (!spdk_uuid_is_null(uuid)) {
513 0 : spdk_json_write_named_uuid(w, "uuid", uuid);
514 : }
515 0 : spdk_json_write_named_int64(w, "avg_read_latency",
516 0 : delay_node->average_read_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
517 0 : spdk_json_write_named_int64(w, "p99_read_latency",
518 0 : delay_node->p99_read_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
519 0 : spdk_json_write_named_int64(w, "avg_write_latency",
520 0 : delay_node->average_write_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
521 0 : spdk_json_write_named_int64(w, "p99_write_latency",
522 0 : delay_node->p99_write_latency_ticks * SPDK_SEC_TO_USEC / spdk_get_ticks_hz());
523 0 : }
524 :
/* dump_info_json entry point: emits a "delay" object holding the vbdev's
 * configuration values. Always returns 0.
 */
static int
vbdev_delay_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
{
	struct vbdev_delay *node = ctx;

	spdk_json_write_named_object_begin(w, "delay");
	_delay_write_conf_values(node, w);
	spdk_json_write_object_end(w);

	return 0;
}
537 :
538 : /* This is used to generate JSON that can configure this module to its current state. */
539 : static int
540 0 : vbdev_delay_config_json(struct spdk_json_write_ctx *w)
541 : {
542 : struct vbdev_delay *delay_node;
543 :
544 0 : TAILQ_FOREACH(delay_node, &g_delay_nodes, link) {
545 0 : spdk_json_write_object_begin(w);
546 0 : spdk_json_write_named_string(w, "method", "bdev_delay_create");
547 0 : spdk_json_write_named_object_begin(w, "params");
548 0 : _delay_write_conf_values(delay_node, w);
549 0 : spdk_json_write_object_end(w);
550 0 : spdk_json_write_object_end(w);
551 : }
552 0 : return 0;
553 : }
554 :
555 : /* We provide this callback for the SPDK channel code to create a channel using
556 : * the channel struct we provided in our module get_io_channel() entry point. Here
557 : * we get and save off an underlying base channel of the device below us so that
558 : * we can communicate with the base bdev on a per channel basis. If we needed
559 : * our own poller for this vbdev, we'd register it here.
560 : */
561 : static int
562 0 : delay_bdev_ch_create_cb(void *io_device, void *ctx_buf)
563 : {
564 0 : struct delay_io_channel *delay_ch = ctx_buf;
565 0 : struct vbdev_delay *delay_node = io_device;
566 :
567 0 : STAILQ_INIT(&delay_ch->avg_read_io);
568 0 : STAILQ_INIT(&delay_ch->p99_read_io);
569 0 : STAILQ_INIT(&delay_ch->avg_write_io);
570 0 : STAILQ_INIT(&delay_ch->p99_write_io);
571 :
572 0 : delay_ch->io_poller = SPDK_POLLER_REGISTER(_delay_finish_io, delay_ch, 0);
573 0 : delay_ch->base_ch = spdk_bdev_get_io_channel(delay_node->base_desc);
574 0 : delay_ch->rand_seed = time(NULL);
575 :
576 0 : return 0;
577 : }
578 :
579 : /* We provide this callback for the SPDK channel code to destroy a channel
580 : * created with our create callback. We just need to undo anything we did
581 : * when we created. If this bdev used its own poller, we'd unregister it here.
582 : */
583 : static void
584 0 : delay_bdev_ch_destroy_cb(void *io_device, void *ctx_buf)
585 : {
586 0 : struct delay_io_channel *delay_ch = ctx_buf;
587 :
588 0 : spdk_poller_unregister(&delay_ch->io_poller);
589 0 : spdk_put_io_channel(delay_ch->base_ch);
590 0 : }
591 :
592 : /* Create the delay association from the bdev and vbdev name and insert
593 : * on the global list. */
594 : static int
595 0 : vbdev_delay_insert_association(const char *bdev_name, const char *vbdev_name,
596 : struct spdk_uuid *uuid,
597 : uint64_t avg_read_latency, uint64_t p99_read_latency,
598 : uint64_t avg_write_latency, uint64_t p99_write_latency)
599 : {
600 : struct bdev_association *assoc;
601 :
602 0 : TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
603 0 : if (strcmp(vbdev_name, assoc->vbdev_name) == 0) {
604 0 : SPDK_ERRLOG("delay bdev %s already exists\n", vbdev_name);
605 0 : return -EEXIST;
606 : }
607 : }
608 :
609 0 : assoc = calloc(1, sizeof(struct bdev_association));
610 0 : if (!assoc) {
611 0 : SPDK_ERRLOG("could not allocate bdev_association\n");
612 0 : return -ENOMEM;
613 : }
614 :
615 0 : assoc->bdev_name = strdup(bdev_name);
616 0 : if (!assoc->bdev_name) {
617 0 : SPDK_ERRLOG("could not allocate assoc->bdev_name\n");
618 0 : free(assoc);
619 0 : return -ENOMEM;
620 : }
621 :
622 0 : assoc->vbdev_name = strdup(vbdev_name);
623 0 : if (!assoc->vbdev_name) {
624 0 : SPDK_ERRLOG("could not allocate assoc->vbdev_name\n");
625 0 : free(assoc->bdev_name);
626 0 : free(assoc);
627 0 : return -ENOMEM;
628 : }
629 :
630 0 : assoc->avg_read_latency = avg_read_latency;
631 0 : assoc->p99_read_latency = p99_read_latency;
632 0 : assoc->avg_write_latency = avg_write_latency;
633 0 : assoc->p99_write_latency = p99_write_latency;
634 0 : spdk_uuid_copy(&assoc->uuid, uuid);
635 :
636 0 : TAILQ_INSERT_TAIL(&g_bdev_associations, assoc, link);
637 :
638 0 : return 0;
639 : }
640 :
641 : int
642 0 : vbdev_delay_update_latency_value(char *delay_name, uint64_t latency_us, enum delay_io_type type)
643 : {
644 : struct vbdev_delay *delay_node;
645 0 : uint64_t ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
646 :
647 0 : TAILQ_FOREACH(delay_node, &g_delay_nodes, link) {
648 0 : if (strcmp(delay_node->delay_bdev.name, delay_name) == 0) {
649 0 : break;
650 : }
651 : }
652 :
653 0 : if (delay_node == NULL) {
654 0 : return -ENODEV;
655 : }
656 :
657 0 : switch (type) {
658 0 : case DELAY_AVG_READ:
659 0 : delay_node->average_read_latency_ticks = ticks_mhz * latency_us;
660 0 : break;
661 0 : case DELAY_AVG_WRITE:
662 0 : delay_node->average_write_latency_ticks = ticks_mhz * latency_us;
663 0 : break;
664 0 : case DELAY_P99_READ:
665 0 : delay_node->p99_read_latency_ticks = ticks_mhz * latency_us;
666 0 : break;
667 0 : case DELAY_P99_WRITE:
668 0 : delay_node->p99_write_latency_ticks = ticks_mhz * latency_us;
669 0 : break;
670 0 : default:
671 0 : return -EINVAL;
672 : }
673 :
674 0 : return 0;
675 : }
676 :
/* Module init entry point. There is nothing to set up (.ini style
 * configuration is not supported), so it simply reports success.
 */
static int
vbdev_delay_init(void)
{
	return 0;
}
683 :
684 : static void
685 0 : vbdev_delay_finish(void)
686 : {
687 : struct bdev_association *assoc;
688 :
689 0 : while ((assoc = TAILQ_FIRST(&g_bdev_associations))) {
690 0 : TAILQ_REMOVE(&g_bdev_associations, assoc, link);
691 0 : free(assoc->bdev_name);
692 0 : free(assoc->vbdev_name);
693 0 : free(assoc);
694 : }
695 0 : }
696 :
697 : static int
698 0 : vbdev_delay_get_ctx_size(void)
699 : {
700 0 : return sizeof(struct delay_bdev_io);
701 : }
702 :
/* write_config_json entry point. Intentionally empty: no per-bdev
 * configuration is written here.
 */
static void
vbdev_delay_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
}
708 :
709 : static int
710 0 : vbdev_delay_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
711 : {
712 0 : struct vbdev_delay *delay_node = (struct vbdev_delay *)ctx;
713 :
714 : /* Delay bdev doesn't work with data buffers, so it supports any memory domain used by base_bdev */
715 0 : return spdk_bdev_get_memory_domains(delay_node->base_bdev, domains, array_size);
716 : }
717 :
718 : /* When we register our bdev this is how we specify our entry points. */
719 : static const struct spdk_bdev_fn_table vbdev_delay_fn_table = {
720 : .destruct = vbdev_delay_destruct,
721 : .submit_request = vbdev_delay_submit_request,
722 : .io_type_supported = vbdev_delay_io_type_supported,
723 : .get_io_channel = vbdev_delay_get_io_channel,
724 : .dump_info_json = vbdev_delay_dump_info_json,
725 : .write_config_json = vbdev_delay_write_config_json,
726 : .get_memory_domains = vbdev_delay_get_memory_domains,
727 : };
728 :
729 : static void
730 0 : vbdev_delay_base_bdev_hotremove_cb(struct spdk_bdev *bdev_find)
731 : {
732 : struct vbdev_delay *delay_node, *tmp;
733 :
734 0 : TAILQ_FOREACH_SAFE(delay_node, &g_delay_nodes, link, tmp) {
735 0 : if (bdev_find == delay_node->base_bdev) {
736 0 : spdk_bdev_unregister(&delay_node->delay_bdev, NULL, NULL);
737 : }
738 : }
739 0 : }
740 :
741 : /* Called when the underlying base bdev triggers asynchronous event such as bdev removal. */
742 : static void
743 0 : vbdev_delay_base_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
744 : void *event_ctx)
745 : {
746 0 : switch (type) {
747 0 : case SPDK_BDEV_EVENT_REMOVE:
748 0 : vbdev_delay_base_bdev_hotremove_cb(bdev);
749 0 : break;
750 0 : default:
751 0 : SPDK_NOTICELOG("Unsupported bdev event: type %d\n", type);
752 0 : break;
753 : }
754 0 : }
755 :
756 : /* Create and register the delay vbdev if we find it in our list of bdev names.
757 : * This can be called either by the examine path or RPC method.
758 : */
759 : static int
760 0 : vbdev_delay_register(const char *bdev_name)
761 : {
762 : struct bdev_association *assoc;
763 : struct vbdev_delay *delay_node;
764 : struct spdk_bdev *bdev;
765 0 : uint64_t ticks_mhz = spdk_get_ticks_hz() / SPDK_SEC_TO_USEC;
766 0 : struct spdk_uuid ns_uuid;
767 0 : int rc = 0;
768 :
769 0 : spdk_uuid_parse(&ns_uuid, BDEV_DELAY_NAMESPACE_UUID);
770 :
771 : /* Check our list of names from config versus this bdev and if
772 : * there's a match, create the delay_node & bdev accordingly.
773 : */
774 0 : TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
775 0 : if (strcmp(assoc->bdev_name, bdev_name) != 0) {
776 0 : continue;
777 : }
778 :
779 0 : delay_node = calloc(1, sizeof(struct vbdev_delay));
780 0 : if (!delay_node) {
781 0 : rc = -ENOMEM;
782 0 : SPDK_ERRLOG("could not allocate delay_node\n");
783 0 : break;
784 : }
785 0 : delay_node->delay_bdev.name = strdup(assoc->vbdev_name);
786 0 : if (!delay_node->delay_bdev.name) {
787 0 : rc = -ENOMEM;
788 0 : SPDK_ERRLOG("could not allocate delay_bdev name\n");
789 0 : free(delay_node);
790 0 : break;
791 : }
792 0 : delay_node->delay_bdev.product_name = "delay";
793 :
794 : /* The base bdev that we're attaching to. */
795 0 : rc = spdk_bdev_open_ext(bdev_name, true, vbdev_delay_base_bdev_event_cb,
796 : NULL, &delay_node->base_desc);
797 0 : if (rc) {
798 0 : if (rc != -ENODEV) {
799 0 : SPDK_ERRLOG("could not open bdev %s\n", bdev_name);
800 : }
801 0 : free(delay_node->delay_bdev.name);
802 0 : free(delay_node);
803 0 : break;
804 : }
805 :
806 0 : bdev = spdk_bdev_desc_get_bdev(delay_node->base_desc);
807 0 : delay_node->base_bdev = bdev;
808 :
809 0 : delay_node->delay_bdev.write_cache = bdev->write_cache;
810 0 : delay_node->delay_bdev.required_alignment = bdev->required_alignment;
811 0 : delay_node->delay_bdev.optimal_io_boundary = bdev->optimal_io_boundary;
812 0 : delay_node->delay_bdev.blocklen = bdev->blocklen;
813 0 : delay_node->delay_bdev.blockcnt = bdev->blockcnt;
814 :
815 0 : delay_node->delay_bdev.md_interleave = bdev->md_interleave;
816 0 : delay_node->delay_bdev.md_len = bdev->md_len;
817 0 : delay_node->delay_bdev.dif_type = bdev->dif_type;
818 0 : delay_node->delay_bdev.dif_is_head_of_md = bdev->dif_is_head_of_md;
819 0 : delay_node->delay_bdev.dif_check_flags = bdev->dif_check_flags;
820 0 : delay_node->delay_bdev.dif_pi_format = bdev->dif_pi_format;
821 :
822 0 : delay_node->delay_bdev.ctxt = delay_node;
823 0 : delay_node->delay_bdev.fn_table = &vbdev_delay_fn_table;
824 0 : delay_node->delay_bdev.module = &delay_if;
825 :
826 : /* Store the number of ticks you need to add to get the I/O expiration time. */
827 0 : delay_node->average_read_latency_ticks = ticks_mhz * assoc->avg_read_latency;
828 0 : delay_node->p99_read_latency_ticks = ticks_mhz * assoc->p99_read_latency;
829 0 : delay_node->average_write_latency_ticks = ticks_mhz * assoc->avg_write_latency;
830 0 : delay_node->p99_write_latency_ticks = ticks_mhz * assoc->p99_write_latency;
831 :
832 0 : if (spdk_uuid_is_null(&assoc->uuid)) {
833 : /* Generate UUID based on namespace UUID + base bdev UUID */
834 0 : rc = spdk_uuid_generate_sha1(&delay_node->delay_bdev.uuid, &ns_uuid,
835 0 : (const char *)&bdev->uuid, sizeof(struct spdk_uuid));
836 0 : if (rc) {
837 0 : spdk_bdev_close(delay_node->base_desc);
838 0 : free(delay_node->delay_bdev.name);
839 0 : free(delay_node);
840 0 : break;
841 : }
842 : } else {
843 0 : spdk_uuid_copy(&delay_node->delay_bdev.uuid, &assoc->uuid);
844 : }
845 :
846 0 : spdk_io_device_register(delay_node, delay_bdev_ch_create_cb, delay_bdev_ch_destroy_cb,
847 : sizeof(struct delay_io_channel),
848 0 : assoc->vbdev_name);
849 :
850 : /* Save the thread where the base device is opened */
851 0 : delay_node->thread = spdk_get_thread();
852 :
853 0 : rc = spdk_bdev_module_claim_bdev(bdev, delay_node->base_desc, delay_node->delay_bdev.module);
854 0 : if (rc) {
855 0 : SPDK_ERRLOG("could not claim bdev %s\n", bdev_name);
856 0 : goto error_close;
857 : }
858 :
859 0 : rc = spdk_bdev_register(&delay_node->delay_bdev);
860 0 : if (rc) {
861 0 : SPDK_ERRLOG("could not register delay_bdev\n");
862 0 : spdk_bdev_module_release_bdev(delay_node->base_bdev);
863 0 : goto error_close;
864 : }
865 :
866 0 : TAILQ_INSERT_TAIL(&g_delay_nodes, delay_node, link);
867 : }
868 :
869 0 : return rc;
870 :
871 0 : error_close:
872 0 : spdk_bdev_close(delay_node->base_desc);
873 0 : spdk_io_device_unregister(delay_node, NULL);
874 0 : free(delay_node->delay_bdev.name);
875 0 : free(delay_node);
876 0 : return rc;
877 : }
878 :
879 : int
880 0 : create_delay_disk(const char *bdev_name, const char *vbdev_name, struct spdk_uuid *uuid,
881 : uint64_t avg_read_latency,
882 : uint64_t p99_read_latency, uint64_t avg_write_latency, uint64_t p99_write_latency)
883 : {
884 0 : int rc = 0;
885 :
886 0 : if (p99_read_latency < avg_read_latency || p99_write_latency < avg_write_latency) {
887 0 : SPDK_ERRLOG("Unable to create a delay bdev where p99 latency is less than average latency.\n");
888 0 : return -EINVAL;
889 : }
890 :
891 0 : rc = vbdev_delay_insert_association(bdev_name, vbdev_name, uuid, avg_read_latency, p99_read_latency,
892 : avg_write_latency, p99_write_latency);
893 0 : if (rc) {
894 0 : return rc;
895 : }
896 :
897 0 : rc = vbdev_delay_register(bdev_name);
898 0 : if (rc == -ENODEV) {
899 : /* This is not an error, we tracked the name above and it still
900 : * may show up later.
901 : */
902 0 : SPDK_NOTICELOG("vbdev creation deferred pending base bdev arrival\n");
903 0 : rc = 0;
904 : }
905 :
906 0 : return rc;
907 : }
908 :
909 : void
910 0 : delete_delay_disk(const char *vbdev_name, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
911 : {
912 : struct bdev_association *assoc;
913 : int rc;
914 :
915 0 : rc = spdk_bdev_unregister_by_name(vbdev_name, &delay_if, cb_fn, cb_arg);
916 0 : if (rc == 0) {
917 0 : TAILQ_FOREACH(assoc, &g_bdev_associations, link) {
918 0 : if (strcmp(assoc->vbdev_name, vbdev_name) == 0) {
919 0 : TAILQ_REMOVE(&g_bdev_associations, assoc, link);
920 0 : free(assoc->bdev_name);
921 0 : free(assoc->vbdev_name);
922 0 : free(assoc);
923 0 : break;
924 : }
925 : }
926 : } else {
927 0 : cb_fn(cb_arg, rc);
928 : }
929 0 : }
930 :
931 : static void
932 0 : vbdev_delay_examine(struct spdk_bdev *bdev)
933 : {
934 0 : vbdev_delay_register(bdev->name);
935 :
936 0 : spdk_bdev_module_examine_done(&delay_if);
937 0 : }
938 :
939 0 : SPDK_LOG_REGISTER_COMPONENT(vbdev_delay)
|