Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2019 Intel Corporation.
3 : * All rights reserved.
4 : * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 : */
6 :
7 : #include "bdev_raid.h"
8 :
9 : #include "spdk/env.h"
10 : #include "spdk/thread.h"
11 : #include "spdk/string.h"
12 : #include "spdk/util.h"
13 :
14 : #include "spdk/log.h"
15 :
16 : /*
17 : * brief:
18 : * raid0_bdev_io_completion function is called by lower layers to notify raid
19 : * module that particular bdev_io is completed.
20 : * params:
21 : * bdev_io - pointer to bdev io submitted to lower layers, like child io
22 : * success - bdev_io status
23 : * cb_arg - function callback context (parent raid_bdev_io)
24 : * returns:
25 : * none
26 : */
27 : static void
28 10 : raid0_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
29 : {
30 10 : struct raid_bdev_io *raid_io = cb_arg;
31 : int rc;
32 :
33 10 : if (success) {
34 8 : if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
35 : spdk_bdev_get_dif_type(bdev_io->bdev) != SPDK_DIF_DISABLE &&
36 : bdev_io->bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) {
37 :
38 2 : rc = raid_bdev_verify_dix_reftag(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
39 : bdev_io->u.bdev.md_buf, bdev_io->u.bdev.num_blocks, bdev_io->bdev,
40 2 : bdev_io->u.bdev.offset_blocks);
41 2 : if (rc != 0) {
42 0 : SPDK_ERRLOG("Reftag verify failed.\n");
43 0 : raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
44 0 : return;
45 : }
46 : }
47 :
48 8 : raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_SUCCESS);
49 : } else {
50 2 : raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
51 : }
52 :
53 10 : spdk_bdev_free_io(bdev_io);
54 : }
55 :
static void raid0_submit_rw_request(struct raid_bdev_io *raid_io);

/*
 * Retry trampoline with a void * argument: it is handed to
 * raid_bdev_queue_io_wait() by raid0_submit_rw_request() and simply
 * re-submits the raid_bdev_io it receives.
 */
static void
_raid0_submit_rw_request(void *_raid_io)
{
	raid0_submit_rw_request((struct raid_bdev_io *)_raid_io);
}
65 :
66 : /*
67 : * brief:
68 : * raid0_submit_rw_request function is used to submit I/O to the correct
69 : * member disk for raid0 bdevs.
70 : * params:
71 : * raid_io
72 : * returns:
73 : * none
74 : */
75 : static void
76 10 : raid0_submit_rw_request(struct raid_bdev_io *raid_io)
77 : {
78 10 : struct spdk_bdev_ext_io_opts io_opts = {};
79 10 : struct raid_bdev_io_channel *raid_ch = raid_io->raid_ch;
80 10 : struct raid_bdev *raid_bdev = raid_io->raid_bdev;
81 : uint64_t pd_strip;
82 : uint32_t offset_in_strip;
83 : uint64_t pd_lba;
84 : uint64_t pd_blocks;
85 : uint8_t pd_idx;
86 10 : int ret = 0;
87 : uint64_t start_strip;
88 : uint64_t end_strip;
89 : struct raid_base_bdev_info *base_info;
90 : struct spdk_io_channel *base_ch;
91 :
92 10 : start_strip = raid_io->offset_blocks >> raid_bdev->strip_size_shift;
93 10 : end_strip = (raid_io->offset_blocks + raid_io->num_blocks - 1) >>
94 10 : raid_bdev->strip_size_shift;
95 10 : if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) {
96 0 : assert(false);
97 : SPDK_ERRLOG("I/O spans strip boundary!\n");
98 : raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
99 : return;
100 : }
101 :
102 10 : pd_strip = start_strip / raid_bdev->num_base_bdevs;
103 10 : pd_idx = start_strip % raid_bdev->num_base_bdevs;
104 10 : offset_in_strip = raid_io->offset_blocks & (raid_bdev->strip_size - 1);
105 10 : pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip;
106 10 : pd_blocks = raid_io->num_blocks;
107 10 : base_info = &raid_bdev->base_bdev_info[pd_idx];
108 10 : if (base_info->desc == NULL) {
109 0 : SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx);
110 0 : assert(0);
111 : }
112 :
113 : /*
114 : * Submit child io to bdev layer with using base bdev descriptors, base
115 : * bdev lba, base bdev child io length in blocks, buffer, completion
116 : * function and function callback context
117 : */
118 10 : assert(raid_ch != NULL);
119 10 : base_ch = raid_bdev_channel_get_base_channel(raid_ch, pd_idx);
120 :
121 10 : io_opts.size = sizeof(io_opts);
122 10 : io_opts.memory_domain = raid_io->memory_domain;
123 10 : io_opts.memory_domain_ctx = raid_io->memory_domain_ctx;
124 10 : io_opts.metadata = raid_io->md_buf;
125 :
126 10 : if (raid_io->type == SPDK_BDEV_IO_TYPE_READ) {
127 4 : ret = raid_bdev_readv_blocks_ext(base_info, base_ch,
128 : raid_io->iovs, raid_io->iovcnt,
129 : pd_lba, pd_blocks, raid0_bdev_io_completion,
130 : raid_io, &io_opts);
131 6 : } else if (raid_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
132 6 : struct spdk_bdev *bdev = &base_info->raid_bdev->bdev;
133 :
134 6 : if (spdk_unlikely(spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE &&
135 : bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK)) {
136 3 : ret = raid_bdev_verify_dix_reftag(raid_io->iovs, raid_io->iovcnt, io_opts.metadata,
137 3 : pd_blocks, bdev, raid_io->offset_blocks);
138 3 : if (ret != 0) {
139 0 : SPDK_ERRLOG("bdev io submit error due to DIX verify failure\n");
140 0 : raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
141 0 : return;
142 : }
143 : }
144 :
145 6 : ret = raid_bdev_writev_blocks_ext(base_info, base_ch,
146 : raid_io->iovs, raid_io->iovcnt,
147 : pd_lba, pd_blocks, raid0_bdev_io_completion,
148 : raid_io, &io_opts);
149 : } else {
150 0 : SPDK_ERRLOG("Recvd not supported io type %u\n", raid_io->type);
151 0 : assert(0);
152 : }
153 :
154 10 : if (ret == -ENOMEM) {
155 0 : raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
156 : base_ch, _raid0_submit_rw_request);
157 10 : } else if (ret != 0) {
158 0 : SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
159 0 : assert(false);
160 : raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
161 : }
162 : }
163 :
/*
 * raid0 IO range: description of a raid-level block range in terms of the
 * base bdevs it touches. Filled by _raid0_get_io_range() and consumed by
 * _raid0_split_io_range().
 */
struct raid_bdev_io_range {
	/* Strip size in blocks */
	uint64_t	strip_size;
	/* Per-disk strip index of the strip holding the first block */
	uint64_t	start_strip_in_disk;
	/* Per-disk strip index of the strip holding the last block */
	uint64_t	end_strip_in_disk;
	/* Offset of the first block inside its strip */
	uint64_t	start_offset_in_strip;
	/* Offset of the last block inside its strip */
	uint64_t	end_offset_in_strip;
	/* Index of the base bdev holding the first strip */
	uint8_t		start_disk;
	/* Index of the base bdev holding the last strip */
	uint8_t		end_disk;
	/* Number of base bdevs touched by the range */
	uint8_t		n_disks_involved;
};
175 :
176 : static inline void
177 1926 : _raid0_get_io_range(struct raid_bdev_io_range *io_range,
178 : uint8_t num_base_bdevs, uint64_t strip_size, uint64_t strip_size_shift,
179 : uint64_t offset_blocks, uint64_t num_blocks)
180 : {
181 : uint64_t start_strip;
182 : uint64_t end_strip;
183 : uint64_t total_blocks;
184 :
185 1926 : io_range->strip_size = strip_size;
186 1926 : total_blocks = offset_blocks + num_blocks - (num_blocks > 0);
187 :
188 : /* The start and end strip index in raid0 bdev scope */
189 1926 : start_strip = offset_blocks >> strip_size_shift;
190 1926 : end_strip = total_blocks >> strip_size_shift;
191 1926 : io_range->start_strip_in_disk = start_strip / num_base_bdevs;
192 1926 : io_range->end_strip_in_disk = end_strip / num_base_bdevs;
193 :
194 : /* The first strip may have unaligned start LBA offset.
195 : * The end strip may have unaligned end LBA offset.
196 : * Strips between them certainly have aligned offset and length to boundaries.
197 : */
198 1926 : io_range->start_offset_in_strip = offset_blocks % strip_size;
199 1926 : io_range->end_offset_in_strip = total_blocks % strip_size;
200 :
201 : /* The base bdev indexes in which start and end strips are located */
202 1926 : io_range->start_disk = start_strip % num_base_bdevs;
203 1926 : io_range->end_disk = end_strip % num_base_bdevs;
204 :
205 : /* Calculate how many base_bdevs are involved in io operation.
206 : * Number of base bdevs involved is between 1 and num_base_bdevs.
207 : * It will be 1 if the first strip and last strip are the same one.
208 : */
209 1926 : io_range->n_disks_involved = spdk_min((end_strip - start_strip + 1), num_base_bdevs);
210 1926 : }
211 :
212 : static inline void
213 35406 : _raid0_split_io_range(struct raid_bdev_io_range *io_range, uint8_t disk_idx,
214 : uint64_t *_offset_in_disk, uint64_t *_nblocks_in_disk)
215 : {
216 : uint64_t n_strips_in_disk;
217 : uint64_t start_offset_in_disk;
218 : uint64_t end_offset_in_disk;
219 : uint64_t offset_in_disk;
220 : uint64_t nblocks_in_disk;
221 : uint64_t start_strip_in_disk;
222 : uint64_t end_strip_in_disk;
223 :
224 35406 : start_strip_in_disk = io_range->start_strip_in_disk;
225 35406 : if (disk_idx < io_range->start_disk) {
226 14760 : start_strip_in_disk += 1;
227 : }
228 :
229 35406 : end_strip_in_disk = io_range->end_strip_in_disk;
230 35406 : if (disk_idx > io_range->end_disk) {
231 6966 : end_strip_in_disk -= 1;
232 : }
233 :
234 35406 : assert(end_strip_in_disk >= start_strip_in_disk);
235 35406 : n_strips_in_disk = end_strip_in_disk - start_strip_in_disk + 1;
236 :
237 35406 : if (disk_idx == io_range->start_disk) {
238 1926 : start_offset_in_disk = io_range->start_offset_in_strip;
239 : } else {
240 33480 : start_offset_in_disk = 0;
241 : }
242 :
243 35406 : if (disk_idx == io_range->end_disk) {
244 1926 : end_offset_in_disk = io_range->end_offset_in_strip;
245 : } else {
246 33480 : end_offset_in_disk = io_range->strip_size - 1;
247 : }
248 :
249 35406 : offset_in_disk = start_offset_in_disk + start_strip_in_disk * io_range->strip_size;
250 35406 : nblocks_in_disk = (n_strips_in_disk - 1) * io_range->strip_size
251 35406 : + end_offset_in_disk - start_offset_in_disk + 1;
252 :
253 35406 : SPDK_DEBUGLOG(bdev_raid0,
254 : "raid_bdev (strip_size 0x%" PRIx64 ") splits IO to base_bdev (%u) at (0x%" PRIx64 ", 0x%" PRIx64
255 : ").\n",
256 : io_range->strip_size, disk_idx, offset_in_disk, nblocks_in_disk);
257 :
258 35406 : *_offset_in_disk = offset_in_disk;
259 35406 : *_nblocks_in_disk = nblocks_in_disk;
260 35406 : }
261 :
static void raid0_submit_null_payload_request(struct raid_bdev_io *raid_io);

/*
 * Retry trampoline with a void * argument: it is handed to
 * raid_bdev_queue_io_wait() by raid0_submit_null_payload_request() and
 * resumes submission for the raid_bdev_io it receives.
 */
static void
_raid0_submit_null_payload_request(void *_raid_io)
{
	raid0_submit_null_payload_request((struct raid_bdev_io *)_raid_io);
}
271 :
272 : static void
273 35406 : raid0_base_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
274 : {
275 35406 : struct raid_bdev_io *raid_io = cb_arg;
276 :
277 35406 : raid_bdev_io_complete_part(raid_io, 1, success ?
278 : SPDK_BDEV_IO_STATUS_SUCCESS :
279 : SPDK_BDEV_IO_STATUS_FAILED);
280 :
281 35406 : spdk_bdev_free_io(bdev_io);
282 35406 : }
283 :
284 : /*
285 : * brief:
286 : * raid0_submit_null_payload_request function submits the next batch of
287 : * io requests with range but without payload, like FLUSH and UNMAP, to member disks;
288 : * it will submit as many as possible unless one base io request fails with -ENOMEM,
289 : * in which case it will queue itself for later submission.
290 : * params:
291 : * bdev_io - pointer to parent bdev_io on raid bdev device
292 : * returns:
293 : * none
294 : */
295 : static void
296 1926 : raid0_submit_null_payload_request(struct raid_bdev_io *raid_io)
297 : {
298 : struct raid_bdev *raid_bdev;
299 1926 : struct raid_bdev_io_range io_range;
300 : int ret;
301 : struct raid_base_bdev_info *base_info;
302 : struct spdk_io_channel *base_ch;
303 :
304 1926 : raid_bdev = raid_io->raid_bdev;
305 :
306 3852 : _raid0_get_io_range(&io_range, raid_bdev->num_base_bdevs,
307 1926 : raid_bdev->strip_size, raid_bdev->strip_size_shift,
308 : raid_io->offset_blocks, raid_io->num_blocks);
309 :
310 1926 : if (raid_io->base_bdev_io_remaining == 0) {
311 1926 : raid_io->base_bdev_io_remaining = io_range.n_disks_involved;
312 : }
313 :
314 37332 : while (raid_io->base_bdev_io_submitted < io_range.n_disks_involved) {
315 : uint8_t disk_idx;
316 35406 : uint64_t offset_in_disk;
317 35406 : uint64_t nblocks_in_disk;
318 :
319 : /* base_bdev is started from start_disk to end_disk.
320 : * It is possible that index of start_disk is larger than end_disk's.
321 : */
322 35406 : disk_idx = (io_range.start_disk + raid_io->base_bdev_io_submitted) % raid_bdev->num_base_bdevs;
323 35406 : base_info = &raid_bdev->base_bdev_info[disk_idx];
324 35406 : base_ch = raid_bdev_channel_get_base_channel(raid_io->raid_ch, disk_idx);
325 :
326 35406 : _raid0_split_io_range(&io_range, disk_idx, &offset_in_disk, &nblocks_in_disk);
327 :
328 35406 : switch (raid_io->type) {
329 35406 : case SPDK_BDEV_IO_TYPE_UNMAP:
330 35406 : ret = raid_bdev_unmap_blocks(base_info, base_ch,
331 : offset_in_disk, nblocks_in_disk,
332 : raid0_base_io_complete, raid_io);
333 35406 : break;
334 :
335 0 : case SPDK_BDEV_IO_TYPE_FLUSH:
336 0 : ret = raid_bdev_flush_blocks(base_info, base_ch,
337 : offset_in_disk, nblocks_in_disk,
338 : raid0_base_io_complete, raid_io);
339 0 : break;
340 :
341 0 : default:
342 0 : SPDK_ERRLOG("submit request, invalid io type with null payload %u\n", raid_io->type);
343 0 : assert(false);
344 : ret = -EIO;
345 : }
346 :
347 35406 : if (ret == 0) {
348 35406 : raid_io->base_bdev_io_submitted++;
349 0 : } else if (ret == -ENOMEM) {
350 0 : raid_bdev_queue_io_wait(raid_io, spdk_bdev_desc_get_bdev(base_info->desc),
351 : base_ch, _raid0_submit_null_payload_request);
352 0 : return;
353 : } else {
354 0 : SPDK_ERRLOG("bdev io submit error not due to ENOMEM, it should not happen\n");
355 0 : assert(false);
356 : raid_bdev_io_complete(raid_io, SPDK_BDEV_IO_STATUS_FAILED);
357 : return;
358 : }
359 : }
360 : }
361 :
362 : static int
363 8 : raid0_start(struct raid_bdev *raid_bdev)
364 : {
365 8 : uint64_t min_blockcnt = UINT64_MAX;
366 : uint64_t base_bdev_data_size;
367 : struct raid_base_bdev_info *base_info;
368 :
369 264 : RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
370 : /* Calculate minimum block count from all base bdevs */
371 256 : min_blockcnt = spdk_min(min_blockcnt, base_info->data_size);
372 : }
373 :
374 8 : base_bdev_data_size = (min_blockcnt >> raid_bdev->strip_size_shift) << raid_bdev->strip_size_shift;
375 :
376 264 : RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
377 256 : base_info->data_size = base_bdev_data_size;
378 : }
379 :
380 : /*
381 : * Take the minimum block count based approach where total block count
382 : * of raid bdev is the number of base bdev times the minimum block count
383 : * of any base bdev.
384 : */
385 8 : SPDK_DEBUGLOG(bdev_raid0, "min blockcount %" PRIu64 ", numbasedev %u, strip size shift %u\n",
386 : min_blockcnt, raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift);
387 :
388 8 : raid_bdev->bdev.blockcnt = base_bdev_data_size * raid_bdev->num_base_bdevs;
389 :
390 8 : if (raid_bdev->num_base_bdevs > 1) {
391 8 : raid_bdev->bdev.optimal_io_boundary = raid_bdev->strip_size;
392 8 : raid_bdev->bdev.split_on_optimal_io_boundary = true;
393 : } else {
394 : /* Do not need to split reads/writes on single bdev RAID modules. */
395 0 : raid_bdev->bdev.optimal_io_boundary = 0;
396 0 : raid_bdev->bdev.split_on_optimal_io_boundary = false;
397 : }
398 :
399 8 : return 0;
400 : }
401 :
402 : static bool
403 0 : raid0_resize(struct raid_bdev *raid_bdev)
404 : {
405 : uint64_t blockcnt;
406 : int rc;
407 0 : uint64_t min_blockcnt = UINT64_MAX;
408 : struct raid_base_bdev_info *base_info;
409 : uint64_t base_bdev_data_size;
410 :
411 0 : RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
412 0 : struct spdk_bdev *base_bdev = spdk_bdev_desc_get_bdev(base_info->desc);
413 :
414 0 : min_blockcnt = spdk_min(min_blockcnt, base_bdev->blockcnt - base_info->data_offset);
415 : }
416 :
417 0 : base_bdev_data_size = (min_blockcnt >> raid_bdev->strip_size_shift) << raid_bdev->strip_size_shift;
418 0 : blockcnt = base_bdev_data_size * raid_bdev->num_base_bdevs;
419 :
420 0 : if (blockcnt == raid_bdev->bdev.blockcnt) {
421 0 : return false;
422 : }
423 :
424 0 : rc = spdk_bdev_notify_blockcnt_change(&raid_bdev->bdev, blockcnt);
425 0 : if (rc != 0) {
426 0 : SPDK_ERRLOG("Failed to notify blockcount change\n");
427 0 : return false;
428 : }
429 :
430 0 : RAID_FOR_EACH_BASE_BDEV(raid_bdev, base_info) {
431 0 : base_info->data_size = base_bdev_data_size;
432 : }
433 :
434 0 : return true;
435 : }
436 :
437 : static struct raid_bdev_module g_raid0_module = {
438 : .level = RAID0,
439 : .base_bdevs_min = 1,
440 : .memory_domains_supported = true,
441 : .dif_supported = true,
442 : .start = raid0_start,
443 : .submit_rw_request = raid0_submit_rw_request,
444 : .submit_null_payload_request = raid0_submit_null_payload_request,
445 : .resize = raid0_resize,
446 : };
447 1 : RAID_MODULE_REGISTER(&g_raid0_module)
448 :
449 1 : SPDK_LOG_REGISTER_COMPONENT(bdev_raid0)
|