Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2018 Intel Corporation.
3 : * All rights reserved.
4 : */
5 :
6 : #ifndef SPDK_BDEV_RAID_INTERNAL_H
7 : #define SPDK_BDEV_RAID_INTERNAL_H
8 :
9 : #include "spdk/bdev_module.h"
10 : #include "spdk/uuid.h"
11 :
12 : #define RAID_BDEV_MIN_DATA_OFFSET_SIZE (1024*1024) /* 1 MiB */
13 :
/* Numeric identifiers of the RAID levels. */
enum raid_level {
	INVALID_RAID_LEVEL	= -1,
	RAID0			= 0,
	RAID1			= 1,
	RAID5F			= 0x5f,	/* 95 */
	CONCAT			= 99,
};
21 :
/*
 * State of a raid bdev. A raid bdev is either on the configured list or on
 * the configuring list.
 */
enum raid_bdev_state {
	/* The raid bdev is ready and visible to upper layers. */
	RAID_BDEV_STATE_ONLINE		= 0,

	/*
	 * The raid bdev is still being configured: not all underlying bdevs are
	 * present yet and it is not visible to upper layers.
	 */
	RAID_BDEV_STATE_CONFIGURING	= 1,

	/*
	 * While offline, the raid bdev layer completes all incoming commands
	 * without submitting them to the underlying base nvme bdevs.
	 */
	RAID_BDEV_STATE_OFFLINE		= 2,

	/* Sentinel - new states must be added before this one. */
	RAID_BDEV_STATE_MAX		= 3
};
45 :
/* Kinds of background process that a raid bdev can run. */
enum raid_process_type {
	RAID_PROCESS_NONE	= 0,
	RAID_PROCESS_REBUILD	= 1,
	/* Sentinel - keep last. */
	RAID_PROCESS_MAX	= 2
};
51 :
52 : typedef void (*raid_base_bdev_cb)(void *ctx, int status);
53 :
54 : /*
55 : * raid_base_bdev_info contains information for the base bdevs which are part of some
56 : * raid. This structure contains the per base bdev information. Whatever is
57 : * required per base device for raid bdev will be kept here
58 : */
59 : struct raid_base_bdev_info {
60 : /* The raid bdev that this base bdev belongs to */
61 : struct raid_bdev *raid_bdev;
62 :
63 : /* name of the bdev */
64 : char *name;
65 :
66 : /* uuid of the bdev */
67 : struct spdk_uuid uuid;
68 :
69 : /*
70 : * Pointer to base bdev descriptor opened by raid bdev. This is NULL when the bdev for
71 : * this slot is missing.
72 : */
73 : struct spdk_bdev_desc *desc;
74 :
75 : /* offset in blocks from the start of the base bdev to the start of the data region */
76 : uint64_t data_offset;
77 :
78 : /* size in blocks of the base bdev's data region */
79 : uint64_t data_size;
80 :
81 : /*
82 : * When underlying base device calls the hot plug function on drive removal,
83 : * this flag will be set and later after doing some processing, base device
84 : * descriptor will be closed
85 : */
86 : bool remove_scheduled;
87 :
88 : /* callback for base bdev removal */
89 : raid_base_bdev_cb remove_cb;
90 :
91 : /* context of the callback */
92 : void *remove_cb_ctx;
93 :
94 : /* Hold the number of blocks to know how large the base bdev is resized. */
95 : uint64_t blockcnt;
96 :
97 : /* io channel for the app thread */
98 : struct spdk_io_channel *app_thread_ch;
99 :
100 : /* Set to true when base bdev has completed the configuration process */
101 : bool is_configured;
102 :
103 : /* Set to true if this base bdev is the target of a background process */
104 : bool is_process_target;
105 :
106 : /* Set to true to indicate that the base bdev is being removed because of a failure */
107 : bool is_failed;
108 :
109 : /* callback for base bdev configuration */
110 : raid_base_bdev_cb configure_cb;
111 :
112 : /* context of the callback */
113 : void *configure_cb_ctx;
114 : };
115 :
116 : struct raid_bdev_io;
117 : typedef void (*raid_bdev_io_completion_cb)(struct raid_bdev_io *raid_io,
118 : enum spdk_bdev_io_status status);
119 :
120 : /*
121 : * raid_bdev_io is the context part of bdev_io. It contains the information
122 : * related to bdev_io for a raid bdev
123 : */
124 : struct raid_bdev_io {
125 : /* The raid bdev associated with this IO */
126 : struct raid_bdev *raid_bdev;
127 :
128 : uint64_t offset_blocks;
129 : uint64_t num_blocks;
130 : struct iovec *iovs;
131 : int iovcnt;
132 : enum spdk_bdev_io_type type;
133 : struct spdk_memory_domain *memory_domain;
134 : void *memory_domain_ctx;
135 : void *md_buf;
136 :
137 : /* WaitQ entry, used only in waitq logic */
138 : struct spdk_bdev_io_wait_entry waitq_entry;
139 :
140 : /* Context of the original channel for this IO */
141 : struct raid_bdev_io_channel *raid_ch;
142 :
143 : /* Used for tracking progress on io requests sent to member disks. */
144 : uint64_t base_bdev_io_remaining;
145 : uint8_t base_bdev_io_submitted;
146 : enum spdk_bdev_io_status base_bdev_io_status;
147 : /* This will be the raid_io completion status unless any base io's status is different. */
148 : enum spdk_bdev_io_status base_bdev_io_status_default;
149 :
150 : /* Private data for the raid module */
151 : void *module_private;
152 :
153 : /* Custom completion callback. Overrides bdev_io completion if set. */
154 : raid_bdev_io_completion_cb completion_cb;
155 :
156 : struct {
157 : uint64_t offset;
158 : struct iovec *iov;
159 : struct iovec iov_copy;
160 : } split;
161 : };
162 :
163 : struct raid_bdev_process_request {
164 : struct raid_bdev_process *process;
165 : struct raid_base_bdev_info *target;
166 : struct spdk_io_channel *target_ch;
167 : uint64_t offset_blocks;
168 : uint32_t num_blocks;
169 : struct iovec iov;
170 : void *md_buf;
171 : /* bdev_io is raid_io's driver_ctx - don't reorder them!
172 : * These are needed for re-using raid module I/O functions for process I/O. */
173 : struct spdk_bdev_io bdev_io;
174 : struct raid_bdev_io raid_io;
175 : TAILQ_ENTRY(raid_bdev_process_request) link;
176 : };
177 :
178 : /*
179 : * raid_bdev is the single entity structure which contains SPDK block device
180 : * and the information related to any raid bdev either configured or
181 : * in configuring list. io device is created on this.
182 : */
183 : struct raid_bdev {
184 : /* raid bdev device, this will get registered in bdev layer */
185 : struct spdk_bdev bdev;
186 :
187 : /* the raid bdev descriptor, opened for internal use */
188 : struct spdk_bdev_desc *self_desc;
189 :
190 : /* link of raid bdev to link it to global raid bdev list */
191 : TAILQ_ENTRY(raid_bdev) global_link;
192 :
193 : /* array of base bdev info */
194 : struct raid_base_bdev_info *base_bdev_info;
195 :
196 : /* strip size of raid bdev in blocks */
197 : uint32_t strip_size;
198 :
199 : /* strip size of raid bdev in KB */
200 : uint32_t strip_size_kb;
201 :
202 : /* strip size bit shift for optimized calculation */
203 : uint32_t strip_size_shift;
204 :
205 : /* state of raid bdev */
206 : enum raid_bdev_state state;
207 :
208 : /* number of base bdevs comprising raid bdev */
209 : uint8_t num_base_bdevs;
210 :
211 : /* number of base bdevs discovered */
212 : uint8_t num_base_bdevs_discovered;
213 :
214 : /*
215 : * Number of operational base bdevs, i.e. how many we know/expect to be working. This
216 : * will be less than num_base_bdevs when starting a degraded array.
217 : */
218 : uint8_t num_base_bdevs_operational;
219 :
220 : /* minimum number of viable base bdevs that are required by array to operate */
221 : uint8_t min_base_bdevs_operational;
222 :
223 : /* Raid Level of this raid bdev */
224 : enum raid_level level;
225 :
226 : /* Set to true if destroy of this raid bdev is started. */
227 : bool destroy_started;
228 :
229 : /* Module for RAID-level specific operations */
230 : struct raid_bdev_module *module;
231 :
232 : /* Private data for the raid module */
233 : void *module_private;
234 :
235 : /* Superblock */
236 : bool superblock_enabled;
237 : struct raid_bdev_superblock *sb;
238 :
239 : /* Superblock buffer used for I/O */
240 : void *sb_io_buf;
241 : uint32_t sb_io_buf_size;
242 :
243 : /* Raid bdev background process, e.g. rebuild */
244 : struct raid_bdev_process *process;
245 : };
246 :
/*
 * Iterate over every base bdev info slot of a raid bdev.
 *
 * r - pointer to an object with base_bdev_info and num_base_bdevs members
 *     (e.g. struct raid_bdev *). It is evaluated several times per
 *     iteration, so it must not have side effects.
 * i - iterator lvalue of the element pointer type; it is assigned by the macro.
 *
 * Both arguments are parenthesized so that expressions such as `base + 1`
 * or `&array[0]` expand correctly.
 */
#define RAID_FOR_EACH_BASE_BDEV(r, i) \
	for ((i) = (r)->base_bdev_info; \
	     (i) < (r)->base_bdev_info + (r)->num_base_bdevs; \
	     (i)++)
249 :
struct raid_bdev_io_channel;

/* TAILQ head type for the list of raid bdevs */
TAILQ_HEAD(raid_all_tailq, raid_bdev);

/* The raid bdev list itself; defined outside of this header */
extern struct raid_all_tailq	g_raid_bdev_list;
256 :
257 : typedef void (*raid_bdev_destruct_cb)(void *cb_ctx, int rc);
258 :
259 : int raid_bdev_create(const char *name, uint32_t strip_size, uint8_t num_base_bdevs,
260 : enum raid_level level, bool superblock, const struct spdk_uuid *uuid,
261 : struct raid_bdev **raid_bdev_out);
262 : void raid_bdev_delete(struct raid_bdev *raid_bdev, raid_bdev_destruct_cb cb_fn, void *cb_ctx);
263 : int raid_bdev_add_base_bdev(struct raid_bdev *raid_bdev, const char *name,
264 : raid_base_bdev_cb cb_fn, void *cb_ctx);
265 : struct raid_bdev *raid_bdev_find_by_name(const char *name);
266 : enum raid_level raid_bdev_str_to_level(const char *str);
267 : const char *raid_bdev_level_to_str(enum raid_level level);
268 : enum raid_bdev_state raid_bdev_str_to_state(const char *str);
269 : const char *raid_bdev_state_to_str(enum raid_bdev_state state);
270 : const char *raid_bdev_process_to_str(enum raid_process_type value);
271 : void raid_bdev_write_info_json(struct raid_bdev *raid_bdev, struct spdk_json_write_ctx *w);
272 : int raid_bdev_remove_base_bdev(struct spdk_bdev *base_bdev, raid_base_bdev_cb cb_fn, void *cb_ctx);
273 :
274 : /*
275 : * RAID module descriptor
276 : */
277 : struct raid_bdev_module {
278 : /* RAID level implemented by this module */
279 : enum raid_level level;
280 :
281 : /* Minimum required number of base bdevs. Must be > 0. */
282 : uint8_t base_bdevs_min;
283 :
284 : /*
285 : * RAID constraint. Determines number of base bdevs that can be removed
286 : * without failing the array.
287 : */
288 : struct {
289 : enum {
290 : CONSTRAINT_UNSET = 0,
291 : CONSTRAINT_MAX_BASE_BDEVS_REMOVED,
292 : CONSTRAINT_MIN_BASE_BDEVS_OPERATIONAL,
293 : } type;
294 : uint8_t value;
295 : } base_bdevs_constraint;
296 :
297 : /* Set to true if this module supports memory domains. */
298 : bool memory_domains_supported;
299 :
300 : /* Set to true if this module supports DIF/DIX */
301 : bool dif_supported;
302 :
303 : /*
304 : * Called when the raid is starting, right before changing the state to
305 : * online and registering the bdev. Parameters of the bdev like blockcnt
306 : * should be set here.
307 : *
308 : * Non-zero return value will abort the startup process.
309 : */
310 : int (*start)(struct raid_bdev *raid_bdev);
311 :
312 : /*
313 : * Called when the raid is stopping, right before changing the state to
314 : * offline and unregistering the bdev. Optional.
315 : *
316 : * The function should return false if it is asynchronous. Then, after
317 : * the async operation has completed and the module is fully stopped
318 : * raid_bdev_module_stop_done() must be called.
319 : */
320 : bool (*stop)(struct raid_bdev *raid_bdev);
321 :
322 : /* Handler for R/W requests */
323 : void (*submit_rw_request)(struct raid_bdev_io *raid_io);
324 :
325 : /* Handler for requests without payload (flush, unmap). Optional. */
326 : void (*submit_null_payload_request)(struct raid_bdev_io *raid_io);
327 :
328 : /*
329 : * Called when the bdev's IO channel is created to get the module's private IO channel.
330 : * Optional.
331 : */
332 : struct spdk_io_channel *(*get_io_channel)(struct raid_bdev *raid_bdev);
333 :
334 : /*
335 : * Called when a base_bdev is resized to resize the raid if the condition
336 : * is satisfied. Optional.
337 : *
338 : * Returns true if the resize was performed.
339 : */
340 : bool (*resize)(struct raid_bdev *raid_bdev);
341 :
342 : /* Handler for raid process requests. Required for raid modules with redundancy. */
343 : int (*submit_process_request)(struct raid_bdev_process_request *process_req,
344 : struct raid_bdev_io_channel *raid_ch);
345 :
346 : TAILQ_ENTRY(raid_bdev_module) link;
347 : };
348 :
void raid_bdev_module_list_add(struct raid_bdev_module *raid_module);

/*
 * Two-level expansion so that __LINE__ is expanded to its number before it
 * is pasted into the generated function name.
 */
#define __RAID_MODULE_REGISTER_(line)	raid_module_register_##line
#define __RAID_MODULE_REGISTER(line)	__RAID_MODULE_REGISTER_(line)

/*
 * Emit a static constructor function, named after the current source line,
 * that passes _module to raid_bdev_module_list_add().
 */
#define RAID_MODULE_REGISTER(_module)					\
__attribute__((constructor)) static void				\
__RAID_MODULE_REGISTER(__LINE__)(void)					\
{									\
	raid_bdev_module_list_add(_module);				\
}
360 :
361 : bool raid_bdev_io_complete_part(struct raid_bdev_io *raid_io, uint64_t completed,
362 : enum spdk_bdev_io_status status);
363 : void raid_bdev_queue_io_wait(struct raid_bdev_io *raid_io, struct spdk_bdev *bdev,
364 : struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn);
365 : void raid_bdev_io_complete(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status);
366 : void raid_bdev_module_stop_done(struct raid_bdev *raid_bdev);
367 : struct spdk_io_channel *raid_bdev_channel_get_base_channel(struct raid_bdev_io_channel *raid_ch,
368 : uint8_t idx);
369 : void *raid_bdev_channel_get_module_ctx(struct raid_bdev_io_channel *raid_ch);
370 : struct raid_base_bdev_info *raid_bdev_channel_get_base_info(struct raid_bdev_io_channel *raid_ch,
371 : struct spdk_bdev *base_bdev);
372 : void raid_bdev_process_request_complete(struct raid_bdev_process_request *process_req, int status);
373 : void raid_bdev_io_init(struct raid_bdev_io *raid_io, struct raid_bdev_io_channel *raid_ch,
374 : enum spdk_bdev_io_type type, uint64_t offset_blocks,
375 : uint64_t num_blocks, struct iovec *iovs, int iovcnt, void *md_buf,
376 : struct spdk_memory_domain *memory_domain, void *memory_domain_ctx);
377 : void raid_bdev_fail_base_bdev(struct raid_base_bdev_info *base_info);
378 :
379 : static inline uint8_t
380 134 : raid_bdev_base_bdev_slot(struct raid_base_bdev_info *base_info)
381 : {
382 134 : return base_info - base_info->raid_bdev->base_bdev_info;
383 : }
384 :
385 : static inline void
386 3682 : raid_bdev_io_set_default_status(struct raid_bdev_io *raid_io, enum spdk_bdev_io_status status)
387 : {
388 3682 : assert(raid_io->base_bdev_io_submitted == 0);
389 3682 : raid_io->base_bdev_io_status = status;
390 3682 : raid_io->base_bdev_io_status_default = status;
391 3682 : }
392 :
/* DIX reference tag helpers operating on a metadata buffer */
int raid_bdev_remap_dix_reftag(void *md_buf, uint64_t num_blocks,
			       struct spdk_bdev *bdev, uint32_t remapped_offset);
int raid_bdev_verify_dix_reftag(struct iovec *iovs, int iovcnt, void *md_buf,
				uint64_t num_blocks, struct spdk_bdev *bdev,
				uint32_t offset_blocks);
397 :
398 : /**
399 : * Raid bdev I/O read/write wrapper for spdk_bdev_readv_blocks_ext function.
400 : */
401 : static inline int
402 1684 : raid_bdev_readv_blocks_ext(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch,
403 : struct iovec *iov, int iovcnt, uint64_t offset_blocks,
404 : uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg,
405 : struct spdk_bdev_ext_io_opts *opts)
406 : {
407 3368 : return spdk_bdev_readv_blocks_ext(base_info->desc, ch, iov, iovcnt,
408 1684 : base_info->data_offset + offset_blocks, num_blocks, cb, cb_arg, opts);
409 : }
410 :
411 : /**
412 : * Raid bdev I/O read/write wrapper for spdk_bdev_writev_blocks_ext function.
413 : */
414 : static inline int
415 414 : raid_bdev_writev_blocks_ext(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch,
416 : struct iovec *iov, int iovcnt, uint64_t offset_blocks,
417 : uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg,
418 : struct spdk_bdev_ext_io_opts *opts)
419 : {
420 : int rc;
421 414 : uint64_t remapped_offset_blocks = base_info->data_offset + offset_blocks;
422 :
423 414 : if (spdk_unlikely(spdk_bdev_get_dif_type(&base_info->raid_bdev->bdev) != SPDK_DIF_DISABLE &&
424 : (base_info->raid_bdev->bdev.dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK))) {
425 3 : rc = raid_bdev_remap_dix_reftag(opts->metadata, num_blocks, &base_info->raid_bdev->bdev,
426 : remapped_offset_blocks);
427 3 : if (rc != 0) {
428 0 : return rc;
429 : }
430 : }
431 :
432 414 : return spdk_bdev_writev_blocks_ext(base_info->desc, ch, iov, iovcnt,
433 : remapped_offset_blocks, num_blocks, cb, cb_arg, opts);
434 : }
435 :
436 : /**
437 : * Raid bdev I/O read/write wrapper for spdk_bdev_unmap_blocks function.
438 : */
439 : static inline int
440 35504 : raid_bdev_unmap_blocks(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch,
441 : uint64_t offset_blocks, uint64_t num_blocks,
442 : spdk_bdev_io_completion_cb cb, void *cb_arg)
443 : {
444 35504 : return spdk_bdev_unmap_blocks(base_info->desc, ch, base_info->data_offset + offset_blocks,
445 : num_blocks, cb, cb_arg);
446 : }
447 :
448 : /**
449 : * Raid bdev I/O read/write wrapper for spdk_bdev_flush_blocks function.
450 : */
451 : static inline int
452 98 : raid_bdev_flush_blocks(struct raid_base_bdev_info *base_info, struct spdk_io_channel *ch,
453 : uint64_t offset_blocks, uint64_t num_blocks,
454 : spdk_bdev_io_completion_cb cb, void *cb_arg)
455 : {
456 98 : return spdk_bdev_flush_blocks(base_info->desc, ch, base_info->data_offset + offset_blocks,
457 : num_blocks, cb, cb_arg);
458 : }
459 :
/*
 * Raid bdev superblock definitions
 */

/* Superblock format version */
#define RAID_BDEV_SB_VERSION_MAJOR	1
#define RAID_BDEV_SB_VERSION_MINOR	0

/* Size, in bytes, of the name field of the superblock */
#define RAID_BDEV_SB_NAME_SIZE		64

/* Base bdev state values used by the superblock */
enum raid_bdev_sb_base_bdev_state {
	RAID_SB_BASE_BDEV_MISSING	= 0,
	RAID_SB_BASE_BDEV_CONFIGURED	= 1,
	RAID_SB_BASE_BDEV_FAILED	= 2,
	RAID_SB_BASE_BDEV_SPARE		= 3,
};
475 :
476 : struct raid_bdev_sb_base_bdev {
477 : /* uuid of the base bdev */
478 : struct spdk_uuid uuid;
479 : /* offset in blocks from base device start to the start of raid data area */
480 : uint64_t data_offset;
481 : /* size in blocks of the base device raid data area */
482 : uint64_t data_size;
483 : /* state of the base bdev */
484 : uint32_t state;
485 : /* feature/status flags */
486 : uint32_t flags;
487 : /* slot number of this base bdev in the raid */
488 : uint8_t slot;
489 :
490 : uint8_t reserved[23];
491 : };
492 : SPDK_STATIC_ASSERT(sizeof(struct raid_bdev_sb_base_bdev) == 64, "incorrect size");
493 :
494 : struct raid_bdev_superblock {
495 : #define RAID_BDEV_SB_SIG "SPDKRAID"
496 : uint8_t signature[8];
497 : struct {
498 : /* incremented when a breaking change in the superblock structure is made */
499 : uint16_t major;
500 : /* incremented for changes in the superblock that are backward compatible */
501 : uint16_t minor;
502 : } version;
503 : /* length in bytes of the entire superblock */
504 : uint32_t length;
505 : /* crc32c checksum of the entire superblock */
506 : uint32_t crc;
507 : /* feature/status flags */
508 : uint32_t flags;
509 : /* unique id of the raid bdev */
510 : struct spdk_uuid uuid;
511 : /* name of the raid bdev */
512 : uint8_t name[RAID_BDEV_SB_NAME_SIZE];
513 : /* size of the raid bdev in blocks */
514 : uint64_t raid_size;
515 : /* the raid bdev block size - must be the same for all base bdevs */
516 : uint32_t block_size;
517 : /* the raid level */
518 : uint32_t level;
519 : /* strip (chunk) size in blocks */
520 : uint32_t strip_size;
521 : /* state of the raid */
522 : uint32_t state;
523 : /* sequence number, incremented on every superblock update */
524 : uint64_t seq_number;
525 : /* number of raid base devices */
526 : uint8_t num_base_bdevs;
527 :
528 : uint8_t reserved[118];
529 :
530 : /* size of the base bdevs array */
531 : uint8_t base_bdevs_size;
532 : /* array of base bdev descriptors */
533 : struct raid_bdev_sb_base_bdev base_bdevs[];
534 : };
535 : SPDK_STATIC_ASSERT(sizeof(struct raid_bdev_superblock) == 256, "incorrect size");
536 :
537 : #define RAID_BDEV_SB_MAX_LENGTH (sizeof(struct raid_bdev_superblock) + UINT8_MAX * sizeof(struct raid_bdev_sb_base_bdev))
538 :
539 : SPDK_STATIC_ASSERT(RAID_BDEV_SB_MAX_LENGTH < RAID_BDEV_MIN_DATA_OFFSET_SIZE,
540 : "Incorrect min data offset");
541 :
/* Callback type accepted by raid_bdev_write_superblock() */
typedef void (*raid_bdev_write_sb_cb)(int status, struct raid_bdev *raid_bdev, void *ctx);
/* Callback type accepted by raid_bdev_load_base_bdev_superblock() */
typedef void (*raid_bdev_load_sb_cb)(const struct raid_bdev_superblock *sb, int status,
				     void *ctx);

/* Superblock allocation, initialization and I/O */
int raid_bdev_alloc_superblock(struct raid_bdev *raid_bdev, uint32_t block_size);
void raid_bdev_free_superblock(struct raid_bdev *raid_bdev);
void raid_bdev_init_superblock(struct raid_bdev *raid_bdev);
void raid_bdev_write_superblock(struct raid_bdev *raid_bdev, raid_bdev_write_sb_cb cb,
				void *cb_ctx);
int raid_bdev_load_base_bdev_superblock(struct spdk_bdev_desc *desc,
					struct spdk_io_channel *ch,
					raid_bdev_load_sb_cb cb, void *cb_ctx);
552 :
/* Options of the raid bdev module */
struct spdk_raid_bdev_opts {
	/* Background process window size, in KiB */
	uint32_t	process_window_size_kb;
};

/* Accessors for the options: the getter fills *opts, the setter reads from *opts */
void raid_bdev_get_opts(struct spdk_raid_bdev_opts *opts);
int raid_bdev_set_opts(const struct spdk_raid_bdev_opts *opts);
560 :
561 : #endif /* SPDK_BDEV_RAID_INTERNAL_H */
|