Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2016 Intel Corporation. All rights reserved.
3 : * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
4 : * Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5 : * Copyright (c) 2022 Dell Inc, or its subsidiaries. All rights reserved.
6 : */
7 :
8 : #include "spdk/stdinc.h"
9 :
10 : #include "bdev_nvme.h"
11 :
12 : #include "spdk/accel.h"
13 : #include "spdk/config.h"
14 : #include "spdk/endian.h"
15 : #include "spdk/bdev.h"
16 : #include "spdk/json.h"
17 : #include "spdk/keyring.h"
18 : #include "spdk/likely.h"
19 : #include "spdk/nvme.h"
20 : #include "spdk/nvme_ocssd.h"
21 : #include "spdk/nvme_zns.h"
22 : #include "spdk/opal.h"
23 : #include "spdk/thread.h"
24 : #include "spdk/trace.h"
25 : #include "spdk/string.h"
26 : #include "spdk/util.h"
27 : #include "spdk/uuid.h"
28 :
29 : #include "spdk/bdev_module.h"
30 : #include "spdk/log.h"
31 :
32 : #include "spdk_internal/usdt.h"
33 : #include "spdk_internal/trace_defs.h"
34 :
/* String identifying a controller in log prefixes: the subsystem NQN when the active
 * path uses a fabrics transport, otherwise the transport address of the active path.
 * The argument is evaluated more than once; do not pass an expression with side effects.
 */
#define CTRLR_STRING(nvme_ctrlr) \
	(spdk_nvme_trtype_is_fabrics((nvme_ctrlr)->active_path_id->trid.trtype) ? \
	 (nvme_ctrlr)->active_path_id->trid.subnqn : (nvme_ctrlr)->active_path_id->trid.traddr)

/* Controller ID as returned by spdk_nvme_ctrlr_get_id() for the underlying controller. */
#define CTRLR_ID(nvme_ctrlr)	(spdk_nvme_ctrlr_get_id((nvme_ctrlr)->ctrlr))

/* Per-controller log helpers. Each one prefixes the message with
 * "[<CTRLR_STRING>, <CTRLR_ID>] ".
 *
 * NOTE(review): the ERR/WARN/NOTICE/INFO helpers still expand with a trailing ';'.
 * It is kept because not every call site is visible from here; new code should
 * always terminate the invocation with its own ';' and use braces around it.
 */
#define NVME_CTRLR_ERRLOG(ctrlr, format, ...) \
	SPDK_ERRLOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#define NVME_CTRLR_WARNLOG(ctrlr, format, ...) \
	SPDK_WARNLOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#define NVME_CTRLR_NOTICELOG(ctrlr, format, ...) \
	SPDK_NOTICELOG("[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

#define NVME_CTRLR_INFOLOG(ctrlr, format, ...) \
	SPDK_INFOLOG(bdev_nvme, "[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__);

/* The non-DEBUG expansion carries no trailing ';', so every call site must already
 * supply one. The DEBUG expansion therefore omits it too, making both build flavours
 * parse identically.
 */
#ifdef DEBUG
#define NVME_CTRLR_DEBUGLOG(ctrlr, format, ...) \
	SPDK_DEBUGLOG(bdev_nvme, "[%s, %u] " format, CTRLR_STRING(ctrlr), CTRLR_ID(ctrlr), ##__VA_ARGS__)
#else
#define NVME_CTRLR_DEBUGLOG(ctrlr, ...) do { } while (0)
#endif

/* Name of the bdev backing an nvme_bdev, used as the log prefix below. */
#define BDEV_STRING(nbdev) ((nbdev)->disk.name)

/* Per-bdev log helpers. Each one prefixes the message with "[<bdev name>] ". */
#define NVME_BDEV_ERRLOG(nbdev, format, ...) \
	SPDK_ERRLOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define NVME_BDEV_WARNLOG(nbdev, format, ...) \
	SPDK_WARNLOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define NVME_BDEV_NOTICELOG(nbdev, format, ...) \
	SPDK_NOTICELOG("[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);

#define NVME_BDEV_INFOLOG(nbdev, format, ...) \
	SPDK_INFOLOG(bdev_nvme, "[%s] " format, BDEV_STRING(nbdev), ##__VA_ARGS__);
73 :
/* Default values used to initialize the delay_cmd_submit and keep_alive_timeout_ms
 * members of g_opts.
 */
74 : #define SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT true
75 : #define SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS (10000)
76 :
/* NOTE(review): presumably the buffer length for a namespace ID rendered as a string;
 * the user is outside this view - confirm.
 */
77 : #define NSID_STR_LEN 10
78 :
/* NOTE(review): presumably the maximum controller name length; confirm against users. */
79 : #define SPDK_CONTROLLER_NAME_MAX 512
80 :
81 : static int bdev_nvme_config_json(struct spdk_json_write_ctx *w);
82 :
/* Per-I/O driver context of the NVMe bdev module; its size is what
 * bdev_nvme_get_ctx_size() reports.
 */
83 : struct nvme_bdev_io {
84 : /** Array of iovecs to transfer. */
85 : struct iovec *iovs;
86 :
87 : /** Number of iovecs in iovs array. */
88 : int iovcnt;
89 :
90 : /** Current iovec position. */
91 : int iovpos;
92 :
93 : /** Offset in current iovec. */
94 : uint32_t iov_offset;
95 :
96 : /** Offset in current fused iovec. */
97 : uint32_t fused_iov_offset;
98 :
99 : /** Array of iovecs to transfer for the fused command. */
100 : struct iovec *fused_iovs;
101 :
102 : /** Number of iovecs in fused_iovs array. */
103 : int fused_iovcnt;
104 :
105 : /** Current position in the fused_iovs array. */
106 : int fused_iovpos;
107 :
108 : /** I/O path the current I/O or admin passthrough is submitted on, or the I/O path
109 : * being reset in a reset I/O.
110 : */
111 : struct nvme_io_path *io_path;
112 :
113 : /** Saved status for admin passthru completion event, PI error verification, or intermediate compare-and-write status */
114 : struct spdk_nvme_cpl cpl;
115 :
116 : /** Extended IO opts passed by the user to bdev layer and mapped to NVME format */
117 : struct spdk_nvme_ns_cmd_ext_io_opts ext_opts;
118 :
119 : /** Keeps track if first of fused commands was submitted */
120 : bool first_fused_submitted;
121 :
122 : /** Keeps track if first of fused commands was completed */
123 : bool first_fused_completed;
124 :
125 : /** How many times the current I/O was retried. */
126 : int32_t retry_count;
127 :
128 : /** Expiration value in ticks to retry the current I/O. */
129 : uint64_t retry_ticks;
130 :
131 : /** Temporary pointer to zone report buffer */
132 : struct spdk_nvme_zns_zone_report *zone_report_buf;
133 :
134 : /** Keep track of how many zones that have been copied to the spdk_bdev_zone_info struct */
135 : uint64_t handled_zones;
136 :
137 : /** Current tsc at submit time. */
138 : uint64_t submit_tsc;
139 :
140 : /** Links this nvme_bdev_io into a retry list (see nvme_bdev_channel.retry_io_list). */
141 : TAILQ_ENTRY(nvme_bdev_io) retry_link;
142 : };
143 :
/* One transport ID recorded in the g_skipped_nvme_ctrlrs list. */
144 : struct nvme_probe_skip_entry {
145 : struct spdk_nvme_transport_id trid;
146 : TAILQ_ENTRY(nvme_probe_skip_entry) tailq;
147 : };
148 : /* All the controllers deleted by users via RPC are skipped by hotplug monitor */
149 : static TAILQ_HEAD(, nvme_probe_skip_entry) g_skipped_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(
150 : g_skipped_nvme_ctrlrs);
151 :
/* Bitmask of DH-HMAC-CHAP hash functions used as the default for g_opts.dhchap_digests. */
152 : #define BDEV_NVME_DEFAULT_DIGESTS (SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA256) | \
153 : SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA384) | \
154 : SPDK_BIT(SPDK_NVMF_DHCHAP_HASH_SHA512))
155 :
/* Bitmask of DH-HMAC-CHAP Diffie-Hellman groups used as the default for
 * g_opts.dhchap_dhgroups.
 */
156 : #define BDEV_NVME_DEFAULT_DHGROUPS (SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_NULL) | \
157 : SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_2048) | \
158 : SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_3072) | \
159 : SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_4096) | \
160 : SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_6144) | \
161 : SPDK_BIT(SPDK_NVMF_DHCHAP_DHGROUP_8192))
162 :
/* Module-wide bdev_nvme options, pre-populated with their initial values.
 * Field units follow the name suffixes (_us, _ms, _sec).
 */
163 : static struct spdk_bdev_nvme_opts g_opts = {
164 : .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE,
165 : .timeout_us = 0,
166 : .timeout_admin_us = 0,
167 : .keep_alive_timeout_ms = SPDK_BDEV_NVME_DEFAULT_KEEP_ALIVE_TIMEOUT_IN_MS,
168 : .transport_retry_count = 4,
169 : .arbitration_burst = 0,
170 : .low_priority_weight = 0,
171 : .medium_priority_weight = 0,
172 : .high_priority_weight = 0,
173 : .nvme_adminq_poll_period_us = 10000ULL,
174 : .nvme_ioq_poll_period_us = 0,
175 : .io_queue_requests = 0,
176 : .delay_cmd_submit = SPDK_BDEV_NVME_DEFAULT_DELAY_CMD_SUBMIT,
177 : .bdev_retry_count = 3,
178 : .transport_ack_timeout = 0,
179 : .ctrlr_loss_timeout_sec = 0,
180 : .reconnect_delay_sec = 0,
181 : .fast_io_fail_timeout_sec = 0,
182 : .disable_auto_failback = false,
183 : .generate_uuids = false,
184 : .transport_tos = 0,
185 : .nvme_error_stat = false,
/* When true, nvme_io_path_alloc() allocates a per-path spdk_bdev_io_stat. */
186 : .io_path_stat = false,
187 : .allow_accel_sequence = false,
188 : .dhchap_digests = BDEV_NVME_DEFAULT_DIGESTS,
189 : .dhchap_dhgroups = BDEV_NVME_DEFAULT_DHGROUPS,
190 : };
191 :
/* Hotplug poll period bounds. The default initializes g_nvme_hotplug_poll_period_us,
 * so it is in microseconds; NOTE(review): MAX is presumably in the same unit - confirm.
 */
192 : #define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL
193 : #define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL
194 :
/* Hotplug monitor state. Only the initial values are visible here; the code that
 * drives these variables lives outside this part of the file.
 */
195 : static int g_hot_insert_nvme_controller_index = 0;
196 : static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT;
197 : static bool g_nvme_hotplug_enabled = false;
/* Not static: visible to other translation units. */
198 : struct spdk_thread *g_bdev_nvme_init_thread;
199 : static struct spdk_poller *g_hotplug_poller;
200 : static struct spdk_poller *g_hotplug_probe_poller;
201 : static struct spdk_nvme_probe_ctx *g_hotplug_probe_ctx;
202 :
203 : static void nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
204 : struct nvme_async_probe_ctx *ctx);
205 : static void nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
206 : struct nvme_async_probe_ctx *ctx);
207 : static int bdev_nvme_library_init(void);
208 : static void bdev_nvme_library_fini(void);
209 : static void _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch,
210 : struct spdk_bdev_io *bdev_io);
211 : static void bdev_nvme_submit_request(struct spdk_io_channel *ch,
212 : struct spdk_bdev_io *bdev_io);
213 : static int bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
214 : void *md, uint64_t lba_count, uint64_t lba,
215 : uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
216 : struct spdk_accel_sequence *seq);
217 : static int bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
218 : void *md, uint64_t lba_count, uint64_t lba);
219 : static int bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
220 : void *md, uint64_t lba_count, uint64_t lba,
221 : uint32_t flags, struct spdk_memory_domain *domain, void *domain_ctx,
222 : struct spdk_accel_sequence *seq,
223 : union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13);
224 : static int bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
225 : void *md, uint64_t lba_count,
226 : uint64_t zslba, uint32_t flags);
227 : static int bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
228 : void *md, uint64_t lba_count, uint64_t lba,
229 : uint32_t flags);
230 : static int bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio,
231 : struct iovec *cmp_iov, int cmp_iovcnt, struct iovec *write_iov,
232 : int write_iovcnt, void *md, uint64_t lba_count, uint64_t lba,
233 : uint32_t flags);
234 : static int bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id,
235 : uint32_t num_zones, struct spdk_bdev_zone_info *info);
236 : static int bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
237 : enum spdk_bdev_zone_action action);
238 : static void bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch,
239 : struct nvme_bdev_io *bio,
240 : struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes);
241 : static int bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
242 : void *buf, size_t nbytes);
243 : static int bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
244 : void *buf, size_t nbytes, void *md_buf, size_t md_len);
245 : static int bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
246 : struct iovec *iov, int iovcnt, size_t nbytes,
247 : void *md_buf, size_t md_len);
248 : static void bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch,
249 : struct nvme_bdev_io *bio, struct nvme_bdev_io *bio_to_abort);
250 : static void bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio);
251 : static int bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
252 : static int bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
253 : static void remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr);
254 : static int nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr);
255 :
256 : static struct nvme_ns *nvme_ns_alloc(void);
257 : static void nvme_ns_free(struct nvme_ns *ns);
258 :
259 : static int
260 175 : nvme_ns_cmp(struct nvme_ns *ns1, struct nvme_ns *ns2)
261 : {
262 175 : return ns1->id < ns2->id ? -1 : ns1->id > ns2->id;
263 : }
264 :
/* Instantiate the static red-black tree helpers for nvme_ns_tree, keyed by nvme_ns_cmp(). */
265 1070 : RB_GENERATE_STATIC(nvme_ns_tree, nvme_ns, node, nvme_ns_cmp);
266 :
267 : struct spdk_nvme_qpair *
268 1 : bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch)
269 : {
270 : struct nvme_ctrlr_channel *ctrlr_ch;
271 :
272 1 : assert(ctrlr_io_ch != NULL);
273 :
274 1 : ctrlr_ch = spdk_io_channel_get_ctx(ctrlr_io_ch);
275 :
276 1 : return ctrlr_ch->qpair->qpair;
277 : }
278 :
279 : static int
280 0 : bdev_nvme_get_ctx_size(void)
281 : {
282 0 : return sizeof(struct nvme_bdev_io);
283 : }
284 :
/* bdev module descriptor for the NVMe bdev module, registered below under the name "nvme". */
285 : static struct spdk_bdev_module nvme_if = {
286 : .name = "nvme",
/* Finish is asynchronous: _nvme_ctrlr_delete() calls spdk_bdev_module_fini_done(). */
287 : .async_fini = true,
288 : .module_init = bdev_nvme_library_init,
289 : .module_fini = bdev_nvme_library_fini,
290 : .config_json = bdev_nvme_config_json,
291 : .get_ctx_size = bdev_nvme_get_ctx_size,
292 :
293 : };
294 1 : SPDK_BDEV_MODULE_REGISTER(nvme, &nvme_if)
295 :
/* Global list of nvme_bdev_ctrlr objects. */
296 : struct nvme_bdev_ctrlrs g_nvme_bdev_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_bdev_ctrlrs);
/* Mutex taken around accesses to g_nvme_bdev_ctrlrs and the lists hanging off it. */
297 : pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Checked in _nvme_ctrlr_delete(): once set and the list is empty, module fini completes. */
298 : bool g_bdev_nvme_module_finish;
299 :
300 : struct nvme_bdev_ctrlr *
301 327 : nvme_bdev_ctrlr_get_by_name(const char *name)
302 : {
303 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
304 :
305 327 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
306 169 : if (strcmp(name, nbdev_ctrlr->name) == 0) {
307 169 : break;
308 : }
309 0 : }
310 :
311 327 : return nbdev_ctrlr;
312 : }
313 :
314 : static struct nvme_ctrlr *
315 58 : nvme_bdev_ctrlr_get_ctrlr(struct nvme_bdev_ctrlr *nbdev_ctrlr,
316 : const struct spdk_nvme_transport_id *trid, const char *hostnqn)
317 : {
318 : const struct spdk_nvme_ctrlr_opts *opts;
319 : struct nvme_ctrlr *nvme_ctrlr;
320 :
321 99 : TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
322 74 : opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
323 74 : if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->active_path_id->trid) == 0 &&
324 33 : strcmp(hostnqn, opts->hostnqn) == 0) {
325 33 : break;
326 : }
327 41 : }
328 :
329 58 : return nvme_ctrlr;
330 : }
331 :
332 : struct nvme_ctrlr *
333 0 : nvme_bdev_ctrlr_get_ctrlr_by_id(struct nvme_bdev_ctrlr *nbdev_ctrlr,
334 : uint16_t cntlid)
335 : {
336 : struct nvme_ctrlr *nvme_ctrlr;
337 : const struct spdk_nvme_ctrlr_data *cdata;
338 :
339 0 : TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
340 0 : cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
341 0 : if (cdata->cntlid == cntlid) {
342 0 : break;
343 : }
344 0 : }
345 :
346 0 : return nvme_ctrlr;
347 : }
348 :
349 : static struct nvme_bdev *
350 73 : nvme_bdev_ctrlr_get_bdev(struct nvme_bdev_ctrlr *nbdev_ctrlr, uint32_t nsid)
351 : {
352 : struct nvme_bdev *bdev;
353 :
354 73 : pthread_mutex_lock(&g_bdev_nvme_mutex);
355 107 : TAILQ_FOREACH(bdev, &nbdev_ctrlr->bdevs, tailq) {
356 68 : if (bdev->nsid == nsid) {
357 34 : break;
358 : }
359 34 : }
360 73 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
361 :
362 73 : return bdev;
363 : }
364 :
365 : struct nvme_ns *
366 143 : nvme_ctrlr_get_ns(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid)
367 : {
368 : struct nvme_ns ns;
369 :
370 143 : assert(nsid > 0);
371 :
372 143 : ns.id = nsid;
373 143 : return RB_FIND(nvme_ns_tree, &nvme_ctrlr->namespaces, &ns);
374 : }
375 :
376 : struct nvme_ns *
377 162 : nvme_ctrlr_get_first_active_ns(struct nvme_ctrlr *nvme_ctrlr)
378 : {
379 162 : return RB_MIN(nvme_ns_tree, &nvme_ctrlr->namespaces);
380 : }
381 :
382 : struct nvme_ns *
383 72 : nvme_ctrlr_get_next_active_ns(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *ns)
384 : {
385 72 : if (ns == NULL) {
386 0 : return NULL;
387 : }
388 :
389 72 : return RB_NEXT(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
390 72 : }
391 :
392 : static struct nvme_ctrlr *
393 52 : nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid, const char *hostnqn)
394 : {
395 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
396 52 : struct nvme_ctrlr *nvme_ctrlr = NULL;
397 :
398 52 : pthread_mutex_lock(&g_bdev_nvme_mutex);
399 71 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
400 19 : nvme_ctrlr = nvme_bdev_ctrlr_get_ctrlr(nbdev_ctrlr, trid, hostnqn);
401 19 : if (nvme_ctrlr != NULL) {
402 0 : break;
403 : }
404 19 : }
405 52 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
406 :
407 52 : return nvme_ctrlr;
408 : }
409 :
410 : struct nvme_ctrlr *
411 125 : nvme_ctrlr_get_by_name(const char *name)
412 : {
413 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
414 125 : struct nvme_ctrlr *nvme_ctrlr = NULL;
415 :
416 125 : if (name == NULL) {
417 0 : return NULL;
418 : }
419 :
420 125 : pthread_mutex_lock(&g_bdev_nvme_mutex);
421 125 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
422 125 : if (nbdev_ctrlr != NULL) {
423 60 : nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
424 60 : }
425 125 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
426 :
427 125 : return nvme_ctrlr;
428 125 : }
429 :
430 : void
431 0 : nvme_bdev_ctrlr_for_each(nvme_bdev_ctrlr_for_each_fn fn, void *ctx)
432 : {
433 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
434 :
435 0 : pthread_mutex_lock(&g_bdev_nvme_mutex);
436 0 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
437 0 : fn(nbdev_ctrlr, ctx);
438 0 : }
439 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
440 0 : }
441 :
/* State of one nvme_ctrlr_for_each_channel() iteration. Allocated there and freed
 * in nvme_ctrlr_each_channel_cpl().
 */
442 : struct nvme_ctrlr_channel_iter {
/* Called once per controller channel. */
443 : nvme_ctrlr_for_each_channel_msg fn;
/* Called once when the iteration has finished. */
444 : nvme_ctrlr_for_each_channel_done cpl;
/* Underlying SPDK iterator, refreshed before each callback. */
445 : struct spdk_io_channel_iter *i;
/* Caller context handed to fn and cpl. */
446 : void *ctx;
447 : };
448 :
449 : void
450 284 : nvme_ctrlr_for_each_channel_continue(struct nvme_ctrlr_channel_iter *iter, int status)
451 : {
452 284 : spdk_for_each_channel_continue(iter->i, status);
453 284 : }
454 :
455 : static void
456 284 : nvme_ctrlr_each_channel_msg(struct spdk_io_channel_iter *i)
457 : {
458 284 : struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
459 284 : struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
460 284 : struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
461 284 : struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
462 :
463 284 : iter->i = i;
464 284 : iter->fn(iter, nvme_ctrlr, ctrlr_ch, iter->ctx);
465 284 : }
466 :
467 : static void
468 165 : nvme_ctrlr_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
469 : {
470 165 : struct nvme_ctrlr_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
471 165 : struct nvme_ctrlr *nvme_ctrlr = spdk_io_channel_iter_get_io_device(i);
472 :
473 165 : iter->i = i;
474 165 : iter->cpl(nvme_ctrlr, iter->ctx, status);
475 :
476 165 : free(iter);
477 165 : }
478 :
479 : void
480 165 : nvme_ctrlr_for_each_channel(struct nvme_ctrlr *nvme_ctrlr,
481 : nvme_ctrlr_for_each_channel_msg fn, void *ctx,
482 : nvme_ctrlr_for_each_channel_done cpl)
483 : {
484 : struct nvme_ctrlr_channel_iter *iter;
485 :
486 165 : assert(nvme_ctrlr != NULL && fn != NULL);
487 :
488 165 : iter = calloc(1, sizeof(struct nvme_ctrlr_channel_iter));
489 165 : if (iter == NULL) {
490 0 : SPDK_ERRLOG("Unable to allocate iterator\n");
491 0 : assert(false);
492 : return;
493 : }
494 :
495 165 : iter->fn = fn;
496 165 : iter->cpl = cpl;
497 165 : iter->ctx = ctx;
498 :
499 330 : spdk_for_each_channel(nvme_ctrlr, nvme_ctrlr_each_channel_msg,
500 165 : iter, nvme_ctrlr_each_channel_cpl);
501 165 : }
502 :
/* State of one nvme_bdev_for_each_channel() iteration. Allocated there and freed
 * in nvme_bdev_each_channel_cpl().
 */
503 : struct nvme_bdev_channel_iter {
/* Called once per bdev channel. */
504 : nvme_bdev_for_each_channel_msg fn;
/* Called once when the iteration has finished. */
505 : nvme_bdev_for_each_channel_done cpl;
/* Underlying SPDK iterator, refreshed before each callback. */
506 : struct spdk_io_channel_iter *i;
/* Caller context handed to fn and cpl. */
507 : void *ctx;
508 : };
509 :
510 : void
511 67 : nvme_bdev_for_each_channel_continue(struct nvme_bdev_channel_iter *iter, int status)
512 : {
513 67 : spdk_for_each_channel_continue(iter->i, status);
514 67 : }
515 :
516 : static void
517 67 : nvme_bdev_each_channel_msg(struct spdk_io_channel_iter *i)
518 : {
519 67 : struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
520 67 : struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i);
521 67 : struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
522 67 : struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
523 :
524 67 : iter->i = i;
525 67 : iter->fn(iter, nbdev, nbdev_ch, iter->ctx);
526 67 : }
527 :
528 : static void
529 59 : nvme_bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
530 : {
531 59 : struct nvme_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
532 59 : struct nvme_bdev *nbdev = spdk_io_channel_iter_get_io_device(i);
533 :
534 59 : iter->i = i;
535 59 : iter->cpl(nbdev, iter->ctx, status);
536 :
537 59 : free(iter);
538 59 : }
539 :
540 : void
541 59 : nvme_bdev_for_each_channel(struct nvme_bdev *nbdev,
542 : nvme_bdev_for_each_channel_msg fn, void *ctx,
543 : nvme_bdev_for_each_channel_done cpl)
544 : {
545 : struct nvme_bdev_channel_iter *iter;
546 :
547 59 : assert(nbdev != NULL && fn != NULL);
548 :
549 59 : iter = calloc(1, sizeof(struct nvme_bdev_channel_iter));
550 59 : if (iter == NULL) {
551 0 : SPDK_ERRLOG("Unable to allocate iterator\n");
552 0 : assert(false);
553 : return;
554 : }
555 :
556 59 : iter->fn = fn;
557 59 : iter->cpl = cpl;
558 59 : iter->ctx = ctx;
559 :
560 59 : spdk_for_each_channel(nbdev, nvme_bdev_each_channel_msg, iter,
561 : nvme_bdev_each_channel_cpl);
562 59 : }
563 :
564 : void
565 0 : nvme_bdev_dump_trid_json(const struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w)
566 : {
567 : const char *trtype_str;
568 : const char *adrfam_str;
569 :
570 0 : trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype);
571 0 : if (trtype_str) {
572 0 : spdk_json_write_named_string(w, "trtype", trtype_str);
573 0 : }
574 :
575 0 : adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
576 0 : if (adrfam_str) {
577 0 : spdk_json_write_named_string(w, "adrfam", adrfam_str);
578 0 : }
579 :
580 0 : if (trid->traddr[0] != '\0') {
581 0 : spdk_json_write_named_string(w, "traddr", trid->traddr);
582 0 : }
583 :
584 0 : if (trid->trsvcid[0] != '\0') {
585 0 : spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
586 0 : }
587 :
588 0 : if (trid->subnqn[0] != '\0') {
589 0 : spdk_json_write_named_string(w, "subnqn", trid->subnqn);
590 0 : }
591 0 : }
592 :
593 : static void
594 60 : nvme_bdev_ctrlr_delete(struct nvme_bdev_ctrlr *nbdev_ctrlr,
595 : struct nvme_ctrlr *nvme_ctrlr)
596 : {
597 : SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_delete, nvme_ctrlr->nbdev_ctrlr->name);
598 60 : pthread_mutex_lock(&g_bdev_nvme_mutex);
599 :
600 60 : TAILQ_REMOVE(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
601 60 : if (!TAILQ_EMPTY(&nbdev_ctrlr->ctrlrs)) {
602 15 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
603 :
604 15 : return;
605 : }
606 45 : TAILQ_REMOVE(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
607 :
608 45 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
609 :
610 45 : assert(TAILQ_EMPTY(&nbdev_ctrlr->bdevs));
611 :
612 45 : free(nbdev_ctrlr->name);
613 45 : free(nbdev_ctrlr);
614 60 : }
615 :
616 : static void
617 61 : _nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
618 : {
619 : struct nvme_path_id *path_id, *tmp_path;
620 : struct nvme_ns *ns, *tmp_ns;
621 :
622 61 : free(nvme_ctrlr->copied_ana_desc);
623 61 : spdk_free(nvme_ctrlr->ana_log_page);
624 :
625 61 : if (nvme_ctrlr->opal_dev) {
626 0 : spdk_opal_dev_destruct(nvme_ctrlr->opal_dev);
627 0 : nvme_ctrlr->opal_dev = NULL;
628 0 : }
629 :
630 61 : if (nvme_ctrlr->nbdev_ctrlr) {
631 60 : nvme_bdev_ctrlr_delete(nvme_ctrlr->nbdev_ctrlr, nvme_ctrlr);
632 60 : }
633 :
634 61 : RB_FOREACH_SAFE(ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp_ns) {
635 0 : RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, ns);
636 0 : nvme_ns_free(ns);
637 0 : }
638 :
639 122 : TAILQ_FOREACH_SAFE(path_id, &nvme_ctrlr->trids, link, tmp_path) {
640 61 : TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
641 61 : free(path_id);
642 61 : }
643 :
644 61 : pthread_mutex_destroy(&nvme_ctrlr->mutex);
645 61 : spdk_keyring_put_key(nvme_ctrlr->psk);
646 61 : spdk_keyring_put_key(nvme_ctrlr->dhchap_key);
647 61 : spdk_keyring_put_key(nvme_ctrlr->dhchap_ctrlr_key);
648 61 : free(nvme_ctrlr);
649 :
650 61 : pthread_mutex_lock(&g_bdev_nvme_mutex);
651 61 : if (g_bdev_nvme_module_finish && TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
652 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
653 0 : spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
654 0 : spdk_bdev_module_fini_done();
655 0 : return;
656 : }
657 61 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
658 61 : }
659 :
660 : static int
661 61 : nvme_detach_poller(void *arg)
662 : {
663 61 : struct nvme_ctrlr *nvme_ctrlr = arg;
664 : int rc;
665 :
666 61 : rc = spdk_nvme_detach_poll_async(nvme_ctrlr->detach_ctx);
667 61 : if (rc != -EAGAIN) {
668 61 : spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
669 61 : _nvme_ctrlr_delete(nvme_ctrlr);
670 61 : }
671 :
672 61 : return SPDK_POLLER_BUSY;
673 : }
674 :
675 : static void
676 61 : nvme_ctrlr_delete(struct nvme_ctrlr *nvme_ctrlr)
677 : {
678 : int rc;
679 :
680 61 : spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
681 :
682 61 : if (spdk_interrupt_mode_is_enabled()) {
683 0 : spdk_interrupt_unregister(&nvme_ctrlr->intr);
684 0 : }
685 :
686 : /* First, unregister the adminq poller, as the driver will poll adminq if necessary */
687 61 : spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);
688 :
689 : /* If we got here, the reset/detach poller cannot be active */
690 61 : assert(nvme_ctrlr->reset_detach_poller == NULL);
691 61 : nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(nvme_detach_poller,
692 : nvme_ctrlr, 1000);
693 61 : if (nvme_ctrlr->reset_detach_poller == NULL) {
694 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to register detach poller\n");
695 0 : goto error;
696 : }
697 :
698 61 : rc = spdk_nvme_detach_async(nvme_ctrlr->ctrlr, &nvme_ctrlr->detach_ctx);
699 61 : if (rc != 0) {
700 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to detach the NVMe controller\n");
701 0 : goto error;
702 : }
703 :
704 61 : return;
705 : error:
706 : /* We don't have a good way to handle errors here, so just do what we can and delete the
707 : * controller without detaching the underlying NVMe device.
708 : */
709 0 : spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
710 0 : _nvme_ctrlr_delete(nvme_ctrlr);
711 61 : }
712 :
/* io_device unregister callback: the io_device is the nvme_ctrlr itself, delete it. */
static void
nvme_ctrlr_unregister_cb(void *io_device)
{
	nvme_ctrlr_delete((struct nvme_ctrlr *)io_device);
}
720 :
721 : static void
722 60 : nvme_ctrlr_unregister(void *ctx)
723 : {
724 60 : struct nvme_ctrlr *nvme_ctrlr = ctx;
725 :
726 60 : spdk_io_device_unregister(nvme_ctrlr, nvme_ctrlr_unregister_cb);
727 60 : }
728 :
729 : static bool
730 244 : nvme_ctrlr_can_be_unregistered(struct nvme_ctrlr *nvme_ctrlr)
731 : {
732 244 : if (!nvme_ctrlr->destruct) {
733 128 : return false;
734 : }
735 :
736 116 : if (nvme_ctrlr->ref > 0) {
737 56 : return false;
738 : }
739 :
740 60 : if (nvme_ctrlr->resetting) {
741 0 : return false;
742 : }
743 :
744 60 : if (nvme_ctrlr->ana_log_page_updating) {
745 0 : return false;
746 : }
747 :
748 60 : if (nvme_ctrlr->io_path_cache_clearing) {
749 0 : return false;
750 : }
751 :
752 60 : return true;
753 244 : }
754 :
755 : static void
756 168 : nvme_ctrlr_release(struct nvme_ctrlr *nvme_ctrlr)
757 : {
758 168 : pthread_mutex_lock(&nvme_ctrlr->mutex);
759 : SPDK_DTRACE_PROBE2(bdev_nvme_ctrlr_release, nvme_ctrlr->nbdev_ctrlr->name, nvme_ctrlr->ref);
760 :
761 168 : assert(nvme_ctrlr->ref > 0);
762 168 : nvme_ctrlr->ref--;
763 :
764 168 : if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
765 108 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
766 108 : return;
767 : }
768 :
769 60 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
770 :
771 60 : spdk_thread_exec_msg(nvme_ctrlr->thread, nvme_ctrlr_unregister, nvme_ctrlr);
772 168 : }
773 :
774 : static void
775 251 : bdev_nvme_clear_current_io_path(struct nvme_bdev_channel *nbdev_ch)
776 : {
777 251 : nbdev_ch->current_io_path = NULL;
778 251 : nbdev_ch->rr_counter = 0;
779 251 : }
780 :
781 : static struct nvme_io_path *
782 8 : _bdev_nvme_get_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
783 : {
784 : struct nvme_io_path *io_path;
785 :
786 16 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
787 15 : if (io_path->nvme_ns == nvme_ns) {
788 7 : break;
789 : }
790 8 : }
791 :
792 8 : return io_path;
793 : }
794 :
795 : static struct nvme_io_path *
796 37 : nvme_io_path_alloc(void)
797 : {
798 : struct nvme_io_path *io_path;
799 :
800 37 : io_path = calloc(1, sizeof(*io_path));
801 37 : if (io_path == NULL) {
802 0 : SPDK_ERRLOG("Failed to alloc io_path.\n");
803 0 : return NULL;
804 : }
805 :
806 37 : if (g_opts.io_path_stat) {
807 0 : io_path->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
808 0 : if (io_path->stat == NULL) {
809 0 : free(io_path);
810 0 : SPDK_ERRLOG("Failed to alloc io_path stat.\n");
811 0 : return NULL;
812 : }
813 0 : spdk_bdev_reset_io_stat(io_path->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
814 0 : }
815 :
816 37 : return io_path;
817 37 : }
818 :
819 : static void
820 37 : nvme_io_path_free(struct nvme_io_path *io_path)
821 : {
822 37 : free(io_path->stat);
823 37 : free(io_path);
824 37 : }
825 :
826 : static int
827 37 : _bdev_nvme_add_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_ns *nvme_ns)
828 : {
829 : struct nvme_io_path *io_path;
830 : struct spdk_io_channel *ch;
831 : struct nvme_ctrlr_channel *ctrlr_ch;
832 : struct nvme_qpair *nvme_qpair;
833 :
834 37 : io_path = nvme_io_path_alloc();
835 37 : if (io_path == NULL) {
836 0 : return -ENOMEM;
837 : }
838 :
839 37 : io_path->nvme_ns = nvme_ns;
840 :
841 37 : ch = spdk_get_io_channel(nvme_ns->ctrlr);
842 37 : if (ch == NULL) {
843 0 : nvme_io_path_free(io_path);
844 0 : SPDK_ERRLOG("Failed to alloc io_channel.\n");
845 0 : return -ENOMEM;
846 : }
847 :
848 37 : ctrlr_ch = spdk_io_channel_get_ctx(ch);
849 :
850 37 : nvme_qpair = ctrlr_ch->qpair;
851 37 : assert(nvme_qpair != NULL);
852 :
853 37 : io_path->qpair = nvme_qpair;
854 37 : TAILQ_INSERT_TAIL(&nvme_qpair->io_path_list, io_path, tailq);
855 :
856 37 : io_path->nbdev_ch = nbdev_ch;
857 37 : STAILQ_INSERT_TAIL(&nbdev_ch->io_path_list, io_path, stailq);
858 :
859 37 : bdev_nvme_clear_current_io_path(nbdev_ch);
860 :
861 37 : return 0;
862 37 : }
863 :
864 : static void
865 37 : bdev_nvme_clear_retry_io_path(struct nvme_bdev_channel *nbdev_ch,
866 : struct nvme_io_path *io_path)
867 : {
868 : struct nvme_bdev_io *bio;
869 :
870 38 : TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) {
871 1 : if (bio->io_path == io_path) {
872 1 : bio->io_path = NULL;
873 1 : }
874 1 : }
875 37 : }
876 :
877 : static void
878 37 : _bdev_nvme_delete_io_path(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *io_path)
879 : {
880 : struct spdk_io_channel *ch;
881 : struct nvme_qpair *nvme_qpair;
882 : struct nvme_ctrlr_channel *ctrlr_ch;
883 : struct nvme_bdev *nbdev;
884 :
885 37 : nbdev = spdk_io_channel_get_io_device(spdk_io_channel_from_ctx(nbdev_ch));
886 :
887 : /* Add the statistics to nvme_ns before this path is destroyed. */
888 37 : pthread_mutex_lock(&nbdev->mutex);
889 37 : if (nbdev->ref != 0 && io_path->nvme_ns->stat != NULL && io_path->stat != NULL) {
890 0 : spdk_bdev_add_io_stat(io_path->nvme_ns->stat, io_path->stat);
891 0 : }
892 37 : pthread_mutex_unlock(&nbdev->mutex);
893 :
894 37 : bdev_nvme_clear_current_io_path(nbdev_ch);
895 37 : bdev_nvme_clear_retry_io_path(nbdev_ch, io_path);
896 :
897 39 : STAILQ_REMOVE(&nbdev_ch->io_path_list, io_path, nvme_io_path, stailq);
898 37 : io_path->nbdev_ch = NULL;
899 :
900 37 : nvme_qpair = io_path->qpair;
901 37 : assert(nvme_qpair != NULL);
902 :
903 37 : ctrlr_ch = nvme_qpair->ctrlr_ch;
904 37 : assert(ctrlr_ch != NULL);
905 :
906 37 : ch = spdk_io_channel_from_ctx(ctrlr_ch);
907 37 : spdk_put_io_channel(ch);
908 :
909 : /* After an io_path is removed, I/Os submitted to it may complete and update statistics
910 : * of the io_path. To avoid heap-use-after-free error from this case, do not free the
911 : * io_path here but free the io_path when the associated qpair is freed. It is ensured
912 : * that all I/Os submitted to the io_path are completed when the associated qpair is freed.
913 : */
914 37 : }
915 :
916 : static void
917 24 : _bdev_nvme_delete_io_paths(struct nvme_bdev_channel *nbdev_ch)
918 : {
919 : struct nvme_io_path *io_path, *tmp_io_path;
920 :
921 59 : STAILQ_FOREACH_SAFE(io_path, &nbdev_ch->io_path_list, stailq, tmp_io_path) {
922 35 : _bdev_nvme_delete_io_path(nbdev_ch, io_path);
923 35 : }
924 24 : }
925 :
926 : static int
927 24 : bdev_nvme_create_bdev_channel_cb(void *io_device, void *ctx_buf)
928 : {
929 24 : struct nvme_bdev_channel *nbdev_ch = ctx_buf;
930 24 : struct nvme_bdev *nbdev = io_device;
931 : struct nvme_ns *nvme_ns;
932 : int rc;
933 :
934 24 : STAILQ_INIT(&nbdev_ch->io_path_list);
935 24 : TAILQ_INIT(&nbdev_ch->retry_io_list);
936 :
937 24 : pthread_mutex_lock(&nbdev->mutex);
938 :
939 24 : nbdev_ch->mp_policy = nbdev->mp_policy;
940 24 : nbdev_ch->mp_selector = nbdev->mp_selector;
941 24 : nbdev_ch->rr_min_io = nbdev->rr_min_io;
942 :
943 59 : TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
944 35 : rc = _bdev_nvme_add_io_path(nbdev_ch, nvme_ns);
945 35 : if (rc != 0) {
946 0 : pthread_mutex_unlock(&nbdev->mutex);
947 :
948 0 : _bdev_nvme_delete_io_paths(nbdev_ch);
949 0 : return rc;
950 : }
951 35 : }
952 24 : pthread_mutex_unlock(&nbdev->mutex);
953 :
954 24 : return 0;
955 24 : }
956 :
957 : /* If cpl != NULL, complete the bdev_io with nvme status based on 'cpl'.
958 : * If cpl == NULL, complete the bdev_io with bdev status based on 'status'.
959 : */
960 : static inline void
961 57 : __bdev_nvme_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status,
962 : const struct spdk_nvme_cpl *cpl)
963 : {
964 57 : spdk_trace_record(TRACE_BDEV_NVME_IO_DONE, 0, 0, (uintptr_t)bdev_io->driver_ctx,
965 : (uintptr_t)bdev_io);
966 57 : if (cpl) {
967 29 : spdk_bdev_io_complete_nvme_status(bdev_io, cpl->cdw0, cpl->status.sct, cpl->status.sc);
968 29 : } else {
969 28 : spdk_bdev_io_complete(bdev_io, status);
970 : }
971 57 : }
972 :
973 : static void bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch);
974 :
/* I/O channel destroy callback for an nvme_bdev: abort the I/Os still waiting for
 * retry, then delete all io_paths of the channel.
 */
static void
bdev_nvme_destroy_bdev_channel_cb(void *io_device, void *ctx_buf)
{
	struct nvme_bdev_channel *channel = ctx_buf;

	bdev_nvme_abort_retry_ios(channel);
	_bdev_nvme_delete_io_paths(channel);
}
983 :
984 : static inline bool
985 62 : bdev_nvme_io_type_is_admin(enum spdk_bdev_io_type io_type)
986 : {
987 62 : switch (io_type) {
988 : case SPDK_BDEV_IO_TYPE_RESET:
989 : case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
990 : case SPDK_BDEV_IO_TYPE_ABORT:
991 5 : return true;
992 : default:
993 57 : break;
994 : }
995 :
996 57 : return false;
997 62 : }
998 :
999 : static inline bool
1000 98 : nvme_ns_is_active(struct nvme_ns *nvme_ns)
1001 : {
1002 98 : if (spdk_unlikely(nvme_ns->ana_state_updating)) {
1003 1 : return false;
1004 : }
1005 :
1006 97 : if (spdk_unlikely(nvme_ns->ns == NULL)) {
1007 0 : return false;
1008 : }
1009 :
1010 97 : return true;
1011 98 : }
1012 :
1013 : static inline bool
1014 86 : nvme_ns_is_accessible(struct nvme_ns *nvme_ns)
1015 : {
1016 86 : if (spdk_unlikely(!nvme_ns_is_active(nvme_ns))) {
1017 1 : return false;
1018 : }
1019 :
1020 85 : switch (nvme_ns->ana_state) {
1021 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
1022 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
1023 76 : return true;
1024 : default:
1025 9 : break;
1026 : }
1027 :
1028 9 : return false;
1029 86 : }
1030 :
1031 : static inline bool
1032 128 : nvme_qpair_is_connected(struct nvme_qpair *nvme_qpair)
1033 : {
1034 128 : if (spdk_unlikely(nvme_qpair->qpair == NULL)) {
1035 23 : return false;
1036 : }
1037 :
1038 105 : if (spdk_unlikely(spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
1039 : SPDK_NVME_QPAIR_FAILURE_NONE)) {
1040 2 : return false;
1041 : }
1042 :
1043 103 : if (spdk_unlikely(nvme_qpair->ctrlr_ch->reset_iter != NULL)) {
1044 0 : return false;
1045 : }
1046 :
1047 103 : return true;
1048 128 : }
1049 :
1050 : static inline bool
1051 102 : nvme_io_path_is_available(struct nvme_io_path *io_path)
1052 : {
1053 102 : if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
1054 16 : return false;
1055 : }
1056 :
1057 86 : if (spdk_unlikely(!nvme_ns_is_accessible(io_path->nvme_ns))) {
1058 10 : return false;
1059 : }
1060 :
1061 76 : return true;
1062 102 : }
1063 :
1064 : static inline bool
1065 9 : nvme_ctrlr_is_failed(struct nvme_ctrlr *nvme_ctrlr)
1066 : {
1067 9 : if (nvme_ctrlr->destruct) {
1068 0 : return true;
1069 : }
1070 :
1071 9 : if (nvme_ctrlr->fast_io_fail_timedout) {
1072 2 : return true;
1073 : }
1074 :
1075 7 : if (nvme_ctrlr->resetting) {
1076 5 : if (nvme_ctrlr->opts.reconnect_delay_sec != 0) {
1077 5 : return false;
1078 : } else {
1079 0 : return true;
1080 : }
1081 : }
1082 :
1083 2 : if (nvme_ctrlr->reconnect_is_delayed) {
1084 2 : return false;
1085 : }
1086 :
1087 0 : if (nvme_ctrlr->disabled) {
1088 0 : return true;
1089 : }
1090 :
1091 0 : if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
1092 0 : return true;
1093 : } else {
1094 0 : return false;
1095 : }
1096 9 : }
1097 :
1098 : static bool
1099 20 : nvme_ctrlr_is_available(struct nvme_ctrlr *nvme_ctrlr)
1100 : {
1101 20 : if (nvme_ctrlr->destruct) {
1102 0 : return false;
1103 : }
1104 :
1105 20 : if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
1106 3 : return false;
1107 : }
1108 :
1109 17 : if (nvme_ctrlr->resetting || nvme_ctrlr->reconnect_is_delayed) {
1110 1 : return false;
1111 : }
1112 :
1113 16 : if (nvme_ctrlr->disabled) {
1114 0 : return false;
1115 : }
1116 :
1117 16 : return true;
1118 20 : }
1119 :
1120 : /* Simulate circular linked list. */
1121 : static inline struct nvme_io_path *
1122 99 : nvme_io_path_get_next(struct nvme_bdev_channel *nbdev_ch, struct nvme_io_path *prev_path)
1123 : {
1124 : struct nvme_io_path *next_path;
1125 :
1126 99 : if (prev_path != NULL) {
1127 39 : next_path = STAILQ_NEXT(prev_path, stailq);
1128 39 : if (next_path != NULL) {
1129 14 : return next_path;
1130 : }
1131 25 : }
1132 :
1133 85 : return STAILQ_FIRST(&nbdev_ch->io_path_list);
1134 99 : }
1135 :
1136 : static struct nvme_io_path *
1137 67 : _bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
1138 : {
1139 67 : struct nvme_io_path *io_path, *start, *non_optimized = NULL;
1140 :
1141 67 : start = nvme_io_path_get_next(nbdev_ch, nbdev_ch->current_io_path);
1142 :
1143 67 : io_path = start;
1144 67 : do {
1145 79 : if (spdk_likely(nvme_io_path_is_available(io_path))) {
1146 57 : switch (io_path->nvme_ns->ana_state) {
1147 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
1148 47 : nbdev_ch->current_io_path = io_path;
1149 47 : return io_path;
1150 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
1151 10 : if (non_optimized == NULL) {
1152 7 : non_optimized = io_path;
1153 7 : }
1154 10 : break;
1155 : default:
1156 0 : assert(false);
1157 : break;
1158 : }
1159 10 : }
1160 32 : io_path = nvme_io_path_get_next(nbdev_ch, io_path);
1161 32 : } while (io_path != start);
1162 :
1163 20 : if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
1164 : /* We come here only if there is no optimized path. Cache even non_optimized
1165 : * path for load balance across multiple non_optimized paths.
1166 : */
1167 1 : nbdev_ch->current_io_path = non_optimized;
1168 1 : }
1169 :
1170 20 : return non_optimized;
1171 67 : }
1172 :
1173 : static struct nvme_io_path *
1174 4 : _bdev_nvme_find_io_path_min_qd(struct nvme_bdev_channel *nbdev_ch)
1175 : {
1176 : struct nvme_io_path *io_path;
1177 4 : struct nvme_io_path *optimized = NULL, *non_optimized = NULL;
1178 4 : uint32_t opt_min_qd = UINT32_MAX, non_opt_min_qd = UINT32_MAX;
1179 : uint32_t num_outstanding_reqs;
1180 :
1181 16 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
1182 12 : if (spdk_unlikely(!nvme_qpair_is_connected(io_path->qpair))) {
1183 : /* The device is currently resetting. */
1184 0 : continue;
1185 : }
1186 :
1187 12 : if (spdk_unlikely(!nvme_ns_is_active(io_path->nvme_ns))) {
1188 0 : continue;
1189 : }
1190 :
1191 12 : num_outstanding_reqs = spdk_nvme_qpair_get_num_outstanding_reqs(io_path->qpair->qpair);
1192 12 : switch (io_path->nvme_ns->ana_state) {
1193 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
1194 6 : if (num_outstanding_reqs < opt_min_qd) {
1195 5 : opt_min_qd = num_outstanding_reqs;
1196 5 : optimized = io_path;
1197 5 : }
1198 6 : break;
1199 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
1200 3 : if (num_outstanding_reqs < non_opt_min_qd) {
1201 3 : non_opt_min_qd = num_outstanding_reqs;
1202 3 : non_optimized = io_path;
1203 3 : }
1204 3 : break;
1205 : default:
1206 3 : break;
1207 : }
1208 12 : }
1209 :
1210 : /* don't cache io path for BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH selector */
1211 4 : if (optimized != NULL) {
1212 3 : return optimized;
1213 : }
1214 :
1215 1 : return non_optimized;
1216 4 : }
1217 :
1218 : static inline struct nvme_io_path *
1219 105 : bdev_nvme_find_io_path(struct nvme_bdev_channel *nbdev_ch)
1220 : {
1221 105 : if (spdk_likely(nbdev_ch->current_io_path != NULL)) {
1222 41 : if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE) {
1223 31 : return nbdev_ch->current_io_path;
1224 10 : } else if (nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
1225 10 : if (++nbdev_ch->rr_counter < nbdev_ch->rr_min_io) {
1226 3 : return nbdev_ch->current_io_path;
1227 : }
1228 7 : nbdev_ch->rr_counter = 0;
1229 7 : }
1230 7 : }
1231 :
1232 71 : if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE ||
1233 14 : nbdev_ch->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
1234 67 : return _bdev_nvme_find_io_path(nbdev_ch);
1235 : } else {
1236 4 : return _bdev_nvme_find_io_path_min_qd(nbdev_ch);
1237 : }
1238 105 : }
1239 :
1240 : /* Return true if there is any io_path whose qpair is active or ctrlr is not failed,
1241 : * or false otherwise.
1242 : *
1243 : * If any io_path has an active qpair but find_io_path() returned NULL, its namespace
1244 : * is likely to be non-accessible now but may become accessible.
1245 : *
1246 : * If any io_path has an unfailed ctrlr but find_io_path() returned NULL, the ctrlr
1247 : * is likely to be resetting now but the reset may succeed. A ctrlr is set to unfailed
1248 : * when starting to reset it but it is set to failed when the reset failed. Hence, if
1249 : * a ctrlr is unfailed, it is likely that it works fine or is resetting.
1250 : */
1251 : static bool
1252 15 : any_io_path_may_become_available(struct nvme_bdev_channel *nbdev_ch)
1253 : {
1254 : struct nvme_io_path *io_path;
1255 :
1256 15 : if (nbdev_ch->resetting) {
1257 1 : return false;
1258 : }
1259 :
1260 16 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
1261 14 : if (io_path->nvme_ns->ana_transition_timedout) {
1262 0 : continue;
1263 : }
1264 :
1265 14 : if (nvme_qpair_is_connected(io_path->qpair) ||
1266 9 : !nvme_ctrlr_is_failed(io_path->qpair->ctrlr)) {
1267 12 : return true;
1268 : }
1269 2 : }
1270 :
1271 2 : return false;
1272 15 : }
1273 :
1274 : static void
1275 14 : bdev_nvme_retry_io(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
1276 : {
1277 14 : struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
1278 : struct spdk_io_channel *ch;
1279 :
1280 14 : if (nbdev_io->io_path != NULL && nvme_io_path_is_available(nbdev_io->io_path)) {
1281 3 : _bdev_nvme_submit_request(nbdev_ch, bdev_io);
1282 3 : } else {
1283 11 : ch = spdk_io_channel_from_ctx(nbdev_ch);
1284 11 : bdev_nvme_submit_request(ch, bdev_io);
1285 : }
1286 14 : }
1287 :
/* Poller callback that resubmits the queued retry I/Os whose retry time has come.
 *
 * 'arg' is the nvme_bdev_channel owning the retry list. The list is walked from its
 * head and the walk stops at the first I/O whose retry_ticks is still in the future.
 * Afterwards the poller is unregistered and, if an I/O remains queued, registered
 * again with the delay remaining until the head of the list is due.
 *
 * Always returns SPDK_POLLER_BUSY.
 */
static int
bdev_nvme_retry_ios(void *arg)
{
	struct nvme_bdev_channel *nbdev_ch = arg;
	struct nvme_bdev_io *bio, *tmp_bio;
	uint64_t now, delay_us;

	/* Sample the tick counter once; every queued I/O is compared against this value. */
	now = spdk_get_ticks();

	TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
		if (bio->retry_ticks > now) {
			break;
		}

		/* Unlink before resubmitting. */
		TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);

		bdev_nvme_retry_io(nbdev_ch, spdk_bdev_io_from_ctx(bio));
	}

	spdk_poller_unregister(&nbdev_ch->retry_io_poller);

	bio = TAILQ_FIRST(&nbdev_ch->retry_io_list);
	if (bio != NULL) {
		/* Convert the ticks left until the head is due into microseconds. */
		delay_us = (bio->retry_ticks - now) * SPDK_SEC_TO_USEC / spdk_get_ticks_hz();

		nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
					    delay_us);
	}

	return SPDK_POLLER_BUSY;
}
1319 :
1320 : static void
1321 16 : bdev_nvme_queue_retry_io(struct nvme_bdev_channel *nbdev_ch,
1322 : struct nvme_bdev_io *bio, uint64_t delay_ms)
1323 : {
1324 : struct nvme_bdev_io *tmp_bio;
1325 :
1326 16 : bio->retry_ticks = spdk_get_ticks() + delay_ms * spdk_get_ticks_hz() / 1000ULL;
1327 :
1328 16 : TAILQ_FOREACH_REVERSE(tmp_bio, &nbdev_ch->retry_io_list, retry_io_head, retry_link) {
1329 1 : if (tmp_bio->retry_ticks <= bio->retry_ticks) {
1330 1 : TAILQ_INSERT_AFTER(&nbdev_ch->retry_io_list, tmp_bio, bio,
1331 : retry_link);
1332 1 : return;
1333 : }
1334 0 : }
1335 :
1336 : /* No earlier I/Os were found. This I/O must be the new head. */
1337 15 : TAILQ_INSERT_HEAD(&nbdev_ch->retry_io_list, bio, retry_link);
1338 :
1339 15 : spdk_poller_unregister(&nbdev_ch->retry_io_poller);
1340 :
1341 15 : nbdev_ch->retry_io_poller = SPDK_POLLER_REGISTER(bdev_nvme_retry_ios, nbdev_ch,
1342 : delay_ms * 1000ULL);
1343 16 : }
1344 :
1345 : static void
1346 54 : bdev_nvme_abort_retry_ios(struct nvme_bdev_channel *nbdev_ch)
1347 : {
1348 : struct nvme_bdev_io *bio, *tmp_bio;
1349 :
1350 55 : TAILQ_FOREACH_SAFE(bio, &nbdev_ch->retry_io_list, retry_link, tmp_bio) {
1351 1 : TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
1352 1 : __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL);
1353 1 : }
1354 :
1355 54 : spdk_poller_unregister(&nbdev_ch->retry_io_poller);
1356 54 : }
1357 :
1358 : static int
1359 6 : bdev_nvme_abort_retry_io(struct nvme_bdev_channel *nbdev_ch,
1360 : struct nvme_bdev_io *bio_to_abort)
1361 : {
1362 : struct nvme_bdev_io *bio;
1363 :
1364 6 : TAILQ_FOREACH(bio, &nbdev_ch->retry_io_list, retry_link) {
1365 1 : if (bio == bio_to_abort) {
1366 1 : TAILQ_REMOVE(&nbdev_ch->retry_io_list, bio, retry_link);
1367 1 : __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_ABORTED, NULL);
1368 1 : return 0;
1369 : }
1370 0 : }
1371 :
1372 5 : return -ENOENT;
1373 6 : }
1374 :
1375 : static void
1376 12 : bdev_nvme_update_nvme_error_stat(struct spdk_bdev_io *bdev_io, const struct spdk_nvme_cpl *cpl)
1377 : {
1378 : struct nvme_bdev *nbdev;
1379 : uint16_t sct, sc;
1380 :
1381 12 : assert(spdk_nvme_cpl_is_error(cpl));
1382 :
1383 12 : nbdev = bdev_io->bdev->ctxt;
1384 :
1385 12 : if (nbdev->err_stat == NULL) {
1386 12 : return;
1387 : }
1388 :
1389 0 : sct = cpl->status.sct;
1390 0 : sc = cpl->status.sc;
1391 :
1392 0 : pthread_mutex_lock(&nbdev->mutex);
1393 :
1394 0 : nbdev->err_stat->status_type[sct]++;
1395 0 : switch (sct) {
1396 : case SPDK_NVME_SCT_GENERIC:
1397 : case SPDK_NVME_SCT_COMMAND_SPECIFIC:
1398 : case SPDK_NVME_SCT_MEDIA_ERROR:
1399 : case SPDK_NVME_SCT_PATH:
1400 0 : nbdev->err_stat->status[sct][sc]++;
1401 0 : break;
1402 : default:
1403 0 : break;
1404 : }
1405 :
1406 0 : pthread_mutex_unlock(&nbdev->mutex);
1407 12 : }
1408 :
1409 : static inline void
1410 20 : bdev_nvme_update_io_path_stat(struct nvme_bdev_io *bio)
1411 : {
1412 20 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1413 20 : uint64_t num_blocks = bdev_io->u.bdev.num_blocks;
1414 20 : uint32_t blocklen = bdev_io->bdev->blocklen;
1415 : struct spdk_bdev_io_stat *stat;
1416 : uint64_t tsc_diff;
1417 :
1418 20 : if (bio->io_path->stat == NULL) {
1419 20 : return;
1420 : }
1421 :
1422 0 : tsc_diff = spdk_get_ticks() - bio->submit_tsc;
1423 0 : stat = bio->io_path->stat;
1424 :
1425 0 : switch (bdev_io->type) {
1426 : case SPDK_BDEV_IO_TYPE_READ:
1427 0 : stat->bytes_read += num_blocks * blocklen;
1428 0 : stat->num_read_ops++;
1429 0 : stat->read_latency_ticks += tsc_diff;
1430 0 : if (stat->max_read_latency_ticks < tsc_diff) {
1431 0 : stat->max_read_latency_ticks = tsc_diff;
1432 0 : }
1433 0 : if (stat->min_read_latency_ticks > tsc_diff) {
1434 0 : stat->min_read_latency_ticks = tsc_diff;
1435 0 : }
1436 0 : break;
1437 : case SPDK_BDEV_IO_TYPE_WRITE:
1438 0 : stat->bytes_written += num_blocks * blocklen;
1439 0 : stat->num_write_ops++;
1440 0 : stat->write_latency_ticks += tsc_diff;
1441 0 : if (stat->max_write_latency_ticks < tsc_diff) {
1442 0 : stat->max_write_latency_ticks = tsc_diff;
1443 0 : }
1444 0 : if (stat->min_write_latency_ticks > tsc_diff) {
1445 0 : stat->min_write_latency_ticks = tsc_diff;
1446 0 : }
1447 0 : break;
1448 : case SPDK_BDEV_IO_TYPE_UNMAP:
1449 0 : stat->bytes_unmapped += num_blocks * blocklen;
1450 0 : stat->num_unmap_ops++;
1451 0 : stat->unmap_latency_ticks += tsc_diff;
1452 0 : if (stat->max_unmap_latency_ticks < tsc_diff) {
1453 0 : stat->max_unmap_latency_ticks = tsc_diff;
1454 0 : }
1455 0 : if (stat->min_unmap_latency_ticks > tsc_diff) {
1456 0 : stat->min_unmap_latency_ticks = tsc_diff;
1457 0 : }
1458 0 : break;
1459 : case SPDK_BDEV_IO_TYPE_ZCOPY:
1460 : /* Track the data in the start phase only */
1461 0 : if (!bdev_io->u.bdev.zcopy.start) {
1462 0 : break;
1463 : }
1464 0 : if (bdev_io->u.bdev.zcopy.populate) {
1465 0 : stat->bytes_read += num_blocks * blocklen;
1466 0 : stat->num_read_ops++;
1467 0 : stat->read_latency_ticks += tsc_diff;
1468 0 : if (stat->max_read_latency_ticks < tsc_diff) {
1469 0 : stat->max_read_latency_ticks = tsc_diff;
1470 0 : }
1471 0 : if (stat->min_read_latency_ticks > tsc_diff) {
1472 0 : stat->min_read_latency_ticks = tsc_diff;
1473 0 : }
1474 0 : } else {
1475 0 : stat->bytes_written += num_blocks * blocklen;
1476 0 : stat->num_write_ops++;
1477 0 : stat->write_latency_ticks += tsc_diff;
1478 0 : if (stat->max_write_latency_ticks < tsc_diff) {
1479 0 : stat->max_write_latency_ticks = tsc_diff;
1480 0 : }
1481 0 : if (stat->min_write_latency_ticks > tsc_diff) {
1482 0 : stat->min_write_latency_ticks = tsc_diff;
1483 0 : }
1484 : }
1485 0 : break;
1486 : case SPDK_BDEV_IO_TYPE_COPY:
1487 0 : stat->bytes_copied += num_blocks * blocklen;
1488 0 : stat->num_copy_ops++;
1489 0 : stat->copy_latency_ticks += tsc_diff;
1490 0 : if (stat->max_copy_latency_ticks < tsc_diff) {
1491 0 : stat->max_copy_latency_ticks = tsc_diff;
1492 0 : }
1493 0 : if (stat->min_copy_latency_ticks > tsc_diff) {
1494 0 : stat->min_copy_latency_ticks = tsc_diff;
1495 0 : }
1496 0 : break;
1497 : default:
1498 0 : break;
1499 : }
1500 20 : }
1501 :
1502 : static bool
1503 11 : bdev_nvme_check_retry_io(struct nvme_bdev_io *bio,
1504 : const struct spdk_nvme_cpl *cpl,
1505 : struct nvme_bdev_channel *nbdev_ch,
1506 : uint64_t *_delay_ms)
1507 : {
1508 11 : struct nvme_io_path *io_path = bio->io_path;
1509 11 : struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
1510 : const struct spdk_nvme_ctrlr_data *cdata;
1511 :
1512 15 : if (spdk_nvme_cpl_is_path_error(cpl) ||
1513 5 : spdk_nvme_cpl_is_aborted_sq_deletion(cpl) ||
1514 0 : !nvme_io_path_is_available(io_path) ||
1515 4 : !nvme_ctrlr_is_available(nvme_ctrlr)) {
1516 15 : bdev_nvme_clear_current_io_path(nbdev_ch);
1517 15 : bio->io_path = NULL;
1518 15 : if (spdk_nvme_cpl_is_ana_error(cpl)) {
1519 1 : if (nvme_ctrlr_read_ana_log_page(nvme_ctrlr) == 0) {
1520 1 : io_path->nvme_ns->ana_state_updating = true;
1521 1 : }
1522 1 : }
1523 3 : if (!any_io_path_may_become_available(nbdev_ch)) {
1524 0 : return false;
1525 : }
1526 3 : *_delay_ms = 0;
1527 3 : } else {
1528 4 : bio->retry_count++;
1529 :
1530 4 : cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
1531 :
1532 4 : if (cpl->status.crd != 0) {
1533 1 : *_delay_ms = cdata->crdt[cpl->status.crd] * 100;
1534 1 : } else {
1535 3 : *_delay_ms = 0;
1536 : }
1537 : }
1538 :
1539 7 : return true;
1540 7 : }
1541 :
1542 : static inline void
1543 40 : bdev_nvme_io_complete_nvme_status(struct nvme_bdev_io *bio,
1544 : const struct spdk_nvme_cpl *cpl)
1545 : {
1546 40 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1547 : struct nvme_bdev_channel *nbdev_ch;
1548 : uint64_t delay_ms;
1549 :
1550 40 : assert(!bdev_nvme_io_type_is_admin(bdev_io->type));
1551 :
1552 40 : if (spdk_likely(spdk_nvme_cpl_is_success(cpl))) {
1553 20 : bdev_nvme_update_io_path_stat(bio);
1554 20 : goto complete;
1555 : }
1556 :
1557 : /* Update error counts before deciding if retry is needed.
1558 : * Hence, error counts may be more than the number of I/O errors.
1559 : */
1560 20 : bdev_nvme_update_nvme_error_stat(bdev_io, cpl);
1561 :
1562 27 : if (cpl->status.dnr != 0 || spdk_nvme_cpl_is_aborted_by_request(cpl) ||
1563 2 : (g_opts.bdev_retry_count != -1 && bio->retry_count >= g_opts.bdev_retry_count)) {
1564 23 : goto complete;
1565 : }
1566 :
1567 : /* At this point we don't know whether the sequence was successfully executed or not, so we
1568 : * cannot retry the IO */
1569 7 : if (bdev_io->u.bdev.accel_sequence != NULL) {
1570 0 : goto complete;
1571 : }
1572 :
1573 7 : nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
1574 :
1575 7 : if (bdev_nvme_check_retry_io(bio, cpl, nbdev_ch, &delay_ms)) {
1576 7 : bdev_nvme_queue_retry_io(nbdev_ch, bio, delay_ms);
1577 7 : return;
1578 : }
1579 :
1580 : complete:
1581 25 : bio->retry_count = 0;
1582 25 : bio->submit_tsc = 0;
1583 25 : bdev_io->u.bdev.accel_sequence = NULL;
1584 25 : __bdev_nvme_io_complete(bdev_io, 0, cpl);
1585 32 : }
1586 :
1587 : static inline void
1588 13 : bdev_nvme_io_complete(struct nvme_bdev_io *bio, int rc)
1589 : {
1590 13 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1591 : struct nvme_bdev_channel *nbdev_ch;
1592 : enum spdk_bdev_io_status io_status;
1593 :
1594 13 : assert(!bdev_nvme_io_type_is_admin(bdev_io->type));
1595 :
1596 13 : switch (rc) {
1597 : case 0:
1598 1 : io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
1599 1 : break;
1600 : case -ENOMEM:
1601 0 : io_status = SPDK_BDEV_IO_STATUS_NOMEM;
1602 0 : break;
1603 : case -ENXIO:
1604 15 : if (g_opts.bdev_retry_count == -1 || bio->retry_count < g_opts.bdev_retry_count) {
1605 12 : nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
1606 :
1607 12 : bdev_nvme_clear_current_io_path(nbdev_ch);
1608 12 : bio->io_path = NULL;
1609 :
1610 12 : if (any_io_path_may_become_available(nbdev_ch)) {
1611 9 : bdev_nvme_queue_retry_io(nbdev_ch, bio, 1000ULL);
1612 9 : return;
1613 : }
1614 3 : }
1615 :
1616 : /* fallthrough */
1617 : default:
1618 3 : spdk_accel_sequence_abort(bdev_io->u.bdev.accel_sequence);
1619 3 : bdev_io->u.bdev.accel_sequence = NULL;
1620 3 : io_status = SPDK_BDEV_IO_STATUS_FAILED;
1621 3 : break;
1622 : }
1623 :
1624 4 : bio->retry_count = 0;
1625 4 : bio->submit_tsc = 0;
1626 4 : __bdev_nvme_io_complete(bdev_io, io_status, NULL);
1627 13 : }
1628 :
1629 : static inline void
1630 4 : bdev_nvme_admin_complete(struct nvme_bdev_io *bio, int rc)
1631 : {
1632 4 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
1633 : enum spdk_bdev_io_status io_status;
1634 :
1635 4 : switch (rc) {
1636 : case 0:
1637 1 : io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
1638 1 : break;
1639 : case -ENOMEM:
1640 0 : io_status = SPDK_BDEV_IO_STATUS_NOMEM;
1641 0 : break;
1642 1 : case -ENXIO:
1643 : /* fallthrough */
1644 : default:
1645 3 : io_status = SPDK_BDEV_IO_STATUS_FAILED;
1646 3 : break;
1647 : }
1648 :
1649 4 : __bdev_nvme_io_complete(bdev_io, io_status, NULL);
1650 4 : }
1651 :
1652 : static void
1653 3 : bdev_nvme_clear_io_path_caches_done(struct nvme_ctrlr *nvme_ctrlr,
1654 : void *ctx, int status)
1655 : {
1656 3 : pthread_mutex_lock(&nvme_ctrlr->mutex);
1657 :
1658 3 : assert(nvme_ctrlr->io_path_cache_clearing == true);
1659 3 : nvme_ctrlr->io_path_cache_clearing = false;
1660 :
1661 3 : if (!nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
1662 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
1663 3 : return;
1664 : }
1665 :
1666 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
1667 :
1668 0 : nvme_ctrlr_unregister(nvme_ctrlr);
1669 3 : }
1670 :
1671 : static void
1672 408 : _bdev_nvme_clear_io_path_cache(struct nvme_qpair *nvme_qpair)
1673 : {
1674 : struct nvme_io_path *io_path;
1675 :
1676 635 : TAILQ_FOREACH(io_path, &nvme_qpair->io_path_list, tailq) {
1677 227 : if (io_path->nbdev_ch == NULL) {
1678 68 : continue;
1679 : }
1680 159 : bdev_nvme_clear_current_io_path(io_path->nbdev_ch);
1681 159 : }
1682 408 : }
1683 :
1684 : static void
1685 1 : bdev_nvme_clear_io_path_cache(struct nvme_ctrlr_channel_iter *i,
1686 : struct nvme_ctrlr *nvme_ctrlr,
1687 : struct nvme_ctrlr_channel *ctrlr_ch,
1688 : void *ctx)
1689 : {
1690 1 : assert(ctrlr_ch->qpair != NULL);
1691 :
1692 1 : _bdev_nvme_clear_io_path_cache(ctrlr_ch->qpair);
1693 :
1694 1 : nvme_ctrlr_for_each_channel_continue(i, 0);
1695 1 : }
1696 :
1697 : static void
1698 3 : bdev_nvme_clear_io_path_caches(struct nvme_ctrlr *nvme_ctrlr)
1699 : {
1700 3 : pthread_mutex_lock(&nvme_ctrlr->mutex);
1701 3 : if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
1702 3 : nvme_ctrlr->io_path_cache_clearing) {
1703 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
1704 0 : return;
1705 : }
1706 :
1707 3 : nvme_ctrlr->io_path_cache_clearing = true;
1708 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
1709 :
1710 3 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
1711 : bdev_nvme_clear_io_path_cache,
1712 : NULL,
1713 : bdev_nvme_clear_io_path_caches_done);
1714 3 : }
1715 :
1716 : static struct nvme_qpair *
1717 117 : nvme_poll_group_get_qpair(struct nvme_poll_group *group, struct spdk_nvme_qpair *qpair)
1718 : {
1719 : struct nvme_qpair *nvme_qpair;
1720 :
1721 134 : TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
1722 134 : if (nvme_qpair->qpair == qpair) {
1723 117 : break;
1724 : }
1725 17 : }
1726 :
1727 117 : return nvme_qpair;
1728 : }
1729 :
1730 : static void nvme_qpair_delete(struct nvme_qpair *nvme_qpair);
1731 :
1732 : static void
1733 117 : bdev_nvme_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
1734 : {
1735 117 : struct nvme_poll_group *group = poll_group_ctx;
1736 : struct nvme_qpair *nvme_qpair;
1737 : struct nvme_ctrlr *nvme_ctrlr;
1738 : struct nvme_ctrlr_channel *ctrlr_ch;
1739 : int status;
1740 :
1741 117 : nvme_qpair = nvme_poll_group_get_qpair(group, qpair);
1742 117 : if (nvme_qpair == NULL) {
1743 0 : return;
1744 : }
1745 :
1746 117 : if (nvme_qpair->qpair != NULL) {
1747 117 : spdk_nvme_ctrlr_free_io_qpair(nvme_qpair->qpair);
1748 117 : nvme_qpair->qpair = NULL;
1749 117 : }
1750 :
1751 117 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
1752 :
1753 117 : nvme_ctrlr = nvme_qpair->ctrlr;
1754 117 : ctrlr_ch = nvme_qpair->ctrlr_ch;
1755 :
1756 117 : if (ctrlr_ch != NULL) {
1757 72 : if (ctrlr_ch->reset_iter != NULL) {
1758 : /* We are in a full reset sequence. */
1759 67 : if (ctrlr_ch->connect_poller != NULL) {
1760 : /* qpair was failed to connect. Abort the reset sequence. */
1761 0 : NVME_CTRLR_INFOLOG(nvme_ctrlr,
1762 : "qpair %p was failed to connect. abort the reset ctrlr sequence.\n",
1763 : qpair);
1764 0 : spdk_poller_unregister(&ctrlr_ch->connect_poller);
1765 0 : status = -1;
1766 0 : } else {
1767 : /* qpair was completed to disconnect. Just move to the next ctrlr_channel. */
1768 67 : NVME_CTRLR_INFOLOG(nvme_ctrlr,
1769 : "qpair %p was disconnected and freed in a reset ctrlr sequence.\n",
1770 : qpair);
1771 67 : status = 0;
1772 : }
1773 67 : nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, status);
1774 67 : ctrlr_ch->reset_iter = NULL;
1775 67 : } else {
1776 : /* qpair was disconnected unexpectedly. Reset controller for recovery. */
1777 5 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpair %p was disconnected and freed. reset controller.\n",
1778 : qpair);
1779 5 : bdev_nvme_failover_ctrlr(nvme_ctrlr);
1780 : }
1781 72 : } else {
1782 : /* In this case, ctrlr_channel is already deleted. */
1783 45 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpair %p was disconnected and freed. delete nvme_qpair.\n",
1784 : qpair);
1785 45 : nvme_qpair_delete(nvme_qpair);
1786 : }
1787 117 : }
1788 :
1789 : static void
1790 0 : bdev_nvme_check_io_qpairs(struct nvme_poll_group *group)
1791 : {
1792 : struct nvme_qpair *nvme_qpair;
1793 :
1794 0 : TAILQ_FOREACH(nvme_qpair, &group->qpair_list, tailq) {
1795 0 : if (nvme_qpair->qpair == NULL || nvme_qpair->ctrlr_ch == NULL) {
1796 0 : continue;
1797 : }
1798 :
1799 0 : if (spdk_nvme_qpair_get_failure_reason(nvme_qpair->qpair) !=
1800 : SPDK_NVME_QPAIR_FAILURE_NONE) {
1801 0 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
1802 0 : }
1803 0 : }
1804 0 : }
1805 :
1806 : static int
1807 1470 : bdev_nvme_poll(void *arg)
1808 : {
1809 1470 : struct nvme_poll_group *group = arg;
1810 : int64_t num_completions;
1811 :
1812 1470 : if (group->collect_spin_stat && group->start_ticks == 0) {
1813 0 : group->start_ticks = spdk_get_ticks();
1814 0 : }
1815 :
1816 1470 : num_completions = spdk_nvme_poll_group_process_completions(group->group, 0,
1817 : bdev_nvme_disconnected_qpair_cb);
1818 1470 : if (group->collect_spin_stat) {
1819 0 : if (num_completions > 0) {
1820 0 : if (group->end_ticks != 0) {
1821 0 : group->spin_ticks += (group->end_ticks - group->start_ticks);
1822 0 : group->end_ticks = 0;
1823 0 : }
1824 0 : group->start_ticks = 0;
1825 0 : } else {
1826 0 : group->end_ticks = spdk_get_ticks();
1827 : }
1828 0 : }
1829 :
1830 1470 : if (spdk_unlikely(num_completions < 0)) {
1831 0 : bdev_nvme_check_io_qpairs(group);
1832 0 : }
1833 :
1834 1470 : return num_completions > 0 ? SPDK_POLLER_BUSY : SPDK_POLLER_IDLE;
1835 : }
1836 :
1837 : static int bdev_nvme_poll_adminq(void *arg);
1838 :
1839 : static void
1840 140 : bdev_nvme_change_adminq_poll_period(struct nvme_ctrlr *nvme_ctrlr, uint64_t new_period_us)
1841 : {
1842 140 : if (spdk_interrupt_mode_is_enabled()) {
1843 0 : return;
1844 : }
1845 :
1846 140 : spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller);
1847 :
1848 140 : nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq,
1849 : nvme_ctrlr, new_period_us);
1850 140 : }
1851 :
1852 : static int
1853 187 : bdev_nvme_poll_adminq(void *arg)
1854 : {
1855 : int32_t rc;
1856 187 : struct nvme_ctrlr *nvme_ctrlr = arg;
1857 : nvme_ctrlr_disconnected_cb disconnected_cb;
1858 :
1859 187 : assert(nvme_ctrlr != NULL);
1860 :
1861 187 : rc = spdk_nvme_ctrlr_process_admin_completions(nvme_ctrlr->ctrlr);
1862 187 : if (rc < 0) {
1863 85 : disconnected_cb = nvme_ctrlr->disconnected_cb;
1864 85 : nvme_ctrlr->disconnected_cb = NULL;
1865 :
1866 85 : if (disconnected_cb != NULL) {
1867 140 : bdev_nvme_change_adminq_poll_period(nvme_ctrlr,
1868 70 : g_opts.nvme_adminq_poll_period_us);
1869 70 : disconnected_cb(nvme_ctrlr);
1870 70 : } else {
1871 15 : bdev_nvme_failover_ctrlr(nvme_ctrlr);
1872 : }
1873 187 : } else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) !=
1874 : SPDK_NVME_QPAIR_FAILURE_NONE) {
1875 0 : bdev_nvme_clear_io_path_caches(nvme_ctrlr);
1876 0 : }
1877 :
1878 187 : return rc == 0 ? SPDK_POLLER_IDLE : SPDK_POLLER_BUSY;
1879 : }
1880 :
1881 : static void
1882 38 : nvme_bdev_free(void *io_device)
1883 : {
1884 38 : struct nvme_bdev *nvme_disk = io_device;
1885 :
1886 38 : pthread_mutex_destroy(&nvme_disk->mutex);
1887 38 : free(nvme_disk->disk.name);
1888 38 : free(nvme_disk->err_stat);
1889 38 : free(nvme_disk);
1890 38 : }
1891 :
1892 : static int
1893 37 : bdev_nvme_destruct(void *ctx)
1894 : {
1895 37 : struct nvme_bdev *nvme_disk = ctx;
1896 : struct nvme_ns *nvme_ns, *tmp_nvme_ns;
1897 :
1898 : SPDK_DTRACE_PROBE2(bdev_nvme_destruct, nvme_disk->nbdev_ctrlr->name, nvme_disk->nsid);
1899 :
1900 75 : TAILQ_FOREACH_SAFE(nvme_ns, &nvme_disk->nvme_ns_list, tailq, tmp_nvme_ns) {
1901 38 : pthread_mutex_lock(&nvme_ns->ctrlr->mutex);
1902 :
1903 38 : nvme_ns->bdev = NULL;
1904 :
1905 38 : assert(nvme_ns->id > 0);
1906 :
1907 38 : if (nvme_ctrlr_get_ns(nvme_ns->ctrlr, nvme_ns->id) == NULL) {
1908 0 : pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
1909 :
1910 0 : nvme_ctrlr_release(nvme_ns->ctrlr);
1911 0 : nvme_ns_free(nvme_ns);
1912 0 : } else {
1913 38 : pthread_mutex_unlock(&nvme_ns->ctrlr->mutex);
1914 : }
1915 38 : }
1916 :
1917 37 : pthread_mutex_lock(&g_bdev_nvme_mutex);
1918 37 : TAILQ_REMOVE(&nvme_disk->nbdev_ctrlr->bdevs, nvme_disk, tailq);
1919 37 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
1920 :
1921 37 : spdk_io_device_unregister(nvme_disk, nvme_bdev_free);
1922 :
1923 37 : return 0;
1924 : }
1925 :
1926 : static int
1927 118 : bdev_nvme_create_qpair(struct nvme_qpair *nvme_qpair)
1928 : {
1929 : struct nvme_ctrlr *nvme_ctrlr;
1930 : struct spdk_nvme_io_qpair_opts opts;
1931 : struct spdk_nvme_qpair *qpair;
1932 : int rc;
1933 :
1934 118 : nvme_ctrlr = nvme_qpair->ctrlr;
1935 :
1936 118 : spdk_nvme_ctrlr_get_default_io_qpair_opts(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
1937 118 : opts.create_only = true;
1938 : /* In interrupt mode qpairs must be created in sync mode, else it will never be connected.
1939 : * delay_cmd_submit must be false as in interrupt mode requests cannot be submitted in
1940 : * completion context.
1941 : */
1942 118 : if (!spdk_interrupt_mode_is_enabled()) {
1943 118 : opts.async_mode = true;
1944 118 : opts.delay_cmd_submit = g_opts.delay_cmd_submit;
1945 118 : }
1946 118 : opts.io_queue_requests = spdk_max(g_opts.io_queue_requests, opts.io_queue_requests);
1947 118 : g_opts.io_queue_requests = opts.io_queue_requests;
1948 :
1949 118 : qpair = spdk_nvme_ctrlr_alloc_io_qpair(nvme_ctrlr->ctrlr, &opts, sizeof(opts));
1950 118 : if (qpair == NULL) {
1951 0 : return -1;
1952 : }
1953 :
1954 : SPDK_DTRACE_PROBE3(bdev_nvme_create_qpair, nvme_ctrlr->nbdev_ctrlr->name,
1955 : spdk_nvme_qpair_get_id(qpair), spdk_thread_get_id(nvme_ctrlr->thread));
1956 :
1957 118 : assert(nvme_qpair->group != NULL);
1958 :
1959 118 : rc = spdk_nvme_poll_group_add(nvme_qpair->group->group, qpair);
1960 118 : if (rc != 0) {
1961 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to begin polling on NVMe Channel.\n");
1962 0 : goto err;
1963 : }
1964 :
1965 118 : rc = spdk_nvme_ctrlr_connect_io_qpair(nvme_ctrlr->ctrlr, qpair);
1966 118 : if (rc != 0) {
1967 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to connect I/O qpair.\n");
1968 0 : goto err;
1969 : }
1970 :
1971 118 : nvme_qpair->qpair = qpair;
1972 :
1973 118 : if (!g_opts.disable_auto_failback) {
1974 85 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
1975 85 : }
1976 :
1977 118 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Connecting qpair %p:%u started.\n",
1978 : qpair, spdk_nvme_qpair_get_id(qpair));
1979 :
1980 118 : return 0;
1981 :
1982 : err:
1983 0 : spdk_nvme_ctrlr_free_io_qpair(qpair);
1984 :
1985 0 : return rc;
1986 118 : }
1987 :
1988 : static void bdev_nvme_reset_io_continue(void *cb_arg, int rc);
1989 :
1990 : static void
1991 122 : bdev_nvme_complete_pending_resets(struct nvme_ctrlr_channel_iter *i,
1992 : struct nvme_ctrlr *nvme_ctrlr,
1993 : struct nvme_ctrlr_channel *ctrlr_ch,
1994 : void *ctx)
1995 : {
1996 122 : int rc = 0;
1997 : struct nvme_bdev_io *bio;
1998 :
1999 122 : if (ctx != NULL) {
2000 59 : rc = -1;
2001 59 : }
2002 :
2003 133 : while (!TAILQ_EMPTY(&ctrlr_ch->pending_resets)) {
2004 11 : bio = TAILQ_FIRST(&ctrlr_ch->pending_resets);
2005 11 : TAILQ_REMOVE(&ctrlr_ch->pending_resets, bio, retry_link);
2006 :
2007 11 : bdev_nvme_reset_io_continue(bio, rc);
2008 : }
2009 :
2010 122 : nvme_ctrlr_for_each_channel_continue(i, 0);
2011 122 : }
2012 :
2013 : /* This function marks the current trid as failed by storing the current ticks
2014 : * and then sets the next trid to the active trid within a controller if exists.
2015 : *
2016 : * The purpose of the boolean return value is to request the caller to disconnect
2017 : * the current trid now to try connecting the next trid.
2018 : */
2019 : static bool
2020 61 : bdev_nvme_failover_trid(struct nvme_ctrlr *nvme_ctrlr, bool remove, bool start)
2021 : {
2022 : struct nvme_path_id *path_id, *next_path;
2023 : int rc __attribute__((unused));
2024 :
2025 61 : path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
2026 61 : assert(path_id);
2027 61 : assert(path_id == nvme_ctrlr->active_path_id);
2028 61 : next_path = TAILQ_NEXT(path_id, link);
2029 :
2030 : /* Update the last failed time. It means the trid is failed if its last
2031 : * failed time is non-zero.
2032 : */
2033 61 : path_id->last_failed_tsc = spdk_get_ticks();
2034 :
2035 61 : if (next_path == NULL) {
2036 : /* There is no alternate trid within a controller. */
2037 50 : return false;
2038 : }
2039 :
2040 11 : if (!start && nvme_ctrlr->opts.reconnect_delay_sec == 0) {
2041 : /* Connect is not retried in a controller reset sequence. Connecting
2042 : * the next trid will be done by the next bdev_nvme_failover_ctrlr() call.
2043 : */
2044 3 : return false;
2045 : }
2046 :
2047 8 : assert(path_id->trid.trtype != SPDK_NVME_TRANSPORT_PCIE);
2048 :
2049 8 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Start failover from %s:%s to %s:%s\n",
2050 : path_id->trid.traddr, path_id->trid.trsvcid,
2051 : next_path->trid.traddr, next_path->trid.trsvcid);
2052 :
2053 8 : spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
2054 8 : nvme_ctrlr->active_path_id = next_path;
2055 8 : rc = spdk_nvme_ctrlr_set_trid(nvme_ctrlr->ctrlr, &next_path->trid);
2056 8 : assert(rc == 0);
2057 8 : TAILQ_REMOVE(&nvme_ctrlr->trids, path_id, link);
2058 8 : if (!remove) {
2059 : /** Shuffle the old trid to the end of the list and use the new one.
2060 : * Allows for round robin through multiple connections.
2061 : */
2062 6 : TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, path_id, link);
2063 6 : } else {
2064 2 : free(path_id);
2065 : }
2066 :
2067 8 : if (start || next_path->last_failed_tsc == 0) {
2068 : /* bdev_nvme_failover_ctrlr() is just called or the next trid is not failed
2069 : * or used yet. Try the next trid now.
2070 : */
2071 7 : return true;
2072 : }
2073 :
2074 2 : if (spdk_get_ticks() > next_path->last_failed_tsc + spdk_get_ticks_hz() *
2075 1 : nvme_ctrlr->opts.reconnect_delay_sec) {
2076 : /* Enough backoff passed since the next trid failed. Try the next trid now. */
2077 0 : return true;
2078 : }
2079 :
2080 : /* The next trid will be tried after reconnect_delay_sec seconds. */
2081 1 : return false;
2082 61 : }
2083 :
2084 : static bool
2085 88 : bdev_nvme_check_ctrlr_loss_timeout(struct nvme_ctrlr *nvme_ctrlr)
2086 : {
2087 : int32_t elapsed;
2088 :
2089 88 : if (nvme_ctrlr->opts.ctrlr_loss_timeout_sec == 0 ||
2090 37 : nvme_ctrlr->opts.ctrlr_loss_timeout_sec == -1) {
2091 62 : return false;
2092 : }
2093 :
2094 26 : elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
2095 26 : if (elapsed >= nvme_ctrlr->opts.ctrlr_loss_timeout_sec) {
2096 6 : return true;
2097 : } else {
2098 20 : return false;
2099 : }
2100 88 : }
2101 :
2102 : static bool
2103 12 : bdev_nvme_check_fast_io_fail_timeout(struct nvme_ctrlr *nvme_ctrlr)
2104 : {
2105 : uint32_t elapsed;
2106 :
2107 12 : if (nvme_ctrlr->opts.fast_io_fail_timeout_sec == 0) {
2108 8 : return false;
2109 : }
2110 :
2111 4 : elapsed = (spdk_get_ticks() - nvme_ctrlr->reset_start_tsc) / spdk_get_ticks_hz();
2112 4 : if (elapsed >= nvme_ctrlr->opts.fast_io_fail_timeout_sec) {
2113 2 : return true;
2114 : } else {
2115 2 : return false;
2116 : }
2117 12 : }
2118 :
2119 : static void bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success);
2120 :
2121 : static void
2122 71 : nvme_ctrlr_disconnect(struct nvme_ctrlr *nvme_ctrlr, nvme_ctrlr_disconnected_cb cb_fn)
2123 : {
2124 : int rc;
2125 :
2126 71 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start disconnecting ctrlr.\n");
2127 :
2128 71 : rc = spdk_nvme_ctrlr_disconnect(nvme_ctrlr->ctrlr);
2129 71 : if (rc != 0) {
2130 1 : NVME_CTRLR_WARNLOG(nvme_ctrlr, "disconnecting ctrlr failed.\n");
2131 :
2132 : /* Disconnect fails if ctrlr is already resetting or removed. In this case,
2133 : * fail the reset sequence immediately.
2134 : */
2135 1 : bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
2136 1 : return;
2137 : }
2138 :
2139 : /* spdk_nvme_ctrlr_disconnect() may complete asynchronously later by polling adminq.
2140 : * Set callback here to execute the specified operation after ctrlr is really disconnected.
2141 : */
2142 70 : assert(nvme_ctrlr->disconnected_cb == NULL);
2143 70 : nvme_ctrlr->disconnected_cb = cb_fn;
2144 :
2145 : /* During disconnection, reduce the period to poll adminq more often. */
2146 70 : bdev_nvme_change_adminq_poll_period(nvme_ctrlr, 0);
2147 71 : }
2148 :
/* Follow-up operation chosen by bdev_nvme_check_op_after_reset() once a
 * controller reset (or disable) sequence has finished.
 */
typedef enum bdev_nvme_op_after_reset {
	/* Nothing further to do. */
	OP_NONE,
	/* The controller can be unregistered now; finish its pending destruct. */
	OP_COMPLETE_PENDING_DESTRUCT,
	/* ctrlr_loss_timeout_sec expired; delete the controller. */
	OP_DESTRUCT,
	/* Disconnect and retry the connection after reconnect_delay_sec. */
	OP_DELAYED_RECONNECT,
	/* A failover was requested during the reset; start it now. */
	OP_FAILOVER,
} _bdev_nvme_op_after_reset;
2158 :
2159 : static _bdev_nvme_op_after_reset
2160 70 : bdev_nvme_check_op_after_reset(struct nvme_ctrlr *nvme_ctrlr, bool success)
2161 : {
2162 70 : if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
2163 : /* Complete pending destruct after reset completes. */
2164 0 : return OP_COMPLETE_PENDING_DESTRUCT;
2165 70 : } else if (nvme_ctrlr->pending_failover) {
2166 3 : nvme_ctrlr->pending_failover = false;
2167 3 : nvme_ctrlr->reset_start_tsc = 0;
2168 3 : return OP_FAILOVER;
2169 67 : } else if (success || nvme_ctrlr->opts.reconnect_delay_sec == 0) {
2170 53 : nvme_ctrlr->reset_start_tsc = 0;
2171 53 : return OP_NONE;
2172 14 : } else if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
2173 2 : return OP_DESTRUCT;
2174 : } else {
2175 12 : if (bdev_nvme_check_fast_io_fail_timeout(nvme_ctrlr)) {
2176 2 : nvme_ctrlr->fast_io_fail_timedout = true;
2177 2 : }
2178 12 : return OP_DELAYED_RECONNECT;
2179 : }
2180 70 : }
2181 :
2182 : static int bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug);
2183 : static void bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr);
2184 :
2185 : static int
2186 9 : bdev_nvme_reconnect_delay_timer_expired(void *ctx)
2187 : {
2188 9 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2189 :
2190 : SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect_delay, nvme_ctrlr->nbdev_ctrlr->name);
2191 9 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2192 :
2193 9 : spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
2194 :
2195 9 : if (!nvme_ctrlr->reconnect_is_delayed) {
2196 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2197 0 : return SPDK_POLLER_BUSY;
2198 : }
2199 :
2200 9 : nvme_ctrlr->reconnect_is_delayed = false;
2201 :
2202 9 : if (nvme_ctrlr->destruct) {
2203 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2204 0 : return SPDK_POLLER_BUSY;
2205 : }
2206 :
2207 9 : assert(nvme_ctrlr->resetting == false);
2208 9 : nvme_ctrlr->resetting = true;
2209 :
2210 9 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2211 :
2212 9 : spdk_poller_resume(nvme_ctrlr->adminq_timer_poller);
2213 :
2214 9 : bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
2215 9 : return SPDK_POLLER_BUSY;
2216 9 : }
2217 :
2218 : static void
2219 12 : bdev_nvme_start_reconnect_delay_timer(struct nvme_ctrlr *nvme_ctrlr)
2220 : {
2221 12 : spdk_poller_pause(nvme_ctrlr->adminq_timer_poller);
2222 :
2223 12 : assert(nvme_ctrlr->reconnect_is_delayed == false);
2224 12 : nvme_ctrlr->reconnect_is_delayed = true;
2225 :
2226 12 : assert(nvme_ctrlr->reconnect_delay_timer == NULL);
2227 12 : nvme_ctrlr->reconnect_delay_timer = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_delay_timer_expired,
2228 : nvme_ctrlr,
2229 : nvme_ctrlr->opts.reconnect_delay_sec * SPDK_SEC_TO_USEC);
2230 12 : }
2231 :
2232 : static void remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr);
2233 :
2234 : static void
2235 68 : _bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2236 : {
2237 68 : bool success = (ctx == NULL);
2238 68 : bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn;
2239 68 : void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg;
2240 : enum bdev_nvme_op_after_reset op_after_reset;
2241 :
2242 68 : assert(nvme_ctrlr->thread == spdk_get_thread());
2243 :
2244 68 : nvme_ctrlr->ctrlr_op_cb_fn = NULL;
2245 68 : nvme_ctrlr->ctrlr_op_cb_arg = NULL;
2246 :
2247 68 : if (!success) {
2248 33 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Resetting controller failed.\n");
2249 33 : } else {
2250 35 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Resetting controller successful.\n");
2251 : }
2252 :
2253 68 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2254 68 : nvme_ctrlr->resetting = false;
2255 68 : nvme_ctrlr->dont_retry = false;
2256 68 : nvme_ctrlr->in_failover = false;
2257 :
2258 68 : op_after_reset = bdev_nvme_check_op_after_reset(nvme_ctrlr, success);
2259 68 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2260 :
2261 : /* Delay callbacks when the next operation is a failover. */
2262 68 : if (ctrlr_op_cb_fn && op_after_reset != OP_FAILOVER) {
2263 17 : ctrlr_op_cb_fn(ctrlr_op_cb_arg, success ? 0 : -1);
2264 17 : }
2265 :
2266 68 : switch (op_after_reset) {
2267 : case OP_COMPLETE_PENDING_DESTRUCT:
2268 0 : nvme_ctrlr_unregister(nvme_ctrlr);
2269 0 : break;
2270 : case OP_DESTRUCT:
2271 2 : bdev_nvme_delete_ctrlr(nvme_ctrlr, false);
2272 2 : remove_discovery_entry(nvme_ctrlr);
2273 2 : break;
2274 : case OP_DELAYED_RECONNECT:
2275 12 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_start_reconnect_delay_timer);
2276 12 : break;
2277 : case OP_FAILOVER:
2278 3 : nvme_ctrlr->ctrlr_op_cb_fn = ctrlr_op_cb_fn;
2279 3 : nvme_ctrlr->ctrlr_op_cb_arg = ctrlr_op_cb_arg;
2280 3 : bdev_nvme_failover_ctrlr(nvme_ctrlr);
2281 3 : break;
2282 : default:
2283 51 : break;
2284 : }
2285 68 : }
2286 :
2287 : static void
2288 70 : bdev_nvme_reset_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, bool success)
2289 : {
2290 70 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2291 70 : if (!success) {
2292 : /* Connecting the active trid failed. Set the next alternate trid to the
2293 : * active trid if it exists.
2294 : */
2295 35 : if (bdev_nvme_failover_trid(nvme_ctrlr, false, false)) {
2296 : /* The next alternate trid exists and is ready to try. Try it now. */
2297 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2298 :
2299 2 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Try the next alternate trid %s:%s now.\n",
2300 : nvme_ctrlr->active_path_id->trid.traddr,
2301 : nvme_ctrlr->active_path_id->trid.trsvcid);
2302 :
2303 2 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr);
2304 2 : return;
2305 : }
2306 :
2307 : /* We came here if there is no alternate trid or if the next trid exists but
2308 : * is not ready to try. We will try the active trid after reconnect_delay_sec
2309 : * seconds if it is non-zero or at the next reset call otherwise.
2310 : */
2311 33 : } else {
2312 : /* Connecting the active trid succeeded. Clear the last failed time because it
2313 : * means the trid is failed if its last failed time is non-zero.
2314 : */
2315 35 : nvme_ctrlr->active_path_id->last_failed_tsc = 0;
2316 : }
2317 68 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2318 :
2319 68 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Clear pending resets.\n");
2320 :
2321 : /* Make sure we clear any pending resets before returning. */
2322 136 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2323 : bdev_nvme_complete_pending_resets,
2324 68 : success ? NULL : (void *)0x1,
2325 : _bdev_nvme_reset_ctrlr_complete);
2326 70 : }
2327 :
2328 : static void
2329 0 : bdev_nvme_reset_create_qpairs_failed(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2330 : {
2331 0 : bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
2332 0 : }
2333 :
2334 : static void
2335 102 : bdev_nvme_reset_destroy_qpair(struct nvme_ctrlr_channel_iter *i,
2336 : struct nvme_ctrlr *nvme_ctrlr,
2337 : struct nvme_ctrlr_channel *ctrlr_ch, void *ctx)
2338 : {
2339 : struct nvme_qpair *nvme_qpair;
2340 : struct spdk_nvme_qpair *qpair;
2341 :
2342 102 : nvme_qpair = ctrlr_ch->qpair;
2343 102 : assert(nvme_qpair != NULL);
2344 :
2345 102 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
2346 :
2347 102 : qpair = nvme_qpair->qpair;
2348 102 : if (qpair != NULL) {
2349 67 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start disconnecting qpair %p:%u.\n",
2350 : qpair, spdk_nvme_qpair_get_id(qpair));
2351 :
2352 67 : if (nvme_qpair->ctrlr->dont_retry) {
2353 53 : spdk_nvme_qpair_set_abort_dnr(qpair, true);
2354 53 : }
2355 67 : spdk_nvme_ctrlr_disconnect_io_qpair(qpair);
2356 :
2357 : /* The current full reset sequence will move to the next
2358 : * ctrlr_channel after the qpair is actually disconnected.
2359 : */
2360 67 : assert(ctrlr_ch->reset_iter == NULL);
2361 67 : ctrlr_ch->reset_iter = i;
2362 67 : } else {
2363 35 : nvme_ctrlr_for_each_channel_continue(i, 0);
2364 : }
2365 102 : }
2366 :
2367 : static void
2368 35 : bdev_nvme_reset_create_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2369 : {
2370 35 : if (status == 0) {
2371 35 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were created after ctrlr reset.\n");
2372 :
2373 35 : bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, true);
2374 35 : } else {
2375 0 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were failed to create after ctrlr reset.\n");
2376 :
2377 : /* Delete the added qpairs and quiesce ctrlr to make the states clean. */
2378 0 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2379 : bdev_nvme_reset_destroy_qpair,
2380 : NULL,
2381 : bdev_nvme_reset_create_qpairs_failed);
2382 : }
2383 35 : }
2384 :
2385 : static int
2386 59 : bdev_nvme_reset_check_qpair_connected(void *ctx)
2387 : {
2388 59 : struct nvme_ctrlr_channel *ctrlr_ch = ctx;
2389 59 : struct nvme_qpair *nvme_qpair = ctrlr_ch->qpair;
2390 : struct spdk_nvme_qpair *qpair;
2391 :
2392 59 : if (ctrlr_ch->reset_iter == NULL) {
2393 : /* qpair was already failed to connect and the reset sequence is being aborted. */
2394 0 : assert(ctrlr_ch->connect_poller == NULL);
2395 0 : assert(nvme_qpair->qpair == NULL);
2396 :
2397 0 : NVME_CTRLR_INFOLOG(nvme_qpair->ctrlr,
2398 : "qpair was already failed to connect. reset is being aborted.\n");
2399 0 : return SPDK_POLLER_BUSY;
2400 : }
2401 :
2402 59 : qpair = nvme_qpair->qpair;
2403 59 : assert(qpair != NULL);
2404 :
2405 59 : if (!spdk_nvme_qpair_is_connected(qpair)) {
2406 0 : return SPDK_POLLER_BUSY;
2407 : }
2408 :
2409 59 : NVME_CTRLR_INFOLOG(nvme_qpair->ctrlr, "qpair %p:%u was connected.\n",
2410 : qpair, spdk_nvme_qpair_get_id(qpair));
2411 :
2412 59 : spdk_poller_unregister(&ctrlr_ch->connect_poller);
2413 :
2414 : /* qpair was completed to connect. Move to the next ctrlr_channel */
2415 59 : nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0);
2416 59 : ctrlr_ch->reset_iter = NULL;
2417 :
2418 59 : if (!g_opts.disable_auto_failback) {
2419 44 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
2420 44 : }
2421 :
2422 59 : return SPDK_POLLER_BUSY;
2423 59 : }
2424 :
2425 : static void
2426 59 : bdev_nvme_reset_create_qpair(struct nvme_ctrlr_channel_iter *i,
2427 : struct nvme_ctrlr *nvme_ctrlr,
2428 : struct nvme_ctrlr_channel *ctrlr_ch,
2429 : void *ctx)
2430 : {
2431 59 : struct nvme_qpair *nvme_qpair = ctrlr_ch->qpair;
2432 : struct spdk_nvme_qpair *qpair;
2433 : int rc;
2434 :
2435 59 : rc = bdev_nvme_create_qpair(nvme_qpair);
2436 59 : if (rc == 0) {
2437 59 : ctrlr_ch->connect_poller = SPDK_POLLER_REGISTER(bdev_nvme_reset_check_qpair_connected,
2438 : ctrlr_ch, 0);
2439 :
2440 59 : qpair = nvme_qpair->qpair;
2441 :
2442 59 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start checking qpair %p:%u to be connected.\n",
2443 : qpair, spdk_nvme_qpair_get_id(qpair));
2444 :
2445 : /* The current full reset sequence will move to the next
2446 : * ctrlr_channel after the qpair is actually connected.
2447 : */
2448 59 : assert(ctrlr_ch->reset_iter == NULL);
2449 59 : ctrlr_ch->reset_iter = i;
2450 59 : } else {
2451 0 : nvme_ctrlr_for_each_channel_continue(i, rc);
2452 : }
2453 59 : }
2454 :
2455 : static void
2456 35 : nvme_ctrlr_check_namespaces(struct nvme_ctrlr *nvme_ctrlr)
2457 : {
2458 35 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
2459 : struct nvme_ns *nvme_ns;
2460 :
2461 55 : for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
2462 55 : nvme_ns != NULL;
2463 20 : nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) {
2464 20 : if (!spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) {
2465 1 : SPDK_DEBUGLOG(bdev_nvme, "NSID %u was removed during reset.\n", nvme_ns->id);
2466 : /* NS can be added again. Just nullify nvme_ns->ns. */
2467 1 : nvme_ns->ns = NULL;
2468 1 : }
2469 20 : }
2470 35 : }
2471 :
2472 :
2473 : static int
2474 69 : bdev_nvme_reconnect_ctrlr_poll(void *arg)
2475 : {
2476 69 : struct nvme_ctrlr *nvme_ctrlr = arg;
2477 : struct spdk_nvme_transport_id *trid;
2478 69 : int rc = -ETIMEDOUT;
2479 :
2480 69 : if (bdev_nvme_check_ctrlr_loss_timeout(nvme_ctrlr)) {
2481 : /* Mark the ctrlr as failed. The next call to
2482 : * spdk_nvme_ctrlr_reconnect_poll_async() will then
2483 : * do the necessary cleanup and return failure.
2484 : */
2485 2 : spdk_nvme_ctrlr_fail(nvme_ctrlr->ctrlr);
2486 2 : }
2487 :
2488 69 : rc = spdk_nvme_ctrlr_reconnect_poll_async(nvme_ctrlr->ctrlr);
2489 69 : if (rc == -EAGAIN) {
2490 0 : return SPDK_POLLER_BUSY;
2491 : }
2492 :
2493 69 : spdk_poller_unregister(&nvme_ctrlr->reset_detach_poller);
2494 69 : if (rc == 0) {
2495 35 : trid = &nvme_ctrlr->active_path_id->trid;
2496 :
2497 35 : if (spdk_nvme_trtype_is_fabrics(trid->trtype)) {
2498 35 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was connected to %s:%s. Create qpairs.\n",
2499 : trid->traddr, trid->trsvcid);
2500 35 : } else {
2501 0 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was connected. Create qpairs.\n");
2502 : }
2503 :
2504 35 : nvme_ctrlr_check_namespaces(nvme_ctrlr);
2505 :
2506 : /* Recreate all of the I/O queue pairs */
2507 35 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2508 : bdev_nvme_reset_create_qpair,
2509 : NULL,
2510 : bdev_nvme_reset_create_qpairs_done);
2511 35 : } else {
2512 34 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr could not be connected.\n");
2513 :
2514 34 : bdev_nvme_reset_ctrlr_complete(nvme_ctrlr, false);
2515 : }
2516 69 : return SPDK_POLLER_BUSY;
2517 69 : }
2518 :
2519 : static void
2520 69 : bdev_nvme_reconnect_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2521 : {
2522 69 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Start reconnecting ctrlr.\n");
2523 :
2524 69 : spdk_nvme_ctrlr_reconnect_async(nvme_ctrlr->ctrlr);
2525 :
2526 : SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reconnect, nvme_ctrlr->nbdev_ctrlr->name);
2527 69 : assert(nvme_ctrlr->reset_detach_poller == NULL);
2528 69 : nvme_ctrlr->reset_detach_poller = SPDK_POLLER_REGISTER(bdev_nvme_reconnect_ctrlr_poll,
2529 : nvme_ctrlr, 0);
2530 69 : }
2531 :
2532 : static void
2533 56 : bdev_nvme_reset_destroy_qpair_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2534 : {
2535 : SPDK_DTRACE_PROBE1(bdev_nvme_ctrlr_reset, nvme_ctrlr->nbdev_ctrlr->name);
2536 56 : assert(status == 0);
2537 :
2538 56 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "qpairs were deleted.\n");
2539 :
2540 56 : if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2541 0 : bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
2542 0 : } else {
2543 56 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reconnect_ctrlr);
2544 : }
2545 56 : }
2546 :
2547 : static void
2548 56 : bdev_nvme_reset_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr)
2549 : {
2550 56 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Delete qpairs for reset.\n");
2551 :
2552 56 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2553 : bdev_nvme_reset_destroy_qpair,
2554 : NULL,
2555 : bdev_nvme_reset_destroy_qpair_done);
2556 56 : }
2557 :
2558 : static void
2559 3 : bdev_nvme_reconnect_ctrlr_now(void *ctx)
2560 : {
2561 3 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2562 :
2563 3 : assert(nvme_ctrlr->resetting == true);
2564 3 : assert(nvme_ctrlr->thread == spdk_get_thread());
2565 :
2566 3 : spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
2567 :
2568 3 : spdk_poller_resume(nvme_ctrlr->adminq_timer_poller);
2569 :
2570 3 : bdev_nvme_reconnect_ctrlr(nvme_ctrlr);
2571 3 : }
2572 :
2573 : static void
2574 56 : _bdev_nvme_reset_ctrlr(void *ctx)
2575 : {
2576 56 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2577 :
2578 56 : assert(nvme_ctrlr->resetting == true);
2579 56 : assert(nvme_ctrlr->thread == spdk_get_thread());
2580 :
2581 56 : if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2582 0 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_reset_destroy_qpairs);
2583 0 : } else {
2584 56 : bdev_nvme_reset_destroy_qpairs(nvme_ctrlr);
2585 : }
2586 56 : }
2587 :
2588 : static int
2589 49 : bdev_nvme_reset_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2590 : {
2591 : spdk_msg_fn msg_fn;
2592 :
2593 49 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2594 49 : if (nvme_ctrlr->destruct) {
2595 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2596 3 : return -ENXIO;
2597 : }
2598 :
2599 46 : if (nvme_ctrlr->resetting) {
2600 13 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2601 13 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform reset, already in progress.\n");
2602 13 : return -EBUSY;
2603 : }
2604 :
2605 33 : if (nvme_ctrlr->disabled) {
2606 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2607 1 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform reset. Controller is disabled.\n");
2608 1 : return -EALREADY;
2609 : }
2610 :
2611 32 : nvme_ctrlr->resetting = true;
2612 32 : nvme_ctrlr->dont_retry = true;
2613 :
2614 32 : if (nvme_ctrlr->reconnect_is_delayed) {
2615 1 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "Reconnect is already scheduled.\n");
2616 1 : msg_fn = bdev_nvme_reconnect_ctrlr_now;
2617 1 : nvme_ctrlr->reconnect_is_delayed = false;
2618 1 : } else {
2619 31 : msg_fn = _bdev_nvme_reset_ctrlr;
2620 31 : assert(nvme_ctrlr->reset_start_tsc == 0);
2621 : }
2622 :
2623 32 : nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
2624 :
2625 32 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2626 :
2627 32 : spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr);
2628 32 : return 0;
2629 49 : }
2630 :
2631 : static int
2632 3 : bdev_nvme_enable_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2633 : {
2634 3 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2635 3 : if (nvme_ctrlr->destruct) {
2636 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2637 0 : return -ENXIO;
2638 : }
2639 :
2640 3 : if (nvme_ctrlr->resetting) {
2641 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2642 0 : return -EBUSY;
2643 : }
2644 :
2645 3 : if (!nvme_ctrlr->disabled) {
2646 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2647 1 : return -EALREADY;
2648 : }
2649 :
2650 2 : nvme_ctrlr->disabled = false;
2651 2 : nvme_ctrlr->resetting = true;
2652 :
2653 2 : nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
2654 :
2655 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2656 :
2657 2 : spdk_thread_send_msg(nvme_ctrlr->thread, bdev_nvme_reconnect_ctrlr_now, nvme_ctrlr);
2658 2 : return 0;
2659 3 : }
2660 :
2661 : static void
2662 2 : _bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2663 : {
2664 2 : bdev_nvme_ctrlr_op_cb ctrlr_op_cb_fn = nvme_ctrlr->ctrlr_op_cb_fn;
2665 2 : void *ctrlr_op_cb_arg = nvme_ctrlr->ctrlr_op_cb_arg;
2666 : enum bdev_nvme_op_after_reset op_after_disable;
2667 :
2668 2 : assert(nvme_ctrlr->thread == spdk_get_thread());
2669 :
2670 2 : nvme_ctrlr->ctrlr_op_cb_fn = NULL;
2671 2 : nvme_ctrlr->ctrlr_op_cb_arg = NULL;
2672 :
2673 2 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2674 :
2675 2 : nvme_ctrlr->resetting = false;
2676 2 : nvme_ctrlr->dont_retry = false;
2677 :
2678 2 : op_after_disable = bdev_nvme_check_op_after_reset(nvme_ctrlr, true);
2679 :
2680 2 : nvme_ctrlr->disabled = true;
2681 2 : spdk_poller_pause(nvme_ctrlr->adminq_timer_poller);
2682 :
2683 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2684 :
2685 2 : if (ctrlr_op_cb_fn) {
2686 0 : ctrlr_op_cb_fn(ctrlr_op_cb_arg, 0);
2687 0 : }
2688 :
2689 2 : switch (op_after_disable) {
2690 : case OP_COMPLETE_PENDING_DESTRUCT:
2691 0 : nvme_ctrlr_unregister(nvme_ctrlr);
2692 0 : break;
2693 : default:
2694 2 : break;
2695 : }
2696 :
2697 2 : }
2698 :
2699 : static void
2700 2 : bdev_nvme_disable_ctrlr_complete(struct nvme_ctrlr *nvme_ctrlr)
2701 : {
2702 : /* Make sure we clear any pending resets before returning. */
2703 2 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2704 : bdev_nvme_complete_pending_resets,
2705 : NULL,
2706 : _bdev_nvme_disable_ctrlr_complete);
2707 2 : }
2708 :
2709 : static void
2710 1 : bdev_nvme_disable_destroy_qpairs_done(struct nvme_ctrlr *nvme_ctrlr, void *ctx, int status)
2711 : {
2712 1 : assert(status == 0);
2713 :
2714 1 : if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2715 0 : bdev_nvme_disable_ctrlr_complete(nvme_ctrlr);
2716 0 : } else {
2717 1 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_ctrlr_complete);
2718 : }
2719 1 : }
2720 :
2721 : static void
2722 1 : bdev_nvme_disable_destroy_qpairs(struct nvme_ctrlr *nvme_ctrlr)
2723 : {
2724 1 : nvme_ctrlr_for_each_channel(nvme_ctrlr,
2725 : bdev_nvme_reset_destroy_qpair,
2726 : NULL,
2727 : bdev_nvme_disable_destroy_qpairs_done);
2728 1 : }
2729 :
2730 : static void
2731 1 : _bdev_nvme_cancel_reconnect_and_disable_ctrlr(void *ctx)
2732 : {
2733 1 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2734 :
2735 1 : assert(nvme_ctrlr->resetting == true);
2736 1 : assert(nvme_ctrlr->thread == spdk_get_thread());
2737 :
2738 1 : spdk_poller_unregister(&nvme_ctrlr->reconnect_delay_timer);
2739 :
2740 1 : bdev_nvme_disable_ctrlr_complete(nvme_ctrlr);
2741 1 : }
2742 :
2743 : static void
2744 1 : _bdev_nvme_disconnect_and_disable_ctrlr(void *ctx)
2745 : {
2746 1 : struct nvme_ctrlr *nvme_ctrlr = ctx;
2747 :
2748 1 : assert(nvme_ctrlr->resetting == true);
2749 1 : assert(nvme_ctrlr->thread == spdk_get_thread());
2750 :
2751 1 : if (!spdk_nvme_ctrlr_is_fabrics(nvme_ctrlr->ctrlr)) {
2752 0 : nvme_ctrlr_disconnect(nvme_ctrlr, bdev_nvme_disable_destroy_qpairs);
2753 0 : } else {
2754 1 : bdev_nvme_disable_destroy_qpairs(nvme_ctrlr);
2755 : }
2756 1 : }
2757 :
2758 : static int
2759 5 : bdev_nvme_disable_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
2760 : {
2761 : spdk_msg_fn msg_fn;
2762 :
2763 5 : pthread_mutex_lock(&nvme_ctrlr->mutex);
2764 5 : if (nvme_ctrlr->destruct) {
2765 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2766 1 : return -ENXIO;
2767 : }
2768 :
2769 4 : if (nvme_ctrlr->resetting) {
2770 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2771 1 : return -EBUSY;
2772 : }
2773 :
2774 3 : if (nvme_ctrlr->disabled) {
2775 1 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2776 1 : return -EALREADY;
2777 : }
2778 :
2779 2 : nvme_ctrlr->resetting = true;
2780 2 : nvme_ctrlr->dont_retry = true;
2781 :
2782 2 : if (nvme_ctrlr->reconnect_is_delayed) {
2783 1 : msg_fn = _bdev_nvme_cancel_reconnect_and_disable_ctrlr;
2784 1 : nvme_ctrlr->reconnect_is_delayed = false;
2785 1 : } else {
2786 1 : msg_fn = _bdev_nvme_disconnect_and_disable_ctrlr;
2787 : }
2788 :
2789 2 : nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
2790 :
2791 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
2792 :
2793 2 : spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr);
2794 2 : return 0;
2795 5 : }
2796 :
2797 : static int
2798 31 : nvme_ctrlr_op(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op,
2799 : bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg)
2800 : {
2801 : int rc;
2802 :
2803 31 : switch (op) {
2804 : case NVME_CTRLR_OP_RESET:
2805 30 : rc = bdev_nvme_reset_ctrlr(nvme_ctrlr);
2806 30 : break;
2807 : case NVME_CTRLR_OP_ENABLE:
2808 0 : rc = bdev_nvme_enable_ctrlr(nvme_ctrlr);
2809 0 : break;
2810 : case NVME_CTRLR_OP_DISABLE:
2811 0 : rc = bdev_nvme_disable_ctrlr(nvme_ctrlr);
2812 0 : break;
2813 : default:
2814 1 : rc = -EINVAL;
2815 1 : break;
2816 : }
2817 :
2818 31 : if (rc == 0) {
2819 16 : assert(nvme_ctrlr->ctrlr_op_cb_fn == NULL);
2820 16 : assert(nvme_ctrlr->ctrlr_op_cb_arg == NULL);
2821 16 : nvme_ctrlr->ctrlr_op_cb_fn = cb_fn;
2822 16 : nvme_ctrlr->ctrlr_op_cb_arg = cb_arg;
2823 16 : }
2824 31 : return rc;
2825 : }
2826 :
2827 : struct nvme_ctrlr_op_rpc_ctx {
2828 : struct nvme_ctrlr *nvme_ctrlr;
2829 : struct spdk_thread *orig_thread;
2830 : enum nvme_ctrlr_op op;
2831 : int rc;
2832 : bdev_nvme_ctrlr_op_cb cb_fn;
2833 : void *cb_arg;
2834 : };
2835 :
2836 : static void
2837 4 : _nvme_ctrlr_op_rpc_complete(void *_ctx)
2838 : {
2839 4 : struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx;
2840 :
2841 4 : assert(ctx != NULL);
2842 4 : assert(ctx->cb_fn != NULL);
2843 :
2844 4 : ctx->cb_fn(ctx->cb_arg, ctx->rc);
2845 :
2846 4 : free(ctx);
2847 4 : }
2848 :
2849 : static void
2850 4 : nvme_ctrlr_op_rpc_complete(void *cb_arg, int rc)
2851 : {
2852 4 : struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg;
2853 :
2854 4 : ctx->rc = rc;
2855 :
2856 4 : spdk_thread_send_msg(ctx->orig_thread, _nvme_ctrlr_op_rpc_complete, ctx);
2857 4 : }
2858 :
2859 : void
2860 4 : nvme_ctrlr_op_rpc(struct nvme_ctrlr *nvme_ctrlr, enum nvme_ctrlr_op op,
2861 : bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg)
2862 : {
2863 : struct nvme_ctrlr_op_rpc_ctx *ctx;
2864 : int rc;
2865 :
2866 4 : assert(cb_fn != NULL);
2867 :
2868 4 : ctx = calloc(1, sizeof(*ctx));
2869 4 : if (ctx == NULL) {
2870 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate nvme_ctrlr_op_rpc_ctx.\n");
2871 0 : cb_fn(cb_arg, -ENOMEM);
2872 0 : return;
2873 : }
2874 :
2875 4 : ctx->orig_thread = spdk_get_thread();
2876 4 : ctx->cb_fn = cb_fn;
2877 4 : ctx->cb_arg = cb_arg;
2878 :
2879 4 : rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_ctrlr_op_rpc_complete, ctx);
2880 4 : if (rc == 0) {
2881 1 : return;
2882 3 : } else if (rc == -EALREADY) {
2883 0 : rc = 0;
2884 0 : }
2885 :
2886 3 : nvme_ctrlr_op_rpc_complete(ctx, rc);
2887 4 : }
2888 :
2889 : static void nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc);
2890 :
2891 : static void
2892 2 : _nvme_bdev_ctrlr_op_rpc_continue(void *_ctx)
2893 : {
2894 2 : struct nvme_ctrlr_op_rpc_ctx *ctx = _ctx;
2895 : struct nvme_ctrlr *prev_nvme_ctrlr, *next_nvme_ctrlr;
2896 : int rc;
2897 :
2898 2 : prev_nvme_ctrlr = ctx->nvme_ctrlr;
2899 2 : ctx->nvme_ctrlr = NULL;
2900 :
2901 2 : if (ctx->rc != 0) {
2902 0 : goto complete;
2903 : }
2904 :
2905 2 : next_nvme_ctrlr = TAILQ_NEXT(prev_nvme_ctrlr, tailq);
2906 2 : if (next_nvme_ctrlr == NULL) {
2907 1 : goto complete;
2908 : }
2909 :
2910 1 : rc = nvme_ctrlr_op(next_nvme_ctrlr, ctx->op, nvme_bdev_ctrlr_op_rpc_continue, ctx);
2911 1 : if (rc == 0) {
2912 1 : ctx->nvme_ctrlr = next_nvme_ctrlr;
2913 1 : return;
2914 0 : } else if (rc == -EALREADY) {
2915 0 : ctx->nvme_ctrlr = next_nvme_ctrlr;
2916 0 : rc = 0;
2917 0 : }
2918 :
2919 0 : ctx->rc = rc;
2920 :
2921 : complete:
2922 1 : ctx->cb_fn(ctx->cb_arg, ctx->rc);
2923 1 : free(ctx);
2924 2 : }
2925 :
2926 : static void
2927 2 : nvme_bdev_ctrlr_op_rpc_continue(void *cb_arg, int rc)
2928 : {
2929 2 : struct nvme_ctrlr_op_rpc_ctx *ctx = cb_arg;
2930 :
2931 2 : ctx->rc = rc;
2932 :
2933 2 : spdk_thread_send_msg(ctx->orig_thread, _nvme_bdev_ctrlr_op_rpc_continue, ctx);
2934 2 : }
2935 :
2936 : void
2937 1 : nvme_bdev_ctrlr_op_rpc(struct nvme_bdev_ctrlr *nbdev_ctrlr, enum nvme_ctrlr_op op,
2938 : bdev_nvme_ctrlr_op_cb cb_fn, void *cb_arg)
2939 : {
2940 : struct nvme_ctrlr_op_rpc_ctx *ctx;
2941 : struct nvme_ctrlr *nvme_ctrlr;
2942 : int rc;
2943 :
2944 1 : assert(cb_fn != NULL);
2945 :
2946 1 : ctx = calloc(1, sizeof(*ctx));
2947 1 : if (ctx == NULL) {
2948 0 : SPDK_ERRLOG("Failed to allocate nvme_ctrlr_op_rpc_ctx.\n");
2949 0 : cb_fn(cb_arg, -ENOMEM);
2950 0 : return;
2951 : }
2952 :
2953 1 : ctx->orig_thread = spdk_get_thread();
2954 1 : ctx->op = op;
2955 1 : ctx->cb_fn = cb_fn;
2956 1 : ctx->cb_arg = cb_arg;
2957 :
2958 1 : nvme_ctrlr = TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
2959 1 : assert(nvme_ctrlr != NULL);
2960 :
2961 1 : rc = nvme_ctrlr_op(nvme_ctrlr, op, nvme_bdev_ctrlr_op_rpc_continue, ctx);
2962 1 : if (rc == 0) {
2963 1 : ctx->nvme_ctrlr = nvme_ctrlr;
2964 1 : return;
2965 0 : } else if (rc == -EALREADY) {
2966 0 : ctx->nvme_ctrlr = nvme_ctrlr;
2967 0 : rc = 0;
2968 0 : }
2969 :
2970 0 : nvme_bdev_ctrlr_op_rpc_continue(ctx, rc);
2971 1 : }
2972 :
2973 : static int _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio);
2974 :
2975 : static void
2976 15 : bdev_nvme_unfreeze_bdev_channel_done(struct nvme_bdev *nbdev, void *ctx, int status)
2977 : {
2978 15 : struct nvme_bdev_io *bio = ctx;
2979 : enum spdk_bdev_io_status io_status;
2980 :
2981 15 : if (bio->cpl.cdw0 == 0) {
2982 11 : io_status = SPDK_BDEV_IO_STATUS_SUCCESS;
2983 11 : } else {
2984 4 : io_status = SPDK_BDEV_IO_STATUS_FAILED;
2985 : }
2986 :
2987 15 : NVME_BDEV_INFOLOG(nbdev, "reset_io %p completed, status:%d\n", bio, io_status);
2988 :
2989 15 : __bdev_nvme_io_complete(spdk_bdev_io_from_ctx(bio), io_status, NULL);
2990 15 : }
2991 :
2992 : static void
2993 30 : bdev_nvme_unfreeze_bdev_channel(struct nvme_bdev_channel_iter *i,
2994 : struct nvme_bdev *nbdev,
2995 : struct nvme_bdev_channel *nbdev_ch, void *ctx)
2996 : {
2997 30 : bdev_nvme_abort_retry_ios(nbdev_ch);
2998 30 : nbdev_ch->resetting = false;
2999 :
3000 30 : nvme_bdev_for_each_channel_continue(i, 0);
3001 30 : }
3002 :
3003 : static void
3004 15 : bdev_nvme_reset_io_complete(struct nvme_bdev_io *bio)
3005 : {
3006 15 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3007 15 : struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
3008 :
3009 : /* Abort all queued I/Os for retry. */
3010 30 : nvme_bdev_for_each_channel(nbdev,
3011 : bdev_nvme_unfreeze_bdev_channel,
3012 15 : bio,
3013 : bdev_nvme_unfreeze_bdev_channel_done);
3014 15 : }
3015 :
3016 : static void
3017 25 : _bdev_nvme_reset_io_continue(void *ctx)
3018 : {
3019 25 : struct nvme_bdev_io *bio = ctx;
3020 : struct nvme_io_path *prev_io_path, *next_io_path;
3021 : int rc;
3022 :
3023 25 : prev_io_path = bio->io_path;
3024 25 : bio->io_path = NULL;
3025 :
3026 25 : next_io_path = STAILQ_NEXT(prev_io_path, stailq);
3027 25 : if (next_io_path == NULL) {
3028 15 : goto complete;
3029 : }
3030 :
3031 10 : rc = _bdev_nvme_reset_io(next_io_path, bio);
3032 10 : if (rc == 0) {
3033 10 : return;
3034 : }
3035 :
3036 : complete:
3037 15 : bdev_nvme_reset_io_complete(bio);
3038 25 : }
3039 :
3040 : static void
3041 25 : bdev_nvme_reset_io_continue(void *cb_arg, int rc)
3042 : {
3043 25 : struct nvme_bdev_io *bio = cb_arg;
3044 25 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3045 25 : struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
3046 :
3047 25 : NVME_BDEV_INFOLOG(nbdev, "continue reset_io %p, rc:%d\n", bio, rc);
3048 :
3049 : /* Reset status is initialized as "failed". Set to "success" once we have at least one
3050 : * successfully reset nvme_ctrlr.
3051 : */
3052 25 : if (rc == 0) {
3053 15 : bio->cpl.cdw0 = 0;
3054 15 : }
3055 :
3056 25 : spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), _bdev_nvme_reset_io_continue, bio);
3057 25 : }
3058 :
3059 : static int
3060 25 : _bdev_nvme_reset_io(struct nvme_io_path *io_path, struct nvme_bdev_io *bio)
3061 : {
3062 25 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3063 25 : struct nvme_bdev *nbdev = (struct nvme_bdev *)bdev_io->bdev->ctxt;
3064 25 : struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
3065 : struct nvme_ctrlr_channel *ctrlr_ch;
3066 : int rc;
3067 :
3068 25 : assert(bio->io_path == NULL);
3069 25 : bio->io_path = io_path;
3070 :
3071 50 : rc = nvme_ctrlr_op(nvme_ctrlr, NVME_CTRLR_OP_RESET,
3072 25 : bdev_nvme_reset_io_continue, bio);
3073 :
3074 25 : if (rc == 0) {
3075 13 : NVME_BDEV_INFOLOG(nbdev, "reset_io %p started resetting ctrlr [%s, %u].\n",
3076 : bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr));
3077 25 : } else if (rc == -EBUSY) {
3078 11 : ctrlr_ch = io_path->qpair->ctrlr_ch;
3079 11 : assert(ctrlr_ch != NULL);
3080 : /*
3081 : * Reset call is queued only if it is from the app framework. This is on purpose so that
3082 : * we don't interfere with the app framework reset strategy. i.e. we are deferring to the
3083 : * upper level. If they are in the middle of a reset, we won't try to schedule another one.
3084 : */
3085 11 : TAILQ_INSERT_TAIL(&ctrlr_ch->pending_resets, bio, retry_link);
3086 :
3087 11 : rc = 0;
3088 :
3089 11 : NVME_BDEV_INFOLOG(nbdev, "reset_io %p was queued to ctrlr [%s, %u].\n",
3090 : bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr));
3091 11 : } else {
3092 1 : NVME_BDEV_INFOLOG(nbdev, "reset_io %p could not reset ctrlr [%s, %u], rc:%d\n",
3093 : bio, CTRLR_STRING(nvme_ctrlr), CTRLR_ID(nvme_ctrlr), rc);
3094 : }
3095 :
3096 25 : return rc;
3097 : }
3098 :
3099 : static void
3100 15 : bdev_nvme_freeze_bdev_channel_done(struct nvme_bdev *nbdev, void *ctx, int status)
3101 : {
3102 15 : struct nvme_bdev_io *bio = ctx;
3103 15 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
3104 : struct nvme_bdev_channel *nbdev_ch;
3105 : struct nvme_io_path *io_path;
3106 : int rc;
3107 :
3108 15 : nbdev_ch = spdk_io_channel_get_ctx(spdk_bdev_io_get_io_channel(bdev_io));
3109 :
3110 : /* Initialize with failed status. With multipath it is enough to have at least one successful
3111 : * nvme_ctrlr reset. If there is none, reset status will remain failed.
3112 : */
3113 15 : bio->cpl.cdw0 = 1;
3114 :
3115 : /* Reset all nvme_ctrlrs of a bdev controller sequentially. */
3116 15 : io_path = STAILQ_FIRST(&nbdev_ch->io_path_list);
3117 15 : assert(io_path != NULL);
3118 :
3119 15 : rc = _bdev_nvme_reset_io(io_path, bio);
3120 15 : if (rc != 0) {
3121 : /* If the current nvme_ctrlr is disabled, skip it and move to the next nvme_ctrlr. */
3122 1 : rc = (rc == -EALREADY) ? 0 : rc;
3123 :
3124 1 : bdev_nvme_reset_io_continue(bio, rc);
3125 1 : }
3126 15 : }
3127 :
3128 : static void
3129 30 : bdev_nvme_freeze_bdev_channel(struct nvme_bdev_channel_iter *i,
3130 : struct nvme_bdev *nbdev,
3131 : struct nvme_bdev_channel *nbdev_ch, void *ctx)
3132 : {
3133 30 : nbdev_ch->resetting = true;
3134 :
3135 30 : nvme_bdev_for_each_channel_continue(i, 0);
3136 30 : }
3137 :
3138 : static void
3139 15 : bdev_nvme_reset_io(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio)
3140 : {
3141 15 : NVME_BDEV_INFOLOG(nbdev, "reset_io %p started.\n", bio);
3142 :
3143 30 : nvme_bdev_for_each_channel(nbdev,
3144 : bdev_nvme_freeze_bdev_channel,
3145 15 : bio,
3146 : bdev_nvme_freeze_bdev_channel_done);
3147 15 : }
3148 :
3149 : static int
3150 31 : bdev_nvme_failover_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool remove)
3151 : {
3152 31 : if (nvme_ctrlr->destruct) {
3153 : /* Don't bother resetting if the controller is in the process of being destructed. */
3154 2 : return -ENXIO;
3155 : }
3156 :
3157 29 : if (nvme_ctrlr->resetting) {
3158 3 : if (!nvme_ctrlr->in_failover) {
3159 3 : NVME_CTRLR_NOTICELOG(nvme_ctrlr,
3160 : "Reset is already in progress. Defer failover until reset completes.\n");
3161 :
3162 : /* Defer failover until reset completes. */
3163 3 : nvme_ctrlr->pending_failover = true;
3164 3 : return -EINPROGRESS;
3165 : } else {
3166 0 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Unable to perform failover, already in progress.\n");
3167 0 : return -EBUSY;
3168 : }
3169 : }
3170 :
3171 26 : bdev_nvme_failover_trid(nvme_ctrlr, remove, true);
3172 :
3173 26 : if (nvme_ctrlr->reconnect_is_delayed) {
3174 1 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Reconnect is already scheduled.\n");
3175 :
3176 : /* We rely on the next reconnect for the failover. */
3177 1 : return -EALREADY;
3178 : }
3179 :
3180 25 : if (nvme_ctrlr->disabled) {
3181 0 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Controller is disabled.\n");
3182 :
3183 : /* We rely on the enablement for the failover. */
3184 0 : return -EALREADY;
3185 : }
3186 :
3187 25 : nvme_ctrlr->resetting = true;
3188 25 : nvme_ctrlr->in_failover = true;
3189 :
3190 25 : assert(nvme_ctrlr->reset_start_tsc == 0);
3191 25 : nvme_ctrlr->reset_start_tsc = spdk_get_ticks();
3192 :
3193 25 : return 0;
3194 31 : }
3195 :
3196 : static int
3197 29 : bdev_nvme_failover_ctrlr(struct nvme_ctrlr *nvme_ctrlr)
3198 : {
3199 : int rc;
3200 :
3201 29 : pthread_mutex_lock(&nvme_ctrlr->mutex);
3202 29 : rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, false);
3203 29 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
3204 :
3205 29 : if (rc == 0) {
3206 24 : spdk_thread_send_msg(nvme_ctrlr->thread, _bdev_nvme_reset_ctrlr, nvme_ctrlr);
3207 29 : } else if (rc == -EALREADY) {
3208 0 : rc = 0;
3209 0 : }
3210 :
3211 29 : return rc;
3212 : }
3213 :
3214 : static int bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks,
3215 : uint64_t num_blocks);
3216 :
3217 : static int bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks,
3218 : uint64_t num_blocks);
3219 :
3220 : static int bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks,
3221 : uint64_t src_offset_blocks,
3222 : uint64_t num_blocks);
3223 :
3224 : static void
3225 1 : bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
3226 : bool success)
3227 : {
3228 1 : struct nvme_bdev_io *bio = (struct nvme_bdev_io *)bdev_io->driver_ctx;
3229 : int ret;
3230 :
3231 1 : if (!success) {
3232 0 : ret = -EINVAL;
3233 0 : goto exit;
3234 : }
3235 :
3236 1 : if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) {
3237 0 : ret = -ENXIO;
3238 0 : goto exit;
3239 : }
3240 :
3241 2 : ret = bdev_nvme_readv(bio,
3242 1 : bdev_io->u.bdev.iovs,
3243 1 : bdev_io->u.bdev.iovcnt,
3244 1 : bdev_io->u.bdev.md_buf,
3245 1 : bdev_io->u.bdev.num_blocks,
3246 1 : bdev_io->u.bdev.offset_blocks,
3247 1 : bdev_io->u.bdev.dif_check_flags,
3248 1 : bdev_io->u.bdev.memory_domain,
3249 1 : bdev_io->u.bdev.memory_domain_ctx,
3250 1 : bdev_io->u.bdev.accel_sequence);
3251 :
3252 : exit:
3253 1 : if (spdk_unlikely(ret != 0)) {
3254 0 : bdev_nvme_io_complete(bio, ret);
3255 0 : }
3256 1 : }
3257 :
3258 : static inline void
3259 59 : _bdev_nvme_submit_request(struct nvme_bdev_channel *nbdev_ch, struct spdk_bdev_io *bdev_io)
3260 : {
3261 59 : struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
3262 59 : struct spdk_bdev *bdev = bdev_io->bdev;
3263 : struct nvme_bdev_io *nbdev_io_to_abort;
3264 59 : int rc = 0;
3265 :
3266 59 : switch (bdev_io->type) {
3267 : case SPDK_BDEV_IO_TYPE_READ:
3268 3 : if (bdev_io->u.bdev.iovs && bdev_io->u.bdev.iovs[0].iov_base) {
3269 :
3270 4 : rc = bdev_nvme_readv(nbdev_io,
3271 2 : bdev_io->u.bdev.iovs,
3272 2 : bdev_io->u.bdev.iovcnt,
3273 2 : bdev_io->u.bdev.md_buf,
3274 2 : bdev_io->u.bdev.num_blocks,
3275 2 : bdev_io->u.bdev.offset_blocks,
3276 2 : bdev_io->u.bdev.dif_check_flags,
3277 2 : bdev_io->u.bdev.memory_domain,
3278 2 : bdev_io->u.bdev.memory_domain_ctx,
3279 2 : bdev_io->u.bdev.accel_sequence);
3280 2 : } else {
3281 2 : spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb,
3282 1 : bdev_io->u.bdev.num_blocks * bdev->blocklen);
3283 1 : rc = 0;
3284 : }
3285 3 : break;
3286 : case SPDK_BDEV_IO_TYPE_WRITE:
3287 50 : rc = bdev_nvme_writev(nbdev_io,
3288 25 : bdev_io->u.bdev.iovs,
3289 25 : bdev_io->u.bdev.iovcnt,
3290 25 : bdev_io->u.bdev.md_buf,
3291 25 : bdev_io->u.bdev.num_blocks,
3292 25 : bdev_io->u.bdev.offset_blocks,
3293 25 : bdev_io->u.bdev.dif_check_flags,
3294 25 : bdev_io->u.bdev.memory_domain,
3295 25 : bdev_io->u.bdev.memory_domain_ctx,
3296 25 : bdev_io->u.bdev.accel_sequence,
3297 25 : bdev_io->u.bdev.nvme_cdw12,
3298 25 : bdev_io->u.bdev.nvme_cdw13);
3299 25 : break;
3300 : case SPDK_BDEV_IO_TYPE_COMPARE:
3301 2 : rc = bdev_nvme_comparev(nbdev_io,
3302 1 : bdev_io->u.bdev.iovs,
3303 1 : bdev_io->u.bdev.iovcnt,
3304 1 : bdev_io->u.bdev.md_buf,
3305 1 : bdev_io->u.bdev.num_blocks,
3306 1 : bdev_io->u.bdev.offset_blocks,
3307 1 : bdev_io->u.bdev.dif_check_flags);
3308 1 : break;
3309 : case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
3310 4 : rc = bdev_nvme_comparev_and_writev(nbdev_io,
3311 2 : bdev_io->u.bdev.iovs,
3312 2 : bdev_io->u.bdev.iovcnt,
3313 2 : bdev_io->u.bdev.fused_iovs,
3314 2 : bdev_io->u.bdev.fused_iovcnt,
3315 2 : bdev_io->u.bdev.md_buf,
3316 2 : bdev_io->u.bdev.num_blocks,
3317 2 : bdev_io->u.bdev.offset_blocks,
3318 2 : bdev_io->u.bdev.dif_check_flags);
3319 2 : break;
3320 : case SPDK_BDEV_IO_TYPE_UNMAP:
3321 2 : rc = bdev_nvme_unmap(nbdev_io,
3322 1 : bdev_io->u.bdev.offset_blocks,
3323 1 : bdev_io->u.bdev.num_blocks);
3324 1 : break;
3325 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3326 0 : rc = bdev_nvme_write_zeroes(nbdev_io,
3327 0 : bdev_io->u.bdev.offset_blocks,
3328 0 : bdev_io->u.bdev.num_blocks);
3329 0 : break;
3330 : case SPDK_BDEV_IO_TYPE_RESET:
3331 15 : nbdev_io->io_path = NULL;
3332 15 : bdev_nvme_reset_io(bdev->ctxt, nbdev_io);
3333 15 : return;
3334 :
3335 : case SPDK_BDEV_IO_TYPE_FLUSH:
3336 1 : bdev_nvme_io_complete(nbdev_io, 0);
3337 1 : return;
3338 :
3339 : case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
3340 0 : rc = bdev_nvme_zone_appendv(nbdev_io,
3341 0 : bdev_io->u.bdev.iovs,
3342 0 : bdev_io->u.bdev.iovcnt,
3343 0 : bdev_io->u.bdev.md_buf,
3344 0 : bdev_io->u.bdev.num_blocks,
3345 0 : bdev_io->u.bdev.offset_blocks,
3346 0 : bdev_io->u.bdev.dif_check_flags);
3347 0 : break;
3348 : case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
3349 0 : rc = bdev_nvme_get_zone_info(nbdev_io,
3350 0 : bdev_io->u.zone_mgmt.zone_id,
3351 0 : bdev_io->u.zone_mgmt.num_zones,
3352 0 : bdev_io->u.zone_mgmt.buf);
3353 0 : break;
3354 : case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
3355 0 : rc = bdev_nvme_zone_management(nbdev_io,
3356 0 : bdev_io->u.zone_mgmt.zone_id,
3357 0 : bdev_io->u.zone_mgmt.zone_action);
3358 0 : break;
3359 : case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
3360 5 : nbdev_io->io_path = NULL;
3361 10 : bdev_nvme_admin_passthru(nbdev_ch,
3362 5 : nbdev_io,
3363 5 : &bdev_io->u.nvme_passthru.cmd,
3364 5 : bdev_io->u.nvme_passthru.buf,
3365 5 : bdev_io->u.nvme_passthru.nbytes);
3366 5 : return;
3367 :
3368 : case SPDK_BDEV_IO_TYPE_NVME_IO:
3369 0 : rc = bdev_nvme_io_passthru(nbdev_io,
3370 0 : &bdev_io->u.nvme_passthru.cmd,
3371 0 : bdev_io->u.nvme_passthru.buf,
3372 0 : bdev_io->u.nvme_passthru.nbytes);
3373 0 : break;
3374 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3375 0 : rc = bdev_nvme_io_passthru_md(nbdev_io,
3376 0 : &bdev_io->u.nvme_passthru.cmd,
3377 0 : bdev_io->u.nvme_passthru.buf,
3378 0 : bdev_io->u.nvme_passthru.nbytes,
3379 0 : bdev_io->u.nvme_passthru.md_buf,
3380 0 : bdev_io->u.nvme_passthru.md_len);
3381 0 : break;
3382 : case SPDK_BDEV_IO_TYPE_NVME_IOV_MD:
3383 0 : rc = bdev_nvme_iov_passthru_md(nbdev_io,
3384 0 : &bdev_io->u.nvme_passthru.cmd,
3385 0 : bdev_io->u.nvme_passthru.iovs,
3386 0 : bdev_io->u.nvme_passthru.iovcnt,
3387 0 : bdev_io->u.nvme_passthru.nbytes,
3388 0 : bdev_io->u.nvme_passthru.md_buf,
3389 0 : bdev_io->u.nvme_passthru.md_len);
3390 0 : break;
3391 : case SPDK_BDEV_IO_TYPE_ABORT:
3392 6 : nbdev_io->io_path = NULL;
3393 6 : nbdev_io_to_abort = (struct nvme_bdev_io *)bdev_io->u.abort.bio_to_abort->driver_ctx;
3394 12 : bdev_nvme_abort(nbdev_ch,
3395 6 : nbdev_io,
3396 6 : nbdev_io_to_abort);
3397 6 : return;
3398 :
3399 : case SPDK_BDEV_IO_TYPE_COPY:
3400 0 : rc = bdev_nvme_copy(nbdev_io,
3401 0 : bdev_io->u.bdev.offset_blocks,
3402 0 : bdev_io->u.bdev.copy.src_offset_blocks,
3403 0 : bdev_io->u.bdev.num_blocks);
3404 0 : break;
3405 : default:
3406 0 : rc = -EINVAL;
3407 0 : break;
3408 : }
3409 :
3410 32 : if (spdk_unlikely(rc != 0)) {
3411 0 : bdev_nvme_io_complete(nbdev_io, rc);
3412 0 : }
3413 59 : }
3414 :
3415 : static void
3416 68 : bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
3417 : {
3418 68 : struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
3419 68 : struct nvme_bdev_io *nbdev_io = (struct nvme_bdev_io *)bdev_io->driver_ctx;
3420 :
3421 68 : if (spdk_likely(nbdev_io->submit_tsc == 0)) {
3422 68 : nbdev_io->submit_tsc = spdk_bdev_io_get_submit_tsc(bdev_io);
3423 68 : } else {
3424 : /* There are cases where submit_tsc != 0, i.e. retry I/O.
3425 : * We need to update submit_tsc here.
3426 : */
3427 0 : nbdev_io->submit_tsc = spdk_get_ticks();
3428 : }
3429 :
3430 68 : spdk_trace_record(TRACE_BDEV_NVME_IO_START, 0, 0, (uintptr_t)nbdev_io, (uintptr_t)bdev_io);
3431 68 : nbdev_io->io_path = bdev_nvme_find_io_path(nbdev_ch);
3432 68 : if (spdk_unlikely(!nbdev_io->io_path)) {
3433 13 : if (!bdev_nvme_io_type_is_admin(bdev_io->type)) {
3434 12 : bdev_nvme_io_complete(nbdev_io, -ENXIO);
3435 12 : return;
3436 : }
3437 :
3438 : /* Admin commands do not use the optimal I/O path.
3439 : * Simply fall through even if it is not found.
3440 : */
3441 1 : }
3442 :
3443 56 : _bdev_nvme_submit_request(nbdev_ch, bdev_io);
3444 68 : }
3445 :
3446 : static bool
3447 0 : bdev_nvme_is_supported_csi(enum spdk_nvme_csi csi)
3448 : {
3449 0 : switch (csi) {
3450 : case SPDK_NVME_CSI_NVM:
3451 0 : return true;
3452 : case SPDK_NVME_CSI_ZNS:
3453 0 : return true;
3454 : default:
3455 0 : return false;
3456 : }
3457 0 : }
3458 :
3459 : static bool
3460 0 : bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type)
3461 : {
3462 0 : struct nvme_bdev *nbdev = ctx;
3463 : struct nvme_ns *nvme_ns;
3464 : struct spdk_nvme_ns *ns;
3465 : struct spdk_nvme_ctrlr *ctrlr;
3466 : const struct spdk_nvme_ctrlr_data *cdata;
3467 :
3468 0 : nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
3469 0 : assert(nvme_ns != NULL);
3470 0 : ns = nvme_ns->ns;
3471 0 : if (ns == NULL) {
3472 0 : return false;
3473 : }
3474 :
3475 0 : if (!bdev_nvme_is_supported_csi(spdk_nvme_ns_get_csi(ns))) {
3476 0 : switch (io_type) {
3477 : case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
3478 : case SPDK_BDEV_IO_TYPE_NVME_IO:
3479 0 : return true;
3480 :
3481 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3482 0 : return spdk_nvme_ns_get_md_size(ns) ? true : false;
3483 :
3484 : default:
3485 0 : return false;
3486 : }
3487 : }
3488 :
3489 0 : ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3490 :
3491 0 : switch (io_type) {
3492 : case SPDK_BDEV_IO_TYPE_READ:
3493 : case SPDK_BDEV_IO_TYPE_WRITE:
3494 : case SPDK_BDEV_IO_TYPE_RESET:
3495 : case SPDK_BDEV_IO_TYPE_FLUSH:
3496 : case SPDK_BDEV_IO_TYPE_NVME_ADMIN:
3497 : case SPDK_BDEV_IO_TYPE_NVME_IO:
3498 : case SPDK_BDEV_IO_TYPE_ABORT:
3499 0 : return true;
3500 :
3501 : case SPDK_BDEV_IO_TYPE_COMPARE:
3502 0 : return spdk_nvme_ns_supports_compare(ns);
3503 :
3504 : case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
3505 0 : return spdk_nvme_ns_get_md_size(ns) ? true : false;
3506 :
3507 : case SPDK_BDEV_IO_TYPE_UNMAP:
3508 0 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3509 0 : return cdata->oncs.dsm;
3510 :
3511 : case SPDK_BDEV_IO_TYPE_WRITE_ZEROES:
3512 0 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3513 0 : return cdata->oncs.write_zeroes;
3514 :
3515 : case SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE:
3516 0 : if (spdk_nvme_ctrlr_get_flags(ctrlr) &
3517 : SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED) {
3518 0 : return true;
3519 : }
3520 0 : return false;
3521 :
3522 : case SPDK_BDEV_IO_TYPE_GET_ZONE_INFO:
3523 : case SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT:
3524 0 : return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS;
3525 :
3526 : case SPDK_BDEV_IO_TYPE_ZONE_APPEND:
3527 0 : return spdk_nvme_ns_get_csi(ns) == SPDK_NVME_CSI_ZNS &&
3528 0 : spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ZONE_APPEND_SUPPORTED;
3529 :
3530 : case SPDK_BDEV_IO_TYPE_COPY:
3531 0 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3532 0 : return cdata->oncs.copy;
3533 :
3534 : default:
3535 0 : return false;
3536 : }
3537 0 : }
3538 :
3539 : static int
3540 59 : nvme_qpair_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ctrlr_channel *ctrlr_ch)
3541 : {
3542 : struct nvme_qpair *nvme_qpair;
3543 : struct spdk_io_channel *pg_ch;
3544 : int rc;
3545 :
3546 59 : nvme_qpair = calloc(1, sizeof(*nvme_qpair));
3547 59 : if (!nvme_qpair) {
3548 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to alloc nvme_qpair.\n");
3549 0 : return -1;
3550 : }
3551 :
3552 59 : TAILQ_INIT(&nvme_qpair->io_path_list);
3553 :
3554 59 : nvme_qpair->ctrlr = nvme_ctrlr;
3555 59 : nvme_qpair->ctrlr_ch = ctrlr_ch;
3556 :
3557 59 : pg_ch = spdk_get_io_channel(&g_nvme_bdev_ctrlrs);
3558 59 : if (!pg_ch) {
3559 0 : free(nvme_qpair);
3560 0 : return -1;
3561 : }
3562 :
3563 59 : nvme_qpair->group = spdk_io_channel_get_ctx(pg_ch);
3564 :
3565 : #ifdef SPDK_CONFIG_VTUNE
3566 : nvme_qpair->group->collect_spin_stat = true;
3567 : #else
3568 59 : nvme_qpair->group->collect_spin_stat = false;
3569 : #endif
3570 :
3571 59 : if (!nvme_ctrlr->disabled) {
3572 : /* If a nvme_ctrlr is disabled, don't try to create qpair for it. Qpair will
3573 : * be created when it's enabled.
3574 : */
3575 59 : rc = bdev_nvme_create_qpair(nvme_qpair);
3576 59 : if (rc != 0) {
3577 : /* nvme_ctrlr can't create IO qpair if connection is down.
3578 : * If reconnect_delay_sec is non-zero, creating IO qpair is retried
3579 : * after reconnect_delay_sec seconds. If bdev_retry_count is non-zero,
3580 : * submitted IO will be queued until IO qpair is successfully created.
3581 : *
3582 : * Hence, if both are satisfied, ignore the failure.
3583 : */
3584 0 : if (nvme_ctrlr->opts.reconnect_delay_sec == 0 || g_opts.bdev_retry_count == 0) {
3585 0 : spdk_put_io_channel(pg_ch);
3586 0 : free(nvme_qpair);
3587 0 : return rc;
3588 : }
3589 0 : }
3590 59 : }
3591 :
3592 59 : TAILQ_INSERT_TAIL(&nvme_qpair->group->qpair_list, nvme_qpair, tailq);
3593 :
3594 59 : ctrlr_ch->qpair = nvme_qpair;
3595 :
3596 59 : pthread_mutex_lock(&nvme_qpair->ctrlr->mutex);
3597 59 : nvme_qpair->ctrlr->ref++;
3598 59 : pthread_mutex_unlock(&nvme_qpair->ctrlr->mutex);
3599 :
3600 59 : return 0;
3601 59 : }
3602 :
3603 : static int
3604 59 : bdev_nvme_create_ctrlr_channel_cb(void *io_device, void *ctx_buf)
3605 : {
3606 59 : struct nvme_ctrlr *nvme_ctrlr = io_device;
3607 59 : struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
3608 :
3609 59 : TAILQ_INIT(&ctrlr_ch->pending_resets);
3610 :
3611 59 : return nvme_qpair_create(nvme_ctrlr, ctrlr_ch);
3612 : }
3613 :
3614 : static void
3615 59 : nvme_qpair_delete(struct nvme_qpair *nvme_qpair)
3616 : {
3617 : struct nvme_io_path *io_path, *next;
3618 :
3619 59 : assert(nvme_qpair->group != NULL);
3620 :
3621 96 : TAILQ_FOREACH_SAFE(io_path, &nvme_qpair->io_path_list, tailq, next) {
3622 37 : TAILQ_REMOVE(&nvme_qpair->io_path_list, io_path, tailq);
3623 37 : nvme_io_path_free(io_path);
3624 37 : }
3625 :
3626 59 : TAILQ_REMOVE(&nvme_qpair->group->qpair_list, nvme_qpair, tailq);
3627 :
3628 59 : spdk_put_io_channel(spdk_io_channel_from_ctx(nvme_qpair->group));
3629 :
3630 59 : nvme_ctrlr_release(nvme_qpair->ctrlr);
3631 :
3632 59 : free(nvme_qpair);
3633 59 : }
3634 :
3635 : static void
3636 59 : bdev_nvme_destroy_ctrlr_channel_cb(void *io_device, void *ctx_buf)
3637 : {
3638 59 : struct nvme_ctrlr_channel *ctrlr_ch = ctx_buf;
3639 : struct nvme_qpair *nvme_qpair;
3640 :
3641 59 : nvme_qpair = ctrlr_ch->qpair;
3642 59 : assert(nvme_qpair != NULL);
3643 :
3644 59 : _bdev_nvme_clear_io_path_cache(nvme_qpair);
3645 :
3646 59 : if (nvme_qpair->qpair != NULL) {
3647 45 : if (ctrlr_ch->reset_iter == NULL) {
3648 45 : spdk_nvme_ctrlr_disconnect_io_qpair(nvme_qpair->qpair);
3649 45 : } else {
3650 : /* Skip current ctrlr_channel in a full reset sequence because
3651 : * it is being deleted now. The qpair is already being disconnected.
3652 : * We do not have to restart disconnecting it.
3653 : */
3654 0 : nvme_ctrlr_for_each_channel_continue(ctrlr_ch->reset_iter, 0);
3655 : }
3656 :
3657 : /* We cannot release a reference to the poll group now.
3658 : * The qpair may be disconnected asynchronously later.
3659 : * We need to poll it until it is actually disconnected.
3660 : * Just detach the qpair from the deleting ctrlr_channel.
3661 : */
3662 45 : nvme_qpair->ctrlr_ch = NULL;
3663 45 : } else {
3664 14 : assert(ctrlr_ch->reset_iter == NULL);
3665 :
3666 14 : nvme_qpair_delete(nvme_qpair);
3667 : }
3668 59 : }
3669 :
3670 : static inline struct spdk_io_channel *
3671 0 : bdev_nvme_get_accel_channel(struct nvme_poll_group *group)
3672 : {
3673 0 : if (spdk_unlikely(!group->accel_channel)) {
3674 0 : group->accel_channel = spdk_accel_get_io_channel();
3675 0 : if (!group->accel_channel) {
3676 0 : SPDK_ERRLOG("Cannot get the accel_channel for bdev nvme polling group=%p\n",
3677 : group);
3678 0 : return NULL;
3679 : }
3680 0 : }
3681 :
3682 0 : return group->accel_channel;
3683 0 : }
3684 :
3685 : static void
3686 0 : bdev_nvme_finish_sequence(void *seq, spdk_nvme_accel_completion_cb cb_fn, void *cb_arg)
3687 : {
3688 0 : spdk_accel_sequence_finish(seq, cb_fn, cb_arg);
3689 0 : }
3690 :
/* Accel function table hook: abort the accel sequence `seq`. */
static void
bdev_nvme_abort_sequence(void *seq)
{
	spdk_accel_sequence_abort(seq);
}
3696 :
/* Accel function table hook: reverse the accel sequence `seq`. */
static void
bdev_nvme_reverse_sequence(void *seq)
{
	spdk_accel_sequence_reverse(seq);
}
3702 :
3703 : static int
3704 0 : bdev_nvme_append_crc32c(void *ctx, void **seq, uint32_t *dst, struct iovec *iovs, uint32_t iovcnt,
3705 : struct spdk_memory_domain *domain, void *domain_ctx, uint32_t seed,
3706 : spdk_nvme_accel_step_cb cb_fn, void *cb_arg)
3707 : {
3708 : struct spdk_io_channel *ch;
3709 0 : struct nvme_poll_group *group = ctx;
3710 :
3711 0 : ch = bdev_nvme_get_accel_channel(group);
3712 0 : if (spdk_unlikely(ch == NULL)) {
3713 0 : return -ENOMEM;
3714 : }
3715 :
3716 0 : return spdk_accel_append_crc32c((struct spdk_accel_sequence **)seq, ch, dst, iovs, iovcnt,
3717 0 : domain, domain_ctx, seed, cb_fn, cb_arg);
3718 0 : }
3719 :
3720 : static int
3721 0 : bdev_nvme_append_copy(void *ctx, void **seq, struct iovec *dst_iovs, uint32_t dst_iovcnt,
3722 : struct spdk_memory_domain *dst_domain, void *dst_domain_ctx,
3723 : struct iovec *src_iovs, uint32_t src_iovcnt,
3724 : struct spdk_memory_domain *src_domain, void *src_domain_ctx,
3725 : spdk_nvme_accel_step_cb cb_fn, void *cb_arg)
3726 : {
3727 : struct spdk_io_channel *ch;
3728 0 : struct nvme_poll_group *group = ctx;
3729 :
3730 0 : ch = bdev_nvme_get_accel_channel(group);
3731 0 : if (spdk_unlikely(ch == NULL)) {
3732 0 : return -ENOMEM;
3733 : }
3734 :
3735 0 : return spdk_accel_append_copy((struct spdk_accel_sequence **)seq, ch,
3736 0 : dst_iovs, dst_iovcnt, dst_domain, dst_domain_ctx,
3737 0 : src_iovs, src_iovcnt, src_domain, src_domain_ctx,
3738 0 : cb_fn, cb_arg);
3739 0 : }
3740 :
3741 : static struct spdk_nvme_accel_fn_table g_bdev_nvme_accel_fn_table = {
3742 : .table_size = sizeof(struct spdk_nvme_accel_fn_table),
3743 : .append_crc32c = bdev_nvme_append_crc32c,
3744 : .append_copy = bdev_nvme_append_copy,
3745 : .finish_sequence = bdev_nvme_finish_sequence,
3746 : .reverse_sequence = bdev_nvme_reverse_sequence,
3747 : .abort_sequence = bdev_nvme_abort_sequence,
3748 : };
3749 :
3750 : static int
3751 0 : bdev_nvme_interrupt_wrapper(void *ctx)
3752 : {
3753 : int num_events;
3754 0 : struct nvme_poll_group *group = ctx;
3755 :
3756 0 : num_events = spdk_nvme_poll_group_wait(group->group, bdev_nvme_disconnected_qpair_cb);
3757 0 : if (spdk_unlikely(num_events < 0)) {
3758 0 : bdev_nvme_check_io_qpairs(group);
3759 0 : }
3760 :
3761 0 : return num_events;
3762 : }
3763 :
3764 : static int
3765 44 : bdev_nvme_create_poll_group_cb(void *io_device, void *ctx_buf)
3766 : {
3767 44 : struct nvme_poll_group *group = ctx_buf;
3768 : uint64_t period;
3769 : int fd;
3770 :
3771 44 : TAILQ_INIT(&group->qpair_list);
3772 :
3773 44 : group->group = spdk_nvme_poll_group_create(group, &g_bdev_nvme_accel_fn_table);
3774 44 : if (group->group == NULL) {
3775 0 : return -1;
3776 : }
3777 :
3778 44 : period = spdk_interrupt_mode_is_enabled() ? 0 : g_opts.nvme_ioq_poll_period_us;
3779 44 : group->poller = SPDK_POLLER_REGISTER(bdev_nvme_poll, group, period);
3780 :
3781 44 : if (group->poller == NULL) {
3782 0 : spdk_nvme_poll_group_destroy(group->group);
3783 0 : return -1;
3784 : }
3785 :
3786 44 : if (spdk_interrupt_mode_is_enabled()) {
3787 0 : spdk_poller_register_interrupt(group->poller, NULL, NULL);
3788 :
3789 0 : fd = spdk_nvme_poll_group_get_fd(group->group);
3790 0 : if (fd < 0) {
3791 0 : spdk_nvme_poll_group_destroy(group->group);
3792 0 : return -1;
3793 : }
3794 :
3795 0 : group->intr = SPDK_INTERRUPT_REGISTER(fd, bdev_nvme_interrupt_wrapper, group);
3796 0 : if (!group->intr) {
3797 0 : spdk_nvme_poll_group_destroy(group->group);
3798 0 : return -1;
3799 : }
3800 0 : }
3801 :
3802 44 : return 0;
3803 44 : }
3804 :
3805 : static void
3806 44 : bdev_nvme_destroy_poll_group_cb(void *io_device, void *ctx_buf)
3807 : {
3808 44 : struct nvme_poll_group *group = ctx_buf;
3809 :
3810 44 : assert(TAILQ_EMPTY(&group->qpair_list));
3811 :
3812 44 : if (group->accel_channel) {
3813 0 : spdk_put_io_channel(group->accel_channel);
3814 0 : }
3815 :
3816 44 : if (spdk_interrupt_mode_is_enabled()) {
3817 0 : spdk_interrupt_unregister(&group->intr);
3818 0 : }
3819 :
3820 44 : spdk_poller_unregister(&group->poller);
3821 44 : if (spdk_nvme_poll_group_destroy(group->group)) {
3822 0 : SPDK_ERRLOG("Unable to destroy a poll group for the NVMe bdev module.\n");
3823 0 : assert(false);
3824 : }
3825 44 : }
3826 :
/* Return an I/O channel for the nvme_bdev passed as ctx (the bdev is the io_device). */
static struct spdk_io_channel *
bdev_nvme_get_io_channel(void *ctx)
{
	struct nvme_bdev *nbdev = ctx;

	return spdk_get_io_channel(nbdev);
}
3834 :
3835 : static void *
3836 0 : bdev_nvme_get_module_ctx(void *ctx)
3837 : {
3838 0 : struct nvme_bdev *nvme_bdev = ctx;
3839 : struct nvme_ns *nvme_ns;
3840 :
3841 0 : if (!nvme_bdev || nvme_bdev->disk.module != &nvme_if) {
3842 0 : return NULL;
3843 : }
3844 :
3845 0 : nvme_ns = TAILQ_FIRST(&nvme_bdev->nvme_ns_list);
3846 0 : if (!nvme_ns) {
3847 0 : return NULL;
3848 : }
3849 :
3850 0 : return nvme_ns->ns;
3851 0 : }
3852 :
3853 : static const char *
3854 0 : _nvme_ana_state_str(enum spdk_nvme_ana_state ana_state)
3855 : {
3856 0 : switch (ana_state) {
3857 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
3858 0 : return "optimized";
3859 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
3860 0 : return "non_optimized";
3861 : case SPDK_NVME_ANA_INACCESSIBLE_STATE:
3862 0 : return "inaccessible";
3863 : case SPDK_NVME_ANA_PERSISTENT_LOSS_STATE:
3864 0 : return "persistent_loss";
3865 : case SPDK_NVME_ANA_CHANGE_STATE:
3866 0 : return "change";
3867 : default:
3868 0 : return NULL;
3869 : }
3870 0 : }
3871 :
3872 : static int
3873 8 : bdev_nvme_get_memory_domains(void *ctx, struct spdk_memory_domain **domains, int array_size)
3874 : {
3875 8 : struct spdk_memory_domain **_domains = NULL;
3876 8 : struct nvme_bdev *nbdev = ctx;
3877 : struct nvme_ns *nvme_ns;
3878 8 : int i = 0, _array_size = array_size;
3879 8 : int rc = 0;
3880 :
3881 22 : TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
3882 14 : if (domains && array_size >= i) {
3883 11 : _domains = &domains[i];
3884 11 : } else {
3885 3 : _domains = NULL;
3886 : }
3887 14 : rc = spdk_nvme_ctrlr_get_memory_domains(nvme_ns->ctrlr->ctrlr, _domains, _array_size);
3888 14 : if (rc > 0) {
3889 13 : i += rc;
3890 13 : if (_array_size >= rc) {
3891 9 : _array_size -= rc;
3892 9 : } else {
3893 4 : _array_size = 0;
3894 : }
3895 14 : } else if (rc < 0) {
3896 0 : return rc;
3897 : }
3898 14 : }
3899 :
3900 8 : return i;
3901 8 : }
3902 :
3903 : static const char *
3904 0 : nvme_ctrlr_get_state_str(struct nvme_ctrlr *nvme_ctrlr)
3905 : {
3906 0 : if (nvme_ctrlr->destruct) {
3907 0 : return "deleting";
3908 0 : } else if (spdk_nvme_ctrlr_is_failed(nvme_ctrlr->ctrlr)) {
3909 0 : return "failed";
3910 0 : } else if (nvme_ctrlr->resetting) {
3911 0 : return "resetting";
3912 0 : } else if (nvme_ctrlr->reconnect_is_delayed > 0) {
3913 0 : return "reconnect_is_delayed";
3914 0 : } else if (nvme_ctrlr->disabled) {
3915 0 : return "disabled";
3916 : } else {
3917 0 : return "enabled";
3918 : }
3919 0 : }
3920 :
3921 : void
3922 0 : nvme_ctrlr_info_json(struct spdk_json_write_ctx *w, struct nvme_ctrlr *nvme_ctrlr)
3923 : {
3924 : struct spdk_nvme_transport_id *trid;
3925 : const struct spdk_nvme_ctrlr_opts *opts;
3926 : const struct spdk_nvme_ctrlr_data *cdata;
3927 : struct nvme_path_id *path_id;
3928 : int32_t numa_id;
3929 :
3930 0 : spdk_json_write_object_begin(w);
3931 :
3932 0 : spdk_json_write_named_string(w, "state", nvme_ctrlr_get_state_str(nvme_ctrlr));
3933 :
3934 : #ifdef SPDK_CONFIG_NVME_CUSE
3935 : size_t cuse_name_size = 128;
3936 : char cuse_name[cuse_name_size];
3937 :
3938 : int rc = spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr, cuse_name, &cuse_name_size);
3939 : if (rc == 0) {
3940 : spdk_json_write_named_string(w, "cuse_device", cuse_name);
3941 : }
3942 : #endif
3943 0 : trid = &nvme_ctrlr->active_path_id->trid;
3944 0 : spdk_json_write_named_object_begin(w, "trid");
3945 0 : nvme_bdev_dump_trid_json(trid, w);
3946 0 : spdk_json_write_object_end(w);
3947 :
3948 0 : path_id = TAILQ_NEXT(nvme_ctrlr->active_path_id, link);
3949 0 : if (path_id != NULL) {
3950 0 : spdk_json_write_named_array_begin(w, "alternate_trids");
3951 0 : do {
3952 0 : trid = &path_id->trid;
3953 0 : spdk_json_write_object_begin(w);
3954 0 : nvme_bdev_dump_trid_json(trid, w);
3955 0 : spdk_json_write_object_end(w);
3956 :
3957 0 : path_id = TAILQ_NEXT(path_id, link);
3958 0 : } while (path_id != NULL);
3959 0 : spdk_json_write_array_end(w);
3960 0 : }
3961 :
3962 0 : cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
3963 0 : spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid);
3964 :
3965 0 : opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
3966 0 : spdk_json_write_named_object_begin(w, "host");
3967 0 : spdk_json_write_named_string(w, "nqn", opts->hostnqn);
3968 0 : spdk_json_write_named_string(w, "addr", opts->src_addr);
3969 0 : spdk_json_write_named_string(w, "svcid", opts->src_svcid);
3970 0 : spdk_json_write_object_end(w);
3971 :
3972 0 : numa_id = spdk_nvme_ctrlr_get_numa_id(nvme_ctrlr->ctrlr);
3973 0 : if (numa_id != SPDK_ENV_NUMA_ID_ANY) {
3974 0 : spdk_json_write_named_uint32(w, "numa_id", numa_id);
3975 0 : }
3976 0 : spdk_json_write_object_end(w);
3977 0 : }
3978 :
3979 : static void
3980 0 : nvme_namespace_info_json(struct spdk_json_write_ctx *w,
3981 : struct nvme_ns *nvme_ns)
3982 : {
3983 : struct spdk_nvme_ns *ns;
3984 : struct spdk_nvme_ctrlr *ctrlr;
3985 : const struct spdk_nvme_ctrlr_data *cdata;
3986 : const struct spdk_nvme_transport_id *trid;
3987 : union spdk_nvme_vs_register vs;
3988 : const struct spdk_nvme_ns_data *nsdata;
3989 : char buf[128];
3990 :
3991 0 : ns = nvme_ns->ns;
3992 0 : if (ns == NULL) {
3993 0 : return;
3994 : }
3995 :
3996 0 : ctrlr = spdk_nvme_ns_get_ctrlr(ns);
3997 :
3998 0 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
3999 0 : trid = spdk_nvme_ctrlr_get_transport_id(ctrlr);
4000 0 : vs = spdk_nvme_ctrlr_get_regs_vs(ctrlr);
4001 :
4002 0 : spdk_json_write_object_begin(w);
4003 :
4004 0 : if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
4005 0 : spdk_json_write_named_string(w, "pci_address", trid->traddr);
4006 0 : }
4007 :
4008 0 : spdk_json_write_named_object_begin(w, "trid");
4009 :
4010 0 : nvme_bdev_dump_trid_json(trid, w);
4011 :
4012 0 : spdk_json_write_object_end(w);
4013 :
4014 : #ifdef SPDK_CONFIG_NVME_CUSE
4015 : size_t cuse_name_size = 128;
4016 : char cuse_name[cuse_name_size];
4017 :
4018 : int rc = spdk_nvme_cuse_get_ns_name(ctrlr, spdk_nvme_ns_get_id(ns),
4019 : cuse_name, &cuse_name_size);
4020 : if (rc == 0) {
4021 : spdk_json_write_named_string(w, "cuse_device", cuse_name);
4022 : }
4023 : #endif
4024 :
4025 0 : spdk_json_write_named_object_begin(w, "ctrlr_data");
4026 :
4027 0 : spdk_json_write_named_uint16(w, "cntlid", cdata->cntlid);
4028 :
4029 0 : spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid);
4030 :
4031 0 : snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn);
4032 0 : spdk_str_trim(buf);
4033 0 : spdk_json_write_named_string(w, "model_number", buf);
4034 :
4035 0 : snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn);
4036 0 : spdk_str_trim(buf);
4037 0 : spdk_json_write_named_string(w, "serial_number", buf);
4038 :
4039 0 : snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr);
4040 0 : spdk_str_trim(buf);
4041 0 : spdk_json_write_named_string(w, "firmware_revision", buf);
4042 :
4043 0 : if (cdata->subnqn[0] != '\0') {
4044 0 : spdk_json_write_named_string(w, "subnqn", cdata->subnqn);
4045 0 : }
4046 :
4047 0 : spdk_json_write_named_object_begin(w, "oacs");
4048 :
4049 0 : spdk_json_write_named_uint32(w, "security", cdata->oacs.security);
4050 0 : spdk_json_write_named_uint32(w, "format", cdata->oacs.format);
4051 0 : spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware);
4052 0 : spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage);
4053 :
4054 0 : spdk_json_write_object_end(w);
4055 :
4056 0 : spdk_json_write_named_bool(w, "multi_ctrlr", cdata->cmic.multi_ctrlr);
4057 0 : spdk_json_write_named_bool(w, "ana_reporting", cdata->cmic.ana_reporting);
4058 :
4059 0 : spdk_json_write_object_end(w);
4060 :
4061 0 : spdk_json_write_named_object_begin(w, "vs");
4062 :
4063 0 : spdk_json_write_name(w, "nvme_version");
4064 0 : if (vs.bits.ter) {
4065 0 : spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter);
4066 0 : } else {
4067 0 : spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr);
4068 : }
4069 :
4070 0 : spdk_json_write_object_end(w);
4071 :
4072 0 : nsdata = spdk_nvme_ns_get_data(ns);
4073 :
4074 0 : spdk_json_write_named_object_begin(w, "ns_data");
4075 :
4076 0 : spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns));
4077 :
4078 0 : if (cdata->cmic.ana_reporting) {
4079 0 : spdk_json_write_named_string(w, "ana_state",
4080 0 : _nvme_ana_state_str(nvme_ns->ana_state));
4081 0 : }
4082 :
4083 0 : spdk_json_write_named_bool(w, "can_share", nsdata->nmic.can_share);
4084 :
4085 0 : spdk_json_write_object_end(w);
4086 :
4087 0 : if (cdata->oacs.security) {
4088 0 : spdk_json_write_named_object_begin(w, "security");
4089 :
4090 0 : spdk_json_write_named_bool(w, "opal", nvme_ns->bdev->opal);
4091 :
4092 0 : spdk_json_write_object_end(w);
4093 0 : }
4094 :
4095 0 : spdk_json_write_object_end(w);
4096 0 : }
4097 :
4098 : static const char *
4099 0 : nvme_bdev_get_mp_policy_str(struct nvme_bdev *nbdev)
4100 : {
4101 0 : switch (nbdev->mp_policy) {
4102 : case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE:
4103 0 : return "active_passive";
4104 : case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE:
4105 0 : return "active_active";
4106 : default:
4107 0 : assert(false);
4108 : return "invalid";
4109 : }
4110 0 : }
4111 :
4112 : static const char *
4113 0 : nvme_bdev_get_mp_selector_str(struct nvme_bdev *nbdev)
4114 : {
4115 0 : switch (nbdev->mp_selector) {
4116 : case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN:
4117 0 : return "round_robin";
4118 : case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH:
4119 0 : return "queue_depth";
4120 : default:
4121 0 : assert(false);
4122 : return "invalid";
4123 : }
4124 0 : }
4125 :
4126 : static int
4127 0 : bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w)
4128 : {
4129 0 : struct nvme_bdev *nvme_bdev = ctx;
4130 : struct nvme_ns *nvme_ns;
4131 :
4132 0 : pthread_mutex_lock(&nvme_bdev->mutex);
4133 0 : spdk_json_write_named_array_begin(w, "nvme");
4134 0 : TAILQ_FOREACH(nvme_ns, &nvme_bdev->nvme_ns_list, tailq) {
4135 0 : nvme_namespace_info_json(w, nvme_ns);
4136 0 : }
4137 0 : spdk_json_write_array_end(w);
4138 0 : spdk_json_write_named_string(w, "mp_policy", nvme_bdev_get_mp_policy_str(nvme_bdev));
4139 0 : if (nvme_bdev->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
4140 0 : spdk_json_write_named_string(w, "selector", nvme_bdev_get_mp_selector_str(nvme_bdev));
4141 0 : if (nvme_bdev->mp_selector == BDEV_NVME_MP_SELECTOR_ROUND_ROBIN) {
4142 0 : spdk_json_write_named_uint32(w, "rr_min_io", nvme_bdev->rr_min_io);
4143 0 : }
4144 0 : }
4145 0 : pthread_mutex_unlock(&nvme_bdev->mutex);
4146 :
4147 0 : return 0;
4148 : }
4149 :
/*
 * bdev fn_table hook: write per-bdev configuration.
 * Intentionally empty - this module emits no configuration per bdev.
 */
static void
bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
}
4155 :
4156 : static uint64_t
4157 0 : bdev_nvme_get_spin_time(struct spdk_io_channel *ch)
4158 : {
4159 0 : struct nvme_bdev_channel *nbdev_ch = spdk_io_channel_get_ctx(ch);
4160 : struct nvme_io_path *io_path;
4161 : struct nvme_poll_group *group;
4162 0 : uint64_t spin_time = 0;
4163 :
4164 0 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
4165 0 : group = io_path->qpair->group;
4166 :
4167 0 : if (!group || !group->collect_spin_stat) {
4168 0 : continue;
4169 : }
4170 :
4171 0 : if (group->end_ticks != 0) {
4172 0 : group->spin_ticks += (group->end_ticks - group->start_ticks);
4173 0 : group->end_ticks = 0;
4174 0 : }
4175 :
4176 0 : spin_time += group->spin_ticks;
4177 0 : group->start_ticks = 0;
4178 0 : group->spin_ticks = 0;
4179 0 : }
4180 :
4181 0 : return (spin_time * 1000000ULL) / spdk_get_ticks_hz();
4182 : }
4183 :
4184 : static void
4185 0 : bdev_nvme_reset_device_stat(void *ctx)
4186 : {
4187 0 : struct nvme_bdev *nbdev = ctx;
4188 :
4189 0 : if (nbdev->err_stat != NULL) {
4190 0 : memset(nbdev->err_stat, 0, sizeof(struct nvme_error_stat));
4191 0 : }
4192 0 : }
4193 :
/*
 * Turn an NVMe status description into a JSON-friendly key: lowercase and
 * underscore-delimited.
 *
 * " - " sequences, remaining '-' characters and spaces are each replaced by
 * '_', then the result is lowercased in place. dst is written as a 256-byte
 * buffer, so the caller must supply at least that much.
 */
static void
bdev_nvme_format_nvme_status(char *dst, const char *src)
{
	enum { STATUS_BUF_LEN = 256 };
	char scratch[STATUS_BUF_LEN];

	spdk_strcpy_replace(dst, STATUS_BUF_LEN, src, " - ", "_");
	spdk_strcpy_replace(scratch, sizeof(scratch), dst, "-", "_");
	spdk_strcpy_replace(dst, STATUS_BUF_LEN, scratch, " ", "_");
	spdk_strlwr(dst);
}
4205 :
4206 : static void
4207 0 : bdev_nvme_dump_device_stat_json(void *ctx, struct spdk_json_write_ctx *w)
4208 : {
4209 0 : struct nvme_bdev *nbdev = ctx;
4210 0 : struct spdk_nvme_status status = {};
4211 : uint16_t sct, sc;
4212 : char status_json[256];
4213 : const char *status_str;
4214 :
4215 0 : if (nbdev->err_stat == NULL) {
4216 0 : return;
4217 : }
4218 :
4219 0 : spdk_json_write_named_object_begin(w, "nvme_error");
4220 :
4221 0 : spdk_json_write_named_object_begin(w, "status_type");
4222 0 : for (sct = 0; sct < 8; sct++) {
4223 0 : if (nbdev->err_stat->status_type[sct] == 0) {
4224 0 : continue;
4225 : }
4226 0 : status.sct = sct;
4227 :
4228 0 : status_str = spdk_nvme_cpl_get_status_type_string(&status);
4229 0 : assert(status_str != NULL);
4230 0 : bdev_nvme_format_nvme_status(status_json, status_str);
4231 :
4232 0 : spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status_type[sct]);
4233 0 : }
4234 0 : spdk_json_write_object_end(w);
4235 :
4236 0 : spdk_json_write_named_object_begin(w, "status_code");
4237 0 : for (sct = 0; sct < 4; sct++) {
4238 0 : status.sct = sct;
4239 0 : for (sc = 0; sc < 256; sc++) {
4240 0 : if (nbdev->err_stat->status[sct][sc] == 0) {
4241 0 : continue;
4242 : }
4243 0 : status.sc = sc;
4244 :
4245 0 : status_str = spdk_nvme_cpl_get_status_string(&status);
4246 0 : assert(status_str != NULL);
4247 0 : bdev_nvme_format_nvme_status(status_json, status_str);
4248 :
4249 0 : spdk_json_write_named_uint32(w, status_json, nbdev->err_stat->status[sct][sc]);
4250 0 : }
4251 0 : }
4252 0 : spdk_json_write_object_end(w);
4253 :
4254 0 : spdk_json_write_object_end(w);
4255 0 : }
4256 :
4257 : static bool
4258 0 : bdev_nvme_accel_sequence_supported(void *ctx, enum spdk_bdev_io_type type)
4259 : {
4260 0 : struct nvme_bdev *nbdev = ctx;
4261 : struct spdk_nvme_ctrlr *ctrlr;
4262 :
4263 0 : if (!g_opts.allow_accel_sequence) {
4264 0 : return false;
4265 : }
4266 :
4267 0 : switch (type) {
4268 : case SPDK_BDEV_IO_TYPE_WRITE:
4269 : case SPDK_BDEV_IO_TYPE_READ:
4270 0 : break;
4271 : default:
4272 0 : return false;
4273 : }
4274 :
4275 0 : ctrlr = bdev_nvme_get_ctrlr(&nbdev->disk);
4276 0 : assert(ctrlr != NULL);
4277 :
4278 0 : return spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_ACCEL_SEQUENCE_SUPPORTED;
4279 0 : }
4280 :
4281 : static const struct spdk_bdev_fn_table nvmelib_fn_table = {
4282 : .destruct = bdev_nvme_destruct,
4283 : .submit_request = bdev_nvme_submit_request,
4284 : .io_type_supported = bdev_nvme_io_type_supported,
4285 : .get_io_channel = bdev_nvme_get_io_channel,
4286 : .dump_info_json = bdev_nvme_dump_info_json,
4287 : .write_config_json = bdev_nvme_write_config_json,
4288 : .get_spin_time = bdev_nvme_get_spin_time,
4289 : .get_module_ctx = bdev_nvme_get_module_ctx,
4290 : .get_memory_domains = bdev_nvme_get_memory_domains,
4291 : .accel_sequence_supported = bdev_nvme_accel_sequence_supported,
4292 : .reset_device_stat = bdev_nvme_reset_device_stat,
4293 : .dump_device_stat_json = bdev_nvme_dump_device_stat_json,
4294 : };
4295 :
/*
 * Per-descriptor callback used by bdev_nvme_parse_ana_log_page(): receives one
 * ANA group descriptor plus the caller's argument. A non-zero return stops the
 * walk and becomes the parser's return value.
 */
typedef int (*bdev_nvme_parse_ana_log_page_cb)(const struct spdk_nvme_ana_group_descriptor *desc,
		void *cb_arg);
4298 :
4299 : static int
4300 41 : bdev_nvme_parse_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
4301 : bdev_nvme_parse_ana_log_page_cb cb_fn, void *cb_arg)
4302 : {
4303 : struct spdk_nvme_ana_group_descriptor *copied_desc;
4304 : uint8_t *orig_desc;
4305 : uint32_t i, desc_size, copy_len;
4306 41 : int rc = 0;
4307 :
4308 41 : if (nvme_ctrlr->ana_log_page == NULL) {
4309 0 : return -EINVAL;
4310 : }
4311 :
4312 41 : copied_desc = nvme_ctrlr->copied_ana_desc;
4313 :
4314 41 : orig_desc = (uint8_t *)nvme_ctrlr->ana_log_page + sizeof(struct spdk_nvme_ana_page);
4315 41 : copy_len = nvme_ctrlr->max_ana_log_page_size - sizeof(struct spdk_nvme_ana_page);
4316 :
4317 71 : for (i = 0; i < nvme_ctrlr->ana_log_page->num_ana_group_desc; i++) {
4318 66 : memcpy(copied_desc, orig_desc, copy_len);
4319 :
4320 66 : rc = cb_fn(copied_desc, cb_arg);
4321 66 : if (rc != 0) {
4322 36 : break;
4323 : }
4324 :
4325 30 : desc_size = sizeof(struct spdk_nvme_ana_group_descriptor) +
4326 30 : copied_desc->num_of_nsid * sizeof(uint32_t);
4327 30 : orig_desc += desc_size;
4328 30 : copy_len -= desc_size;
4329 30 : }
4330 :
4331 41 : return rc;
4332 41 : }
4333 :
4334 : static int
4335 5 : nvme_ns_ana_transition_timedout(void *ctx)
4336 : {
4337 5 : struct nvme_ns *nvme_ns = ctx;
4338 :
4339 5 : spdk_poller_unregister(&nvme_ns->anatt_timer);
4340 5 : nvme_ns->ana_transition_timedout = true;
4341 :
4342 5 : return SPDK_POLLER_BUSY;
4343 : }
4344 :
4345 : static void
4346 45 : _nvme_ns_set_ana_state(struct nvme_ns *nvme_ns,
4347 : const struct spdk_nvme_ana_group_descriptor *desc)
4348 : {
4349 : const struct spdk_nvme_ctrlr_data *cdata;
4350 :
4351 45 : nvme_ns->ana_group_id = desc->ana_group_id;
4352 45 : nvme_ns->ana_state = desc->ana_state;
4353 45 : nvme_ns->ana_state_updating = false;
4354 :
4355 45 : switch (nvme_ns->ana_state) {
4356 : case SPDK_NVME_ANA_OPTIMIZED_STATE:
4357 : case SPDK_NVME_ANA_NON_OPTIMIZED_STATE:
4358 38 : nvme_ns->ana_transition_timedout = false;
4359 38 : spdk_poller_unregister(&nvme_ns->anatt_timer);
4360 38 : break;
4361 :
4362 : case SPDK_NVME_ANA_INACCESSIBLE_STATE:
4363 : case SPDK_NVME_ANA_CHANGE_STATE:
4364 6 : if (nvme_ns->anatt_timer != NULL) {
4365 1 : break;
4366 : }
4367 :
4368 5 : cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr);
4369 5 : nvme_ns->anatt_timer = SPDK_POLLER_REGISTER(nvme_ns_ana_transition_timedout,
4370 : nvme_ns,
4371 : cdata->anatt * SPDK_SEC_TO_USEC);
4372 5 : break;
4373 : default:
4374 1 : break;
4375 : }
4376 45 : }
4377 :
4378 : static int
4379 59 : nvme_ns_set_ana_state(const struct spdk_nvme_ana_group_descriptor *desc, void *cb_arg)
4380 : {
4381 59 : struct nvme_ns *nvme_ns = cb_arg;
4382 : uint32_t i;
4383 :
4384 59 : assert(nvme_ns->ns != NULL);
4385 :
4386 81 : for (i = 0; i < desc->num_of_nsid; i++) {
4387 58 : if (desc->nsid[i] != spdk_nvme_ns_get_id(nvme_ns->ns)) {
4388 22 : continue;
4389 : }
4390 :
4391 36 : _nvme_ns_set_ana_state(nvme_ns, desc);
4392 36 : return 1;
4393 : }
4394 :
4395 23 : return 0;
4396 59 : }
4397 :
4398 : static int
4399 5 : nvme_generate_uuid(const char *sn, uint32_t nsid, struct spdk_uuid *uuid)
4400 : {
4401 5 : int rc = 0;
4402 : struct spdk_uuid new_uuid, namespace_uuid;
4403 5 : char merged_str[SPDK_NVME_CTRLR_SN_LEN + NSID_STR_LEN + 1] = {'\0'};
4404 : /* This namespace UUID was generated using uuid_generate() method. */
4405 5 : const char *namespace_str = {"edaed2de-24bc-4b07-b559-f47ecbe730fd"};
4406 : int size;
4407 :
4408 5 : assert(strlen(sn) <= SPDK_NVME_CTRLR_SN_LEN);
4409 :
4410 5 : spdk_uuid_set_null(&new_uuid);
4411 5 : spdk_uuid_set_null(&namespace_uuid);
4412 :
4413 5 : size = snprintf(merged_str, sizeof(merged_str), "%s%"PRIu32, sn, nsid);
4414 5 : if (size <= 0 || (unsigned long)size >= sizeof(merged_str)) {
4415 0 : return -EINVAL;
4416 : }
4417 :
4418 5 : spdk_uuid_parse(&namespace_uuid, namespace_str);
4419 :
4420 5 : rc = spdk_uuid_generate_sha1(&new_uuid, &namespace_uuid, merged_str, size);
4421 5 : if (rc == 0) {
4422 5 : memcpy(uuid, &new_uuid, sizeof(struct spdk_uuid));
4423 5 : }
4424 :
4425 5 : return rc;
4426 5 : }
4427 :
4428 : static int
4429 38 : nvme_disk_create(struct spdk_bdev *disk, const char *base_name,
4430 : struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns *ns,
4431 : struct spdk_bdev_nvme_ctrlr_opts *bdev_opts, void *ctx)
4432 : {
4433 : const struct spdk_uuid *uuid;
4434 : const uint8_t *nguid;
4435 : const struct spdk_nvme_ctrlr_data *cdata;
4436 : const struct spdk_nvme_ns_data *nsdata;
4437 : const struct spdk_nvme_ctrlr_opts *opts;
4438 : enum spdk_nvme_csi csi;
4439 : uint32_t atomic_bs, phys_bs, bs;
4440 38 : char sn_tmp[SPDK_NVME_CTRLR_SN_LEN + 1] = {'\0'};
4441 : int rc;
4442 :
4443 38 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
4444 38 : csi = spdk_nvme_ns_get_csi(ns);
4445 38 : opts = spdk_nvme_ctrlr_get_opts(ctrlr);
4446 :
4447 38 : switch (csi) {
4448 : case SPDK_NVME_CSI_NVM:
4449 38 : disk->product_name = "NVMe disk";
4450 38 : break;
4451 : case SPDK_NVME_CSI_ZNS:
4452 0 : disk->product_name = "NVMe ZNS disk";
4453 0 : disk->zoned = true;
4454 0 : disk->zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
4455 0 : disk->max_zone_append_size = spdk_nvme_zns_ctrlr_get_max_zone_append_size(ctrlr) /
4456 0 : spdk_nvme_ns_get_extended_sector_size(ns);
4457 0 : disk->max_open_zones = spdk_nvme_zns_ns_get_max_open_zones(ns);
4458 0 : disk->max_active_zones = spdk_nvme_zns_ns_get_max_active_zones(ns);
4459 0 : break;
4460 : default:
4461 0 : if (bdev_opts->allow_unrecognized_csi) {
4462 0 : disk->product_name = "NVMe Passthrough disk";
4463 0 : break;
4464 : }
4465 0 : SPDK_ERRLOG("unsupported CSI: %u\n", csi);
4466 0 : return -ENOTSUP;
4467 : }
4468 :
4469 38 : nguid = spdk_nvme_ns_get_nguid(ns);
4470 38 : if (!nguid) {
4471 38 : uuid = spdk_nvme_ns_get_uuid(ns);
4472 38 : if (uuid) {
4473 12 : disk->uuid = *uuid;
4474 38 : } else if (g_opts.generate_uuids) {
4475 0 : spdk_strcpy_pad(sn_tmp, cdata->sn, SPDK_NVME_CTRLR_SN_LEN, '\0');
4476 0 : rc = nvme_generate_uuid(sn_tmp, spdk_nvme_ns_get_id(ns), &disk->uuid);
4477 0 : if (rc < 0) {
4478 0 : SPDK_ERRLOG("UUID generation failed (%s)\n", spdk_strerror(-rc));
4479 0 : return rc;
4480 : }
4481 0 : }
4482 38 : } else {
4483 0 : memcpy(&disk->uuid, nguid, sizeof(disk->uuid));
4484 : }
4485 :
4486 38 : disk->name = spdk_sprintf_alloc("%sn%d", base_name, spdk_nvme_ns_get_id(ns));
4487 38 : if (!disk->name) {
4488 0 : return -ENOMEM;
4489 : }
4490 :
4491 38 : disk->write_cache = 0;
4492 38 : if (cdata->vwc.present) {
4493 : /* Enable if the Volatile Write Cache exists */
4494 0 : disk->write_cache = 1;
4495 0 : }
4496 38 : if (cdata->oncs.write_zeroes) {
4497 0 : disk->max_write_zeroes = UINT16_MAX + 1;
4498 0 : }
4499 38 : disk->blocklen = spdk_nvme_ns_get_extended_sector_size(ns);
4500 38 : disk->blockcnt = spdk_nvme_ns_get_num_sectors(ns);
4501 38 : disk->max_segment_size = spdk_nvme_ctrlr_get_max_xfer_size(ctrlr);
4502 38 : disk->ctratt.raw = cdata->ctratt.raw;
4503 : /* NVMe driver will split one request into multiple requests
4504 : * based on MDTS and stripe boundary, the bdev layer will use
4505 : * max_segment_size and max_num_segments to split one big IO
4506 : * into multiple requests, then small request can't run out
4507 : * of NVMe internal requests data structure.
4508 : */
4509 38 : if (opts && opts->io_queue_requests) {
4510 0 : disk->max_num_segments = opts->io_queue_requests / 2;
4511 0 : }
4512 38 : if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
4513 : /* The nvme driver will try to split I/O that have too many
4514 : * SGEs, but it doesn't work if that last SGE doesn't end on
4515 : * an aggregate total that is block aligned. The bdev layer has
4516 : * a more robust splitting framework, so use that instead for
4517 : * this case. (See issue #3269.)
4518 : */
4519 0 : uint16_t max_sges = spdk_nvme_ctrlr_get_max_sges(ctrlr);
4520 :
4521 0 : if (disk->max_num_segments == 0) {
4522 0 : disk->max_num_segments = max_sges;
4523 0 : } else {
4524 0 : disk->max_num_segments = spdk_min(disk->max_num_segments, max_sges);
4525 : }
4526 0 : }
4527 38 : disk->optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns);
4528 :
4529 38 : nsdata = spdk_nvme_ns_get_data(ns);
4530 38 : bs = spdk_nvme_ns_get_sector_size(ns);
4531 38 : atomic_bs = bs;
4532 38 : phys_bs = bs;
4533 38 : if (nsdata->nabo == 0) {
4534 38 : if (nsdata->nsfeat.ns_atomic_write_unit && nsdata->nawupf) {
4535 0 : atomic_bs = bs * (1 + nsdata->nawupf);
4536 0 : } else {
4537 38 : atomic_bs = bs * (1 + cdata->awupf);
4538 : }
4539 38 : }
4540 38 : if (nsdata->nsfeat.optperf) {
4541 0 : phys_bs = bs * (1 + nsdata->npwg);
4542 0 : }
4543 38 : disk->phys_blocklen = spdk_min(phys_bs, atomic_bs);
4544 :
4545 38 : disk->md_len = spdk_nvme_ns_get_md_size(ns);
4546 38 : if (disk->md_len != 0) {
4547 0 : disk->md_interleave = nsdata->flbas.extended;
4548 0 : disk->dif_type = (enum spdk_dif_type)spdk_nvme_ns_get_pi_type(ns);
4549 0 : if (disk->dif_type != SPDK_DIF_DISABLE) {
4550 0 : disk->dif_is_head_of_md = nsdata->dps.md_start;
4551 0 : disk->dif_check_flags = bdev_opts->prchk_flags;
4552 0 : disk->dif_pi_format = (enum spdk_dif_pi_format)spdk_nvme_ns_get_pi_format(ns);
4553 0 : }
4554 0 : }
4555 :
4556 38 : if (!(spdk_nvme_ctrlr_get_flags(ctrlr) &
4557 : SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED)) {
4558 38 : disk->acwu = 0;
4559 38 : } else if (nsdata->nsfeat.ns_atomic_write_unit) {
4560 0 : disk->acwu = nsdata->nacwu + 1; /* 0-based */
4561 0 : } else {
4562 0 : disk->acwu = cdata->acwu + 1; /* 0-based */
4563 : }
4564 :
4565 38 : if (cdata->oncs.copy) {
4566 : /* For now bdev interface allows only single segment copy */
4567 0 : disk->max_copy = nsdata->mssrl;
4568 0 : }
4569 :
4570 38 : disk->ctxt = ctx;
4571 38 : disk->fn_table = &nvmelib_fn_table;
4572 38 : disk->module = &nvme_if;
4573 :
4574 38 : disk->numa.id_valid = 1;
4575 38 : disk->numa.id = spdk_nvme_ctrlr_get_numa_id(ctrlr);
4576 :
4577 38 : return 0;
4578 38 : }
4579 :
4580 : static struct nvme_bdev *
4581 38 : nvme_bdev_alloc(void)
4582 : {
4583 : struct nvme_bdev *bdev;
4584 : int rc;
4585 :
4586 38 : bdev = calloc(1, sizeof(*bdev));
4587 38 : if (!bdev) {
4588 0 : SPDK_ERRLOG("bdev calloc() failed\n");
4589 0 : return NULL;
4590 : }
4591 :
4592 38 : if (g_opts.nvme_error_stat) {
4593 0 : bdev->err_stat = calloc(1, sizeof(struct nvme_error_stat));
4594 0 : if (!bdev->err_stat) {
4595 0 : SPDK_ERRLOG("err_stat calloc() failed\n");
4596 0 : free(bdev);
4597 0 : return NULL;
4598 : }
4599 0 : }
4600 :
4601 38 : rc = pthread_mutex_init(&bdev->mutex, NULL);
4602 38 : if (rc != 0) {
4603 0 : free(bdev->err_stat);
4604 0 : free(bdev);
4605 0 : return NULL;
4606 : }
4607 :
4608 38 : bdev->ref = 1;
4609 38 : bdev->mp_policy = BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE;
4610 38 : bdev->mp_selector = BDEV_NVME_MP_SELECTOR_ROUND_ROBIN;
4611 38 : bdev->rr_min_io = UINT32_MAX;
4612 38 : TAILQ_INIT(&bdev->nvme_ns_list);
4613 :
4614 38 : return bdev;
4615 38 : }
4616 :
4617 : static int
4618 38 : nvme_bdev_create(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
4619 : {
4620 : struct nvme_bdev *bdev;
4621 38 : struct nvme_bdev_ctrlr *nbdev_ctrlr = nvme_ctrlr->nbdev_ctrlr;
4622 : int rc;
4623 :
4624 38 : bdev = nvme_bdev_alloc();
4625 38 : if (bdev == NULL) {
4626 0 : SPDK_ERRLOG("Failed to allocate NVMe bdev\n");
4627 0 : return -ENOMEM;
4628 : }
4629 :
4630 38 : bdev->opal = nvme_ctrlr->opal_dev != NULL;
4631 :
4632 76 : rc = nvme_disk_create(&bdev->disk, nbdev_ctrlr->name, nvme_ctrlr->ctrlr,
4633 38 : nvme_ns->ns, &nvme_ctrlr->opts, bdev);
4634 38 : if (rc != 0) {
4635 0 : SPDK_ERRLOG("Failed to create NVMe disk\n");
4636 0 : nvme_bdev_free(bdev);
4637 0 : return rc;
4638 : }
4639 :
4640 76 : spdk_io_device_register(bdev,
4641 : bdev_nvme_create_bdev_channel_cb,
4642 : bdev_nvme_destroy_bdev_channel_cb,
4643 : sizeof(struct nvme_bdev_channel),
4644 38 : bdev->disk.name);
4645 :
4646 38 : nvme_ns->bdev = bdev;
4647 38 : bdev->nsid = nvme_ns->id;
4648 38 : TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
4649 :
4650 38 : bdev->nbdev_ctrlr = nbdev_ctrlr;
4651 38 : TAILQ_INSERT_TAIL(&nbdev_ctrlr->bdevs, bdev, tailq);
4652 :
4653 38 : rc = spdk_bdev_register(&bdev->disk);
4654 38 : if (rc != 0) {
4655 1 : SPDK_ERRLOG("spdk_bdev_register() failed\n");
4656 1 : spdk_io_device_unregister(bdev, NULL);
4657 1 : nvme_ns->bdev = NULL;
4658 1 : TAILQ_REMOVE(&nbdev_ctrlr->bdevs, bdev, tailq);
4659 1 : nvme_bdev_free(bdev);
4660 1 : return rc;
4661 : }
4662 :
4663 37 : return 0;
4664 38 : }
4665 :
4666 : static bool
4667 23 : bdev_nvme_compare_ns(struct spdk_nvme_ns *ns1, struct spdk_nvme_ns *ns2)
4668 : {
4669 : const struct spdk_nvme_ns_data *nsdata1, *nsdata2;
4670 : const struct spdk_uuid *uuid1, *uuid2;
4671 :
4672 23 : nsdata1 = spdk_nvme_ns_get_data(ns1);
4673 23 : nsdata2 = spdk_nvme_ns_get_data(ns2);
4674 23 : uuid1 = spdk_nvme_ns_get_uuid(ns1);
4675 23 : uuid2 = spdk_nvme_ns_get_uuid(ns2);
4676 :
4677 71 : return memcmp(nsdata1->nguid, nsdata2->nguid, sizeof(nsdata1->nguid)) == 0 &&
4678 22 : nsdata1->eui64 == nsdata2->eui64 &&
4679 21 : ((uuid1 == NULL && uuid2 == NULL) ||
4680 29 : (uuid1 != NULL && uuid2 != NULL && spdk_uuid_compare(uuid1, uuid2) == 0)) &&
4681 18 : spdk_nvme_ns_get_csi(ns1) == spdk_nvme_ns_get_csi(ns2);
4682 : }
4683 :
4684 : static bool
4685 0 : hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
4686 : struct spdk_nvme_ctrlr_opts *opts)
4687 : {
4688 : struct nvme_probe_skip_entry *entry;
4689 :
4690 0 : TAILQ_FOREACH(entry, &g_skipped_nvme_ctrlrs, tailq) {
4691 0 : if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
4692 0 : return false;
4693 : }
4694 0 : }
4695 :
4696 0 : opts->arbitration_burst = (uint8_t)g_opts.arbitration_burst;
4697 0 : opts->low_priority_weight = (uint8_t)g_opts.low_priority_weight;
4698 0 : opts->medium_priority_weight = (uint8_t)g_opts.medium_priority_weight;
4699 0 : opts->high_priority_weight = (uint8_t)g_opts.high_priority_weight;
4700 0 : opts->disable_read_ana_log_page = true;
4701 :
4702 0 : SPDK_DEBUGLOG(bdev_nvme, "Attaching to %s\n", trid->traddr);
4703 :
4704 0 : return true;
4705 0 : }
4706 :
4707 : static void
4708 0 : nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
4709 : {
4710 0 : struct nvme_ctrlr *nvme_ctrlr = ctx;
4711 :
4712 0 : if (spdk_nvme_cpl_is_error(cpl)) {
4713 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr, "Abort failed. Resetting controller. sc is %u, sct is %u.\n",
4714 : cpl->status.sc, cpl->status.sct);
4715 0 : bdev_nvme_reset_ctrlr(nvme_ctrlr);
4716 0 : } else if (cpl->cdw0 & 0x1) {
4717 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr, "Specified command could not be aborted.\n");
4718 0 : bdev_nvme_reset_ctrlr(nvme_ctrlr);
4719 0 : }
4720 0 : }
4721 :
4722 : static void
4723 0 : timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
4724 : struct spdk_nvme_qpair *qpair, uint16_t cid)
4725 : {
4726 0 : struct nvme_ctrlr *nvme_ctrlr = cb_arg;
4727 : union spdk_nvme_csts_register csts;
4728 : int rc;
4729 :
4730 0 : assert(nvme_ctrlr->ctrlr == ctrlr);
4731 :
4732 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr, "Warning: Detected a timeout. ctrlr=%p qpair=%p cid=%u\n",
4733 : ctrlr, qpair, cid);
4734 :
4735 : /* Only try to read CSTS if it's a PCIe controller or we have a timeout on an I/O
4736 : * queue. (Note: qpair == NULL when there's an admin cmd timeout.) Otherwise we
4737 : * would submit another fabrics cmd on the admin queue to read CSTS and check for its
4738 : * completion recursively.
4739 : */
4740 0 : if (nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE || qpair != NULL) {
4741 0 : csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
4742 0 : if (csts.bits.cfs) {
4743 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Controller Fatal Status, reset required\n");
4744 0 : bdev_nvme_reset_ctrlr(nvme_ctrlr);
4745 0 : return;
4746 : }
4747 0 : }
4748 :
4749 0 : switch (g_opts.action_on_timeout) {
4750 : case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT:
4751 0 : if (qpair) {
4752 : /* Don't send abort to ctrlr when ctrlr is not available. */
4753 0 : pthread_mutex_lock(&nvme_ctrlr->mutex);
4754 0 : if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
4755 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4756 0 : NVME_CTRLR_NOTICELOG(nvme_ctrlr, "Quit abort. Ctrlr is not available.\n");
4757 0 : return;
4758 : }
4759 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4760 :
4761 0 : rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid,
4762 0 : nvme_abort_cpl, nvme_ctrlr);
4763 0 : if (rc == 0) {
4764 0 : return;
4765 : }
4766 :
4767 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Unable to send abort. Resetting, rc is %d.\n", rc);
4768 0 : }
4769 :
4770 : /* FALLTHROUGH */
4771 : case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET:
4772 0 : bdev_nvme_reset_ctrlr(nvme_ctrlr);
4773 0 : break;
4774 : case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE:
4775 0 : NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "No action for nvme controller timeout.\n");
4776 0 : break;
4777 : default:
4778 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "An invalid timeout action value is found.\n");
4779 0 : break;
4780 : }
4781 0 : }
4782 :
4783 : static struct nvme_ns *
4784 51 : nvme_ns_alloc(void)
4785 : {
4786 : struct nvme_ns *nvme_ns;
4787 :
4788 51 : nvme_ns = calloc(1, sizeof(struct nvme_ns));
4789 51 : if (nvme_ns == NULL) {
4790 0 : return NULL;
4791 : }
4792 :
4793 51 : if (g_opts.io_path_stat) {
4794 0 : nvme_ns->stat = calloc(1, sizeof(struct spdk_bdev_io_stat));
4795 0 : if (nvme_ns->stat == NULL) {
4796 0 : free(nvme_ns);
4797 0 : return NULL;
4798 : }
4799 0 : spdk_bdev_reset_io_stat(nvme_ns->stat, SPDK_BDEV_RESET_STAT_MAXMIN);
4800 0 : }
4801 :
4802 51 : return nvme_ns;
4803 51 : }
4804 :
4805 : static void
4806 51 : nvme_ns_free(struct nvme_ns *nvme_ns)
4807 : {
4808 51 : free(nvme_ns->stat);
4809 51 : free(nvme_ns);
4810 51 : }
4811 :
4812 : static void
4813 51 : nvme_ctrlr_populate_namespace_done(struct nvme_ns *nvme_ns, int rc)
4814 : {
4815 51 : struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
4816 51 : struct nvme_async_probe_ctx *ctx = nvme_ns->probe_ctx;
4817 :
4818 51 : if (rc == 0) {
4819 49 : nvme_ns->probe_ctx = NULL;
4820 49 : pthread_mutex_lock(&nvme_ctrlr->mutex);
4821 49 : nvme_ctrlr->ref++;
4822 49 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4823 49 : } else {
4824 2 : RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
4825 2 : nvme_ns_free(nvme_ns);
4826 : }
4827 :
4828 51 : if (ctx) {
4829 50 : ctx->populates_in_progress--;
4830 50 : if (ctx->populates_in_progress == 0) {
4831 12 : nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
4832 12 : }
4833 50 : }
4834 51 : }
4835 :
/* Per-channel iterator step: add an I/O path for the namespace passed in ctx to
 * this nvme_bdev_channel. The result is forwarded to the iterator, so a non-zero
 * status is what the iteration's completion callback receives.
 */
static void
bdev_nvme_add_io_path(struct nvme_bdev_channel_iter *i,
		      struct nvme_bdev *nbdev,
		      struct nvme_bdev_channel *nbdev_ch, void *ctx)
{
	struct nvme_ns *new_ns = ctx;
	int status = _bdev_nvme_add_io_path(nbdev_ch, new_ns);

	if (status != 0) {
		SPDK_ERRLOG("Failed to add I/O path to bdev_channel dynamically.\n");
	}

	nvme_bdev_for_each_channel_continue(i, status);
}
4851 :
/* Per-channel iterator step: delete the I/O path that belongs to the namespace
 * passed in ctx, if this channel has one. Always continues the iteration with
 * status 0.
 */
static void
bdev_nvme_delete_io_path(struct nvme_bdev_channel_iter *i,
			 struct nvme_bdev *nbdev,
			 struct nvme_bdev_channel *nbdev_ch, void *ctx)
{
	struct nvme_ns *target_ns = ctx;
	struct nvme_io_path *path = _bdev_nvme_get_io_path(nbdev_ch, target_ns);

	if (path != NULL) {
		_bdev_nvme_delete_io_path(nbdev_ch, path);
	}

	nvme_bdev_for_each_channel_continue(i, 0);
}
4867 :
/* Completion of the rollback iteration started by bdev_nvme_add_io_path_done():
 * report the namespace population as failed. The status argument is ignored.
 */
static void
bdev_nvme_add_io_path_failed(struct nvme_bdev *nbdev, void *ctx, int status)
{
	struct nvme_ns *failed_ns = ctx;

	nvme_ctrlr_populate_namespace_done(failed_ns, -1);
}
4875 :
4876 : static void
4877 12 : bdev_nvme_add_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status)
4878 : {
4879 12 : struct nvme_ns *nvme_ns = ctx;
4880 :
4881 12 : if (status == 0) {
4882 12 : nvme_ctrlr_populate_namespace_done(nvme_ns, 0);
4883 12 : } else {
4884 : /* Delete the added io_paths and fail populating the namespace. */
4885 0 : nvme_bdev_for_each_channel(nbdev,
4886 : bdev_nvme_delete_io_path,
4887 0 : nvme_ns,
4888 : bdev_nvme_add_io_path_failed);
4889 : }
4890 12 : }
4891 :
4892 : static int
4893 13 : nvme_bdev_add_ns(struct nvme_bdev *bdev, struct nvme_ns *nvme_ns)
4894 : {
4895 : struct nvme_ns *tmp_ns;
4896 : const struct spdk_nvme_ns_data *nsdata;
4897 :
4898 13 : nsdata = spdk_nvme_ns_get_data(nvme_ns->ns);
4899 13 : if (!nsdata->nmic.can_share) {
4900 0 : SPDK_ERRLOG("Namespace cannot be shared.\n");
4901 0 : return -EINVAL;
4902 : }
4903 :
4904 13 : pthread_mutex_lock(&bdev->mutex);
4905 :
4906 13 : tmp_ns = TAILQ_FIRST(&bdev->nvme_ns_list);
4907 13 : assert(tmp_ns != NULL);
4908 :
4909 13 : if (tmp_ns->ns != NULL && !bdev_nvme_compare_ns(nvme_ns->ns, tmp_ns->ns)) {
4910 1 : pthread_mutex_unlock(&bdev->mutex);
4911 1 : SPDK_ERRLOG("Namespaces are not identical.\n");
4912 1 : return -EINVAL;
4913 : }
4914 :
4915 12 : bdev->ref++;
4916 12 : TAILQ_INSERT_TAIL(&bdev->nvme_ns_list, nvme_ns, tailq);
4917 12 : nvme_ns->bdev = bdev;
4918 :
4919 12 : pthread_mutex_unlock(&bdev->mutex);
4920 :
4921 : /* Add nvme_io_path to nvme_bdev_channels dynamically. */
4922 24 : nvme_bdev_for_each_channel(bdev,
4923 : bdev_nvme_add_io_path,
4924 12 : nvme_ns,
4925 : bdev_nvme_add_io_path_done);
4926 :
4927 12 : return 0;
4928 13 : }
4929 :
4930 : static void
4931 51 : nvme_ctrlr_populate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
4932 : {
4933 : struct spdk_nvme_ns *ns;
4934 : struct nvme_bdev *bdev;
4935 51 : int rc = 0;
4936 :
4937 51 : ns = spdk_nvme_ctrlr_get_ns(nvme_ctrlr->ctrlr, nvme_ns->id);
4938 51 : if (!ns) {
4939 0 : NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "Invalid NS %d\n", nvme_ns->id);
4940 0 : rc = -EINVAL;
4941 0 : goto done;
4942 : }
4943 :
4944 51 : nvme_ns->ns = ns;
4945 51 : nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
4946 :
4947 51 : if (nvme_ctrlr->ana_log_page != NULL) {
4948 37 : bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ns_set_ana_state, nvme_ns);
4949 37 : }
4950 :
4951 51 : bdev = nvme_bdev_ctrlr_get_bdev(nvme_ctrlr->nbdev_ctrlr, nvme_ns->id);
4952 90 : if (bdev == NULL) {
4953 38 : rc = nvme_bdev_create(nvme_ctrlr, nvme_ns);
4954 38 : } else {
4955 13 : rc = nvme_bdev_add_ns(bdev, nvme_ns);
4956 13 : if (rc == 0) {
4957 12 : return;
4958 : }
4959 : }
4960 : done:
4961 39 : nvme_ctrlr_populate_namespace_done(nvme_ns, rc);
4962 51 : }
4963 :
4964 : static void
4965 49 : nvme_ctrlr_depopulate_namespace_done(struct nvme_ns *nvme_ns)
4966 : {
4967 49 : struct nvme_ctrlr *nvme_ctrlr = nvme_ns->ctrlr;
4968 :
4969 49 : assert(nvme_ctrlr != NULL);
4970 :
4971 49 : pthread_mutex_lock(&nvme_ctrlr->mutex);
4972 :
4973 49 : RB_REMOVE(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
4974 :
4975 49 : if (nvme_ns->bdev != NULL) {
4976 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4977 0 : return;
4978 : }
4979 :
4980 49 : nvme_ns_free(nvme_ns);
4981 49 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
4982 :
4983 49 : nvme_ctrlr_release(nvme_ctrlr);
4984 49 : }
4985 :
/* Completion of the "delete I/O path" channel iteration started by
 * nvme_ctrlr_depopulate_namespace(): finish depopulating the namespace.
 * The status argument is ignored.
 */
static void
bdev_nvme_delete_io_path_done(struct nvme_bdev *nbdev, void *ctx, int status)
{
	struct nvme_ns *removed_ns = ctx;

	nvme_ctrlr_depopulate_namespace_done(removed_ns);
}
4993 :
4994 : static void
4995 49 : nvme_ctrlr_depopulate_namespace(struct nvme_ctrlr *nvme_ctrlr, struct nvme_ns *nvme_ns)
4996 : {
4997 : struct nvme_bdev *bdev;
4998 :
4999 49 : spdk_poller_unregister(&nvme_ns->anatt_timer);
5000 :
5001 49 : bdev = nvme_ns->bdev;
5002 49 : if (bdev != NULL) {
5003 45 : pthread_mutex_lock(&bdev->mutex);
5004 :
5005 45 : assert(bdev->ref > 0);
5006 45 : bdev->ref--;
5007 45 : if (bdev->ref == 0) {
5008 34 : pthread_mutex_unlock(&bdev->mutex);
5009 :
5010 34 : spdk_bdev_unregister(&bdev->disk, NULL, NULL);
5011 34 : } else {
5012 : /* spdk_bdev_unregister() is not called until the last nvme_ns is
5013 : * depopulated. Hence we need to remove nvme_ns from bdev->nvme_ns_list
5014 : * and clear nvme_ns->bdev here.
5015 : */
5016 11 : TAILQ_REMOVE(&bdev->nvme_ns_list, nvme_ns, tailq);
5017 11 : nvme_ns->bdev = NULL;
5018 :
5019 11 : pthread_mutex_unlock(&bdev->mutex);
5020 :
5021 : /* Delete nvme_io_paths from nvme_bdev_channels dynamically. After that,
5022 : * we call depopulate_namespace_done() to avoid use-after-free.
5023 : */
5024 22 : nvme_bdev_for_each_channel(bdev,
5025 : bdev_nvme_delete_io_path,
5026 11 : nvme_ns,
5027 : bdev_nvme_delete_io_path_done);
5028 11 : return;
5029 : }
5030 34 : }
5031 :
5032 38 : nvme_ctrlr_depopulate_namespace_done(nvme_ns);
5033 49 : }
5034 :
5035 : static void
5036 62 : nvme_ctrlr_populate_namespaces(struct nvme_ctrlr *nvme_ctrlr,
5037 : struct nvme_async_probe_ctx *ctx)
5038 : {
5039 62 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
5040 : struct nvme_ns *nvme_ns, *next;
5041 : struct spdk_nvme_ns *ns;
5042 : struct nvme_bdev *bdev;
5043 : uint32_t nsid;
5044 : int rc;
5045 : uint64_t num_sectors;
5046 :
5047 62 : if (ctx) {
5048 : /* Initialize this count to 1 to handle the populate functions
5049 : * calling nvme_ctrlr_populate_namespace_done() immediately.
5050 : */
5051 46 : ctx->populates_in_progress = 1;
5052 46 : }
5053 :
5054 : /* First loop over our existing namespaces and see if they have been
5055 : * removed. */
5056 62 : nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
5057 66 : while (nvme_ns != NULL) {
5058 4 : next = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
5059 :
5060 4 : if (spdk_nvme_ctrlr_is_active_ns(ctrlr, nvme_ns->id)) {
5061 : /* NS is still there or added again. Its attributes may have changed. */
5062 3 : ns = spdk_nvme_ctrlr_get_ns(ctrlr, nvme_ns->id);
5063 3 : if (nvme_ns->ns != ns) {
5064 1 : assert(nvme_ns->ns == NULL);
5065 1 : nvme_ns->ns = ns;
5066 1 : NVME_CTRLR_DEBUGLOG(nvme_ctrlr, "NSID %u was added\n", nvme_ns->id);
5067 1 : }
5068 :
5069 3 : num_sectors = spdk_nvme_ns_get_num_sectors(ns);
5070 3 : bdev = nvme_ns->bdev;
5071 3 : assert(bdev != NULL);
5072 3 : if (bdev->disk.blockcnt != num_sectors) {
5073 1 : NVME_CTRLR_NOTICELOG(nvme_ctrlr,
5074 : "NSID %u is resized: bdev name %s, old size %" PRIu64 ", new size %" PRIu64 "\n",
5075 : nvme_ns->id,
5076 : bdev->disk.name,
5077 : bdev->disk.blockcnt,
5078 : num_sectors);
5079 1 : rc = spdk_bdev_notify_blockcnt_change(&bdev->disk, num_sectors);
5080 1 : if (rc != 0) {
5081 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr,
5082 : "Could not change num blocks for nvme bdev: name %s, errno: %d.\n",
5083 : bdev->disk.name, rc);
5084 0 : }
5085 1 : }
5086 3 : } else {
5087 : /* Namespace was removed */
5088 1 : nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
5089 : }
5090 :
5091 4 : nvme_ns = next;
5092 : }
5093 :
5094 : /* Loop through all of the namespaces at the nvme level and see if any of them are new */
5095 62 : nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
5096 116 : while (nsid != 0) {
5097 54 : nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
5098 :
5099 54 : if (nvme_ns == NULL) {
5100 : /* Found a new one */
5101 51 : nvme_ns = nvme_ns_alloc();
5102 51 : if (nvme_ns == NULL) {
5103 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate namespace\n");
5104 : /* This just fails to attach the namespace. It may work on a future attempt. */
5105 0 : continue;
5106 : }
5107 :
5108 51 : nvme_ns->id = nsid;
5109 51 : nvme_ns->ctrlr = nvme_ctrlr;
5110 :
5111 51 : nvme_ns->bdev = NULL;
5112 :
5113 51 : if (ctx) {
5114 50 : ctx->populates_in_progress++;
5115 50 : }
5116 51 : nvme_ns->probe_ctx = ctx;
5117 :
5118 51 : RB_INSERT(nvme_ns_tree, &nvme_ctrlr->namespaces, nvme_ns);
5119 :
5120 51 : nvme_ctrlr_populate_namespace(nvme_ctrlr, nvme_ns);
5121 51 : }
5122 :
5123 54 : nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid);
5124 : }
5125 :
5126 62 : if (ctx) {
5127 : /* Decrement this count now that the loop is over to account
5128 : * for the one we started with. If the count is then 0, we
5129 : * know any populate_namespace functions completed immediately,
5130 : * so we'll kick the callback here.
5131 : */
5132 46 : ctx->populates_in_progress--;
5133 46 : if (ctx->populates_in_progress == 0) {
5134 34 : nvme_ctrlr_populate_namespaces_done(nvme_ctrlr, ctx);
5135 34 : }
5136 46 : }
5137 :
5138 62 : }
5139 :
5140 : static void
5141 61 : nvme_ctrlr_depopulate_namespaces(struct nvme_ctrlr *nvme_ctrlr)
5142 : {
5143 : struct nvme_ns *nvme_ns, *tmp;
5144 :
5145 109 : RB_FOREACH_SAFE(nvme_ns, nvme_ns_tree, &nvme_ctrlr->namespaces, tmp) {
5146 48 : nvme_ctrlr_depopulate_namespace(nvme_ctrlr, nvme_ns);
5147 48 : }
5148 61 : }
5149 :
5150 : static uint32_t
5151 36 : nvme_ctrlr_get_ana_log_page_size(struct nvme_ctrlr *nvme_ctrlr)
5152 : {
5153 36 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
5154 : const struct spdk_nvme_ctrlr_data *cdata;
5155 36 : uint32_t nsid, ns_count = 0;
5156 :
5157 36 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
5158 :
5159 80 : for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
5160 80 : nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) {
5161 44 : ns_count++;
5162 44 : }
5163 :
5164 72 : return sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
5165 36 : sizeof(struct spdk_nvme_ana_group_descriptor) + ns_count *
5166 : sizeof(uint32_t);
5167 : }
5168 :
5169 : static int
5170 7 : nvme_ctrlr_set_ana_states(const struct spdk_nvme_ana_group_descriptor *desc,
5171 : void *cb_arg)
5172 : {
5173 7 : struct nvme_ctrlr *nvme_ctrlr = cb_arg;
5174 : struct nvme_ns *nvme_ns;
5175 : uint32_t i, nsid;
5176 :
5177 13 : for (i = 0; i < desc->num_of_nsid; i++) {
5178 6 : nsid = desc->nsid[i];
5179 6 : if (nsid == 0) {
5180 0 : continue;
5181 : }
5182 :
5183 6 : nvme_ns = nvme_ctrlr_get_ns(nvme_ctrlr, nsid);
5184 :
5185 6 : if (nvme_ns == NULL) {
5186 : /* Target told us that an inactive namespace had an ANA change */
5187 1 : continue;
5188 : }
5189 :
5190 5 : _nvme_ns_set_ana_state(nvme_ns, desc);
5191 5 : }
5192 :
5193 7 : return 0;
5194 : }
5195 :
5196 : static void
5197 0 : bdev_nvme_disable_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
5198 : {
5199 : struct nvme_ns *nvme_ns;
5200 :
5201 0 : spdk_free(nvme_ctrlr->ana_log_page);
5202 0 : nvme_ctrlr->ana_log_page = NULL;
5203 :
5204 0 : for (nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
5205 0 : nvme_ns != NULL;
5206 0 : nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns)) {
5207 0 : nvme_ns->ana_state_updating = false;
5208 0 : nvme_ns->ana_state = SPDK_NVME_ANA_OPTIMIZED_STATE;
5209 0 : }
5210 0 : }
5211 :
5212 : static void
5213 3 : nvme_ctrlr_read_ana_log_page_done(void *ctx, const struct spdk_nvme_cpl *cpl)
5214 : {
5215 3 : struct nvme_ctrlr *nvme_ctrlr = ctx;
5216 :
5217 3 : if (cpl != NULL && spdk_nvme_cpl_is_success(cpl)) {
5218 6 : bdev_nvme_parse_ana_log_page(nvme_ctrlr, nvme_ctrlr_set_ana_states,
5219 3 : nvme_ctrlr);
5220 3 : } else {
5221 0 : bdev_nvme_disable_read_ana_log_page(nvme_ctrlr);
5222 : }
5223 :
5224 3 : pthread_mutex_lock(&nvme_ctrlr->mutex);
5225 :
5226 3 : assert(nvme_ctrlr->ana_log_page_updating == true);
5227 3 : nvme_ctrlr->ana_log_page_updating = false;
5228 :
5229 3 : if (nvme_ctrlr_can_be_unregistered(nvme_ctrlr)) {
5230 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5231 :
5232 0 : nvme_ctrlr_unregister(nvme_ctrlr);
5233 0 : } else {
5234 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5235 :
5236 3 : bdev_nvme_clear_io_path_caches(nvme_ctrlr);
5237 : }
5238 3 : }
5239 :
5240 : static int
5241 6 : nvme_ctrlr_read_ana_log_page(struct nvme_ctrlr *nvme_ctrlr)
5242 : {
5243 : uint32_t ana_log_page_size;
5244 : int rc;
5245 :
5246 6 : if (nvme_ctrlr->ana_log_page == NULL) {
5247 0 : return -EINVAL;
5248 : }
5249 :
5250 6 : ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr);
5251 :
5252 6 : if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) {
5253 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr,
5254 : "ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n",
5255 : ana_log_page_size, nvme_ctrlr->max_ana_log_page_size);
5256 0 : return -EINVAL;
5257 : }
5258 :
5259 6 : pthread_mutex_lock(&nvme_ctrlr->mutex);
5260 6 : if (!nvme_ctrlr_is_available(nvme_ctrlr) ||
5261 5 : nvme_ctrlr->ana_log_page_updating) {
5262 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5263 3 : return -EBUSY;
5264 : }
5265 :
5266 3 : nvme_ctrlr->ana_log_page_updating = true;
5267 3 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
5268 :
5269 6 : rc = spdk_nvme_ctrlr_cmd_get_log_page(nvme_ctrlr->ctrlr,
5270 : SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
5271 : SPDK_NVME_GLOBAL_NS_TAG,
5272 3 : nvme_ctrlr->ana_log_page,
5273 3 : ana_log_page_size, 0,
5274 : nvme_ctrlr_read_ana_log_page_done,
5275 3 : nvme_ctrlr);
5276 3 : if (rc != 0) {
5277 0 : nvme_ctrlr_read_ana_log_page_done(nvme_ctrlr, NULL);
5278 0 : }
5279 :
5280 3 : return rc;
5281 6 : }
5282 :
5283 : static void
5284 0 : dummy_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
5285 : {
5286 0 : }
5287 :
5288 : struct bdev_nvme_set_preferred_path_ctx {
5289 : struct spdk_bdev_desc *desc;
5290 : struct nvme_ns *nvme_ns;
5291 : bdev_nvme_set_preferred_path_cb cb_fn;
5292 : void *cb_arg;
5293 : };
5294 :
5295 : static void
5296 3 : bdev_nvme_set_preferred_path_done(struct nvme_bdev *nbdev, void *_ctx, int status)
5297 : {
5298 3 : struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx;
5299 :
5300 3 : assert(ctx != NULL);
5301 3 : assert(ctx->desc != NULL);
5302 3 : assert(ctx->cb_fn != NULL);
5303 :
5304 3 : spdk_bdev_close(ctx->desc);
5305 :
5306 3 : ctx->cb_fn(ctx->cb_arg, status);
5307 :
5308 3 : free(ctx);
5309 3 : }
5310 :
5311 : static void
5312 2 : _bdev_nvme_set_preferred_path(struct nvme_bdev_channel_iter *i,
5313 : struct nvme_bdev *nbdev,
5314 : struct nvme_bdev_channel *nbdev_ch, void *_ctx)
5315 : {
5316 2 : struct bdev_nvme_set_preferred_path_ctx *ctx = _ctx;
5317 : struct nvme_io_path *io_path, *prev;
5318 :
5319 2 : prev = NULL;
5320 3 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
5321 3 : if (io_path->nvme_ns == ctx->nvme_ns) {
5322 2 : break;
5323 : }
5324 1 : prev = io_path;
5325 1 : }
5326 :
5327 2 : if (io_path != NULL) {
5328 2 : if (prev != NULL) {
5329 1 : STAILQ_REMOVE_AFTER(&nbdev_ch->io_path_list, prev, stailq);
5330 1 : STAILQ_INSERT_HEAD(&nbdev_ch->io_path_list, io_path, stailq);
5331 1 : }
5332 :
5333 : /* We can set io_path to nbdev_ch->current_io_path directly here.
5334 : * However, it needs to be conditional. To simplify the code,
5335 : * just clear nbdev_ch->current_io_path and let find_io_path()
5336 : * fill it.
5337 : *
5338 : * Automatic failback may be disabled. Hence even if the io_path is
5339 : * already at the head, clear nbdev_ch->current_io_path.
5340 : */
5341 2 : bdev_nvme_clear_current_io_path(nbdev_ch);
5342 2 : }
5343 :
5344 2 : nvme_bdev_for_each_channel_continue(i, 0);
5345 2 : }
5346 :
5347 : static struct nvme_ns *
5348 3 : bdev_nvme_set_preferred_ns(struct nvme_bdev *nbdev, uint16_t cntlid)
5349 : {
5350 : struct nvme_ns *nvme_ns, *prev;
5351 : const struct spdk_nvme_ctrlr_data *cdata;
5352 :
5353 3 : prev = NULL;
5354 6 : TAILQ_FOREACH(nvme_ns, &nbdev->nvme_ns_list, tailq) {
5355 6 : cdata = spdk_nvme_ctrlr_get_data(nvme_ns->ctrlr->ctrlr);
5356 :
5357 6 : if (cdata->cntlid == cntlid) {
5358 3 : break;
5359 : }
5360 3 : prev = nvme_ns;
5361 3 : }
5362 :
5363 3 : if (nvme_ns != NULL && prev != NULL) {
5364 2 : TAILQ_REMOVE(&nbdev->nvme_ns_list, nvme_ns, tailq);
5365 2 : TAILQ_INSERT_HEAD(&nbdev->nvme_ns_list, nvme_ns, tailq);
5366 2 : }
5367 :
5368 3 : return nvme_ns;
5369 : }
5370 :
5371 : /* This function supports only multipath mode. There is only a single I/O path
5372 : * for each NVMe-oF controller. Hence, just move the matched I/O path to the
5373 : * head of the I/O path list for each NVMe bdev channel.
5374 : *
5375 : * NVMe bdev channel may be acquired after completing this function. move the
5376 : * matched namespace to the head of the namespace list for the NVMe bdev too.
5377 : */
5378 : void
5379 3 : bdev_nvme_set_preferred_path(const char *name, uint16_t cntlid,
5380 : bdev_nvme_set_preferred_path_cb cb_fn, void *cb_arg)
5381 : {
5382 : struct bdev_nvme_set_preferred_path_ctx *ctx;
5383 : struct spdk_bdev *bdev;
5384 : struct nvme_bdev *nbdev;
5385 3 : int rc = 0;
5386 :
5387 3 : assert(cb_fn != NULL);
5388 :
5389 3 : ctx = calloc(1, sizeof(*ctx));
5390 3 : if (ctx == NULL) {
5391 0 : SPDK_ERRLOG("Failed to alloc context.\n");
5392 0 : rc = -ENOMEM;
5393 0 : goto err_alloc;
5394 : }
5395 :
5396 3 : ctx->cb_fn = cb_fn;
5397 3 : ctx->cb_arg = cb_arg;
5398 :
5399 3 : rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc);
5400 3 : if (rc != 0) {
5401 0 : SPDK_ERRLOG("Failed to open bdev %s.\n", name);
5402 0 : goto err_open;
5403 : }
5404 :
5405 3 : bdev = spdk_bdev_desc_get_bdev(ctx->desc);
5406 :
5407 3 : if (bdev->module != &nvme_if) {
5408 0 : SPDK_ERRLOG("bdev %s is not registered in this module.\n", name);
5409 0 : rc = -ENODEV;
5410 0 : goto err_bdev;
5411 : }
5412 :
5413 3 : nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
5414 :
5415 3 : pthread_mutex_lock(&nbdev->mutex);
5416 :
5417 3 : ctx->nvme_ns = bdev_nvme_set_preferred_ns(nbdev, cntlid);
5418 3 : if (ctx->nvme_ns == NULL) {
5419 0 : pthread_mutex_unlock(&nbdev->mutex);
5420 :
5421 0 : SPDK_ERRLOG("bdev %s does not have namespace to controller %u.\n", name, cntlid);
5422 0 : rc = -ENODEV;
5423 0 : goto err_bdev;
5424 : }
5425 :
5426 3 : pthread_mutex_unlock(&nbdev->mutex);
5427 :
5428 6 : nvme_bdev_for_each_channel(nbdev,
5429 : _bdev_nvme_set_preferred_path,
5430 3 : ctx,
5431 : bdev_nvme_set_preferred_path_done);
5432 3 : return;
5433 :
5434 : err_bdev:
5435 0 : spdk_bdev_close(ctx->desc);
5436 : err_open:
5437 0 : free(ctx);
5438 : err_alloc:
5439 0 : cb_fn(cb_arg, rc);
5440 3 : }
5441 :
5442 : struct bdev_nvme_set_multipath_policy_ctx {
5443 : struct spdk_bdev_desc *desc;
5444 : spdk_bdev_nvme_set_multipath_policy_cb cb_fn;
5445 : void *cb_arg;
5446 : };
5447 :
5448 : static void
5449 3 : bdev_nvme_set_multipath_policy_done(struct nvme_bdev *nbdev, void *_ctx, int status)
5450 : {
5451 3 : struct bdev_nvme_set_multipath_policy_ctx *ctx = _ctx;
5452 :
5453 3 : assert(ctx != NULL);
5454 3 : assert(ctx->desc != NULL);
5455 3 : assert(ctx->cb_fn != NULL);
5456 :
5457 3 : spdk_bdev_close(ctx->desc);
5458 :
5459 3 : ctx->cb_fn(ctx->cb_arg, status);
5460 :
5461 3 : free(ctx);
5462 3 : }
5463 :
5464 : static void
5465 1 : _bdev_nvme_set_multipath_policy(struct nvme_bdev_channel_iter *i,
5466 : struct nvme_bdev *nbdev,
5467 : struct nvme_bdev_channel *nbdev_ch, void *ctx)
5468 : {
5469 1 : nbdev_ch->mp_policy = nbdev->mp_policy;
5470 1 : nbdev_ch->mp_selector = nbdev->mp_selector;
5471 1 : nbdev_ch->rr_min_io = nbdev->rr_min_io;
5472 1 : bdev_nvme_clear_current_io_path(nbdev_ch);
5473 :
5474 1 : nvme_bdev_for_each_channel_continue(i, 0);
5475 1 : }
5476 :
5477 : void
5478 3 : spdk_bdev_nvme_set_multipath_policy(const char *name, enum spdk_bdev_nvme_multipath_policy policy,
5479 : enum spdk_bdev_nvme_multipath_selector selector, uint32_t rr_min_io,
5480 : spdk_bdev_nvme_set_multipath_policy_cb cb_fn, void *cb_arg)
5481 : {
5482 : struct bdev_nvme_set_multipath_policy_ctx *ctx;
5483 : struct spdk_bdev *bdev;
5484 : struct nvme_bdev *nbdev;
5485 : int rc;
5486 :
5487 3 : assert(cb_fn != NULL);
5488 :
5489 3 : switch (policy) {
5490 : case BDEV_NVME_MP_POLICY_ACTIVE_PASSIVE:
5491 1 : break;
5492 : case BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE:
5493 2 : switch (selector) {
5494 : case BDEV_NVME_MP_SELECTOR_ROUND_ROBIN:
5495 1 : if (rr_min_io == UINT32_MAX) {
5496 0 : rr_min_io = 1;
5497 1 : } else if (rr_min_io == 0) {
5498 0 : rc = -EINVAL;
5499 0 : goto exit;
5500 : }
5501 1 : break;
5502 : case BDEV_NVME_MP_SELECTOR_QUEUE_DEPTH:
5503 1 : break;
5504 : default:
5505 0 : rc = -EINVAL;
5506 0 : goto exit;
5507 : }
5508 2 : break;
5509 : default:
5510 0 : rc = -EINVAL;
5511 0 : goto exit;
5512 : }
5513 :
5514 3 : ctx = calloc(1, sizeof(*ctx));
5515 3 : if (ctx == NULL) {
5516 0 : SPDK_ERRLOG("Failed to alloc context.\n");
5517 0 : rc = -ENOMEM;
5518 0 : goto exit;
5519 : }
5520 :
5521 3 : ctx->cb_fn = cb_fn;
5522 3 : ctx->cb_arg = cb_arg;
5523 :
5524 3 : rc = spdk_bdev_open_ext(name, false, dummy_bdev_event_cb, NULL, &ctx->desc);
5525 3 : if (rc != 0) {
5526 0 : SPDK_ERRLOG("Failed to open bdev %s.\n", name);
5527 0 : rc = -ENODEV;
5528 0 : goto err_open;
5529 : }
5530 :
5531 3 : bdev = spdk_bdev_desc_get_bdev(ctx->desc);
5532 3 : if (bdev->module != &nvme_if) {
5533 0 : SPDK_ERRLOG("bdev %s is not registered in this module.\n", name);
5534 0 : rc = -ENODEV;
5535 0 : goto err_module;
5536 : }
5537 3 : nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
5538 :
5539 3 : pthread_mutex_lock(&nbdev->mutex);
5540 3 : nbdev->mp_policy = policy;
5541 3 : nbdev->mp_selector = selector;
5542 3 : nbdev->rr_min_io = rr_min_io;
5543 3 : pthread_mutex_unlock(&nbdev->mutex);
5544 :
5545 6 : nvme_bdev_for_each_channel(nbdev,
5546 : _bdev_nvme_set_multipath_policy,
5547 3 : ctx,
5548 : bdev_nvme_set_multipath_policy_done);
5549 3 : return;
5550 :
5551 : err_module:
5552 0 : spdk_bdev_close(ctx->desc);
5553 : err_open:
5554 0 : free(ctx);
5555 : exit:
5556 0 : cb_fn(cb_arg, rc);
5557 3 : }
5558 :
5559 : static void
5560 3 : aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
5561 : {
5562 3 : struct nvme_ctrlr *nvme_ctrlr = arg;
5563 : union spdk_nvme_async_event_completion event;
5564 :
5565 3 : if (spdk_nvme_cpl_is_error(cpl)) {
5566 0 : SPDK_WARNLOG("AER request execute failed\n");
5567 0 : return;
5568 : }
5569 :
5570 3 : event.raw = cpl->cdw0;
5571 3 : if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
5572 3 : (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
5573 2 : nvme_ctrlr_populate_namespaces(nvme_ctrlr, NULL);
5574 3 : } else if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
5575 1 : (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_ANA_CHANGE)) {
5576 1 : nvme_ctrlr_read_ana_log_page(nvme_ctrlr);
5577 1 : }
5578 3 : }
5579 :
5580 : static void
5581 52 : free_nvme_async_probe_ctx(struct nvme_async_probe_ctx *ctx)
5582 : {
5583 52 : spdk_keyring_put_key(ctx->drv_opts.tls_psk);
5584 52 : spdk_keyring_put_key(ctx->drv_opts.dhchap_key);
5585 52 : spdk_keyring_put_key(ctx->drv_opts.dhchap_ctrlr_key);
5586 52 : free(ctx);
5587 52 : }
5588 :
5589 : static void
5590 52 : populate_namespaces_cb(struct nvme_async_probe_ctx *ctx, int rc)
5591 : {
5592 52 : if (ctx->cb_fn) {
5593 52 : ctx->cb_fn(ctx->cb_ctx, ctx->reported_bdevs, rc);
5594 52 : }
5595 :
5596 52 : ctx->namespaces_populated = true;
5597 52 : if (ctx->probe_done) {
5598 : /* The probe was already completed, so we need to free the context
5599 : * here. This can happen for cases like OCSSD, where we need to
5600 : * send additional commands to the SSD after attach.
5601 : */
5602 31 : free_nvme_async_probe_ctx(ctx);
5603 31 : }
5604 52 : }
5605 :
5606 : static int
5607 20 : bdev_nvme_remove_poller(void *ctx)
5608 : {
5609 : struct spdk_nvme_transport_id trid_pcie;
5610 :
5611 20 : if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
5612 1 : spdk_poller_unregister(&g_hotplug_poller);
5613 1 : return SPDK_POLLER_IDLE;
5614 : }
5615 :
5616 19 : memset(&trid_pcie, 0, sizeof(trid_pcie));
5617 19 : spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
5618 :
5619 19 : if (spdk_nvme_scan_attached(&trid_pcie)) {
5620 0 : SPDK_ERRLOG_RATELIMIT("spdk_nvme_scan_attached() failed\n");
5621 0 : }
5622 :
5623 19 : return SPDK_POLLER_BUSY;
5624 20 : }
5625 :
5626 : static void
5627 60 : nvme_ctrlr_create_done(struct nvme_ctrlr *nvme_ctrlr,
5628 : struct nvme_async_probe_ctx *ctx)
5629 : {
5630 60 : struct spdk_nvme_transport_id *trid = &nvme_ctrlr->active_path_id->trid;
5631 :
5632 60 : if (spdk_nvme_trtype_is_fabrics(trid->trtype)) {
5633 60 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was created to %s:%s\n",
5634 : trid->traddr, trid->trsvcid);
5635 60 : } else {
5636 0 : NVME_CTRLR_INFOLOG(nvme_ctrlr, "ctrlr was created\n");
5637 : }
5638 :
5639 120 : spdk_io_device_register(nvme_ctrlr,
5640 : bdev_nvme_create_ctrlr_channel_cb,
5641 : bdev_nvme_destroy_ctrlr_channel_cb,
5642 : sizeof(struct nvme_ctrlr_channel),
5643 60 : nvme_ctrlr->nbdev_ctrlr->name);
5644 :
5645 60 : nvme_ctrlr_populate_namespaces(nvme_ctrlr, ctx);
5646 :
5647 60 : if (g_hotplug_poller == NULL) {
5648 2 : g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL,
5649 : NVME_HOTPLUG_POLL_PERIOD_DEFAULT);
5650 2 : }
5651 60 : }
5652 :
5653 : static void
5654 30 : nvme_ctrlr_init_ana_log_page_done(void *_ctx, const struct spdk_nvme_cpl *cpl)
5655 : {
5656 30 : struct nvme_ctrlr *nvme_ctrlr = _ctx;
5657 30 : struct nvme_async_probe_ctx *ctx = nvme_ctrlr->probe_ctx;
5658 :
5659 30 : nvme_ctrlr->probe_ctx = NULL;
5660 :
5661 30 : if (spdk_nvme_cpl_is_error(cpl)) {
5662 0 : nvme_ctrlr_delete(nvme_ctrlr);
5663 :
5664 0 : if (ctx != NULL) {
5665 0 : ctx->reported_bdevs = 0;
5666 0 : populate_namespaces_cb(ctx, -1);
5667 0 : }
5668 0 : return;
5669 : }
5670 :
5671 30 : nvme_ctrlr_create_done(nvme_ctrlr, ctx);
5672 30 : }
5673 :
5674 : static int
5675 30 : nvme_ctrlr_init_ana_log_page(struct nvme_ctrlr *nvme_ctrlr,
5676 : struct nvme_async_probe_ctx *ctx)
5677 : {
5678 30 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
5679 : const struct spdk_nvme_ctrlr_data *cdata;
5680 : uint32_t ana_log_page_size;
5681 :
5682 30 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
5683 :
5684 : /* Set buffer size enough to include maximum number of allowed namespaces. */
5685 60 : ana_log_page_size = sizeof(struct spdk_nvme_ana_page) + cdata->nanagrpid *
5686 30 : sizeof(struct spdk_nvme_ana_group_descriptor) + cdata->mnan *
5687 : sizeof(uint32_t);
5688 :
5689 30 : nvme_ctrlr->ana_log_page = spdk_zmalloc(ana_log_page_size, 64, NULL,
5690 : SPDK_ENV_NUMA_ID_ANY, SPDK_MALLOC_DMA);
5691 30 : if (nvme_ctrlr->ana_log_page == NULL) {
5692 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "could not allocate ANA log page buffer\n");
5693 0 : return -ENXIO;
5694 : }
5695 :
5696 : /* Each descriptor in a ANA log page is not ensured to be 8-bytes aligned.
5697 : * Hence copy each descriptor to a temporary area when parsing it.
5698 : *
5699 : * Allocate a buffer whose size is as large as ANA log page buffer because
5700 : * we do not know the size of a descriptor until actually reading it.
5701 : */
5702 30 : nvme_ctrlr->copied_ana_desc = calloc(1, ana_log_page_size);
5703 30 : if (nvme_ctrlr->copied_ana_desc == NULL) {
5704 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "could not allocate a buffer to parse ANA descriptor\n");
5705 0 : return -ENOMEM;
5706 : }
5707 :
5708 30 : nvme_ctrlr->max_ana_log_page_size = ana_log_page_size;
5709 :
5710 30 : nvme_ctrlr->probe_ctx = ctx;
5711 :
5712 : /* Then, set the read size only to include the current active namespaces. */
5713 30 : ana_log_page_size = nvme_ctrlr_get_ana_log_page_size(nvme_ctrlr);
5714 :
5715 30 : if (ana_log_page_size > nvme_ctrlr->max_ana_log_page_size) {
5716 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "ANA log page size %" PRIu32 " is larger than allowed %" PRIu32 "\n",
5717 : ana_log_page_size, nvme_ctrlr->max_ana_log_page_size);
5718 0 : return -EINVAL;
5719 : }
5720 :
5721 60 : return spdk_nvme_ctrlr_cmd_get_log_page(ctrlr,
5722 : SPDK_NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS,
5723 : SPDK_NVME_GLOBAL_NS_TAG,
5724 30 : nvme_ctrlr->ana_log_page,
5725 30 : ana_log_page_size, 0,
5726 : nvme_ctrlr_init_ana_log_page_done,
5727 30 : nvme_ctrlr);
5728 30 : }
5729 :
5730 : /* hostnqn and subnqn were already verified before attaching a controller.
5731 : * Hence check only the multipath capability and cntlid here.
5732 : */
5733 : static bool
5734 16 : bdev_nvme_check_multipath(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct spdk_nvme_ctrlr *ctrlr)
5735 : {
5736 : struct nvme_ctrlr *tmp;
5737 : const struct spdk_nvme_ctrlr_data *cdata, *tmp_cdata;
5738 :
5739 16 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
5740 :
5741 16 : if (!cdata->cmic.multi_ctrlr) {
5742 0 : SPDK_ERRLOG("Ctrlr%u does not support multipath.\n", cdata->cntlid);
5743 0 : return false;
5744 : }
5745 :
5746 33 : TAILQ_FOREACH(tmp, &nbdev_ctrlr->ctrlrs, tailq) {
5747 18 : tmp_cdata = spdk_nvme_ctrlr_get_data(tmp->ctrlr);
5748 :
5749 18 : if (!tmp_cdata->cmic.multi_ctrlr) {
5750 0 : NVME_CTRLR_ERRLOG(tmp, "Ctrlr%u does not support multipath.\n", cdata->cntlid);
5751 0 : return false;
5752 : }
5753 18 : if (cdata->cntlid == tmp_cdata->cntlid) {
5754 1 : NVME_CTRLR_ERRLOG(tmp, "cntlid %u are duplicated.\n", tmp_cdata->cntlid);
5755 1 : return false;
5756 : }
5757 17 : }
5758 :
5759 15 : return true;
5760 16 : }
5761 :
5762 :
5763 : static int
5764 61 : nvme_bdev_ctrlr_create(const char *name, struct nvme_ctrlr *nvme_ctrlr)
5765 : {
5766 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
5767 61 : struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr;
5768 : struct nvme_ctrlr *nctrlr;
5769 61 : int rc = 0;
5770 :
5771 61 : pthread_mutex_lock(&g_bdev_nvme_mutex);
5772 :
5773 61 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
5774 61 : if (nbdev_ctrlr != NULL) {
5775 16 : if (!bdev_nvme_check_multipath(nbdev_ctrlr, ctrlr)) {
5776 1 : rc = -EINVAL;
5777 1 : goto exit;
5778 : }
5779 32 : TAILQ_FOREACH(nctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
5780 17 : if (nctrlr->opts.multipath != nvme_ctrlr->opts.multipath) {
5781 : /* All controllers with the same name must be configured the same
5782 : * way, either for multipath or failover. If the configuration doesn't
5783 : * match - report error.
5784 : */
5785 0 : rc = -EINVAL;
5786 0 : goto exit;
5787 : }
5788 17 : }
5789 15 : } else {
5790 45 : nbdev_ctrlr = calloc(1, sizeof(*nbdev_ctrlr));
5791 45 : if (nbdev_ctrlr == NULL) {
5792 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate nvme_bdev_ctrlr.\n");
5793 0 : rc = -ENOMEM;
5794 0 : goto exit;
5795 : }
5796 45 : nbdev_ctrlr->name = strdup(name);
5797 45 : if (nbdev_ctrlr->name == NULL) {
5798 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "Failed to allocate name of nvme_bdev_ctrlr.\n");
5799 0 : free(nbdev_ctrlr);
5800 0 : goto exit;
5801 : }
5802 45 : TAILQ_INIT(&nbdev_ctrlr->ctrlrs);
5803 45 : TAILQ_INIT(&nbdev_ctrlr->bdevs);
5804 45 : TAILQ_INSERT_TAIL(&g_nvme_bdev_ctrlrs, nbdev_ctrlr, tailq);
5805 : }
5806 60 : nvme_ctrlr->nbdev_ctrlr = nbdev_ctrlr;
5807 60 : TAILQ_INSERT_TAIL(&nbdev_ctrlr->ctrlrs, nvme_ctrlr, tailq);
5808 : exit:
5809 61 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
5810 61 : return rc;
5811 : }
5812 :
5813 : static int
5814 61 : nvme_ctrlr_create(struct spdk_nvme_ctrlr *ctrlr,
5815 : const char *name,
5816 : const struct spdk_nvme_transport_id *trid,
5817 : struct nvme_async_probe_ctx *ctx)
5818 : {
5819 : struct nvme_ctrlr *nvme_ctrlr;
5820 : struct nvme_path_id *path_id;
5821 : const struct spdk_nvme_ctrlr_data *cdata;
5822 61 : struct spdk_event_handler_opts opts = {
5823 : .opts_size = SPDK_SIZEOF(&opts, fd_type),
5824 : };
5825 : uint64_t period;
5826 : int fd, rc;
5827 :
5828 61 : nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr));
5829 61 : if (nvme_ctrlr == NULL) {
5830 0 : SPDK_ERRLOG("Failed to allocate device struct\n");
5831 0 : return -ENOMEM;
5832 : }
5833 :
5834 61 : rc = pthread_mutex_init(&nvme_ctrlr->mutex, NULL);
5835 61 : if (rc != 0) {
5836 0 : free(nvme_ctrlr);
5837 0 : return rc;
5838 : }
5839 :
5840 61 : TAILQ_INIT(&nvme_ctrlr->trids);
5841 61 : RB_INIT(&nvme_ctrlr->namespaces);
5842 :
5843 : /* Get another reference to the key, so the first one can be released from probe_ctx */
5844 61 : if (ctx != NULL) {
5845 47 : if (ctx->drv_opts.tls_psk != NULL) {
5846 0 : nvme_ctrlr->psk = spdk_keyring_get_key(
5847 0 : spdk_key_get_name(ctx->drv_opts.tls_psk));
5848 0 : if (nvme_ctrlr->psk == NULL) {
5849 : /* Could only happen if the key was removed in the meantime */
5850 0 : SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n",
5851 : spdk_key_get_name(ctx->drv_opts.tls_psk));
5852 0 : rc = -ENOKEY;
5853 0 : goto err;
5854 : }
5855 0 : }
5856 :
5857 47 : if (ctx->drv_opts.dhchap_key != NULL) {
5858 0 : nvme_ctrlr->dhchap_key = spdk_keyring_get_key(
5859 0 : spdk_key_get_name(ctx->drv_opts.dhchap_key));
5860 0 : if (nvme_ctrlr->dhchap_key == NULL) {
5861 0 : SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n",
5862 : spdk_key_get_name(ctx->drv_opts.dhchap_key));
5863 0 : rc = -ENOKEY;
5864 0 : goto err;
5865 : }
5866 0 : }
5867 :
5868 47 : if (ctx->drv_opts.dhchap_ctrlr_key != NULL) {
5869 0 : nvme_ctrlr->dhchap_ctrlr_key =
5870 0 : spdk_keyring_get_key(
5871 0 : spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key));
5872 0 : if (nvme_ctrlr->dhchap_ctrlr_key == NULL) {
5873 0 : SPDK_ERRLOG("Couldn't get a reference to the key '%s'\n",
5874 : spdk_key_get_name(ctx->drv_opts.dhchap_ctrlr_key));
5875 0 : rc = -ENOKEY;
5876 0 : goto err;
5877 : }
5878 0 : }
5879 47 : }
5880 :
5881 : /* Check if we manage to enable interrupts on the controller. */
5882 61 : if (spdk_interrupt_mode_is_enabled() && ctx != NULL && !ctx->drv_opts.enable_interrupts) {
5883 0 : SPDK_ERRLOG("Failed to enable interrupts on the controller\n");
5884 0 : rc = -ENOTSUP;
5885 0 : goto err;
5886 : }
5887 :
5888 61 : path_id = calloc(1, sizeof(*path_id));
5889 61 : if (path_id == NULL) {
5890 0 : SPDK_ERRLOG("Failed to allocate trid entry pointer\n");
5891 0 : rc = -ENOMEM;
5892 0 : goto err;
5893 : }
5894 :
5895 61 : path_id->trid = *trid;
5896 61 : if (ctx != NULL) {
5897 47 : memcpy(path_id->hostid.hostaddr, ctx->drv_opts.src_addr, sizeof(path_id->hostid.hostaddr));
5898 47 : memcpy(path_id->hostid.hostsvcid, ctx->drv_opts.src_svcid, sizeof(path_id->hostid.hostsvcid));
5899 47 : }
5900 61 : nvme_ctrlr->active_path_id = path_id;
5901 61 : TAILQ_INSERT_HEAD(&nvme_ctrlr->trids, path_id, link);
5902 :
5903 61 : nvme_ctrlr->thread = spdk_get_thread();
5904 61 : nvme_ctrlr->ctrlr = ctrlr;
5905 61 : nvme_ctrlr->ref = 1;
5906 :
5907 61 : if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
5908 0 : SPDK_ERRLOG("OCSSDs are not supported");
5909 0 : rc = -ENOTSUP;
5910 0 : goto err;
5911 : }
5912 :
5913 61 : if (ctx != NULL) {
5914 47 : memcpy(&nvme_ctrlr->opts, &ctx->bdev_opts, sizeof(ctx->bdev_opts));
5915 47 : } else {
5916 14 : spdk_bdev_nvme_get_default_ctrlr_opts(&nvme_ctrlr->opts);
5917 : }
5918 :
5919 61 : period = spdk_interrupt_mode_is_enabled() ? 0 : g_opts.nvme_adminq_poll_period_us;
5920 :
5921 61 : nvme_ctrlr->adminq_timer_poller = SPDK_POLLER_REGISTER(bdev_nvme_poll_adminq, nvme_ctrlr,
5922 : period);
5923 :
5924 61 : if (spdk_interrupt_mode_is_enabled()) {
5925 0 : spdk_poller_register_interrupt(nvme_ctrlr->adminq_timer_poller, NULL, NULL);
5926 :
5927 0 : fd = spdk_nvme_ctrlr_get_admin_qp_fd(nvme_ctrlr->ctrlr, &opts);
5928 0 : if (fd < 0) {
5929 0 : rc = fd;
5930 0 : goto err;
5931 : }
5932 :
5933 0 : nvme_ctrlr->intr = SPDK_INTERRUPT_REGISTER_EXT(fd, bdev_nvme_poll_adminq,
5934 : nvme_ctrlr, &opts);
5935 0 : if (!nvme_ctrlr->intr) {
5936 0 : rc = -EINVAL;
5937 0 : goto err;
5938 : }
5939 0 : }
5940 :
5941 61 : if (g_opts.timeout_us > 0) {
5942 : /* Register timeout callback. Timeout values for IO vs. admin reqs can be different. */
5943 : /* If timeout_admin_us is 0 (not specified), admin uses same timeout as IO. */
5944 0 : uint64_t adm_timeout_us = (g_opts.timeout_admin_us == 0) ?
5945 0 : g_opts.timeout_us : g_opts.timeout_admin_us;
5946 0 : spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us,
5947 0 : adm_timeout_us, timeout_cb, nvme_ctrlr);
5948 0 : }
5949 :
5950 61 : spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr);
5951 61 : spdk_nvme_ctrlr_set_remove_cb(ctrlr, remove_cb, nvme_ctrlr);
5952 :
5953 61 : if (spdk_nvme_ctrlr_get_flags(ctrlr) &
5954 : SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED) {
5955 0 : nvme_ctrlr->opal_dev = spdk_opal_dev_construct(ctrlr);
5956 0 : }
5957 :
5958 61 : rc = nvme_bdev_ctrlr_create(name, nvme_ctrlr);
5959 61 : if (rc != 0) {
5960 1 : goto err;
5961 : }
5962 :
5963 60 : cdata = spdk_nvme_ctrlr_get_data(ctrlr);
5964 :
5965 60 : if (cdata->cmic.ana_reporting) {
5966 30 : rc = nvme_ctrlr_init_ana_log_page(nvme_ctrlr, ctx);
5967 30 : if (rc == 0) {
5968 30 : return 0;
5969 : }
5970 0 : } else {
5971 30 : nvme_ctrlr_create_done(nvme_ctrlr, ctx);
5972 30 : return 0;
5973 : }
5974 :
5975 : err:
5976 1 : nvme_ctrlr_delete(nvme_ctrlr);
5977 1 : return rc;
5978 61 : }
5979 :
5980 : void
5981 33 : spdk_bdev_nvme_get_default_ctrlr_opts(struct spdk_bdev_nvme_ctrlr_opts *opts)
5982 : {
5983 33 : opts->prchk_flags = 0;
5984 33 : opts->ctrlr_loss_timeout_sec = g_opts.ctrlr_loss_timeout_sec;
5985 33 : opts->reconnect_delay_sec = g_opts.reconnect_delay_sec;
5986 33 : opts->fast_io_fail_timeout_sec = g_opts.fast_io_fail_timeout_sec;
5987 33 : opts->multipath = true;
5988 33 : }
5989 :
5990 : static void
5991 0 : attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
5992 : struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *drv_opts)
5993 : {
5994 : char *name;
5995 :
5996 0 : name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++);
5997 0 : if (!name) {
5998 0 : SPDK_ERRLOG("Failed to assign name to NVMe device\n");
5999 0 : return;
6000 : }
6001 :
6002 0 : if (nvme_ctrlr_create(ctrlr, name, trid, NULL) == 0) {
6003 0 : SPDK_DEBUGLOG(bdev_nvme, "Attached to %s (%s)\n", trid->traddr, name);
6004 0 : } else {
6005 0 : SPDK_ERRLOG("Failed to attach to %s (%s)\n", trid->traddr, name);
6006 : }
6007 :
6008 0 : free(name);
6009 0 : }
6010 :
/*
 * Tear down a controller: depopulate its namespaces, then release one
 * reference via nvme_ctrlr_release(). `ctx` is the struct nvme_ctrlr, passed
 * as void *.
 */
static void
_nvme_ctrlr_destruct(void *ctx)
{
	struct nvme_ctrlr *target = ctx;

	nvme_ctrlr_depopulate_namespaces(target);
	nvme_ctrlr_release(target);
}
6019 :
6020 : static int
6021 57 : bdev_nvme_delete_ctrlr_unsafe(struct nvme_ctrlr *nvme_ctrlr, bool hotplug)
6022 : {
6023 : struct nvme_probe_skip_entry *entry;
6024 :
6025 : /* The controller's destruction was already started */
6026 57 : if (nvme_ctrlr->destruct) {
6027 0 : return -EALREADY;
6028 : }
6029 :
6030 57 : if (!hotplug &&
6031 57 : nvme_ctrlr->active_path_id->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
6032 0 : entry = calloc(1, sizeof(*entry));
6033 0 : if (!entry) {
6034 0 : return -ENOMEM;
6035 : }
6036 0 : entry->trid = nvme_ctrlr->active_path_id->trid;
6037 0 : TAILQ_INSERT_TAIL(&g_skipped_nvme_ctrlrs, entry, tailq);
6038 0 : }
6039 :
6040 57 : nvme_ctrlr->destruct = true;
6041 57 : return 0;
6042 57 : }
6043 :
6044 : static int
6045 2 : bdev_nvme_delete_ctrlr(struct nvme_ctrlr *nvme_ctrlr, bool hotplug)
6046 : {
6047 : int rc;
6048 :
6049 2 : pthread_mutex_lock(&nvme_ctrlr->mutex);
6050 2 : rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, hotplug);
6051 2 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
6052 :
6053 2 : if (rc == 0) {
6054 2 : _nvme_ctrlr_destruct(nvme_ctrlr);
6055 2 : } else if (rc == -EALREADY) {
6056 0 : rc = 0;
6057 0 : }
6058 :
6059 2 : return rc;
6060 : }
6061 :
/*
 * Remove callback registered with spdk_nvme_ctrlr_set_remove_cb() in
 * nvme_ctrlr_create(); cb_ctx is the struct nvme_ctrlr. Deletes the
 * controller as a hotplug removal and ignores the result.
 */
static void
remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr)
{
	struct nvme_ctrlr *removed = cb_ctx;

	bdev_nvme_delete_ctrlr(removed, true);
}
6069 :
6070 : static int
6071 0 : bdev_nvme_hotplug_probe(void *arg)
6072 : {
6073 0 : if (g_hotplug_probe_ctx == NULL) {
6074 0 : spdk_poller_unregister(&g_hotplug_probe_poller);
6075 0 : return SPDK_POLLER_IDLE;
6076 : }
6077 :
6078 0 : if (spdk_nvme_probe_poll_async(g_hotplug_probe_ctx) != -EAGAIN) {
6079 0 : g_hotplug_probe_ctx = NULL;
6080 0 : spdk_poller_unregister(&g_hotplug_probe_poller);
6081 0 : }
6082 :
6083 0 : return SPDK_POLLER_BUSY;
6084 0 : }
6085 :
6086 : static int
6087 0 : bdev_nvme_hotplug(void *arg)
6088 : {
6089 : struct spdk_nvme_transport_id trid_pcie;
6090 :
6091 0 : if (g_hotplug_probe_ctx) {
6092 0 : return SPDK_POLLER_BUSY;
6093 : }
6094 :
6095 0 : memset(&trid_pcie, 0, sizeof(trid_pcie));
6096 0 : spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE);
6097 :
6098 0 : g_hotplug_probe_ctx = spdk_nvme_probe_async(&trid_pcie, NULL,
6099 : hotplug_probe_cb, attach_cb, NULL);
6100 :
6101 0 : if (g_hotplug_probe_ctx) {
6102 0 : assert(g_hotplug_probe_poller == NULL);
6103 0 : g_hotplug_probe_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug_probe, NULL, 1000);
6104 0 : }
6105 :
6106 0 : return SPDK_POLLER_BUSY;
6107 0 : }
6108 :
6109 : void
6110 0 : bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts)
6111 : {
6112 0 : *opts = g_opts;
6113 0 : }
6114 :
6115 : static bool bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec,
6116 : uint32_t reconnect_delay_sec,
6117 : uint32_t fast_io_fail_timeout_sec);
6118 :
6119 : static int
6120 0 : bdev_nvme_validate_opts(const struct spdk_bdev_nvme_opts *opts)
6121 : {
6122 0 : if ((opts->timeout_us == 0) && (opts->timeout_admin_us != 0)) {
6123 : /* Can't set timeout_admin_us without also setting timeout_us */
6124 0 : SPDK_WARNLOG("Invalid options: Can't have (timeout_us == 0) with (timeout_admin_us > 0)\n");
6125 0 : return -EINVAL;
6126 : }
6127 :
6128 0 : if (opts->bdev_retry_count < -1) {
6129 0 : SPDK_WARNLOG("Invalid option: bdev_retry_count can't be less than -1.\n");
6130 0 : return -EINVAL;
6131 : }
6132 :
6133 0 : if (!bdev_nvme_check_io_error_resiliency_params(opts->ctrlr_loss_timeout_sec,
6134 0 : opts->reconnect_delay_sec,
6135 0 : opts->fast_io_fail_timeout_sec)) {
6136 0 : return -EINVAL;
6137 : }
6138 :
6139 0 : return 0;
6140 0 : }
6141 :
6142 : int
6143 0 : bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts)
6144 : {
6145 : int ret;
6146 :
6147 0 : ret = bdev_nvme_validate_opts(opts);
6148 0 : if (ret) {
6149 0 : SPDK_WARNLOG("Failed to set nvme opts.\n");
6150 0 : return ret;
6151 : }
6152 :
6153 0 : if (g_bdev_nvme_init_thread != NULL) {
6154 0 : if (!TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
6155 0 : return -EPERM;
6156 : }
6157 0 : }
6158 :
6159 0 : if (opts->rdma_srq_size != 0 ||
6160 0 : opts->rdma_max_cq_size != 0 ||
6161 0 : opts->rdma_cm_event_timeout_ms != 0) {
6162 : struct spdk_nvme_transport_opts drv_opts;
6163 :
6164 0 : spdk_nvme_transport_get_opts(&drv_opts, sizeof(drv_opts));
6165 0 : if (opts->rdma_srq_size != 0) {
6166 0 : drv_opts.rdma_srq_size = opts->rdma_srq_size;
6167 0 : }
6168 0 : if (opts->rdma_max_cq_size != 0) {
6169 0 : drv_opts.rdma_max_cq_size = opts->rdma_max_cq_size;
6170 0 : }
6171 0 : if (opts->rdma_cm_event_timeout_ms != 0) {
6172 0 : drv_opts.rdma_cm_event_timeout_ms = opts->rdma_cm_event_timeout_ms;
6173 0 : }
6174 :
6175 0 : ret = spdk_nvme_transport_set_opts(&drv_opts, sizeof(drv_opts));
6176 0 : if (ret) {
6177 0 : SPDK_ERRLOG("Failed to set NVMe transport opts.\n");
6178 0 : return ret;
6179 : }
6180 0 : }
6181 :
6182 0 : g_opts = *opts;
6183 :
6184 0 : return 0;
6185 0 : }
6186 :
6187 : struct set_nvme_hotplug_ctx {
6188 : uint64_t period_us;
6189 : bool enabled;
6190 : spdk_msg_fn fn;
6191 : void *fn_ctx;
6192 : };
6193 :
6194 : static void
6195 0 : set_nvme_hotplug_period_cb(void *_ctx)
6196 : {
6197 0 : struct set_nvme_hotplug_ctx *ctx = _ctx;
6198 :
6199 0 : spdk_poller_unregister(&g_hotplug_poller);
6200 0 : if (ctx->enabled) {
6201 0 : g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_hotplug, NULL, ctx->period_us);
6202 0 : } else {
6203 0 : g_hotplug_poller = SPDK_POLLER_REGISTER(bdev_nvme_remove_poller, NULL,
6204 : NVME_HOTPLUG_POLL_PERIOD_DEFAULT);
6205 : }
6206 :
6207 0 : g_nvme_hotplug_poll_period_us = ctx->period_us;
6208 0 : g_nvme_hotplug_enabled = ctx->enabled;
6209 0 : if (ctx->fn) {
6210 0 : ctx->fn(ctx->fn_ctx);
6211 0 : }
6212 :
6213 0 : free(ctx);
6214 0 : }
6215 :
6216 : int
6217 0 : bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_msg_fn cb, void *cb_ctx)
6218 : {
6219 : struct set_nvme_hotplug_ctx *ctx;
6220 :
6221 0 : if (enabled == true && !spdk_process_is_primary()) {
6222 0 : return -EPERM;
6223 : }
6224 :
6225 0 : ctx = calloc(1, sizeof(*ctx));
6226 0 : if (ctx == NULL) {
6227 0 : return -ENOMEM;
6228 : }
6229 :
6230 0 : period_us = period_us == 0 ? NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us;
6231 0 : ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX);
6232 0 : ctx->enabled = enabled;
6233 0 : ctx->fn = cb;
6234 0 : ctx->fn_ctx = cb_ctx;
6235 :
6236 0 : spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx);
6237 0 : return 0;
6238 0 : }
6239 :
6240 : static void
6241 46 : nvme_ctrlr_populate_namespaces_done(struct nvme_ctrlr *nvme_ctrlr,
6242 : struct nvme_async_probe_ctx *ctx)
6243 : {
6244 : struct nvme_ns *nvme_ns;
6245 : struct nvme_bdev *nvme_bdev;
6246 : size_t j;
6247 :
6248 46 : assert(nvme_ctrlr != NULL);
6249 :
6250 46 : if (ctx->names == NULL) {
6251 0 : ctx->reported_bdevs = 0;
6252 0 : populate_namespaces_cb(ctx, 0);
6253 0 : return;
6254 : }
6255 :
6256 : /*
6257 : * Report the new bdevs that were created in this call.
6258 : * There can be more than one bdev per NVMe controller.
6259 : */
6260 46 : j = 0;
6261 46 : nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
6262 94 : while (nvme_ns != NULL) {
6263 48 : nvme_bdev = nvme_ns->bdev;
6264 48 : if (j < ctx->max_bdevs) {
6265 48 : ctx->names[j] = nvme_bdev->disk.name;
6266 48 : j++;
6267 48 : } else {
6268 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr,
6269 : "Maximum number of namespaces supported per NVMe controller is %du. "
6270 : "Unable to return all names of created bdevs\n",
6271 : ctx->max_bdevs);
6272 0 : ctx->reported_bdevs = 0;
6273 0 : populate_namespaces_cb(ctx, -ERANGE);
6274 0 : return;
6275 : }
6276 :
6277 48 : nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
6278 : }
6279 :
6280 46 : ctx->reported_bdevs = j;
6281 46 : populate_namespaces_cb(ctx, 0);
6282 46 : }
6283 :
6284 : static int
6285 9 : bdev_nvme_check_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
6286 : struct spdk_nvme_ctrlr *new_ctrlr,
6287 : struct spdk_nvme_transport_id *trid)
6288 : {
6289 : struct nvme_path_id *tmp_trid;
6290 :
6291 9 : if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
6292 0 : NVME_CTRLR_ERRLOG(nvme_ctrlr, "PCIe failover is not supported.\n");
6293 0 : return -ENOTSUP;
6294 : }
6295 :
6296 : /* Currently we only support failover to the same transport type. */
6297 9 : if (nvme_ctrlr->active_path_id->trid.trtype != trid->trtype) {
6298 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr,
6299 : "Failover from trtype: %s to a different trtype: %s is not supported currently\n",
6300 : spdk_nvme_transport_id_trtype_str(nvme_ctrlr->active_path_id->trid.trtype),
6301 : spdk_nvme_transport_id_trtype_str(trid->trtype));
6302 0 : return -EINVAL;
6303 : }
6304 :
6305 :
6306 : /* Currently we only support failover to the same NQN. */
6307 9 : if (strncmp(trid->subnqn, nvme_ctrlr->active_path_id->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) {
6308 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr,
6309 : "Failover from subnqn: %s to a different subnqn: %s is not supported currently\n",
6310 : nvme_ctrlr->active_path_id->trid.subnqn, trid->subnqn);
6311 0 : return -EINVAL;
6312 : }
6313 :
6314 : /* Skip all the other checks if we've already registered this path. */
6315 21 : TAILQ_FOREACH(tmp_trid, &nvme_ctrlr->trids, link) {
6316 12 : if (!spdk_nvme_transport_id_compare(&tmp_trid->trid, trid)) {
6317 0 : NVME_CTRLR_WARNLOG(nvme_ctrlr, "This path (traddr: %s subnqn: %s) is already registered\n",
6318 : trid->traddr, trid->subnqn);
6319 0 : return -EALREADY;
6320 : }
6321 12 : }
6322 :
6323 9 : return 0;
6324 9 : }
6325 :
6326 : static int
6327 9 : bdev_nvme_check_secondary_namespace(struct nvme_ctrlr *nvme_ctrlr,
6328 : struct spdk_nvme_ctrlr *new_ctrlr)
6329 : {
6330 : struct nvme_ns *nvme_ns;
6331 : struct spdk_nvme_ns *new_ns;
6332 :
6333 9 : nvme_ns = nvme_ctrlr_get_first_active_ns(nvme_ctrlr);
6334 9 : while (nvme_ns != NULL) {
6335 0 : new_ns = spdk_nvme_ctrlr_get_ns(new_ctrlr, nvme_ns->id);
6336 0 : assert(new_ns != NULL);
6337 :
6338 0 : if (!bdev_nvme_compare_ns(nvme_ns->ns, new_ns)) {
6339 0 : return -EINVAL;
6340 : }
6341 :
6342 0 : nvme_ns = nvme_ctrlr_get_next_active_ns(nvme_ctrlr, nvme_ns);
6343 : }
6344 :
6345 9 : return 0;
6346 9 : }
6347 :
6348 : static int
6349 9 : _bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
6350 : struct spdk_nvme_transport_id *trid)
6351 : {
6352 : struct nvme_path_id *active_id, *new_trid, *tmp_trid;
6353 :
6354 9 : new_trid = calloc(1, sizeof(*new_trid));
6355 9 : if (new_trid == NULL) {
6356 0 : return -ENOMEM;
6357 : }
6358 9 : new_trid->trid = *trid;
6359 :
6360 9 : active_id = nvme_ctrlr->active_path_id;
6361 9 : assert(active_id != NULL);
6362 9 : assert(active_id == TAILQ_FIRST(&nvme_ctrlr->trids));
6363 :
6364 : /* Skip the active trid not to replace it until it is failed. */
6365 9 : tmp_trid = TAILQ_NEXT(active_id, link);
6366 9 : if (tmp_trid == NULL) {
6367 6 : goto add_tail;
6368 : }
6369 :
6370 : /* It means the trid is faled if its last failed time is non-zero.
6371 : * Insert the new alternate trid before any failed trid.
6372 : */
6373 5 : TAILQ_FOREACH_FROM(tmp_trid, &nvme_ctrlr->trids, link) {
6374 3 : if (tmp_trid->last_failed_tsc != 0) {
6375 1 : TAILQ_INSERT_BEFORE(tmp_trid, new_trid, link);
6376 1 : return 0;
6377 : }
6378 4 : }
6379 :
6380 : add_tail:
6381 8 : TAILQ_INSERT_TAIL(&nvme_ctrlr->trids, new_trid, link);
6382 8 : return 0;
6383 9 : }
6384 :
6385 : /* This is the case that a secondary path is added to an existing
6386 : * nvme_ctrlr for failover. After checking if it can access the same
6387 : * namespaces as the primary path, it is disconnected until failover occurs.
6388 : */
6389 : static int
6390 9 : bdev_nvme_add_secondary_trid(struct nvme_ctrlr *nvme_ctrlr,
6391 : struct spdk_nvme_ctrlr *new_ctrlr,
6392 : struct spdk_nvme_transport_id *trid)
6393 : {
6394 : int rc;
6395 :
6396 9 : assert(nvme_ctrlr != NULL);
6397 :
6398 9 : pthread_mutex_lock(&nvme_ctrlr->mutex);
6399 :
6400 9 : rc = bdev_nvme_check_secondary_trid(nvme_ctrlr, new_ctrlr, trid);
6401 9 : if (rc != 0) {
6402 0 : goto exit;
6403 : }
6404 :
6405 9 : rc = bdev_nvme_check_secondary_namespace(nvme_ctrlr, new_ctrlr);
6406 9 : if (rc != 0) {
6407 0 : goto exit;
6408 : }
6409 :
6410 9 : rc = _bdev_nvme_add_secondary_trid(nvme_ctrlr, trid);
6411 :
6412 : exit:
6413 9 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
6414 :
6415 9 : spdk_nvme_detach(new_ctrlr);
6416 :
6417 9 : return rc;
6418 : }
6419 :
6420 : static void
6421 47 : connect_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
6422 : struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
6423 : {
6424 47 : struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
6425 : struct nvme_async_probe_ctx *ctx;
6426 : int rc;
6427 :
6428 47 : ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts);
6429 47 : ctx->ctrlr_attached = true;
6430 :
6431 47 : rc = nvme_ctrlr_create(ctrlr, ctx->base_name, &ctx->trid, ctx);
6432 47 : if (rc != 0) {
6433 1 : ctx->reported_bdevs = 0;
6434 1 : populate_namespaces_cb(ctx, rc);
6435 1 : }
6436 47 : }
6437 :
6438 :
6439 : static void
6440 4 : connect_set_failover_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
6441 : struct spdk_nvme_ctrlr *ctrlr,
6442 : const struct spdk_nvme_ctrlr_opts *opts)
6443 : {
6444 4 : struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
6445 : struct nvme_ctrlr *nvme_ctrlr;
6446 : struct nvme_async_probe_ctx *ctx;
6447 : int rc;
6448 :
6449 4 : ctx = SPDK_CONTAINEROF(user_opts, struct nvme_async_probe_ctx, drv_opts);
6450 4 : ctx->ctrlr_attached = true;
6451 :
6452 4 : nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name);
6453 4 : if (nvme_ctrlr) {
6454 4 : rc = bdev_nvme_add_secondary_trid(nvme_ctrlr, ctrlr, &ctx->trid);
6455 4 : } else {
6456 0 : rc = -ENODEV;
6457 : }
6458 :
6459 4 : ctx->reported_bdevs = 0;
6460 4 : populate_namespaces_cb(ctx, rc);
6461 4 : }
6462 :
6463 : static int
6464 52 : bdev_nvme_async_poll(void *arg)
6465 : {
6466 52 : struct nvme_async_probe_ctx *ctx = arg;
6467 : int rc;
6468 :
6469 52 : rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
6470 52 : if (spdk_unlikely(rc != -EAGAIN)) {
6471 52 : ctx->probe_done = true;
6472 52 : spdk_poller_unregister(&ctx->poller);
6473 52 : if (!ctx->ctrlr_attached) {
6474 : /* The probe is done, but no controller was attached.
6475 : * That means we had a failure, so report -EIO back to
6476 : * the caller (usually the RPC). populate_namespaces_cb()
6477 : * will take care of freeing the nvme_async_probe_ctx.
6478 : */
6479 1 : ctx->reported_bdevs = 0;
6480 1 : populate_namespaces_cb(ctx, -EIO);
6481 52 : } else if (ctx->namespaces_populated) {
6482 : /* The namespaces for the attached controller were all
6483 : * populated and the response was already sent to the
6484 : * caller (usually the RPC). So free the context here.
6485 : */
6486 21 : free_nvme_async_probe_ctx(ctx);
6487 21 : }
6488 52 : }
6489 :
6490 52 : return SPDK_POLLER_BUSY;
6491 : }
6492 :
/*
 * Validate the combination of the three I/O error resiliency parameters.
 *
 * Accepted combinations:
 *  - ctrlr_loss_timeout_sec == 0: reconnect_delay_sec and
 *    fast_io_fail_timeout_sec must both be 0.
 *  - ctrlr_loss_timeout_sec == -1: reconnect_delay_sec must be non-zero, and
 *    fast_io_fail_timeout_sec must be 0 or >= reconnect_delay_sec. Neither
 *    value is compared against ctrlr_loss_timeout_sec.
 *  - ctrlr_loss_timeout_sec > 0: as for -1, and additionally neither
 *    reconnect_delay_sec nor a non-zero fast_io_fail_timeout_sec may exceed
 *    ctrlr_loss_timeout_sec.
 * Values of ctrlr_loss_timeout_sec below -1 are rejected.
 *
 * Returns true if the combination is valid; otherwise logs the reason and
 * returns false.
 */
static bool
bdev_nvme_check_io_error_resiliency_params(int32_t ctrlr_loss_timeout_sec,
		uint32_t reconnect_delay_sec,
		uint32_t fast_io_fail_timeout_sec)
{
	if (ctrlr_loss_timeout_sec < -1) {
		SPDK_ERRLOG("ctrlr_loss_timeout_sec can't be less than -1.\n");
		return false;
	}

	if (ctrlr_loss_timeout_sec == 0) {
		if (reconnect_delay_sec != 0 || fast_io_fail_timeout_sec != 0) {
			SPDK_ERRLOG("Both reconnect_delay_sec and fast_io_fail_timeout_sec must be 0 if ctrlr_loss_timeout_sec is 0.\n");
			return false;
		}
		return true;
	}

	/* From here on ctrlr_loss_timeout_sec is either -1 or positive. */
	if (reconnect_delay_sec == 0) {
		SPDK_ERRLOG("reconnect_delay_sec can't be 0 if ctrlr_loss_timeout_sec is not 0.\n");
		return false;
	}

	/* The upper bound only applies to a positive ctrlr_loss_timeout_sec; the
	 * cast to uint32_t is safe because the value is known to be > 0.
	 */
	if (ctrlr_loss_timeout_sec > 0 &&
	    reconnect_delay_sec > (uint32_t)ctrlr_loss_timeout_sec) {
		SPDK_ERRLOG("reconnect_delay_sec can't be more than ctrlr_loss_timeout_sec.\n");
		return false;
	}

	if (fast_io_fail_timeout_sec != 0) {
		if (fast_io_fail_timeout_sec < reconnect_delay_sec) {
			SPDK_ERRLOG("reconnect_delay_sec can't be more than fast_io_fail_timeout_sec.\n");
			return false;
		}
		if (ctrlr_loss_timeout_sec > 0 &&
		    fast_io_fail_timeout_sec > (uint32_t)ctrlr_loss_timeout_sec) {
			SPDK_ERRLOG("fast_io_fail_timeout_sec can't be more than ctrlr_loss_timeout_sec.\n");
			return false;
		}
	}

	return true;
}
6533 :
6534 : int
6535 52 : spdk_bdev_nvme_create(struct spdk_nvme_transport_id *trid,
6536 : const char *base_name,
6537 : const char **names,
6538 : uint32_t count,
6539 : spdk_bdev_nvme_create_cb cb_fn,
6540 : void *cb_ctx,
6541 : struct spdk_nvme_ctrlr_opts *drv_opts,
6542 : struct spdk_bdev_nvme_ctrlr_opts *bdev_opts)
6543 : {
6544 : struct nvme_probe_skip_entry *entry, *tmp;
6545 : struct nvme_async_probe_ctx *ctx;
6546 : spdk_nvme_attach_cb attach_cb;
6547 : struct nvme_ctrlr *nvme_ctrlr;
6548 : int len;
6549 :
6550 : /* TODO expand this check to include both the host and target TRIDs.
6551 : * Only if both are the same should we fail.
6552 : */
6553 52 : if (nvme_ctrlr_get(trid, drv_opts->hostnqn) != NULL) {
6554 0 : SPDK_ERRLOG("A controller with the provided trid (traddr: %s, hostnqn: %s) "
6555 : "already exists.\n", trid->traddr, drv_opts->hostnqn);
6556 0 : return -EEXIST;
6557 : }
6558 :
6559 52 : len = strnlen(base_name, SPDK_CONTROLLER_NAME_MAX);
6560 :
6561 52 : if (len == 0 || len == SPDK_CONTROLLER_NAME_MAX) {
6562 0 : SPDK_ERRLOG("controller name must be between 1 and %d characters\n", SPDK_CONTROLLER_NAME_MAX - 1);
6563 0 : return -EINVAL;
6564 : }
6565 :
6566 52 : if (bdev_opts != NULL &&
6567 104 : !bdev_nvme_check_io_error_resiliency_params(bdev_opts->ctrlr_loss_timeout_sec,
6568 52 : bdev_opts->reconnect_delay_sec,
6569 52 : bdev_opts->fast_io_fail_timeout_sec)) {
6570 0 : return -EINVAL;
6571 : }
6572 :
6573 52 : ctx = calloc(1, sizeof(*ctx));
6574 52 : if (!ctx) {
6575 0 : return -ENOMEM;
6576 : }
6577 52 : ctx->base_name = base_name;
6578 52 : ctx->names = names;
6579 52 : ctx->max_bdevs = count;
6580 52 : ctx->cb_fn = cb_fn;
6581 52 : ctx->cb_ctx = cb_ctx;
6582 52 : ctx->trid = *trid;
6583 :
6584 52 : if (bdev_opts) {
6585 52 : memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts));
6586 52 : } else {
6587 0 : spdk_bdev_nvme_get_default_ctrlr_opts(&ctx->bdev_opts);
6588 : }
6589 :
6590 52 : if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
6591 0 : TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, tmp) {
6592 0 : if (spdk_nvme_transport_id_compare(trid, &entry->trid) == 0) {
6593 0 : TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
6594 0 : free(entry);
6595 0 : break;
6596 : }
6597 0 : }
6598 0 : }
6599 :
6600 52 : memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts));
6601 52 : ctx->drv_opts.transport_retry_count = g_opts.transport_retry_count;
6602 52 : ctx->drv_opts.transport_ack_timeout = g_opts.transport_ack_timeout;
6603 52 : ctx->drv_opts.keep_alive_timeout_ms = g_opts.keep_alive_timeout_ms;
6604 52 : ctx->drv_opts.disable_read_ana_log_page = true;
6605 52 : ctx->drv_opts.transport_tos = g_opts.transport_tos;
6606 :
6607 52 : if (spdk_interrupt_mode_is_enabled()) {
6608 0 : if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) {
6609 0 : ctx->drv_opts.enable_interrupts = true;
6610 0 : } else {
6611 0 : SPDK_ERRLOG("Interrupt mode is only supported with PCIe transport\n");
6612 0 : free_nvme_async_probe_ctx(ctx);
6613 0 : return -ENOTSUP;
6614 : }
6615 0 : }
6616 :
6617 52 : if (ctx->bdev_opts.psk != NULL) {
6618 0 : ctx->drv_opts.tls_psk = spdk_keyring_get_key(ctx->bdev_opts.psk);
6619 0 : if (ctx->drv_opts.tls_psk == NULL) {
6620 0 : SPDK_ERRLOG("Could not load PSK: %s\n", ctx->bdev_opts.psk);
6621 0 : free_nvme_async_probe_ctx(ctx);
6622 0 : return -ENOKEY;
6623 : }
6624 0 : }
6625 :
6626 52 : if (ctx->bdev_opts.dhchap_key != NULL) {
6627 0 : ctx->drv_opts.dhchap_key = spdk_keyring_get_key(ctx->bdev_opts.dhchap_key);
6628 0 : if (ctx->drv_opts.dhchap_key == NULL) {
6629 0 : SPDK_ERRLOG("Could not load DH-HMAC-CHAP key: %s\n",
6630 : ctx->bdev_opts.dhchap_key);
6631 0 : free_nvme_async_probe_ctx(ctx);
6632 0 : return -ENOKEY;
6633 : }
6634 :
6635 0 : ctx->drv_opts.dhchap_digests = g_opts.dhchap_digests;
6636 0 : ctx->drv_opts.dhchap_dhgroups = g_opts.dhchap_dhgroups;
6637 0 : }
6638 52 : if (ctx->bdev_opts.dhchap_ctrlr_key != NULL) {
6639 0 : ctx->drv_opts.dhchap_ctrlr_key =
6640 0 : spdk_keyring_get_key(ctx->bdev_opts.dhchap_ctrlr_key);
6641 0 : if (ctx->drv_opts.dhchap_ctrlr_key == NULL) {
6642 0 : SPDK_ERRLOG("Could not load DH-HMAC-CHAP controller key: %s\n",
6643 : ctx->bdev_opts.dhchap_ctrlr_key);
6644 0 : free_nvme_async_probe_ctx(ctx);
6645 0 : return -ENOKEY;
6646 : }
6647 0 : }
6648 :
6649 52 : if (nvme_bdev_ctrlr_get_by_name(base_name) == NULL || ctx->bdev_opts.multipath) {
6650 48 : attach_cb = connect_attach_cb;
6651 48 : } else {
6652 4 : attach_cb = connect_set_failover_cb;
6653 : }
6654 :
6655 52 : nvme_ctrlr = nvme_ctrlr_get_by_name(ctx->base_name);
6656 52 : if (nvme_ctrlr && nvme_ctrlr->opts.multipath != ctx->bdev_opts.multipath) {
6657 : /* All controllers with the same name must be configured the same
6658 : * way, either for multipath or failover. If the configuration doesn't
6659 : * match - report error.
6660 : */
6661 0 : free_nvme_async_probe_ctx(ctx);
6662 0 : return -EINVAL;
6663 : }
6664 :
6665 52 : ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, attach_cb);
6666 52 : if (ctx->probe_ctx == NULL) {
6667 0 : SPDK_ERRLOG("No controller was found with provided trid (traddr: %s)\n", trid->traddr);
6668 0 : free_nvme_async_probe_ctx(ctx);
6669 0 : return -ENODEV;
6670 : }
6671 52 : ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_async_poll, ctx, 1000);
6672 :
6673 52 : return 0;
6674 52 : }
6675 :
6676 : struct bdev_nvme_delete_ctx {
6677 : char *name;
6678 : struct nvme_path_id path_id;
6679 : bdev_nvme_delete_done_fn delete_done;
6680 : void *delete_done_ctx;
6681 : uint64_t timeout_ticks;
6682 : struct spdk_poller *poller;
6683 : };
6684 :
6685 : static void
6686 2 : free_bdev_nvme_delete_ctx(struct bdev_nvme_delete_ctx *ctx)
6687 : {
6688 2 : if (ctx != NULL) {
6689 1 : free(ctx->name);
6690 1 : free(ctx);
6691 1 : }
6692 2 : }
6693 :
6694 : static bool
6695 75 : nvme_path_id_compare(struct nvme_path_id *p, const struct nvme_path_id *path_id)
6696 : {
6697 75 : if (path_id->trid.trtype != 0) {
6698 21 : if (path_id->trid.trtype == SPDK_NVME_TRANSPORT_CUSTOM) {
6699 0 : if (strcasecmp(path_id->trid.trstring, p->trid.trstring) != 0) {
6700 0 : return false;
6701 : }
6702 0 : } else {
6703 21 : if (path_id->trid.trtype != p->trid.trtype) {
6704 0 : return false;
6705 : }
6706 : }
6707 21 : }
6708 :
6709 75 : if (!spdk_mem_all_zero(path_id->trid.traddr, sizeof(path_id->trid.traddr))) {
6710 21 : if (strcasecmp(path_id->trid.traddr, p->trid.traddr) != 0) {
6711 11 : return false;
6712 : }
6713 10 : }
6714 :
6715 64 : if (path_id->trid.adrfam != 0) {
6716 0 : if (path_id->trid.adrfam != p->trid.adrfam) {
6717 0 : return false;
6718 : }
6719 0 : }
6720 :
6721 64 : if (!spdk_mem_all_zero(path_id->trid.trsvcid, sizeof(path_id->trid.trsvcid))) {
6722 10 : if (strcasecmp(path_id->trid.trsvcid, p->trid.trsvcid) != 0) {
6723 0 : return false;
6724 : }
6725 10 : }
6726 :
6727 64 : if (!spdk_mem_all_zero(path_id->trid.subnqn, sizeof(path_id->trid.subnqn))) {
6728 10 : if (strcmp(path_id->trid.subnqn, p->trid.subnqn) != 0) {
6729 0 : return false;
6730 : }
6731 10 : }
6732 :
6733 64 : if (!spdk_mem_all_zero(path_id->hostid.hostaddr, sizeof(path_id->hostid.hostaddr))) {
6734 0 : if (strcmp(path_id->hostid.hostaddr, p->hostid.hostaddr) != 0) {
6735 0 : return false;
6736 : }
6737 0 : }
6738 :
6739 64 : if (!spdk_mem_all_zero(path_id->hostid.hostsvcid, sizeof(path_id->hostid.hostsvcid))) {
6740 0 : if (strcmp(path_id->hostid.hostsvcid, p->hostid.hostsvcid) != 0) {
6741 0 : return false;
6742 : }
6743 0 : }
6744 :
6745 64 : return true;
6746 75 : }
6747 :
6748 : static bool
6749 2 : nvme_path_id_exists(const char *name, const struct nvme_path_id *path_id)
6750 : {
6751 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
6752 : struct nvme_ctrlr *ctrlr;
6753 : struct nvme_path_id *p;
6754 :
6755 2 : pthread_mutex_lock(&g_bdev_nvme_mutex);
6756 2 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
6757 2 : if (!nbdev_ctrlr) {
6758 1 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6759 1 : return false;
6760 : }
6761 :
6762 1 : TAILQ_FOREACH(ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
6763 1 : pthread_mutex_lock(&ctrlr->mutex);
6764 1 : TAILQ_FOREACH(p, &ctrlr->trids, link) {
6765 1 : if (nvme_path_id_compare(p, path_id)) {
6766 1 : pthread_mutex_unlock(&ctrlr->mutex);
6767 1 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6768 1 : return true;
6769 : }
6770 0 : }
6771 0 : pthread_mutex_unlock(&ctrlr->mutex);
6772 0 : }
6773 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6774 :
6775 0 : return false;
6776 2 : }
6777 :
6778 : static int
6779 2 : bdev_nvme_delete_complete_poll(void *arg)
6780 : {
6781 2 : struct bdev_nvme_delete_ctx *ctx = arg;
6782 2 : int rc = 0;
6783 :
6784 2 : if (nvme_path_id_exists(ctx->name, &ctx->path_id)) {
6785 1 : if (ctx->timeout_ticks > spdk_get_ticks()) {
6786 1 : return SPDK_POLLER_BUSY;
6787 : }
6788 :
6789 0 : SPDK_ERRLOG("NVMe path '%s' still exists after delete\n", ctx->name);
6790 0 : rc = -ETIMEDOUT;
6791 0 : }
6792 :
6793 1 : spdk_poller_unregister(&ctx->poller);
6794 :
6795 1 : ctx->delete_done(ctx->delete_done_ctx, rc);
6796 1 : free_bdev_nvme_delete_ctx(ctx);
6797 :
6798 1 : return SPDK_POLLER_BUSY;
6799 2 : }
6800 :
6801 : static int
6802 64 : _bdev_nvme_delete(struct nvme_ctrlr *nvme_ctrlr, const struct nvme_path_id *path_id)
6803 : {
6804 : struct nvme_path_id *p, *t;
6805 : spdk_msg_fn msg_fn;
6806 64 : int rc = -ENXIO;
6807 :
6808 64 : pthread_mutex_lock(&nvme_ctrlr->mutex);
6809 :
6810 74 : TAILQ_FOREACH_REVERSE_SAFE(p, &nvme_ctrlr->trids, nvme_paths, link, t) {
6811 74 : if (p == TAILQ_FIRST(&nvme_ctrlr->trids)) {
6812 64 : break;
6813 : }
6814 :
6815 10 : if (!nvme_path_id_compare(p, path_id)) {
6816 3 : continue;
6817 : }
6818 :
6819 : /* We are not using the specified path. */
6820 7 : TAILQ_REMOVE(&nvme_ctrlr->trids, p, link);
6821 7 : free(p);
6822 7 : rc = 0;
6823 7 : }
6824 :
6825 64 : if (p == NULL || !nvme_path_id_compare(p, path_id)) {
6826 8 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
6827 8 : return rc;
6828 : }
6829 :
6830 : /* If we made it here, then this path is a match! Now we need to remove it. */
6831 :
6832 : /* This is the active path in use right now. The active path is always the first in the list. */
6833 56 : assert(p == nvme_ctrlr->active_path_id);
6834 :
6835 56 : if (!TAILQ_NEXT(p, link)) {
6836 : /* The current path is the only path. */
6837 55 : msg_fn = _nvme_ctrlr_destruct;
6838 55 : rc = bdev_nvme_delete_ctrlr_unsafe(nvme_ctrlr, false);
6839 55 : } else {
6840 : /* There is an alternative path. */
6841 1 : msg_fn = _bdev_nvme_reset_ctrlr;
6842 1 : rc = bdev_nvme_failover_ctrlr_unsafe(nvme_ctrlr, true);
6843 : }
6844 :
6845 56 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
6846 :
6847 56 : if (rc == 0) {
6848 56 : spdk_thread_send_msg(nvme_ctrlr->thread, msg_fn, nvme_ctrlr);
6849 56 : } else if (rc == -EALREADY) {
6850 0 : rc = 0;
6851 0 : }
6852 :
6853 56 : return rc;
6854 64 : }
6855 :
6856 : int
6857 49 : bdev_nvme_delete(const char *name, const struct nvme_path_id *path_id,
6858 : bdev_nvme_delete_done_fn delete_done, void *delete_done_ctx)
6859 : {
6860 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
6861 : struct nvme_ctrlr *nvme_ctrlr, *tmp_nvme_ctrlr;
6862 49 : struct bdev_nvme_delete_ctx *ctx = NULL;
6863 49 : int rc = -ENXIO, _rc;
6864 :
6865 49 : if (name == NULL || path_id == NULL) {
6866 0 : rc = -EINVAL;
6867 0 : goto exit;
6868 : }
6869 :
6870 49 : pthread_mutex_lock(&g_bdev_nvme_mutex);
6871 :
6872 49 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
6873 49 : if (nbdev_ctrlr == NULL) {
6874 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6875 :
6876 0 : SPDK_ERRLOG("Failed to find NVMe bdev controller\n");
6877 0 : rc = -ENODEV;
6878 0 : goto exit;
6879 : }
6880 :
6881 113 : TAILQ_FOREACH_SAFE(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq, tmp_nvme_ctrlr) {
6882 64 : _rc = _bdev_nvme_delete(nvme_ctrlr, path_id);
6883 64 : if (_rc < 0 && _rc != -ENXIO) {
6884 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6885 0 : rc = _rc;
6886 0 : goto exit;
6887 64 : } else if (_rc == 0) {
6888 : /* We traverse all remaining nvme_ctrlrs even if one nvme_ctrlr
6889 : * was deleted successfully. To remember the successful deletion,
6890 : * overwrite rc only if _rc is zero.
6891 : */
6892 58 : rc = 0;
6893 58 : }
6894 64 : }
6895 :
6896 49 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
6897 :
6898 49 : if (rc != 0 || delete_done == NULL) {
6899 48 : goto exit;
6900 : }
6901 :
6902 1 : ctx = calloc(1, sizeof(*ctx));
6903 1 : if (ctx == NULL) {
6904 0 : SPDK_ERRLOG("Failed to allocate context for bdev_nvme_delete\n");
6905 0 : rc = -ENOMEM;
6906 0 : goto exit;
6907 : }
6908 :
6909 1 : ctx->name = strdup(name);
6910 1 : if (ctx->name == NULL) {
6911 0 : SPDK_ERRLOG("Failed to copy controller name for deletion\n");
6912 0 : rc = -ENOMEM;
6913 0 : goto exit;
6914 : }
6915 :
6916 1 : ctx->delete_done = delete_done;
6917 1 : ctx->delete_done_ctx = delete_done_ctx;
6918 1 : ctx->path_id = *path_id;
6919 1 : ctx->timeout_ticks = spdk_get_ticks() + 10 * spdk_get_ticks_hz();
6920 1 : ctx->poller = SPDK_POLLER_REGISTER(bdev_nvme_delete_complete_poll, ctx, 1000);
6921 1 : if (ctx->poller == NULL) {
6922 0 : SPDK_ERRLOG("Failed to register bdev_nvme_delete poller\n");
6923 0 : rc = -ENOMEM;
6924 0 : goto exit;
6925 : }
6926 :
6927 : exit:
6928 49 : if (rc != 0) {
6929 1 : free_bdev_nvme_delete_ctx(ctx);
6930 1 : }
6931 :
6932 49 : return rc;
6933 : }
6934 :
/* Logging helpers that prefix every message with the discovery service's
 * address ("Discovery[traddr:trsvcid] "). `ctx` must evaluate to a pointer to
 * an object with a `trid` member; it is parenthesized so that any pointer
 * expression (not just a plain identifier) can be passed.
 *
 * NOTE: the expansions end with a semicolon, as they always have; existing
 * call sites rely on this, so it is kept.
 */
#define DISCOVERY_INFOLOG(ctx, format, ...) \
	SPDK_INFOLOG(bdev_nvme, "Discovery[%s:%s] " format, (ctx)->trid.traddr, (ctx)->trid.trsvcid, ##__VA_ARGS__);

#define DISCOVERY_ERRLOG(ctx, format, ...) \
	SPDK_ERRLOG("Discovery[%s:%s] " format, (ctx)->trid.traddr, (ctx)->trid.trsvcid, ##__VA_ARGS__);
6940 :
/* One entry tracked by a discovery service: either a discovery controller
 * that can be connected to, or an NVM subsystem path reported by the
 * discovery log page.
 */
struct discovery_entry_ctx {
	/* Controller name used for spdk_bdev_nvme_create()/bdev_nvme_delete(); only set for NVM entries. */
	char name[128];
	/* Transport ID to connect to. */
	struct spdk_nvme_transport_id trid;
	/* Driver options for the connection (defaults plus the service's hostnqn). */
	struct spdk_nvme_ctrlr_opts drv_opts;
	/* Raw copy of the log page entry; filled for NVM entries and compared with memcmp(). */
	struct spdk_nvmf_discovery_log_page_entry entry;
	/* Link in discovery_ctx::nvm_entry_ctxs or discovery_ctx::discovery_entry_ctxs. */
	TAILQ_ENTRY(discovery_entry_ctx) tailq;
	/* Discovery service that owns this entry. */
	struct discovery_ctx *ctx;
};
6949 :
/* State of one running discovery service. */
struct discovery_ctx {
	/* Heap copy of the base name; also the prefix of generated controller names. */
	char *name;
	/* Called once when the start completes (see complete_discovery_start()). */
	spdk_bdev_nvme_start_discovery_fn start_cb_fn;
	/* Called from the poller once the service has been stopped. */
	spdk_bdev_nvme_stop_discovery_fn stop_cb_fn;
	/* Argument for start_cb_fn, later reused as the argument for stop_cb_fn. */
	void *cb_ctx;
	/* Non-NULL while an asynchronous connect to a discovery controller is pending. */
	struct spdk_nvme_probe_ctx *probe_ctx;
	/* Non-NULL while an asynchronous detach of the discovery controller is pending. */
	struct spdk_nvme_detach_ctx *detach_ctx;
	/* Currently attached discovery controller, or NULL. */
	struct spdk_nvme_ctrlr *ctrlr;
	/* Transport ID the service was started with; used as the log prefix. */
	struct spdk_nvme_transport_id trid;
	/* Discovery entry currently being connected to / in use, taken off discovery_entry_ctxs. */
	struct discovery_entry_ctx *entry_ctx_in_use;
	/* Poller running discovery_poller(). */
	struct spdk_poller *poller;
	/* Driver options used when connecting to the discovery controller. */
	struct spdk_nvme_ctrlr_opts drv_opts;
	/* bdev options applied to every NVM controller attached by this service. */
	struct spdk_bdev_nvme_ctrlr_opts bdev_opts;
	/* Most recent discovery log page; freed in discovery_remove_controllers(). */
	struct spdk_nvmf_discovery_log_page *log_page;
	/* Link in g_discovery_ctxs. */
	TAILQ_ENTRY(discovery_ctx) tailq;
	/* NVM subsystem paths that were attached (or are being attached). */
	TAILQ_HEAD(, discovery_entry_ctx) nvm_entry_ctxs;
	/* Known discovery controllers that may be connected to. */
	TAILQ_HEAD(, discovery_entry_ctx) discovery_entry_ctxs;
	/* Status of the start operation; set by complete_discovery_start(). */
	int rc;
	/* True when a start callback was supplied, i.e. the caller waits for the attach. */
	bool wait_for_attach;
	/* Deadline for the attach, in ticks; 0 means no timeout. */
	uint64_t timeout_ticks;
	/* Denotes that the discovery service is being started. We're waiting
	 * for the initial connection to the discovery controller to be
	 * established and attach discovered NVM ctrlrs.
	 */
	bool initializing;
	/* Denotes if a discovery is currently in progress for this context.
	 * That includes connecting to newly discovered subsystems. Used to
	 * ensure we do not start a new discovery until an existing one is
	 * complete.
	 */
	bool in_progress;

	/* Denotes if another discovery is needed after the one in progress
	 * completes. Set when we receive an AER completion while a discovery
	 * is already in progress.
	 */
	bool pending;

	/* Signal to the discovery context poller that it should stop the
	 * discovery service, including detaching from the current discovery
	 * controller.
	 */
	bool stop;

	/* Thread that called bdev_nvme_start_discovery(). */
	struct spdk_thread *calling_thread;
	/* Counter appended to `name` to build the name of each new subsystem. */
	uint32_t index;
	/* Number of spdk_bdev_nvme_create() calls whose completion is still outstanding. */
	uint32_t attach_in_progress;
	/* Heap copy of drv_opts.hostnqn, applied to every entry's driver options. */
	char *hostnqn;

	/* Denotes if the discovery service was started by the mdns discovery.
	 */
	bool from_mdns_discovery_service;
};
7003 :
/* List of all running discovery services. */
TAILQ_HEAD(discovery_ctxs, discovery_ctx);
static struct discovery_ctxs g_discovery_ctxs = TAILQ_HEAD_INITIALIZER(g_discovery_ctxs);

/* Forward declaration: discovery_complete() restarts discovery when one is pending. */
static void get_discovery_log_page(struct discovery_ctx *ctx);
7008 :
7009 : static void
7010 0 : free_discovery_ctx(struct discovery_ctx *ctx)
7011 : {
7012 0 : free(ctx->log_page);
7013 0 : free(ctx->hostnqn);
7014 0 : free(ctx->name);
7015 0 : free(ctx);
7016 0 : }
7017 :
7018 : static void
7019 0 : discovery_complete(struct discovery_ctx *ctx)
7020 : {
7021 0 : ctx->initializing = false;
7022 0 : ctx->in_progress = false;
7023 0 : if (ctx->pending) {
7024 0 : ctx->pending = false;
7025 0 : get_discovery_log_page(ctx);
7026 0 : }
7027 0 : }
7028 :
7029 : static void
7030 0 : build_trid_from_log_page_entry(struct spdk_nvme_transport_id *trid,
7031 : struct spdk_nvmf_discovery_log_page_entry *entry)
7032 : {
7033 : char *space;
7034 :
7035 0 : trid->trtype = entry->trtype;
7036 0 : trid->adrfam = entry->adrfam;
7037 0 : memcpy(trid->traddr, entry->traddr, sizeof(entry->traddr));
7038 0 : memcpy(trid->trsvcid, entry->trsvcid, sizeof(entry->trsvcid));
7039 : /* Because the source buffer (entry->subnqn) is longer than trid->subnqn, and
7040 : * before call to this function trid->subnqn is zeroed out, we need
7041 : * to copy sizeof(trid->subnqn) minus one byte to make sure the last character
7042 : * remains 0. Then we can shorten the string (replace ' ' with 0) if required
7043 : */
7044 0 : memcpy(trid->subnqn, entry->subnqn, sizeof(trid->subnqn) - 1);
7045 :
7046 : /* We want the traddr, trsvcid and subnqn fields to be NULL-terminated.
7047 : * But the log page entries typically pad them with spaces, not zeroes.
7048 : * So add a NULL terminator to each of these fields at the appropriate
7049 : * location.
7050 : */
7051 0 : space = strchr(trid->traddr, ' ');
7052 0 : if (space) {
7053 0 : *space = 0;
7054 0 : }
7055 0 : space = strchr(trid->trsvcid, ' ');
7056 0 : if (space) {
7057 0 : *space = 0;
7058 0 : }
7059 0 : space = strchr(trid->subnqn, ' ');
7060 0 : if (space) {
7061 0 : *space = 0;
7062 0 : }
7063 0 : }
7064 :
7065 : static void
7066 0 : _stop_discovery(void *_ctx)
7067 : {
7068 0 : struct discovery_ctx *ctx = _ctx;
7069 :
7070 0 : if (ctx->attach_in_progress > 0) {
7071 0 : spdk_thread_send_msg(spdk_get_thread(), _stop_discovery, ctx);
7072 0 : return;
7073 : }
7074 :
7075 0 : ctx->stop = true;
7076 :
7077 0 : while (!TAILQ_EMPTY(&ctx->nvm_entry_ctxs)) {
7078 : struct discovery_entry_ctx *entry_ctx;
7079 0 : struct nvme_path_id path = {};
7080 :
7081 0 : entry_ctx = TAILQ_FIRST(&ctx->nvm_entry_ctxs);
7082 0 : path.trid = entry_ctx->trid;
7083 0 : bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL);
7084 0 : TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq);
7085 0 : free(entry_ctx);
7086 : }
7087 :
7088 0 : while (!TAILQ_EMPTY(&ctx->discovery_entry_ctxs)) {
7089 : struct discovery_entry_ctx *entry_ctx;
7090 :
7091 0 : entry_ctx = TAILQ_FIRST(&ctx->discovery_entry_ctxs);
7092 0 : TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq);
7093 0 : free(entry_ctx);
7094 : }
7095 :
7096 0 : free(ctx->entry_ctx_in_use);
7097 0 : ctx->entry_ctx_in_use = NULL;
7098 0 : }
7099 :
7100 : static void
7101 0 : stop_discovery(struct discovery_ctx *ctx, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx)
7102 : {
7103 0 : ctx->stop_cb_fn = cb_fn;
7104 0 : ctx->cb_ctx = cb_ctx;
7105 :
7106 0 : if (ctx->attach_in_progress > 0) {
7107 0 : DISCOVERY_INFOLOG(ctx, "stopping discovery with attach_in_progress: %"PRIu32"\n",
7108 : ctx->attach_in_progress);
7109 0 : }
7110 :
7111 0 : _stop_discovery(ctx);
7112 0 : }
7113 :
7114 : static void
7115 2 : remove_discovery_entry(struct nvme_ctrlr *nvme_ctrlr)
7116 : {
7117 : struct discovery_ctx *d_ctx;
7118 : struct nvme_path_id *path_id;
7119 2 : struct spdk_nvme_transport_id trid = {};
7120 : struct discovery_entry_ctx *entry_ctx, *tmp;
7121 :
7122 2 : path_id = TAILQ_FIRST(&nvme_ctrlr->trids);
7123 :
7124 2 : TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) {
7125 0 : TAILQ_FOREACH_SAFE(entry_ctx, &d_ctx->nvm_entry_ctxs, tailq, tmp) {
7126 0 : build_trid_from_log_page_entry(&trid, &entry_ctx->entry);
7127 0 : if (spdk_nvme_transport_id_compare(&trid, &path_id->trid) != 0) {
7128 0 : continue;
7129 : }
7130 :
7131 0 : TAILQ_REMOVE(&d_ctx->nvm_entry_ctxs, entry_ctx, tailq);
7132 0 : free(entry_ctx);
7133 0 : DISCOVERY_INFOLOG(d_ctx, "Remove discovery entry: %s:%s:%s\n",
7134 : trid.subnqn, trid.traddr, trid.trsvcid);
7135 :
7136 : /* Fail discovery ctrlr to force reattach attempt */
7137 0 : spdk_nvme_ctrlr_fail(d_ctx->ctrlr);
7138 0 : }
7139 0 : }
7140 2 : }
7141 :
7142 : static void
7143 0 : discovery_remove_controllers(struct discovery_ctx *ctx)
7144 : {
7145 0 : struct spdk_nvmf_discovery_log_page *log_page = ctx->log_page;
7146 : struct discovery_entry_ctx *entry_ctx, *tmp;
7147 : struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry;
7148 0 : struct spdk_nvme_transport_id old_trid = {};
7149 : uint64_t numrec, i;
7150 : bool found;
7151 :
7152 0 : numrec = from_le64(&log_page->numrec);
7153 0 : TAILQ_FOREACH_SAFE(entry_ctx, &ctx->nvm_entry_ctxs, tailq, tmp) {
7154 0 : found = false;
7155 0 : old_entry = &entry_ctx->entry;
7156 0 : build_trid_from_log_page_entry(&old_trid, old_entry);
7157 0 : for (i = 0; i < numrec; i++) {
7158 0 : new_entry = &log_page->entries[i];
7159 0 : if (!memcmp(old_entry, new_entry, sizeof(*old_entry))) {
7160 0 : DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s found again\n",
7161 : old_trid.subnqn, old_trid.traddr, old_trid.trsvcid);
7162 0 : found = true;
7163 0 : break;
7164 : }
7165 0 : }
7166 0 : if (!found) {
7167 0 : struct nvme_path_id path = {};
7168 :
7169 0 : DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s not found\n",
7170 : old_trid.subnqn, old_trid.traddr, old_trid.trsvcid);
7171 :
7172 0 : path.trid = entry_ctx->trid;
7173 0 : bdev_nvme_delete(entry_ctx->name, &path, NULL, NULL);
7174 0 : TAILQ_REMOVE(&ctx->nvm_entry_ctxs, entry_ctx, tailq);
7175 0 : free(entry_ctx);
7176 0 : }
7177 0 : }
7178 0 : free(log_page);
7179 0 : ctx->log_page = NULL;
7180 0 : discovery_complete(ctx);
7181 0 : }
7182 :
7183 : static void
7184 0 : complete_discovery_start(struct discovery_ctx *ctx, int status)
7185 : {
7186 0 : ctx->timeout_ticks = 0;
7187 0 : ctx->rc = status;
7188 0 : if (ctx->start_cb_fn) {
7189 0 : ctx->start_cb_fn(ctx->cb_ctx, status);
7190 0 : ctx->start_cb_fn = NULL;
7191 0 : ctx->cb_ctx = NULL;
7192 0 : }
7193 0 : }
7194 :
7195 : static void
7196 0 : discovery_attach_controller_done(void *cb_ctx, size_t bdev_count, int rc)
7197 : {
7198 0 : struct discovery_entry_ctx *entry_ctx = cb_ctx;
7199 0 : struct discovery_ctx *ctx = entry_ctx->ctx;
7200 :
7201 0 : DISCOVERY_INFOLOG(ctx, "attach %s done\n", entry_ctx->name);
7202 0 : ctx->attach_in_progress--;
7203 0 : if (ctx->attach_in_progress == 0) {
7204 0 : complete_discovery_start(ctx, ctx->rc);
7205 0 : if (ctx->initializing && ctx->rc != 0) {
7206 0 : DISCOVERY_ERRLOG(ctx, "stopping discovery due to errors: %d\n", ctx->rc);
7207 0 : stop_discovery(ctx, NULL, ctx->cb_ctx);
7208 0 : } else {
7209 0 : discovery_remove_controllers(ctx);
7210 : }
7211 0 : }
7212 0 : }
7213 :
7214 : static struct discovery_entry_ctx *
7215 0 : create_discovery_entry_ctx(struct discovery_ctx *ctx, struct spdk_nvme_transport_id *trid)
7216 : {
7217 : struct discovery_entry_ctx *new_ctx;
7218 :
7219 0 : new_ctx = calloc(1, sizeof(*new_ctx));
7220 0 : if (new_ctx == NULL) {
7221 0 : DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
7222 0 : return NULL;
7223 : }
7224 :
7225 0 : new_ctx->ctx = ctx;
7226 0 : memcpy(&new_ctx->trid, trid, sizeof(*trid));
7227 0 : spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts));
7228 0 : snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn);
7229 0 : return new_ctx;
7230 0 : }
7231 :
/* Completion callback for the discovery log page request.
 *
 * Rebuilds the list of known discovery controllers from the log page and
 * starts attaching every NVM subsystem entry that is not yet tracked. When no
 * attach had to be started, stale controllers are removed immediately;
 * otherwise that happens in discovery_attach_controller_done().
 *
 * \param cb_arg The discovery_ctx.
 * \param rc Status of the request.
 * \param cpl NVMe completion of the request.
 * \param log_page The retrieved log page; it is stored in ctx->log_page and
 * freed later by discovery_remove_controllers().
 */
static void
discovery_log_page_cb(void *cb_arg, int rc, const struct spdk_nvme_cpl *cpl,
		      struct spdk_nvmf_discovery_log_page *log_page)
{
	struct discovery_ctx *ctx = cb_arg;
	struct discovery_entry_ctx *entry_ctx, *tmp;
	struct spdk_nvmf_discovery_log_page_entry *new_entry, *old_entry;
	uint64_t numrec, i;
	bool found;

	/* NOTE(review): this error path leaves ctx->in_progress set (it was set
	 * by get_discovery_log_page()), so later AERs only set ctx->pending -
	 * confirm whether that is intended.
	 */
	if (rc || spdk_nvme_cpl_is_error(cpl)) {
		DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n");
		return;
	}

	ctx->log_page = log_page;
	assert(ctx->attach_in_progress == 0);
	numrec = from_le64(&log_page->numrec);
	/* The list of discovery controllers is rebuilt from scratch below. */
	TAILQ_FOREACH_SAFE(entry_ctx, &ctx->discovery_entry_ctxs, tailq, tmp) {
		TAILQ_REMOVE(&ctx->discovery_entry_ctxs, entry_ctx, tailq);
		free(entry_ctx);
	}
	for (i = 0; i < numrec; i++) {
		found = false;
		new_entry = &log_page->entries[i];
		/* Entries describing discovery controllers are only remembered, not attached. */
		if (new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT ||
		    new_entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) {
			struct discovery_entry_ctx *new_ctx;
			struct spdk_nvme_transport_id trid = {};

			build_trid_from_log_page_entry(&trid, new_entry);
			new_ctx = create_discovery_entry_ctx(ctx, &trid);
			if (new_ctx == NULL) {
				DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
				break;
			}

			TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, new_ctx, tailq);
			continue;
		}
		/* An NVM entry is already known if an identical raw entry is tracked. */
		TAILQ_FOREACH(entry_ctx, &ctx->nvm_entry_ctxs, tailq) {
			old_entry = &entry_ctx->entry;
			if (!memcmp(new_entry, old_entry, sizeof(*new_entry))) {
				found = true;
				break;
			}
		}
		if (!found) {
			struct discovery_entry_ctx *subnqn_ctx = NULL, *new_ctx;
			struct discovery_ctx *d_ctx;

			/* Search all discovery services for an entry with the same subnqn,
			 * so that a new path to a known subsystem reuses its controller name.
			 */
			TAILQ_FOREACH(d_ctx, &g_discovery_ctxs, tailq) {
				TAILQ_FOREACH(subnqn_ctx, &d_ctx->nvm_entry_ctxs, tailq) {
					if (!memcmp(subnqn_ctx->entry.subnqn, new_entry->subnqn,
						    sizeof(new_entry->subnqn))) {
						break;
					}
				}
				if (subnqn_ctx) {
					break;
				}
			}

			new_ctx = calloc(1, sizeof(*new_ctx));
			if (new_ctx == NULL) {
				DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
				break;
			}

			new_ctx->ctx = ctx;
			memcpy(&new_ctx->entry, new_entry, sizeof(*new_entry));
			build_trid_from_log_page_entry(&new_ctx->trid, new_entry);
			if (subnqn_ctx) {
				snprintf(new_ctx->name, sizeof(new_ctx->name), "%s", subnqn_ctx->name);
				DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new path for %s\n",
						  new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid,
						  new_ctx->name);
			} else {
				/* New subsystem: name it "<service name><running index>". */
				snprintf(new_ctx->name, sizeof(new_ctx->name), "%s%d", ctx->name, ctx->index++);
				DISCOVERY_INFOLOG(ctx, "NVM %s:%s:%s new subsystem %s\n",
						  new_ctx->trid.subnqn, new_ctx->trid.traddr, new_ctx->trid.trsvcid,
						  new_ctx->name);
			}
			spdk_nvme_ctrlr_get_default_ctrlr_opts(&new_ctx->drv_opts, sizeof(new_ctx->drv_opts));
			snprintf(new_ctx->drv_opts.hostnqn, sizeof(new_ctx->drv_opts.hostnqn), "%s", ctx->hostnqn);
			rc = spdk_bdev_nvme_create(&new_ctx->trid, new_ctx->name, NULL, 0,
						   discovery_attach_controller_done, new_ctx,
						   &new_ctx->drv_opts, &ctx->bdev_opts);
			if (rc == 0) {
				TAILQ_INSERT_TAIL(&ctx->nvm_entry_ctxs, new_ctx, tailq);
				ctx->attach_in_progress++;
			} else {
				/* NOTE(review): new_ctx is not freed on this path - confirm ownership. */
				DISCOVERY_ERRLOG(ctx, "spdk_bdev_nvme_create failed (%s)\n", spdk_strerror(-rc));
			}
		}
	}

	/* Nothing is being attached, so the round can be finished right away. */
	if (ctx->attach_in_progress == 0) {
		discovery_remove_controllers(ctx);
	}
}
7333 :
7334 : static void
7335 0 : get_discovery_log_page(struct discovery_ctx *ctx)
7336 : {
7337 : int rc;
7338 :
7339 0 : assert(ctx->in_progress == false);
7340 0 : ctx->in_progress = true;
7341 0 : rc = spdk_nvme_ctrlr_get_discovery_log_page(ctx->ctrlr, discovery_log_page_cb, ctx);
7342 0 : if (rc != 0) {
7343 0 : DISCOVERY_ERRLOG(ctx, "could not get discovery log page\n");
7344 0 : }
7345 0 : DISCOVERY_INFOLOG(ctx, "sent discovery log page command\n");
7346 0 : }
7347 :
7348 : static void
7349 0 : discovery_aer_cb(void *arg, const struct spdk_nvme_cpl *cpl)
7350 : {
7351 0 : struct discovery_ctx *ctx = arg;
7352 0 : uint32_t log_page_id = (cpl->cdw0 & 0xFF0000) >> 16;
7353 :
7354 0 : if (spdk_nvme_cpl_is_error(cpl)) {
7355 0 : DISCOVERY_ERRLOG(ctx, "aer failed\n");
7356 0 : return;
7357 : }
7358 :
7359 0 : if (log_page_id != SPDK_NVME_LOG_DISCOVERY) {
7360 0 : DISCOVERY_ERRLOG(ctx, "unexpected log page 0x%x\n", log_page_id);
7361 0 : return;
7362 : }
7363 :
7364 0 : DISCOVERY_INFOLOG(ctx, "got aer\n");
7365 0 : if (ctx->in_progress) {
7366 0 : ctx->pending = true;
7367 0 : return;
7368 : }
7369 :
7370 0 : get_discovery_log_page(ctx);
7371 0 : }
7372 :
7373 : static void
7374 0 : discovery_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
7375 : struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
7376 : {
7377 0 : struct spdk_nvme_ctrlr_opts *user_opts = cb_ctx;
7378 : struct discovery_ctx *ctx;
7379 :
7380 0 : ctx = SPDK_CONTAINEROF(user_opts, struct discovery_ctx, drv_opts);
7381 :
7382 0 : DISCOVERY_INFOLOG(ctx, "discovery ctrlr attached\n");
7383 0 : ctx->probe_ctx = NULL;
7384 0 : ctx->ctrlr = ctrlr;
7385 :
7386 0 : if (ctx->rc != 0) {
7387 0 : DISCOVERY_ERRLOG(ctx, "encountered error while attaching discovery ctrlr: %d\n",
7388 : ctx->rc);
7389 0 : return;
7390 : }
7391 :
7392 0 : spdk_nvme_ctrlr_register_aer_callback(ctx->ctrlr, discovery_aer_cb, ctx);
7393 0 : }
7394 :
/* Poller driving the state machine of one discovery service.
 *
 * Depending on the state of `ctx` a single invocation does one of:
 *  - poll a pending detach of the discovery controller,
 *  - tear the service down after a stop request,
 *  - start connecting to the next known discovery controller,
 *  - poll a pending connect and request the log page once connected,
 *  - process admin completions of the attached discovery controller.
 *
 * Always returns SPDK_POLLER_BUSY.
 */
static int
discovery_poller(void *arg)
{
	struct discovery_ctx *ctx = arg;
	struct spdk_nvme_transport_id *trid;
	int rc;

	if (ctx->detach_ctx) {
		/* A detach of the discovery controller is in flight. */
		rc = spdk_nvme_detach_poll_async(ctx->detach_ctx);
		if (rc != -EAGAIN) {
			ctx->detach_ctx = NULL;
			ctx->ctrlr = NULL;
		}
	} else if (ctx->stop) {
		/* Stop requested: detach the controller first, then free the service. */
		if (ctx->ctrlr != NULL) {
			rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx);
			if (rc == 0) {
				return SPDK_POLLER_BUSY;
			}
			DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n");
		}
		spdk_poller_unregister(&ctx->poller);
		TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq);
		assert(ctx->start_cb_fn == NULL);
		if (ctx->stop_cb_fn != NULL) {
			ctx->stop_cb_fn(ctx->cb_ctx);
		}
		free_discovery_ctx(ctx);
	} else if (ctx->probe_ctx == NULL && ctx->ctrlr == NULL) {
		/* Not connected and not connecting: try the next discovery controller. */
		if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) {
			DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n");
			assert(ctx->initializing);
			spdk_poller_unregister(&ctx->poller);
			TAILQ_REMOVE(&g_discovery_ctxs, ctx, tailq);
			complete_discovery_start(ctx, -ETIMEDOUT);
			stop_discovery(ctx, NULL, NULL);
			free_discovery_ctx(ctx);
			return SPDK_POLLER_BUSY;
		}

		assert(ctx->entry_ctx_in_use == NULL);
		ctx->entry_ctx_in_use = TAILQ_FIRST(&ctx->discovery_entry_ctxs);
		TAILQ_REMOVE(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq);
		trid = &ctx->entry_ctx_in_use->trid;

		/* All controllers must be configured explicitly either for multipath or failover.
		 * While discovery use multipath mode, we need to set this in bdev options as well.
		 */
		ctx->bdev_opts.multipath = true;

		ctx->probe_ctx = spdk_nvme_connect_async(trid, &ctx->drv_opts, discovery_attach_cb);
		if (ctx->probe_ctx) {
			/* Re-register with a 1000 us period while the connect is in progress. */
			spdk_poller_unregister(&ctx->poller);
			ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000);
		} else {
			/* Put the entry back at the tail so that another one is tried next. */
			DISCOVERY_ERRLOG(ctx, "could not start discovery connect\n");
			TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq);
			ctx->entry_ctx_in_use = NULL;
		}
	} else if (ctx->probe_ctx) {
		/* Connect to the discovery controller is in progress. */
		if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) {
			DISCOVERY_ERRLOG(ctx, "timed out while attaching discovery ctrlr\n");
			complete_discovery_start(ctx, -ETIMEDOUT);
			return SPDK_POLLER_BUSY;
		}

		rc = spdk_nvme_probe_poll_async(ctx->probe_ctx);
		if (rc != -EAGAIN) {
			if (ctx->rc != 0) {
				assert(ctx->initializing);
				stop_discovery(ctx, NULL, ctx->cb_ctx);
			} else {
				assert(rc == 0);
				DISCOVERY_INFOLOG(ctx, "discovery ctrlr connected\n");
				ctx->rc = rc;
				get_discovery_log_page(ctx);
			}
		}
	} else {
		/* Connected to the discovery controller. */
		if (ctx->timeout_ticks != 0 && ctx->timeout_ticks < spdk_get_ticks()) {
			DISCOVERY_ERRLOG(ctx, "timed out while attaching NVM ctrlrs\n");
			complete_discovery_start(ctx, -ETIMEDOUT);
			/* We need to wait until all NVM ctrlrs are attached before we stop the
			 * discovery service to make sure we don't detach a ctrlr that is still
			 * being attached.
			 */
			if (ctx->attach_in_progress == 0) {
				stop_discovery(ctx, NULL, ctx->cb_ctx);
				return SPDK_POLLER_BUSY;
			}
		}

		rc = spdk_nvme_ctrlr_process_admin_completions(ctx->ctrlr);
		if (rc < 0) {
			/* Processing failed: switch to the 1 s period, return the entry in use
			 * to the list of candidates and detach the controller.
			 */
			spdk_poller_unregister(&ctx->poller);
			ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000);
			TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, ctx->entry_ctx_in_use, tailq);
			ctx->entry_ctx_in_use = NULL;

			rc = spdk_nvme_detach_async(ctx->ctrlr, &ctx->detach_ctx);
			if (rc != 0) {
				DISCOVERY_ERRLOG(ctx, "could not detach discovery ctrlr\n");
				ctx->ctrlr = NULL;
			}
		}
	}

	return SPDK_POLLER_BUSY;
}
7504 :
7505 : static void
7506 0 : start_discovery_poller(void *arg)
7507 : {
7508 0 : struct discovery_ctx *ctx = arg;
7509 :
7510 0 : TAILQ_INSERT_TAIL(&g_discovery_ctxs, ctx, tailq);
7511 0 : ctx->poller = SPDK_POLLER_REGISTER(discovery_poller, ctx, 1000 * 1000);
7512 0 : }
7513 :
7514 : int
7515 0 : bdev_nvme_start_discovery(struct spdk_nvme_transport_id *trid,
7516 : const char *base_name,
7517 : struct spdk_nvme_ctrlr_opts *drv_opts,
7518 : struct spdk_bdev_nvme_ctrlr_opts *bdev_opts,
7519 : uint64_t attach_timeout,
7520 : bool from_mdns,
7521 : spdk_bdev_nvme_start_discovery_fn cb_fn, void *cb_ctx)
7522 : {
7523 : struct discovery_ctx *ctx;
7524 : struct discovery_entry_ctx *discovery_entry_ctx;
7525 :
7526 0 : snprintf(trid->subnqn, sizeof(trid->subnqn), "%s", SPDK_NVMF_DISCOVERY_NQN);
7527 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
7528 0 : if (strcmp(ctx->name, base_name) == 0) {
7529 0 : return -EEXIST;
7530 : }
7531 :
7532 0 : if (ctx->entry_ctx_in_use != NULL) {
7533 0 : if (!spdk_nvme_transport_id_compare(trid, &ctx->entry_ctx_in_use->trid)) {
7534 0 : return -EEXIST;
7535 : }
7536 0 : }
7537 :
7538 0 : TAILQ_FOREACH(discovery_entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
7539 0 : if (!spdk_nvme_transport_id_compare(trid, &discovery_entry_ctx->trid)) {
7540 0 : return -EEXIST;
7541 : }
7542 0 : }
7543 0 : }
7544 :
7545 0 : ctx = calloc(1, sizeof(*ctx));
7546 0 : if (ctx == NULL) {
7547 0 : return -ENOMEM;
7548 : }
7549 :
7550 0 : ctx->name = strdup(base_name);
7551 0 : if (ctx->name == NULL) {
7552 0 : free_discovery_ctx(ctx);
7553 0 : return -ENOMEM;
7554 : }
7555 0 : memcpy(&ctx->drv_opts, drv_opts, sizeof(*drv_opts));
7556 0 : memcpy(&ctx->bdev_opts, bdev_opts, sizeof(*bdev_opts));
7557 0 : ctx->from_mdns_discovery_service = from_mdns;
7558 0 : ctx->bdev_opts.from_discovery_service = true;
7559 0 : ctx->calling_thread = spdk_get_thread();
7560 0 : ctx->start_cb_fn = cb_fn;
7561 0 : ctx->cb_ctx = cb_ctx;
7562 0 : ctx->initializing = true;
7563 0 : if (ctx->start_cb_fn) {
7564 : /* We can use this when dumping json to denote if this RPC parameter
7565 : * was specified or not.
7566 : */
7567 0 : ctx->wait_for_attach = true;
7568 0 : }
7569 0 : if (attach_timeout != 0) {
7570 0 : ctx->timeout_ticks = spdk_get_ticks() + attach_timeout *
7571 0 : spdk_get_ticks_hz() / 1000ull;
7572 0 : }
7573 0 : TAILQ_INIT(&ctx->nvm_entry_ctxs);
7574 0 : TAILQ_INIT(&ctx->discovery_entry_ctxs);
7575 0 : memcpy(&ctx->trid, trid, sizeof(*trid));
7576 : /* Even if user did not specify hostnqn, we can still strdup("\0"); */
7577 0 : ctx->hostnqn = strdup(ctx->drv_opts.hostnqn);
7578 0 : if (ctx->hostnqn == NULL) {
7579 0 : free_discovery_ctx(ctx);
7580 0 : return -ENOMEM;
7581 : }
7582 0 : discovery_entry_ctx = create_discovery_entry_ctx(ctx, trid);
7583 0 : if (discovery_entry_ctx == NULL) {
7584 0 : DISCOVERY_ERRLOG(ctx, "could not allocate new entry_ctx\n");
7585 0 : free_discovery_ctx(ctx);
7586 0 : return -ENOMEM;
7587 : }
7588 :
7589 0 : TAILQ_INSERT_TAIL(&ctx->discovery_entry_ctxs, discovery_entry_ctx, tailq);
7590 0 : spdk_thread_send_msg(g_bdev_nvme_init_thread, start_discovery_poller, ctx);
7591 0 : return 0;
7592 0 : }
7593 :
7594 : int
7595 0 : bdev_nvme_stop_discovery(const char *name, spdk_bdev_nvme_stop_discovery_fn cb_fn, void *cb_ctx)
7596 : {
7597 : struct discovery_ctx *ctx;
7598 :
7599 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
7600 0 : if (strcmp(name, ctx->name) == 0) {
7601 0 : if (ctx->stop) {
7602 0 : return -EALREADY;
7603 : }
7604 : /* If we're still starting the discovery service and ->rc is non-zero, we're
7605 : * going to stop it as soon as we can
7606 : */
7607 0 : if (ctx->initializing && ctx->rc != 0) {
7608 0 : return -EALREADY;
7609 : }
7610 0 : stop_discovery(ctx, cb_fn, cb_ctx);
7611 0 : return 0;
7612 : }
7613 0 : }
7614 :
7615 0 : return -ENOENT;
7616 0 : }
7617 :
7618 : static int
7619 1 : bdev_nvme_library_init(void)
7620 : {
7621 1 : g_bdev_nvme_init_thread = spdk_get_thread();
7622 :
7623 1 : spdk_io_device_register(&g_nvme_bdev_ctrlrs, bdev_nvme_create_poll_group_cb,
7624 : bdev_nvme_destroy_poll_group_cb,
7625 : sizeof(struct nvme_poll_group), "nvme_poll_groups");
7626 :
7627 1 : return 0;
7628 : }
7629 :
7630 : static void
7631 1 : bdev_nvme_fini_destruct_ctrlrs(void)
7632 : {
7633 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
7634 : struct nvme_ctrlr *nvme_ctrlr;
7635 :
7636 1 : pthread_mutex_lock(&g_bdev_nvme_mutex);
7637 1 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
7638 0 : TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
7639 0 : pthread_mutex_lock(&nvme_ctrlr->mutex);
7640 0 : if (nvme_ctrlr->destruct) {
7641 : /* This controller's destruction was already started
7642 : * before the application started shutting down
7643 : */
7644 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
7645 0 : continue;
7646 : }
7647 0 : nvme_ctrlr->destruct = true;
7648 0 : pthread_mutex_unlock(&nvme_ctrlr->mutex);
7649 :
7650 0 : spdk_thread_send_msg(nvme_ctrlr->thread, _nvme_ctrlr_destruct,
7651 0 : nvme_ctrlr);
7652 0 : }
7653 0 : }
7654 :
7655 1 : g_bdev_nvme_module_finish = true;
7656 1 : if (TAILQ_EMPTY(&g_nvme_bdev_ctrlrs)) {
7657 1 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
7658 1 : spdk_io_device_unregister(&g_nvme_bdev_ctrlrs, NULL);
7659 1 : spdk_bdev_module_fini_done();
7660 1 : return;
7661 : }
7662 :
7663 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
7664 1 : }
7665 :
7666 : static void
7667 0 : check_discovery_fini(void *arg)
7668 : {
7669 0 : if (TAILQ_EMPTY(&g_discovery_ctxs)) {
7670 0 : bdev_nvme_fini_destruct_ctrlrs();
7671 0 : }
7672 0 : }
7673 :
7674 : static void
7675 1 : bdev_nvme_library_fini(void)
7676 : {
7677 : struct nvme_probe_skip_entry *entry, *entry_tmp;
7678 : struct discovery_ctx *ctx;
7679 :
7680 1 : spdk_poller_unregister(&g_hotplug_poller);
7681 1 : free(g_hotplug_probe_ctx);
7682 1 : g_hotplug_probe_ctx = NULL;
7683 :
7684 1 : TAILQ_FOREACH_SAFE(entry, &g_skipped_nvme_ctrlrs, tailq, entry_tmp) {
7685 0 : TAILQ_REMOVE(&g_skipped_nvme_ctrlrs, entry, tailq);
7686 0 : free(entry);
7687 0 : }
7688 :
7689 1 : assert(spdk_get_thread() == g_bdev_nvme_init_thread);
7690 1 : if (TAILQ_EMPTY(&g_discovery_ctxs)) {
7691 1 : bdev_nvme_fini_destruct_ctrlrs();
7692 1 : } else {
7693 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
7694 0 : stop_discovery(ctx, check_discovery_fini, NULL);
7695 0 : }
7696 : }
7697 1 : }
7698 :
7699 : static void
7700 0 : bdev_nvme_verify_pi_error(struct nvme_bdev_io *bio)
7701 : {
7702 0 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7703 0 : struct spdk_bdev *bdev = bdev_io->bdev;
7704 : struct spdk_dif_ctx dif_ctx;
7705 0 : struct spdk_dif_error err_blk = {};
7706 : int rc;
7707 : struct spdk_dif_ctx_init_ext_opts dif_opts;
7708 :
7709 0 : dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format);
7710 0 : dif_opts.dif_pi_format = bdev->dif_pi_format;
7711 0 : rc = spdk_dif_ctx_init(&dif_ctx,
7712 0 : bdev->blocklen, bdev->md_len, bdev->md_interleave,
7713 0 : bdev->dif_is_head_of_md, bdev->dif_type,
7714 0 : bdev_io->u.bdev.dif_check_flags,
7715 0 : bdev_io->u.bdev.offset_blocks, 0, 0, 0, 0, &dif_opts);
7716 0 : if (rc != 0) {
7717 0 : SPDK_ERRLOG("Initialization of DIF context failed\n");
7718 0 : return;
7719 : }
7720 :
7721 0 : if (bdev->md_interleave) {
7722 0 : rc = spdk_dif_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
7723 0 : bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
7724 0 : } else {
7725 0 : struct iovec md_iov = {
7726 0 : .iov_base = bdev_io->u.bdev.md_buf,
7727 0 : .iov_len = bdev_io->u.bdev.num_blocks * bdev->md_len,
7728 : };
7729 :
7730 0 : rc = spdk_dix_verify(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
7731 0 : &md_iov, bdev_io->u.bdev.num_blocks, &dif_ctx, &err_blk);
7732 : }
7733 :
7734 0 : if (rc != 0) {
7735 0 : SPDK_ERRLOG("DIF error detected. type=%d, offset=%" PRIu32 "\n",
7736 : err_blk.err_type, err_blk.err_offset);
7737 0 : } else {
7738 0 : SPDK_ERRLOG("Hardware reported PI error but SPDK could not find any.\n");
7739 : }
7740 0 : }
7741 :
7742 : static void
7743 0 : bdev_nvme_no_pi_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
7744 : {
7745 0 : struct nvme_bdev_io *bio = ref;
7746 :
7747 0 : if (spdk_nvme_cpl_is_success(cpl)) {
7748 : /* Run PI verification for read data buffer. */
7749 0 : bdev_nvme_verify_pi_error(bio);
7750 0 : }
7751 :
7752 : /* Return original completion status */
7753 0 : bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
7754 0 : }
7755 :
7756 : static void
7757 3 : bdev_nvme_readv_done(void *ref, const struct spdk_nvme_cpl *cpl)
7758 : {
7759 3 : struct nvme_bdev_io *bio = ref;
7760 3 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7761 : int ret;
7762 :
7763 3 : if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
7764 0 : SPDK_ERRLOG("readv completed with PI error (sct=%d, sc=%d)\n",
7765 : cpl->status.sct, cpl->status.sc);
7766 :
7767 : /* Save completion status to use after verifying PI error. */
7768 0 : bio->cpl = *cpl;
7769 :
7770 0 : if (spdk_likely(nvme_io_path_is_available(bio->io_path))) {
7771 : /* Read without PI checking to verify PI error. */
7772 0 : ret = bdev_nvme_no_pi_readv(bio,
7773 0 : bdev_io->u.bdev.iovs,
7774 0 : bdev_io->u.bdev.iovcnt,
7775 0 : bdev_io->u.bdev.md_buf,
7776 0 : bdev_io->u.bdev.num_blocks,
7777 0 : bdev_io->u.bdev.offset_blocks);
7778 0 : if (ret == 0) {
7779 0 : return;
7780 : }
7781 0 : }
7782 0 : }
7783 :
7784 3 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7785 3 : }
7786 :
7787 : static void
7788 25 : bdev_nvme_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
7789 : {
7790 25 : struct nvme_bdev_io *bio = ref;
7791 :
7792 25 : if (spdk_unlikely(spdk_nvme_cpl_is_pi_error(cpl))) {
7793 0 : SPDK_ERRLOG("writev completed with PI error (sct=%d, sc=%d)\n",
7794 : cpl->status.sct, cpl->status.sc);
7795 : /* Run PI verification for write data buffer if PI error is detected. */
7796 0 : bdev_nvme_verify_pi_error(bio);
7797 0 : }
7798 :
7799 25 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7800 25 : }
7801 :
7802 : static void
7803 0 : bdev_nvme_zone_appendv_done(void *ref, const struct spdk_nvme_cpl *cpl)
7804 : {
7805 0 : struct nvme_bdev_io *bio = ref;
7806 0 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7807 :
7808 : /* spdk_bdev_io_get_append_location() requires that the ALBA is stored in offset_blocks.
7809 : * Additionally, offset_blocks has to be set before calling bdev_nvme_verify_pi_error().
7810 : */
7811 0 : bdev_io->u.bdev.offset_blocks = *(uint64_t *)&cpl->cdw0;
7812 :
7813 0 : if (spdk_nvme_cpl_is_pi_error(cpl)) {
7814 0 : SPDK_ERRLOG("zone append completed with PI error (sct=%d, sc=%d)\n",
7815 : cpl->status.sct, cpl->status.sc);
7816 : /* Run PI verification for zone append data buffer if PI error is detected. */
7817 0 : bdev_nvme_verify_pi_error(bio);
7818 0 : }
7819 :
7820 0 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7821 0 : }
7822 :
7823 : static void
7824 1 : bdev_nvme_comparev_done(void *ref, const struct spdk_nvme_cpl *cpl)
7825 : {
7826 1 : struct nvme_bdev_io *bio = ref;
7827 :
7828 1 : if (spdk_nvme_cpl_is_pi_error(cpl)) {
7829 0 : SPDK_ERRLOG("comparev completed with PI error (sct=%d, sc=%d)\n",
7830 : cpl->status.sct, cpl->status.sc);
7831 : /* Run PI verification for compare data buffer if PI error is detected. */
7832 0 : bdev_nvme_verify_pi_error(bio);
7833 0 : }
7834 :
7835 1 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7836 1 : }
7837 :
7838 : static void
7839 4 : bdev_nvme_comparev_and_writev_done(void *ref, const struct spdk_nvme_cpl *cpl)
7840 : {
7841 4 : struct nvme_bdev_io *bio = ref;
7842 :
7843 : /* Compare operation completion */
7844 4 : if (!bio->first_fused_completed) {
7845 : /* Save compare result for write callback */
7846 2 : bio->cpl = *cpl;
7847 2 : bio->first_fused_completed = true;
7848 2 : return;
7849 : }
7850 :
7851 : /* Write operation completion */
7852 2 : if (spdk_nvme_cpl_is_error(&bio->cpl)) {
7853 : /* If bio->cpl is already an error, it means the compare operation failed. In that case,
7854 : * complete the IO with the compare operation's status.
7855 : */
7856 1 : if (!spdk_nvme_cpl_is_error(cpl)) {
7857 1 : SPDK_ERRLOG("Unexpected write success after compare failure.\n");
7858 1 : }
7859 :
7860 1 : bdev_nvme_io_complete_nvme_status(bio, &bio->cpl);
7861 1 : } else {
7862 1 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7863 : }
7864 4 : }
7865 :
/*
 * Generic completion callback: complete the nvme_bdev_io passed as `ref` with
 * the NVMe status carried in `cpl`, with no additional processing.
 */
static void
bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev_io *nbdev_io = ref;

	bdev_nvme_io_complete_nvme_status(nbdev_io, cpl);
}
7873 :
7874 : static int
7875 0 : fill_zone_from_report(struct spdk_bdev_zone_info *info, struct spdk_nvme_zns_zone_desc *desc)
7876 : {
7877 0 : switch (desc->zt) {
7878 : case SPDK_NVME_ZONE_TYPE_SEQWR:
7879 0 : info->type = SPDK_BDEV_ZONE_TYPE_SEQWR;
7880 0 : break;
7881 : default:
7882 0 : SPDK_ERRLOG("Invalid zone type: %#x in zone report\n", desc->zt);
7883 0 : return -EIO;
7884 : }
7885 :
7886 0 : switch (desc->zs) {
7887 : case SPDK_NVME_ZONE_STATE_EMPTY:
7888 0 : info->state = SPDK_BDEV_ZONE_STATE_EMPTY;
7889 0 : break;
7890 : case SPDK_NVME_ZONE_STATE_IOPEN:
7891 0 : info->state = SPDK_BDEV_ZONE_STATE_IMP_OPEN;
7892 0 : break;
7893 : case SPDK_NVME_ZONE_STATE_EOPEN:
7894 0 : info->state = SPDK_BDEV_ZONE_STATE_EXP_OPEN;
7895 0 : break;
7896 : case SPDK_NVME_ZONE_STATE_CLOSED:
7897 0 : info->state = SPDK_BDEV_ZONE_STATE_CLOSED;
7898 0 : break;
7899 : case SPDK_NVME_ZONE_STATE_RONLY:
7900 0 : info->state = SPDK_BDEV_ZONE_STATE_READ_ONLY;
7901 0 : break;
7902 : case SPDK_NVME_ZONE_STATE_FULL:
7903 0 : info->state = SPDK_BDEV_ZONE_STATE_FULL;
7904 0 : break;
7905 : case SPDK_NVME_ZONE_STATE_OFFLINE:
7906 0 : info->state = SPDK_BDEV_ZONE_STATE_OFFLINE;
7907 0 : break;
7908 : default:
7909 0 : SPDK_ERRLOG("Invalid zone state: %#x in zone report\n", desc->zs);
7910 0 : return -EIO;
7911 : }
7912 :
7913 0 : info->zone_id = desc->zslba;
7914 0 : info->write_pointer = desc->wp;
7915 0 : info->capacity = desc->zcap;
7916 :
7917 0 : return 0;
7918 0 : }
7919 :
7920 : static void
7921 0 : bdev_nvme_get_zone_info_done(void *ref, const struct spdk_nvme_cpl *cpl)
7922 : {
7923 0 : struct nvme_bdev_io *bio = ref;
7924 0 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
7925 0 : uint64_t zone_id = bdev_io->u.zone_mgmt.zone_id;
7926 0 : uint32_t zones_to_copy = bdev_io->u.zone_mgmt.num_zones;
7927 0 : struct spdk_bdev_zone_info *info = bdev_io->u.zone_mgmt.buf;
7928 : uint64_t max_zones_per_buf, i;
7929 : uint32_t zone_report_bufsize;
7930 : struct spdk_nvme_ns *ns;
7931 : struct spdk_nvme_qpair *qpair;
7932 : int ret;
7933 :
7934 0 : if (spdk_nvme_cpl_is_error(cpl)) {
7935 0 : goto out_complete_io_nvme_cpl;
7936 : }
7937 :
7938 0 : if (spdk_unlikely(!nvme_io_path_is_available(bio->io_path))) {
7939 0 : ret = -ENXIO;
7940 0 : goto out_complete_io_ret;
7941 : }
7942 :
7943 0 : ns = bio->io_path->nvme_ns->ns;
7944 0 : qpair = bio->io_path->qpair->qpair;
7945 :
7946 0 : zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
7947 0 : max_zones_per_buf = (zone_report_bufsize - sizeof(*bio->zone_report_buf)) /
7948 : sizeof(bio->zone_report_buf->descs[0]);
7949 :
7950 0 : if (bio->zone_report_buf->nr_zones > max_zones_per_buf) {
7951 0 : ret = -EINVAL;
7952 0 : goto out_complete_io_ret;
7953 : }
7954 :
7955 0 : if (!bio->zone_report_buf->nr_zones) {
7956 0 : ret = -EINVAL;
7957 0 : goto out_complete_io_ret;
7958 : }
7959 :
7960 0 : for (i = 0; i < bio->zone_report_buf->nr_zones && bio->handled_zones < zones_to_copy; i++) {
7961 0 : ret = fill_zone_from_report(&info[bio->handled_zones],
7962 0 : &bio->zone_report_buf->descs[i]);
7963 0 : if (ret) {
7964 0 : goto out_complete_io_ret;
7965 : }
7966 0 : bio->handled_zones++;
7967 0 : }
7968 :
7969 0 : if (bio->handled_zones < zones_to_copy) {
7970 0 : uint64_t zone_size_lba = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
7971 0 : uint64_t slba = zone_id + (zone_size_lba * bio->handled_zones);
7972 :
7973 0 : memset(bio->zone_report_buf, 0, zone_report_bufsize);
7974 0 : ret = spdk_nvme_zns_report_zones(ns, qpair,
7975 0 : bio->zone_report_buf, zone_report_bufsize,
7976 0 : slba, SPDK_NVME_ZRA_LIST_ALL, true,
7977 0 : bdev_nvme_get_zone_info_done, bio);
7978 0 : if (!ret) {
7979 0 : return;
7980 : } else {
7981 0 : goto out_complete_io_ret;
7982 : }
7983 : }
7984 :
7985 : out_complete_io_nvme_cpl:
7986 0 : free(bio->zone_report_buf);
7987 0 : bio->zone_report_buf = NULL;
7988 0 : bdev_nvme_io_complete_nvme_status(bio, cpl);
7989 0 : return;
7990 :
7991 : out_complete_io_ret:
7992 0 : free(bio->zone_report_buf);
7993 0 : bio->zone_report_buf = NULL;
7994 0 : bdev_nvme_io_complete(bio, ret);
7995 0 : }
7996 :
/*
 * Completion callback for zone management commands: complete the nvme_bdev_io
 * passed as `ref` with the NVMe status in `cpl`.
 */
static void
bdev_nvme_zone_management_done(void *ref, const struct spdk_nvme_cpl *cpl)
{
	struct nvme_bdev_io *nbdev_io = ref;

	bdev_nvme_io_complete_nvme_status(nbdev_io, cpl);
}
8004 :
8005 : static void
8006 4 : bdev_nvme_admin_passthru_complete_nvme_status(void *ctx)
8007 : {
8008 4 : struct nvme_bdev_io *bio = ctx;
8009 4 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8010 4 : const struct spdk_nvme_cpl *cpl = &bio->cpl;
8011 :
8012 4 : assert(bdev_nvme_io_type_is_admin(bdev_io->type));
8013 :
8014 4 : __bdev_nvme_io_complete(bdev_io, 0, cpl);
8015 4 : }
8016 :
8017 : static void
8018 3 : bdev_nvme_abort_complete(void *ctx)
8019 : {
8020 3 : struct nvme_bdev_io *bio = ctx;
8021 3 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8022 :
8023 3 : if (spdk_nvme_cpl_is_abort_success(&bio->cpl)) {
8024 3 : __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS, NULL);
8025 3 : } else {
8026 0 : __bdev_nvme_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED, NULL);
8027 : }
8028 3 : }
8029 :
8030 : static void
8031 3 : bdev_nvme_abort_done(void *ref, const struct spdk_nvme_cpl *cpl)
8032 : {
8033 3 : struct nvme_bdev_io *bio = ref;
8034 3 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8035 :
8036 3 : bio->cpl = *cpl;
8037 3 : spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_nvme_abort_complete, bio);
8038 3 : }
8039 :
8040 : static void
8041 4 : bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl)
8042 : {
8043 4 : struct nvme_bdev_io *bio = ref;
8044 4 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8045 :
8046 4 : bio->cpl = *cpl;
8047 8 : spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io),
8048 4 : bdev_nvme_admin_passthru_complete_nvme_status, bio);
8049 4 : }
8050 :
8051 : static void
8052 0 : bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset)
8053 : {
8054 0 : struct nvme_bdev_io *bio = ref;
8055 : struct iovec *iov;
8056 :
8057 0 : bio->iov_offset = sgl_offset;
8058 0 : for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) {
8059 0 : iov = &bio->iovs[bio->iovpos];
8060 0 : if (bio->iov_offset < iov->iov_len) {
8061 0 : break;
8062 : }
8063 :
8064 0 : bio->iov_offset -= iov->iov_len;
8065 0 : }
8066 0 : }
8067 :
8068 : static int
8069 0 : bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length)
8070 : {
8071 0 : struct nvme_bdev_io *bio = ref;
8072 : struct iovec *iov;
8073 :
8074 0 : assert(bio->iovpos < bio->iovcnt);
8075 :
8076 0 : iov = &bio->iovs[bio->iovpos];
8077 :
8078 0 : *address = iov->iov_base;
8079 0 : *length = iov->iov_len;
8080 :
8081 0 : if (bio->iov_offset) {
8082 0 : assert(bio->iov_offset <= iov->iov_len);
8083 0 : *address += bio->iov_offset;
8084 0 : *length -= bio->iov_offset;
8085 0 : }
8086 :
8087 0 : bio->iov_offset += *length;
8088 0 : if (bio->iov_offset == iov->iov_len) {
8089 0 : bio->iovpos++;
8090 0 : bio->iov_offset = 0;
8091 0 : }
8092 :
8093 0 : return 0;
8094 : }
8095 :
8096 : static void
8097 0 : bdev_nvme_queued_reset_fused_sgl(void *ref, uint32_t sgl_offset)
8098 : {
8099 0 : struct nvme_bdev_io *bio = ref;
8100 : struct iovec *iov;
8101 :
8102 0 : bio->fused_iov_offset = sgl_offset;
8103 0 : for (bio->fused_iovpos = 0; bio->fused_iovpos < bio->fused_iovcnt; bio->fused_iovpos++) {
8104 0 : iov = &bio->fused_iovs[bio->fused_iovpos];
8105 0 : if (bio->fused_iov_offset < iov->iov_len) {
8106 0 : break;
8107 : }
8108 :
8109 0 : bio->fused_iov_offset -= iov->iov_len;
8110 0 : }
8111 0 : }
8112 :
8113 : static int
8114 0 : bdev_nvme_queued_next_fused_sge(void *ref, void **address, uint32_t *length)
8115 : {
8116 0 : struct nvme_bdev_io *bio = ref;
8117 : struct iovec *iov;
8118 :
8119 0 : assert(bio->fused_iovpos < bio->fused_iovcnt);
8120 :
8121 0 : iov = &bio->fused_iovs[bio->fused_iovpos];
8122 :
8123 0 : *address = iov->iov_base;
8124 0 : *length = iov->iov_len;
8125 :
8126 0 : if (bio->fused_iov_offset) {
8127 0 : assert(bio->fused_iov_offset <= iov->iov_len);
8128 0 : *address += bio->fused_iov_offset;
8129 0 : *length -= bio->fused_iov_offset;
8130 0 : }
8131 :
8132 0 : bio->fused_iov_offset += *length;
8133 0 : if (bio->fused_iov_offset == iov->iov_len) {
8134 0 : bio->fused_iovpos++;
8135 0 : bio->fused_iov_offset = 0;
8136 0 : }
8137 :
8138 0 : return 0;
8139 : }
8140 :
8141 : static int
8142 0 : bdev_nvme_no_pi_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8143 : void *md, uint64_t lba_count, uint64_t lba)
8144 : {
8145 : int rc;
8146 :
8147 0 : SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 " without PI check\n",
8148 : lba_count, lba);
8149 :
8150 0 : bio->iovs = iov;
8151 0 : bio->iovcnt = iovcnt;
8152 0 : bio->iovpos = 0;
8153 0 : bio->iov_offset = 0;
8154 :
8155 0 : rc = spdk_nvme_ns_cmd_readv_with_md(bio->io_path->nvme_ns->ns,
8156 0 : bio->io_path->qpair->qpair,
8157 0 : lba, lba_count,
8158 0 : bdev_nvme_no_pi_readv_done, bio, 0,
8159 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
8160 0 : md, 0, 0);
8161 :
8162 0 : if (rc != 0 && rc != -ENOMEM) {
8163 0 : SPDK_ERRLOG("no_pi_readv failed: rc = %d\n", rc);
8164 0 : }
8165 0 : return rc;
8166 : }
8167 :
8168 : static int
8169 3 : bdev_nvme_readv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8170 : void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
8171 : struct spdk_memory_domain *domain, void *domain_ctx,
8172 : struct spdk_accel_sequence *seq)
8173 : {
8174 3 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8175 3 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8176 : int rc;
8177 :
8178 3 : SPDK_DEBUGLOG(bdev_nvme, "read %" PRIu64 " blocks with offset %#" PRIx64 "\n",
8179 : lba_count, lba);
8180 :
8181 3 : bio->iovs = iov;
8182 3 : bio->iovcnt = iovcnt;
8183 3 : bio->iovpos = 0;
8184 3 : bio->iov_offset = 0;
8185 :
8186 3 : if (domain != NULL || seq != NULL) {
8187 1 : bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence);
8188 1 : bio->ext_opts.memory_domain = domain;
8189 1 : bio->ext_opts.memory_domain_ctx = domain_ctx;
8190 1 : bio->ext_opts.io_flags = flags;
8191 1 : bio->ext_opts.metadata = md;
8192 1 : bio->ext_opts.accel_sequence = seq;
8193 :
8194 1 : if (iovcnt == 1) {
8195 2 : rc = spdk_nvme_ns_cmd_read_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_readv_done,
8196 1 : bio, &bio->ext_opts);
8197 1 : } else {
8198 0 : rc = spdk_nvme_ns_cmd_readv_ext(ns, qpair, lba, lba_count,
8199 0 : bdev_nvme_readv_done, bio,
8200 : bdev_nvme_queued_reset_sgl,
8201 : bdev_nvme_queued_next_sge,
8202 0 : &bio->ext_opts);
8203 : }
8204 3 : } else if (iovcnt == 1) {
8205 4 : rc = spdk_nvme_ns_cmd_read_with_md(ns, qpair, iov[0].iov_base,
8206 2 : md, lba, lba_count, bdev_nvme_readv_done,
8207 2 : bio, flags, 0, 0);
8208 2 : } else {
8209 0 : rc = spdk_nvme_ns_cmd_readv_with_md(ns, qpair, lba, lba_count,
8210 0 : bdev_nvme_readv_done, bio, flags,
8211 : bdev_nvme_queued_reset_sgl,
8212 0 : bdev_nvme_queued_next_sge, md, 0, 0);
8213 : }
8214 :
8215 3 : if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
8216 0 : SPDK_ERRLOG("readv failed: rc = %d\n", rc);
8217 0 : }
8218 3 : return rc;
8219 : }
8220 :
8221 : static int
8222 25 : bdev_nvme_writev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8223 : void *md, uint64_t lba_count, uint64_t lba, uint32_t flags,
8224 : struct spdk_memory_domain *domain, void *domain_ctx,
8225 : struct spdk_accel_sequence *seq,
8226 : union spdk_bdev_nvme_cdw12 cdw12, union spdk_bdev_nvme_cdw13 cdw13)
8227 : {
8228 25 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8229 25 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8230 : int rc;
8231 :
8232 25 : SPDK_DEBUGLOG(bdev_nvme, "write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
8233 : lba_count, lba);
8234 :
8235 25 : bio->iovs = iov;
8236 25 : bio->iovcnt = iovcnt;
8237 25 : bio->iovpos = 0;
8238 25 : bio->iov_offset = 0;
8239 :
8240 25 : if (domain != NULL || seq != NULL) {
8241 0 : bio->ext_opts.size = SPDK_SIZEOF(&bio->ext_opts, accel_sequence);
8242 0 : bio->ext_opts.memory_domain = domain;
8243 0 : bio->ext_opts.memory_domain_ctx = domain_ctx;
8244 0 : bio->ext_opts.io_flags = flags | SPDK_NVME_IO_FLAGS_DIRECTIVE(cdw12.write.dtype);
8245 0 : bio->ext_opts.cdw13 = cdw13.raw;
8246 0 : bio->ext_opts.metadata = md;
8247 0 : bio->ext_opts.accel_sequence = seq;
8248 :
8249 0 : if (iovcnt == 1) {
8250 0 : rc = spdk_nvme_ns_cmd_write_ext(ns, qpair, iov[0].iov_base, lba, lba_count, bdev_nvme_writev_done,
8251 0 : bio, &bio->ext_opts);
8252 0 : } else {
8253 0 : rc = spdk_nvme_ns_cmd_writev_ext(ns, qpair, lba, lba_count,
8254 0 : bdev_nvme_writev_done, bio,
8255 : bdev_nvme_queued_reset_sgl,
8256 : bdev_nvme_queued_next_sge,
8257 0 : &bio->ext_opts);
8258 : }
8259 25 : } else if (iovcnt == 1) {
8260 50 : rc = spdk_nvme_ns_cmd_write_with_md(ns, qpair, iov[0].iov_base,
8261 25 : md, lba, lba_count, bdev_nvme_writev_done,
8262 25 : bio, flags, 0, 0);
8263 25 : } else {
8264 0 : rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
8265 0 : bdev_nvme_writev_done, bio, flags,
8266 : bdev_nvme_queued_reset_sgl,
8267 0 : bdev_nvme_queued_next_sge, md, 0, 0);
8268 : }
8269 :
8270 25 : if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
8271 0 : SPDK_ERRLOG("writev failed: rc = %d\n", rc);
8272 0 : }
8273 25 : return rc;
8274 : }
8275 :
8276 : static int
8277 0 : bdev_nvme_zone_appendv(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8278 : void *md, uint64_t lba_count, uint64_t zslba,
8279 : uint32_t flags)
8280 : {
8281 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8282 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8283 : int rc;
8284 :
8285 0 : SPDK_DEBUGLOG(bdev_nvme, "zone append %" PRIu64 " blocks to zone start lba %#" PRIx64 "\n",
8286 : lba_count, zslba);
8287 :
8288 0 : bio->iovs = iov;
8289 0 : bio->iovcnt = iovcnt;
8290 0 : bio->iovpos = 0;
8291 0 : bio->iov_offset = 0;
8292 :
8293 0 : if (iovcnt == 1) {
8294 0 : rc = spdk_nvme_zns_zone_append_with_md(ns, qpair, iov[0].iov_base, md, zslba,
8295 0 : lba_count,
8296 0 : bdev_nvme_zone_appendv_done, bio,
8297 0 : flags,
8298 : 0, 0);
8299 0 : } else {
8300 0 : rc = spdk_nvme_zns_zone_appendv_with_md(ns, qpair, zslba, lba_count,
8301 0 : bdev_nvme_zone_appendv_done, bio, flags,
8302 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
8303 0 : md, 0, 0);
8304 : }
8305 :
8306 0 : if (rc != 0 && rc != -ENOMEM) {
8307 0 : SPDK_ERRLOG("zone append failed: rc = %d\n", rc);
8308 0 : }
8309 0 : return rc;
8310 : }
8311 :
8312 : static int
8313 1 : bdev_nvme_comparev(struct nvme_bdev_io *bio, struct iovec *iov, int iovcnt,
8314 : void *md, uint64_t lba_count, uint64_t lba,
8315 : uint32_t flags)
8316 : {
8317 : int rc;
8318 :
8319 1 : SPDK_DEBUGLOG(bdev_nvme, "compare %" PRIu64 " blocks with offset %#" PRIx64 "\n",
8320 : lba_count, lba);
8321 :
8322 1 : bio->iovs = iov;
8323 1 : bio->iovcnt = iovcnt;
8324 1 : bio->iovpos = 0;
8325 1 : bio->iov_offset = 0;
8326 :
8327 2 : rc = spdk_nvme_ns_cmd_comparev_with_md(bio->io_path->nvme_ns->ns,
8328 1 : bio->io_path->qpair->qpair,
8329 1 : lba, lba_count,
8330 1 : bdev_nvme_comparev_done, bio, flags,
8331 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge,
8332 1 : md, 0, 0);
8333 :
8334 1 : if (rc != 0 && rc != -ENOMEM) {
8335 0 : SPDK_ERRLOG("comparev failed: rc = %d\n", rc);
8336 0 : }
8337 1 : return rc;
8338 : }
8339 :
8340 : static int
8341 2 : bdev_nvme_comparev_and_writev(struct nvme_bdev_io *bio, struct iovec *cmp_iov, int cmp_iovcnt,
8342 : struct iovec *write_iov, int write_iovcnt,
8343 : void *md, uint64_t lba_count, uint64_t lba, uint32_t flags)
8344 : {
8345 2 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8346 2 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8347 2 : struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio);
8348 : int rc;
8349 :
8350 2 : SPDK_DEBUGLOG(bdev_nvme, "compare and write %" PRIu64 " blocks with offset %#" PRIx64 "\n",
8351 : lba_count, lba);
8352 :
8353 2 : bio->iovs = cmp_iov;
8354 2 : bio->iovcnt = cmp_iovcnt;
8355 2 : bio->iovpos = 0;
8356 2 : bio->iov_offset = 0;
8357 2 : bio->fused_iovs = write_iov;
8358 2 : bio->fused_iovcnt = write_iovcnt;
8359 2 : bio->fused_iovpos = 0;
8360 2 : bio->fused_iov_offset = 0;
8361 :
8362 2 : if (bdev_io->num_retries == 0) {
8363 2 : bio->first_fused_submitted = false;
8364 2 : bio->first_fused_completed = false;
8365 2 : }
8366 :
8367 2 : if (!bio->first_fused_submitted) {
8368 2 : flags |= SPDK_NVME_IO_FLAGS_FUSE_FIRST;
8369 2 : memset(&bio->cpl, 0, sizeof(bio->cpl));
8370 :
8371 4 : rc = spdk_nvme_ns_cmd_comparev_with_md(ns, qpair, lba, lba_count,
8372 2 : bdev_nvme_comparev_and_writev_done, bio, flags,
8373 2 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge, md, 0, 0);
8374 2 : if (rc == 0) {
8375 2 : bio->first_fused_submitted = true;
8376 2 : flags &= ~SPDK_NVME_IO_FLAGS_FUSE_FIRST;
8377 2 : } else {
8378 0 : if (rc != -ENOMEM) {
8379 0 : SPDK_ERRLOG("compare failed: rc = %d\n", rc);
8380 0 : }
8381 0 : return rc;
8382 : }
8383 2 : }
8384 :
8385 2 : flags |= SPDK_NVME_IO_FLAGS_FUSE_SECOND;
8386 :
8387 4 : rc = spdk_nvme_ns_cmd_writev_with_md(ns, qpair, lba, lba_count,
8388 2 : bdev_nvme_comparev_and_writev_done, bio, flags,
8389 2 : bdev_nvme_queued_reset_fused_sgl, bdev_nvme_queued_next_fused_sge, md, 0, 0);
8390 2 : if (rc != 0 && rc != -ENOMEM) {
8391 0 : SPDK_ERRLOG("write failed: rc = %d\n", rc);
8392 0 : rc = 0;
8393 0 : }
8394 :
8395 2 : return rc;
8396 2 : }
8397 :
8398 : static int
8399 1 : bdev_nvme_unmap(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
8400 : {
8401 : struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES];
8402 : struct spdk_nvme_dsm_range *range;
8403 : uint64_t offset, remaining;
8404 : uint64_t num_ranges_u64;
8405 : uint16_t num_ranges;
8406 : int rc;
8407 :
8408 1 : num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) /
8409 : SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
8410 1 : if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) {
8411 0 : SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks);
8412 0 : return -EINVAL;
8413 : }
8414 1 : num_ranges = (uint16_t)num_ranges_u64;
8415 :
8416 1 : offset = offset_blocks;
8417 1 : remaining = num_blocks;
8418 1 : range = &dsm_ranges[0];
8419 :
8420 : /* Fill max-size ranges until the remaining blocks fit into one range */
8421 1 : while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) {
8422 0 : range->attributes.raw = 0;
8423 0 : range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
8424 0 : range->starting_lba = offset;
8425 :
8426 0 : offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
8427 0 : remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS;
8428 0 : range++;
8429 : }
8430 :
8431 : /* Final range describes the remaining blocks */
8432 1 : range->attributes.raw = 0;
8433 1 : range->length = remaining;
8434 1 : range->starting_lba = offset;
8435 :
8436 2 : rc = spdk_nvme_ns_cmd_dataset_management(bio->io_path->nvme_ns->ns,
8437 1 : bio->io_path->qpair->qpair,
8438 : SPDK_NVME_DSM_ATTR_DEALLOCATE,
8439 1 : dsm_ranges, num_ranges,
8440 1 : bdev_nvme_queued_done, bio);
8441 :
8442 1 : return rc;
8443 1 : }
8444 :
8445 : static int
8446 0 : bdev_nvme_write_zeroes(struct nvme_bdev_io *bio, uint64_t offset_blocks, uint64_t num_blocks)
8447 : {
8448 0 : if (num_blocks > UINT16_MAX + 1) {
8449 0 : SPDK_ERRLOG("NVMe write zeroes is limited to 16-bit block count\n");
8450 0 : return -EINVAL;
8451 : }
8452 :
8453 0 : return spdk_nvme_ns_cmd_write_zeroes(bio->io_path->nvme_ns->ns,
8454 0 : bio->io_path->qpair->qpair,
8455 0 : offset_blocks, num_blocks,
8456 0 : bdev_nvme_queued_done, bio,
8457 : 0);
8458 0 : }
8459 :
8460 : static int
8461 0 : bdev_nvme_get_zone_info(struct nvme_bdev_io *bio, uint64_t zone_id, uint32_t num_zones,
8462 : struct spdk_bdev_zone_info *info)
8463 : {
8464 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8465 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8466 0 : uint32_t zone_report_bufsize = spdk_nvme_ns_get_max_io_xfer_size(ns);
8467 0 : uint64_t zone_size = spdk_nvme_zns_ns_get_zone_size_sectors(ns);
8468 0 : uint64_t total_zones = spdk_nvme_zns_ns_get_num_zones(ns);
8469 :
8470 0 : if (zone_id % zone_size != 0) {
8471 0 : return -EINVAL;
8472 : }
8473 :
8474 0 : if (num_zones > total_zones || !num_zones) {
8475 0 : return -EINVAL;
8476 : }
8477 :
8478 0 : assert(!bio->zone_report_buf);
8479 0 : bio->zone_report_buf = calloc(1, zone_report_bufsize);
8480 0 : if (!bio->zone_report_buf) {
8481 0 : return -ENOMEM;
8482 : }
8483 :
8484 0 : bio->handled_zones = 0;
8485 :
8486 0 : return spdk_nvme_zns_report_zones(ns, qpair, bio->zone_report_buf, zone_report_bufsize,
8487 0 : zone_id, SPDK_NVME_ZRA_LIST_ALL, true,
8488 0 : bdev_nvme_get_zone_info_done, bio);
8489 0 : }
8490 :
8491 : static int
8492 0 : bdev_nvme_zone_management(struct nvme_bdev_io *bio, uint64_t zone_id,
8493 : enum spdk_bdev_zone_action action)
8494 : {
8495 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8496 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8497 :
8498 0 : switch (action) {
8499 : case SPDK_BDEV_ZONE_CLOSE:
8500 0 : return spdk_nvme_zns_close_zone(ns, qpair, zone_id, false,
8501 0 : bdev_nvme_zone_management_done, bio);
8502 : case SPDK_BDEV_ZONE_FINISH:
8503 0 : return spdk_nvme_zns_finish_zone(ns, qpair, zone_id, false,
8504 0 : bdev_nvme_zone_management_done, bio);
8505 : case SPDK_BDEV_ZONE_OPEN:
8506 0 : return spdk_nvme_zns_open_zone(ns, qpair, zone_id, false,
8507 0 : bdev_nvme_zone_management_done, bio);
8508 : case SPDK_BDEV_ZONE_RESET:
8509 0 : return spdk_nvme_zns_reset_zone(ns, qpair, zone_id, false,
8510 0 : bdev_nvme_zone_management_done, bio);
8511 : case SPDK_BDEV_ZONE_OFFLINE:
8512 0 : return spdk_nvme_zns_offline_zone(ns, qpair, zone_id, false,
8513 0 : bdev_nvme_zone_management_done, bio);
8514 : default:
8515 0 : return -EINVAL;
8516 : }
8517 0 : }
8518 :
8519 : static void
8520 5 : bdev_nvme_admin_passthru(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
8521 : struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes)
8522 : {
8523 : struct nvme_io_path *io_path;
8524 : struct nvme_ctrlr *nvme_ctrlr;
8525 : uint32_t max_xfer_size;
8526 5 : int rc = -ENXIO;
8527 :
8528 : /* Choose the first ctrlr which is not failed. */
8529 8 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
8530 7 : nvme_ctrlr = io_path->qpair->ctrlr;
8531 :
8532 : /* We should skip any unavailable nvme_ctrlr rather than checking
8533 : * if the return value of spdk_nvme_ctrlr_cmd_admin_raw() is -ENXIO.
8534 : */
8535 7 : if (!nvme_ctrlr_is_available(nvme_ctrlr)) {
8536 3 : continue;
8537 : }
8538 :
8539 4 : max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nvme_ctrlr->ctrlr);
8540 :
8541 4 : if (nbytes > max_xfer_size) {
8542 0 : SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
8543 0 : rc = -EINVAL;
8544 0 : goto err;
8545 : }
8546 :
8547 8 : rc = spdk_nvme_ctrlr_cmd_admin_raw(nvme_ctrlr->ctrlr, cmd, buf, (uint32_t)nbytes,
8548 4 : bdev_nvme_admin_passthru_done, bio);
8549 4 : if (rc == 0) {
8550 4 : return;
8551 : }
8552 1 : }
8553 :
8554 : err:
8555 1 : bdev_nvme_admin_complete(bio, rc);
8556 5 : }
8557 :
8558 : static int
8559 0 : bdev_nvme_io_passthru(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
8560 : void *buf, size_t nbytes)
8561 : {
8562 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8563 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8564 0 : uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
8565 0 : struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
8566 :
8567 0 : if (nbytes > max_xfer_size) {
8568 0 : SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
8569 0 : return -EINVAL;
8570 : }
8571 :
8572 : /*
8573 : * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
8574 : * so fill it out automatically.
8575 : */
8576 0 : cmd->nsid = spdk_nvme_ns_get_id(ns);
8577 :
8578 0 : return spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, cmd, buf,
8579 0 : (uint32_t)nbytes, bdev_nvme_queued_done, bio);
8580 0 : }
8581 :
8582 : static int
8583 0 : bdev_nvme_io_passthru_md(struct nvme_bdev_io *bio, struct spdk_nvme_cmd *cmd,
8584 : void *buf, size_t nbytes, void *md_buf, size_t md_len)
8585 : {
8586 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8587 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8588 0 : size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
8589 0 : uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
8590 0 : struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
8591 :
8592 0 : if (nbytes > max_xfer_size) {
8593 0 : SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
8594 0 : return -EINVAL;
8595 : }
8596 :
8597 0 : if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
8598 0 : SPDK_ERRLOG("invalid meta data buffer size\n");
8599 0 : return -EINVAL;
8600 : }
8601 :
8602 : /*
8603 : * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid,
8604 : * so fill it out automatically.
8605 : */
8606 0 : cmd->nsid = spdk_nvme_ns_get_id(ns);
8607 :
8608 0 : return spdk_nvme_ctrlr_cmd_io_raw_with_md(ctrlr, qpair, cmd, buf,
8609 0 : (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio);
8610 0 : }
8611 :
8612 : static int
8613 0 : bdev_nvme_iov_passthru_md(struct nvme_bdev_io *bio,
8614 : struct spdk_nvme_cmd *cmd, struct iovec *iov, int iovcnt,
8615 : size_t nbytes, void *md_buf, size_t md_len)
8616 : {
8617 0 : struct spdk_nvme_ns *ns = bio->io_path->nvme_ns->ns;
8618 0 : struct spdk_nvme_qpair *qpair = bio->io_path->qpair->qpair;
8619 0 : size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(ns);
8620 0 : uint32_t max_xfer_size = spdk_nvme_ns_get_max_io_xfer_size(ns);
8621 0 : struct spdk_nvme_ctrlr *ctrlr = spdk_nvme_ns_get_ctrlr(ns);
8622 :
8623 0 : bio->iovs = iov;
8624 0 : bio->iovcnt = iovcnt;
8625 0 : bio->iovpos = 0;
8626 0 : bio->iov_offset = 0;
8627 :
8628 0 : if (nbytes > max_xfer_size) {
8629 0 : SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size);
8630 0 : return -EINVAL;
8631 : }
8632 :
8633 0 : if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(ns)) {
8634 0 : SPDK_ERRLOG("invalid meta data buffer size\n");
8635 0 : return -EINVAL;
8636 : }
8637 :
8638 : /*
8639 : * Each NVMe bdev is a specific namespace, and all NVMe I/O commands
8640 : * require a nsid, so fill it out automatically.
8641 : */
8642 0 : cmd->nsid = spdk_nvme_ns_get_id(ns);
8643 :
8644 0 : return spdk_nvme_ctrlr_cmd_iov_raw_with_md(
8645 0 : ctrlr, qpair, cmd, (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio,
8646 : bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge);
8647 0 : }
8648 :
8649 : static void
8650 6 : bdev_nvme_abort(struct nvme_bdev_channel *nbdev_ch, struct nvme_bdev_io *bio,
8651 : struct nvme_bdev_io *bio_to_abort)
8652 : {
8653 : struct nvme_io_path *io_path;
8654 6 : int rc = 0;
8655 :
8656 6 : rc = bdev_nvme_abort_retry_io(nbdev_ch, bio_to_abort);
8657 6 : if (rc == 0) {
8658 1 : bdev_nvme_admin_complete(bio, 0);
8659 1 : return;
8660 : }
8661 :
8662 5 : io_path = bio_to_abort->io_path;
8663 5 : if (io_path != NULL) {
8664 6 : rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
8665 3 : io_path->qpair->qpair,
8666 3 : bio_to_abort,
8667 3 : bdev_nvme_abort_done, bio);
8668 3 : } else {
8669 3 : STAILQ_FOREACH(io_path, &nbdev_ch->io_path_list, stailq) {
8670 4 : rc = spdk_nvme_ctrlr_cmd_abort_ext(io_path->qpair->ctrlr->ctrlr,
8671 : NULL,
8672 2 : bio_to_abort,
8673 2 : bdev_nvme_abort_done, bio);
8674 :
8675 2 : if (rc != -ENOENT) {
8676 1 : break;
8677 : }
8678 1 : }
8679 : }
8680 :
8681 5 : if (rc != 0) {
8682 : /* If no command was found or there was any error, complete the abort
8683 : * request with failure.
8684 : */
8685 2 : bdev_nvme_admin_complete(bio, rc);
8686 2 : }
8687 6 : }
8688 :
8689 : static int
8690 0 : bdev_nvme_copy(struct nvme_bdev_io *bio, uint64_t dst_offset_blocks, uint64_t src_offset_blocks,
8691 : uint64_t num_blocks)
8692 : {
8693 0 : struct spdk_nvme_scc_source_range range = {
8694 0 : .slba = src_offset_blocks,
8695 0 : .nlb = num_blocks - 1
8696 : };
8697 :
8698 0 : return spdk_nvme_ns_cmd_copy(bio->io_path->nvme_ns->ns,
8699 0 : bio->io_path->qpair->qpair,
8700 0 : &range, 1, dst_offset_blocks,
8701 0 : bdev_nvme_queued_done, bio);
8702 : }
8703 :
8704 : static void
8705 0 : bdev_nvme_opts_config_json(struct spdk_json_write_ctx *w)
8706 : {
8707 : const char *action;
8708 : uint32_t i;
8709 :
8710 0 : if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) {
8711 0 : action = "reset";
8712 0 : } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) {
8713 0 : action = "abort";
8714 0 : } else {
8715 0 : action = "none";
8716 : }
8717 :
8718 0 : spdk_json_write_object_begin(w);
8719 :
8720 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_set_options");
8721 :
8722 0 : spdk_json_write_named_object_begin(w, "params");
8723 0 : spdk_json_write_named_string(w, "action_on_timeout", action);
8724 0 : spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us);
8725 0 : spdk_json_write_named_uint64(w, "timeout_admin_us", g_opts.timeout_admin_us);
8726 0 : spdk_json_write_named_uint32(w, "keep_alive_timeout_ms", g_opts.keep_alive_timeout_ms);
8727 0 : spdk_json_write_named_uint32(w, "arbitration_burst", g_opts.arbitration_burst);
8728 0 : spdk_json_write_named_uint32(w, "low_priority_weight", g_opts.low_priority_weight);
8729 0 : spdk_json_write_named_uint32(w, "medium_priority_weight", g_opts.medium_priority_weight);
8730 0 : spdk_json_write_named_uint32(w, "high_priority_weight", g_opts.high_priority_weight);
8731 0 : spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us);
8732 0 : spdk_json_write_named_uint64(w, "nvme_ioq_poll_period_us", g_opts.nvme_ioq_poll_period_us);
8733 0 : spdk_json_write_named_uint32(w, "io_queue_requests", g_opts.io_queue_requests);
8734 0 : spdk_json_write_named_bool(w, "delay_cmd_submit", g_opts.delay_cmd_submit);
8735 0 : spdk_json_write_named_uint32(w, "transport_retry_count", g_opts.transport_retry_count);
8736 0 : spdk_json_write_named_int32(w, "bdev_retry_count", g_opts.bdev_retry_count);
8737 0 : spdk_json_write_named_uint8(w, "transport_ack_timeout", g_opts.transport_ack_timeout);
8738 0 : spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", g_opts.ctrlr_loss_timeout_sec);
8739 0 : spdk_json_write_named_uint32(w, "reconnect_delay_sec", g_opts.reconnect_delay_sec);
8740 0 : spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec", g_opts.fast_io_fail_timeout_sec);
8741 0 : spdk_json_write_named_bool(w, "disable_auto_failback", g_opts.disable_auto_failback);
8742 0 : spdk_json_write_named_bool(w, "generate_uuids", g_opts.generate_uuids);
8743 0 : spdk_json_write_named_uint8(w, "transport_tos", g_opts.transport_tos);
8744 0 : spdk_json_write_named_bool(w, "nvme_error_stat", g_opts.nvme_error_stat);
8745 0 : spdk_json_write_named_uint32(w, "rdma_srq_size", g_opts.rdma_srq_size);
8746 0 : spdk_json_write_named_bool(w, "io_path_stat", g_opts.io_path_stat);
8747 0 : spdk_json_write_named_bool(w, "allow_accel_sequence", g_opts.allow_accel_sequence);
8748 0 : spdk_json_write_named_uint32(w, "rdma_max_cq_size", g_opts.rdma_max_cq_size);
8749 0 : spdk_json_write_named_uint16(w, "rdma_cm_event_timeout_ms", g_opts.rdma_cm_event_timeout_ms);
8750 0 : spdk_json_write_named_array_begin(w, "dhchap_digests");
8751 0 : for (i = 0; i < 32; ++i) {
8752 0 : if (g_opts.dhchap_digests & SPDK_BIT(i)) {
8753 0 : spdk_json_write_string(w, spdk_nvme_dhchap_get_digest_name(i));
8754 0 : }
8755 0 : }
8756 0 : spdk_json_write_array_end(w);
8757 0 : spdk_json_write_named_array_begin(w, "dhchap_dhgroups");
8758 0 : for (i = 0; i < 32; ++i) {
8759 0 : if (g_opts.dhchap_dhgroups & SPDK_BIT(i)) {
8760 0 : spdk_json_write_string(w, spdk_nvme_dhchap_get_dhgroup_name(i));
8761 0 : }
8762 0 : }
8763 :
8764 0 : spdk_json_write_array_end(w);
8765 0 : spdk_json_write_object_end(w);
8766 :
8767 0 : spdk_json_write_object_end(w);
8768 0 : }
8769 :
8770 : static void
8771 0 : bdev_nvme_discovery_config_json(struct spdk_json_write_ctx *w, struct discovery_ctx *ctx)
8772 : {
8773 : struct spdk_nvme_transport_id trid;
8774 :
8775 0 : spdk_json_write_object_begin(w);
8776 :
8777 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_start_discovery");
8778 :
8779 0 : spdk_json_write_named_object_begin(w, "params");
8780 0 : spdk_json_write_named_string(w, "name", ctx->name);
8781 0 : spdk_json_write_named_string(w, "hostnqn", ctx->hostnqn);
8782 :
8783 0 : trid = ctx->trid;
8784 0 : memset(trid.subnqn, 0, sizeof(trid.subnqn));
8785 0 : nvme_bdev_dump_trid_json(&trid, w);
8786 :
8787 0 : spdk_json_write_named_bool(w, "wait_for_attach", ctx->wait_for_attach);
8788 0 : spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", ctx->bdev_opts.ctrlr_loss_timeout_sec);
8789 0 : spdk_json_write_named_uint32(w, "reconnect_delay_sec", ctx->bdev_opts.reconnect_delay_sec);
8790 0 : spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
8791 0 : ctx->bdev_opts.fast_io_fail_timeout_sec);
8792 0 : spdk_json_write_object_end(w);
8793 :
8794 0 : spdk_json_write_object_end(w);
8795 0 : }
8796 :
#ifdef SPDK_CONFIG_NVME_CUSE
/*
 * Write a "bdev_nvme_cuse_register" RPC object to w for nvme_ctrlr.
 *
 * Nothing is written when spdk_nvme_cuse_get_ctrlr_name() returns non-zero.
 * The name it returns is used only as that check; the emitted "name"
 * parameter is the nvme_bdev_ctrlr name.
 */
static void
nvme_ctrlr_cuse_config_json(struct spdk_json_write_ctx *w,
			    struct nvme_ctrlr *nvme_ctrlr)
{
	char cuse_name[128];
	size_t cuse_name_size = sizeof(cuse_name);

	if (spdk_nvme_cuse_get_ctrlr_name(nvme_ctrlr->ctrlr,
					  cuse_name, &cuse_name_size) != 0) {
		return;
	}

	spdk_json_write_object_begin(w);

	spdk_json_write_named_string(w, "method", "bdev_nvme_cuse_register");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
#endif
8821 :
8822 : static void
8823 0 : nvme_ctrlr_config_json(struct spdk_json_write_ctx *w,
8824 : struct nvme_ctrlr *nvme_ctrlr,
8825 : struct nvme_path_id *path_id)
8826 : {
8827 : struct spdk_nvme_transport_id *trid;
8828 : const struct spdk_nvme_ctrlr_opts *opts;
8829 :
8830 0 : if (nvme_ctrlr->opts.from_discovery_service) {
8831 : /* Do not emit an RPC for this - it will be implicitly
8832 : * covered by a separate bdev_nvme_start_discovery or
8833 : * bdev_nvme_start_mdns_discovery RPC.
8834 : */
8835 0 : return;
8836 : }
8837 :
8838 0 : trid = &path_id->trid;
8839 :
8840 0 : spdk_json_write_object_begin(w);
8841 :
8842 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_attach_controller");
8843 :
8844 0 : spdk_json_write_named_object_begin(w, "params");
8845 0 : spdk_json_write_named_string(w, "name", nvme_ctrlr->nbdev_ctrlr->name);
8846 0 : nvme_bdev_dump_trid_json(trid, w);
8847 0 : spdk_json_write_named_bool(w, "prchk_reftag",
8848 0 : (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) != 0);
8849 0 : spdk_json_write_named_bool(w, "prchk_guard",
8850 0 : (nvme_ctrlr->opts.prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) != 0);
8851 0 : spdk_json_write_named_int32(w, "ctrlr_loss_timeout_sec", nvme_ctrlr->opts.ctrlr_loss_timeout_sec);
8852 0 : spdk_json_write_named_uint32(w, "reconnect_delay_sec", nvme_ctrlr->opts.reconnect_delay_sec);
8853 0 : spdk_json_write_named_uint32(w, "fast_io_fail_timeout_sec",
8854 0 : nvme_ctrlr->opts.fast_io_fail_timeout_sec);
8855 0 : if (nvme_ctrlr->psk != NULL) {
8856 0 : spdk_json_write_named_string(w, "psk", spdk_key_get_name(nvme_ctrlr->psk));
8857 0 : }
8858 0 : if (nvme_ctrlr->dhchap_key != NULL) {
8859 0 : spdk_json_write_named_string(w, "dhchap_key",
8860 0 : spdk_key_get_name(nvme_ctrlr->dhchap_key));
8861 0 : }
8862 0 : if (nvme_ctrlr->dhchap_ctrlr_key != NULL) {
8863 0 : spdk_json_write_named_string(w, "dhchap_ctrlr_key",
8864 0 : spdk_key_get_name(nvme_ctrlr->dhchap_ctrlr_key));
8865 0 : }
8866 0 : opts = spdk_nvme_ctrlr_get_opts(nvme_ctrlr->ctrlr);
8867 0 : spdk_json_write_named_string(w, "hostnqn", opts->hostnqn);
8868 0 : spdk_json_write_named_bool(w, "hdgst", opts->header_digest);
8869 0 : spdk_json_write_named_bool(w, "ddgst", opts->data_digest);
8870 0 : if (opts->src_addr[0] != '\0') {
8871 0 : spdk_json_write_named_string(w, "hostaddr", opts->src_addr);
8872 0 : }
8873 0 : if (opts->src_svcid[0] != '\0') {
8874 0 : spdk_json_write_named_string(w, "hostsvcid", opts->src_svcid);
8875 0 : }
8876 :
8877 0 : if (nvme_ctrlr->opts.multipath) {
8878 0 : spdk_json_write_named_string(w, "multipath", "multipath");
8879 0 : }
8880 0 : spdk_json_write_object_end(w);
8881 :
8882 0 : spdk_json_write_object_end(w);
8883 0 : }
8884 :
8885 : static void
8886 0 : bdev_nvme_hotplug_config_json(struct spdk_json_write_ctx *w)
8887 : {
8888 0 : spdk_json_write_object_begin(w);
8889 0 : spdk_json_write_named_string(w, "method", "bdev_nvme_set_hotplug");
8890 :
8891 0 : spdk_json_write_named_object_begin(w, "params");
8892 0 : spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us);
8893 0 : spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled);
8894 0 : spdk_json_write_object_end(w);
8895 :
8896 0 : spdk_json_write_object_end(w);
8897 0 : }
8898 :
8899 : static int
8900 0 : bdev_nvme_config_json(struct spdk_json_write_ctx *w)
8901 : {
8902 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
8903 : struct nvme_ctrlr *nvme_ctrlr;
8904 : struct discovery_ctx *ctx;
8905 : struct nvme_path_id *path_id;
8906 :
8907 0 : bdev_nvme_opts_config_json(w);
8908 :
8909 0 : pthread_mutex_lock(&g_bdev_nvme_mutex);
8910 :
8911 0 : TAILQ_FOREACH(nbdev_ctrlr, &g_nvme_bdev_ctrlrs, tailq) {
8912 0 : TAILQ_FOREACH(nvme_ctrlr, &nbdev_ctrlr->ctrlrs, tailq) {
8913 0 : path_id = nvme_ctrlr->active_path_id;
8914 0 : assert(path_id == TAILQ_FIRST(&nvme_ctrlr->trids));
8915 0 : nvme_ctrlr_config_json(w, nvme_ctrlr, path_id);
8916 :
8917 0 : path_id = TAILQ_NEXT(path_id, link);
8918 0 : while (path_id != NULL) {
8919 0 : nvme_ctrlr_config_json(w, nvme_ctrlr, path_id);
8920 0 : path_id = TAILQ_NEXT(path_id, link);
8921 : }
8922 :
8923 : #ifdef SPDK_CONFIG_NVME_CUSE
8924 : nvme_ctrlr_cuse_config_json(w, nvme_ctrlr);
8925 : #endif
8926 0 : }
8927 0 : }
8928 :
8929 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
8930 0 : if (!ctx->from_mdns_discovery_service) {
8931 0 : bdev_nvme_discovery_config_json(w, ctx);
8932 0 : }
8933 0 : }
8934 :
8935 0 : bdev_nvme_mdns_discovery_config_json(w);
8936 :
8937 : /* Dump as last parameter to give all NVMe bdevs chance to be constructed
8938 : * before enabling hotplug poller.
8939 : */
8940 0 : bdev_nvme_hotplug_config_json(w);
8941 :
8942 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
8943 0 : return 0;
8944 : }
8945 :
8946 : struct spdk_nvme_ctrlr *
8947 1 : bdev_nvme_get_ctrlr(struct spdk_bdev *bdev)
8948 : {
8949 : struct nvme_bdev *nbdev;
8950 : struct nvme_ns *nvme_ns;
8951 :
8952 1 : if (!bdev || bdev->module != &nvme_if) {
8953 0 : return NULL;
8954 : }
8955 :
8956 1 : nbdev = SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk);
8957 1 : nvme_ns = TAILQ_FIRST(&nbdev->nvme_ns_list);
8958 1 : assert(nvme_ns != NULL);
8959 :
8960 1 : return nvme_ns->ctrlr->ctrlr;
8961 1 : }
8962 :
8963 : static bool
8964 12 : nvme_io_path_is_current(struct nvme_io_path *io_path)
8965 : {
8966 : const struct nvme_bdev_channel *nbdev_ch;
8967 : bool current;
8968 :
8969 12 : if (!nvme_io_path_is_available(io_path)) {
8970 4 : return false;
8971 : }
8972 :
8973 8 : nbdev_ch = io_path->nbdev_ch;
8974 8 : if (nbdev_ch == NULL) {
8975 1 : current = false;
8976 8 : } else if (nbdev_ch->mp_policy == BDEV_NVME_MP_POLICY_ACTIVE_ACTIVE) {
8977 3 : struct nvme_io_path *optimized_io_path = NULL;
8978 :
8979 6 : STAILQ_FOREACH(optimized_io_path, &nbdev_ch->io_path_list, stailq) {
8980 5 : if (optimized_io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) {
8981 2 : break;
8982 : }
8983 3 : }
8984 :
8985 : /* A non-optimized path is only current if there are no optimized paths. */
8986 3 : current = (io_path->nvme_ns->ana_state == SPDK_NVME_ANA_OPTIMIZED_STATE) ||
8987 2 : (optimized_io_path == NULL);
8988 3 : } else {
8989 4 : if (nbdev_ch->current_io_path) {
8990 1 : current = (io_path == nbdev_ch->current_io_path);
8991 1 : } else {
8992 : struct nvme_io_path *first_path;
8993 :
8994 : /* We arrived here as there are no optimized paths for active-passive
8995 : * mode. Check if this io_path is the first one available on the list.
8996 : */
8997 3 : current = false;
8998 3 : STAILQ_FOREACH(first_path, &nbdev_ch->io_path_list, stailq) {
8999 3 : if (nvme_io_path_is_available(first_path)) {
9000 3 : current = (io_path == first_path);
9001 3 : break;
9002 : }
9003 0 : }
9004 : }
9005 : }
9006 :
9007 8 : return current;
9008 12 : }
9009 :
9010 : static struct nvme_ctrlr *
9011 0 : bdev_nvme_next_ctrlr_unsafe(struct nvme_bdev_ctrlr *nbdev_ctrlr, struct nvme_ctrlr *prev)
9012 : {
9013 : struct nvme_ctrlr *next;
9014 :
9015 : /* Must be called under g_bdev_nvme_mutex */
9016 0 : next = prev != NULL ? TAILQ_NEXT(prev, tailq) : TAILQ_FIRST(&nbdev_ctrlr->ctrlrs);
9017 0 : while (next != NULL) {
9018 : /* ref can be 0 when the ctrlr was released, but hasn't been detached yet */
9019 0 : pthread_mutex_lock(&next->mutex);
9020 0 : if (next->ref > 0) {
9021 0 : next->ref++;
9022 0 : pthread_mutex_unlock(&next->mutex);
9023 0 : return next;
9024 : }
9025 :
9026 0 : pthread_mutex_unlock(&next->mutex);
9027 0 : next = TAILQ_NEXT(next, tailq);
9028 : }
9029 :
9030 0 : return NULL;
9031 0 : }
9032 :
/*
 * State of one bdev_nvme_set_keys() operation. It is allocated in
 * bdev_nvme_set_keys() and released by bdev_nvme_free_set_keys_ctx().
 */
9033 : struct bdev_nvme_set_keys_ctx {
/* Controller currently being processed; released in _bdev_nvme_set_keys_done() when non-NULL. */
9034 : struct nvme_ctrlr *nctrlr;
/* Key obtained with spdk_keyring_get_key(); NULL when no dhchap_key name was given. */
9035 : struct spdk_key *dhchap_key;
/* Key obtained with spdk_keyring_get_key(); NULL when no dhchap_ctrlr_key name was given. */
9036 : struct spdk_key *dhchap_ctrlr_key;
/* Thread that called bdev_nvme_set_keys(); the completion is executed on it. */
9037 : struct spdk_thread *thread;
/* Completion callback supplied by the caller. */
9038 : bdev_nvme_set_keys_cb cb_fn;
/* Argument passed as the first parameter of cb_fn. */
9039 : void *cb_ctx;
/* Result handed to cb_fn; set by bdev_nvme_set_keys_done(). */
9040 : int status;
9041 : };
9042 :
9043 : static void
9044 0 : bdev_nvme_free_set_keys_ctx(struct bdev_nvme_set_keys_ctx *ctx)
9045 : {
9046 0 : if (ctx == NULL) {
9047 0 : return;
9048 : }
9049 :
9050 0 : spdk_keyring_put_key(ctx->dhchap_key);
9051 0 : spdk_keyring_put_key(ctx->dhchap_ctrlr_key);
9052 0 : free(ctx);
9053 0 : }
9054 :
9055 : static void
9056 0 : _bdev_nvme_set_keys_done(void *_ctx)
9057 : {
9058 0 : struct bdev_nvme_set_keys_ctx *ctx = _ctx;
9059 :
9060 0 : ctx->cb_fn(ctx->cb_ctx, ctx->status);
9061 :
9062 0 : if (ctx->nctrlr != NULL) {
9063 0 : nvme_ctrlr_release(ctx->nctrlr);
9064 0 : }
9065 0 : bdev_nvme_free_set_keys_ctx(ctx);
9066 0 : }
9067 :
9068 : static void
9069 0 : bdev_nvme_set_keys_done(struct bdev_nvme_set_keys_ctx *ctx, int status)
9070 : {
9071 0 : ctx->status = status;
9072 0 : spdk_thread_exec_msg(ctx->thread, _bdev_nvme_set_keys_done, ctx);
9073 0 : }
9074 :
9075 : static void bdev_nvme_authenticate_ctrlr(struct bdev_nvme_set_keys_ctx *ctx);
9076 :
9077 : static void
9078 0 : bdev_nvme_authenticate_ctrlr_continue(struct bdev_nvme_set_keys_ctx *ctx)
9079 : {
9080 : struct nvme_ctrlr *next;
9081 :
9082 0 : pthread_mutex_lock(&g_bdev_nvme_mutex);
9083 0 : next = bdev_nvme_next_ctrlr_unsafe(NULL, ctx->nctrlr);
9084 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
9085 :
9086 0 : nvme_ctrlr_release(ctx->nctrlr);
9087 0 : ctx->nctrlr = next;
9088 :
9089 0 : if (next == NULL) {
9090 0 : bdev_nvme_set_keys_done(ctx, 0);
9091 0 : } else {
9092 0 : bdev_nvme_authenticate_ctrlr(ctx);
9093 : }
9094 0 : }
9095 :
/*
 * Completion callback of the per-channel qpair authentication pass.
 *
 * A non-zero status ends the whole set-keys operation with that status;
 * otherwise the operation moves on to the next controller.
 */
static void
bdev_nvme_authenticate_qpairs_done(struct spdk_io_channel_iter *i, int status)
{
	struct bdev_nvme_set_keys_ctx *keys_ctx = spdk_io_channel_iter_get_ctx(i);

	if (status == 0) {
		bdev_nvme_authenticate_ctrlr_continue(keys_ctx);
	} else {
		bdev_nvme_set_keys_done(keys_ctx, status);
	}
}
9107 :
/*
 * Completion callback of spdk_nvme_qpair_authenticate(). ctx is the channel
 * iterator that was passed at submission; the iteration is resumed with the
 * authentication status.
 */
static void
bdev_nvme_authenticate_qpair_done(void *ctx, int status)
{
	struct spdk_io_channel_iter *iter = ctx;

	spdk_for_each_channel_continue(iter, status);
}
9113 :
9114 : static void
9115 0 : bdev_nvme_authenticate_qpair(struct spdk_io_channel_iter *i)
9116 : {
9117 0 : struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
9118 0 : struct nvme_ctrlr_channel *ctrlr_ch = spdk_io_channel_get_ctx(ch);
9119 0 : struct nvme_qpair *qpair = ctrlr_ch->qpair;
9120 : int rc;
9121 :
9122 0 : if (!nvme_qpair_is_connected(qpair)) {
9123 0 : spdk_for_each_channel_continue(i, 0);
9124 0 : return;
9125 : }
9126 :
9127 0 : rc = spdk_nvme_qpair_authenticate(qpair->qpair, bdev_nvme_authenticate_qpair_done, i);
9128 0 : if (rc != 0) {
9129 0 : spdk_for_each_channel_continue(i, rc);
9130 0 : }
9131 0 : }
9132 :
9133 : static void
9134 0 : bdev_nvme_authenticate_ctrlr_done(void *_ctx, int status)
9135 : {
9136 0 : struct bdev_nvme_set_keys_ctx *ctx = _ctx;
9137 :
9138 0 : if (status != 0) {
9139 0 : bdev_nvme_set_keys_done(ctx, status);
9140 0 : return;
9141 : }
9142 :
9143 0 : spdk_for_each_channel(ctx->nctrlr, bdev_nvme_authenticate_qpair, ctx,
9144 : bdev_nvme_authenticate_qpairs_done);
9145 0 : }
9146 :
9147 : static void
9148 0 : bdev_nvme_authenticate_ctrlr(struct bdev_nvme_set_keys_ctx *ctx)
9149 : {
9150 0 : struct spdk_nvme_ctrlr_key_opts opts = {};
9151 0 : struct nvme_ctrlr *nctrlr = ctx->nctrlr;
9152 : int rc;
9153 :
9154 0 : opts.size = SPDK_SIZEOF(&opts, dhchap_ctrlr_key);
9155 0 : opts.dhchap_key = ctx->dhchap_key;
9156 0 : opts.dhchap_ctrlr_key = ctx->dhchap_ctrlr_key;
9157 0 : rc = spdk_nvme_ctrlr_set_keys(nctrlr->ctrlr, &opts);
9158 0 : if (rc != 0) {
9159 0 : bdev_nvme_set_keys_done(ctx, rc);
9160 0 : return;
9161 : }
9162 :
9163 0 : if (ctx->dhchap_key != NULL) {
9164 0 : rc = spdk_nvme_ctrlr_authenticate(nctrlr->ctrlr,
9165 0 : bdev_nvme_authenticate_ctrlr_done, ctx);
9166 0 : if (rc != 0) {
9167 0 : bdev_nvme_set_keys_done(ctx, rc);
9168 0 : }
9169 0 : } else {
9170 0 : bdev_nvme_authenticate_ctrlr_continue(ctx);
9171 : }
9172 0 : }
9173 :
9174 : int
9175 0 : bdev_nvme_set_keys(const char *name, const char *dhchap_key, const char *dhchap_ctrlr_key,
9176 : bdev_nvme_set_keys_cb cb_fn, void *cb_ctx)
9177 : {
9178 : struct bdev_nvme_set_keys_ctx *ctx;
9179 : struct nvme_bdev_ctrlr *nbdev_ctrlr;
9180 : struct nvme_ctrlr *nctrlr;
9181 :
9182 0 : ctx = calloc(1, sizeof(*ctx));
9183 0 : if (ctx == NULL) {
9184 0 : return -ENOMEM;
9185 : }
9186 :
9187 0 : if (dhchap_key != NULL) {
9188 0 : ctx->dhchap_key = spdk_keyring_get_key(dhchap_key);
9189 0 : if (ctx->dhchap_key == NULL) {
9190 0 : SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_key, name);
9191 0 : bdev_nvme_free_set_keys_ctx(ctx);
9192 0 : return -ENOKEY;
9193 : }
9194 0 : }
9195 0 : if (dhchap_ctrlr_key != NULL) {
9196 0 : ctx->dhchap_ctrlr_key = spdk_keyring_get_key(dhchap_ctrlr_key);
9197 0 : if (ctx->dhchap_ctrlr_key == NULL) {
9198 0 : SPDK_ERRLOG("Could not find key %s for bdev %s\n", dhchap_ctrlr_key, name);
9199 0 : bdev_nvme_free_set_keys_ctx(ctx);
9200 0 : return -ENOKEY;
9201 : }
9202 0 : }
9203 :
9204 0 : pthread_mutex_lock(&g_bdev_nvme_mutex);
9205 0 : nbdev_ctrlr = nvme_bdev_ctrlr_get_by_name(name);
9206 0 : if (nbdev_ctrlr == NULL) {
9207 0 : SPDK_ERRLOG("Could not find bdev_ctrlr %s\n", name);
9208 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
9209 0 : bdev_nvme_free_set_keys_ctx(ctx);
9210 0 : return -ENODEV;
9211 : }
9212 0 : nctrlr = bdev_nvme_next_ctrlr_unsafe(nbdev_ctrlr, NULL);
9213 0 : if (nctrlr == NULL) {
9214 0 : SPDK_ERRLOG("Could not find any nvme_ctrlrs on bdev_ctrlr %s\n", name);
9215 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
9216 0 : bdev_nvme_free_set_keys_ctx(ctx);
9217 0 : return -ENODEV;
9218 : }
9219 0 : pthread_mutex_unlock(&g_bdev_nvme_mutex);
9220 :
9221 0 : ctx->nctrlr = nctrlr;
9222 0 : ctx->cb_fn = cb_fn;
9223 0 : ctx->cb_ctx = cb_ctx;
9224 0 : ctx->thread = spdk_get_thread();
9225 :
9226 0 : bdev_nvme_authenticate_ctrlr(ctx);
9227 :
9228 0 : return 0;
9229 0 : }
9230 :
9231 : void
9232 0 : nvme_io_path_info_json(struct spdk_json_write_ctx *w, struct nvme_io_path *io_path)
9233 : {
9234 0 : struct nvme_ns *nvme_ns = io_path->nvme_ns;
9235 0 : struct nvme_ctrlr *nvme_ctrlr = io_path->qpair->ctrlr;
9236 : const struct spdk_nvme_ctrlr_data *cdata;
9237 : const struct spdk_nvme_transport_id *trid;
9238 : const char *adrfam_str;
9239 :
9240 0 : spdk_json_write_object_begin(w);
9241 :
9242 0 : spdk_json_write_named_string(w, "bdev_name", nvme_ns->bdev->disk.name);
9243 :
9244 0 : cdata = spdk_nvme_ctrlr_get_data(nvme_ctrlr->ctrlr);
9245 0 : trid = spdk_nvme_ctrlr_get_transport_id(nvme_ctrlr->ctrlr);
9246 :
9247 0 : spdk_json_write_named_uint32(w, "cntlid", cdata->cntlid);
9248 0 : spdk_json_write_named_bool(w, "current", nvme_io_path_is_current(io_path));
9249 0 : spdk_json_write_named_bool(w, "connected", nvme_qpair_is_connected(io_path->qpair));
9250 0 : spdk_json_write_named_bool(w, "accessible", nvme_ns_is_accessible(nvme_ns));
9251 :
9252 0 : spdk_json_write_named_object_begin(w, "transport");
9253 0 : spdk_json_write_named_string(w, "trtype", trid->trstring);
9254 0 : spdk_json_write_named_string(w, "traddr", trid->traddr);
9255 0 : if (trid->trsvcid[0] != '\0') {
9256 0 : spdk_json_write_named_string(w, "trsvcid", trid->trsvcid);
9257 0 : }
9258 0 : adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam);
9259 0 : if (adrfam_str) {
9260 0 : spdk_json_write_named_string(w, "adrfam", adrfam_str);
9261 0 : }
9262 0 : spdk_json_write_object_end(w);
9263 :
9264 0 : spdk_json_write_object_end(w);
9265 0 : }
9266 :
9267 : void
9268 0 : bdev_nvme_get_discovery_info(struct spdk_json_write_ctx *w)
9269 : {
9270 : struct discovery_ctx *ctx;
9271 : struct discovery_entry_ctx *entry_ctx;
9272 :
9273 0 : spdk_json_write_array_begin(w);
9274 0 : TAILQ_FOREACH(ctx, &g_discovery_ctxs, tailq) {
9275 0 : spdk_json_write_object_begin(w);
9276 0 : spdk_json_write_named_string(w, "name", ctx->name);
9277 :
9278 0 : spdk_json_write_named_object_begin(w, "trid");
9279 0 : nvme_bdev_dump_trid_json(&ctx->trid, w);
9280 0 : spdk_json_write_object_end(w);
9281 :
9282 0 : spdk_json_write_named_array_begin(w, "referrals");
9283 0 : TAILQ_FOREACH(entry_ctx, &ctx->discovery_entry_ctxs, tailq) {
9284 0 : spdk_json_write_object_begin(w);
9285 0 : spdk_json_write_named_object_begin(w, "trid");
9286 0 : nvme_bdev_dump_trid_json(&entry_ctx->trid, w);
9287 0 : spdk_json_write_object_end(w);
9288 0 : spdk_json_write_object_end(w);
9289 0 : }
9290 0 : spdk_json_write_array_end(w);
9291 :
9292 0 : spdk_json_write_object_end(w);
9293 0 : }
9294 0 : spdk_json_write_array_end(w);
9295 0 : }
9296 :
/* Registers the "bdev_nvme" log component; this is the name that NVME_CTRLR_INFOLOG passes to SPDK_INFOLOG. */
9297 1 : SPDK_LOG_REGISTER_COMPONENT(bdev_nvme)
9298 :
9299 : static void
9300 0 : bdev_nvme_trace(void)
9301 : {
9302 0 : struct spdk_trace_tpoint_opts opts[] = {
9303 : {
9304 : "BDEV_NVME_IO_START", TRACE_BDEV_NVME_IO_START,
9305 : OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 1,
9306 : {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
9307 : },
9308 : {
9309 : "BDEV_NVME_IO_DONE", TRACE_BDEV_NVME_IO_DONE,
9310 : OWNER_TYPE_NONE, OBJECT_BDEV_NVME_IO, 0,
9311 : {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
9312 : }
9313 : };
9314 :
9315 :
9316 0 : spdk_trace_register_object(OBJECT_BDEV_NVME_IO, 'N');
9317 0 : spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
9318 0 : spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
9319 0 : spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_SUBMIT, OBJECT_BDEV_NVME_IO, 0);
9320 0 : spdk_trace_tpoint_register_relation(TRACE_NVME_PCIE_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
9321 0 : spdk_trace_tpoint_register_relation(TRACE_NVME_TCP_COMPLETE, OBJECT_BDEV_NVME_IO, 0);
9322 0 : }
9323 1 : SPDK_TRACE_REGISTER_FN(bdev_nvme_trace, "bdev_nvme", TRACE_GROUP_BDEV_NVME)
|