Line data Source code
1 : /* SPDX-License-Identifier: BSD-3-Clause
2 : * Copyright (C) 2017 Intel Corporation.
3 : * All rights reserved.
4 : */
5 :
6 : #include "spdk/stdinc.h"
7 :
8 : #include "env_internal.h"
9 : #include "pci_dpdk.h"
10 :
11 : #include <rte_config.h>
12 : #include <rte_memory.h>
13 : #include <rte_eal_memconfig.h>
14 : #include <rte_dev.h>
15 : #include <rte_pci.h>
16 :
17 : #include "spdk_internal/assert.h"
18 :
19 : #include "spdk/assert.h"
20 : #include "spdk/likely.h"
21 : #include "spdk/queue.h"
22 : #include "spdk/util.h"
23 : #include "spdk/memory.h"
24 : #include "spdk/env_dpdk.h"
25 : #include "spdk/log.h"
26 :
27 : #ifdef __linux__
28 : #include <linux/version.h>
29 : #if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
30 : #include <linux/vfio.h>
31 : #include <rte_vfio.h>
32 :
33 : struct spdk_vfio_dma_map {
34 : struct vfio_iommu_type1_dma_map map;
35 : TAILQ_ENTRY(spdk_vfio_dma_map) tailq;
36 : };
37 :
38 : struct vfio_cfg {
39 : int fd;
40 : bool enabled;
41 : bool noiommu_enabled;
42 : unsigned device_ref;
43 : TAILQ_HEAD(, spdk_vfio_dma_map) maps;
44 : pthread_mutex_t mutex;
45 : };
46 :
47 : static struct vfio_cfg g_vfio = {
48 : .fd = -1,
49 : .enabled = false,
50 : .noiommu_enabled = false,
51 : .device_ref = 0,
52 : .maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps),
53 : .mutex = PTHREAD_MUTEX_INITIALIZER
54 : };
55 : #endif
56 : #endif
57 :
58 : #if DEBUG
59 : #define DEBUG_PRINT(...) SPDK_ERRLOG(__VA_ARGS__)
60 : #else
61 : #define DEBUG_PRINT(...)
62 : #endif
63 :
64 : #define FN_2MB_TO_4KB(fn) (fn << (SHIFT_2MB - SHIFT_4KB))
65 : #define FN_4KB_TO_2MB(fn) (fn >> (SHIFT_2MB - SHIFT_4KB))
66 :
67 : #define MAP_256TB_IDX(vfn_2mb) ((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB))
68 : #define MAP_1GB_IDX(vfn_2mb) ((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1))
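
/*
 * Illustrative sketch (not part of the upstream file): how the two macros
 * above split a virtual address, assuming SHIFT_2MB == 21 and SHIFT_1GB == 30
 * from spdk/memory.h. The hypothetical helper returns the top-level (256TB)
 * index and stores the second-level (1GB) index through idx_1gb.
 */
static inline uint64_t
example_map_indices(uint64_t vaddr, uint64_t *idx_1gb)
{
	uint64_t vfn_2mb = vaddr >> SHIFT_2MB;	/* 2MB virtual frame number */

	*idx_1gb = MAP_1GB_IDX(vfn_2mb);	/* bits [21..29] of vaddr */
	return MAP_256TB_IDX(vfn_2mb);		/* bits [30..47] of vaddr */
}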
69 :
70 : /* Page is registered */
71 : #define REG_MAP_REGISTERED (1ULL << 62)
72 :
73 : /* A notification region barrier. The 2MB translation entry that's marked
74 : * with this flag must be unregistered separately. This allows contiguous
75 : * regions to be unregistered in the same chunks they were registered.
76 : */
77 : #define REG_MAP_NOTIFY_START (1ULL << 63)
78 :
79 : /* Translation of a single 2MB page. */
80 : struct map_2mb {
81 : uint64_t translation_2mb;
82 : };
83 :
84 : /* Second-level map table indexed by bits [21..29] of the virtual address.
85 : * Each entry contains the address translation for its 2MB page, or the map's
86 : * default translation for pages that haven't been set yet.
87 : */
88 : struct map_1gb {
89 : struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)];
90 : };
91 :
92 : /* Top-level map table indexed by bits [30..47] of the virtual address.
93 : * Each entry points to a second-level map table or NULL.
94 : */
95 : struct map_256tb {
96 : struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)];
97 : };
98 :
99 : /* Page-granularity memory address translation */
100 : struct spdk_mem_map {
101 : struct map_256tb map_256tb;
102 : pthread_mutex_t mutex;
103 : uint64_t default_translation;
104 : struct spdk_mem_map_ops ops;
105 : void *cb_ctx;
106 : TAILQ_ENTRY(spdk_mem_map) tailq;
107 : };
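
/*
 * Illustrative sketch (not part of the upstream file): the raw two-level
 * lookup underlying spdk_mem_map_translate(), without the contiguity merging
 * done by the are_contiguous() callback. Assumes vaddr has already been
 * validated against MASK_256TB.
 */
static inline uint64_t
example_raw_lookup(const struct spdk_mem_map *map, uint64_t vaddr)
{
	uint64_t vfn_2mb = vaddr >> SHIFT_2MB;
	const struct map_1gb *map_1gb = map->map_256tb.map[MAP_256TB_IDX(vfn_2mb)];

	if (map_1gb == NULL) {
		/* No second-level table exists yet for this 1GB region. */
		return map->default_translation;
	}

	return map_1gb->map[MAP_1GB_IDX(vfn_2mb)].translation_2mb;
}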
108 :
109 : /* Registrations map. The 64 bit translations are bit fields with the
110 : * following layout (starting with the low bits):
111 : * 0 - 61 : reserved
112 : * 62 - 63 : flags
113 : */
114 : static struct spdk_mem_map *g_mem_reg_map;
115 : static TAILQ_HEAD(spdk_mem_map_head, spdk_mem_map) g_spdk_mem_maps =
116 : TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps);
117 : static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER;
118 :
119 : static bool g_legacy_mem;
120 : static bool g_huge_pages = true;
121 :
122 : /*
123 : * Walk the currently registered memory via the main memory registration map
124 : * and call the new map's notify callback for each virtually contiguous region.
125 : */
126 : static int
127 0 : mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action)
128 : {
129 : size_t idx_256tb;
130 : uint64_t idx_1gb;
131 0 : uint64_t contig_start = UINT64_MAX;
132 0 : uint64_t contig_end = UINT64_MAX;
133 : struct map_1gb *map_1gb;
134 : int rc;
135 :
136 0 : if (!g_mem_reg_map) {
137 0 : return -EINVAL;
138 : }
139 :
140 : /* Hold the memory registration map mutex so no new registrations can be added while we are looping. */
141 0 : pthread_mutex_lock(&g_mem_reg_map->mutex);
142 :
143 0 : for (idx_256tb = 0;
144 : idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]);
145 0 : idx_256tb++) {
146 0 : map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
147 :
148 0 : if (!map_1gb) {
149 0 : if (contig_start != UINT64_MAX) {
150 : /* End of a virtually contiguous range */
151 0 : rc = map->ops.notify_cb(map->cb_ctx, map, action,
152 : (void *)contig_start,
153 0 : contig_end - contig_start + VALUE_2MB);
154 : /* Don't bother handling unregister failures. It can't be any worse */
155 0 : if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
156 0 : goto err_unregister;
157 : }
158 : }
159 0 : contig_start = UINT64_MAX;
160 0 : continue;
161 : }
162 :
163 0 : for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) {
164 0 : if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
165 0 : (contig_start == UINT64_MAX ||
166 0 : (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
167 : /* Rebuild the virtual address from the indexes */
168 0 : uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
169 :
170 0 : if (contig_start == UINT64_MAX) {
171 0 : contig_start = vaddr;
172 : }
173 :
174 0 : contig_end = vaddr;
175 : } else {
176 0 : if (contig_start != UINT64_MAX) {
177 : /* End of a virtually contiguous range */
178 0 : rc = map->ops.notify_cb(map->cb_ctx, map, action,
179 : (void *)contig_start,
180 0 : contig_end - contig_start + VALUE_2MB);
181 : /* Don't bother handling unregister failures. It can't be any worse */
182 0 : if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) {
183 0 : goto err_unregister;
184 : }
185 :
186 : /* This page might be a part of a neighbouring region, so process
187 : * it again. The idx_1gb will be incremented immediately.
188 : */
189 0 : idx_1gb--;
190 : }
191 0 : contig_start = UINT64_MAX;
192 : }
193 : }
194 : }
195 :
196 0 : pthread_mutex_unlock(&g_mem_reg_map->mutex);
197 0 : return 0;
198 :
199 0 : err_unregister:
200 : /* Unwind to the first empty translation so we don't unregister
201 : * a region that just failed to register.
202 : */
203 0 : idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1);
204 0 : idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1);
205 0 : contig_start = UINT64_MAX;
206 0 : contig_end = UINT64_MAX;
207 :
208 : /* Unregister any memory we managed to register before the failure */
209 0 : for (; idx_256tb < SIZE_MAX; idx_256tb--) {
210 0 : map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb];
211 :
212 0 : if (!map_1gb) {
213 0 : if (contig_end != UINT64_MAX) {
214 : /* End of a virtually contiguous range */
215 0 : map->ops.notify_cb(map->cb_ctx, map,
216 : SPDK_MEM_MAP_NOTIFY_UNREGISTER,
217 : (void *)contig_start,
218 0 : contig_end - contig_start + VALUE_2MB);
219 : }
220 0 : contig_end = UINT64_MAX;
221 0 : continue;
222 : }
223 :
224 0 : for (; idx_1gb < UINT64_MAX; idx_1gb--) {
225 : /* Rebuild the virtual address from the indexes */
226 0 : uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB);
227 0 : if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) &&
228 0 : (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) {
229 :
230 0 : if (contig_end == UINT64_MAX) {
231 0 : contig_end = vaddr;
232 : }
233 0 : contig_start = vaddr;
234 : } else {
235 0 : if (contig_end != UINT64_MAX) {
236 0 : if (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) {
237 0 : contig_start = vaddr;
238 : }
239 : /* End of a virtually contiguous range */
240 0 : map->ops.notify_cb(map->cb_ctx, map,
241 : SPDK_MEM_MAP_NOTIFY_UNREGISTER,
242 : (void *)contig_start,
243 0 : contig_end - contig_start + VALUE_2MB);
244 : }
245 0 : contig_end = UINT64_MAX;
246 : }
247 : }
248 0 : idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1;
249 : }
250 :
251 0 : pthread_mutex_unlock(&g_mem_reg_map->mutex);
252 0 : return rc;
253 : }
254 :
255 : struct spdk_mem_map *
256 0 : spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx)
257 : {
258 : struct spdk_mem_map *map;
259 : int rc;
260 : size_t i;
261 :
262 0 : map = calloc(1, sizeof(*map));
263 0 : if (map == NULL) {
264 0 : return NULL;
265 : }
266 :
267 0 : if (pthread_mutex_init(&map->mutex, NULL)) {
268 0 : free(map);
269 0 : return NULL;
270 : }
271 :
272 0 : map->default_translation = default_translation;
273 0 : map->cb_ctx = cb_ctx;
274 0 : if (ops) {
275 0 : map->ops = *ops;
276 : }
277 :
278 0 : if (ops && ops->notify_cb) {
279 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
280 0 : rc = mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER);
281 0 : if (rc != 0) {
282 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
283 0 : DEBUG_PRINT("Initial mem_map notify failed\n");
284 0 : pthread_mutex_destroy(&map->mutex);
285 0 : for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
286 0 : free(map->map_256tb.map[i]);
287 : }
288 0 : free(map);
289 0 : return NULL;
290 : }
291 0 : TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq);
292 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
293 : }
294 :
295 0 : return map;
296 : }
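
/*
 * Minimal usage sketch (hypothetical consumer, not part of the upstream
 * file): allocate a map whose notify callback is replayed for every already
 * registered region, then free it. example_notify() and example_map_lifetime()
 * are made up for illustration only.
 */
static int
example_notify(void *cb_ctx, struct spdk_mem_map *map,
	       enum spdk_mem_map_notify_action action, void *vaddr, size_t len)
{
	/* A real consumer would create or destroy per-region state here,
	 * e.g. an IOMMU mapping or an RDMA memory region. */
	return 0;
}

static void
example_map_lifetime(void)
{
	const struct spdk_mem_map_ops ops = {
		.notify_cb = example_notify,
		.are_contiguous = NULL,
	};
	struct spdk_mem_map *map;

	map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &ops, NULL);
	if (map == NULL) {
		return;
	}

	/* ... spdk_mem_map_set_translation() / spdk_mem_map_translate() ... */

	spdk_mem_map_free(&map);
}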
297 :
298 : void
299 0 : spdk_mem_map_free(struct spdk_mem_map **pmap)
300 : {
301 : struct spdk_mem_map *map;
302 : size_t i;
303 :
304 0 : if (!pmap) {
305 0 : return;
306 : }
307 :
308 0 : map = *pmap;
309 :
310 0 : if (!map) {
311 0 : return;
312 : }
313 :
314 0 : if (map->ops.notify_cb) {
315 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
316 0 : mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER);
317 0 : TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq);
318 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
319 : }
320 :
321 0 : for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) {
322 0 : free(map->map_256tb.map[i]);
323 : }
324 :
325 0 : pthread_mutex_destroy(&map->mutex);
326 :
327 0 : free(map);
328 0 : *pmap = NULL;
329 : }
330 :
331 : int
332 0 : spdk_mem_register(void *vaddr, size_t len)
333 : {
334 : struct spdk_mem_map *map;
335 : int rc;
336 : void *seg_vaddr;
337 : size_t seg_len;
338 : uint64_t reg;
339 :
340 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
341 0 : DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
342 0 : return -EINVAL;
343 : }
344 :
345 0 : if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
346 0 : DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
347 : __func__, vaddr, len);
348 0 : return -EINVAL;
349 : }
350 :
351 0 : if (len == 0) {
352 0 : return 0;
353 : }
354 :
355 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
356 :
357 0 : seg_vaddr = vaddr;
358 0 : seg_len = len;
359 0 : while (seg_len > 0) {
360 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
361 0 : if (reg & REG_MAP_REGISTERED) {
362 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
363 0 : return -EBUSY;
364 : }
365 0 : seg_vaddr += VALUE_2MB;
366 0 : seg_len -= VALUE_2MB;
367 : }
368 :
369 0 : seg_vaddr = vaddr;
370 0 : seg_len = 0;
371 0 : while (len > 0) {
372 0 : spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB,
373 : seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED);
374 0 : seg_len += VALUE_2MB;
375 0 : vaddr += VALUE_2MB;
376 0 : len -= VALUE_2MB;
377 : }
378 :
379 0 : TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
380 0 : rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len);
381 0 : if (rc != 0) {
382 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
383 0 : return rc;
384 : }
385 : }
386 :
387 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
388 0 : return 0;
389 : }
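
/*
 * Usage sketch (hypothetical caller, not part of the upstream file):
 * spdk_mem_register() only accepts 2MB-aligned ranges, so anonymous memory is
 * typically allocated with that alignment first. SPDK_ALIGN_CEIL() is assumed
 * to be available from spdk/util.h via the includes above.
 */
static void *
example_register_buffer(size_t len)
{
	void *buf = NULL;

	len = SPDK_ALIGN_CEIL(len, VALUE_2MB);
	if (posix_memalign(&buf, VALUE_2MB, len) != 0) {
		return NULL;
	}

	if (spdk_mem_register(buf, len) != 0) {
		free(buf);
		return NULL;
	}

	/* ... once done: spdk_mem_unregister(buf, len); free(buf); */
	return buf;
}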
390 :
391 : int
392 0 : spdk_mem_unregister(void *vaddr, size_t len)
393 : {
394 : struct spdk_mem_map *map;
395 : int rc;
396 : void *seg_vaddr;
397 : size_t seg_len;
398 : uint64_t reg, newreg;
399 :
400 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
401 0 : DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
402 0 : return -EINVAL;
403 : }
404 :
405 0 : if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
406 0 : DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
407 : __func__, vaddr, len);
408 0 : return -EINVAL;
409 : }
410 :
411 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
412 :
413 : /* The first page must be a start of a region. Also check if it's
414 : * registered to make sure we don't return -ERANGE for non-registered
415 : * regions.
416 : */
417 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
418 0 : if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) {
419 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
420 0 : return -ERANGE;
421 : }
422 :
423 0 : seg_vaddr = vaddr;
424 0 : seg_len = len;
425 0 : while (seg_len > 0) {
426 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
427 0 : if ((reg & REG_MAP_REGISTERED) == 0) {
428 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
429 0 : return -EINVAL;
430 : }
431 0 : seg_vaddr += VALUE_2MB;
432 0 : seg_len -= VALUE_2MB;
433 : }
434 :
435 0 : newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
436 : /* If the next page is registered, it must be a start of a region as well,
437 : * otherwise we'd be unregistering only a part of a region.
438 : */
439 0 : if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) {
440 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
441 0 : return -ERANGE;
442 : }
443 0 : seg_vaddr = vaddr;
444 0 : seg_len = 0;
445 :
446 0 : while (len > 0) {
447 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL);
448 0 : spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0);
449 :
450 0 : if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) {
451 0 : TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
452 0 : rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
453 0 : if (rc != 0) {
454 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
455 0 : return rc;
456 : }
457 : }
458 :
459 0 : seg_vaddr = vaddr;
460 0 : seg_len = VALUE_2MB;
461 : } else {
462 0 : seg_len += VALUE_2MB;
463 : }
464 :
465 0 : vaddr += VALUE_2MB;
466 0 : len -= VALUE_2MB;
467 : }
468 :
469 0 : if (seg_len > 0) {
470 0 : TAILQ_FOREACH_REVERSE(map, &g_spdk_mem_maps, spdk_mem_map_head, tailq) {
471 0 : rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len);
472 0 : if (rc != 0) {
473 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
474 0 : return rc;
475 : }
476 : }
477 : }
478 :
479 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
480 0 : return 0;
481 : }
482 :
483 : int
484 0 : spdk_mem_reserve(void *vaddr, size_t len)
485 : {
486 : struct spdk_mem_map *map;
487 : void *seg_vaddr;
488 : size_t seg_len;
489 : uint64_t reg;
490 :
491 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
492 0 : DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
493 0 : return -EINVAL;
494 : }
495 :
496 0 : if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
497 0 : DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n",
498 : __func__, vaddr, len);
499 0 : return -EINVAL;
500 : }
501 :
502 0 : if (len == 0) {
503 0 : return 0;
504 : }
505 :
506 0 : pthread_mutex_lock(&g_spdk_mem_map_mutex);
507 :
508 : /* Check if any part of this range is already registered */
509 0 : seg_vaddr = vaddr;
510 0 : seg_len = len;
511 0 : while (seg_len > 0) {
512 0 : reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL);
513 0 : if (reg & REG_MAP_REGISTERED) {
514 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
515 0 : return -EBUSY;
516 : }
517 0 : seg_vaddr += VALUE_2MB;
518 0 : seg_len -= VALUE_2MB;
519 : }
520 :
521 : /* Simply set the translation to the memory map's default. This allocates the space in the
522 : * map but does not provide a valid translation. */
523 0 : spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, len,
524 0 : g_mem_reg_map->default_translation);
525 :
526 0 : TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) {
527 0 : spdk_mem_map_set_translation(map, (uint64_t)vaddr, len, map->default_translation);
528 : }
529 :
530 0 : pthread_mutex_unlock(&g_spdk_mem_map_mutex);
531 0 : return 0;
532 : }
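
/*
 * Usage sketch (hypothetical caller, not part of the upstream file): reserve
 * a 2MB-aligned virtual region so that every mem map pre-allocates entries
 * for it without issuing notify callbacks; sub-ranges can later be registered
 * with spdk_mem_register() once they are actually backed. Assumes mmap()
 * comes in through spdk/stdinc.h; the over-allocation only exists to
 * guarantee 2MB alignment of the mapped region.
 */
static void *
example_reserve_region(size_t len)
{
	void *raw, *region;

	len = SPDK_ALIGN_CEIL(len, VALUE_2MB);

	/* Over-allocate so a 2MB-aligned window can be carved out. A real
	 * caller would also remember 'raw' in order to munmap() it later. */
	raw = mmap(NULL, len + VALUE_2MB, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED) {
		return NULL;
	}

	region = (void *)SPDK_ALIGN_CEIL((uintptr_t)raw, VALUE_2MB);
	if (spdk_mem_reserve(region, len) != 0) {
		munmap(raw, len + VALUE_2MB);
		return NULL;
	}

	return region;
}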
533 :
534 : static struct map_1gb *
535 0 : mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb)
536 : {
537 : struct map_1gb *map_1gb;
538 0 : uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb);
539 : size_t i;
540 :
541 0 : if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) {
542 0 : return NULL;
543 : }
544 :
545 0 : map_1gb = map->map_256tb.map[idx_256tb];
546 :
547 0 : if (!map_1gb) {
548 0 : pthread_mutex_lock(&map->mutex);
549 :
550 : /* Recheck to make sure nobody else got the mutex first. */
551 0 : map_1gb = map->map_256tb.map[idx_256tb];
552 0 : if (!map_1gb) {
553 0 : map_1gb = malloc(sizeof(struct map_1gb));
554 0 : if (map_1gb) {
555 : /* initialize all entries to default translation */
556 0 : for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) {
557 0 : map_1gb->map[i].translation_2mb = map->default_translation;
558 : }
559 0 : map->map_256tb.map[idx_256tb] = map_1gb;
560 : }
561 : }
562 :
563 0 : pthread_mutex_unlock(&map->mutex);
564 :
565 0 : if (!map_1gb) {
566 0 : DEBUG_PRINT("allocation failed\n");
567 0 : return NULL;
568 : }
569 : }
570 :
571 0 : return map_1gb;
572 : }
573 :
574 : int
575 0 : spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size,
576 : uint64_t translation)
577 : {
578 : uint64_t vfn_2mb;
579 : struct map_1gb *map_1gb;
580 : uint64_t idx_1gb;
581 : struct map_2mb *map_2mb;
582 :
583 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
584 0 : DEBUG_PRINT("invalid usermode virtual address %" PRIu64 "\n", vaddr);
585 0 : return -EINVAL;
586 : }
587 :
588 : /* For now, only 2 MB-aligned registrations are supported */
589 0 : if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) {
590 0 : DEBUG_PRINT("invalid %s parameters, vaddr=%" PRIu64 " len=%" PRIu64 "\n",
591 : __func__, vaddr, size);
592 0 : return -EINVAL;
593 : }
594 :
595 0 : vfn_2mb = vaddr >> SHIFT_2MB;
596 :
597 0 : while (size) {
598 0 : map_1gb = mem_map_get_map_1gb(map, vfn_2mb);
599 0 : if (!map_1gb) {
600 0 : DEBUG_PRINT("could not get %p map\n", (void *)vaddr);
601 0 : return -ENOMEM;
602 : }
603 :
604 0 : idx_1gb = MAP_1GB_IDX(vfn_2mb);
605 0 : map_2mb = &map_1gb->map[idx_1gb];
606 0 : map_2mb->translation_2mb = translation;
607 :
608 0 : size -= VALUE_2MB;
609 0 : vfn_2mb++;
610 : }
611 :
612 0 : return 0;
613 : }
614 :
615 : int
616 0 : spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size)
617 : {
618 0 : return spdk_mem_map_set_translation(map, vaddr, size, map->default_translation);
619 : }
620 :
621 : inline uint64_t
622 0 : spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size)
623 : {
624 : const struct map_1gb *map_1gb;
625 : const struct map_2mb *map_2mb;
626 : uint64_t idx_256tb;
627 : uint64_t idx_1gb;
628 : uint64_t vfn_2mb;
629 : uint64_t cur_size;
630 : uint64_t prev_translation;
631 : uint64_t orig_translation;
632 :
633 0 : if (spdk_unlikely(vaddr & ~MASK_256TB)) {
634 0 : DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr);
635 0 : return map->default_translation;
636 : }
637 :
638 0 : vfn_2mb = vaddr >> SHIFT_2MB;
639 0 : idx_256tb = MAP_256TB_IDX(vfn_2mb);
640 0 : idx_1gb = MAP_1GB_IDX(vfn_2mb);
641 :
642 0 : map_1gb = map->map_256tb.map[idx_256tb];
643 0 : if (spdk_unlikely(!map_1gb)) {
644 0 : return map->default_translation;
645 : }
646 :
647 0 : cur_size = VALUE_2MB - _2MB_OFFSET(vaddr);
648 0 : map_2mb = &map_1gb->map[idx_1gb];
649 0 : if (size == NULL || map->ops.are_contiguous == NULL ||
650 0 : map_2mb->translation_2mb == map->default_translation) {
651 0 : if (size != NULL) {
652 0 : *size = spdk_min(*size, cur_size);
653 : }
654 0 : return map_2mb->translation_2mb;
655 : }
656 :
657 0 : orig_translation = map_2mb->translation_2mb;
658 0 : prev_translation = orig_translation;
659 0 : while (cur_size < *size) {
660 0 : vfn_2mb++;
661 0 : idx_256tb = MAP_256TB_IDX(vfn_2mb);
662 0 : idx_1gb = MAP_1GB_IDX(vfn_2mb);
663 :
664 0 : map_1gb = map->map_256tb.map[idx_256tb];
665 0 : if (spdk_unlikely(!map_1gb)) {
666 0 : break;
667 : }
668 :
669 0 : map_2mb = &map_1gb->map[idx_1gb];
670 0 : if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) {
671 0 : break;
672 : }
673 :
674 0 : cur_size += VALUE_2MB;
675 0 : prev_translation = map_2mb->translation_2mb;
676 : }
677 :
678 0 : *size = spdk_min(*size, cur_size);
679 0 : return orig_translation;
680 : }
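
/*
 * Usage sketch (hypothetical caller, not part of the upstream file): the
 * optional size argument is an in/out parameter. On input it holds the number
 * of bytes the caller cares about; on return it is clamped to the run
 * starting at vaddr that the map's are_contiguous() callback (if any)
 * considers one contiguous translation.
 */
static void
example_translate_run(struct spdk_mem_map *map, void *vaddr)
{
	uint64_t len = 1ULL << 30;	/* ask about up to 1GB */
	uint64_t translation;

	translation = spdk_mem_map_translate(map, (uint64_t)vaddr, &len);

	/* 'translation' applies to the first 2MB page at vaddr and, per the
	 * clamped 'len', to every following page in the contiguous run. */
	SPDK_NOTICELOG("translation 0x%" PRIx64 " covers %" PRIu64 " bytes\n", translation, len);
}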
681 :
682 : static void
683 0 : memory_hotplug_cb(enum rte_mem_event event_type,
684 : const void *addr, size_t len, void *arg)
685 : {
686 0 : if (event_type == RTE_MEM_EVENT_ALLOC) {
687 0 : spdk_mem_register((void *)addr, len);
688 :
689 0 : if (!spdk_env_dpdk_external_init()) {
690 0 : return;
691 : }
692 :
693 : /* When the user initialized DPDK separately, we can't
694 : * be sure that the --match-allocations RTE flag was specified.
695 : * Without this flag, DPDK can free memory in different units
696 : * than it was allocated in, which doesn't work with things like RDMA MRs.
697 : *
698 : * For such cases, we mark segments so they aren't freed.
699 : */
700 0 : while (len > 0) {
701 : struct rte_memseg *seg;
702 :
703 0 : seg = rte_mem_virt2memseg(addr, NULL);
704 0 : assert(seg != NULL);
705 0 : seg->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
706 0 : addr = (void *)((uintptr_t)addr + seg->hugepage_sz);
707 0 : len -= seg->hugepage_sz;
708 : }
709 0 : } else if (event_type == RTE_MEM_EVENT_FREE) {
710 0 : spdk_mem_unregister((void *)addr, len);
711 : }
712 : }
713 :
714 : static int
715 0 : memory_iter_cb(const struct rte_memseg_list *msl,
716 : const struct rte_memseg *ms, size_t len, void *arg)
717 : {
718 0 : return spdk_mem_register(ms->addr, len);
719 : }
720 :
721 : int
722 0 : mem_map_init(bool legacy_mem)
723 : {
724 0 : g_legacy_mem = legacy_mem;
725 :
726 0 : g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL);
727 0 : if (g_mem_reg_map == NULL) {
728 0 : DEBUG_PRINT("memory registration map allocation failed\n");
729 0 : return -ENOMEM;
730 : }
731 :
732 : /*
733 : * Walk all DPDK memory segments and register them
734 : * with the main memory map
735 : */
736 0 : if (g_huge_pages) {
737 0 : rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL);
738 0 : rte_memseg_contig_walk(memory_iter_cb, NULL);
739 : }
740 0 : return 0;
741 : }
742 :
743 : bool
744 0 : spdk_iommu_is_enabled(void)
745 : {
746 : #if VFIO_ENABLED
747 0 : return g_vfio.enabled && !g_vfio.noiommu_enabled;
748 : #else
749 : return false;
750 : #endif
751 : }
752 :
753 : struct spdk_vtophys_pci_device {
754 : struct rte_pci_device *pci_device;
755 : TAILQ_ENTRY(spdk_vtophys_pci_device) tailq;
756 : };
757 :
758 : static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER;
759 : static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices =
760 : TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices);
761 :
762 : static struct spdk_mem_map *g_vtophys_map;
763 : static struct spdk_mem_map *g_phys_ref_map;
764 :
765 : #if VFIO_ENABLED
766 : static int
767 0 : _vfio_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
768 : {
769 : struct spdk_vfio_dma_map *dma_map;
770 : int ret;
771 :
772 0 : dma_map = calloc(1, sizeof(*dma_map));
773 0 : if (dma_map == NULL) {
774 0 : return -ENOMEM;
775 : }
776 :
777 0 : dma_map->map.argsz = sizeof(dma_map->map);
778 0 : dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
779 0 : dma_map->map.vaddr = vaddr;
780 0 : dma_map->map.iova = iova;
781 0 : dma_map->map.size = size;
782 :
783 0 : if (g_vfio.device_ref == 0) {
784 : /* VFIO requires at least one device (IOMMU group) to be added to
785 : * a VFIO container before it is possible to perform any IOMMU
786 : * operations on that container. This memory will be mapped once
787 : * the first device (IOMMU group) is hotplugged.
788 : *
789 : * Since the vfio container is managed internally by DPDK, it is
790 : * also possible that some device is already in that container, but
791 : * it's not managed by SPDK - e.g. an NIC attached internally
792 : * inside DPDK. We could map the memory straight away in such a
793 : * scenario, but there's no need to do it. DPDK devices clearly
794 : * don't need our mappings and hence we defer the mapping
795 : * unconditionally until the first SPDK-managed device is
796 : * hotplugged.
797 : */
798 0 : goto out_insert;
799 : }
800 :
801 0 : ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
802 0 : if (ret) {
803 : /* There are cases where the vfio container doesn't have an IOMMU group; ignoring the error is safe in that case */
804 0 : SPDK_NOTICELOG("Cannot set up DMA mapping, error %d, ignored\n", errno);
805 : }
806 :
807 0 : out_insert:
808 0 : TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq);
809 0 : return 0;
810 : }
811 :
812 :
813 : static int
814 0 : vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size)
815 : {
816 : uint64_t refcount;
817 : int ret;
818 :
819 0 : refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
820 0 : assert(refcount < UINT64_MAX);
821 0 : if (refcount > 0) {
822 0 : spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
823 0 : return 0;
824 : }
825 :
826 0 : pthread_mutex_lock(&g_vfio.mutex);
827 0 : ret = _vfio_iommu_map_dma(vaddr, iova, size);
828 0 : pthread_mutex_unlock(&g_vfio.mutex);
829 0 : if (ret) {
830 0 : return ret;
831 : }
832 :
833 0 : spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount + 1);
834 0 : return 0;
835 : }
836 :
837 : int
838 0 : vtophys_iommu_map_dma_bar(uint64_t vaddr, uint64_t iova, uint64_t size)
839 : {
840 : int ret;
841 :
842 0 : pthread_mutex_lock(&g_vfio.mutex);
843 0 : ret = _vfio_iommu_map_dma(vaddr, iova, size);
844 0 : pthread_mutex_unlock(&g_vfio.mutex);
845 :
846 0 : return ret;
847 : }
848 :
849 : static int
850 0 : _vfio_iommu_unmap_dma(struct spdk_vfio_dma_map *dma_map)
851 : {
852 0 : struct vfio_iommu_type1_dma_unmap unmap = {};
853 : int ret;
854 :
855 0 : if (g_vfio.device_ref == 0) {
856 : /* Memory is not mapped anymore, just remove its references */
857 0 : goto out_remove;
858 : }
859 :
860 0 : unmap.argsz = sizeof(unmap);
861 0 : unmap.flags = 0;
862 0 : unmap.iova = dma_map->map.iova;
863 0 : unmap.size = dma_map->map.size;
864 0 : ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
865 0 : if (ret) {
866 0 : SPDK_NOTICELOG("Cannot clear DMA mapping, error %d, ignored\n", errno);
867 : }
868 :
869 0 : out_remove:
870 0 : TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq);
871 0 : free(dma_map);
872 0 : return 0;
873 : }
874 :
875 : static int
876 0 : vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size)
877 : {
878 : struct spdk_vfio_dma_map *dma_map;
879 : uint64_t refcount;
880 : int ret;
881 :
882 0 : pthread_mutex_lock(&g_vfio.mutex);
883 0 : TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
884 0 : if (dma_map->map.iova == iova) {
885 0 : break;
886 : }
887 : }
888 :
889 0 : if (dma_map == NULL) {
890 0 : DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova);
891 0 : pthread_mutex_unlock(&g_vfio.mutex);
892 0 : return -ENXIO;
893 : }
894 :
895 0 : refcount = spdk_mem_map_translate(g_phys_ref_map, iova, NULL);
896 0 : assert(refcount < UINT64_MAX);
897 0 : if (refcount > 0) {
898 0 : spdk_mem_map_set_translation(g_phys_ref_map, iova, size, refcount - 1);
899 : }
900 :
901 : /* We still have outstanding references, don't clear it. */
902 0 : if (refcount > 1) {
903 0 : pthread_mutex_unlock(&g_vfio.mutex);
904 0 : return 0;
905 : }
906 :
907 : /** don't support partial or multiple-page unmap for now */
908 0 : assert(dma_map->map.size == size);
909 :
910 0 : ret = _vfio_iommu_unmap_dma(dma_map);
911 0 : pthread_mutex_unlock(&g_vfio.mutex);
912 :
913 0 : return ret;
914 : }
915 :
916 : int
917 0 : vtophys_iommu_unmap_dma_bar(uint64_t vaddr)
918 : {
919 : struct spdk_vfio_dma_map *dma_map;
920 : int ret;
921 :
922 0 : pthread_mutex_lock(&g_vfio.mutex);
923 0 : TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
924 0 : if (dma_map->map.vaddr == vaddr) {
925 0 : break;
926 : }
927 : }
928 :
929 0 : if (dma_map == NULL) {
930 0 : DEBUG_PRINT("Cannot clear DMA mapping for address %"PRIx64" - it's not mapped\n", vaddr);
931 0 : pthread_mutex_unlock(&g_vfio.mutex);
932 0 : return -ENXIO;
933 : }
934 :
935 0 : ret = _vfio_iommu_unmap_dma(dma_map);
936 0 : pthread_mutex_unlock(&g_vfio.mutex);
937 0 : return ret;
938 : }
939 : #endif
940 :
941 : static uint64_t
942 0 : vtophys_get_paddr_memseg(uint64_t vaddr)
943 : {
944 : uintptr_t paddr;
945 : struct rte_memseg *seg;
946 :
947 0 : seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL);
948 0 : if (seg != NULL) {
949 0 : paddr = seg->iova;
950 0 : if (paddr == RTE_BAD_IOVA) {
951 0 : return SPDK_VTOPHYS_ERROR;
952 : }
953 0 : paddr += (vaddr - (uintptr_t)seg->addr);
954 0 : return paddr;
955 : }
956 :
957 0 : return SPDK_VTOPHYS_ERROR;
958 : }
959 :
960 : /* Try to get the paddr from /proc/self/pagemap */
961 : static uint64_t
962 0 : vtophys_get_paddr_pagemap(uint64_t vaddr)
963 : {
964 : uintptr_t paddr;
965 :
966 : /* Silence static analyzers */
967 0 : assert(vaddr != 0);
968 0 : paddr = rte_mem_virt2iova((void *)vaddr);
969 0 : if (paddr == RTE_BAD_IOVA) {
970 : /*
971 : * The vaddr may be valid but doesn't have a backing page
972 : * assigned yet. Touch the page to ensure a backing page
973 : * gets assigned, then try to translate again.
974 : */
975 0 : rte_atomic64_read((rte_atomic64_t *)vaddr);
976 0 : paddr = rte_mem_virt2iova((void *)vaddr);
977 : }
978 0 : if (paddr == RTE_BAD_IOVA) {
979 : /* Unable to get to the physical address. */
980 0 : return SPDK_VTOPHYS_ERROR;
981 : }
982 :
983 0 : return paddr;
984 : }
985 :
986 : static uint64_t
987 0 : pci_device_vtophys(struct rte_pci_device *dev, uint64_t vaddr, size_t len)
988 : {
989 : struct rte_mem_resource *res;
990 : uint64_t paddr;
991 : unsigned r;
992 :
993 0 : for (r = 0; r < PCI_MAX_RESOURCE; r++) {
994 0 : res = dpdk_pci_device_get_mem_resource(dev, r);
995 :
996 0 : if (res->phys_addr == 0 || vaddr < (uint64_t)res->addr ||
997 0 : (vaddr + len) >= (uint64_t)res->addr + res->len) {
998 0 : continue;
999 : }
1000 :
1001 : #if VFIO_ENABLED
1002 0 : if (spdk_iommu_is_enabled() && rte_eal_iova_mode() == RTE_IOVA_VA) {
1003 : /*
1004 : * The IOMMU is on and we're using IOVA == VA. The BAR was
1005 : * automatically registered when it was mapped, so just return
1006 : * the virtual address here.
1007 : */
1008 0 : return vaddr;
1009 : }
1010 : #endif
1011 0 : paddr = res->phys_addr + (vaddr - (uint64_t)res->addr);
1012 0 : return paddr;
1013 : }
1014 :
1015 0 : return SPDK_VTOPHYS_ERROR;
1016 : }
1017 :
1018 : /* Try to get the paddr from pci devices */
1019 : static uint64_t
1020 0 : vtophys_get_paddr_pci(uint64_t vaddr, size_t len)
1021 : {
1022 : struct spdk_vtophys_pci_device *vtophys_dev;
1023 : uintptr_t paddr;
1024 : struct rte_pci_device *dev;
1025 :
1026 0 : pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1027 0 : TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1028 0 : dev = vtophys_dev->pci_device;
1029 0 : paddr = pci_device_vtophys(dev, vaddr, len);
1030 0 : if (paddr != SPDK_VTOPHYS_ERROR) {
1031 0 : pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1032 0 : return paddr;
1033 : }
1034 : }
1035 0 : pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1036 :
1037 0 : return SPDK_VTOPHYS_ERROR;
1038 : }
1039 :
1040 : static int
1041 0 : vtophys_notify(void *cb_ctx, struct spdk_mem_map *map,
1042 : enum spdk_mem_map_notify_action action,
1043 : void *vaddr, size_t len)
1044 : {
1045 0 : int rc = 0;
1046 : uint64_t paddr;
1047 :
1048 0 : if ((uintptr_t)vaddr & ~MASK_256TB) {
1049 0 : DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr);
1050 0 : return -EINVAL;
1051 : }
1052 :
1053 0 : if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) {
1054 0 : DEBUG_PRINT("invalid parameters, vaddr=%p len=%ju\n",
1055 : vaddr, len);
1056 0 : return -EINVAL;
1057 : }
1058 :
1059 : /* Get the physical address from the DPDK memsegs */
1060 0 : paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1061 :
1062 0 : switch (action) {
1063 0 : case SPDK_MEM_MAP_NOTIFY_REGISTER:
1064 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1065 : /* This is not an address that DPDK is managing. */
1066 :
1067 : /* Check if this is a PCI BAR. They need special handling */
1068 0 : paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1069 0 : if (paddr != SPDK_VTOPHYS_ERROR) {
1070 : /* Get paddr for each 2MB chunk in this address range */
1071 0 : while (len > 0) {
1072 0 : paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1073 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1074 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1075 0 : return -EFAULT;
1076 : }
1077 :
1078 0 : rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1079 0 : if (rc != 0) {
1080 0 : return rc;
1081 : }
1082 :
1083 0 : vaddr += VALUE_2MB;
1084 0 : len -= VALUE_2MB;
1085 : }
1086 :
1087 0 : return 0;
1088 : }
1089 :
1090 : #if VFIO_ENABLED
1091 : enum rte_iova_mode iova_mode;
1092 :
1093 0 : iova_mode = rte_eal_iova_mode();
1094 :
1095 0 : if (spdk_iommu_is_enabled() && iova_mode == RTE_IOVA_VA) {
1096 : /* We'll use the virtual address as the iova to match DPDK. */
1097 0 : paddr = (uint64_t)vaddr;
1098 0 : rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, len);
1099 0 : if (rc) {
1100 0 : return -EFAULT;
1101 : }
1102 0 : while (len > 0) {
1103 0 : rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1104 0 : if (rc != 0) {
1105 0 : return rc;
1106 : }
1107 0 : vaddr += VALUE_2MB;
1108 0 : paddr += VALUE_2MB;
1109 0 : len -= VALUE_2MB;
1110 : }
1111 : } else
1112 : #endif
1113 : {
1114 : /* Get the physical address from /proc/self/pagemap. */
1115 0 : paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1116 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1117 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1118 0 : return -EFAULT;
1119 : }
1120 :
1121 : /* Get paddr for each 2MB chunk in this address range */
1122 0 : while (len > 0) {
1123 : /* Get the physical address from /proc/self/pagemap. */
1124 0 : paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr);
1125 :
1126 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1127 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1128 0 : return -EFAULT;
1129 : }
1130 :
1131 0 : if (paddr & MASK_2MB) {
1132 0 : DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr);
1133 0 : return -EINVAL;
1134 : }
1135 : #if VFIO_ENABLED
1136 : /* If the IOMMU is on, but DPDK is using iova-mode=pa, we want to register this memory
1137 : * with the IOMMU using the physical address, so the IOVA matches the paddr. */
1138 0 : if (spdk_iommu_is_enabled()) {
1139 0 : rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB);
1140 0 : if (rc) {
1141 0 : DEBUG_PRINT("Unable to assign vaddr %p to paddr 0x%" PRIx64 "\n", vaddr, paddr);
1142 0 : return -EFAULT;
1143 : }
1144 : }
1145 : #endif
1146 :
1147 0 : rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1148 0 : if (rc != 0) {
1149 0 : return rc;
1150 : }
1151 :
1152 0 : vaddr += VALUE_2MB;
1153 0 : len -= VALUE_2MB;
1154 : }
1155 : }
1156 : } else {
1157 : /* This is an address managed by DPDK. Just set up the translations. */
1158 0 : while (len > 0) {
1159 0 : paddr = vtophys_get_paddr_memseg((uint64_t)vaddr);
1160 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1161 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1162 0 : return -EFAULT;
1163 : }
1164 :
1165 0 : rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr);
1166 0 : if (rc != 0) {
1167 0 : return rc;
1168 : }
1169 :
1170 0 : vaddr += VALUE_2MB;
1171 0 : len -= VALUE_2MB;
1172 : }
1173 : }
1174 :
1175 0 : break;
1176 0 : case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
1177 : #if VFIO_ENABLED
1178 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1179 : /*
1180 : * This is not an address that DPDK is managing.
1181 : */
1182 :
1183 : /* Check if this is a PCI BAR. They need special handling */
1184 0 : paddr = vtophys_get_paddr_pci((uint64_t)vaddr, len);
1185 0 : if (paddr != SPDK_VTOPHYS_ERROR) {
1186 : /* Get paddr for each 2MB chunk in this address range */
1187 0 : while (len > 0) {
1188 0 : paddr = vtophys_get_paddr_pci((uint64_t)vaddr, VALUE_2MB);
1189 0 : if (paddr == SPDK_VTOPHYS_ERROR) {
1190 0 : DEBUG_PRINT("could not get phys addr for %p\n", vaddr);
1191 0 : return -EFAULT;
1192 : }
1193 :
1194 0 : rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1195 0 : if (rc != 0) {
1196 0 : return rc;
1197 : }
1198 :
1199 0 : vaddr += VALUE_2MB;
1200 0 : len -= VALUE_2MB;
1201 : }
1202 :
1203 0 : return 0;
1204 : }
1205 :
1206 : /* If vfio is enabled,
1207 : * we need to unmap the range from the IOMMU
1208 : */
1209 0 : if (spdk_iommu_is_enabled()) {
1210 0 : uint64_t buffer_len = len;
1211 0 : uint8_t *va = vaddr;
1212 : enum rte_iova_mode iova_mode;
1213 :
1214 0 : iova_mode = rte_eal_iova_mode();
1215 : /*
1216 : * In virtual address mode, the region is contiguous and can be unmapped
1217 : * with a single call.
1218 : */
1219 0 : if (iova_mode == RTE_IOVA_VA) {
1220 0 : paddr = spdk_mem_map_translate(map, (uint64_t)va, &buffer_len);
1221 0 : if (buffer_len != len || paddr != (uintptr_t)va) {
1222 0 : DEBUG_PRINT("Unmapping %p with length %lu failed because "
1223 : "translation had address 0x%" PRIx64 " and length %lu\n",
1224 : va, len, paddr, buffer_len);
1225 0 : return -EINVAL;
1226 : }
1227 0 : rc = vtophys_iommu_unmap_dma(paddr, len);
1228 0 : if (rc) {
1229 0 : DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1230 0 : return -EFAULT;
1231 : }
1232 0 : } else if (iova_mode == RTE_IOVA_PA) {
1233 : /* Get paddr for each 2MB chunk in this address range */
1234 0 : while (buffer_len > 0) {
1235 0 : paddr = spdk_mem_map_translate(map, (uint64_t)va, NULL);
1236 :
1237 0 : if (paddr == SPDK_VTOPHYS_ERROR || buffer_len < VALUE_2MB) {
1238 0 : DEBUG_PRINT("could not get phys addr for %p\n", va);
1239 0 : return -EFAULT;
1240 : }
1241 :
1242 0 : rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB);
1243 0 : if (rc) {
1244 0 : DEBUG_PRINT("Failed to iommu unmap paddr 0x%" PRIx64 "\n", paddr);
1245 0 : return -EFAULT;
1246 : }
1247 :
1248 0 : va += VALUE_2MB;
1249 0 : buffer_len -= VALUE_2MB;
1250 : }
1251 : }
1252 : }
1253 : }
1254 : #endif
1255 0 : while (len > 0) {
1256 0 : rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB);
1257 0 : if (rc != 0) {
1258 0 : return rc;
1259 : }
1260 :
1261 0 : vaddr += VALUE_2MB;
1262 0 : len -= VALUE_2MB;
1263 : }
1264 :
1265 0 : break;
1266 0 : default:
1267 0 : SPDK_UNREACHABLE();
1268 : }
1269 :
1270 0 : return rc;
1271 : }
1272 :
1273 : static int
1274 0 : vtophys_check_contiguous_entries(uint64_t paddr1, uint64_t paddr2)
1275 : {
1276 : /* This function is always called with paddrs for two consecutive
1277 : * 2MB chunks in virtual address space, so those chunks are only
1278 : * physically contiguous if the physical addresses are exactly 2MB apart
1279 : * from each other as well.
1280 : */
1281 0 : return (paddr2 - paddr1 == VALUE_2MB);
1282 : }
1283 :
1284 : #if VFIO_ENABLED
1285 :
1286 : static bool
1287 0 : vfio_enabled(void)
1288 : {
1289 0 : return rte_vfio_is_enabled("vfio_pci");
1290 : }
1291 :
1292 : /* Check if IOMMU is enabled on the system */
1293 : static bool
1294 0 : has_iommu_groups(void)
1295 : {
1296 0 : int count = 0;
1297 0 : DIR *dir = opendir("/sys/kernel/iommu_groups");
1298 :
1299 0 : if (dir == NULL) {
1300 0 : return false;
1301 : }
1302 :
1303 0 : while (count < 3 && readdir(dir) != NULL) {
1304 0 : count++;
1305 : }
1306 :
1307 0 : closedir(dir);
1308 : /* there will always be ./ and ../ entries */
1309 0 : return count > 2;
1310 : }
1311 :
1312 : static bool
1313 0 : vfio_noiommu_enabled(void)
1314 : {
1315 0 : return rte_vfio_noiommu_is_enabled();
1316 : }
1317 :
1318 : static void
1319 0 : vtophys_iommu_init(void)
1320 : {
1321 0 : char proc_fd_path[PATH_MAX + 1];
1322 0 : char link_path[PATH_MAX + 1];
1323 0 : const char vfio_path[] = "/dev/vfio/vfio";
1324 : DIR *dir;
1325 : struct dirent *d;
1326 :
1327 0 : if (!vfio_enabled()) {
1328 0 : return;
1329 : }
1330 :
1331 0 : if (vfio_noiommu_enabled()) {
1332 0 : g_vfio.noiommu_enabled = true;
1333 0 : } else if (!has_iommu_groups()) {
1334 0 : return;
1335 : }
1336 :
1337 0 : dir = opendir("/proc/self/fd");
1338 0 : if (!dir) {
1339 0 : DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno);
1340 0 : return;
1341 : }
1342 :
1343 0 : while ((d = readdir(dir)) != NULL) {
1344 0 : if (d->d_type != DT_LNK) {
1345 0 : continue;
1346 : }
1347 :
1348 0 : snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name);
1349 0 : if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) {
1350 0 : continue;
1351 : }
1352 :
1353 0 : if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) {
1354 0 : sscanf(d->d_name, "%d", &g_vfio.fd);
1355 0 : break;
1356 : }
1357 : }
1358 :
1359 0 : closedir(dir);
1360 :
1361 0 : if (g_vfio.fd < 0) {
1362 0 : DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n");
1363 0 : return;
1364 : }
1365 :
1366 0 : g_vfio.enabled = true;
1367 :
1368 0 : return;
1369 : }
1370 :
1371 : #endif
1372 :
1373 : void
1374 0 : vtophys_pci_device_added(struct rte_pci_device *pci_device)
1375 : {
1376 : struct spdk_vtophys_pci_device *vtophys_dev;
1377 :
1378 0 : pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1379 :
1380 0 : vtophys_dev = calloc(1, sizeof(*vtophys_dev));
1381 0 : if (vtophys_dev) {
1382 0 : vtophys_dev->pci_device = pci_device;
1383 0 : TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq);
1384 : } else {
1385 0 : DEBUG_PRINT("Memory allocation error\n");
1386 : }
1387 0 : pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1388 :
1389 : #if VFIO_ENABLED
1390 : struct spdk_vfio_dma_map *dma_map;
1391 : int ret;
1392 :
1393 0 : if (!g_vfio.enabled) {
1394 0 : return;
1395 : }
1396 :
1397 0 : pthread_mutex_lock(&g_vfio.mutex);
1398 0 : g_vfio.device_ref++;
1399 0 : if (g_vfio.device_ref > 1) {
1400 0 : pthread_mutex_unlock(&g_vfio.mutex);
1401 0 : return;
1402 : }
1403 :
1404 : /* This is the first SPDK device using DPDK vfio. This means that the first
1405 : * IOMMU group might have just been added to the DPDK vfio container.
1406 : * From this point on, the memory can be mapped.
1407 : */
1408 0 : TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1409 0 : ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map);
1410 0 : if (ret) {
1411 0 : DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno);
1412 0 : break;
1413 : }
1414 : }
1415 0 : pthread_mutex_unlock(&g_vfio.mutex);
1416 : #endif
1417 : }
1418 :
1419 : void
1420 0 : vtophys_pci_device_removed(struct rte_pci_device *pci_device)
1421 : {
1422 : struct spdk_vtophys_pci_device *vtophys_dev;
1423 :
1424 0 : pthread_mutex_lock(&g_vtophys_pci_devices_mutex);
1425 0 : TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) {
1426 0 : if (vtophys_dev->pci_device == pci_device) {
1427 0 : TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq);
1428 0 : free(vtophys_dev);
1429 0 : break;
1430 : }
1431 : }
1432 0 : pthread_mutex_unlock(&g_vtophys_pci_devices_mutex);
1433 :
1434 : #if VFIO_ENABLED
1435 : struct spdk_vfio_dma_map *dma_map;
1436 : int ret;
1437 :
1438 0 : if (!g_vfio.enabled) {
1439 0 : return;
1440 : }
1441 :
1442 0 : pthread_mutex_lock(&g_vfio.mutex);
1443 0 : assert(g_vfio.device_ref > 0);
1444 0 : g_vfio.device_ref--;
1445 0 : if (g_vfio.device_ref > 0) {
1446 0 : pthread_mutex_unlock(&g_vfio.mutex);
1447 0 : return;
1448 : }
1449 :
1450 : /* This is the last SPDK device using DPDK vfio. If DPDK doesn't have
1451 : * any additional devices using its vfio container, all the mappings
1452 : * will be automatically removed by the Linux vfio driver. We unmap
1453 : * the memory manually to be able to easily re-map it later regardless
1454 : * of other, external factors.
1455 : */
1456 0 : TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) {
1457 0 : struct vfio_iommu_type1_dma_unmap unmap = {};
1458 0 : unmap.argsz = sizeof(unmap);
1459 0 : unmap.flags = 0;
1460 0 : unmap.iova = dma_map->map.iova;
1461 0 : unmap.size = dma_map->map.size;
1462 0 : ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
1463 0 : if (ret) {
1464 0 : DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno);
1465 0 : break;
1466 : }
1467 : }
1468 0 : pthread_mutex_unlock(&g_vfio.mutex);
1469 : #endif
1470 : }
1471 :
1472 : int
1473 0 : vtophys_init(void)
1474 : {
1475 0 : const struct spdk_mem_map_ops vtophys_map_ops = {
1476 : .notify_cb = vtophys_notify,
1477 : .are_contiguous = vtophys_check_contiguous_entries,
1478 : };
1479 :
1480 0 : const struct spdk_mem_map_ops phys_ref_map_ops = {
1481 : .notify_cb = NULL,
1482 : .are_contiguous = NULL,
1483 : };
1484 :
1485 : #if VFIO_ENABLED
1486 0 : vtophys_iommu_init();
1487 : #endif
1488 :
1489 0 : g_phys_ref_map = spdk_mem_map_alloc(0, &phys_ref_map_ops, NULL);
1490 0 : if (g_phys_ref_map == NULL) {
1491 0 : DEBUG_PRINT("phys_ref map allocation failed.\n");
1492 0 : return -ENOMEM;
1493 : }
1494 :
1495 0 : if (g_huge_pages) {
1496 0 : g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL);
1497 0 : if (g_vtophys_map == NULL) {
1498 0 : DEBUG_PRINT("vtophys map allocation failed\n");
1499 0 : spdk_mem_map_free(&g_phys_ref_map);
1500 0 : return -ENOMEM;
1501 : }
1502 : }
1503 0 : return 0;
1504 : }
1505 :
1506 : uint64_t
1507 0 : spdk_vtophys(const void *buf, uint64_t *size)
1508 : {
1509 : uint64_t vaddr, paddr_2mb;
1510 :
1511 0 : if (!g_huge_pages) {
1512 0 : return SPDK_VTOPHYS_ERROR;
1513 : }
1514 :
1515 0 : vaddr = (uint64_t)buf;
1516 0 : paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, size);
1517 :
1518 : /*
1519 : * SPDK_VTOPHYS_ERROR has all bits set. When the buffer offset was merged in with a
1520 : * bitwise OR, an error lookup still produced SPDK_VTOPHYS_ERROR after the merge.
1521 : * Now that we add the offset instead (because PCI BAR translations may be unaligned),
1522 : * we must check the return value before the addition.
1523 : */
1524 : SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s");
1525 0 : if (paddr_2mb == SPDK_VTOPHYS_ERROR) {
1526 0 : return SPDK_VTOPHYS_ERROR;
1527 : } else {
1528 0 : return paddr_2mb + (vaddr & MASK_2MB);
1529 : }
1530 : }
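
/*
 * Usage sketch (hypothetical caller, not part of the upstream file):
 * translate a DMA-safe buffer obtained from the env allocator.
 * spdk_dma_zmalloc() and spdk_dma_free() are the public spdk/env.h
 * allocation helpers, assumed available via the includes above.
 */
static void
example_vtophys(void)
{
	uint64_t phys, len = 4096;
	void *buf;

	buf = spdk_dma_zmalloc(4096, 0x1000, NULL);
	if (buf == NULL) {
		return;
	}

	phys = spdk_vtophys(buf, &len);
	if (phys == SPDK_VTOPHYS_ERROR) {
		SPDK_ERRLOG("no vtophys translation for %p\n", buf);
	} else {
		/* 'len' is clamped to the physically contiguous run at buf. */
		SPDK_NOTICELOG("%p -> 0x%" PRIx64 " (%" PRIu64 " contiguous bytes)\n",
			       buf, phys, len);
	}

	spdk_dma_free(buf);
}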
1531 :
1532 : int
1533 0 : spdk_mem_get_fd_and_offset(void *vaddr, uint64_t *offset)
1534 : {
1535 : struct rte_memseg *seg;
1536 : int ret, fd;
1537 :
1538 0 : seg = rte_mem_virt2memseg(vaddr, NULL);
1539 0 : if (!seg) {
1540 0 : SPDK_ERRLOG("memory %p doesn't exist\n", vaddr);
1541 0 : return -ENOENT;
1542 : }
1543 :
1544 0 : fd = rte_memseg_get_fd_thread_unsafe(seg);
1545 0 : if (fd < 0) {
1546 0 : return fd;
1547 : }
1548 :
1549 0 : ret = rte_memseg_get_fd_offset_thread_unsafe(seg, offset);
1550 0 : if (ret < 0) {
1551 0 : return ret;
1552 : }
1553 :
1554 0 : return fd;
1555 : }
1556 :
1557 : void
1558 0 : mem_disable_huge_pages(void)
1559 : {
1560 0 : g_huge_pages = false;
1561 0 : }