dpdk/drivers/net/mlx4/mlx4_mr.c
   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright 2017 6WIND S.A.
   3 * Copyright 2017 Mellanox Technologies, Ltd
   4 */
   5
   6/**
   7 * @file
   8 * Memory management functions for mlx4 driver.
   9 */
  10
  11#include <errno.h>
  12#include <inttypes.h>
  13#include <stddef.h>
  14#include <stdint.h>
  15#include <string.h>
  16
  17/* Verbs headers do not support -pedantic. */
  18#ifdef PEDANTIC
  19#pragma GCC diagnostic ignored "-Wpedantic"
  20#endif
  21#include <infiniband/verbs.h>
  22#ifdef PEDANTIC
  23#pragma GCC diagnostic error "-Wpedantic"
  24#endif
  25
  26#include <rte_branch_prediction.h>
  27#include <rte_common.h>
  28#include <rte_eal_memconfig.h>
  29#include <rte_errno.h>
  30#include <rte_malloc.h>
  31#include <rte_memory.h>
  32#include <rte_mempool.h>
  33#include <rte_rwlock.h>
  34
  35#include "mlx4_glue.h"
  36#include "mlx4_mr.h"
  37#include "mlx4_rxtx.h"
  38#include "mlx4_utils.h"
  39
  40struct mr_find_contig_memsegs_data {
  41        uintptr_t addr;
  42        uintptr_t start;
  43        uintptr_t end;
  44        const struct rte_memseg_list *msl;
  45};
  46
  47struct mr_update_mp_data {
  48        struct rte_eth_dev *dev;
  49        struct mlx4_mr_ctrl *mr_ctrl;
  50        int ret;
  51};
  52
  53/**
   54 * Expand B-tree table to a given size. Must not be called while holding
   55 * memory_hotplug_lock or priv->mr.rwlock because rte_realloc() is used.
  56 *
  57 * @param bt
  58 *   Pointer to B-tree structure.
  59 * @param n
  60 *   Number of entries for expansion.
  61 *
  62 * @return
  63 *   0 on success, -1 on failure.
  64 */
  65static int
  66mr_btree_expand(struct mlx4_mr_btree *bt, int n)
  67{
  68        void *mem;
  69        int ret = 0;
  70
  71        if (n <= bt->size)
  72                return ret;
  73        /*
   74         * The downside of directly using rte_realloc() is that SOCKET_ID_ANY
   75         * is used internally if there is no room to expand in place. Since this
   76         * is a rare case on a very slow path, it is acceptable. cache_bh[] is
   77         * initially given enough space, so once it has been expanded, further
   78         * expansion should hardly ever be needed again.
  79         */
  80        mem = rte_realloc(bt->table, n * sizeof(struct mlx4_mr_cache), 0);
  81        if (mem == NULL) {
  82                /* Not an error, B-tree search will be skipped. */
  83                WARN("failed to expand MR B-tree (%p) table", (void *)bt);
  84                ret = -1;
  85        } else {
  86                DEBUG("expanded MR B-tree table (size=%u)", n);
  87                bt->table = mem;
  88                bt->size = n;
  89        }
  90        return ret;
  91}
  92
  93/**
   94 * Look up LKey in the given B-tree lookup table, store the last searched
   95 * index and return the matching LKey.
  96 *
  97 * @param bt
  98 *   Pointer to B-tree structure.
  99 * @param[out] idx
  100 *   Pointer to index. Even on search failure, the index where the search
  101 *   stopped is returned so that it can be used when inserting a new entry.
 102 * @param addr
 103 *   Search key.
 104 *
 105 * @return
 106 *   Searched LKey on success, UINT32_MAX on no match.
 107 */
 108static uint32_t
 109mr_btree_lookup(struct mlx4_mr_btree *bt, uint16_t *idx, uintptr_t addr)
 110{
 111        struct mlx4_mr_cache *lkp_tbl;
 112        uint16_t n;
 113        uint16_t base = 0;
 114
 115        MLX4_ASSERT(bt != NULL);
 116        lkp_tbl = *bt->table;
 117        n = bt->len;
 118        /* First entry must be NULL for comparison. */
 119        MLX4_ASSERT(bt->len > 0 || (lkp_tbl[0].start == 0 &&
 120                                    lkp_tbl[0].lkey == UINT32_MAX));
 121        /* Binary search. */
 122        do {
 123                register uint16_t delta = n >> 1;
 124
 125                if (addr < lkp_tbl[base + delta].start) {
 126                        n = delta;
 127                } else {
 128                        base += delta;
 129                        n -= delta;
 130                }
 131        } while (n > 1);
 132        MLX4_ASSERT(addr >= lkp_tbl[base].start);
 133        *idx = base;
 134        if (addr < lkp_tbl[base].end)
 135                return lkp_tbl[base].lkey;
 136        /* Not found. */
 137        return UINT32_MAX;
 138}
 139
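/*
 * Illustrative sketch (not part of the upstream driver): the lookup above is
 * a plain binary search over a sorted array of [start, end) ranges in which
 * entry 0 is a sentinel with lkey == UINT32_MAX. With a small hypothetical
 * table it behaves as follows:
 *
 *	struct mlx4_mr_cache tbl[] = {
 *		{ .start = 0x0000, .end = 0x0000, .lkey = UINT32_MAX },
 *		{ .start = 0x1000, .end = 0x3000, .lkey = 0x11 },
 *		{ .start = 0x8000, .end = 0x9000, .lkey = 0x22 },
 *	};
 *	struct mlx4_mr_btree bt = { .table = &tbl, .len = 3, .size = 3 };
 *	uint16_t idx;
 *	uint32_t lkey;
 *
 *	lkey = mr_btree_lookup(&bt, &idx, 0x2000); // lkey == 0x11, idx == 1
 *	lkey = mr_btree_lookup(&bt, &idx, 0x5000); // lkey == UINT32_MAX, idx == 1
 *
 * The second call misses but still reports the index of the last entry whose
 * start is below the address, which mr_btree_insert() below uses as the
 * insertion point.
 */
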
 140/**
 141 * Insert an entry to B-tree lookup table.
 142 *
 143 * @param bt
 144 *   Pointer to B-tree structure.
 145 * @param entry
 146 *   Pointer to new entry to insert.
 147 *
 148 * @return
 149 *   0 on success, -1 on failure.
 150 */
 151static int
 152mr_btree_insert(struct mlx4_mr_btree *bt, struct mlx4_mr_cache *entry)
 153{
 154        struct mlx4_mr_cache *lkp_tbl;
 155        uint16_t idx = 0;
 156        size_t shift;
 157
 158        MLX4_ASSERT(bt != NULL);
 159        MLX4_ASSERT(bt->len <= bt->size);
 160        MLX4_ASSERT(bt->len > 0);
 161        lkp_tbl = *bt->table;
 162        /* Find out the slot for insertion. */
 163        if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) {
 164                DEBUG("abort insertion to B-tree(%p): already exist at"
 165                      " idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
 166                      (void *)bt, idx, entry->start, entry->end, entry->lkey);
  167                /* Entry already exists, return. */
 168                return 0;
 169        }
 170        /* If table is full, return error. */
 171        if (unlikely(bt->len == bt->size)) {
 172                bt->overflow = 1;
 173                return -1;
 174        }
 175        /* Insert entry. */
 176        ++idx;
 177        shift = (bt->len - idx) * sizeof(struct mlx4_mr_cache);
 178        if (shift)
 179                memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift);
 180        lkp_tbl[idx] = *entry;
 181        bt->len++;
 182        DEBUG("inserted B-tree(%p)[%u],"
 183              " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
 184              (void *)bt, idx, entry->start, entry->end, entry->lkey);
 185        return 0;
 186}
 187
 188/**
 189 * Initialize B-tree and allocate memory for lookup table.
 190 *
 191 * @param bt
 192 *   Pointer to B-tree structure.
 193 * @param n
 194 *   Number of entries to allocate.
 195 * @param socket
 196 *   NUMA socket on which memory must be allocated.
 197 *
 198 * @return
 199 *   0 on success, a negative errno value otherwise and rte_errno is set.
 200 */
 201int
 202mlx4_mr_btree_init(struct mlx4_mr_btree *bt, int n, int socket)
 203{
 204        if (bt == NULL) {
 205                rte_errno = EINVAL;
 206                return -rte_errno;
 207        }
 208        memset(bt, 0, sizeof(*bt));
 209        bt->table = rte_calloc_socket("B-tree table",
 210                                      n, sizeof(struct mlx4_mr_cache),
 211                                      0, socket);
 212        if (bt->table == NULL) {
 213                rte_errno = ENOMEM;
 214                ERROR("failed to allocate memory for btree cache on socket %d",
 215                      socket);
 216                return -rte_errno;
 217        }
 218        bt->size = n;
 219        /* First entry must be NULL for binary search. */
 220        (*bt->table)[bt->len++] = (struct mlx4_mr_cache) {
 221                .lkey = UINT32_MAX,
 222        };
 223        DEBUG("initialized B-tree %p with table %p",
 224              (void *)bt, (void *)bt->table);
 225        return 0;
 226}
 227
 228/**
 229 * Free B-tree resources.
 230 *
 231 * @param bt
 232 *   Pointer to B-tree structure.
 233 */
 234void
 235mlx4_mr_btree_free(struct mlx4_mr_btree *bt)
 236{
 237        if (bt == NULL)
 238                return;
 239        DEBUG("freeing B-tree %p with table %p", (void *)bt, (void *)bt->table);
 240        rte_free(bt->table);
 241        memset(bt, 0, sizeof(*bt));
 242}
 243
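/*
 * Illustrative sketch (not part of the upstream driver): typical lifecycle of
 * one of these lookup tables, using a hypothetical size of 256 entries.
 *
 *	struct mlx4_mr_btree bt;
 *
 *	if (mlx4_mr_btree_init(&bt, 256, SOCKET_ID_ANY) < 0)
 *		return -rte_errno; // allocation failed, rte_errno is set
 *	// ... mr_btree_insert()/mr_btree_lookup() as addresses are resolved ...
 *	mlx4_mr_btree_free(&bt);
 *
 * The sentinel entry written by mlx4_mr_btree_init() is what lets
 * mr_btree_lookup() run without extra bound checks on an otherwise empty
 * table.
 */
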
 244#ifdef RTE_LIBRTE_MLX4_DEBUG
 245/**
  246 * Dump all the entries in a B-tree.
 247 *
 248 * @param bt
 249 *   Pointer to B-tree structure.
 250 */
 251void
 252mlx4_mr_btree_dump(struct mlx4_mr_btree *bt)
 253{
 254        int idx;
 255        struct mlx4_mr_cache *lkp_tbl;
 256
 257        if (bt == NULL)
 258                return;
 259        lkp_tbl = *bt->table;
 260        for (idx = 0; idx < bt->len; ++idx) {
 261                struct mlx4_mr_cache *entry = &lkp_tbl[idx];
 262
 263                DEBUG("B-tree(%p)[%u],"
 264                      " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
 265                      (void *)bt, idx, entry->start, entry->end, entry->lkey);
 266        }
 267}
 268#endif
 269
 270/**
 271 * Find virtually contiguous memory chunk in a given MR.
 272 *
  273 * @param mr
 274 *   Pointer to MR structure.
 275 * @param[out] entry
 276 *   Pointer to returning MR cache entry. If not found, this will not be
 277 *   updated.
 278 * @param start_idx
 279 *   Start index of the memseg bitmap.
 280 *
 281 * @return
 282 *   Next index to go on lookup.
 283 */
 284static int
 285mr_find_next_chunk(struct mlx4_mr *mr, struct mlx4_mr_cache *entry,
 286                   int base_idx)
 287{
 288        uintptr_t start = 0;
 289        uintptr_t end = 0;
 290        uint32_t idx = 0;
 291
 292        /* MR for external memory doesn't have memseg list. */
 293        if (mr->msl == NULL) {
 294                struct ibv_mr *ibv_mr = mr->ibv_mr;
 295
 296                MLX4_ASSERT(mr->ms_bmp_n == 1);
 297                MLX4_ASSERT(mr->ms_n == 1);
 298                MLX4_ASSERT(base_idx == 0);
 299                /*
 300                 * Can't search it from memseg list but get it directly from
 301                 * verbs MR as there's only one chunk.
 302                 */
 303                entry->start = (uintptr_t)ibv_mr->addr;
 304                entry->end = (uintptr_t)ibv_mr->addr + mr->ibv_mr->length;
 305                entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey);
 306                /* Returning 1 ends iteration. */
 307                return 1;
 308        }
 309        for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) {
 310                if (rte_bitmap_get(mr->ms_bmp, idx)) {
 311                        const struct rte_memseg_list *msl;
 312                        const struct rte_memseg *ms;
 313
 314                        msl = mr->msl;
 315                        ms = rte_fbarray_get(&msl->memseg_arr,
 316                                             mr->ms_base_idx + idx);
 317                        MLX4_ASSERT(msl->page_sz == ms->hugepage_sz);
 318                        if (!start)
 319                                start = ms->addr_64;
 320                        end = ms->addr_64 + ms->hugepage_sz;
 321                } else if (start) {
 322                        /* Passed the end of a fragment. */
 323                        break;
 324                }
 325        }
 326        if (start) {
 327                /* Found one chunk. */
 328                entry->start = start;
 329                entry->end = end;
 330                entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey);
 331        }
 332        return idx;
 333}
 334
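/*
 * Illustrative sketch (not part of the upstream driver): callers below walk
 * all the contiguous chunks of an MR by feeding the returned index back in:
 *
 *	unsigned int n;
 *
 *	for (n = 0; n < mr->ms_bmp_n; ) {
 *		struct mlx4_mr_cache chunk;
 *
 *		memset(&chunk, 0, sizeof(chunk));
 *		n = mr_find_next_chunk(mr, &chunk, n);
 *		if (!chunk.end)
 *			break; // no more set bits past the previous chunk
 *		// chunk.start/chunk.end now describe one contiguous run
 *	}
 *
 * For example, with ms_bmp_n == 6 and bits {0, 1, 4, 5} set, this yields two
 * chunks covering memsegs 0-1 and 4-5; the cleared bits in between belong to
 * memsegs freed since the MR was created.
 */
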
 335/**
  336 * Insert an MR into the global B-tree cache. Insertion may fail when memory
  337 * is low; in that case the entry will have to be found again by
  338 * mr_lookup_dev_list() in mlx4_mr_create() on a cache miss.
 339 *
 340 * @param dev
 341 *   Pointer to Ethernet device.
 342 * @param mr
 343 *   Pointer to MR to insert.
 344 *
 345 * @return
 346 *   0 on success, -1 on failure.
 347 */
 348static int
 349mr_insert_dev_cache(struct rte_eth_dev *dev, struct mlx4_mr *mr)
 350{
 351        struct mlx4_priv *priv = dev->data->dev_private;
 352        unsigned int n;
 353
 354        DEBUG("port %u inserting MR(%p) to global cache",
 355              dev->data->port_id, (void *)mr);
 356        for (n = 0; n < mr->ms_bmp_n; ) {
 357                struct mlx4_mr_cache entry;
 358
 359                memset(&entry, 0, sizeof(entry));
 360                /* Find a contiguous chunk and advance the index. */
 361                n = mr_find_next_chunk(mr, &entry, n);
 362                if (!entry.end)
 363                        break;
 364                if (mr_btree_insert(&priv->mr.cache, &entry) < 0) {
 365                        /*
  366                         * Overflowed, but the global table cannot be
  367                         * expanded here as doing so could deadlock.
 368                         */
 369                        return -1;
 370                }
 371        }
 372        return 0;
 373}
 374
 375/**
 376 * Look up address in the original global MR list.
 377 *
 378 * @param dev
 379 *   Pointer to Ethernet device.
 380 * @param[out] entry
 381 *   Pointer to returning MR cache entry. If no match, this will not be updated.
 382 * @param addr
 383 *   Search key.
 384 *
 385 * @return
 386 *   Found MR on match, NULL otherwise.
 387 */
 388static struct mlx4_mr *
 389mr_lookup_dev_list(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
 390                   uintptr_t addr)
 391{
 392        struct mlx4_priv *priv = dev->data->dev_private;
 393        struct mlx4_mr *mr;
 394
 395        /* Iterate all the existing MRs. */
 396        LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
 397                unsigned int n;
 398
 399                if (mr->ms_n == 0)
 400                        continue;
 401                for (n = 0; n < mr->ms_bmp_n; ) {
 402                        struct mlx4_mr_cache ret;
 403
 404                        memset(&ret, 0, sizeof(ret));
 405                        n = mr_find_next_chunk(mr, &ret, n);
 406                        if (addr >= ret.start && addr < ret.end) {
 407                                /* Found. */
 408                                *entry = ret;
 409                                return mr;
 410                        }
 411                }
 412        }
 413        return NULL;
 414}
 415
 416/**
 417 * Look up address on device.
 418 *
 419 * @param dev
 420 *   Pointer to Ethernet device.
 421 * @param[out] entry
 422 *   Pointer to returning MR cache entry. If no match, this will not be updated.
 423 * @param addr
 424 *   Search key.
 425 *
 426 * @return
 427 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 428 */
 429static uint32_t
 430mr_lookup_dev(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
 431              uintptr_t addr)
 432{
 433        struct mlx4_priv *priv = dev->data->dev_private;
 434        uint16_t idx;
 435        uint32_t lkey = UINT32_MAX;
 436        struct mlx4_mr *mr;
 437
 438        /*
  439         * If the global cache has overflowed because the B-tree table could
  440         * not be expanded, it doesn't contain all existing MRs. The address
  441         * then has to be found by traversing the original MR list instead,
  442         * which is a very slow path. Otherwise, the global cache is all-inclusive.
 443         */
 444        if (!unlikely(priv->mr.cache.overflow)) {
 445                lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
 446                if (lkey != UINT32_MAX)
 447                        *entry = (*priv->mr.cache.table)[idx];
 448        } else {
 449                /* Falling back to the slowest path. */
 450                mr = mr_lookup_dev_list(dev, entry, addr);
 451                if (mr != NULL)
 452                        lkey = entry->lkey;
 453        }
 454        MLX4_ASSERT(lkey == UINT32_MAX || (addr >= entry->start &&
 455                                           addr < entry->end));
 456        return lkey;
 457}
 458
 459/**
  460 * Free MR resources. The MR lock must not be held; rte_free() can raise a
  461 * memory free event and the callback function would then spin on the lock.
 462 *
 463 * @param mr
 464 *   Pointer to MR to free.
 465 */
 466static void
 467mr_free(struct mlx4_mr *mr)
 468{
 469        if (mr == NULL)
 470                return;
 471        DEBUG("freeing MR(%p):", (void *)mr);
 472        if (mr->ibv_mr != NULL)
 473                claim_zero(mlx4_glue->dereg_mr(mr->ibv_mr));
 474        if (mr->ms_bmp != NULL)
 475                rte_bitmap_free(mr->ms_bmp);
 476        rte_free(mr);
 477}
 478
 479/**
  480 * Release resources of detached MRs that have no online entry.
 481 *
 482 * @param dev
 483 *   Pointer to Ethernet device.
 484 */
 485static void
 486mlx4_mr_garbage_collect(struct rte_eth_dev *dev)
 487{
 488        struct mlx4_priv *priv = dev->data->dev_private;
 489        struct mlx4_mr *mr_next;
 490        struct mlx4_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
 491
 492        /* Must be called from the primary process. */
 493        MLX4_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
 494        /*
  495         * MRs can't be freed while holding the lock because rte_free() could
  496         * invoke the memory free callback, which would result in a deadlock.
 497         */
 498        rte_rwlock_write_lock(&priv->mr.rwlock);
 499        /* Detach the whole free list and release it after unlocking. */
 500        free_list = priv->mr.mr_free_list;
 501        LIST_INIT(&priv->mr.mr_free_list);
 502        rte_rwlock_write_unlock(&priv->mr.rwlock);
 503        /* Release resources. */
 504        mr_next = LIST_FIRST(&free_list);
 505        while (mr_next != NULL) {
 506                struct mlx4_mr *mr = mr_next;
 507
 508                mr_next = LIST_NEXT(mr, mr);
 509                mr_free(mr);
 510        }
 511}
 512
 513/* Called during rte_memseg_contig_walk() by mlx4_mr_create(). */
 514static int
 515mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl,
 516                          const struct rte_memseg *ms, size_t len, void *arg)
 517{
 518        struct mr_find_contig_memsegs_data *data = arg;
 519
 520        if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len)
 521                return 0;
 522        /* Found, save it and stop walking. */
 523        data->start = ms->addr_64;
 524        data->end = ms->addr_64 + len;
 525        data->msl = msl;
 526        return 1;
 527}
 528
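/*
 * Illustrative sketch (not part of the upstream driver): the callback above
 * is driven by rte_memseg_contig_walk(), which keeps walking while the
 * callback returns 0 and stops on the first non-zero return, e.g.:
 *
 *	struct mr_find_contig_memsegs_data data = { .addr = addr };
 *
 *	if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) {
 *		// addr is not part of any contiguous chunk of memsegs
 *	} else {
 *		// data.start/data.end/data.msl describe the enclosing chunk
 *	}
 *
 * mlx4_mr_create_primary() below uses this pattern, both for the initial
 * search and for revalidation under the memory hotplug lock.
 */
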
 529/**
  530 * Create a new global Memory Region (MR) for a missing virtual address.
  531 * This function must be called from a secondary process; it sends a request
  532 * to the primary process, which creates an MR for the address. Since the
  533 * global MR list resides in shared memory, the subsequent LKey lookup should
  534 * succeed unless the request fails.
 535 *
 536 * @param dev
 537 *   Pointer to Ethernet device.
 538 * @param[out] entry
 539 *   Pointer to returning MR cache entry, found in the global cache or newly
 540 *   created. If failed to create one, this will not be updated.
 541 * @param addr
 542 *   Target virtual address to register.
 543 *
 544 * @return
 545 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 546 */
 547static uint32_t
 548mlx4_mr_create_secondary(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
 549                         uintptr_t addr)
 550{
 551        struct mlx4_priv *priv = dev->data->dev_private;
 552        int ret;
 553
 554        DEBUG("port %u requesting MR creation for address (%p)",
 555              dev->data->port_id, (void *)addr);
 556        ret = mlx4_mp_req_mr_create(dev, addr);
 557        if (ret) {
 558                DEBUG("port %u fail to request MR creation for address (%p)",
 559                      dev->data->port_id, (void *)addr);
 560                return UINT32_MAX;
 561        }
 562        rte_rwlock_read_lock(&priv->mr.rwlock);
 563        /* Fill in output data. */
 564        mr_lookup_dev(dev, entry, addr);
 565        /* Lookup can't fail. */
 566        MLX4_ASSERT(entry->lkey != UINT32_MAX);
 567        rte_rwlock_read_unlock(&priv->mr.rwlock);
 568        DEBUG("port %u MR CREATED by primary process for %p:\n"
 569              "  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
 570              dev->data->port_id, (void *)addr,
 571              entry->start, entry->end, entry->lkey);
 572        return entry->lkey;
 573}
 574
 575/**
 576 * Create a new global Memory Region (MR) for a missing virtual address.
 577 * Register entire virtually contiguous memory chunk around the address.
 578 * This must be called from the primary process.
 579 *
 580 * @param dev
 581 *   Pointer to Ethernet device.
 582 * @param[out] entry
 583 *   Pointer to returning MR cache entry, found in the global cache or newly
 584 *   created. If failed to create one, this will not be updated.
 585 * @param addr
 586 *   Target virtual address to register.
 587 *
 588 * @return
 589 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 590 */
 591uint32_t
 592mlx4_mr_create_primary(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
 593                       uintptr_t addr)
 594{
 595        struct mlx4_priv *priv = dev->data->dev_private;
 596        const struct rte_memseg_list *msl;
 597        const struct rte_memseg *ms;
 598        struct mlx4_mr *mr = NULL;
 599        size_t len;
 600        uint32_t ms_n;
 601        uint32_t bmp_size;
 602        void *bmp_mem;
 603        int ms_idx_shift = -1;
 604        unsigned int n;
 605        struct mr_find_contig_memsegs_data data = {
 606                .addr = addr,
 607        };
 608        struct mr_find_contig_memsegs_data data_re;
 609
 610        DEBUG("port %u creating a MR using address (%p)",
 611              dev->data->port_id, (void *)addr);
 612        /*
  613         * Release detached MRs if any. This can't be done while holding either
  614         * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
  615         * been detached by the memory free event but could not be released
  616         * inside the callback due to the risk of deadlock. As a result,
  617         * releasing resources here is opportunistic.
 618         */
 619        mlx4_mr_garbage_collect(dev);
 620        /*
 621         * If enabled, find out a contiguous virtual address chunk in use, to
 622         * which the given address belongs, in order to register maximum range.
 623         * In the best case where mempools are not dynamically recreated and
 624         * '--socket-mem' is specified as an EAL option, it is very likely to
  625         * have only one MR (LKey) per socket and per hugepage size even
  626         * though the system memory is highly fragmented. As the whole memory
  627         * chunk will be pinned by the kernel, it can't be reused unless the
  628         * entire chunk is freed from EAL.
  629         *
  630         * If disabled, just register one memseg (page). Memory consumption
  631         * will then be minimized, but performance may drop if there are many
  632         * MRs to look up on the datapath.
 633         */
 634        if (!priv->mr_ext_memseg_en) {
 635                data.msl = rte_mem_virt2memseg_list((void *)addr);
 636                data.start = RTE_ALIGN_FLOOR(addr, data.msl->page_sz);
 637                data.end = data.start + data.msl->page_sz;
 638        } else if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) {
 639                WARN("port %u unable to find virtually contiguous"
 640                     " chunk for address (%p)."
 641                     " rte_memseg_contig_walk() failed.",
 642                     dev->data->port_id, (void *)addr);
 643                rte_errno = ENXIO;
 644                goto err_nolock;
 645        }
 646alloc_resources:
 647        /* Addresses must be page-aligned. */
 648        MLX4_ASSERT(rte_is_aligned((void *)data.start, data.msl->page_sz));
 649        MLX4_ASSERT(rte_is_aligned((void *)data.end, data.msl->page_sz));
 650        msl = data.msl;
 651        ms = rte_mem_virt2memseg((void *)data.start, msl);
 652        len = data.end - data.start;
 653        MLX4_ASSERT(msl->page_sz == ms->hugepage_sz);
 654        /* Number of memsegs in the range. */
 655        ms_n = len / msl->page_sz;
 656        DEBUG("port %u extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
 657              " page_sz=0x%" PRIx64 ", ms_n=%u",
 658              dev->data->port_id, (void *)addr,
 659              data.start, data.end, msl->page_sz, ms_n);
 660        /* Size of memory for bitmap. */
 661        bmp_size = rte_bitmap_get_memory_footprint(ms_n);
 662        mr = rte_zmalloc_socket(NULL,
 663                                RTE_ALIGN_CEIL(sizeof(*mr),
 664                                               RTE_CACHE_LINE_SIZE) +
 665                                bmp_size,
 666                                RTE_CACHE_LINE_SIZE, msl->socket_id);
 667        if (mr == NULL) {
 668                WARN("port %u unable to allocate memory for a new MR of"
 669                     " address (%p).",
 670                     dev->data->port_id, (void *)addr);
 671                rte_errno = ENOMEM;
 672                goto err_nolock;
 673        }
 674        mr->msl = msl;
 675        /*
 676         * Save the index of the first memseg and initialize memseg bitmap. To
 677         * see if a memseg of ms_idx in the memseg-list is still valid, check:
 678         *      rte_bitmap_get(mr->bmp, ms_idx - mr->ms_base_idx)
 679         */
 680        mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
 681        bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE);
 682        mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size);
 683        if (mr->ms_bmp == NULL) {
 684                WARN("port %u unable to initialize bitmap for a new MR of"
 685                     " address (%p).",
 686                     dev->data->port_id, (void *)addr);
 687                rte_errno = EINVAL;
 688                goto err_nolock;
 689        }
 690        /*
  691         * Recheck whether the extended contiguous chunk is still valid.
  692         * Because memory_hotplug_lock can't be held across any memory-related
  693         * calls in a critical path, the resource allocation above couldn't be
  694         * done under the lock. If the memory layout has changed by now, retry
  695         * with just a single page. If not, go on with the big chunk atomically
  696         * from here.
 697         */
 698        rte_mcfg_mem_read_lock();
 699        data_re = data;
 700        if (len > msl->page_sz &&
 701            !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) {
 702                WARN("port %u unable to find virtually contiguous"
 703                     " chunk for address (%p)."
 704                     " rte_memseg_contig_walk() failed.",
 705                     dev->data->port_id, (void *)addr);
 706                rte_errno = ENXIO;
 707                goto err_memlock;
 708        }
 709        if (data.start != data_re.start || data.end != data_re.end) {
 710                /*
 711                 * The extended contiguous chunk has been changed. Try again
 712                 * with single memseg instead.
 713                 */
 714                data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz);
 715                data.end = data.start + msl->page_sz;
 716                rte_mcfg_mem_read_unlock();
 717                mr_free(mr);
 718                goto alloc_resources;
 719        }
 720        MLX4_ASSERT(data.msl == data_re.msl);
 721        rte_rwlock_write_lock(&priv->mr.rwlock);
 722        /*
  723         * Check that the address is really missing. If another thread already
  724         * created one, or it is simply not found due to overflow, abort and return.
 725         */
 726        if (mr_lookup_dev(dev, entry, addr) != UINT32_MAX) {
 727                /*
  728                 * Insert into the global cache table. It may fail when memory
  729                 * is low; in that case this entry will have to be searched
  730                 * here again.
 731                 */
 732                mr_btree_insert(&priv->mr.cache, entry);
 733                DEBUG("port %u found MR for %p on final lookup, abort",
 734                      dev->data->port_id, (void *)addr);
 735                rte_rwlock_write_unlock(&priv->mr.rwlock);
 736                rte_mcfg_mem_read_unlock();
 737                /*
 738                 * Must be unlocked before calling rte_free() because
 739                 * mlx4_mr_mem_event_free_cb() can be called inside.
 740                 */
 741                mr_free(mr);
 742                return entry->lkey;
 743        }
 744        /*
 745         * Trim start and end addresses for verbs MR. Set bits for registering
 746         * memsegs but exclude already registered ones. Bitmap can be
 747         * fragmented.
 748         */
 749        for (n = 0; n < ms_n; ++n) {
 750                uintptr_t start;
 751                struct mlx4_mr_cache ret;
 752
 753                memset(&ret, 0, sizeof(ret));
 754                start = data_re.start + n * msl->page_sz;
 755                /* Exclude memsegs already registered by other MRs. */
 756                if (mr_lookup_dev(dev, &ret, start) == UINT32_MAX) {
 757                        /*
 758                         * Start from the first unregistered memseg in the
 759                         * extended range.
 760                         */
 761                        if (ms_idx_shift == -1) {
 762                                mr->ms_base_idx += n;
 763                                data.start = start;
 764                                ms_idx_shift = n;
 765                        }
 766                        data.end = start + msl->page_sz;
 767                        rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift);
 768                        ++mr->ms_n;
 769                }
 770        }
 771        len = data.end - data.start;
 772        mr->ms_bmp_n = len / msl->page_sz;
 773        MLX4_ASSERT(ms_idx_shift + mr->ms_bmp_n <= ms_n);
 774        /*
 775         * Finally create a verbs MR for the memory chunk. ibv_reg_mr() can be
  776         * called while holding the memory lock because it doesn't use
  777         * mlx4_alloc_buf_extern(), which would eventually call rte_malloc_socket()
 778         * through mlx4_alloc_verbs_buf().
 779         */
 780        mr->ibv_mr = mlx4_glue->reg_mr(priv->pd, (void *)data.start, len,
 781                                       IBV_ACCESS_LOCAL_WRITE);
 782        if (mr->ibv_mr == NULL) {
 783                WARN("port %u fail to create a verbs MR for address (%p)",
 784                     dev->data->port_id, (void *)addr);
 785                rte_errno = EINVAL;
 786                goto err_mrlock;
 787        }
 788        MLX4_ASSERT((uintptr_t)mr->ibv_mr->addr == data.start);
 789        MLX4_ASSERT(mr->ibv_mr->length == len);
 790        LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
 791        DEBUG("port %u MR CREATED (%p) for %p:\n"
 792              "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
 793              " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
 794              dev->data->port_id, (void *)mr, (void *)addr,
 795              data.start, data.end, rte_cpu_to_be_32(mr->ibv_mr->lkey),
 796              mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
 797        /* Insert to the global cache table. */
 798        mr_insert_dev_cache(dev, mr);
 799        /* Fill in output data. */
 800        mr_lookup_dev(dev, entry, addr);
 801        /* Lookup can't fail. */
 802        MLX4_ASSERT(entry->lkey != UINT32_MAX);
 803        rte_rwlock_write_unlock(&priv->mr.rwlock);
 804        rte_mcfg_mem_read_unlock();
 805        return entry->lkey;
 806err_mrlock:
 807        rte_rwlock_write_unlock(&priv->mr.rwlock);
 808err_memlock:
 809        rte_mcfg_mem_read_unlock();
 810err_nolock:
 811        /*
  812         * In case of error, since this can be called on a datapath, a warning
  813         * message per error is preferable to a hard failure. The locks must be
  814         * released before calling rte_free() because mlx4_mr_mem_event_free_cb()
  815         * can be called inside.
 816         */
 817        mr_free(mr);
 818        return UINT32_MAX;
 819}
 820
 821/**
 822 * Create a new global Memory Region (MR) for a missing virtual address.
  823 * This can be called from either the primary or a secondary process.
 824 *
 825 * @param dev
 826 *   Pointer to Ethernet device.
 827 * @param[out] entry
 828 *   Pointer to returning MR cache entry, found in the global cache or newly
 829 *   created. If failed to create one, this will not be updated.
 830 * @param addr
 831 *   Target virtual address to register.
 832 *
 833 * @return
 834 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 835 */
 836static uint32_t
 837mlx4_mr_create(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
 838               uintptr_t addr)
 839{
 840        uint32_t ret = 0;
 841
 842        switch (rte_eal_process_type()) {
 843        case RTE_PROC_PRIMARY:
 844                ret = mlx4_mr_create_primary(dev, entry, addr);
 845                break;
 846        case RTE_PROC_SECONDARY:
 847                ret = mlx4_mr_create_secondary(dev, entry, addr);
 848                break;
 849        default:
 850                break;
 851        }
 852        return ret;
 853}
 854
 855/**
 856 * Rebuild the global B-tree cache of device from the original MR list.
 857 *
 858 * @param dev
 859 *   Pointer to Ethernet device.
 860 */
 861static void
 862mr_rebuild_dev_cache(struct rte_eth_dev *dev)
 863{
 864        struct mlx4_priv *priv = dev->data->dev_private;
 865        struct mlx4_mr *mr;
 866
 867        DEBUG("port %u rebuild dev cache[]", dev->data->port_id);
 868        /* Flush cache to rebuild. */
 869        priv->mr.cache.len = 1;
 870        priv->mr.cache.overflow = 0;
 871        /* Iterate all the existing MRs. */
 872        LIST_FOREACH(mr, &priv->mr.mr_list, mr)
 873                if (mr_insert_dev_cache(dev, mr) < 0)
 874                        return;
 875}
 876
 877/**
  878 * Callback for memory free events. Iterate over the freed memsegs and check
  879 * whether each belongs to an existing MR. If so, clear the corresponding bit
  880 * in the MR's bitmap, leaving the MR fragmented. If the MR becomes empty, it
  881 * will be freed later by mlx4_mr_garbage_collect().
 882 *
 883 * The global cache must be rebuilt if there's any change and this event has to
 884 * be propagated to dataplane threads to flush the local caches.
 885 *
 886 * @param dev
 887 *   Pointer to Ethernet device.
 888 * @param addr
 889 *   Address of freed memory.
 890 * @param len
 891 *   Size of freed memory.
 892 */
 893static void
 894mlx4_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
 895{
 896        struct mlx4_priv *priv = dev->data->dev_private;
 897        const struct rte_memseg_list *msl;
 898        struct mlx4_mr *mr;
 899        int ms_n;
 900        int i;
 901        int rebuild = 0;
 902
 903        DEBUG("port %u free callback: addr=%p, len=%zu",
 904              dev->data->port_id, addr, len);
 905        msl = rte_mem_virt2memseg_list(addr);
 906        /* addr and len must be page-aligned. */
 907        MLX4_ASSERT((uintptr_t)addr ==
 908                    RTE_ALIGN((uintptr_t)addr, msl->page_sz));
 909        MLX4_ASSERT(len == RTE_ALIGN(len, msl->page_sz));
 910        ms_n = len / msl->page_sz;
 911        rte_rwlock_write_lock(&priv->mr.rwlock);
 912        /* Clear bits of freed memsegs from MR. */
 913        for (i = 0; i < ms_n; ++i) {
 914                const struct rte_memseg *ms;
 915                struct mlx4_mr_cache entry;
 916                uintptr_t start;
 917                int ms_idx;
 918                uint32_t pos;
 919
 920                /* Find MR having this memseg. */
 921                start = (uintptr_t)addr + i * msl->page_sz;
 922                mr = mr_lookup_dev_list(dev, &entry, start);
 923                if (mr == NULL)
 924                        continue;
 925                MLX4_ASSERT(mr->msl); /* Can't be external memory. */
 926                ms = rte_mem_virt2memseg((void *)start, msl);
 927                MLX4_ASSERT(ms != NULL);
 928                MLX4_ASSERT(msl->page_sz == ms->hugepage_sz);
 929                ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
 930                pos = ms_idx - mr->ms_base_idx;
 931                MLX4_ASSERT(rte_bitmap_get(mr->ms_bmp, pos));
 932                MLX4_ASSERT(pos < mr->ms_bmp_n);
 933                DEBUG("port %u MR(%p): clear bitmap[%u] for addr %p",
 934                      dev->data->port_id, (void *)mr, pos, (void *)start);
 935                rte_bitmap_clear(mr->ms_bmp, pos);
 936                if (--mr->ms_n == 0) {
 937                        LIST_REMOVE(mr, mr);
 938                        LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
 939                        DEBUG("port %u remove MR(%p) from list",
 940                              dev->data->port_id, (void *)mr);
 941                }
 942                /*
  943                 * The MR is fragmented or will be freed; the global cache must
  944                 * be rebuilt.
 945                 */
 946                rebuild = 1;
 947        }
 948        if (rebuild) {
 949                mr_rebuild_dev_cache(dev);
 950                /*
 951                 * No explicit wmb is needed after updating dev_gen due to
 952                 * store-release ordering in unlock that provides the
 953                 * implicit barrier at the software visible level.
 954                 */
 955                ++priv->mr.dev_gen;
 956                DEBUG("broadcasting local cache flush, gen=%d",
 957                      priv->mr.dev_gen);
 958        }
 959        rte_rwlock_write_unlock(&priv->mr.rwlock);
 960#ifdef RTE_LIBRTE_MLX4_DEBUG
 961        if (rebuild)
 962                mlx4_mr_dump_dev(dev);
 963#endif
 964}
 965
 966/**
 967 * Callback for memory event.
 968 *
 969 * @param event_type
 970 *   Memory event type.
 971 * @param addr
 972 *   Address of memory.
 973 * @param len
 974 *   Size of memory.
 975 */
 976void
 977mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
 978                     size_t len, void *arg __rte_unused)
 979{
 980        struct mlx4_priv *priv;
 981        struct mlx4_dev_list *dev_list = &mlx4_shared_data->mem_event_cb_list;
 982
 983        /* Must be called from the primary process. */
 984        MLX4_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
 985        switch (event_type) {
 986        case RTE_MEM_EVENT_FREE:
 987                rte_rwlock_read_lock(&mlx4_shared_data->mem_event_rwlock);
 988                /* Iterate all the existing mlx4 devices. */
 989                LIST_FOREACH(priv, dev_list, mem_event_cb)
 990                        mlx4_mr_mem_event_free_cb(ETH_DEV(priv), addr, len);
 991                rte_rwlock_read_unlock(&mlx4_shared_data->mem_event_rwlock);
 992                break;
 993        case RTE_MEM_EVENT_ALLOC:
 994        default:
 995                break;
 996        }
 997}
 998
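/*
 * Illustrative sketch (not part of the upstream driver): this callback only
 * takes effect once it has been registered with the EAL memory subsystem,
 * which the primary process is expected to do at device probe time, e.g.:
 *
 *	rte_mem_event_callback_register("MLX4_MEM_EVENT_CB",
 *					mlx4_mr_mem_event_cb, NULL);
 *
 * The callback name string above is only illustrative; devices additionally
 * have to be linked into mlx4_shared_data->mem_event_cb_list so that the
 * LIST_FOREACH() above can reach them.
 */
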
 999/**
1000 * Look up address in the global MR cache table. If not found, create a new MR.
 1001 * Insert the found/created entry into the local bottom-half cache table.
1002 *
1003 * @param dev
1004 *   Pointer to Ethernet device.
1005 * @param mr_ctrl
1006 *   Pointer to per-queue MR control structure.
1007 * @param[out] entry
1008 *   Pointer to returning MR cache entry, found in the global cache or newly
1009 *   created. If failed to create one, this is not written.
1010 * @param addr
1011 *   Search key.
1012 *
1013 * @return
1014 *   Searched LKey on success, UINT32_MAX on no match.
1015 */
1016static uint32_t
1017mlx4_mr_lookup_dev(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
1018                   struct mlx4_mr_cache *entry, uintptr_t addr)
1019{
1020        struct mlx4_priv *priv = dev->data->dev_private;
1021        struct mlx4_mr_btree *bt = &mr_ctrl->cache_bh;
1022        uint16_t idx;
1023        uint32_t lkey;
1024
1025        /* If local cache table is full, try to double it. */
1026        if (unlikely(bt->len == bt->size))
1027                mr_btree_expand(bt, bt->size << 1);
1028        /* Look up in the global cache. */
1029        rte_rwlock_read_lock(&priv->mr.rwlock);
1030        lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
1031        if (lkey != UINT32_MAX) {
1032                /* Found. */
1033                *entry = (*priv->mr.cache.table)[idx];
1034                rte_rwlock_read_unlock(&priv->mr.rwlock);
1035                /*
1036                 * Update local cache. Even if it fails, return the found entry
1037                 * to update top-half cache. Next time, this entry will be found
1038                 * in the global cache.
1039                 */
1040                mr_btree_insert(bt, entry);
1041                return lkey;
1042        }
1043        rte_rwlock_read_unlock(&priv->mr.rwlock);
1044        /* First time to see the address? Create a new MR. */
1045        lkey = mlx4_mr_create(dev, entry, addr);
1046        /*
 1047         * Update the local cache if a new global MR was successfully created.
 1048         * If creation failed, there's no action to take in this datapath code:
 1049         * the returned LKey is invalid and will eventually make the HW fail
 1050         * anyway.
1051         */
1052        if (lkey != UINT32_MAX)
1053                mr_btree_insert(bt, entry);
1054        return lkey;
1055}
1056
1057/**
 1058 * Bottom-half of LKey search on the datapath. First search in cache_bh[]; on
 1059 * a miss, search the global MR cache table and propagate the new entry to the
 1060 * per-queue local caches.
1061 *
1062 * @param dev
1063 *   Pointer to Ethernet device.
1064 * @param mr_ctrl
1065 *   Pointer to per-queue MR control structure.
1066 * @param addr
1067 *   Search key.
1068 *
1069 * @return
1070 *   Searched LKey on success, UINT32_MAX on no match.
1071 */
1072static uint32_t
1073mlx4_mr_addr2mr_bh(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
1074                   uintptr_t addr)
1075{
1076        uint32_t lkey;
1077        uint16_t bh_idx = 0;
1078        /* Victim in top-half cache to replace with new entry. */
1079        struct mlx4_mr_cache *repl = &mr_ctrl->cache[mr_ctrl->head];
1080
1081        /* Binary-search MR translation table. */
1082        lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
1083        /* Update top-half cache. */
1084        if (likely(lkey != UINT32_MAX)) {
1085                *repl = (*mr_ctrl->cache_bh.table)[bh_idx];
1086        } else {
1087                /*
1088                 * If missed in local lookup table, search in the global cache
1089                 * and local cache_bh[] will be updated inside if possible.
1090                 * Top-half cache entry will also be updated.
1091                 */
1092                lkey = mlx4_mr_lookup_dev(dev, mr_ctrl, repl, addr);
1093                if (unlikely(lkey == UINT32_MAX))
1094                        return UINT32_MAX;
1095        }
1096        /* Update the most recently used entry. */
1097        mr_ctrl->mru = mr_ctrl->head;
1098        /* Point to the next victim, the oldest. */
1099        mr_ctrl->head = (mr_ctrl->head + 1) % MLX4_MR_CACHE_N;
1100        return lkey;
1101}
1102
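/*
 * Illustrative sketch (not part of this file): the datapath top half lives
 * outside this file and conceptually performs a linear scan over
 * mr_ctrl->cache[], falling back to the bottom half below only on a miss.
 * A simplified version, ignoring the MRU index kept in mr_ctrl->mru, could
 * look like this:
 *
 *	static inline uint32_t
 *	example_rx_addr2mr(struct rxq *rxq, uintptr_t addr)
 *	{
 *		struct mlx4_mr_ctrl *mr_ctrl = &rxq->mr_ctrl;
 *		unsigned int i;
 *
 *		for (i = 0; i < MLX4_MR_CACHE_N; ++i) {
 *			struct mlx4_mr_cache *e = &mr_ctrl->cache[i];
 *
 *			if (addr >= e->start && addr < e->end)
 *				return e->lkey; // hit in the linear cache
 *		}
 *		return mlx4_rx_addr2mr_bh(rxq, addr); // miss, slow path
 *	}
 *
 * example_rx_addr2mr() is only a sketch; a complete implementation would also
 * start from the MRU slot for locality and compare mr_ctrl->cur_gen against
 * *mr_ctrl->dev_gen_ptr so that a stale local cache gets flushed (see
 * mlx4_mr_flush_local_cache() below).
 */
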
1103/**
1104 * Bottom-half of LKey search on Rx.
1105 *
1106 * @param rxq
1107 *   Pointer to Rx queue structure.
1108 * @param addr
1109 *   Search key.
1110 *
1111 * @return
1112 *   Searched LKey on success, UINT32_MAX on no match.
1113 */
1114uint32_t
1115mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr)
1116{
1117        struct mlx4_mr_ctrl *mr_ctrl = &rxq->mr_ctrl;
1118        struct mlx4_priv *priv = rxq->priv;
1119
1120        return mlx4_mr_addr2mr_bh(ETH_DEV(priv), mr_ctrl, addr);
1121}
1122
1123/**
1124 * Bottom-half of LKey search on Tx.
1125 *
1126 * @param txq
1127 *   Pointer to Tx queue structure.
1128 * @param addr
1129 *   Search key.
1130 *
1131 * @return
1132 *   Searched LKey on success, UINT32_MAX on no match.
1133 */
1134static uint32_t
1135mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr)
1136{
1137        struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
1138        struct mlx4_priv *priv = txq->priv;
1139
1140        return mlx4_mr_addr2mr_bh(ETH_DEV(priv), mr_ctrl, addr);
1141}
1142
1143/**
 1144 * Bottom-half of LKey search on Tx. If the address can't be found in the
 1145 * memseg list, register the mempool of the mbuf as externally allocated memory.
1146 *
1147 * @param txq
1148 *   Pointer to Tx queue structure.
1149 * @param mb
1150 *   Pointer to mbuf.
1151 *
1152 * @return
1153 *   Searched LKey on success, UINT32_MAX on no match.
1154 */
1155uint32_t
1156mlx4_tx_mb2mr_bh(struct txq *txq, struct rte_mbuf *mb)
1157{
1158        uintptr_t addr = (uintptr_t)mb->buf_addr;
1159        uint32_t lkey;
1160
1161        lkey = mlx4_tx_addr2mr_bh(txq, addr);
1162        if (lkey == UINT32_MAX && rte_errno == ENXIO) {
1163                /* Mempool may have externally allocated memory. */
1164                return mlx4_tx_update_ext_mp(txq, addr, mlx4_mb2mp(mb));
1165        }
1166        return lkey;
1167}
1168
1169/**
1170 * Flush all of the local cache entries.
1171 *
1172 * @param mr_ctrl
1173 *   Pointer to per-queue MR control structure.
1174 */
1175void
1176mlx4_mr_flush_local_cache(struct mlx4_mr_ctrl *mr_ctrl)
1177{
1178        /* Reset the most-recently-used index. */
1179        mr_ctrl->mru = 0;
1180        /* Reset the linear search array. */
1181        mr_ctrl->head = 0;
1182        memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache));
1183        /* Reset the B-tree table. */
1184        mr_ctrl->cache_bh.len = 1;
1185        mr_ctrl->cache_bh.overflow = 0;
1186        /* Update the generation number. */
1187        mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
1188        DEBUG("mr_ctrl(%p): flushed, cur_gen=%d",
1189              (void *)mr_ctrl, mr_ctrl->cur_gen);
1190}
1191
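/*
 * Illustrative sketch (not part of the upstream driver): a consumer of the
 * per-queue cache can detect that the control path invalidated MRs (dev_gen
 * is bumped in mlx4_mr_mem_event_free_cb() above) by comparing generation
 * numbers before trusting its local entries:
 *
 *	if (unlikely(mr_ctrl->cur_gen != *mr_ctrl->dev_gen_ptr))
 *		mlx4_mr_flush_local_cache(mr_ctrl);
 *
 * Where exactly such a check sits on the datapath is outside the scope of
 * this file.
 */
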
1192/**
1193 * Called during rte_mempool_mem_iter() by mlx4_mr_update_ext_mp().
1194 *
 1195 * The externally allocated chunk is registered and an MR is created for it.
 1196 * The MR object is added to the global list. If the memseg list of an MR
 1197 * object (mr->msl) is NULL, the MR object can be regarded as externally
 1198 * allocated memory.
 1199 *
 1200 * Once external memory is registered, it must remain static. If the memory is
 1201 * freed and the virtual address range gets different physical memory mapped
 1202 * again, the device may crash due to the stale translation entry. The PMD
 1203 * can't track free events of external memory for now.
1204 */
1205static void
1206mlx4_mr_update_ext_mp_cb(struct rte_mempool *mp, void *opaque,
1207                         struct rte_mempool_memhdr *memhdr,
1208                         unsigned mem_idx __rte_unused)
1209{
1210        struct mr_update_mp_data *data = opaque;
1211        struct rte_eth_dev *dev = data->dev;
1212        struct mlx4_priv *priv = dev->data->dev_private;
1213        struct mlx4_mr_ctrl *mr_ctrl = data->mr_ctrl;
1214        struct mlx4_mr *mr = NULL;
1215        uintptr_t addr = (uintptr_t)memhdr->addr;
1216        size_t len = memhdr->len;
1217        struct mlx4_mr_cache entry;
1218        uint32_t lkey;
1219
1220        MLX4_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
 1221        /* If already registered, return early. */
1222        rte_rwlock_read_lock(&priv->mr.rwlock);
1223        lkey = mr_lookup_dev(dev, &entry, addr);
1224        rte_rwlock_read_unlock(&priv->mr.rwlock);
1225        if (lkey != UINT32_MAX)
1226                return;
1227        mr = rte_zmalloc_socket(NULL,
1228                                RTE_ALIGN_CEIL(sizeof(*mr),
1229                                               RTE_CACHE_LINE_SIZE),
1230                                RTE_CACHE_LINE_SIZE, mp->socket_id);
1231        if (mr == NULL) {
1232                WARN("port %u unable to allocate memory for a new MR of"
1233                     " mempool (%s).",
1234                     dev->data->port_id, mp->name);
1235                data->ret = -1;
1236                return;
1237        }
1238        DEBUG("port %u register MR for chunk #%d of mempool (%s)",
1239              dev->data->port_id, mem_idx, mp->name);
1240        mr->ibv_mr = mlx4_glue->reg_mr(priv->pd, (void *)addr, len,
1241                                       IBV_ACCESS_LOCAL_WRITE);
1242        if (mr->ibv_mr == NULL) {
1243                WARN("port %u fail to create a verbs MR for address (%p)",
1244                     dev->data->port_id, (void *)addr);
1245                rte_free(mr);
1246                data->ret = -1;
1247                return;
1248        }
 1249        mr->msl = NULL; /* Mark it as external memory. */
1250        mr->ms_bmp = NULL;
1251        mr->ms_n = 1;
1252        mr->ms_bmp_n = 1;
1253        rte_rwlock_write_lock(&priv->mr.rwlock);
1254        LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
1255        DEBUG("port %u MR CREATED (%p) for external memory %p:\n"
1256              "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
1257              " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
1258              dev->data->port_id, (void *)mr, (void *)addr,
1259              addr, addr + len, rte_cpu_to_be_32(mr->ibv_mr->lkey),
1260              mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
1261        /* Insert to the global cache table. */
1262        mr_insert_dev_cache(dev, mr);
1263        rte_rwlock_write_unlock(&priv->mr.rwlock);
1264        /* Insert to the local cache table */
1265        mlx4_mr_addr2mr_bh(dev, mr_ctrl, addr);
1266}
1267
1268/**
 1269 * Register MRs for all memory chunks of a mempool that has externally
 1270 * allocated memory, and fill in the local cache.
1271 *
1272 * @param dev
1273 *   Pointer to Ethernet device.
1274 * @param mr_ctrl
1275 *   Pointer to per-queue MR control structure.
1276 * @param mp
1277 *   Pointer to registering Mempool.
1278 *
1279 * @return
1280 *   0 on success, -1 on failure.
1281 */
1282static uint32_t
1283mlx4_mr_update_ext_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
1284                      struct rte_mempool *mp)
1285{
1286        struct mr_update_mp_data data = {
1287                .dev = dev,
1288                .mr_ctrl = mr_ctrl,
1289                .ret = 0,
1290        };
1291
1292        rte_mempool_mem_iter(mp, mlx4_mr_update_ext_mp_cb, &data);
1293        return data.ret;
1294}
1295
1296/**
 1297 * Register MRs for all memory chunks of a mempool that has externally
 1298 * allocated memory, then look up and return the LKey of the given address.
 1299 *
 1300 * @param txq
 1301 *   Pointer to Tx queue structure.
1302 * @param addr
1303 *   Search key.
1304 * @param mp
1305 *   Pointer to registering Mempool where addr belongs.
1306 *
1307 * @return
1308 *   LKey for address on success, UINT32_MAX on failure.
1309 */
1310uint32_t
1311mlx4_tx_update_ext_mp(struct txq *txq, uintptr_t addr, struct rte_mempool *mp)
1312{
1313        struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
1314        struct mlx4_priv *priv = txq->priv;
1315
1316        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
1317                WARN("port %u using address (%p) from unregistered mempool"
1318                     " having externally allocated memory"
1319                     " in secondary process, please create mempool"
1320                     " prior to rte_eth_dev_start()",
1321                     PORT_ID(priv), (void *)addr);
1322                return UINT32_MAX;
1323        }
1324        mlx4_mr_update_ext_mp(ETH_DEV(priv), mr_ctrl, mp);
1325        return mlx4_tx_addr2mr_bh(txq, addr);
1326}
1327
1328/* Called during rte_mempool_mem_iter() by mlx4_mr_update_mp(). */
1329static void
1330mlx4_mr_update_mp_cb(struct rte_mempool *mp __rte_unused, void *opaque,
1331                     struct rte_mempool_memhdr *memhdr,
1332                     unsigned mem_idx __rte_unused)
1333{
1334        struct mr_update_mp_data *data = opaque;
1335        uint32_t lkey;
1336
1337        /* Stop iteration if failed in the previous walk. */
1338        if (data->ret < 0)
1339                return;
1340        /* Register address of the chunk and update local caches. */
1341        lkey = mlx4_mr_addr2mr_bh(data->dev, data->mr_ctrl,
1342                                  (uintptr_t)memhdr->addr);
1343        if (lkey == UINT32_MAX)
1344                data->ret = -1;
1345}
1346
1347/**
 1348 * Register all memory chunks of a mempool.
1349 *
1350 * @param dev
1351 *   Pointer to Ethernet device.
1352 * @param mr_ctrl
1353 *   Pointer to per-queue MR control structure.
1354 * @param mp
1355 *   Pointer to registering Mempool.
1356 *
1357 * @return
1358 *   0 on success, -1 on failure.
1359 */
1360int
1361mlx4_mr_update_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
1362                  struct rte_mempool *mp)
1363{
1364        struct mr_update_mp_data data = {
1365                .dev = dev,
1366                .mr_ctrl = mr_ctrl,
1367                .ret = 0,
1368        };
1369
1370        rte_mempool_mem_iter(mp, mlx4_mr_update_mp_cb, &data);
1371        if (data.ret < 0 && rte_errno == ENXIO) {
1372                /* Mempool may have externally allocated memory. */
1373                return mlx4_mr_update_ext_mp(dev, mr_ctrl, mp);
1374        }
1375        return data.ret;
1376}
1377
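/*
 * Illustrative sketch (not part of the upstream driver): queue setup code is
 * expected to pre-populate the per-queue cache from its mempool so that the
 * first bursts do not hit the slow MR-creation path, e.g. for an Rx queue:
 *
 *	if (mlx4_mr_update_mp(dev, &rxq->mr_ctrl, mp) < 0)
 *		WARN("could not register mempool %s for the Rx queue", mp->name);
 *
 * The dev, rxq and mp variables stand for the Ethernet device, the Rx queue
 * being configured and its mempool; a failure here is not necessarily fatal
 * since mlx4_mr_addr2mr_bh() can still resolve addresses on demand.
 */
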
1378#ifdef RTE_LIBRTE_MLX4_DEBUG
1379/**
1380 * Dump all the created MRs and the global cache entries.
1381 *
1382 * @param dev
1383 *   Pointer to Ethernet device.
1384 */
1385void
1386mlx4_mr_dump_dev(struct rte_eth_dev *dev)
1387{
1388        struct mlx4_priv *priv = dev->data->dev_private;
1389        struct mlx4_mr *mr;
1390        int mr_n = 0;
1391        int chunk_n = 0;
1392
1393        rte_rwlock_read_lock(&priv->mr.rwlock);
1394        /* Iterate all the existing MRs. */
1395        LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
1396                unsigned int n;
1397
1398                DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
1399                      dev->data->port_id, mr_n++,
1400                      rte_cpu_to_be_32(mr->ibv_mr->lkey),
1401                      mr->ms_n, mr->ms_bmp_n);
1402                if (mr->ms_n == 0)
1403                        continue;
1404                for (n = 0; n < mr->ms_bmp_n; ) {
1405                        struct mlx4_mr_cache ret;
1406
1407                        memset(&ret, 0, sizeof(ret));
1408                        n = mr_find_next_chunk(mr, &ret, n);
1409                        if (!ret.end)
1410                                break;
1411                        DEBUG("  chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")",
1412                              chunk_n++, ret.start, ret.end);
1413                }
1414        }
1415        DEBUG("port %u dumping global cache", dev->data->port_id);
1416        mlx4_mr_btree_dump(&priv->mr.cache);
1417        rte_rwlock_read_unlock(&priv->mr.rwlock);
1418}
1419#endif
1420
1421/**
1422 * Release all the created MRs and resources. Remove device from memory callback
1423 * list.
1424 *
1425 * @param dev
1426 *   Pointer to Ethernet device.
1427 */
1428void
1429mlx4_mr_release(struct rte_eth_dev *dev)
1430{
1431        struct mlx4_priv *priv = dev->data->dev_private;
1432        struct mlx4_mr *mr_next;
1433
1434        /* Remove from memory callback device list. */
1435        rte_rwlock_write_lock(&mlx4_shared_data->mem_event_rwlock);
1436        LIST_REMOVE(priv, mem_event_cb);
1437        rte_rwlock_write_unlock(&mlx4_shared_data->mem_event_rwlock);
1438#ifdef RTE_LIBRTE_MLX4_DEBUG
1439        mlx4_mr_dump_dev(dev);
1440#endif
1441        rte_rwlock_write_lock(&priv->mr.rwlock);
1442        /* Detach from MR list and move to free list. */
1443        mr_next = LIST_FIRST(&priv->mr.mr_list);
1444        while (mr_next != NULL) {
1445                struct mlx4_mr *mr = mr_next;
1446
1447                mr_next = LIST_NEXT(mr, mr);
1448                LIST_REMOVE(mr, mr);
1449                LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
1450        }
1451        LIST_INIT(&priv->mr.mr_list);
1452        /* Free global cache. */
1453        mlx4_mr_btree_free(&priv->mr.cache);
1454        rte_rwlock_write_unlock(&priv->mr.rwlock);
1455        /* Free all remaining MRs. */
1456        mlx4_mr_garbage_collect(dev);
1457}
1458