dpdk/drivers/net/mlx4/mlx4_mr.c
   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright 2017 6WIND S.A.
   3 * Copyright 2017 Mellanox Technologies, Ltd
   4 */
   5
   6/**
   7 * @file
   8 * Memory management functions for mlx4 driver.
   9 */
  10
  11#include <errno.h>
  12#include <inttypes.h>
  13#include <stddef.h>
  14#include <stdint.h>
  15#include <string.h>
  16
  17/* Verbs headers do not support -pedantic. */
  18#ifdef PEDANTIC
  19#pragma GCC diagnostic ignored "-Wpedantic"
  20#endif
  21#include <infiniband/verbs.h>
  22#ifdef PEDANTIC
  23#pragma GCC diagnostic error "-Wpedantic"
  24#endif
  25
  26#include <rte_branch_prediction.h>
  27#include <rte_common.h>
  28#include <rte_eal_memconfig.h>
  29#include <rte_errno.h>
  30#include <rte_malloc.h>
  31#include <rte_memory.h>
  32#include <rte_mempool.h>
  33#include <rte_rwlock.h>
  34
  35#include "mlx4_glue.h"
  36#include "mlx4_mr.h"
  37#include "mlx4_rxtx.h"
  38#include "mlx4_utils.h"
  39
  40struct mr_find_contig_memsegs_data {
  41        uintptr_t addr;
  42        uintptr_t start;
  43        uintptr_t end;
  44        const struct rte_memseg_list *msl;
  45};
  46
  47struct mr_update_mp_data {
  48        struct rte_eth_dev *dev;
  49        struct mlx4_mr_ctrl *mr_ctrl;
  50        int ret;
  51};
  52
  53/**
   54 * Expand B-tree table to a given size. Must not be called while holding
   55 * memory_hotplug_lock or priv->mr.rwlock because rte_realloc() is used.
  56 *
  57 * @param bt
  58 *   Pointer to B-tree structure.
  59 * @param n
  60 *   Number of entries for expansion.
  61 *
  62 * @return
  63 *   0 on success, -1 on failure.
  64 */
  65static int
  66mr_btree_expand(struct mlx4_mr_btree *bt, int n)
  67{
  68        void *mem;
  69        int ret = 0;
  70
  71        if (n <= bt->size)
  72                return ret;
  73        /*
   74         * The downside of using rte_realloc() directly is that SOCKET_ID_ANY
   75         * may be used internally when there is no room to expand in place.
   76         * Since this is a rare case on a very slow path, it is acceptable.
   77         * cache_bh[] is initially given enough space, so once it has been
   78         * expanded, further expansion should rarely, if ever, be needed.
  79         */
  80        mem = rte_realloc(bt->table, n * sizeof(struct mlx4_mr_cache), 0);
  81        if (mem == NULL) {
  82                /* Not an error, B-tree search will be skipped. */
  83                WARN("failed to expand MR B-tree (%p) table", (void *)bt);
  84                ret = -1;
  85        } else {
  86                DEBUG("expanded MR B-tree table (size=%u)", n);
  87                bt->table = mem;
  88                bt->size = n;
  89        }
  90        return ret;
  91}
  92
  93/**
   94 * Look up the LKey in a given B-tree lookup table, store the last index and
   95 * return the LKey found.
  96 *
  97 * @param bt
  98 *   Pointer to B-tree structure.
  99 * @param[out] idx
  100 *   Pointer to index. Even on search failure, the index where the search
  101 *   stopped is returned so that it can be used when inserting a new entry.
 102 * @param addr
 103 *   Search key.
 104 *
 105 * @return
 106 *   Searched LKey on success, UINT32_MAX on no match.
 107 */
 108static uint32_t
 109mr_btree_lookup(struct mlx4_mr_btree *bt, uint16_t *idx, uintptr_t addr)
 110{
 111        struct mlx4_mr_cache *lkp_tbl;
 112        uint16_t n;
 113        uint16_t base = 0;
 114
 115        MLX4_ASSERT(bt != NULL);
 116        lkp_tbl = *bt->table;
 117        n = bt->len;
 118        /* First entry must be NULL for comparison. */
 119        MLX4_ASSERT(bt->len > 0 || (lkp_tbl[0].start == 0 &&
 120                                    lkp_tbl[0].lkey == UINT32_MAX));
 121        /* Binary search. */
 122        do {
 123                register uint16_t delta = n >> 1;
 124
 125                if (addr < lkp_tbl[base + delta].start) {
 126                        n = delta;
 127                } else {
 128                        base += delta;
 129                        n -= delta;
 130                }
 131        } while (n > 1);
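             /*
              * Entries are sorted by start address and entry 0 is a sentinel
              * ([0, 0) with lkey UINT32_MAX), so base now indexes the last
              * entry whose start is <= addr; the half-open range check below
              * decides whether this is an actual hit.
              */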
 132        MLX4_ASSERT(addr >= lkp_tbl[base].start);
 133        *idx = base;
 134        if (addr < lkp_tbl[base].end)
 135                return lkp_tbl[base].lkey;
 136        /* Not found. */
 137        return UINT32_MAX;
 138}
 139
 140/**
 141 * Insert an entry to B-tree lookup table.
 142 *
 143 * @param bt
 144 *   Pointer to B-tree structure.
 145 * @param entry
 146 *   Pointer to new entry to insert.
 147 *
 148 * @return
 149 *   0 on success, -1 on failure.
 150 */
 151static int
 152mr_btree_insert(struct mlx4_mr_btree *bt, struct mlx4_mr_cache *entry)
 153{
 154        struct mlx4_mr_cache *lkp_tbl;
 155        uint16_t idx = 0;
 156        size_t shift;
 157
 158        MLX4_ASSERT(bt != NULL);
 159        MLX4_ASSERT(bt->len <= bt->size);
 160        MLX4_ASSERT(bt->len > 0);
 161        lkp_tbl = *bt->table;
 162        /* Find out the slot for insertion. */
 163        if (mr_btree_lookup(bt, &idx, entry->start) != UINT32_MAX) {
  164                DEBUG("abort insertion to B-tree(%p): already exists at"
 165                      " idx=%u [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
 166                      (void *)bt, idx, entry->start, entry->end, entry->lkey);
  167                /* Already exists, return. */
 168                return 0;
 169        }
 170        /* If table is full, return error. */
 171        if (unlikely(bt->len == bt->size)) {
 172                bt->overflow = 1;
 173                return -1;
 174        }
 175        /* Insert entry. */
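             /*
              * mr_btree_lookup() returned the index of the last entry whose
              * start is <= entry->start, so the new entry goes right after it
              * and any following entries are shifted up one slot to keep the
              * table sorted.
              */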
 176        ++idx;
 177        shift = (bt->len - idx) * sizeof(struct mlx4_mr_cache);
 178        if (shift)
 179                memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift);
 180        lkp_tbl[idx] = *entry;
 181        bt->len++;
 182        DEBUG("inserted B-tree(%p)[%u],"
 183              " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
 184              (void *)bt, idx, entry->start, entry->end, entry->lkey);
 185        return 0;
 186}
 187
 188/**
 189 * Initialize B-tree and allocate memory for lookup table.
 190 *
 191 * @param bt
 192 *   Pointer to B-tree structure.
 193 * @param n
 194 *   Number of entries to allocate.
 195 * @param socket
 196 *   NUMA socket on which memory must be allocated.
 197 *
 198 * @return
 199 *   0 on success, a negative errno value otherwise and rte_errno is set.
 200 */
 201int
 202mlx4_mr_btree_init(struct mlx4_mr_btree *bt, int n, int socket)
 203{
 204        if (bt == NULL) {
 205                rte_errno = EINVAL;
 206                return -rte_errno;
 207        }
 208        memset(bt, 0, sizeof(*bt));
 209        bt->table = rte_calloc_socket("B-tree table",
 210                                      n, sizeof(struct mlx4_mr_cache),
 211                                      0, socket);
 212        if (bt->table == NULL) {
 213                rte_errno = ENOMEM;
 214                ERROR("failed to allocate memory for btree cache on socket %d",
 215                      socket);
 216                return -rte_errno;
 217        }
 218        bt->size = n;
 219        /* First entry must be NULL for binary search. */
 220        (*bt->table)[bt->len++] = (struct mlx4_mr_cache) {
 221                .lkey = UINT32_MAX,
 222        };
 223        DEBUG("initialized B-tree %p with table %p",
 224              (void *)bt, (void *)bt->table);
 225        return 0;
 226}
 227
 228/**
 229 * Free B-tree resources.
 230 *
 231 * @param bt
 232 *   Pointer to B-tree structure.
 233 */
 234void
 235mlx4_mr_btree_free(struct mlx4_mr_btree *bt)
 236{
 237        if (bt == NULL)
 238                return;
 239        DEBUG("freeing B-tree %p with table %p", (void *)bt, (void *)bt->table);
 240        rte_free(bt->table);
 241        memset(bt, 0, sizeof(*bt));
 242}
 243
 244#ifdef RTE_LIBRTE_MLX4_DEBUG
 245/**
  246 * Dump all the entries in a B-tree.
 247 *
 248 * @param bt
 249 *   Pointer to B-tree structure.
 250 */
 251void
 252mlx4_mr_btree_dump(struct mlx4_mr_btree *bt)
 253{
 254        int idx;
 255        struct mlx4_mr_cache *lkp_tbl;
 256
 257        if (bt == NULL)
 258                return;
 259        lkp_tbl = *bt->table;
 260        for (idx = 0; idx < bt->len; ++idx) {
 261                struct mlx4_mr_cache *entry = &lkp_tbl[idx];
 262
 263                DEBUG("B-tree(%p)[%u],"
 264                      " [0x%" PRIxPTR ", 0x%" PRIxPTR ") lkey=0x%x",
 265                      (void *)bt, idx, entry->start, entry->end, entry->lkey);
 266        }
 267}
 268#endif
 269
 270/**
  271 * Find a virtually contiguous memory chunk in a given MR.
  272 *
  273 * @param mr
  274 *   Pointer to MR structure.
  275 * @param[out] entry
  276 *   Pointer to returning MR cache entry. If not found, this will not be
  277 *   updated.
  278 * @param base_idx
  279 *   Start index of the memseg bitmap.
  280 *
  281 * @return
  282 *   Next index to continue the lookup from.
 283 */
 284static int
 285mr_find_next_chunk(struct mlx4_mr *mr, struct mlx4_mr_cache *entry,
 286                   int base_idx)
 287{
 288        uintptr_t start = 0;
 289        uintptr_t end = 0;
 290        uint32_t idx = 0;
 291
 292        /* MR for external memory doesn't have memseg list. */
 293        if (mr->msl == NULL) {
 294                struct ibv_mr *ibv_mr = mr->ibv_mr;
 295
 296                MLX4_ASSERT(mr->ms_bmp_n == 1);
 297                MLX4_ASSERT(mr->ms_n == 1);
 298                MLX4_ASSERT(base_idx == 0);
 299                /*
  300                 * It can't be looked up from the memseg list; take it
  301                 * directly from the verbs MR as there is only one chunk.
 302                 */
 303                entry->start = (uintptr_t)ibv_mr->addr;
 304                entry->end = (uintptr_t)ibv_mr->addr + mr->ibv_mr->length;
 305                entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey);
 306                /* Returning 1 ends iteration. */
 307                return 1;
 308        }
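             /*
              * Scan the memseg bitmap from base_idx: consecutive set bits are
              * merged into a single [start, end) chunk and the scan stops at
              * the first gap once a chunk has started.
              */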
 309        for (idx = base_idx; idx < mr->ms_bmp_n; ++idx) {
 310                if (rte_bitmap_get(mr->ms_bmp, idx)) {
 311                        const struct rte_memseg_list *msl;
 312                        const struct rte_memseg *ms;
 313
 314                        msl = mr->msl;
 315                        ms = rte_fbarray_get(&msl->memseg_arr,
 316                                             mr->ms_base_idx + idx);
 317                        MLX4_ASSERT(msl->page_sz == ms->hugepage_sz);
 318                        if (!start)
 319                                start = ms->addr_64;
 320                        end = ms->addr_64 + ms->hugepage_sz;
 321                } else if (start) {
 322                        /* Passed the end of a fragment. */
 323                        break;
 324                }
 325        }
 326        if (start) {
 327                /* Found one chunk. */
 328                entry->start = start;
 329                entry->end = end;
 330                entry->lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey);
 331        }
 332        return idx;
 333}
 334
 335/**
  336 * Insert an MR into the global B-tree cache. This may fail when memory is
  337 * low; in that case, the entry will have to be found by mr_lookup_dev_list()
  338 * in mlx4_mr_create() on a cache miss.
 339 *
 340 * @param dev
 341 *   Pointer to Ethernet device.
 342 * @param mr
 343 *   Pointer to MR to insert.
 344 *
 345 * @return
 346 *   0 on success, -1 on failure.
 347 */
 348static int
 349mr_insert_dev_cache(struct rte_eth_dev *dev, struct mlx4_mr *mr)
 350{
 351        struct mlx4_priv *priv = dev->data->dev_private;
 352        unsigned int n;
 353
 354        DEBUG("port %u inserting MR(%p) to global cache",
 355              dev->data->port_id, (void *)mr);
 356        for (n = 0; n < mr->ms_bmp_n; ) {
 357                struct mlx4_mr_cache entry;
 358
 359                memset(&entry, 0, sizeof(entry));
 360                /* Find a contiguous chunk and advance the index. */
 361                n = mr_find_next_chunk(mr, &entry, n);
 362                if (!entry.end)
 363                        break;
 364                if (mr_btree_insert(&priv->mr.cache, &entry) < 0) {
 365                        /*
  366                         * Overflowed, but the global table cannot be expanded
  367                         * while the MR lock is held (risk of deadlock).
 368                         */
 369                        return -1;
 370                }
 371        }
 372        return 0;
 373}
 374
 375/**
 376 * Look up address in the original global MR list.
 377 *
 378 * @param dev
 379 *   Pointer to Ethernet device.
 380 * @param[out] entry
 381 *   Pointer to returning MR cache entry. If no match, this will not be updated.
 382 * @param addr
 383 *   Search key.
 384 *
 385 * @return
 386 *   Found MR on match, NULL otherwise.
 387 */
 388static struct mlx4_mr *
 389mr_lookup_dev_list(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
 390                   uintptr_t addr)
 391{
 392        struct mlx4_priv *priv = dev->data->dev_private;
 393        struct mlx4_mr *mr;
 394
 395        /* Iterate all the existing MRs. */
 396        LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
 397                unsigned int n;
 398
 399                if (mr->ms_n == 0)
 400                        continue;
 401                for (n = 0; n < mr->ms_bmp_n; ) {
 402                        struct mlx4_mr_cache ret;
 403
 404                        memset(&ret, 0, sizeof(ret));
 405                        n = mr_find_next_chunk(mr, &ret, n);
 406                        if (addr >= ret.start && addr < ret.end) {
 407                                /* Found. */
 408                                *entry = ret;
 409                                return mr;
 410                        }
 411                }
 412        }
 413        return NULL;
 414}
 415
 416/**
 417 * Look up address on device.
 418 *
 419 * @param dev
 420 *   Pointer to Ethernet device.
 421 * @param[out] entry
 422 *   Pointer to returning MR cache entry. If no match, this will not be updated.
 423 * @param addr
 424 *   Search key.
 425 *
 426 * @return
 427 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 428 */
 429static uint32_t
 430mr_lookup_dev(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
 431              uintptr_t addr)
 432{
 433        struct mlx4_priv *priv = dev->data->dev_private;
 434        uint16_t idx;
 435        uint32_t lkey = UINT32_MAX;
 436        struct mlx4_mr *mr;
 437
 438        /*
  439         * If the global cache has overflowed because it failed to expand the
  440         * B-tree table, it may not contain all the existing MRs. In that case,
  441         * the address has to be found by traversing the original MR list
  442         * instead, which is a very slow path. Otherwise, the cache is complete.
 443         */
 444        if (!unlikely(priv->mr.cache.overflow)) {
 445                lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
 446                if (lkey != UINT32_MAX)
 447                        *entry = (*priv->mr.cache.table)[idx];
 448        } else {
 449                /* Falling back to the slowest path. */
 450                mr = mr_lookup_dev_list(dev, entry, addr);
 451                if (mr != NULL)
 452                        lkey = entry->lkey;
 453        }
 454        MLX4_ASSERT(lkey == UINT32_MAX || (addr >= entry->start &&
 455                                           addr < entry->end));
 456        return lkey;
 457}
 458
 459/**
  460 * Free MR resources. The MR lock must not be held, to avoid a deadlock:
  461 * rte_free() can raise a memory free event whose callback spins on the lock.
 462 *
 463 * @param mr
 464 *   Pointer to MR to free.
 465 */
 466static void
 467mr_free(struct mlx4_mr *mr)
 468{
 469        if (mr == NULL)
 470                return;
 471        DEBUG("freeing MR(%p):", (void *)mr);
 472        if (mr->ibv_mr != NULL)
 473                claim_zero(mlx4_glue->dereg_mr(mr->ibv_mr));
 474        if (mr->ms_bmp != NULL)
 475                rte_bitmap_free(mr->ms_bmp);
 476        rte_free(mr);
 477}
 478
 479/**
  480 * Release resources of detached MRs that have no online entry.
 481 *
 482 * @param dev
 483 *   Pointer to Ethernet device.
 484 */
 485static void
 486mlx4_mr_garbage_collect(struct rte_eth_dev *dev)
 487{
 488        struct mlx4_priv *priv = dev->data->dev_private;
 489        struct mlx4_mr *mr_next;
 490        struct mlx4_mr_list free_list = LIST_HEAD_INITIALIZER(free_list);
 491
 492        /* Must be called from the primary process. */
 493        MLX4_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
 494        /*
  495         * MRs can't be freed while holding the lock because rte_free() could
  496         * invoke the memory free callback, which would then deadlock.
 497         */
 498        rte_rwlock_write_lock(&priv->mr.rwlock);
 499        /* Detach the whole free list and release it after unlocking. */
 500        free_list = priv->mr.mr_free_list;
 501        LIST_INIT(&priv->mr.mr_free_list);
 502        rte_rwlock_write_unlock(&priv->mr.rwlock);
 503        /* Release resources. */
 504        mr_next = LIST_FIRST(&free_list);
 505        while (mr_next != NULL) {
 506                struct mlx4_mr *mr = mr_next;
 507
 508                mr_next = LIST_NEXT(mr, mr);
 509                mr_free(mr);
 510        }
 511}
 512
 513/* Called during rte_memseg_contig_walk() by mlx4_mr_create(). */
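     /*
      * Returning 0 continues the walk when the segment doesn't contain
      * data->addr; returning 1 stops it once the contiguous range holding the
      * address has been found.
      */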
 514static int
 515mr_find_contig_memsegs_cb(const struct rte_memseg_list *msl,
 516                          const struct rte_memseg *ms, size_t len, void *arg)
 517{
 518        struct mr_find_contig_memsegs_data *data = arg;
 519
 520        if (data->addr < ms->addr_64 || data->addr >= ms->addr_64 + len)
 521                return 0;
 522        /* Found, save it and stop walking. */
 523        data->start = ms->addr_64;
 524        data->end = ms->addr_64 + len;
 525        data->msl = msl;
 526        return 1;
 527}
 528
 529/**
 530 * Create a new global Memory Region (MR) for a missing virtual address.
  531 * This function must be called from a secondary process; a request is then sent
  532 * to the primary process to create an MR for the address. As the global MR list
  533 * resides in shared memory, the following LKey lookup should succeed unless the
  534 * request fails.
 535 *
 536 * @param dev
 537 *   Pointer to Ethernet device.
 538 * @param[out] entry
 539 *   Pointer to returning MR cache entry, found in the global cache or newly
 540 *   created. If failed to create one, this will not be updated.
 541 * @param addr
 542 *   Target virtual address to register.
 543 *
 544 * @return
 545 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 546 */
 547static uint32_t
 548mlx4_mr_create_secondary(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
 549                         uintptr_t addr)
 550{
 551        struct mlx4_priv *priv = dev->data->dev_private;
 552        int ret;
 553
 554        DEBUG("port %u requesting MR creation for address (%p)",
 555              dev->data->port_id, (void *)addr);
 556        ret = mlx4_mp_req_mr_create(dev, addr);
 557        if (ret) {
  558                DEBUG("port %u failed to request MR creation for address (%p)",
 559                      dev->data->port_id, (void *)addr);
 560                return UINT32_MAX;
 561        }
 562        rte_rwlock_read_lock(&priv->mr.rwlock);
 563        /* Fill in output data. */
 564        mr_lookup_dev(dev, entry, addr);
 565        /* Lookup can't fail. */
 566        MLX4_ASSERT(entry->lkey != UINT32_MAX);
 567        rte_rwlock_read_unlock(&priv->mr.rwlock);
 568        DEBUG("port %u MR CREATED by primary process for %p:\n"
 569              "  [0x%" PRIxPTR ", 0x%" PRIxPTR "), lkey=0x%x",
 570              dev->data->port_id, (void *)addr,
 571              entry->start, entry->end, entry->lkey);
 572        return entry->lkey;
 573}
 574
 575/**
 576 * Create a new global Memory Region (MR) for a missing virtual address.
 577 * Register entire virtually contiguous memory chunk around the address.
 578 * This must be called from the primary process.
 579 *
 580 * @param dev
 581 *   Pointer to Ethernet device.
 582 * @param[out] entry
 583 *   Pointer to returning MR cache entry, found in the global cache or newly
 584 *   created. If failed to create one, this will not be updated.
 585 * @param addr
 586 *   Target virtual address to register.
 587 *
 588 * @return
 589 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 590 */
 591uint32_t
 592mlx4_mr_create_primary(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
 593                       uintptr_t addr)
 594{
 595        struct mlx4_priv *priv = dev->data->dev_private;
 596        const struct rte_memseg_list *msl;
 597        const struct rte_memseg *ms;
 598        struct mlx4_mr *mr = NULL;
 599        size_t len;
 600        uint32_t ms_n;
 601        uint32_t bmp_size;
 602        void *bmp_mem;
 603        int ms_idx_shift = -1;
 604        unsigned int n;
 605        struct mr_find_contig_memsegs_data data = {
 606                .addr = addr,
 607        };
 608        struct mr_find_contig_memsegs_data data_re;
 609
 610        DEBUG("port %u creating a MR using address (%p)",
 611              dev->data->port_id, (void *)addr);
 612        /*
  613         * Release detached MRs, if any. This can't be done while holding either
  614         * memory_hotplug_lock or priv->mr.rwlock. MRs on the free list have
  615         * been detached by the memory free event but could not be released
  616         * inside the callback because of the deadlock risk. As a result,
  617         * releasing these resources is opportunistic.
 618         */
 619        mlx4_mr_garbage_collect(dev);
 620        /*
  621         * If enabled, find the in-use contiguous virtual address chunk to
  622         * which the given address belongs, in order to register the maximum
  623         * possible range. In the best case, where mempools are not dynamically
  624         * recreated and '--socket-mem' is specified as an EAL option, it is
  625         * very likely that only one MR (LKey) exists per socket and per
  626         * hugepage size even though system memory is highly fragmented. As the
  627         * whole memory chunk will be pinned by the kernel, it can't be reused
  628         * unless the entire chunk is freed from EAL.
  629         *
  630         * If disabled, just register one memseg (page). Memory consumption is
  631         * then minimized, but performance may drop if there are many MRs to
  632         * look up on the datapath.
 633         */
 634        if (!priv->mr_ext_memseg_en) {
 635                data.msl = rte_mem_virt2memseg_list((void *)addr);
 636                data.start = RTE_ALIGN_FLOOR(addr, data.msl->page_sz);
 637                data.end = data.start + data.msl->page_sz;
 638        } else if (!rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data)) {
 639                WARN("port %u unable to find virtually contiguous"
 640                     " chunk for address (%p)."
 641                     " rte_memseg_contig_walk() failed.",
 642                     dev->data->port_id, (void *)addr);
 643                rte_errno = ENXIO;
 644                goto err_nolock;
 645        }
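             /*
              * The resources below are allocated without holding
              * memory_hotplug_lock; the chunk is re-validated under that lock
              * further down and, if the layout has changed meanwhile, the code
              * jumps back here with a single-page range.
              */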
 646alloc_resources:
 647        /* Addresses must be page-aligned. */
 648        MLX4_ASSERT(rte_is_aligned((void *)data.start, data.msl->page_sz));
 649        MLX4_ASSERT(rte_is_aligned((void *)data.end, data.msl->page_sz));
 650        msl = data.msl;
 651        ms = rte_mem_virt2memseg((void *)data.start, msl);
 652        len = data.end - data.start;
 653        MLX4_ASSERT(msl->page_sz == ms->hugepage_sz);
 654        /* Number of memsegs in the range. */
 655        ms_n = len / msl->page_sz;
 656        DEBUG("port %u extending %p to [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
 657              " page_sz=0x%" PRIx64 ", ms_n=%u",
 658              dev->data->port_id, (void *)addr,
 659              data.start, data.end, msl->page_sz, ms_n);
 660        /* Size of memory for bitmap. */
 661        bmp_size = rte_bitmap_get_memory_footprint(ms_n);
 662        mr = rte_zmalloc_socket(NULL,
 663                                RTE_ALIGN_CEIL(sizeof(*mr),
 664                                               RTE_CACHE_LINE_SIZE) +
 665                                bmp_size,
 666                                RTE_CACHE_LINE_SIZE, msl->socket_id);
 667        if (mr == NULL) {
 668                WARN("port %u unable to allocate memory for a new MR of"
 669                     " address (%p).",
 670                     dev->data->port_id, (void *)addr);
 671                rte_errno = ENOMEM;
 672                goto err_nolock;
 673        }
 674        mr->msl = msl;
 675        /*
 676         * Save the index of the first memseg and initialize memseg bitmap. To
 677         * see if a memseg of ms_idx in the memseg-list is still valid, check:
 678         *      rte_bitmap_get(mr->bmp, ms_idx - mr->ms_base_idx)
 679         */
 680        mr->ms_base_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
 681        bmp_mem = RTE_PTR_ALIGN_CEIL(mr + 1, RTE_CACHE_LINE_SIZE);
 682        mr->ms_bmp = rte_bitmap_init(ms_n, bmp_mem, bmp_size);
 683        if (mr->ms_bmp == NULL) {
 684                WARN("port %u unable to initialize bitmap for a new MR of"
 685                     " address (%p).",
 686                     dev->data->port_id, (void *)addr);
 687                rte_errno = EINVAL;
 688                goto err_nolock;
 689        }
 690        /*
  691         * Recheck whether the extended contiguous chunk is still valid.
  692         * Because memory_hotplug_lock can't be held across memory-related
  693         * calls in a critical path, the resource allocation above is done
  694         * without locking. If the memory layout has changed by now, try again
  695         * with just a single page. Otherwise, go on with the big chunk
  696         * atomically from here.
 697         */
 698        rte_mcfg_mem_read_lock();
 699        data_re = data;
 700        if (len > msl->page_sz &&
 701            !rte_memseg_contig_walk(mr_find_contig_memsegs_cb, &data_re)) {
 702                WARN("port %u unable to find virtually contiguous"
 703                     " chunk for address (%p)."
 704                     " rte_memseg_contig_walk() failed.",
 705                     dev->data->port_id, (void *)addr);
 706                rte_errno = ENXIO;
 707                goto err_memlock;
 708        }
 709        if (data.start != data_re.start || data.end != data_re.end) {
 710                /*
 711                 * The extended contiguous chunk has been changed. Try again
 712                 * with single memseg instead.
 713                 */
 714                data.start = RTE_ALIGN_FLOOR(addr, msl->page_sz);
 715                data.end = data.start + msl->page_sz;
 716                rte_mcfg_mem_read_unlock();
 717                mr_free(mr);
 718                goto alloc_resources;
 719        }
 720        MLX4_ASSERT(data.msl == data_re.msl);
 721        rte_rwlock_write_lock(&priv->mr.rwlock);
 722        /*
  723         * Check that the address is really missing. If another thread already
  724         * created one, or it was only missed due to cache overflow, abort.
 725         */
 726        if (mr_lookup_dev(dev, entry, addr) != UINT32_MAX) {
 727                /*
  728                 * Insert it into the global cache table. This may fail when
  729                 * memory is low; the entry will then have to be looked up
  730                 * here again.
 731                 */
 732                mr_btree_insert(&priv->mr.cache, entry);
 733                DEBUG("port %u found MR for %p on final lookup, abort",
 734                      dev->data->port_id, (void *)addr);
 735                rte_rwlock_write_unlock(&priv->mr.rwlock);
 736                rte_mcfg_mem_read_unlock();
 737                /*
 738                 * Must be unlocked before calling rte_free() because
 739                 * mlx4_mr_mem_event_free_cb() can be called inside.
 740                 */
 741                mr_free(mr);
 742                return entry->lkey;
 743        }
 744        /*
 745         * Trim start and end addresses for verbs MR. Set bits for registering
 746         * memsegs but exclude already registered ones. Bitmap can be
 747         * fragmented.
 748         */
 749        for (n = 0; n < ms_n; ++n) {
 750                uintptr_t start;
 751                struct mlx4_mr_cache ret;
 752
 753                memset(&ret, 0, sizeof(ret));
 754                start = data_re.start + n * msl->page_sz;
 755                /* Exclude memsegs already registered by other MRs. */
 756                if (mr_lookup_dev(dev, &ret, start) == UINT32_MAX) {
 757                        /*
 758                         * Start from the first unregistered memseg in the
 759                         * extended range.
 760                         */
 761                        if (ms_idx_shift == -1) {
 762                                mr->ms_base_idx += n;
 763                                data.start = start;
 764                                ms_idx_shift = n;
 765                        }
 766                        data.end = start + msl->page_sz;
 767                        rte_bitmap_set(mr->ms_bmp, n - ms_idx_shift);
 768                        ++mr->ms_n;
 769                }
 770        }
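             /*
              * [data.start, data.end) now spans from the first to the last
              * memseg that was not already registered; the bitmap may keep
              * holes for those that were. ms_bmp_n below is the number of
              * pages in this trimmed range.
              */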
 771        len = data.end - data.start;
 772        mr->ms_bmp_n = len / msl->page_sz;
 773        MLX4_ASSERT(ms_idx_shift + mr->ms_bmp_n <= ms_n);
 774        /*
 775         * Finally create a verbs MR for the memory chunk. ibv_reg_mr() can be
  776         * called while holding the memory lock because it doesn't use
 777         * mlx4_alloc_buf_extern() which eventually calls rte_malloc_socket()
 778         * through mlx4_alloc_verbs_buf().
 779         */
 780        mr->ibv_mr = mlx4_glue->reg_mr(priv->pd, (void *)data.start, len,
 781                                       IBV_ACCESS_LOCAL_WRITE);
 782        if (mr->ibv_mr == NULL) {
  783                WARN("port %u failed to create a verbs MR for address (%p)",
 784                     dev->data->port_id, (void *)addr);
 785                rte_errno = EINVAL;
 786                goto err_mrlock;
 787        }
 788        MLX4_ASSERT((uintptr_t)mr->ibv_mr->addr == data.start);
 789        MLX4_ASSERT(mr->ibv_mr->length == len);
 790        LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
 791        DEBUG("port %u MR CREATED (%p) for %p:\n"
 792              "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
 793              " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
 794              dev->data->port_id, (void *)mr, (void *)addr,
 795              data.start, data.end, rte_cpu_to_be_32(mr->ibv_mr->lkey),
 796              mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
 797        /* Insert to the global cache table. */
 798        mr_insert_dev_cache(dev, mr);
 799        /* Fill in output data. */
 800        mr_lookup_dev(dev, entry, addr);
 801        /* Lookup can't fail. */
 802        MLX4_ASSERT(entry->lkey != UINT32_MAX);
 803        rte_rwlock_write_unlock(&priv->mr.rwlock);
 804        rte_mcfg_mem_read_unlock();
 805        return entry->lkey;
 806err_mrlock:
 807        rte_rwlock_write_unlock(&priv->mr.rwlock);
 808err_memlock:
 809        rte_mcfg_mem_read_unlock();
 810err_nolock:
 811        /*
  812         * In case of error, as this can be called from a datapath, a warning
  813         * message per error is preferable instead. Must be unlocked before
 814         * calling rte_free() because mlx4_mr_mem_event_free_cb() can be called
 815         * inside.
 816         */
 817        mr_free(mr);
 818        return UINT32_MAX;
 819}
 820
 821/**
 822 * Create a new global Memory Region (MR) for a missing virtual address.
  823 * This can be called from both primary and secondary processes.
 824 *
 825 * @param dev
 826 *   Pointer to Ethernet device.
 827 * @param[out] entry
 828 *   Pointer to returning MR cache entry, found in the global cache or newly
 829 *   created. If failed to create one, this will not be updated.
 830 * @param addr
 831 *   Target virtual address to register.
 832 *
 833 * @return
 834 *   Searched LKey on success, UINT32_MAX on failure and rte_errno is set.
 835 */
 836static uint32_t
 837mlx4_mr_create(struct rte_eth_dev *dev, struct mlx4_mr_cache *entry,
 838               uintptr_t addr)
 839{
 840        uint32_t ret = 0;
 841
 842        switch (rte_eal_process_type()) {
 843        case RTE_PROC_PRIMARY:
 844                ret = mlx4_mr_create_primary(dev, entry, addr);
 845                break;
 846        case RTE_PROC_SECONDARY:
 847                ret = mlx4_mr_create_secondary(dev, entry, addr);
 848                break;
 849        default:
 850                break;
 851        }
 852        return ret;
 853}
 854
 855/**
 856 * Rebuild the global B-tree cache of device from the original MR list.
 857 *
 858 * @param dev
 859 *   Pointer to Ethernet device.
 860 */
 861static void
 862mr_rebuild_dev_cache(struct rte_eth_dev *dev)
 863{
 864        struct mlx4_priv *priv = dev->data->dev_private;
 865        struct mlx4_mr *mr;
 866
 867        DEBUG("port %u rebuild dev cache[]", dev->data->port_id);
 868        /* Flush cache to rebuild. */
 869        priv->mr.cache.len = 1;
 870        priv->mr.cache.overflow = 0;
 871        /* Iterate all the existing MRs. */
 872        LIST_FOREACH(mr, &priv->mr.mr_list, mr)
 873                if (mr_insert_dev_cache(dev, mr) < 0)
 874                        return;
 875}
 876
 877/**
  878 * Callback for memory free event. Iterate over freed memsegs and check whether
  879 * each belongs to an existing MR. If so, clear the corresponding bit in the
  880 * MR's bitmap; the MR may become fragmented as a result. If it becomes empty,
  881 * the MR will be freed later by mlx4_mr_garbage_collect().
 882 *
 883 * The global cache must be rebuilt if there's any change and this event has to
 884 * be propagated to dataplane threads to flush the local caches.
 885 *
 886 * @param dev
 887 *   Pointer to Ethernet device.
 888 * @param addr
 889 *   Address of freed memory.
 890 * @param len
 891 *   Size of freed memory.
 892 */
 893static void
 894mlx4_mr_mem_event_free_cb(struct rte_eth_dev *dev, const void *addr, size_t len)
 895{
 896        struct mlx4_priv *priv = dev->data->dev_private;
 897        const struct rte_memseg_list *msl;
 898        struct mlx4_mr *mr;
 899        int ms_n;
 900        int i;
 901        int rebuild = 0;
 902
 903        DEBUG("port %u free callback: addr=%p, len=%zu",
 904              dev->data->port_id, addr, len);
 905        msl = rte_mem_virt2memseg_list(addr);
 906        /* addr and len must be page-aligned. */
 907        MLX4_ASSERT((uintptr_t)addr ==
 908                    RTE_ALIGN((uintptr_t)addr, msl->page_sz));
 909        MLX4_ASSERT(len == RTE_ALIGN(len, msl->page_sz));
 910        ms_n = len / msl->page_sz;
 911        rte_rwlock_write_lock(&priv->mr.rwlock);
 912        /* Clear bits of freed memsegs from MR. */
 913        for (i = 0; i < ms_n; ++i) {
 914                const struct rte_memseg *ms;
 915                struct mlx4_mr_cache entry;
 916                uintptr_t start;
 917                int ms_idx;
 918                uint32_t pos;
 919
 920                /* Find MR having this memseg. */
 921                start = (uintptr_t)addr + i * msl->page_sz;
 922                mr = mr_lookup_dev_list(dev, &entry, start);
 923                if (mr == NULL)
 924                        continue;
 925                MLX4_ASSERT(mr->msl); /* Can't be external memory. */
 926                ms = rte_mem_virt2memseg((void *)start, msl);
 927                MLX4_ASSERT(ms != NULL);
 928                MLX4_ASSERT(msl->page_sz == ms->hugepage_sz);
 929                ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
 930                pos = ms_idx - mr->ms_base_idx;
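                     /* pos is the bit index relative to the MR's first memseg. */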
 931                MLX4_ASSERT(rte_bitmap_get(mr->ms_bmp, pos));
 932                MLX4_ASSERT(pos < mr->ms_bmp_n);
 933                DEBUG("port %u MR(%p): clear bitmap[%u] for addr %p",
 934                      dev->data->port_id, (void *)mr, pos, (void *)start);
 935                rte_bitmap_clear(mr->ms_bmp, pos);
 936                if (--mr->ms_n == 0) {
 937                        LIST_REMOVE(mr, mr);
 938                        LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
 939                        DEBUG("port %u remove MR(%p) from list",
 940                              dev->data->port_id, (void *)mr);
 941                }
 942                /*
  943                 * The MR is fragmented or will be freed. The global cache must
  944                 * be rebuilt.
 945                 */
 946                rebuild = 1;
 947        }
 948        if (rebuild) {
 949                mr_rebuild_dev_cache(dev);
 950                /*
  951                 * Flush local caches by propagating invalidation across cores.
  952                 * rte_smp_wmb() is enough to synchronize this event. If one of
  953                 * the freed memsegs is seen by another core, that means the
  954                 * memseg has been re-allocated by the allocator, which happens
  955                 * after this free call. Therefore, the store below (incrementing
  956                 * the generation) is guaranteed to be visible to that core
  957                 * before it sees the newly allocated memory.
 958                 */
 959                ++priv->mr.dev_gen;
 960                DEBUG("broadcasting local cache flush, gen=%d",
 961                      priv->mr.dev_gen);
 962                rte_smp_wmb();
 963        }
 964        rte_rwlock_write_unlock(&priv->mr.rwlock);
 965#ifdef RTE_LIBRTE_MLX4_DEBUG
 966        if (rebuild)
 967                mlx4_mr_dump_dev(dev);
 968#endif
 969}
 970
 971/**
 972 * Callback for memory event.
 973 *
 974 * @param event_type
 975 *   Memory event type.
 976 * @param addr
 977 *   Address of memory.
 978 * @param len
 979 *   Size of memory.
 980 */
 981void
 982mlx4_mr_mem_event_cb(enum rte_mem_event event_type, const void *addr,
 983                     size_t len, void *arg __rte_unused)
 984{
 985        struct mlx4_priv *priv;
 986        struct mlx4_dev_list *dev_list = &mlx4_shared_data->mem_event_cb_list;
 987
 988        /* Must be called from the primary process. */
 989        MLX4_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
 990        switch (event_type) {
 991        case RTE_MEM_EVENT_FREE:
 992                rte_rwlock_read_lock(&mlx4_shared_data->mem_event_rwlock);
 993                /* Iterate all the existing mlx4 devices. */
 994                LIST_FOREACH(priv, dev_list, mem_event_cb)
 995                        mlx4_mr_mem_event_free_cb(ETH_DEV(priv), addr, len);
 996                rte_rwlock_read_unlock(&mlx4_shared_data->mem_event_rwlock);
 997                break;
 998        case RTE_MEM_EVENT_ALLOC:
 999        default:
1000                break;
1001        }
1002}
1003
1004/**
1005 * Look up address in the global MR cache table. If not found, create a new MR.
1006 * Insert the found/created entry to local bottom-half cache table.
1007 *
1008 * @param dev
1009 *   Pointer to Ethernet device.
1010 * @param mr_ctrl
1011 *   Pointer to per-queue MR control structure.
1012 * @param[out] entry
1013 *   Pointer to returning MR cache entry, found in the global cache or newly
1014 *   created. If failed to create one, this is not written.
1015 * @param addr
1016 *   Search key.
1017 *
1018 * @return
1019 *   Searched LKey on success, UINT32_MAX on no match.
1020 */
1021static uint32_t
1022mlx4_mr_lookup_dev(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
1023                   struct mlx4_mr_cache *entry, uintptr_t addr)
1024{
1025        struct mlx4_priv *priv = dev->data->dev_private;
1026        struct mlx4_mr_btree *bt = &mr_ctrl->cache_bh;
1027        uint16_t idx;
1028        uint32_t lkey;
1029
1030        /* If local cache table is full, try to double it. */
1031        if (unlikely(bt->len == bt->size))
1032                mr_btree_expand(bt, bt->size << 1);
1033        /* Look up in the global cache. */
1034        rte_rwlock_read_lock(&priv->mr.rwlock);
1035        lkey = mr_btree_lookup(&priv->mr.cache, &idx, addr);
1036        if (lkey != UINT32_MAX) {
1037                /* Found. */
1038                *entry = (*priv->mr.cache.table)[idx];
1039                rte_rwlock_read_unlock(&priv->mr.rwlock);
1040                /*
1041                 * Update local cache. Even if it fails, return the found entry
1042                 * to update top-half cache. Next time, this entry will be found
1043                 * in the global cache.
1044                 */
1045                mr_btree_insert(bt, entry);
1046                return lkey;
1047        }
1048        rte_rwlock_read_unlock(&priv->mr.rwlock);
1049        /* First time to see the address? Create a new MR. */
1050        lkey = mlx4_mr_create(dev, entry, addr);
1051        /*
 1052         * Update the local cache if a new global MR was successfully created.
 1053         * If creation failed, there is no action to take in this datapath
 1054         * code: the returned LKey is invalid and will eventually make the HW
 1055         * fail.
1056         */
1057        if (lkey != UINT32_MAX)
1058                mr_btree_insert(bt, entry);
1059        return lkey;
1060}
1061
1062/**
 1063 * Bottom-half of LKey search on datapath. First search in cache_bh[] and, on a
 1064 * miss, search the global MR cache table and add the new entry to the
 1065 * per-queue local caches.
1066 *
1067 * @param dev
1068 *   Pointer to Ethernet device.
1069 * @param mr_ctrl
1070 *   Pointer to per-queue MR control structure.
1071 * @param addr
1072 *   Search key.
1073 *
1074 * @return
1075 *   Searched LKey on success, UINT32_MAX on no match.
1076 */
1077static uint32_t
1078mlx4_mr_addr2mr_bh(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
1079                   uintptr_t addr)
1080{
1081        uint32_t lkey;
1082        uint16_t bh_idx = 0;
1083        /* Victim in top-half cache to replace with new entry. */
1084        struct mlx4_mr_cache *repl = &mr_ctrl->cache[mr_ctrl->head];
1085
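             /*
              * Note: the linear top-half cache (mr_ctrl->cache[]) is expected
              * to have been searched inline on the datapath before reaching
              * this bottom half; only the per-queue B-tree and the global
              * cache are consulted here.
              */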
1086        /* Binary-search MR translation table. */
1087        lkey = mr_btree_lookup(&mr_ctrl->cache_bh, &bh_idx, addr);
1088        /* Update top-half cache. */
1089        if (likely(lkey != UINT32_MAX)) {
1090                *repl = (*mr_ctrl->cache_bh.table)[bh_idx];
1091        } else {
1092                /*
1093                 * If missed in local lookup table, search in the global cache
1094                 * and local cache_bh[] will be updated inside if possible.
1095                 * Top-half cache entry will also be updated.
1096                 */
1097                lkey = mlx4_mr_lookup_dev(dev, mr_ctrl, repl, addr);
1098                if (unlikely(lkey == UINT32_MAX))
1099                        return UINT32_MAX;
1100        }
1101        /* Update the most recently used entry. */
1102        mr_ctrl->mru = mr_ctrl->head;
1103        /* Point to the next victim, the oldest. */
1104        mr_ctrl->head = (mr_ctrl->head + 1) % MLX4_MR_CACHE_N;
1105        return lkey;
1106}
1107
1108/**
1109 * Bottom-half of LKey search on Rx.
1110 *
1111 * @param rxq
1112 *   Pointer to Rx queue structure.
1113 * @param addr
1114 *   Search key.
1115 *
1116 * @return
1117 *   Searched LKey on success, UINT32_MAX on no match.
1118 */
1119uint32_t
1120mlx4_rx_addr2mr_bh(struct rxq *rxq, uintptr_t addr)
1121{
1122        struct mlx4_mr_ctrl *mr_ctrl = &rxq->mr_ctrl;
1123        struct mlx4_priv *priv = rxq->priv;
1124
1125        return mlx4_mr_addr2mr_bh(ETH_DEV(priv), mr_ctrl, addr);
1126}
1127
1128/**
1129 * Bottom-half of LKey search on Tx.
1130 *
1131 * @param txq
1132 *   Pointer to Tx queue structure.
1133 * @param addr
1134 *   Search key.
1135 *
1136 * @return
1137 *   Searched LKey on success, UINT32_MAX on no match.
1138 */
1139static uint32_t
1140mlx4_tx_addr2mr_bh(struct txq *txq, uintptr_t addr)
1141{
1142        struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
1143        struct mlx4_priv *priv = txq->priv;
1144
1145        return mlx4_mr_addr2mr_bh(ETH_DEV(priv), mr_ctrl, addr);
1146}
1147
1148/**
 1149 * Bottom-half of LKey search on Tx. If the address can't be found in the
 1150 * memseg list, register the mbuf's mempool as externally allocated memory.
1151 *
1152 * @param txq
1153 *   Pointer to Tx queue structure.
1154 * @param mb
1155 *   Pointer to mbuf.
1156 *
1157 * @return
1158 *   Searched LKey on success, UINT32_MAX on no match.
1159 */
1160uint32_t
1161mlx4_tx_mb2mr_bh(struct txq *txq, struct rte_mbuf *mb)
1162{
1163        uintptr_t addr = (uintptr_t)mb->buf_addr;
1164        uint32_t lkey;
1165
1166        lkey = mlx4_tx_addr2mr_bh(txq, addr);
1167        if (lkey == UINT32_MAX && rte_errno == ENXIO) {
1168                /* Mempool may have externally allocated memory. */
1169                return mlx4_tx_update_ext_mp(txq, addr, mlx4_mb2mp(mb));
1170        }
1171        return lkey;
1172}
1173
1174/**
1175 * Flush all of the local cache entries.
1176 *
1177 * @param mr_ctrl
1178 *   Pointer to per-queue MR control structure.
1179 */
1180void
1181mlx4_mr_flush_local_cache(struct mlx4_mr_ctrl *mr_ctrl)
1182{
1183        /* Reset the most-recently-used index. */
1184        mr_ctrl->mru = 0;
1185        /* Reset the linear search array. */
1186        mr_ctrl->head = 0;
1187        memset(mr_ctrl->cache, 0, sizeof(mr_ctrl->cache));
1188        /* Reset the B-tree table. */
1189        mr_ctrl->cache_bh.len = 1;
1190        mr_ctrl->cache_bh.overflow = 0;
1191        /* Update the generation number. */
1192        mr_ctrl->cur_gen = *mr_ctrl->dev_gen_ptr;
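             /*
              * From here on, this control structure is considered in sync with
              * the device generation number (see mlx4_mr_mem_event_free_cb()).
              */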
1193        DEBUG("mr_ctrl(%p): flushed, cur_gen=%d",
1194              (void *)mr_ctrl, mr_ctrl->cur_gen);
1195}
1196
1197/**
1198 * Called during rte_mempool_mem_iter() by mlx4_mr_update_ext_mp().
1199 *
 1200 * An externally allocated chunk is registered and an MR is created for it.
 1201 * The MR object is added to the global list. If the memseg list of an MR
 1202 * object (mr->msl) is NULL, the MR object can be regarded as externally
 1203 * allocated memory.
 1204 *
 1205 * Once external memory is registered, it should be static. If the memory is
 1206 * freed and the virtual address range is later mapped to different physical
 1207 * memory, the stale translation entry may crash the device. The PMD can't
 1208 * track free events for external memory at the moment.
1209 */
1210static void
1211mlx4_mr_update_ext_mp_cb(struct rte_mempool *mp, void *opaque,
1212                         struct rte_mempool_memhdr *memhdr,
1213                         unsigned mem_idx __rte_unused)
1214{
1215        struct mr_update_mp_data *data = opaque;
1216        struct rte_eth_dev *dev = data->dev;
1217        struct mlx4_priv *priv = dev->data->dev_private;
1218        struct mlx4_mr_ctrl *mr_ctrl = data->mr_ctrl;
1219        struct mlx4_mr *mr = NULL;
1220        uintptr_t addr = (uintptr_t)memhdr->addr;
1221        size_t len = memhdr->len;
1222        struct mlx4_mr_cache entry;
1223        uint32_t lkey;
1224
1225        MLX4_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
1226        /* If already registered, it should return. */
1227        rte_rwlock_read_lock(&priv->mr.rwlock);
1228        lkey = mr_lookup_dev(dev, &entry, addr);
1229        rte_rwlock_read_unlock(&priv->mr.rwlock);
1230        if (lkey != UINT32_MAX)
1231                return;
1232        mr = rte_zmalloc_socket(NULL,
1233                                RTE_ALIGN_CEIL(sizeof(*mr),
1234                                               RTE_CACHE_LINE_SIZE),
1235                                RTE_CACHE_LINE_SIZE, mp->socket_id);
1236        if (mr == NULL) {
1237                WARN("port %u unable to allocate memory for a new MR of"
1238                     " mempool (%s).",
1239                     dev->data->port_id, mp->name);
1240                data->ret = -1;
1241                return;
1242        }
1243        DEBUG("port %u register MR for chunk #%d of mempool (%s)",
1244              dev->data->port_id, mem_idx, mp->name);
1245        mr->ibv_mr = mlx4_glue->reg_mr(priv->pd, (void *)addr, len,
1246                                       IBV_ACCESS_LOCAL_WRITE);
1247        if (mr->ibv_mr == NULL) {
 1248                WARN("port %u failed to create a verbs MR for address (%p)",
1249                     dev->data->port_id, (void *)addr);
1250                rte_free(mr);
1251                data->ret = -1;
1252                return;
1253        }
1254        mr->msl = NULL; /* Mark it is external memory. */
1255        mr->ms_bmp = NULL;
1256        mr->ms_n = 1;
1257        mr->ms_bmp_n = 1;
1258        rte_rwlock_write_lock(&priv->mr.rwlock);
1259        LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
1260        DEBUG("port %u MR CREATED (%p) for external memory %p:\n"
1261              "  [0x%" PRIxPTR ", 0x%" PRIxPTR "),"
1262              " lkey=0x%x base_idx=%u ms_n=%u, ms_bmp_n=%u",
1263              dev->data->port_id, (void *)mr, (void *)addr,
1264              addr, addr + len, rte_cpu_to_be_32(mr->ibv_mr->lkey),
1265              mr->ms_base_idx, mr->ms_n, mr->ms_bmp_n);
1266        /* Insert to the global cache table. */
1267        mr_insert_dev_cache(dev, mr);
1268        rte_rwlock_write_unlock(&priv->mr.rwlock);
1269        /* Insert to the local cache table */
1270        mlx4_mr_addr2mr_bh(dev, mr_ctrl, addr);
1271}
1272
1273/**
 1274 * Register MRs for all memory chunks of a mempool that has externally
 1275 * allocated memory and fill in the local cache.
1276 *
1277 * @param dev
1278 *   Pointer to Ethernet device.
1279 * @param mr_ctrl
1280 *   Pointer to per-queue MR control structure.
1281 * @param mp
1282 *   Pointer to registering Mempool.
1283 *
1284 * @return
1285 *   0 on success, -1 on failure.
1286 */
 1287static int
1288mlx4_mr_update_ext_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
1289                      struct rte_mempool *mp)
1290{
1291        struct mr_update_mp_data data = {
1292                .dev = dev,
1293                .mr_ctrl = mr_ctrl,
1294                .ret = 0,
1295        };
1296
1297        rte_mempool_mem_iter(mp, mlx4_mr_update_ext_mp_cb, &data);
1298        return data.ret;
1299}
1300
1301/**
 1302 * Register MRs for all memory chunks of a mempool that has externally
 1303 * allocated memory and look up the LKey of the address to return.
 1304 *
 1305 * @param txq
 1306 *   Pointer to Tx queue structure.
1307 * @param addr
1308 *   Search key.
1309 * @param mp
1310 *   Pointer to registering Mempool where addr belongs.
1311 *
1312 * @return
1313 *   LKey for address on success, UINT32_MAX on failure.
1314 */
1315uint32_t
1316mlx4_tx_update_ext_mp(struct txq *txq, uintptr_t addr, struct rte_mempool *mp)
1317{
1318        struct mlx4_mr_ctrl *mr_ctrl = &txq->mr_ctrl;
1319        struct mlx4_priv *priv = txq->priv;
1320
1321        if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
1322                WARN("port %u using address (%p) from unregistered mempool"
1323                     " having externally allocated memory"
1324                     " in secondary process, please create mempool"
1325                     " prior to rte_eth_dev_start()",
1326                     PORT_ID(priv), (void *)addr);
1327                return UINT32_MAX;
1328        }
1329        mlx4_mr_update_ext_mp(ETH_DEV(priv), mr_ctrl, mp);
1330        return mlx4_tx_addr2mr_bh(txq, addr);
1331}
1332
1333/* Called during rte_mempool_mem_iter() by mlx4_mr_update_mp(). */
1334static void
1335mlx4_mr_update_mp_cb(struct rte_mempool *mp __rte_unused, void *opaque,
1336                     struct rte_mempool_memhdr *memhdr,
1337                     unsigned mem_idx __rte_unused)
1338{
1339        struct mr_update_mp_data *data = opaque;
1340        uint32_t lkey;
1341
1342        /* Stop iteration if failed in the previous walk. */
1343        if (data->ret < 0)
1344                return;
1345        /* Register address of the chunk and update local caches. */
1346        lkey = mlx4_mr_addr2mr_bh(data->dev, data->mr_ctrl,
1347                                  (uintptr_t)memhdr->addr);
1348        if (lkey == UINT32_MAX)
1349                data->ret = -1;
1350}
1351
1352/**
1353 * Register entire memory chunks in a Mempool.
1354 *
1355 * @param dev
1356 *   Pointer to Ethernet device.
1357 * @param mr_ctrl
1358 *   Pointer to per-queue MR control structure.
1359 * @param mp
1360 *   Pointer to registering Mempool.
1361 *
1362 * @return
1363 *   0 on success, -1 on failure.
1364 */
1365int
1366mlx4_mr_update_mp(struct rte_eth_dev *dev, struct mlx4_mr_ctrl *mr_ctrl,
1367                  struct rte_mempool *mp)
1368{
1369        struct mr_update_mp_data data = {
1370                .dev = dev,
1371                .mr_ctrl = mr_ctrl,
1372                .ret = 0,
1373        };
1374
1375        rte_mempool_mem_iter(mp, mlx4_mr_update_mp_cb, &data);
1376        if (data.ret < 0 && rte_errno == ENXIO) {
1377                /* Mempool may have externally allocated memory. */
1378                return mlx4_mr_update_ext_mp(dev, mr_ctrl, mp);
1379        }
1380        return data.ret;
1381}
1382
1383#ifdef RTE_LIBRTE_MLX4_DEBUG
1384/**
1385 * Dump all the created MRs and the global cache entries.
1386 *
1387 * @param dev
1388 *   Pointer to Ethernet device.
1389 */
1390void
1391mlx4_mr_dump_dev(struct rte_eth_dev *dev)
1392{
1393        struct mlx4_priv *priv = dev->data->dev_private;
1394        struct mlx4_mr *mr;
1395        int mr_n = 0;
1396        int chunk_n = 0;
1397
1398        rte_rwlock_read_lock(&priv->mr.rwlock);
1399        /* Iterate all the existing MRs. */
1400        LIST_FOREACH(mr, &priv->mr.mr_list, mr) {
1401                unsigned int n;
1402
1403                DEBUG("port %u MR[%u], LKey = 0x%x, ms_n = %u, ms_bmp_n = %u",
1404                      dev->data->port_id, mr_n++,
1405                      rte_cpu_to_be_32(mr->ibv_mr->lkey),
1406                      mr->ms_n, mr->ms_bmp_n);
1407                if (mr->ms_n == 0)
1408                        continue;
1409                for (n = 0; n < mr->ms_bmp_n; ) {
1410                        struct mlx4_mr_cache ret;
1411
1412                        memset(&ret, 0, sizeof(ret));
1413                        n = mr_find_next_chunk(mr, &ret, n);
1414                        if (!ret.end)
1415                                break;
1416                        DEBUG("  chunk[%u], [0x%" PRIxPTR ", 0x%" PRIxPTR ")",
1417                              chunk_n++, ret.start, ret.end);
1418                }
1419        }
1420        DEBUG("port %u dumping global cache", dev->data->port_id);
1421        mlx4_mr_btree_dump(&priv->mr.cache);
1422        rte_rwlock_read_unlock(&priv->mr.rwlock);
1423}
1424#endif
1425
1426/**
 1427 * Release all the created MRs and resources. Remove the device from the memory
 1428 * callback list.
1429 *
1430 * @param dev
1431 *   Pointer to Ethernet device.
1432 */
1433void
1434mlx4_mr_release(struct rte_eth_dev *dev)
1435{
1436        struct mlx4_priv *priv = dev->data->dev_private;
1437        struct mlx4_mr *mr_next;
1438
1439        /* Remove from memory callback device list. */
1440        rte_rwlock_write_lock(&mlx4_shared_data->mem_event_rwlock);
1441        LIST_REMOVE(priv, mem_event_cb);
1442        rte_rwlock_write_unlock(&mlx4_shared_data->mem_event_rwlock);
1443#ifdef RTE_LIBRTE_MLX4_DEBUG
1444        mlx4_mr_dump_dev(dev);
1445#endif
1446        rte_rwlock_write_lock(&priv->mr.rwlock);
1447        /* Detach from MR list and move to free list. */
1448        mr_next = LIST_FIRST(&priv->mr.mr_list);
1449        while (mr_next != NULL) {
1450                struct mlx4_mr *mr = mr_next;
1451
1452                mr_next = LIST_NEXT(mr, mr);
1453                LIST_REMOVE(mr, mr);
1454                LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
1455        }
1456        LIST_INIT(&priv->mr.mr_list);
1457        /* Free global cache. */
1458        mlx4_mr_btree_free(&priv->mr.cache);
1459        rte_rwlock_write_unlock(&priv->mr.rwlock);
1460        /* Free all remaining MRs. */
1461        mlx4_mr_garbage_collect(dev);
1462}
1463