dpdk/lib/librte_eal/common/eal_common_memory.c
   1/* SPDX-License-Identifier: BSD-3-Clause
   2 * Copyright(c) 2010-2014 Intel Corporation
   3 */
   4
   5#include <fcntl.h>
   6#include <errno.h>
   7#include <stdio.h>
   8#include <stdint.h>
   9#include <stdlib.h>
  10#include <stdarg.h>
  11#include <string.h>
  12#include <unistd.h>
  13#include <inttypes.h>
  14#include <sys/queue.h>
  15
  16#include <rte_fbarray.h>
  17#include <rte_memory.h>
  18#include <rte_eal.h>
  19#include <rte_eal_memconfig.h>
  20#include <rte_eal_paging.h>
  21#include <rte_errno.h>
  22#include <rte_log.h>
  23
  24#include "eal_memalloc.h"
  25#include "eal_private.h"
  26#include "eal_internal_cfg.h"
  27#include "eal_memcfg.h"
  28#include "eal_options.h"
  29#include "malloc_heap.h"
  30
  31/*
  32 * Try to reserve *size bytes of virtual address space. On success, return
  33 * a pointer to the reserved area and keep *size unmodified. Otherwise, if
  34 * EAL_VIRTUAL_AREA_ALLOW_SHRINK is set, retry with a smaller area,
  35 * decreasing *size by page_sz until it reaches 0, and return NULL in that
  36 * case. The returned address is always aligned to page_sz.
  37 */
  38
  39#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
  40
  41static void *next_baseaddr;
  42static uint64_t system_page_sz;
  43
  44#define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
  45void *
  46eal_get_virtual_area(void *requested_addr, size_t *size,
  47        size_t page_sz, int flags, int reserve_flags)
  48{
  49        bool addr_is_hint, allow_shrink, unmap, no_align;
  50        uint64_t map_sz;
  51        void *mapped_addr, *aligned_addr;
  52        uint8_t try = 0;
  53        struct internal_config *internal_conf =
  54                eal_get_internal_configuration();
  55
  56        if (system_page_sz == 0)
  57                system_page_sz = rte_mem_page_size();
  58
  59        RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size);
  60
  61        addr_is_hint = (flags & EAL_VIRTUAL_AREA_ADDR_IS_HINT) > 0;
  62        allow_shrink = (flags & EAL_VIRTUAL_AREA_ALLOW_SHRINK) > 0;
  63        unmap = (flags & EAL_VIRTUAL_AREA_UNMAP) > 0;
  64
  65        if (next_baseaddr == NULL && internal_conf->base_virtaddr != 0 &&
  66                        rte_eal_process_type() == RTE_PROC_PRIMARY)
  67                next_baseaddr = (void *) internal_conf->base_virtaddr;
  68
  69#ifdef RTE_ARCH_64
  70        if (next_baseaddr == NULL && internal_conf->base_virtaddr == 0 &&
  71                        rte_eal_process_type() == RTE_PROC_PRIMARY)
  72                next_baseaddr = (void *) eal_get_baseaddr();
  73#endif
  74        if (requested_addr == NULL && next_baseaddr != NULL) {
  75                requested_addr = next_baseaddr;
  76                requested_addr = RTE_PTR_ALIGN(requested_addr, page_sz);
  77                addr_is_hint = true;
  78        }
  79
  80        /* we don't need alignment of resulting pointer in the following cases:
  81         *
  82         * 1. page size is equal to system page size
  83         * 2. we have a requested address, and it is page-aligned, and we will
  84         *    be discarding the address if we get a different one.
  85         *
  86         * for all other cases, alignment is potentially necessary.
  87         */
  88        no_align = (requested_addr != NULL &&
  89                requested_addr == RTE_PTR_ALIGN(requested_addr, page_sz) &&
  90                !addr_is_hint) ||
  91                page_sz == system_page_sz;
  92
  93        do {
  94                map_sz = no_align ? *size : *size + page_sz;
  95                if (map_sz > SIZE_MAX) {
  96                        RTE_LOG(ERR, EAL, "Map size too big\n");
  97                        rte_errno = E2BIG;
  98                        return NULL;
  99                }
 100
 101                mapped_addr = eal_mem_reserve(
 102                        requested_addr, (size_t)map_sz, reserve_flags);
 103                if ((mapped_addr == NULL) && allow_shrink)
 104                        *size -= page_sz;
 105
 106                if ((mapped_addr != NULL) && addr_is_hint &&
 107                                (mapped_addr != requested_addr)) {
 108                        try++;
 109                        next_baseaddr = RTE_PTR_ADD(next_baseaddr, page_sz);
 110                        if (try <= MAX_MMAP_WITH_DEFINED_ADDR_TRIES) {
 111                                /* hint was not used. Try with another offset */
 112                                eal_mem_free(mapped_addr, map_sz);
 113                                mapped_addr = NULL;
 114                                requested_addr = next_baseaddr;
 115                        }
 116                }
 117        } while ((allow_shrink || addr_is_hint) &&
 118                (mapped_addr == NULL) && (*size > 0));
 119
 120        /* align resulting address - if map failed, we will ignore the value
 121         * anyway, so no need to add additional checks.
 122         */
 123        aligned_addr = no_align ? mapped_addr :
 124                        RTE_PTR_ALIGN(mapped_addr, page_sz);
 125
 126        if (*size == 0) {
 127                RTE_LOG(ERR, EAL, "Cannot get a virtual area of any size: %s\n",
 128                        rte_strerror(rte_errno));
 129                return NULL;
 130        } else if (mapped_addr == NULL) {
 131                RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n",
 132                        rte_strerror(rte_errno));
 133                return NULL;
 134        } else if (requested_addr != NULL && !addr_is_hint &&
 135                        aligned_addr != requested_addr) {
 136                RTE_LOG(ERR, EAL, "Cannot get a virtual area at requested address: %p (got %p)\n",
 137                        requested_addr, aligned_addr);
 138                eal_mem_free(mapped_addr, map_sz);
 139                rte_errno = EADDRNOTAVAIL;
 140                return NULL;
 141        } else if (requested_addr != NULL && addr_is_hint &&
 142                        aligned_addr != requested_addr) {
 143                RTE_LOG(WARNING, EAL, "WARNING! Base virtual address hint (%p != %p) not respected!\n",
 144                        requested_addr, aligned_addr);
 145                RTE_LOG(WARNING, EAL, "   This may cause issues with mapping memory into secondary processes\n");
 146        } else if (next_baseaddr != NULL) {
 147                next_baseaddr = RTE_PTR_ADD(aligned_addr, *size);
 148        }
 149
 150        RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n",
 151                aligned_addr, *size);
 152
 153        if (unmap) {
 154                eal_mem_free(mapped_addr, map_sz);
 155        } else if (!no_align) {
 156                void *map_end, *aligned_end;
 157                size_t before_len, after_len;
 158
 159                /* when we reserve space with alignment, we add alignment to
 160                 * mapping size. On 32-bit, if 1GB alignment was requested, this
 161                 * would waste 1GB of address space, which is a luxury we cannot
 162                 * afford. so, if alignment was performed, check if any unneeded
 163                 * address space can be unmapped back.
 164                 */
 165
 166                map_end = RTE_PTR_ADD(mapped_addr, (size_t)map_sz);
 167                aligned_end = RTE_PTR_ADD(aligned_addr, *size);
 168
 169                /* unmap space before aligned mmap address */
 170                before_len = RTE_PTR_DIFF(aligned_addr, mapped_addr);
 171                if (before_len > 0)
 172                        eal_mem_free(mapped_addr, before_len);
 173
 174                /* unmap space after aligned end mmap address */
 175                after_len = RTE_PTR_DIFF(map_end, aligned_end);
 176                if (after_len > 0)
 177                        eal_mem_free(aligned_end, after_len);
 178        }
 179
 180        if (!unmap) {
 181                /* Exclude these pages from a core dump. */
 182                eal_mem_set_dump(aligned_addr, *size, false);
 183        }
 184
 185        return aligned_addr;
 186}
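
/*
 * Illustrative sketch (editorial addition, not upstream code): a minimal
 * internal caller of eal_get_virtual_area(). The 2 MB page size and the
 * reserve_flags value of 0 are assumptions for illustration; real callers,
 * e.g. eal_memseg_list_alloc() below, pass their own values.
 */
static void * __rte_unused
example_reserve_va(size_t *len)
{
        /* no requested address: EAL picks a base and treats it as a hint;
         * allow the area to shrink in page_sz steps if reservation fails.
         */
        return eal_get_virtual_area(NULL, len, RTE_PGSIZE_2M,
                        EAL_VIRTUAL_AREA_ALLOW_SHRINK, 0);
}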
 187
 188int
 189eal_memseg_list_init_named(struct rte_memseg_list *msl, const char *name,
 190                uint64_t page_sz, int n_segs, int socket_id, bool heap)
 191{
 192        if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
 193                        sizeof(struct rte_memseg))) {
 194                RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
 195                        rte_strerror(rte_errno));
 196                return -1;
 197        }
 198
 199        msl->page_sz = page_sz;
 200        msl->socket_id = socket_id;
 201        msl->base_va = NULL;
 202        msl->heap = heap;
 203
 204        RTE_LOG(DEBUG, EAL,
 205                "Memseg list allocated at socket %i, page size 0x%"PRIx64"kB\n",
 206                socket_id, page_sz >> 10);
 207
 208        return 0;
 209}
 210
 211int
 212eal_memseg_list_init(struct rte_memseg_list *msl, uint64_t page_sz,
 213                int n_segs, int socket_id, int type_msl_idx, bool heap)
 214{
 215        char name[RTE_FBARRAY_NAME_LEN];
 216
 217        snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
 218                 type_msl_idx);
 219
 220        return eal_memseg_list_init_named(
 221                msl, name, page_sz, n_segs, socket_id, heap);
 222}
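
/*
 * Editorial note: for example, with 2 MB pages (page_sz = 0x200000, so
 * page_sz >> 10 == 2048), socket 0 and type_msl_idx 0, MEMSEG_LIST_FMT
 * yields the fbarray name "memseg-2048k-0-0".
 */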
 223
 224int
 225eal_memseg_list_alloc(struct rte_memseg_list *msl, int reserve_flags)
 226{
 227        size_t page_sz, mem_sz;
 228        void *addr;
 229
 230        page_sz = msl->page_sz;
 231        mem_sz = page_sz * msl->memseg_arr.len;
 232
 233        addr = eal_get_virtual_area(
 234                msl->base_va, &mem_sz, page_sz, 0, reserve_flags);
 235        if (addr == NULL) {
 236#ifndef RTE_EXEC_ENV_WINDOWS
 237                /* The hint would be misleading on Windows, because address
 238                 * is by default system-selected (base VA = 0).
 239                 * However, this function is called from many places,
 240                 * including common code, so don't duplicate the message.
 241                 */
 242                if (rte_errno == EADDRNOTAVAIL)
 243                        RTE_LOG(ERR, EAL, "Cannot reserve %llu bytes at [%p] - "
 244                                "please use '--" OPT_BASE_VIRTADDR "' option\n",
 245                                (unsigned long long)mem_sz, msl->base_va);
 246#endif
 247                return -1;
 248        }
 249        msl->base_va = addr;
 250        msl->len = mem_sz;
 251
 252        RTE_LOG(DEBUG, EAL, "VA reserved for memseg list at %p, size %zx\n",
 253                        addr, mem_sz);
 254
 255        return 0;
 256}
 257
 258void
 259eal_memseg_list_populate(struct rte_memseg_list *msl, void *addr, int n_segs)
 260{
 261        size_t page_sz = msl->page_sz;
 262        int i;
 263
 264        for (i = 0; i < n_segs; i++) {
 265                struct rte_fbarray *arr = &msl->memseg_arr;
 266                struct rte_memseg *ms = rte_fbarray_get(arr, i);
 267
 268                if (rte_eal_iova_mode() == RTE_IOVA_VA)
 269                        ms->iova = (uintptr_t)addr;
 270                else
 271                        ms->iova = RTE_BAD_IOVA;
 272                ms->addr = addr;
 273                ms->hugepage_sz = page_sz;
 274                ms->socket_id = 0;
 275                ms->len = page_sz;
 276
 277                rte_fbarray_set_used(arr, i);
 278
 279                addr = RTE_PTR_ADD(addr, page_sz);
 280        }
 281}
 282
 283static struct rte_memseg *
 284virt2memseg(const void *addr, const struct rte_memseg_list *msl)
 285{
 286        const struct rte_fbarray *arr;
 287        void *start, *end;
 288        int ms_idx;
 289
 290        if (msl == NULL)
 291                return NULL;
 292
 293        /* a memseg list was specified, check if it's the right one */
 294        start = msl->base_va;
 295        end = RTE_PTR_ADD(start, msl->len);
 296
 297        if (addr < start || addr >= end)
 298                return NULL;
 299
 300        /* now, calculate index */
 301        arr = &msl->memseg_arr;
 302        ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->page_sz;
 303        return rte_fbarray_get(arr, ms_idx);
 304}
 305
 306static struct rte_memseg_list *
 307virt2memseg_list(const void *addr)
 308{
 309        struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 310        struct rte_memseg_list *msl;
 311        int msl_idx;
 312
 313        for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
 314                void *start, *end;
 315                msl = &mcfg->memsegs[msl_idx];
 316
 317                start = msl->base_va;
 318                end = RTE_PTR_ADD(start, msl->len);
 319                if (addr >= start && addr < end)
 320                        break;
 321        }
 322        /* if we didn't find our memseg list */
 323        if (msl_idx == RTE_MAX_MEMSEG_LISTS)
 324                return NULL;
 325        return msl;
 326}
 327
 328struct rte_memseg_list *
 329rte_mem_virt2memseg_list(const void *addr)
 330{
 331        return virt2memseg_list(addr);
 332}
 333
 334struct virtiova {
 335        rte_iova_t iova;
 336        void *virt;
 337};
 338static int
 339find_virt(const struct rte_memseg_list *msl __rte_unused,
 340                const struct rte_memseg *ms, void *arg)
 341{
 342        struct virtiova *vi = arg;
 343        if (vi->iova >= ms->iova && vi->iova < (ms->iova + ms->len)) {
 344                size_t offset = vi->iova - ms->iova;
 345                vi->virt = RTE_PTR_ADD(ms->addr, offset);
 346                /* stop the walk */
 347                return 1;
 348        }
 349        return 0;
 350}
 351static int
 352find_virt_legacy(const struct rte_memseg_list *msl __rte_unused,
 353                const struct rte_memseg *ms, size_t len, void *arg)
 354{
 355        struct virtiova *vi = arg;
 356        if (vi->iova >= ms->iova && vi->iova < (ms->iova + len)) {
 357                size_t offset = vi->iova - ms->iova;
 358                vi->virt = RTE_PTR_ADD(ms->addr, offset);
 359                /* stop the walk */
 360                return 1;
 361        }
 362        return 0;
 363}
 364
 365void *
 366rte_mem_iova2virt(rte_iova_t iova)
 367{
 368        struct virtiova vi;
 369        const struct internal_config *internal_conf =
 370                eal_get_internal_configuration();
 371
 372        memset(&vi, 0, sizeof(vi));
 373
 374        vi.iova = iova;
 375        /* for legacy mem, we can get away with scanning VA-contiguous segments,
 376         * as we know they are PA-contiguous as well
 377         */
 378        if (internal_conf->legacy_mem)
 379                rte_memseg_contig_walk(find_virt_legacy, &vi);
 380        else
 381                rte_memseg_walk(find_virt, &vi);
 382
 383        return vi.virt;
 384}
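
/*
 * Illustrative sketch (editorial addition, not upstream code): round-trip
 * IOVA translation for an address backed by a DPDK memseg. rte_mem_virt2iova()
 * is assumed to be available from rte_memory.h.
 */
static int __rte_unused
example_iova_roundtrip(const void *addr)
{
        rte_iova_t iova = rte_mem_virt2iova(addr);

        if (iova == RTE_BAD_IOVA)
                return -1;
        /* for segment-backed memory this maps back to the same virtual address */
        return rte_mem_iova2virt(iova) == addr ? 0 : -1;
}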
 385
 386struct rte_memseg *
 387rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
 388{
 389        return virt2memseg(addr, msl != NULL ? msl :
 390                        rte_mem_virt2memseg_list(addr));
 391}
 392
 393static int
 394physmem_size(const struct rte_memseg_list *msl, void *arg)
 395{
 396        uint64_t *total_len = arg;
 397
 398        if (msl->external)
 399                return 0;
 400
 401        *total_len += msl->memseg_arr.count * msl->page_sz;
 402
 403        return 0;
 404}
 405
 406/* get the total size of memory */
 407uint64_t
 408rte_eal_get_physmem_size(void)
 409{
 410        uint64_t total_len = 0;
 411
 412        rte_memseg_list_walk(physmem_size, &total_len);
 413
 414        return total_len;
 415}
 416
 417static int
 418dump_memseg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
 419                void *arg)
 420{
 421        struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 422        int msl_idx, ms_idx, fd;
 423        FILE *f = arg;
 424
 425        msl_idx = msl - mcfg->memsegs;
 426        if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS)
 427                return -1;
 428
 429        ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
 430        if (ms_idx < 0)
 431                return -1;
 432
 433        fd = eal_memalloc_get_seg_fd(msl_idx, ms_idx);
 434        fprintf(f, "Segment %i-%i: IOVA:0x%"PRIx64", len:%zu, "
 435                        "virt:%p, socket_id:%"PRId32", "
 436                        "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
 437                        "nrank:%"PRIx32" fd:%i\n",
 438                        msl_idx, ms_idx,
 439                        ms->iova,
 440                        ms->len,
 441                        ms->addr,
 442                        ms->socket_id,
 443                        ms->hugepage_sz,
 444                        ms->nchannel,
 445                        ms->nrank,
 446                        fd);
 447
 448        return 0;
 449}
 450
 451/*
  452 * Defined here because declared in rte_memory.h; the actual implementation
  453 * is in eal_common_memalloc.c, like all other memalloc internals.
 454 */
 455int
 456rte_mem_event_callback_register(const char *name, rte_mem_event_callback_t clb,
 457                void *arg)
 458{
 459        const struct internal_config *internal_conf =
 460                eal_get_internal_configuration();
 461
 462        /* FreeBSD boots with legacy mem enabled by default */
 463        if (internal_conf->legacy_mem) {
 464                RTE_LOG(DEBUG, EAL, "Registering mem event callbacks not supported\n");
 465                rte_errno = ENOTSUP;
 466                return -1;
 467        }
 468        return eal_memalloc_mem_event_callback_register(name, clb, arg);
 469}
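
/*
 * Illustrative sketch (editorial addition, not upstream code): the shape of a
 * memory event callback, using the rte_mem_event enum from rte_memory.h.
 * Callbacks run while the memory hotplug lock is held, so they should not
 * allocate or free DPDK memory themselves.
 */
static void __rte_unused
example_mem_event_cb(enum rte_mem_event event_type, const void *addr,
                size_t len, void *arg __rte_unused)
{
        if (event_type == RTE_MEM_EVENT_ALLOC)
                RTE_LOG(DEBUG, EAL, "example: %zu bytes mapped at %p\n", len, addr);
        else
                RTE_LOG(DEBUG, EAL, "example: %zu bytes unmapped at %p\n", len, addr);
}
/* would be registered via:
 * rte_mem_event_callback_register("example", example_mem_event_cb, NULL);
 */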
 470
 471int
 472rte_mem_event_callback_unregister(const char *name, void *arg)
 473{
 474        const struct internal_config *internal_conf =
 475                eal_get_internal_configuration();
 476
 477        /* FreeBSD boots with legacy mem enabled by default */
 478        if (internal_conf->legacy_mem) {
  479                RTE_LOG(DEBUG, EAL, "Unregistering mem event callbacks not supported\n");
 480                rte_errno = ENOTSUP;
 481                return -1;
 482        }
 483        return eal_memalloc_mem_event_callback_unregister(name, arg);
 484}
 485
 486int
 487rte_mem_alloc_validator_register(const char *name,
 488                rte_mem_alloc_validator_t clb, int socket_id, size_t limit)
 489{
 490        const struct internal_config *internal_conf =
 491                eal_get_internal_configuration();
 492
 493        /* FreeBSD boots with legacy mem enabled by default */
 494        if (internal_conf->legacy_mem) {
 495                RTE_LOG(DEBUG, EAL, "Registering mem alloc validators not supported\n");
 496                rte_errno = ENOTSUP;
 497                return -1;
 498        }
 499        return eal_memalloc_mem_alloc_validator_register(name, clb, socket_id,
 500                        limit);
 501}
 502
 503int
 504rte_mem_alloc_validator_unregister(const char *name, int socket_id)
 505{
 506        const struct internal_config *internal_conf =
 507                eal_get_internal_configuration();
 508
 509        /* FreeBSD boots with legacy mem enabled by default */
 510        if (internal_conf->legacy_mem) {
  511                RTE_LOG(DEBUG, EAL, "Unregistering mem alloc validators not supported\n");
 512                rte_errno = ENOTSUP;
 513                return -1;
 514        }
 515        return eal_memalloc_mem_alloc_validator_unregister(name, socket_id);
 516}
 517
 518/* Dump the physical memory layout on console */
 519void
 520rte_dump_physmem_layout(FILE *f)
 521{
 522        rte_memseg_walk(dump_memseg, f);
 523}
 524
 525static int
 526check_iova(const struct rte_memseg_list *msl __rte_unused,
 527                const struct rte_memseg *ms, void *arg)
 528{
 529        uint64_t *mask = arg;
 530        rte_iova_t iova;
 531
 532        /* higher address within segment */
 533        iova = (ms->iova + ms->len) - 1;
 534        if (!(iova & *mask))
 535                return 0;
 536
 537        RTE_LOG(DEBUG, EAL, "memseg iova %"PRIx64", len %zx, out of range\n",
 538                            ms->iova, ms->len);
 539
 540        RTE_LOG(DEBUG, EAL, "\tusing dma mask %"PRIx64"\n", *mask);
 541        return 1;
 542}
 543
 544#define MAX_DMA_MASK_BITS 63
 545
 546/* check memseg iovas are within the required range based on dma mask */
 547static int
 548check_dma_mask(uint8_t maskbits, bool thread_unsafe)
 549{
 550        struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 551        uint64_t mask;
 552        int ret;
 553
  554        /* Sanity check: only accept mask widths that can be handled in a
  555         * 64-bit variable; any higher value is almost certainly wrong. */
 556        if (maskbits > MAX_DMA_MASK_BITS) {
 557                RTE_LOG(ERR, EAL, "wrong dma mask size %u (Max: %u)\n",
 558                                   maskbits, MAX_DMA_MASK_BITS);
 559                return -1;
 560        }
 561
 562        /* create dma mask */
 563        mask = ~((1ULL << maskbits) - 1);
 564
 565        if (thread_unsafe)
 566                ret = rte_memseg_walk_thread_unsafe(check_iova, &mask);
 567        else
 568                ret = rte_memseg_walk(check_iova, &mask);
 569
 570        if (ret)
 571                /*
 572                 * Dma mask precludes hugepage usage.
 573                 * This device can not be used and we do not need to keep
 574                 * the dma mask.
 575                 */
 576                return 1;
 577
 578        /*
 579         * we need to keep the more restricted maskbit for checking
 580         * potential dynamic memory allocation in the future.
 581         */
 582        mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
 583                             RTE_MIN(mcfg->dma_maskbits, maskbits);
 584
 585        return 0;
 586}
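
/*
 * Editorial worked example: for maskbits == 48, mask == ~((1ULL << 48) - 1)
 * == 0xffff000000000000, so check_iova() reports any segment whose last byte
 * has an IOVA with bits 48..63 set, i.e. outside the 48-bit DMA range.
 * A minimal caller sketch (not upstream code):
 */
static int __rte_unused
example_check_48bit_dma(void)
{
        /* 0 if all hugepage IOVAs fit in 48 bits, 1 if any do not, -1 on error */
        return rte_mem_check_dma_mask(48);
}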
 587
 588int
 589rte_mem_check_dma_mask(uint8_t maskbits)
 590{
 591        return check_dma_mask(maskbits, false);
 592}
 593
 594int
 595rte_mem_check_dma_mask_thread_unsafe(uint8_t maskbits)
 596{
 597        return check_dma_mask(maskbits, true);
 598}
 599
 600/*
 601 * Set dma mask to use when memory initialization is done.
 602 *
 603 * This function should ONLY be used by code executed before the memory
  604 * initialization. PMDs should use rte_mem_check_dma_mask if the device has
  605 * addressing limitations.
 606 */
 607void
 608rte_mem_set_dma_mask(uint8_t maskbits)
 609{
 610        struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 611
 612        mcfg->dma_maskbits = mcfg->dma_maskbits == 0 ? maskbits :
 613                             RTE_MIN(mcfg->dma_maskbits, maskbits);
 614}
 615
 616/* return the number of memory channels */
 617unsigned rte_memory_get_nchannel(void)
 618{
 619        return rte_eal_get_configuration()->mem_config->nchannel;
 620}
 621
  622/* return the number of memory ranks */
 623unsigned rte_memory_get_nrank(void)
 624{
 625        return rte_eal_get_configuration()->mem_config->nrank;
 626}
 627
 628static int
 629rte_eal_memdevice_init(void)
 630{
 631        struct rte_config *config;
 632        const struct internal_config *internal_conf;
 633
 634        if (rte_eal_process_type() == RTE_PROC_SECONDARY)
 635                return 0;
 636
 637        internal_conf = eal_get_internal_configuration();
 638        config = rte_eal_get_configuration();
 639        config->mem_config->nchannel = internal_conf->force_nchannel;
 640        config->mem_config->nrank = internal_conf->force_nrank;
 641
 642        return 0;
 643}
 644
  645/* Lock the page in physical memory and prevent it from being swapped. */
 646int
 647rte_mem_lock_page(const void *virt)
 648{
 649        uintptr_t virtual = (uintptr_t)virt;
 650        size_t page_size = rte_mem_page_size();
 651        uintptr_t aligned = RTE_PTR_ALIGN_FLOOR(virtual, page_size);
 652        return rte_mem_lock((void *)aligned, page_size);
 653}
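
/*
 * Editorial worked example: with a 4 KiB system page size, virt == 0x7f001234
 * is rounded down to 0x7f001000 and that entire page is locked, so the byte
 * at virt is always covered by the lock.
 */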
 654
 655int
 656rte_memseg_contig_walk_thread_unsafe(rte_memseg_contig_walk_t func, void *arg)
 657{
 658        struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 659        int i, ms_idx, ret = 0;
 660
 661        for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
 662                struct rte_memseg_list *msl = &mcfg->memsegs[i];
 663                const struct rte_memseg *ms;
 664                struct rte_fbarray *arr;
 665
 666                if (msl->memseg_arr.count == 0)
 667                        continue;
 668
 669                arr = &msl->memseg_arr;
 670
 671                ms_idx = rte_fbarray_find_next_used(arr, 0);
 672                while (ms_idx >= 0) {
 673                        int n_segs;
 674                        size_t len;
 675
 676                        ms = rte_fbarray_get(arr, ms_idx);
 677
 678                        /* find how many more segments there are, starting with
 679                         * this one.
 680                         */
 681                        n_segs = rte_fbarray_find_contig_used(arr, ms_idx);
 682                        len = n_segs * msl->page_sz;
 683
 684                        ret = func(msl, ms, len, arg);
 685                        if (ret)
 686                                return ret;
 687                        ms_idx = rte_fbarray_find_next_used(arr,
 688                                        ms_idx + n_segs);
 689                }
 690        }
 691        return 0;
 692}
 693
 694int
 695rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
 696{
 697        int ret = 0;
 698
 699        /* do not allow allocations/frees/init while we iterate */
 700        rte_mcfg_mem_read_lock();
 701        ret = rte_memseg_contig_walk_thread_unsafe(func, arg);
 702        rte_mcfg_mem_read_unlock();
 703
 704        return ret;
 705}
 706
 707int
 708rte_memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg)
 709{
 710        struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 711        int i, ms_idx, ret = 0;
 712
 713        for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
 714                struct rte_memseg_list *msl = &mcfg->memsegs[i];
 715                const struct rte_memseg *ms;
 716                struct rte_fbarray *arr;
 717
 718                if (msl->memseg_arr.count == 0)
 719                        continue;
 720
 721                arr = &msl->memseg_arr;
 722
 723                ms_idx = rte_fbarray_find_next_used(arr, 0);
 724                while (ms_idx >= 0) {
 725                        ms = rte_fbarray_get(arr, ms_idx);
 726                        ret = func(msl, ms, arg);
 727                        if (ret)
 728                                return ret;
 729                        ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
 730                }
 731        }
 732        return 0;
 733}
 734
 735int
 736rte_memseg_walk(rte_memseg_walk_t func, void *arg)
 737{
 738        int ret = 0;
 739
 740        /* do not allow allocations/frees/init while we iterate */
 741        rte_mcfg_mem_read_lock();
 742        ret = rte_memseg_walk_thread_unsafe(func, arg);
 743        rte_mcfg_mem_read_unlock();
 744
 745        return ret;
 746}
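
/*
 * Illustrative sketch (editorial addition, not upstream code): counting all
 * in-use memsegs with the locked walk API. The callback matches
 * rte_memseg_walk_t; returning 0 continues the walk, non-zero stops it.
 */
static int __rte_unused
example_count_seg(const struct rte_memseg_list *msl __rte_unused,
                const struct rte_memseg *ms __rte_unused, void *arg)
{
        (*(unsigned int *)arg)++;
        return 0;
}

static unsigned int __rte_unused
example_count_memsegs(void)
{
        unsigned int n = 0;

        rte_memseg_walk(example_count_seg, &n);
        return n;
}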
 747
 748int
 749rte_memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg)
 750{
 751        struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 752        int i, ret = 0;
 753
 754        for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
 755                struct rte_memseg_list *msl = &mcfg->memsegs[i];
 756
 757                if (msl->base_va == NULL)
 758                        continue;
 759
 760                ret = func(msl, arg);
 761                if (ret)
 762                        return ret;
 763        }
 764        return 0;
 765}
 766
 767int
 768rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
 769{
 770        int ret = 0;
 771
 772        /* do not allow allocations/frees/init while we iterate */
 773        rte_mcfg_mem_read_lock();
 774        ret = rte_memseg_list_walk_thread_unsafe(func, arg);
 775        rte_mcfg_mem_read_unlock();
 776
 777        return ret;
 778}
 779
 780int
 781rte_memseg_get_fd_thread_unsafe(const struct rte_memseg *ms)
 782{
 783        struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 784        struct rte_memseg_list *msl;
 785        struct rte_fbarray *arr;
 786        int msl_idx, seg_idx, ret;
 787
 788        if (ms == NULL) {
 789                rte_errno = EINVAL;
 790                return -1;
 791        }
 792
 793        msl = rte_mem_virt2memseg_list(ms->addr);
 794        if (msl == NULL) {
 795                rte_errno = EINVAL;
 796                return -1;
 797        }
 798        arr = &msl->memseg_arr;
 799
 800        msl_idx = msl - mcfg->memsegs;
 801        seg_idx = rte_fbarray_find_idx(arr, ms);
 802
 803        if (!rte_fbarray_is_used(arr, seg_idx)) {
 804                rte_errno = ENOENT;
 805                return -1;
 806        }
 807
 808        /* segment fd API is not supported for external segments */
 809        if (msl->external) {
 810                rte_errno = ENOTSUP;
 811                return -1;
 812        }
 813
 814        ret = eal_memalloc_get_seg_fd(msl_idx, seg_idx);
 815        if (ret < 0) {
 816                rte_errno = -ret;
 817                ret = -1;
 818        }
 819        return ret;
 820}
 821
 822int
 823rte_memseg_get_fd(const struct rte_memseg *ms)
 824{
 825        int ret;
 826
 827        rte_mcfg_mem_read_lock();
 828        ret = rte_memseg_get_fd_thread_unsafe(ms);
 829        rte_mcfg_mem_read_unlock();
 830
 831        return ret;
 832}
 833
 834int
 835rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms,
 836                size_t *offset)
 837{
 838        struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 839        struct rte_memseg_list *msl;
 840        struct rte_fbarray *arr;
 841        int msl_idx, seg_idx, ret;
 842
 843        if (ms == NULL || offset == NULL) {
 844                rte_errno = EINVAL;
 845                return -1;
 846        }
 847
 848        msl = rte_mem_virt2memseg_list(ms->addr);
 849        if (msl == NULL) {
 850                rte_errno = EINVAL;
 851                return -1;
 852        }
 853        arr = &msl->memseg_arr;
 854
 855        msl_idx = msl - mcfg->memsegs;
 856        seg_idx = rte_fbarray_find_idx(arr, ms);
 857
 858        if (!rte_fbarray_is_used(arr, seg_idx)) {
 859                rte_errno = ENOENT;
 860                return -1;
 861        }
 862
 863        /* segment fd API is not supported for external segments */
 864        if (msl->external) {
 865                rte_errno = ENOTSUP;
 866                return -1;
 867        }
 868
 869        ret = eal_memalloc_get_seg_fd_offset(msl_idx, seg_idx, offset);
 870        if (ret < 0) {
 871                rte_errno = -ret;
 872                ret = -1;
 873        }
 874        return ret;
 875}
 876
 877int
 878rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset)
 879{
 880        int ret;
 881
 882        rte_mcfg_mem_read_lock();
 883        ret = rte_memseg_get_fd_offset_thread_unsafe(ms, offset);
 884        rte_mcfg_mem_read_unlock();
 885
 886        return ret;
 887}
 888
 889int
 890rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
 891                unsigned int n_pages, size_t page_sz)
 892{
 893        struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 894        unsigned int socket_id, n;
 895        int ret = 0;
 896
 897        if (va_addr == NULL || page_sz == 0 || len == 0 ||
 898                        !rte_is_power_of_2(page_sz) ||
 899                        RTE_ALIGN(len, page_sz) != len ||
 900                        ((len / page_sz) != n_pages && iova_addrs != NULL) ||
 901                        !rte_is_aligned(va_addr, page_sz)) {
 902                rte_errno = EINVAL;
 903                return -1;
 904        }
 905        rte_mcfg_mem_write_lock();
 906
 907        /* make sure the segment doesn't already exist */
 908        if (malloc_heap_find_external_seg(va_addr, len) != NULL) {
 909                rte_errno = EEXIST;
 910                ret = -1;
 911                goto unlock;
 912        }
 913
 914        /* get next available socket ID */
 915        socket_id = mcfg->next_socket_id;
 916        if (socket_id > INT32_MAX) {
  917                RTE_LOG(ERR, EAL, "Cannot assign new socket IDs\n");
 918                rte_errno = ENOSPC;
 919                ret = -1;
 920                goto unlock;
 921        }
 922
 923        /* we can create a new memseg */
 924        n = len / page_sz;
 925        if (malloc_heap_create_external_seg(va_addr, iova_addrs, n,
 926                        page_sz, "extmem", socket_id) == NULL) {
 927                ret = -1;
 928                goto unlock;
 929        }
 930
 931        /* memseg list successfully created - increment next socket ID */
 932        mcfg->next_socket_id++;
 933unlock:
 934        rte_mcfg_mem_write_unlock();
 935        return ret;
 936}
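
/*
 * Illustrative sketch (editorial addition, not upstream code): registering a
 * hypothetical externally allocated, page-aligned buffer so DPDK can track it.
 * Passing a NULL IOVA table leaves the segment IOVAs as RTE_BAD_IOVA.
 */
static int __rte_unused
example_register_extmem(void *buf, size_t len, size_t page_sz)
{
        /* len must be a multiple of page_sz and buf aligned to page_sz */
        if (rte_extmem_register(buf, len, NULL, 0, page_sz) != 0)
                return -1;
        /* ... use the memory, e.g. add it to a malloc heap ... */
        return rte_extmem_unregister(buf, len);
}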
 937
 938int
 939rte_extmem_unregister(void *va_addr, size_t len)
 940{
 941        struct rte_memseg_list *msl;
 942        int ret = 0;
 943
 944        if (va_addr == NULL || len == 0) {
 945                rte_errno = EINVAL;
 946                return -1;
 947        }
 948        rte_mcfg_mem_write_lock();
 949
 950        /* find our segment */
 951        msl = malloc_heap_find_external_seg(va_addr, len);
 952        if (msl == NULL) {
 953                rte_errno = ENOENT;
 954                ret = -1;
 955                goto unlock;
 956        }
 957
 958        ret = malloc_heap_destroy_external_seg(msl);
 959unlock:
 960        rte_mcfg_mem_write_unlock();
 961        return ret;
 962}
 963
 964static int
 965sync_memory(void *va_addr, size_t len, bool attach)
 966{
 967        struct rte_memseg_list *msl;
 968        int ret = 0;
 969
 970        if (va_addr == NULL || len == 0) {
 971                rte_errno = EINVAL;
 972                return -1;
 973        }
 974        rte_mcfg_mem_write_lock();
 975
 976        /* find our segment */
 977        msl = malloc_heap_find_external_seg(va_addr, len);
 978        if (msl == NULL) {
 979                rte_errno = ENOENT;
 980                ret = -1;
 981                goto unlock;
 982        }
 983        if (attach)
 984                ret = rte_fbarray_attach(&msl->memseg_arr);
 985        else
 986                ret = rte_fbarray_detach(&msl->memseg_arr);
 987
 988unlock:
 989        rte_mcfg_mem_write_unlock();
 990        return ret;
 991}
 992
 993int
 994rte_extmem_attach(void *va_addr, size_t len)
 995{
 996        return sync_memory(va_addr, len, true);
 997}
 998
 999int
1000rte_extmem_detach(void *va_addr, size_t len)
1001{
1002        return sync_memory(va_addr, len, false);
1003}
1004
1005/* init memory subsystem */
1006int
1007rte_eal_memory_init(void)
1008{
1009        struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1010        const struct internal_config *internal_conf =
1011                eal_get_internal_configuration();
1012
1013        int retval;
1014        RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");
1015
1016        if (!mcfg)
1017                return -1;
1018
1019        /* lock mem hotplug here, to prevent races while we init */
1020        rte_mcfg_mem_read_lock();
1021
1022        if (rte_eal_memseg_init() < 0)
1023                goto fail;
1024
1025        if (eal_memalloc_init() < 0)
1026                goto fail;
1027
1028        retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
1029                        rte_eal_hugepage_init() :
1030                        rte_eal_hugepage_attach();
1031        if (retval < 0)
1032                goto fail;
1033
1034        if (internal_conf->no_shconf == 0 && rte_eal_memdevice_init() < 0)
1035                goto fail;
1036
1037        return 0;
1038fail:
1039        rte_mcfg_mem_read_unlock();
1040        return -1;
1041}
1042