linux/net/core/page_pool.c
/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *      Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *      Copyright (C) 2016 Red Hat, Inc.
 */
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>

#include <net/page_pool.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */

static int page_pool_init(struct page_pool *pool,
                          const struct page_pool_params *params)
{
        unsigned int ring_qsize = 1024; /* Default */

        memcpy(&pool->p, params, sizeof(pool->p));

        /* Validate only known flags were used */
        if (pool->p.flags & ~(PP_FLAG_ALL))
                return -EINVAL;

        if (pool->p.pool_size)
                ring_qsize = pool->p.pool_size;

        /* Sanity limit on the memory that can be pinned down */
        if (ring_qsize > 32768)
                return -E2BIG;

        /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
         * DMA_BIDIRECTIONAL allows the page to also be used for DMA
         * transmit, which is the XDP_TX use-case.
         */
        if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
            (pool->p.dma_dir != DMA_BIDIRECTIONAL))
                return -EINVAL;

        if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
                return -ENOMEM;

        return 0;
}

struct page_pool *page_pool_create(const struct page_pool_params *params)
{
        struct page_pool *pool;
        int err = 0;

        pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
        if (!pool)
                return ERR_PTR(-ENOMEM);

        err = page_pool_init(pool, params);
        if (err < 0) {
                pr_warn("%s() gave up with errno %d\n", __func__, err);
                kfree(pool);
                return ERR_PTR(err);
        }
        return pool;
}
EXPORT_SYMBOL(page_pool_create);
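
/* Usage sketch (not part of the original file): a driver typically creates
 * one pool per RX queue by filling in a struct page_pool_params and
 * checking the ERR_PTR-encoded return value.  The values below are
 * illustrative only; "netdev" and "rxq_size" are hypothetical driver
 * variables:
 *
 *      struct page_pool_params pp_params = {
 *              .flags     = PP_FLAG_DMA_MAP,
 *              .order     = 0,
 *              .pool_size = rxq_size,
 *              .nid       = NUMA_NO_NODE,
 *              .dev       = netdev->dev.parent,
 *              .dma_dir   = DMA_FROM_DEVICE,
 *      };
 *      struct page_pool *pp = page_pool_create(&pp_params);
 *
 *      if (IS_ERR(pp))
 *              return PTR_ERR(pp);
 */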

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
        struct ptr_ring *r = &pool->ring;
        struct page *page;

        /* Quicker fallback, avoid locks when ring is empty */
        if (__ptr_ring_empty(r))
                return NULL;

        /* Test for safe-context, caller should provide this guarantee */
        if (likely(in_serving_softirq())) {
                if (likely(pool->alloc.count)) {
                        /* Fast-path */
                        page = pool->alloc.cache[--pool->alloc.count];
                        return page;
                }
                /* Slower-path: alloc array empty, time to refill.
                 *
                 * Open-coded bulk ptr_ring consumer.
                 *
                 * Discussion: the ring consumer lock is not really
                 * needed due to the softirq/NAPI protection, but we
                 * will later need the ability to reclaim pages on the
                 * ring.  Thus, keep the locks.
                 */
                spin_lock(&r->consumer_lock);
                while ((page = __ptr_ring_consume(r))) {
                        if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
                                break;
                        pool->alloc.cache[pool->alloc.count++] = page;
                }
                spin_unlock(&r->consumer_lock);
                return page;
        }

        /* Slow-path: get a page from the locked ring queue */
        page = ptr_ring_consume(&pool->ring);
        return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
                                                 gfp_t _gfp)
{
        struct page *page;
        gfp_t gfp = _gfp;
        dma_addr_t dma;

        /* We could always set __GFP_COMP, and avoid this branch, as
         * prep_new_page() can handle order-0 with __GFP_COMP.
         */
        if (pool->p.order)
                gfp |= __GFP_COMP;

        /* FUTURE development:
         *
         * The current slow-path essentially falls back to single-page
         * allocations, which doesn't improve performance.  This code
         * needs bulk allocation support from the page allocator.
         */

        /* Cache was empty, do real allocation */
        page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
        if (!page)
                return NULL;

        if (!(pool->p.flags & PP_FLAG_DMA_MAP))
                goto skip_dma_map;

        /* Setup DMA mapping: use the 'struct page' area for storing the
         * DMA address, since dma_addr_t can be either 32 or 64 bits and
         * does not always fit into page private data (e.g. a 32-bit CPU
         * with 64-bit DMA caps).
         * This mapping is kept for the lifetime of the page, until it
         * leaves the pool.
         */
        dma = dma_map_page_attrs(pool->p.dev, page, 0,
                                 (PAGE_SIZE << pool->p.order),
                                 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
        if (dma_mapping_error(pool->p.dev, dma)) {
                put_page(page);
                return NULL;
        }
        page->dma_addr = dma;

skip_dma_map:
        /* When a page is just alloc'ed, it should/must have refcnt 1. */
        return page;
}
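
/* Usage sketch (not part of the original file): with PP_FLAG_DMA_MAP set,
 * the DMA address stashed in page->dma_addr above can be written straight
 * into an RX descriptor.  Because the mapping used DMA_ATTR_SKIP_CPU_SYNC,
 * the driver stays responsible for CPU syncs before reading received data.
 * "rx_desc", "headroom" and "frame_len" are hypothetical driver-side names:
 *
 *      rx_desc->addr = page->dma_addr + headroom;
 *      ...
 *      dma_sync_single_for_cpu(pool->p.dev, page->dma_addr + headroom,
 *                              frame_len, pool->p.dma_dir);
 */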

/* For using page_pool to replace alloc_pages() API calls, but providing a
 * synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
        struct page *page;

        /* Fast-path: get a page from the cache */
        page = __page_pool_get_cached(pool);
        if (page)
                return page;

        /* Slow-path: cache empty, do real allocation */
        page = __page_pool_alloc_pages_slow(pool, gfp);
        return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
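
/* Usage sketch (not part of the original file): a typical caller is an RX
 * ring refill loop running from NAPI (softirq) context, which therefore
 * requests atomic allocations.  "pool", "rxq" and "rxq_refill_one" are
 * hypothetical driver-side names:
 *
 *      while (rxq->fill_level < rxq->fill_target) {
 *              struct page *page;
 *
 *              page = page_pool_alloc_pages(pool, GFP_ATOMIC);
 *              if (!page)
 *                      break;
 *              rxq_refill_one(rxq, page);
 *      }
 */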

/* Cleanup page_pool state from page */
static void __page_pool_clean_page(struct page_pool *pool,
                                   struct page *page)
{
        dma_addr_t dma;

        if (!(pool->p.flags & PP_FLAG_DMA_MAP))
                return;

        dma = page->dma_addr;
        /* DMA unmap */
        dma_unmap_page_attrs(pool->p.dev, dma,
                             PAGE_SIZE << pool->p.order, pool->p.dma_dir,
                             DMA_ATTR_SKIP_CPU_SYNC);
        page->dma_addr = 0;
}

/* Return a page to the page allocator, cleaning up our state */
static void __page_pool_return_page(struct page_pool *pool, struct page *page)
{
        __page_pool_clean_page(pool, page);
        put_page(page);
        /* An optimization would be to call __free_pages(page, pool->p.order)
         * knowing page is not part of page-cache (thus avoiding a
         * __page_cache_release() call).
         */
}

static bool __page_pool_recycle_into_ring(struct page_pool *pool,
                                          struct page *page)
{
        int ret;

        /* BH protection not needed if current is serving softirq */
        if (in_serving_softirq())
                ret = ptr_ring_produce(&pool->ring, page);
        else
                ret = ptr_ring_produce_bh(&pool->ring, page);

        return (ret == 0) ? true : false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool __page_pool_recycle_direct(struct page *page,
                                       struct page_pool *pool)
{
        if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
                return false;

        /* Caller MUST have verified/know (page_ref_count(page) == 1) */
        pool->alloc.cache[pool->alloc.count++] = page;
        return true;
}

void __page_pool_put_page(struct page_pool *pool,
                          struct page *page, bool allow_direct)
{
        /* This allocator is optimized for the XDP mode that uses
         * one frame per page, but has fallbacks that act like the
         * regular page allocator APIs.
         *
         * refcnt == 1 means page_pool owns the page, and can recycle it.
         */
        if (likely(page_ref_count(page) == 1)) {
                /* Read barrier done in page_ref_count / READ_ONCE */

                if (allow_direct && in_serving_softirq())
                        if (__page_pool_recycle_direct(page, pool))
                                return;

                if (!__page_pool_recycle_into_ring(pool, page)) {
                        /* Cache full, fallback to freeing pages */
                        __page_pool_return_page(pool, page);
                }
                return;
        }
        /* Fallback/non-XDP mode: the API user has an elevated refcnt.
         *
         * Many drivers split up the page into fragments, and some
         * want to keep doing this to save memory and do refcnt-based
         * recycling.  Support this use case too, to ease drivers
         * switching between XDP/non-XDP.
         *
         * In case page_pool maintains the DMA mapping, the API user
         * must still call page_pool_put_page() once.  In this elevated
         * refcnt case, the DMA mapping is unmapped/released here, as the
         * driver is likely doing refcnt-based recycle tricks, meaning
         * another process will be invoking put_page().
         */
        __page_pool_clean_page(pool, page);
        put_page(page);
}
EXPORT_SYMBOL(__page_pool_put_page);
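
/* Usage sketch (not part of the original file): from the RX-NAPI softirq
 * path, e.g. when an XDP program returns XDP_DROP, a driver that knows it
 * holds the only reference (page_ref_count(page) == 1) can ask for direct
 * recycling into the alloc-side cache.  "act" is a hypothetical variable
 * holding the XDP verdict:
 *
 *      switch (act) {
 *      case XDP_DROP:
 *              __page_pool_put_page(pool, page, true);
 *              break;
 *      ...
 *      }
 *
 * Outside softirq context, or when in doubt, pass allow_direct=false so
 * the page goes through the BH-protected ptr_ring instead.
 */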

static void __page_pool_empty_ring(struct page_pool *pool)
{
        struct page *page;

        /* Empty recycle ring */
        while ((page = ptr_ring_consume_bh(&pool->ring))) {
                /* Verify the refcnt invariant of cached pages */
                if (!(page_ref_count(page) == 1))
                        pr_crit("%s() page_pool refcnt %d violation\n",
                                __func__, page_ref_count(page));

                __page_pool_return_page(pool, page);
        }
}

static void __page_pool_destroy_rcu(struct rcu_head *rcu)
{
        struct page_pool *pool;

        pool = container_of(rcu, struct page_pool, rcu);

        WARN(pool->alloc.count, "API usage violation");

        __page_pool_empty_ring(pool);
        ptr_ring_cleanup(&pool->ring, NULL);
        kfree(pool);
}

/* Cleanup and release resources */
void page_pool_destroy(struct page_pool *pool)
{
        struct page *page;

        /* Empty the alloc cache; assume the caller made sure it is
         * no longer in use, and that page_pool_alloc_pages() cannot be
         * called concurrently.
         */
        while (pool->alloc.count) {
                page = pool->alloc.cache[--pool->alloc.count];
                __page_pool_return_page(pool, page);
        }

        /* No more consumers should exist, but producers could still
         * be in-flight.
         */
        __page_pool_empty_ring(pool);

        /* An xdp_mem_allocator can still reference the page_pool pointer */
        call_rcu(&pool->rcu, __page_pool_destroy_rcu);
}
EXPORT_SYMBOL(page_pool_destroy);
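
/* Usage sketch (not part of the original file): on teardown, a driver must
 * first ensure nothing can still allocate from or recycle into the pool
 * (e.g. NAPI is disabled and the RX ring is drained), then destroy it.
 * "priv" is a hypothetical driver-private structure:
 *
 *      napi_disable(&priv->napi);
 *      ... return all pages still held in the RX ring to the pool ...
 *      page_pool_destroy(priv->page_pool);
 *      priv->page_pool = NULL;
 *
 * Pages that producers return to the ptr_ring after this point, but before
 * the RCU callback runs, are freed by __page_pool_empty_ring() in
 * __page_pool_destroy_rcu() above.
 */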