/* linux/net/xdp/xsk.c */
   1// SPDX-License-Identifier: GPL-2.0
   2/* XDP sockets
   3 *
   4 * AF_XDP sockets allows a channel between XDP programs and userspace
   5 * applications.
   6 * Copyright(c) 2018 Intel Corporation.
   7 *
   8 * Author(s): Björn Töpel <bjorn.topel@intel.com>
   9 *            Magnus Karlsson <magnus.karlsson@intel.com>
  10 */
  11
  12#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
  13
  14#include <linux/if_xdp.h>
  15#include <linux/init.h>
  16#include <linux/sched/mm.h>
  17#include <linux/sched/signal.h>
  18#include <linux/sched/task.h>
  19#include <linux/socket.h>
  20#include <linux/file.h>
  21#include <linux/uaccess.h>
  22#include <linux/net.h>
  23#include <linux/netdevice.h>
  24#include <linux/rculist.h>
  25#include <net/xdp_sock_drv.h>
  26#include <net/busy_poll.h>
  27#include <net/xdp.h>
  28
  29#include "xsk_queue.h"
  30#include "xdp_umem.h"
  31#include "xsk.h"
  32
  33#define TX_BATCH_SIZE 32
  34
  35static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
  36
/* Tell user space it must explicitly wake up the kernel (via poll/sendto)
 * to get Rx descriptors processed. Sets XDP_RING_NEED_WAKEUP on the fill
 * queue ring; pool->cached_need_wakeup mirrors the flag so the shared ring
 * is not written again when the flag is already set.
 */
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
		return;

	pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
	pool->cached_need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
  46
/* Tx counterpart of xsk_set_rx_need_wakeup(): set XDP_RING_NEED_WAKEUP on
 * the Tx ring of every socket sharing this pool. The sockets are walked
 * under RCU as xsk_tx_list may be modified concurrently. The cached flag
 * short-circuits repeated calls.
 */
void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	pool->cached_need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
  63
/* Clear the Rx wakeup request: drop XDP_RING_NEED_WAKEUP from the fill
 * queue ring and the cached copy. No-op if the flag was not set.
 */
void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
{
	if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
		return;

	pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
  73
/* Clear the Tx wakeup request on every socket sharing this pool (RCU walk
 * of xsk_tx_list), then drop the cached flag. No-op if the flag was not
 * set.
 */
void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
		return;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
	}
	rcu_read_unlock();

	pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
  90
/* Report whether the sockets bound to this pool opted in to the
 * need_wakeup protocol (XDP_USE_NEED_WAKEUP at bind time).
 */
bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
{
	return pool->uses_need_wakeup;
}
EXPORT_SYMBOL(xsk_uses_need_wakeup);
  96
  97struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
  98                                            u16 queue_id)
  99{
 100        if (queue_id < dev->real_num_rx_queues)
 101                return dev->_rx[queue_id].pool;
 102        if (queue_id < dev->real_num_tx_queues)
 103                return dev->_tx[queue_id].pool;
 104
 105        return NULL;
 106}
 107EXPORT_SYMBOL(xsk_get_pool_from_qid);
 108
/* Remove the pool registration for @queue_id. Both the Rx and Tx slots
 * are cleared because xsk_reg_pool_at_qid() may have stored the pool in
 * either or both. Note the bound checks use num_*_queues (the allocated
 * maximum), not real_num_*_queues, so a registration made before the
 * real counts shrank is still cleaned up.
 */
void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
{
	if (queue_id < dev->num_rx_queues)
		dev->_rx[queue_id].pool = NULL;
	if (queue_id < dev->num_tx_queues)
		dev->_tx[queue_id].pool = NULL;
}
 116
 117/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
 118 * not know if the device has more tx queues than rx, or the opposite.
 119 * This might also change during run time.
 120 */
 121int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
 122                        u16 queue_id)
 123{
 124        if (queue_id >= max_t(unsigned int,
 125                              dev->real_num_rx_queues,
 126                              dev->real_num_tx_queues))
 127                return -EINVAL;
 128
 129        if (queue_id < dev->real_num_rx_queues)
 130                dev->_rx[queue_id].pool = pool;
 131        if (queue_id < dev->real_num_tx_queues)
 132                dev->_tx[queue_id].pool = pool;
 133
 134        return 0;
 135}
 136
/* Return an xsk buffer to its pool's free list by pushing it onto the
 * free_heads stack.
 */
void xp_release(struct xdp_buff_xsk *xskb)
{
	xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb;
}
 141
 142static u64 xp_get_handle(struct xdp_buff_xsk *xskb)
 143{
 144        u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start;
 145
 146        offset += xskb->pool->headroom;
 147        if (!xskb->pool->unaligned)
 148                return xskb->orig_addr + offset;
 149        return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT);
 150}
 151
/* Hand a pool-owned buffer to user space: publish its (addr, len) on the
 * Rx ring and release the buffer back to the pool. On a full Rx ring the
 * buffer is kept, rx_queue_full is bumped and the error is propagated to
 * the caller.
 */
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
	u64 addr;
	int err;

	addr = xp_get_handle(xskb);
	err = xskq_prod_reserve_desc(xs->rx, addr, len);
	if (err) {
		xs->rx_queue_full++;
		return err;
	}

	xp_release(xskb);
	return 0;
}
 168
 169static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len)
 170{
 171        void *from_buf, *to_buf;
 172        u32 metalen;
 173
 174        if (unlikely(xdp_data_meta_unsupported(from))) {
 175                from_buf = from->data;
 176                to_buf = to->data;
 177                metalen = 0;
 178        } else {
 179                from_buf = from->data_meta;
 180                metalen = from->data - from->data_meta;
 181                to_buf = to->data - metalen;
 182        }
 183
 184        memcpy(to_buf, from_buf, len + metalen);
 185}
 186
/* Copy-mode receive: allocate a buffer from the socket's pool, copy the
 * packet (and metadata) into it, and publish it on the Rx ring via
 * __xsk_rcv_zc(). Packets larger than the pool's Rx frame size, or
 * arriving when the pool is out of buffers, are counted as rx_dropped
 * and rejected with -ENOSPC.
 */
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	struct xdp_buff *xsk_xdp;
	int err;
	u32 len;

	len = xdp->data_end - xdp->data;
	if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	xsk_xdp = xsk_buff_alloc(xs->pool);
	if (!xsk_xdp) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	xsk_copy_xdp(xsk_xdp, xdp, len);
	err = __xsk_rcv_zc(xs, xsk_xdp, len);
	if (err) {
		/* Rx ring full: give the freshly allocated copy back. */
		xsk_buff_free(xsk_xdp);
		return err;
	}
	return 0;
}
 213
 214static bool xsk_tx_writeable(struct xdp_sock *xs)
 215{
 216        if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
 217                return false;
 218
 219        return true;
 220}
 221
/* Check whether the socket has completed bind(). The read barrier pairs
 * with the smp_wmb() before WRITE_ONCE(xs->state, XSK_BOUND) in
 * xsk_bind(), guaranteeing that once XSK_BOUND is observed, all bind-time
 * state (dev, pool, queues, ...) is visible too.
 */
static bool xsk_is_bound(struct xdp_sock *xs)
{
	if (READ_ONCE(xs->state) == XSK_BOUND) {
		/* Matches smp_wmb() in bind(). */
		smp_rmb();
		return true;
	}
	return false;
}
 231
/* Validate a packet against the socket before receive: the socket must be
 * bound, and the packet must have arrived on the exact device/queue the
 * socket is bound to. Also records the NAPI id once for busy polling.
 */
static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	if (!xsk_is_bound(xs))
		return -EINVAL;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	sk_mark_napi_id_once_xdp(&xs->sk, xdp);
	return 0;
}
 243
/* Make queued Rx descriptors visible to user space, release consumed fill
 * queue entries, and wake any reader blocked on the socket.
 */
static void xsk_flush(struct xdp_sock *xs)
{
	xskq_prod_submit(xs->rx);
	__xskq_cons_release(xs->pool->fq);
	sock_def_readable(&xs->sk);
}
 250
/* Receive entry point for the generic (copy-mode, non-XDP-offload) path.
 * Serialized with rx_lock since it can be called from multiple contexts;
 * flushes immediately instead of batching like the map-redirect path.
 */
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;

	spin_lock_bh(&xs->rx_lock);
	err = xsk_rcv_check(xs, xdp);
	if (!err) {
		err = __xsk_rcv(xs, xdp);
		xsk_flush(xs);
	}
	spin_unlock_bh(&xs->rx_lock);
	return err;
}
 264
/* Receive one packet for the XDP_REDIRECT path. Zero-copy buffers
 * (MEM_TYPE_XSK_BUFF_POOL) are handed to user space directly; anything
 * else goes through the copy path, and the original buffer is returned
 * to its allocator only after a successful copy.
 */
static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	int err;
	u32 len;

	err = xsk_rcv_check(xs, xdp);
	if (err)
		return err;

	if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
		len = xdp->data_end - xdp->data;
		return __xsk_rcv_zc(xs, xdp, len);
	}

	err = __xsk_rcv(xs, xdp);
	if (!err)
		xdp_return_buff(xdp);
	return err;
}
 284
/* XDP_REDIRECT target: receive the packet and queue the socket on this
 * CPU's flush list so __xsk_map_flush() can batch the ring submits at
 * the end of the NAPI poll. A NULL flush_node.prev means the socket is
 * not on the list yet.
 */
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	int err;

	err = xsk_rcv(xs, xdp);
	if (err)
		return err;

	if (!xs->flush_node.prev)
		list_add(&xs->flush_node, flush_list);

	return 0;
}
 299
/* Flush every socket queued on this CPU's list by __xsk_map_redirect()
 * and take it off the list, clearing flush_node.prev so it can be
 * re-queued later.
 */
void __xsk_map_flush(void)
{
	struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
	struct xdp_sock *xs, *tmp;

	list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
		xsk_flush(xs);
		__list_del_clearprev(&xs->flush_node);
	}
}
 310
/* Driver notification that @nb_entries Tx frames finished: publish that
 * many completion queue entries to user space.
 */
void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
{
	xskq_prod_submit_n(pool->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_tx_completed);
 316
/* Release consumed Tx ring entries for every socket sharing the pool
 * (RCU walk) and wake writers whose ring has drained enough to be
 * writeable again.
 */
void xsk_tx_release(struct xsk_buff_pool *pool)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		__xskq_cons_release(xs->tx);
		if (xsk_tx_writeable(xs))
			xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_tx_release);
 330
/* Peek one Tx descriptor from any socket sharing the pool. Returns true
 * with *desc filled in on success. Sockets with empty Tx rings are
 * skipped (counting queue_empty_descs); if the completion queue cannot
 * accept a reservation, give up entirely rather than buffer.
 */
bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
		if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
			xs->tx->queue_empty_descs++;
			continue;
		}

		/* This is the backpressure mechanism for the Tx path.
		 * Reserve space in the completion queue and only proceed
		 * if there is space in it. This avoids having to implement
		 * any buffering in the Tx path.
		 */
		if (xskq_prod_reserve_addr(pool->cq, desc->addr))
			goto out;

		xskq_cons_release(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_tx_peek_desc);
 360
 361static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs,
 362                                        u32 max_entries)
 363{
 364        u32 nb_pkts = 0;
 365
 366        while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
 367                nb_pkts++;
 368
 369        xsk_tx_release(pool);
 370        return nb_pkts;
 371}
 372
/* Batched Tx descriptor harvest for zero-copy drivers. Only usable when
 * exactly one socket owns the pool's Tx list; otherwise fall back to the
 * one-at-a-time path. Returns the number of descriptors placed in @descs,
 * bounded by completion queue space (the backpressure mechanism).
 */
u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs,
				   u32 max_entries)
{
	struct xdp_sock *xs;
	u32 nb_pkts;

	rcu_read_lock();
	if (!list_is_singular(&pool->xsk_tx_list)) {
		/* Fallback to the non-batched version */
		rcu_read_unlock();
		return xsk_tx_peek_release_fallback(pool, descs, max_entries);
	}

	xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
	if (!xs) {
		nb_pkts = 0;
		goto out;
	}

	nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries);
	if (!nb_pkts) {
		xs->tx->queue_empty_descs++;
		goto out;
	}

	/* This is the backpressure mechanism for the Tx path. Try to
	 * reserve space in the completion queue for all packets, but
	 * if there are fewer slots available, just process that many
	 * packets. This avoids having to implement any buffering in
	 * the Tx path.
	 */
	nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts);
	if (!nb_pkts)
		goto out;

	xskq_cons_release_n(xs->tx, nb_pkts);
	__xskq_cons_release(xs->tx);
	xs->sk.sk_write_space(&xs->sk);

out:
	rcu_read_unlock();
	return nb_pkts;
}
EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
 417
/* Ask the driver to process the socket's queue via ndo_xsk_wakeup().
 * @flags is XDP_WAKEUP_RX and/or XDP_WAKEUP_TX. Called under RCU so the
 * netdev ops stay valid for the duration of the call.
 */
static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
	struct net_device *dev = xs->dev;
	int err;

	rcu_read_lock();
	err = dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
	rcu_read_unlock();

	return err;
}
 429
/* Zero-copy transmit: kick the driver to drain the Tx ring. */
static int xsk_zc_xmit(struct xdp_sock *xs)
{
	return xsk_wakeup(xs, XDP_WAKEUP_TX);
}
 434
/* skb destructor for copy-mode Tx. Submits the umem address stashed in
 * destructor_arg to the completion queue (the slot was reserved in
 * xsk_generic_xmit()), then does the normal write-memory accounting via
 * sock_wfree(). Runs in arbitrary context, hence the irqsave lock.
 */
static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->pool->cq_lock, flags);
	xskq_prod_submit_addr(xs->pool->cq, addr);
	spin_unlock_irqrestore(&xs->pool->cq_lock, flags);

	sock_wfree(skb);
}
 447
/* Build an skb for IFF_TX_SKB_NO_LINEAR devices without copying payload:
 * allocate a header-only skb and attach the umem pages backing the
 * descriptor as page frags, taking a reference on each page. truesize is
 * accounted as the full chunk (or len for unaligned pools) and charged
 * to sk_wmem_alloc.
 *
 * NOTE(review): the frag loop is not bounded by MAX_SKB_FRAGS here; it
 * appears to rely on the descriptor length being validated against the
 * chunk size earlier in the Tx path — confirm.
 */
static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
					      struct xdp_desc *desc)
{
	struct xsk_buff_pool *pool = xs->pool;
	u32 hr, len, ts, offset, copy, copied;
	struct sk_buff *skb;
	struct page *page;
	void *buffer;
	int err, i;
	u64 addr;

	hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));

	skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
	if (unlikely(!skb))
		return ERR_PTR(err);

	skb_reserve(skb, hr);

	addr = desc->addr;
	len = desc->len;
	ts = pool->unaligned ? len : pool->chunk_size;

	buffer = xsk_buff_raw_get_data(pool, addr);
	offset = offset_in_page(buffer);
	/* Convert to an offset into the umem so pages can be indexed. */
	addr = buffer - pool->addrs;

	for (copied = 0, i = 0; copied < len; i++) {
		page = pool->umem->pgs[addr >> PAGE_SHIFT];
		get_page(page);

		copy = min_t(u32, PAGE_SIZE - offset, len - copied);
		skb_fill_page_desc(skb, i, page, offset, copy);

		copied += copy;
		addr += copy;
		offset = 0;
	}

	skb->len += len;
	skb->data_len += len;
	skb->truesize += ts;

	refcount_add(ts, &xs->sk.sk_wmem_alloc);

	return skb;
}
 495
/* Build an skb for one Tx descriptor in copy mode. Devices flagged
 * IFF_TX_SKB_NO_LINEAR get the page-frag zerocopy construction; all
 * others get a linear skb with the payload copied out of the umem.
 * The descriptor address is stashed in destructor_arg so
 * xsk_destruct_skb() can complete it. Returns an ERR_PTR on failure.
 */
static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
				     struct xdp_desc *desc)
{
	struct net_device *dev = xs->dev;
	struct sk_buff *skb;

	if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
		skb = xsk_build_skb_zerocopy(xs, desc);
		if (IS_ERR(skb))
			return skb;
	} else {
		u32 hr, tr, len;
		void *buffer;
		int err;

		hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
		tr = dev->needed_tailroom;
		len = desc->len;

		skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
		if (unlikely(!skb))
			return ERR_PTR(err);

		skb_reserve(skb, hr);
		skb_put(skb, len);

		buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err)) {
			kfree_skb(skb);
			return ERR_PTR(err);
		}
	}

	skb->dev = dev;
	skb->priority = xs->sk.sk_priority;
	skb->mark = xs->sk.sk_mark;
	skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
	skb->destructor = xsk_destruct_skb;

	return skb;
}
 538
 539static int xsk_generic_xmit(struct sock *sk)
 540{
 541        struct xdp_sock *xs = xdp_sk(sk);
 542        u32 max_batch = TX_BATCH_SIZE;
 543        bool sent_frame = false;
 544        struct xdp_desc desc;
 545        struct sk_buff *skb;
 546        unsigned long flags;
 547        int err = 0;
 548
 549        mutex_lock(&xs->mutex);
 550
 551        if (xs->queue_id >= xs->dev->real_num_tx_queues)
 552                goto out;
 553
 554        while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
 555                if (max_batch-- == 0) {
 556                        err = -EAGAIN;
 557                        goto out;
 558                }
 559
 560                skb = xsk_build_skb(xs, &desc);
 561                if (IS_ERR(skb)) {
 562                        err = PTR_ERR(skb);
 563                        goto out;
 564                }
 565
 566                /* This is the backpressure mechanism for the Tx path.
 567                 * Reserve space in the completion queue and only proceed
 568                 * if there is space in it. This avoids having to implement
 569                 * any buffering in the Tx path.
 570                 */
 571                spin_lock_irqsave(&xs->pool->cq_lock, flags);
 572                if (xskq_prod_reserve(xs->pool->cq)) {
 573                        spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 574                        kfree_skb(skb);
 575                        goto out;
 576                }
 577                spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 578
 579                err = __dev_direct_xmit(skb, xs->queue_id);
 580                if  (err == NETDEV_TX_BUSY) {
 581                        /* Tell user-space to retry the send */
 582                        skb->destructor = sock_wfree;
 583                        spin_lock_irqsave(&xs->pool->cq_lock, flags);
 584                        xskq_prod_cancel(xs->pool->cq);
 585                        spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
 586                        /* Free skb without triggering the perf drop trace */
 587                        consume_skb(skb);
 588                        err = -EAGAIN;
 589                        goto out;
 590                }
 591
 592                xskq_cons_release(xs->tx);
 593                /* Ignore NET_XMIT_CN as packet might have been sent */
 594                if (err == NET_XMIT_DROP) {
 595                        /* SKB completed but not sent */
 596                        err = -EBUSY;
 597                        goto out;
 598                }
 599
 600                sent_frame = true;
 601        }
 602
 603        xs->tx->queue_empty_descs++;
 604
 605out:
 606        if (sent_frame)
 607                if (xsk_tx_writeable(xs))
 608                        sk->sk_write_space(sk);
 609
 610        mutex_unlock(&xs->mutex);
 611        return err;
 612}
 613
/* Common send path after bind checks: require the device to be up and a
 * Tx ring to exist, then dispatch to the zero-copy wakeup or the generic
 * copy-mode transmit.
 */
static int __xsk_sendmsg(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->tx))
		return -ENOBUFS;

	return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
}
 625
/* Return true when the wakeup should be skipped because the application
 * prefers busy polling: it has set SO_PREFER_BUSY_POLL, a busy-poll
 * timeout, and the socket has a valid NAPI id. Always false without
 * CONFIG_NET_RX_BUSY_POLL.
 */
static bool xsk_no_wakeup(struct sock *sk)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
	/* Prefer busy-polling, skip the wakeup. */
	return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
		READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
#else
	return false;
#endif
}
 636
/* sendmsg() handler. AF_XDP only supports non-blocking sends: a missing
 * MSG_DONTWAIT is rejected with -EOPNOTSUPP. May busy-poll first; the
 * actual transmit only runs when the driver has requested a Tx wakeup
 * (or unconditionally skipped when busy polling is preferred).
 */
static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xsk_buff_pool *pool;

	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	if (sk_can_busy_loop(sk))
		sk_busy_loop(sk, 1); /* only support non-blocking sockets */

	if (xsk_no_wakeup(sk))
		return 0;

	pool = xs->pool;
	if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
		return __xsk_sendmsg(sk);
	return 0;
}
 660
/* recvmsg() handler. Like sendmsg, only non-blocking operation is
 * supported. Data is delivered through the Rx ring, not msghdr; this
 * call only busy-polls and/or kicks the driver (zero-copy mode) when an
 * Rx wakeup was requested.
 */
static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
{
	bool need_wait = !(flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xsk_is_bound(xs)))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->rx))
		return -ENOBUFS;
	if (unlikely(need_wait))
		return -EOPNOTSUPP;

	if (sk_can_busy_loop(sk))
		sk_busy_loop(sk, 1); /* only support non-blocking sockets */

	if (xsk_no_wakeup(sk))
		return 0;

	if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
		return xsk_wakeup(xs, XDP_WAKEUP_RX);
	return 0;
}
 686
/* poll() handler. If the driver requested a wakeup, drive it here:
 * zero-copy sockets kick the driver, copy-mode sockets run the Tx path
 * directly. Then report EPOLLIN when the Rx ring has entries and
 * EPOLLOUT when the Tx ring is writeable.
 */
static __poll_t xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	__poll_t mask = 0;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct xsk_buff_pool *pool;

	sock_poll_wait(file, sock, wait);

	if (unlikely(!xsk_is_bound(xs)))
		return mask;

	pool = xs->pool;

	if (pool->cached_need_wakeup) {
		if (xs->zc)
			xsk_wakeup(xs, pool->cached_need_wakeup);
		else
			/* Poll needs to drive Tx also in copy mode */
			__xsk_sendmsg(sk);
	}

	if (xs->rx && !xskq_prod_is_empty(xs->rx))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (xs->tx && xsk_tx_writeable(xs))
		mask |= EPOLLOUT | EPOLLWRNORM;

	return mask;
}
 717
/* Allocate one of the socket's rings and publish it through *queue.
 * Entries must be a non-zero power of two and the slot must be empty.
 * The write barrier orders ring initialization before the pointer store
 * so lock-free readers never see a half-initialized queue.
 */
static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	WRITE_ONCE(*queue, q);
	return 0;
}
 735
/* Tear down the socket's binding: mark it unbound, remove it from the
 * pool's Tx list, and wait for a grace period (synchronize_net) so no
 * driver/datapath reference to the socket remains before dropping the
 * device reference.
 */
static void xsk_unbind_dev(struct xdp_sock *xs)
{
	struct net_device *dev = xs->dev;

	if (xs->state != XSK_BOUND)
		return;
	WRITE_ONCE(xs->state, XSK_UNBOUND);

	/* Wait for driver to stop using the xdp socket. */
	xp_del_xsk(xs->pool, xs);
	xs->dev = NULL;
	synchronize_net();
	dev_put(dev);
}
 750
/* Pop the first map this socket is a member of. Returns the map with a
 * reference taken (caller must bpf_map_put()), and stores the map entry
 * slot in *map_entry; returns NULL with *map_entry cleared when the
 * socket is in no maps. The map_list_lock only protects the list walk —
 * the bpf_map_inc keeps the map alive after the lock is dropped.
 */
static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
					      struct xdp_sock __rcu ***map_entry)
{
	struct xsk_map *map = NULL;
	struct xsk_map_node *node;

	*map_entry = NULL;

	spin_lock_bh(&xs->map_list_lock);
	node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
					node);
	if (node) {
		bpf_map_inc(&node->map->map);
		map = node->map;
		*map_entry = node->map_entry;
	}
	spin_unlock_bh(&xs->map_list_lock);
	return map;
}
 770
static void xsk_delete_from_maps(struct xdp_sock *xs)
{
	/* This function removes the current XDP socket from all the
	 * maps it resides in. We need to take extra care here, due to
	 * the two locks involved. Each map has a lock synchronizing
	 * updates to the entries, and each socket has a lock that
	 * synchronizes access to the list of maps (map_list). For
	 * deadlock avoidance the locks need to be taken in the order
	 * "map lock"->"socket map list lock". We start off by
	 * accessing the socket map list, and take a reference to the
	 * map to guarantee existence between the
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
	 * calls. Then we ask the map to remove the socket, which
	 * tries to remove the socket from the map. Note that there
	 * might be updates to the map between
	 * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
	 */
	struct xdp_sock __rcu **map_entry = NULL;
	struct xsk_map *map;

	/* Loop until the socket's map list is empty; each iteration drops
	 * the reference taken by xsk_get_map_list_entry().
	 */
	while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
		xsk_map_try_sock_delete(map, xs, map_entry);
		bpf_map_put(&map->map);
	}
}
 796
/* Socket release (close). Unlinks the socket from the per-netns list,
 * adjusts protocol usage accounting, removes it from all xskmaps,
 * unbinds from the device, frees the four rings, and drops the final
 * socket reference.
 */
static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	mutex_lock(&net->xdp.lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	xsk_delete_from_maps(xs);
	mutex_lock(&xs->mutex);
	xsk_unbind_dev(xs);
	mutex_unlock(&xs->mutex);

	/* fq_tmp/cq_tmp are only non-NULL if bind() never handed them to a
	 * buffer pool.
	 */
	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);
	xskq_destroy(xs->fq_tmp);
	xskq_destroy(xs->cq_tmp);

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}
 834
 835static struct socket *xsk_lookup_xsk_from_fd(int fd)
 836{
 837        struct socket *sock;
 838        int err;
 839
 840        sock = sockfd_lookup(fd, &err);
 841        if (!sock)
 842                return ERR_PTR(-ENOTSOCK);
 843
 844        if (sock->sk->sk_family != PF_XDP) {
 845                sockfd_put(sock);
 846                return ERR_PTR(-ENOPROTOOPT);
 847        }
 848
 849        return sock;
 850}
 851
 852static bool xsk_validate_queues(struct xdp_sock *xs)
 853{
 854        return xs->fq_tmp && xs->cq_tmp;
 855}
 856
/* bind() handler: attach the socket to a device/queue pair, creating or
 * sharing a buffer pool depending on the flags.
 *
 * Three cases:
 *  - XDP_SHARED_UMEM: adopt another socket's umem; share its pool too
 *    when device and queue match, otherwise create a new pool on the
 *    shared umem.
 *  - own umem, fq and cq registered: create and assign a fresh pool.
 *  - anything else: -EINVAL.
 *
 * On success the state transition to XSK_BOUND is published after an
 * smp_wmb() that pairs with the smp_rmb() in xsk_is_bound().
 */
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	flags = sxdp->sxdp_flags;
	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
		      XDP_USE_NEED_WAKEUP))
		return -EINVAL;

	rtnl_lock();
	mutex_lock(&xs->mutex);
	/* Only an unbound, ready socket may bind. */
	if (xs->state != XSK_READY) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
		    (flags & XDP_USE_NEED_WAKEUP)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We have already our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!xsk_is_bound(umem_xs)) {
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		}

		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
			/* Share the umem with another socket on another qid
			 * and/or device.
			 */
			xs->pool = xp_create_and_assign_umem(xs,
							     umem_xs->umem);
			if (!xs->pool) {
				err = -ENOMEM;
				sockfd_put(sock);
				goto out_unlock;
			}

			err = xp_assign_dev_shared(xs->pool, umem_xs->umem,
						   dev, qid);
			if (err) {
				xp_destroy(xs->pool);
				xs->pool = NULL;
				sockfd_put(sock);
				goto out_unlock;
			}
		} else {
			/* Share the buffer pool with the other socket. */
			if (xs->fq_tmp || xs->cq_tmp) {
				/* Do not allow setting your own fq or cq. */
				err = -EINVAL;
				sockfd_put(sock);
				goto out_unlock;
			}

			xp_get_pool(umem_xs->pool);
			xs->pool = umem_xs->pool;
		}

		xdp_get_umem(umem_xs->umem);
		WRITE_ONCE(xs->umem, umem_xs->umem);
		sockfd_put(sock);
	} else if (!xs->umem || !xsk_validate_queues(xs)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
		if (!xs->pool) {
			err = -ENOMEM;
			goto out_unlock;
		}

		err = xp_assign_dev(xs->pool, dev, qid, flags);
		if (err) {
			xp_destroy(xs->pool);
			xs->pool = NULL;
			goto out_unlock;
		}
	}

	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
	xs->fq_tmp = NULL;
	xs->cq_tmp = NULL;

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->queue_id = qid;
	xp_add_xsk(xs->pool, xs);

out_unlock:
	if (err) {
		dev_put(dev);
	} else {
		/* Matches smp_rmb() in bind() for shared umem
		 * sockets, and xsk_is_bound().
		 */
		smp_wmb();
		WRITE_ONCE(xs->state, XSK_BOUND);
	}
out_release:
	mutex_unlock(&xs->mutex);
	rtnl_unlock();
	return err;
}
1005
/* Original layout of struct xdp_umem_reg, before the flags field was
 * appended to the uapi struct.  Kept so XDP_UMEM_REG can accept the
 * shorter optlen passed by applications built against old headers.
 */
struct xdp_umem_reg_v1 {
	__u64 addr; /* Start of packet data area */
	__u64 len; /* Length of packet data area */
	__u32 chunk_size;
	__u32 headroom;
};
1012
1013static int xsk_setsockopt(struct socket *sock, int level, int optname,
1014                          sockptr_t optval, unsigned int optlen)
1015{
1016        struct sock *sk = sock->sk;
1017        struct xdp_sock *xs = xdp_sk(sk);
1018        int err;
1019
1020        if (level != SOL_XDP)
1021                return -ENOPROTOOPT;
1022
1023        switch (optname) {
1024        case XDP_RX_RING:
1025        case XDP_TX_RING:
1026        {
1027                struct xsk_queue **q;
1028                int entries;
1029
1030                if (optlen < sizeof(entries))
1031                        return -EINVAL;
1032                if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1033                        return -EFAULT;
1034
1035                mutex_lock(&xs->mutex);
1036                if (xs->state != XSK_READY) {
1037                        mutex_unlock(&xs->mutex);
1038                        return -EBUSY;
1039                }
1040                q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
1041                err = xsk_init_queue(entries, q, false);
1042                if (!err && optname == XDP_TX_RING)
1043                        /* Tx needs to be explicitly woken up the first time */
1044                        xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
1045                mutex_unlock(&xs->mutex);
1046                return err;
1047        }
1048        case XDP_UMEM_REG:
1049        {
1050                size_t mr_size = sizeof(struct xdp_umem_reg);
1051                struct xdp_umem_reg mr = {};
1052                struct xdp_umem *umem;
1053
1054                if (optlen < sizeof(struct xdp_umem_reg_v1))
1055                        return -EINVAL;
1056                else if (optlen < sizeof(mr))
1057                        mr_size = sizeof(struct xdp_umem_reg_v1);
1058
1059                if (copy_from_sockptr(&mr, optval, mr_size))
1060                        return -EFAULT;
1061
1062                mutex_lock(&xs->mutex);
1063                if (xs->state != XSK_READY || xs->umem) {
1064                        mutex_unlock(&xs->mutex);
1065                        return -EBUSY;
1066                }
1067
1068                umem = xdp_umem_create(&mr);
1069                if (IS_ERR(umem)) {
1070                        mutex_unlock(&xs->mutex);
1071                        return PTR_ERR(umem);
1072                }
1073
1074                /* Make sure umem is ready before it can be seen by others */
1075                smp_wmb();
1076                WRITE_ONCE(xs->umem, umem);
1077                mutex_unlock(&xs->mutex);
1078                return 0;
1079        }
1080        case XDP_UMEM_FILL_RING:
1081        case XDP_UMEM_COMPLETION_RING:
1082        {
1083                struct xsk_queue **q;
1084                int entries;
1085
1086                if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1087                        return -EFAULT;
1088
1089                mutex_lock(&xs->mutex);
1090                if (xs->state != XSK_READY) {
1091                        mutex_unlock(&xs->mutex);
1092                        return -EBUSY;
1093                }
1094
1095                q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
1096                        &xs->cq_tmp;
1097                err = xsk_init_queue(entries, q, true);
1098                mutex_unlock(&xs->mutex);
1099                return err;
1100        }
1101        default:
1102                break;
1103        }
1104
1105        return -ENOPROTOOPT;
1106}
1107
1108static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
1109{
1110        ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
1111        ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
1112        ring->desc = offsetof(struct xdp_rxtx_ring, desc);
1113}
1114
1115static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
1116{
1117        ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
1118        ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
1119        ring->desc = offsetof(struct xdp_umem_ring, desc);
1120}
1121
/* Original layout of struct xdp_statistics, before the ring-full and
 * empty-descriptor counters were appended.  Used by XDP_STATISTICS to
 * serve applications that pass the old, shorter optlen.
 */
struct xdp_statistics_v1 {
	__u64 rx_dropped;
	__u64 rx_invalid_descs;
	__u64 tx_invalid_descs;
};
1127
/* getsockopt() handler for SOL_XDP.
 *
 * XDP_STATISTICS and XDP_MMAP_OFFSETS grew over time; both cases size
 * the struct copied to userspace according to the caller's optlen so
 * that applications built against old headers keep working.
 */
static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats = {};
		bool extra_stats = true;
		size_t stats_size;

		/* Callers that only know the v1 layout get the truncated
		 * struct; the new counters are folded into rx_dropped
		 * below so no drops go unreported to them.
		 */
		if (len < sizeof(struct xdp_statistics_v1)) {
			return -EINVAL;
		} else if (len < sizeof(stats)) {
			extra_stats = false;
			stats_size = sizeof(struct xdp_statistics_v1);
		} else {
			stats_size = sizeof(stats);
		}

		/* Take xs->mutex so the counters form a consistent snapshot
		 * with respect to setsockopt/bind changing the queues.
		 */
		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		if (extra_stats) {
			stats.rx_ring_full = xs->rx_queue_full;
			stats.rx_fill_ring_empty_descs =
				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
		} else {
			stats.rx_dropped += xs->rx_queue_full;
		}
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, stats_size))
			return -EFAULT;
		if (put_user(stats_size, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;
		struct xdp_mmap_offsets_v1 off_v1;
		bool flags_supported = true;
		void *to_copy;

		if (len < sizeof(off_v1))
			return -EINVAL;
		else if (len < sizeof(off))
			flags_supported = false;

		if (flags_supported) {
			/* xdp_ring_offset is identical to xdp_ring_offset_v1
			 * except for the flags field added to the end.
			 */
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.rx);
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.tx);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.fr);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.cr);
			off.rx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.tx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.fr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);
			off.cr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);

			len = sizeof(off);
			to_copy = &off;
		} else {
			xsk_enter_rxtx_offsets(&off_v1.rx);
			xsk_enter_rxtx_offsets(&off_v1.tx);
			xsk_enter_umem_offsets(&off_v1.fr);
			xsk_enter_umem_offsets(&off_v1.cr);

			len = sizeof(off_v1);
			to_copy = &off_v1;
		}

		if (copy_to_user(optval, to_copy, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_OPTIONS:
	{
		struct xdp_options opts = {};

		if (len < sizeof(opts))
			return -EINVAL;

		/* xs->zc is set during bind; take the mutex so we do not
		 * race with a concurrent bind/unbind.
		 */
		mutex_lock(&xs->mutex);
		if (xs->zc)
			opts.flags |= XDP_OPTIONS_ZEROCOPY;
		mutex_unlock(&xs->mutex);

		len = sizeof(opts);
		if (copy_to_user(optval, &opts, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}
1258
/* mmap() handler: map one of the four rings (RX, TX, fill, completion)
 * into the caller's address space.  The ring is selected by the mmap
 * page offset (XDP_PGOFF_* / XDP_UMEM_PGOFF_* constants), and mapping
 * is only allowed before the socket is bound (state XSK_READY).
 */
static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	unsigned long pfn;
	struct page *qpg;

	if (READ_ONCE(xs->state) != XSK_READY)
		return -EBUSY;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		/* Matches the smp_wmb() in XDP_UMEM_REG */
		smp_rmb();
		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(xs->fq_tmp);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(xs->cq_tmp);
	}

	/* NULL means the corresponding ring was never created via
	 * setsockopt, so there is nothing to map.
	 */
	if (!q)
		return -EINVAL;

	/* Matches the smp_wmb() in xsk_init_queue */
	smp_rmb();
	qpg = virt_to_head_page(q->ring);
	if (size > page_size(qpg))
		return -EINVAL;

	/* Remap the physically contiguous ring pages into the vma. */
	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}
1298
/* Netdevice notifier callback.  On NETDEV_UNREGISTER, walk all AF_XDP
 * sockets in the device's netns; for sockets bound to the disappearing
 * device, report ENETDOWN to userspace, unbind the socket and drop the
 * buffer pool's device references so the netdev can be released.
 */
static int xsk_notifier(struct notifier_block *this,
			unsigned long msg, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct sock *sk;

	switch (msg) {
	case NETDEV_UNREGISTER:
		mutex_lock(&net->xdp.lock);
		sk_for_each(sk, &net->xdp.list) {
			struct xdp_sock *xs = xdp_sk(sk);

			mutex_lock(&xs->mutex);
			if (xs->dev == dev) {
				sk->sk_err = ENETDOWN;
				if (!sock_flag(sk, SOCK_DEAD))
					sk_error_report(sk);

				xsk_unbind_dev(xs);

				/* Clear device references. */
				xp_clear_dev(xs->pool);
			}
			mutex_unlock(&xs->mutex);
		}
		mutex_unlock(&net->xdp.lock);
		break;
	}
	return NOTIFY_DONE;
}
1330
/* Protocol definition for PF_XDP sockets; obj_size makes sk_alloc()
 * allocate a full struct xdp_sock per socket.
 */
static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};
1336
/* Socket operations for PF_XDP.  AF_XDP is connectionless and
 * datagram-like: connect/accept/listen etc. are stubbed out with the
 * sock_no_* helpers; the real work happens in bind, poll, sendmsg,
 * recvmsg, the sockopt handlers and mmap.
 */
static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= xsk_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};
1357
1358static void xsk_destruct(struct sock *sk)
1359{
1360        struct xdp_sock *xs = xdp_sk(sk);
1361
1362        if (!sock_flag(sk, SOCK_DEAD))
1363                return;
1364
1365        if (!xp_put_pool(xs->pool))
1366                xdp_put_umem(xs->umem, !xs->pool);
1367
1368        sk_refcnt_debug_dec(sk);
1369}
1370
/* Create a new AF_XDP socket: socket(AF_XDP, SOCK_RAW, 0).
 *
 * Requires CAP_NET_RAW in the netns' user namespace; only SOCK_RAW
 * with protocol 0 is supported.  The new socket starts in XSK_READY
 * and is added to the per-netns socket list used by xsk_notifier().
 * Returns 0 on success or a negative errno.
 */
static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct xdp_sock *xs;
	struct sock *sk;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	/* Sockets are looked up under RCU from the XDP fast path, so the
	 * sock must be freed via an RCU grace period.
	 */
	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	xs->state = XSK_READY;
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->rx_lock);

	INIT_LIST_HEAD(&xs->map_list);
	spin_lock_init(&xs->map_list_lock);

	mutex_lock(&net->xdp.lock);
	sk_add_node_rcu(sk, &net->xdp.list);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}
1420
/* Registered with sock_register() so socket(AF_XDP, ...) reaches
 * xsk_create().
 */
static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};
1426
/* Notifier block used to catch NETDEV_UNREGISTER, see xsk_notifier(). */
static struct notifier_block xsk_netdev_notifier = {
	.notifier_call	= xsk_notifier,
};
1430
1431static int __net_init xsk_net_init(struct net *net)
1432{
1433        mutex_init(&net->xdp.lock);
1434        INIT_HLIST_HEAD(&net->xdp.list);
1435        return 0;
1436}
1437
/* Per-netns teardown: by now every AF_XDP socket in this netns must
 * already have been released, so the list should be empty.
 */
static void __net_exit xsk_net_exit(struct net *net)
{
	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}
1442
/* Pernet hooks wiring xsk_net_init/xsk_net_exit into netns lifetime. */
static struct pernet_operations xsk_net_ops = {
	.init = xsk_net_init,
	.exit = xsk_net_exit,
};
1447
/* Module init: register the AF_XDP protocol, socket family, pernet ops
 * and netdev notifier, and initialize the per-CPU xskmap flush lists.
 * Each failure path unwinds the registrations done before it.
 */
static int __init xsk_init(void)
{
	int err, cpu;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	err = register_pernet_subsys(&xsk_net_ops);
	if (err)
		goto out_sk;

	err = register_netdevice_notifier(&xsk_netdev_notifier);
	if (err)
		goto out_pernet;

	/* Flush lists are consumed by the xskmap redirect path. */
	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
	return 0;

out_pernet:
	unregister_pernet_subsys(&xsk_net_ops);
out_sk:
	sock_unregister(PF_XDP);
out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}
1481
/* fs_initcall runs after core_initcall, so core networking (needed by
 * sock_register() above) is already initialized at this point.
 */
fs_initcall(xsk_init);
1483