linux/net/xdp/xsk.c
// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *            Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

#define TX_BATCH_SIZE 16

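/* Used by the XSKMAP code to check that this socket is sufficiently set up
 * to be inserted into a map: it needs an Rx ring and a umem with a fill
 * ring before an XDP program may redirect packets to it.
 */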
bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
        return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
                READ_ONCE(xs->umem->fq);
}

bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
{
        return xskq_has_addrs(umem->fq, cnt);
}
EXPORT_SYMBOL(xsk_umem_has_addrs);

u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
        return xskq_peek_addr(umem->fq, addr, umem);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);

void xsk_umem_discard_addr(struct xdp_umem *umem)
{
        xskq_discard_addr(umem->fq);
}
EXPORT_SYMBOL(xsk_umem_discard_addr);

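/* The helpers below let zero-copy drivers toggle the XDP_RING_NEED_WAKEUP
 * flag on the fill and Tx rings. When the flag is set, userspace has to
 * explicitly kick the kernel (e.g. via poll() or sendmsg()) before more
 * descriptors are processed; when it is clear, the driver keeps making
 * progress on its own.
 */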
void xsk_set_rx_need_wakeup(struct xdp_umem *umem)
{
        if (umem->need_wakeup & XDP_WAKEUP_RX)
                return;

        umem->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
        umem->need_wakeup |= XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);

void xsk_set_tx_need_wakeup(struct xdp_umem *umem)
{
        struct xdp_sock *xs;

        if (umem->need_wakeup & XDP_WAKEUP_TX)
                return;

        rcu_read_lock();
        list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
                xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
        }
        rcu_read_unlock();

        umem->need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);

void xsk_clear_rx_need_wakeup(struct xdp_umem *umem)
{
        if (!(umem->need_wakeup & XDP_WAKEUP_RX))
                return;

        umem->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
        umem->need_wakeup &= ~XDP_WAKEUP_RX;
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);

void xsk_clear_tx_need_wakeup(struct xdp_umem *umem)
{
        struct xdp_sock *xs;

        if (!(umem->need_wakeup & XDP_WAKEUP_TX))
                return;

        rcu_read_lock();
        list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
                xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
        }
        rcu_read_unlock();

        umem->need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);

bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem)
{
        return umem->flags & XDP_UMEM_USES_NEED_WAKEUP;
}
EXPORT_SYMBOL(xsk_umem_uses_need_wakeup);

/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for
 * each page. This is only required in copy mode.
 */
static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf,
                             u32 len, u32 metalen)
{
        void *to_buf = xdp_umem_get_data(umem, addr);

        addr = xsk_umem_add_offset_to_addr(addr);
        if (xskq_crosses_non_contig_pg(umem, addr, len + metalen)) {
                void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr;
                u64 page_start = addr & ~(PAGE_SIZE - 1);
                u64 first_len = PAGE_SIZE - (addr - page_start);

                memcpy(to_buf, from_buf, first_len + metalen);
                memcpy(next_pg_addr, from_buf + first_len, len - first_len);

                return;
        }

        memcpy(to_buf, from_buf, len + metalen);
}

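/* Copy-mode receive: grab an address from the fill ring, copy the packet
 * (including any XDP metadata) into the umem, and publish a descriptor on
 * the Rx ring. On failure the frame is dropped and rx_dropped is bumped.
 */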
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
        u64 offset = xs->umem->headroom;
        u64 addr, memcpy_addr;
        void *from_buf;
        u32 metalen;
        int err;

        if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
            len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
                xs->rx_dropped++;
                return -ENOSPC;
        }

        if (unlikely(xdp_data_meta_unsupported(xdp))) {
                from_buf = xdp->data;
                metalen = 0;
        } else {
                from_buf = xdp->data_meta;
                metalen = xdp->data - xdp->data_meta;
        }

        memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
        __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen);

        offset += metalen;
        addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
        err = xskq_produce_batch_desc(xs->rx, addr, len);
        if (!err) {
                xskq_discard_addr(xs->umem->fq);
                xdp_return_buff(xdp);
                return 0;
        }

        xs->rx_dropped++;
        return err;
}

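/* Zero-copy receive: the frame already lives in the umem, so only the
 * (handle, len) descriptor has to be published on the Rx ring.
 */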
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
        int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);

        if (err)
                xs->rx_dropped++;

        return err;
}

static bool xsk_is_bound(struct xdp_sock *xs)
{
        if (READ_ONCE(xs->state) == XSK_BOUND) {
                /* Matches smp_wmb() in bind(). */
                smp_rmb();
                return true;
        }
        return false;
}

int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        u32 len;

        if (!xsk_is_bound(xs))
                return -EINVAL;

        if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
                return -EINVAL;

        len = xdp->data_end - xdp->data;

        return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
                __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
}

void xsk_flush(struct xdp_sock *xs)
{
        xskq_produce_flush_desc(xs->rx);
        xs->sk.sk_data_ready(&xs->sk);
}

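/* Receive path used by generic (skb-mode) XDP. Unlike the native path, it
 * serializes producers with rx_lock and flushes the Rx ring immediately
 * instead of batching via xsk_flush().
 */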
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        u32 metalen = xdp->data - xdp->data_meta;
        u32 len = xdp->data_end - xdp->data;
        u64 offset = xs->umem->headroom;
        void *buffer;
        u64 addr;
        int err;

        spin_lock_bh(&xs->rx_lock);

        if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
                err = -EINVAL;
                goto out_unlock;
        }

        if (!xskq_peek_addr(xs->umem->fq, &addr, xs->umem) ||
            len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
                err = -ENOSPC;
                goto out_drop;
        }

        addr = xsk_umem_adjust_offset(xs->umem, addr, offset);
        buffer = xdp_umem_get_data(xs->umem, addr);
        memcpy(buffer, xdp->data_meta, len + metalen);

        addr = xsk_umem_adjust_offset(xs->umem, addr, metalen);
        err = xskq_produce_batch_desc(xs->rx, addr, len);
        if (err)
                goto out_drop;

        xskq_discard_addr(xs->umem->fq);
        xskq_produce_flush_desc(xs->rx);

        spin_unlock_bh(&xs->rx_lock);

        xs->sk.sk_data_ready(&xs->sk);
        return 0;

out_drop:
        xs->rx_dropped++;
out_unlock:
        spin_unlock_bh(&xs->rx_lock);
        return err;
}

void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
        xskq_produce_flush_addr_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);

void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
        struct xdp_sock *xs;

        rcu_read_lock();
        list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
                xs->sk.sk_write_space(&xs->sk);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);

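/* Driver-facing Tx helper: dequeue the next descriptor from any socket
 * sharing this umem and lazily reserve a matching slot on the completion
 * ring. Returns false once no socket has anything left to send or the
 * completion ring is full.
 */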
bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
{
        struct xdp_sock *xs;

        rcu_read_lock();
        list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
                if (!xskq_peek_desc(xs->tx, desc, umem))
                        continue;

                if (xskq_produce_addr_lazy(umem->cq, desc->addr))
                        goto out;

                xskq_discard_desc(xs->tx);
                rcu_read_unlock();
                return true;
        }

out:
        rcu_read_unlock();
        return false;
}
EXPORT_SYMBOL(xsk_umem_consume_tx);

static int xsk_zc_xmit(struct xdp_sock *xs)
{
        struct net_device *dev = xs->dev;

        return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
                                               XDP_WAKEUP_TX);
}

static void xsk_destruct_skb(struct sk_buff *skb)
{
        u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
        struct xdp_sock *xs = xdp_sk(skb->sk);
        unsigned long flags;

        spin_lock_irqsave(&xs->tx_completion_lock, flags);
        WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
        spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

        sock_wfree(skb);
}

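/* Copy-mode transmit: for each Tx descriptor, allocate an skb, copy the
 * payload out of the umem and push it straight to the queue selected at
 * bind time via dev_direct_xmit(). The completion entry is produced later
 * from xsk_destruct_skb() when the skb is freed.
 */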
static int xsk_generic_xmit(struct sock *sk)
{
        struct xdp_sock *xs = xdp_sk(sk);
        u32 max_batch = TX_BATCH_SIZE;
        bool sent_frame = false;
        struct xdp_desc desc;
        struct sk_buff *skb;
        int err = 0;

        mutex_lock(&xs->mutex);

        if (xs->queue_id >= xs->dev->real_num_tx_queues)
                goto out;

        while (xskq_peek_desc(xs->tx, &desc, xs->umem)) {
                char *buffer;
                u64 addr;
                u32 len;

                if (max_batch-- == 0) {
                        err = -EAGAIN;
                        goto out;
                }

                len = desc.len;
                skb = sock_alloc_send_skb(sk, len, 1, &err);
                if (unlikely(!skb)) {
                        err = -EAGAIN;
                        goto out;
                }

                skb_put(skb, len);
                addr = desc.addr;
                buffer = xdp_umem_get_data(xs->umem, addr);
                err = skb_store_bits(skb, 0, buffer, len);
                if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) {
                        kfree_skb(skb);
                        goto out;
                }

                skb->dev = xs->dev;
                skb->priority = sk->sk_priority;
                skb->mark = sk->sk_mark;
                skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
                skb->destructor = xsk_destruct_skb;

                err = dev_direct_xmit(skb, xs->queue_id);
                xskq_discard_desc(xs->tx);
                /* Ignore NET_XMIT_CN as packet might have been sent */
                if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
                        /* SKB completed but not sent */
                        err = -EBUSY;
                        goto out;
                }

                sent_frame = true;
        }

out:
        if (sent_frame)
                sk->sk_write_space(sk);

        mutex_unlock(&xs->mutex);
        return err;
}

static int __xsk_sendmsg(struct sock *sk)
{
        struct xdp_sock *xs = xdp_sk(sk);

        if (unlikely(!(xs->dev->flags & IFF_UP)))
                return -ENETDOWN;
        if (unlikely(!xs->tx))
                return -ENOBUFS;

        return xs->zc ? xsk_zc_xmit(xs) : xsk_generic_xmit(sk);
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
        bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);

        if (unlikely(!xsk_is_bound(xs)))
                return -ENXIO;
        if (unlikely(need_wait))
                return -EOPNOTSUPP;

        return __xsk_sendmsg(sk);
}

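/* poll() doubles as the wakeup mechanism: besides reporting ring state, it
 * kicks the driver (or the copy-mode Tx path) whenever a need_wakeup flag
 * is set. A minimal sketch of a userspace loop built on this, with the
 * file descriptor and ring-handling code purely illustrative:
 *
 *      struct pollfd pfd = { .fd = xsk_fd, .events = POLLIN | POLLOUT };
 *
 *      while (poll(&pfd, 1, -1) > 0) {
 *              if (pfd.revents & POLLIN)
 *                      ;       // consume Rx ring entries
 *              if (pfd.revents & POLLOUT)
 *                      ;       // produce more Tx descriptors
 *      }
 */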
static unsigned int xsk_poll(struct file *file, struct socket *sock,
                             struct poll_table_struct *wait)
{
        unsigned int mask = datagram_poll(file, sock, wait);
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        struct net_device *dev;
        struct xdp_umem *umem;

        if (unlikely(!xsk_is_bound(xs)))
                return mask;

        dev = xs->dev;
        umem = xs->umem;

        if (umem->need_wakeup) {
                if (dev->netdev_ops->ndo_xsk_wakeup)
                        dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
                                                        umem->need_wakeup);
                else
                        /* Poll needs to drive Tx also in copy mode */
                        __xsk_sendmsg(sk);
        }

        if (xs->rx && !xskq_empty_desc(xs->rx))
                mask |= POLLIN | POLLRDNORM;
        if (xs->tx && !xskq_full_desc(xs->tx))
                mask |= POLLOUT | POLLWRNORM;

        return mask;
}

static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
                          bool umem_queue)
{
        struct xsk_queue *q;

        if (entries == 0 || *queue || !is_power_of_2(entries))
                return -EINVAL;

        q = xskq_create(entries, umem_queue);
        if (!q)
                return -ENOMEM;

        /* Make sure queue is ready before it can be seen by others */
        smp_wmb();
        WRITE_ONCE(*queue, q);
        return 0;
}

static void xsk_unbind_dev(struct xdp_sock *xs)
{
        struct net_device *dev = xs->dev;

        if (xs->state != XSK_BOUND)
                return;
        WRITE_ONCE(xs->state, XSK_UNBOUND);

        /* Wait for driver to stop using the xdp socket. */
        xdp_del_sk_umem(xs->umem, xs);
        xs->dev = NULL;
        synchronize_net();
        dev_put(dev);
}

static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
                                              struct xdp_sock ***map_entry)
{
        struct xsk_map *map = NULL;
        struct xsk_map_node *node;

        *map_entry = NULL;

        spin_lock_bh(&xs->map_list_lock);
        node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
                                        node);
        if (node) {
                WARN_ON(xsk_map_inc(node->map));
                map = node->map;
                *map_entry = node->map_entry;
        }
        spin_unlock_bh(&xs->map_list_lock);
        return map;
}

static void xsk_delete_from_maps(struct xdp_sock *xs)
{
        /* This function removes the current XDP socket from all the
         * maps it resides in. We need to take extra care here, due to
         * the two locks involved. Each map has a lock synchronizing
         * updates to the entries, and each socket has a lock that
         * synchronizes access to the list of maps (map_list). For
         * deadlock avoidance the locks need to be taken in the order
         * "map lock"->"socket map list lock". We start off by
         * accessing the socket map list, and take a reference to the
         * map to guarantee existence between the
         * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
         * calls. Then we ask the map to remove the socket, which
         * tries to remove the socket from the map. Note that there
         * might be updates to the map between
         * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
         */
        struct xdp_sock **map_entry = NULL;
        struct xsk_map *map;

        while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
                xsk_map_try_sock_delete(map, xs, map_entry);
                xsk_map_put(map);
        }
}

static int xsk_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        struct net *net;

        if (!sk)
                return 0;

        net = sock_net(sk);

        mutex_lock(&net->xdp.lock);
        sk_del_node_init_rcu(sk);
        mutex_unlock(&net->xdp.lock);

        local_bh_disable();
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        local_bh_enable();

        xsk_delete_from_maps(xs);
        mutex_lock(&xs->mutex);
        xsk_unbind_dev(xs);
        mutex_unlock(&xs->mutex);

        xskq_destroy(xs->rx);
        xskq_destroy(xs->tx);

        sock_orphan(sk);
        sock->sk = NULL;

        sk_refcnt_debug_release(sk);
        sock_put(sk);

        return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
        struct socket *sock;
        int err;

        sock = sockfd_lookup(fd, &err);
        if (!sock)
                return ERR_PTR(-ENOTSOCK);

        if (sock->sk->sk_family != PF_XDP) {
                sockfd_put(sock);
                return ERR_PTR(-ENOPROTOOPT);
        }

        return sock;
}

/* Check if umem pages are contiguous.
 * In zero-copy mode, use the DMA address to do the page contiguity check;
 * for all other modes, use addr (the kernel virtual address).
 * Store the result in the low bits of addr.
 */
static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags)
{
        struct xdp_umem_page *pgs = umem->pages;
        int i, is_contig;

        for (i = 0; i < umem->npgs - 1; i++) {
                is_contig = (flags & XDP_ZEROCOPY) ?
                        (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) :
                        (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr);
                pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT;
        }
}

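/* Binding attaches the socket to one queue of one netdev. A rough sketch of
 * the userspace side, with variable names purely illustrative and error
 * handling omitted:
 *
 *      struct sockaddr_xdp sxdp = {
 *              .sxdp_family   = AF_XDP,
 *              .sxdp_ifindex  = ifindex,
 *              .sxdp_queue_id = queue_id,
 *              .sxdp_flags    = XDP_USE_NEED_WAKEUP,
 *      };
 *
 *      bind(xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 *
 * sxdp_flags may also select XDP_COPY or XDP_ZEROCOPY explicitly. With
 * XDP_SHARED_UMEM, sxdp_shared_umem_fd names an already bound socket whose
 * umem is reused, and none of the other flags may be set.
 */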
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
        struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        struct net_device *dev;
        u32 flags, qid;
        int err = 0;

        if (addr_len < sizeof(struct sockaddr_xdp))
                return -EINVAL;
        if (sxdp->sxdp_family != AF_XDP)
                return -EINVAL;

        flags = sxdp->sxdp_flags;
        if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
                      XDP_USE_NEED_WAKEUP))
                return -EINVAL;

        rtnl_lock();
        mutex_lock(&xs->mutex);
        if (xs->state != XSK_READY) {
                err = -EBUSY;
                goto out_release;
        }

        dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
        if (!dev) {
                err = -ENODEV;
                goto out_release;
        }

        if (!xs->rx && !xs->tx) {
                err = -EINVAL;
                goto out_unlock;
        }

        qid = sxdp->sxdp_queue_id;

        if (flags & XDP_SHARED_UMEM) {
                struct xdp_sock *umem_xs;
                struct socket *sock;

                if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
                    (flags & XDP_USE_NEED_WAKEUP)) {
                        /* Cannot specify flags for shared sockets. */
                        err = -EINVAL;
                        goto out_unlock;
                }

                if (xs->umem) {
                        /* We already have our own. */
                        err = -EINVAL;
                        goto out_unlock;
                }

                sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
                if (IS_ERR(sock)) {
                        err = PTR_ERR(sock);
                        goto out_unlock;
                }

                umem_xs = xdp_sk(sock->sk);
                if (!xsk_is_bound(umem_xs)) {
                        err = -EBADF;
                        sockfd_put(sock);
                        goto out_unlock;
                }
                if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
                        err = -EINVAL;
                        sockfd_put(sock);
                        goto out_unlock;
                }

                xdp_get_umem(umem_xs->umem);
                WRITE_ONCE(xs->umem, umem_xs->umem);
                sockfd_put(sock);
        } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
                err = -EINVAL;
                goto out_unlock;
        } else {
                /* This xsk has its own umem. */
                xskq_set_umem(xs->umem->fq, xs->umem->size,
                              xs->umem->chunk_mask);
                xskq_set_umem(xs->umem->cq, xs->umem->size,
                              xs->umem->chunk_mask);

                err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
                if (err)
                        goto out_unlock;

                xsk_check_page_contiguity(xs->umem, flags);
        }

        xs->dev = dev;
        xs->zc = xs->umem->zc;
        xs->queue_id = qid;
        xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
        xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
        xdp_add_sk_umem(xs->umem, xs);

out_unlock:
        if (err) {
                dev_put(dev);
        } else {
                /* Matches smp_rmb() in bind() for shared umem
                 * sockets, and xsk_is_bound().
                 */
                smp_wmb();
                WRITE_ONCE(xs->state, XSK_BOUND);
        }
out_release:
        mutex_unlock(&xs->mutex);
        rtnl_unlock();
        return err;
}

struct xdp_umem_reg_v1 {
        __u64 addr; /* Start of packet data area */
        __u64 len; /* Length of packet data area */
        __u32 chunk_size;
        __u32 headroom;
};

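/* Socket options drive the setup sequence before bind(): userspace registers
 * a umem with XDP_UMEM_REG, sizes the rings with XDP_RX_RING/XDP_TX_RING and
 * XDP_UMEM_FILL_RING/XDP_UMEM_COMPLETION_RING, then mmap()s them. A loose
 * sketch, where the names and sizes are placeholders and error handling is
 * omitted:
 *
 *      struct xdp_umem_reg mr = {
 *              .addr = (__u64)(unsigned long)umem_area,
 *              .len = umem_len,
 *              .chunk_size = 2048,
 *              .headroom = 0,
 *      };
 *      int entries = 2048;
 *
 *      setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 *      setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_FILL_RING, &entries, sizeof(entries));
 *      setsockopt(xsk_fd, SOL_XDP, XDP_RX_RING, &entries, sizeof(entries));
 */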
static int xsk_setsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        int err;

        if (level != SOL_XDP)
                return -ENOPROTOOPT;

        switch (optname) {
        case XDP_RX_RING:
        case XDP_TX_RING:
        {
                struct xsk_queue **q;
                int entries;

                if (optlen < sizeof(entries))
                        return -EINVAL;
                if (copy_from_user(&entries, optval, sizeof(entries)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                if (xs->state != XSK_READY) {
                        mutex_unlock(&xs->mutex);
                        return -EBUSY;
                }
                q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
                err = xsk_init_queue(entries, q, false);
                if (!err && optname == XDP_TX_RING)
                        /* Tx needs to be explicitly woken up the first time */
                        xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
                mutex_unlock(&xs->mutex);
                return err;
        }
        case XDP_UMEM_REG:
        {
                size_t mr_size = sizeof(struct xdp_umem_reg);
                struct xdp_umem_reg mr = {};
                struct xdp_umem *umem;

                if (optlen < sizeof(struct xdp_umem_reg_v1))
                        return -EINVAL;
                else if (optlen < sizeof(mr))
                        mr_size = sizeof(struct xdp_umem_reg_v1);

                if (copy_from_user(&mr, optval, mr_size))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                if (xs->state != XSK_READY || xs->umem) {
                        mutex_unlock(&xs->mutex);
                        return -EBUSY;
                }

                umem = xdp_umem_create(&mr);
                if (IS_ERR(umem)) {
                        mutex_unlock(&xs->mutex);
                        return PTR_ERR(umem);
                }

                /* Make sure umem is ready before it can be seen by others */
                smp_wmb();
                WRITE_ONCE(xs->umem, umem);
                mutex_unlock(&xs->mutex);
                return 0;
        }
        case XDP_UMEM_FILL_RING:
        case XDP_UMEM_COMPLETION_RING:
        {
                struct xsk_queue **q;
                int entries;

                if (copy_from_user(&entries, optval, sizeof(entries)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                if (xs->state != XSK_READY) {
                        mutex_unlock(&xs->mutex);
                        return -EBUSY;
                }
                if (!xs->umem) {
                        mutex_unlock(&xs->mutex);
                        return -EINVAL;
                }

                q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
                        &xs->umem->cq;
                err = xsk_init_queue(entries, q, true);
                mutex_unlock(&xs->mutex);
                return err;
        }
        default:
                break;
        }

        return -ENOPROTOOPT;
}

static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
{
        ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
        ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
        ring->desc = offsetof(struct xdp_rxtx_ring, desc);
}

static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
{
        ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
        ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
        ring->desc = offsetof(struct xdp_umem_ring, desc);
}

static int xsk_getsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        int len;

        if (level != SOL_XDP)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case XDP_STATISTICS:
        {
                struct xdp_statistics stats;

                if (len < sizeof(stats))
                        return -EINVAL;

                mutex_lock(&xs->mutex);
                stats.rx_dropped = xs->rx_dropped;
                stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
                stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
                mutex_unlock(&xs->mutex);

                if (copy_to_user(optval, &stats, sizeof(stats)))
                        return -EFAULT;
                if (put_user(sizeof(stats), optlen))
                        return -EFAULT;

                return 0;
        }
        case XDP_MMAP_OFFSETS:
        {
                struct xdp_mmap_offsets off;
                struct xdp_mmap_offsets_v1 off_v1;
                bool flags_supported = true;
                void *to_copy;

                if (len < sizeof(off_v1))
                        return -EINVAL;
                else if (len < sizeof(off))
                        flags_supported = false;

                if (flags_supported) {
                        /* xdp_ring_offset is identical to xdp_ring_offset_v1
                         * except for the flags field added to the end.
                         */
                        xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
                                               &off.rx);
                        xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
                                               &off.tx);
                        xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
                                               &off.fr);
                        xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
                                               &off.cr);
                        off.rx.flags = offsetof(struct xdp_rxtx_ring,
                                                ptrs.flags);
                        off.tx.flags = offsetof(struct xdp_rxtx_ring,
                                                ptrs.flags);
                        off.fr.flags = offsetof(struct xdp_umem_ring,
                                                ptrs.flags);
                        off.cr.flags = offsetof(struct xdp_umem_ring,
                                                ptrs.flags);

                        len = sizeof(off);
                        to_copy = &off;
                } else {
                        xsk_enter_rxtx_offsets(&off_v1.rx);
                        xsk_enter_rxtx_offsets(&off_v1.tx);
                        xsk_enter_umem_offsets(&off_v1.fr);
                        xsk_enter_umem_offsets(&off_v1.cr);

                        len = sizeof(off_v1);
                        to_copy = &off_v1;
                }

                if (copy_to_user(optval, to_copy, len))
                        return -EFAULT;
                if (put_user(len, optlen))
                        return -EFAULT;

                return 0;
        }
        case XDP_OPTIONS:
        {
                struct xdp_options opts = {};

                if (len < sizeof(opts))
                        return -EINVAL;

                mutex_lock(&xs->mutex);
                if (xs->zc)
                        opts.flags |= XDP_OPTIONS_ZEROCOPY;
                mutex_unlock(&xs->mutex);

                len = sizeof(opts);
                if (copy_to_user(optval, &opts, len))
                        return -EFAULT;
                if (put_user(len, optlen))
                        return -EFAULT;

                return 0;
        }
        default:
                break;
        }

        return -EOPNOTSUPP;
}

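/* mmap() hands the rings to userspace: the page offset selects which ring is
 * being mapped (XDP_PGOFF_RX_RING, XDP_PGOFF_TX_RING, or the umem fill and
 * completion ring offsets), and userspace is expected to have queried
 * XDP_MMAP_OFFSETS first to locate the producer/consumer pointers and the
 * descriptor array inside each mapping.
 */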
static int xsk_mmap(struct file *file, struct socket *sock,
                    struct vm_area_struct *vma)
{
        loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
        unsigned long size = vma->vm_end - vma->vm_start;
        struct xdp_sock *xs = xdp_sk(sock->sk);
        struct xsk_queue *q = NULL;
        struct xdp_umem *umem;
        unsigned long pfn;
        struct page *qpg;

        if (READ_ONCE(xs->state) != XSK_READY)
                return -EBUSY;

        if (offset == XDP_PGOFF_RX_RING) {
                q = READ_ONCE(xs->rx);
        } else if (offset == XDP_PGOFF_TX_RING) {
                q = READ_ONCE(xs->tx);
        } else {
                umem = READ_ONCE(xs->umem);
                if (!umem)
                        return -EINVAL;

                /* Matches the smp_wmb() in XDP_UMEM_REG */
                smp_rmb();
                if (offset == XDP_UMEM_PGOFF_FILL_RING)
                        q = READ_ONCE(umem->fq);
                else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
                        q = READ_ONCE(umem->cq);
        }

        if (!q)
                return -EINVAL;

        /* Matches the smp_wmb() in xsk_init_queue */
        smp_rmb();
        qpg = virt_to_head_page(q->ring);
        if (size > page_size(qpg))
                return -EINVAL;

        pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
        return remap_pfn_range(vma, vma->vm_start, pfn,
                               size, vma->vm_page_prot);
}

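/* Netdevice notifier: when a device is being unregistered, every socket
 * bound to it is force-unbound, marked with ENETDOWN and woken up so that
 * userspace notices the device went away.
 */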
static int xsk_notifier(struct notifier_block *this,
                        unsigned long msg, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);
        struct sock *sk;

        switch (msg) {
        case NETDEV_UNREGISTER:
                mutex_lock(&net->xdp.lock);
                sk_for_each(sk, &net->xdp.list) {
                        struct xdp_sock *xs = xdp_sk(sk);

                        mutex_lock(&xs->mutex);
                        if (xs->dev == dev) {
                                sk->sk_err = ENETDOWN;
                                if (!sock_flag(sk, SOCK_DEAD))
                                        sk->sk_error_report(sk);

                                xsk_unbind_dev(xs);

                                /* Clear device references in umem. */
                                xdp_umem_clear_dev(xs->umem);
                        }
                        mutex_unlock(&xs->mutex);
                }
                mutex_unlock(&net->xdp.lock);
                break;
        }
        return NOTIFY_DONE;
}

static struct proto xsk_proto = {
        .name =         "XDP",
        .owner =        THIS_MODULE,
        .obj_size =     sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
        .family         = PF_XDP,
        .owner          = THIS_MODULE,
        .release        = xsk_release,
        .bind           = xsk_bind,
        .connect        = sock_no_connect,
        .socketpair     = sock_no_socketpair,
        .accept         = sock_no_accept,
        .getname        = sock_no_getname,
        .poll           = xsk_poll,
        .ioctl          = sock_no_ioctl,
        .listen         = sock_no_listen,
        .shutdown       = sock_no_shutdown,
        .setsockopt     = xsk_setsockopt,
        .getsockopt     = xsk_getsockopt,
        .sendmsg        = xsk_sendmsg,
        .recvmsg        = sock_no_recvmsg,
        .mmap           = xsk_mmap,
        .sendpage       = sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
        struct xdp_sock *xs = xdp_sk(sk);

        if (!sock_flag(sk, SOCK_DEAD))
                return;

        xdp_put_umem(xs->umem);

        sk_refcnt_debug_dec(sk);
}

static int xsk_create(struct net *net, struct socket *sock, int protocol,
                      int kern)
{
        struct sock *sk;
        struct xdp_sock *xs;

        if (!ns_capable(net->user_ns, CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_RAW)
                return -ESOCKTNOSUPPORT;

        if (protocol)
                return -EPROTONOSUPPORT;

        sock->state = SS_UNCONNECTED;

        sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
        if (!sk)
                return -ENOBUFS;

        sock->ops = &xsk_proto_ops;

        sock_init_data(sock, sk);

        sk->sk_family = PF_XDP;

        sk->sk_destruct = xsk_destruct;
        sk_refcnt_debug_inc(sk);

        sock_set_flag(sk, SOCK_RCU_FREE);

        xs = xdp_sk(sk);
        xs->state = XSK_READY;
        mutex_init(&xs->mutex);
        spin_lock_init(&xs->rx_lock);
        spin_lock_init(&xs->tx_completion_lock);

        INIT_LIST_HEAD(&xs->map_list);
        spin_lock_init(&xs->map_list_lock);

        mutex_lock(&net->xdp.lock);
        sk_add_node_rcu(sk, &net->xdp.list);
        mutex_unlock(&net->xdp.lock);

        local_bh_disable();
        sock_prot_inuse_add(net, &xsk_proto, 1);
        local_bh_enable();

        return 0;
}

static const struct net_proto_family xsk_family_ops = {
        .family = PF_XDP,
        .create = xsk_create,
        .owner  = THIS_MODULE,
};

static struct notifier_block xsk_netdev_notifier = {
        .notifier_call  = xsk_notifier,
};

static int __net_init xsk_net_init(struct net *net)
{
        mutex_init(&net->xdp.lock);
        INIT_HLIST_HEAD(&net->xdp.list);
        return 0;
}

static void __net_exit xsk_net_exit(struct net *net)
{
        WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

static struct pernet_operations xsk_net_ops = {
        .init = xsk_net_init,
        .exit = xsk_net_exit,
};

static int __init xsk_init(void)
{
        int err;

        err = proto_register(&xsk_proto, 0 /* no slab */);
        if (err)
                goto out;

        err = sock_register(&xsk_family_ops);
        if (err)
                goto out_proto;

        err = register_pernet_subsys(&xsk_net_ops);
        if (err)
                goto out_sk;

        err = register_netdevice_notifier(&xsk_netdev_notifier);
        if (err)
                goto out_pernet;

        return 0;

out_pernet:
        unregister_pernet_subsys(&xsk_net_ops);
out_sk:
        sock_unregister(PF_XDP);
out_proto:
        proto_unregister(&xsk_proto);
out:
        return err;
}

fs_initcall(xsk_init);