linux/net/xdp/xsk.c
// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *	      Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

#define TX_BATCH_SIZE 16

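/* True if the socket has everything an XDP program needs to redirect
 * into it via an XSKMAP: an rx ring, a umem, and the umem's fill queue.
 */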
bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
	return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
		READ_ONCE(xs->umem->fq);
}

bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt)
{
	return xskq_has_addrs(umem->fq, cnt);
}
EXPORT_SYMBOL(xsk_umem_has_addrs);

u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
	return xskq_peek_addr(umem->fq, addr);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);

void xsk_umem_discard_addr(struct xdp_umem *umem)
{
	xskq_discard_addr(umem->fq);
}
EXPORT_SYMBOL(xsk_umem_discard_addr);

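/* Copy-mode receive: grab a frame address from the fill queue, copy the
 * packet (and any metadata preceding xdp->data) into the umem frame, and
 * post a descriptor pointing at the payload on the rx ring. The fill
 * entry is only consumed once the descriptor has been produced.
 */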
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	void *to_buf, *from_buf;
	u32 metalen;
	u64 addr;
	int err;

	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
		xs->rx_dropped++;
		return -ENOSPC;
	}

	addr += xs->umem->headroom;

	if (unlikely(xdp_data_meta_unsupported(xdp))) {
		from_buf = xdp->data;
		metalen = 0;
	} else {
		from_buf = xdp->data_meta;
		metalen = xdp->data - xdp->data_meta;
	}

	to_buf = xdp_umem_get_data(xs->umem, addr);
	memcpy(to_buf, from_buf, len + metalen);
	addr += metalen;
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (!err) {
		xskq_discard_addr(xs->umem->fq);
		xdp_return_buff(xdp);
		return 0;
	}

	xs->rx_dropped++;
	return err;
}

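/* Zero-copy receive: the packet already sits in a umem frame owned by
 * this socket, so only a descriptor for the existing buffer handle needs
 * to be produced on the rx ring.
 */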
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
	int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);

	if (err)
		xs->rx_dropped++;

	return err;
}

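/* Entry point for XDP_REDIRECT from the driver. The socket must be bound
 * to the receiving netdev and queue; the memory model of the rx queue
 * selects between the zero-copy and the copy path.
 */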
int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 len;

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
		return -EINVAL;

	len = xdp->data_end - xdp->data;

	return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
		__xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
}

void xsk_flush(struct xdp_sock *xs)
{
	xskq_produce_flush_desc(xs->rx);
	xs->sk.sk_data_ready(&xs->sk);
}

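/* Receive path for generic (skb-mode) XDP. Unlike the native path, generic
 * receive is not guaranteed to be serialized for a given socket, so the
 * fill and rx queues are protected by rx_lock, and the copy and the flush
 * happen in one go.
 */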
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
	u32 metalen = xdp->data - xdp->data_meta;
	u32 len = xdp->data_end - xdp->data;
	void *buffer;
	u64 addr;
	int err;

	spin_lock_bh(&xs->rx_lock);

	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) {
		err = -EINVAL;
		goto out_unlock;
	}

	if (!xskq_peek_addr(xs->umem->fq, &addr) ||
	    len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
		err = -ENOSPC;
		goto out_drop;
	}

	addr += xs->umem->headroom;

	buffer = xdp_umem_get_data(xs->umem, addr);
	memcpy(buffer, xdp->data_meta, len + metalen);
	addr += metalen;
	err = xskq_produce_batch_desc(xs->rx, addr, len);
	if (err)
		goto out_drop;

	xskq_discard_addr(xs->umem->fq);
	xskq_produce_flush_desc(xs->rx);

	spin_unlock_bh(&xs->rx_lock);

	xs->sk.sk_data_ready(&xs->sk);
	return 0;

out_drop:
	xs->rx_dropped++;
out_unlock:
	spin_unlock_bh(&xs->rx_lock);
	return err;
}

void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
	xskq_produce_flush_addr_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);

void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		xs->sk.sk_write_space(&xs->sk);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);

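/* Driver-side TX: pick the next descriptor from any socket sharing this
 * umem. An entry for the frame is written to the completion queue up
 * front, but only made visible to userspace once xsk_umem_complete_tx()
 * flushes it, so completing the transmit later cannot fail.
 */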
bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc)
{
	struct xdp_sock *xs;

	rcu_read_lock();
	list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
		if (!xskq_peek_desc(xs->tx, desc))
			continue;

		if (xskq_produce_addr_lazy(umem->cq, desc->addr))
			goto out;

		xskq_discard_desc(xs->tx);
		rcu_read_unlock();
		return true;
	}

out:
	rcu_read_unlock();
	return false;
}
EXPORT_SYMBOL(xsk_umem_consume_tx);

static int xsk_zc_xmit(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev = xs->dev;

	return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
}

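/* skb destructor for copy-mode TX: once the skb is freed, post the frame
 * address back on the completion ring so userspace can reuse the frame.
 * Destructors can run in any context, hence the irqsave lock.
 */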
static void xsk_destruct_skb(struct sk_buff *skb)
{
	u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
	struct xdp_sock *xs = xdp_sk(skb->sk);
	unsigned long flags;

	spin_lock_irqsave(&xs->tx_completion_lock, flags);
	WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
	spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

	sock_wfree(skb);
}

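/* Copy-mode TX: turn up to TX_BATCH_SIZE descriptors from the tx ring
 * into skbs and send them directly on the bound queue. The frame data is
 * copied out of the umem, so the tx descriptor can be released as soon
 * as the skb has been handed to the driver; the completion itself is
 * posted from xsk_destruct_skb() when the skb is freed.
 */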
static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
			    size_t total_len)
{
	u32 max_batch = TX_BATCH_SIZE;
	struct xdp_sock *xs = xdp_sk(sk);
	bool sent_frame = false;
	struct xdp_desc desc;
	struct sk_buff *skb;
	int err = 0;

	mutex_lock(&xs->mutex);

	if (xs->queue_id >= xs->dev->real_num_tx_queues)
		goto out;

	while (xskq_peek_desc(xs->tx, &desc)) {
		char *buffer;
		u64 addr;
		u32 len;

		if (max_batch-- == 0) {
			err = -EAGAIN;
			goto out;
		}

		len = desc.len;
		skb = sock_alloc_send_skb(sk, len, 1, &err);
		if (unlikely(!skb)) {
			err = -EAGAIN;
			goto out;
		}

		skb_put(skb, len);
		addr = desc.addr;
		buffer = xdp_umem_get_data(xs->umem, addr);
		err = skb_store_bits(skb, 0, buffer, len);
		if (unlikely(err) || xskq_reserve_addr(xs->umem->cq)) {
			kfree_skb(skb);
			goto out;
		}

		skb->dev = xs->dev;
		skb->priority = sk->sk_priority;
		skb->mark = sk->sk_mark;
		skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
		skb->destructor = xsk_destruct_skb;

		err = dev_direct_xmit(skb, xs->queue_id);
		xskq_discard_desc(xs->tx);
		/* Ignore NET_XMIT_CN as packet might have been sent */
		if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
			/* SKB completed but not sent */
			err = -EBUSY;
			goto out;
		}

		sent_frame = true;
	}

out:
	if (sent_frame)
		sk->sk_write_space(sk);

	mutex_unlock(&xs->mutex);
	return err;
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
	bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (unlikely(!xs->dev))
		return -ENXIO;
	if (unlikely(!(xs->dev->flags & IFF_UP)))
		return -ENETDOWN;
	if (unlikely(!xs->tx))
		return -ENOBUFS;
	if (need_wait)
		return -EOPNOTSUPP;

	return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
}

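/* The socket is readable as soon as at least one rx descriptor is
 * outstanding, and writable as long as the tx ring has room for one more
 * descriptor; datagram_poll() supplies the usual error/hangup bits.
 */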
static unsigned int xsk_poll(struct file *file, struct socket *sock,
			     struct poll_table_struct *wait)
{
	unsigned int mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);

	if (xs->rx && !xskq_empty_desc(xs->rx))
		mask |= POLLIN | POLLRDNORM;
	if (xs->tx && !xskq_full_desc(xs->tx))
		mask |= POLLOUT | POLLWRNORM;

	return mask;
}

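/* Ring sizes must be a power of two so that ring indices can wrap with a
 * simple mask, and each queue may only be created once per socket.
 */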
static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
			  bool umem_queue)
{
	struct xsk_queue *q;

	if (entries == 0 || *queue || !is_power_of_2(entries))
		return -EINVAL;

	q = xskq_create(entries, umem_queue);
	if (!q)
		return -ENOMEM;

	/* Make sure queue is ready before it can be seen by others */
	smp_wmb();
	*queue = q;
	return 0;
}

static void xsk_unbind_dev(struct xdp_sock *xs)
{
	struct net_device *dev = xs->dev;

	if (!dev || xs->state != XSK_BOUND)
		return;

	xs->state = XSK_UNBOUND;

	/* Wait for driver to stop using the xdp socket. */
	xdp_del_sk_umem(xs->umem, xs);
	xs->dev = NULL;
	synchronize_net();
	dev_put(dev);
}

static int xsk_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net *net;

	if (!sk)
		return 0;

	net = sock_net(sk);

	mutex_lock(&net->xdp.lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	local_bh_enable();

	xsk_unbind_dev(xs);

	xskq_destroy(xs->rx);
	xskq_destroy(xs->tx);

	sock_orphan(sk);
	sock->sk = NULL;

	sk_refcnt_debug_release(sk);
	sock_put(sk);

	return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
	struct socket *sock;
	int err;

	sock = sockfd_lookup(fd, &err);
	if (!sock)
		return ERR_PTR(-ENOTSOCK);

	if (sock->sk->sk_family != PF_XDP) {
		sockfd_put(sock);
		return ERR_PTR(-ENOPROTOOPT);
	}

	return sock;
}

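/* Bind the socket to a <netdev, queue_id> pair, either with its own umem
 * or, with XDP_SHARED_UMEM, by inheriting the umem of an already-bound
 * socket. A minimal userspace sketch (error handling omitted; "eth0" and
 * queue 0 are placeholders):
 *
 *	struct sockaddr_xdp sxdp = {
 *		.sxdp_family = AF_XDP,
 *		.sxdp_ifindex = if_nametoindex("eth0"),
 *		.sxdp_queue_id = 0,
 *		.sxdp_flags = XDP_COPY,
 *	};
 *	bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
 *
 * The umem and at least one of the rx/tx rings must have been set up
 * beforehand, otherwise bind() fails with -EINVAL.
 */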
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	flags = sxdp->sxdp_flags;
	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY))
		return -EINVAL;

	rtnl_lock();
	mutex_lock(&xs->mutex);
	if (xs->state != XSK_READY) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We already have our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!umem_xs->umem) {
			/* No umem to inherit. */
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
			err = -EINVAL;
			sockfd_put(sock);
			goto out_unlock;
		}

		xdp_get_umem(umem_xs->umem);
		xs->umem = umem_xs->umem;
		sockfd_put(sock);
	} else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xskq_set_umem(xs->umem->fq, xs->umem->size,
			      xs->umem->chunk_mask);
		xskq_set_umem(xs->umem->cq, xs->umem->size,
			      xs->umem->chunk_mask);

		err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
		if (err)
			goto out_unlock;
	}

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->queue_id = qid;
	xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
	xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
	xdp_add_sk_umem(xs->umem, xs);

out_unlock:
	if (err)
		dev_put(dev);
	else
		xs->state = XSK_BOUND;
out_release:
	mutex_unlock(&xs->mutex);
	rtnl_unlock();
	return err;
}

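/* All AF_XDP setup is driven through SOL_XDP socket options. The typical
 * order is XDP_UMEM_REG first, then the fill/completion rings (which hang
 * off the umem) and the rx/tx rings, and finally bind(). Each ring may
 * only be sized once per socket.
 */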
static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		struct xdp_umem_reg mr;
		struct xdp_umem *umem;

		if (copy_from_user(&mr, optval, sizeof(mr)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY || xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		xs->umem = umem;
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (copy_from_user(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		if (!xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EINVAL;
		}

		q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
			&xs->umem->cq;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats;

		if (len < sizeof(stats))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, sizeof(stats)))
			return -EFAULT;
		if (put_user(sizeof(stats), optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;

		if (len < sizeof(off))
			return -EINVAL;

		off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
		off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
		off.rx.desc	= offsetof(struct xdp_rxtx_ring, desc);
		off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
		off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
		off.tx.desc	= offsetof(struct xdp_rxtx_ring, desc);

		off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
		off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
		off.fr.desc	= offsetof(struct xdp_umem_ring, desc);
		off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
		off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
		off.cr.desc	= offsetof(struct xdp_umem_ring, desc);

		len = sizeof(off);
		if (copy_to_user(optval, &off, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_OPTIONS:
	{
		struct xdp_options opts = {};

		if (len < sizeof(opts))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		if (xs->zc)
			opts.flags |= XDP_OPTIONS_ZEROCOPY;
		mutex_unlock(&xs->mutex);

		len = sizeof(opts);
		if (copy_to_user(optval, &opts, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

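/* Rings are mapped into userspace with the page offset selecting which
 * ring is meant. A userspace sketch for the rx ring (offsets come from
 * the XDP_MMAP_OFFSETS getsockopt; "rx_ring_size" is a placeholder for
 * the descriptor area size):
 *
 *	void *rx = mmap(NULL, off.rx.desc + rx_ring_size,
 *			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
 *			fd, XDP_PGOFF_RX_RING);
 *
 * The fill and completion rings can only be mapped once a umem has been
 * registered on the socket.
 */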
static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	struct xsk_queue *q = NULL;
	struct xdp_umem *umem;
	unsigned long pfn;
	struct page *qpg;

	if (xs->state != XSK_READY)
		return -EBUSY;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		umem = READ_ONCE(xs->umem);
		if (!umem)
			return -EINVAL;

		/* Matches the smp_wmb() in XDP_UMEM_REG */
		smp_rmb();
		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = READ_ONCE(umem->fq);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = READ_ONCE(umem->cq);
	}

	if (!q)
		return -EINVAL;

	/* Matches the smp_wmb() in xsk_init_queue */
	smp_rmb();
	qpg = virt_to_head_page(q->ring);
	if (size > (PAGE_SIZE << compound_order(qpg)))
		return -EINVAL;

	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn,
			       size, vma->vm_page_prot);
}

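/* Netdevice notifier: when a device that a socket is bound to is
 * unregistered, report ENETDOWN on the socket and drop all references to
 * the device so it can go away.
 */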
static int xsk_notifier(struct notifier_block *this,
			unsigned long msg, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);
	struct sock *sk;

	switch (msg) {
	case NETDEV_UNREGISTER:
		mutex_lock(&net->xdp.lock);
		sk_for_each(sk, &net->xdp.list) {
			struct xdp_sock *xs = xdp_sk(sk);

			mutex_lock(&xs->mutex);
			if (xs->dev == dev) {
				sk->sk_err = ENETDOWN;
				if (!sock_flag(sk, SOCK_DEAD))
					sk->sk_error_report(sk);

				xsk_unbind_dev(xs);

				/* Clear device references in umem. */
				xdp_umem_clear_dev(xs->umem);
			}
			mutex_unlock(&xs->mutex);
		}
		mutex_unlock(&net->xdp.lock);
		break;
	}
	return NOTIFY_DONE;
}

static struct proto xsk_proto = {
	.name =		"XDP",
	.owner =	THIS_MODULE,
	.obj_size =	sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
	.family		= PF_XDP,
	.owner		= THIS_MODULE,
	.release	= xsk_release,
	.bind		= xsk_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.poll		= xsk_poll,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= xsk_setsockopt,
	.getsockopt	= xsk_getsockopt,
	.sendmsg	= xsk_sendmsg,
	.recvmsg	= sock_no_recvmsg,
	.mmap		= xsk_mmap,
	.sendpage	= sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
	struct xdp_sock *xs = xdp_sk(sk);

	if (!sock_flag(sk, SOCK_DEAD))
		return;

	xdp_put_umem(xs->umem);

	sk_refcnt_debug_dec(sk);
}

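/* Create a new AF_XDP socket. Requires CAP_NET_RAW in the owning user
 * namespace, type SOCK_RAW, and protocol 0, since no wire protocol is
 * spoken on the socket itself.
 */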
static int xsk_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	struct sock *sk;
	struct xdp_sock *xs;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_RAW)
		return -ESOCKTNOSUPPORT;

	if (protocol)
		return -EPROTONOSUPPORT;

	sock->state = SS_UNCONNECTED;

	sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock->ops = &xsk_proto_ops;

	sock_init_data(sock, sk);

	sk->sk_family = PF_XDP;

	sk->sk_destruct = xsk_destruct;
	sk_refcnt_debug_inc(sk);

	sock_set_flag(sk, SOCK_RCU_FREE);

	xs = xdp_sk(sk);
	xs->state = XSK_READY;
	mutex_init(&xs->mutex);
	spin_lock_init(&xs->rx_lock);
	spin_lock_init(&xs->tx_completion_lock);

	mutex_lock(&net->xdp.lock);
	sk_add_node_rcu(sk, &net->xdp.list);
	mutex_unlock(&net->xdp.lock);

	local_bh_disable();
	sock_prot_inuse_add(net, &xsk_proto, 1);
	local_bh_enable();

	return 0;
}

static const struct net_proto_family xsk_family_ops = {
	.family = PF_XDP,
	.create = xsk_create,
	.owner	= THIS_MODULE,
};

static struct notifier_block xsk_netdev_notifier = {
	.notifier_call	= xsk_notifier,
};

static int __net_init xsk_net_init(struct net *net)
{
	mutex_init(&net->xdp.lock);
	INIT_HLIST_HEAD(&net->xdp.list);
	return 0;
}

static void __net_exit xsk_net_exit(struct net *net)
{
	WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

static struct pernet_operations xsk_net_ops = {
	.init = xsk_net_init,
	.exit = xsk_net_exit,
};

static int __init xsk_init(void)
{
	int err;

	err = proto_register(&xsk_proto, 0 /* no slab */);
	if (err)
		goto out;

	err = sock_register(&xsk_family_ops);
	if (err)
		goto out_proto;

	err = register_pernet_subsys(&xsk_net_ops);
	if (err)
		goto out_sk;

	err = register_netdevice_notifier(&xsk_netdev_notifier);
	if (err)
		goto out_pernet;

	return 0;

out_pernet:
	unregister_pernet_subsys(&xsk_net_ops);
out_sk:
	sock_unregister(PF_XDP);
out_proto:
	proto_unregister(&xsk_proto);
out:
	return err;
}

fs_initcall(xsk_init);