linux/net/xdp/xsk.c
// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets allow a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *            Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <net/xdp_sock.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"

#define TX_BATCH_SIZE 16

static struct xdp_sock *xdp_sk(struct sock *sk)
{
        return (struct xdp_sock *)sk;
}

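/* Used on the XDP_REDIRECT path: a socket can only receive packets through
 * a BPF map (XSKMAP) once both its Rx ring and the umem fill queue have
 * been created.
 */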
bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
        return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) &&
                READ_ONCE(xs->umem->fq);
}

u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
        return xskq_peek_addr(umem->fq, addr);
}
EXPORT_SYMBOL(xsk_umem_peek_addr);

void xsk_umem_discard_addr(struct xdp_umem *umem)
{
        xskq_discard_addr(umem->fq);
}
EXPORT_SYMBOL(xsk_umem_discard_addr);

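/* Copy-mode receive: reserve a frame from the fill queue, copy the packet
 * data into the umem and post an Rx descriptor. On success the xdp_buff is
 * returned to the driver's allocator.
 */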
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
        void *buffer;
        u64 addr;
        int err;

        if (!xskq_peek_addr(xs->umem->fq, &addr) ||
            len > xs->umem->chunk_size_nohr) {
                xs->rx_dropped++;
                return -ENOSPC;
        }

        addr += xs->umem->headroom;

        buffer = xdp_umem_get_data(xs->umem, addr);
        memcpy(buffer, xdp->data, len);
        err = xskq_produce_batch_desc(xs->rx, addr, len);
        if (!err) {
                xskq_discard_addr(xs->umem->fq);
                xdp_return_buff(xdp);
                return 0;
        }

        xs->rx_dropped++;
        return err;
}

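/* Zero-copy receive: the frame already lives in the umem, so only an Rx
 * descriptor referencing the driver-provided handle needs to be posted.
 */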
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
        int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);

        if (err)
                xs->rx_dropped++;

        return err;
}

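/* Receive entry point for native XDP_REDIRECT. The packet must arrive on
 * the device and queue the socket is bound to; it is then handed to the
 * zero-copy or copy path depending on the rxq memory model.
 */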
int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        u32 len;

        if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
                return -EINVAL;

        len = xdp->data_end - xdp->data;

        return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ?
                __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len);
}

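/* Make the posted Rx descriptors visible to userspace and wake up any
 * reader waiting on the socket.
 */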
void xsk_flush(struct xdp_sock *xs)
{
        xskq_produce_flush_desc(xs->rx);
        xs->sk.sk_data_ready(&xs->sk);
}

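/* Receive path for generic (skb-mode) XDP: same copy scheme as __xsk_rcv(),
 * but the Rx ring is flushed and the socket woken immediately for every
 * packet.
 */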
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        u32 len = xdp->data_end - xdp->data;
        void *buffer;
        u64 addr;
        int err;

        if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
                return -EINVAL;

        if (!xskq_peek_addr(xs->umem->fq, &addr) ||
            len > xs->umem->chunk_size_nohr) {
                xs->rx_dropped++;
                return -ENOSPC;
        }

        addr += xs->umem->headroom;

        buffer = xdp_umem_get_data(xs->umem, addr);
        memcpy(buffer, xdp->data, len);
        err = xskq_produce_batch_desc(xs->rx, addr, len);
        if (!err) {
                xskq_discard_addr(xs->umem->fq);
                xsk_flush(xs);
                return 0;
        }

        xs->rx_dropped++;
        return err;
}

void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
        xskq_produce_flush_addr_n(umem->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_umem_complete_tx);

void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
        struct xdp_sock *xs;

        rcu_read_lock();
        list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
                xs->sk.sk_write_space(&xs->sk);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_umem_consume_tx_done);

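/* Driver API for zero-copy Tx: pull the next descriptor from any socket
 * sharing this umem, reserve a completion entry for it, and hand back the
 * DMA address and length the driver should transmit.
 */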
bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len)
{
        struct xdp_desc desc;
        struct xdp_sock *xs;

        rcu_read_lock();
        list_for_each_entry_rcu(xs, &umem->xsk_list, list) {
                if (!xskq_peek_desc(xs->tx, &desc))
                        continue;

                if (xskq_produce_addr_lazy(umem->cq, desc.addr))
                        goto out;

                *dma = xdp_umem_get_dma(umem, desc.addr);
                *len = desc.len;

                xskq_discard_desc(xs->tx);
                rcu_read_unlock();
                return true;
        }

out:
        rcu_read_unlock();
        return false;
}
EXPORT_SYMBOL(xsk_umem_consume_tx);

static int xsk_zc_xmit(struct sock *sk)
{
        struct xdp_sock *xs = xdp_sk(sk);
        struct net_device *dev = xs->dev;

        return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id);
}

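/* skb destructor for copy-mode Tx: once the skb has been consumed, post the
 * frame address to the completion ring so userspace can reuse it.
 */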
static void xsk_destruct_skb(struct sk_buff *skb)
{
        u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
        struct xdp_sock *xs = xdp_sk(skb->sk);
        unsigned long flags;

        spin_lock_irqsave(&xs->tx_completion_lock, flags);
        WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
        spin_unlock_irqrestore(&xs->tx_completion_lock, flags);

        sock_wfree(skb);
}

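/* Copy-mode transmit: drain up to TX_BATCH_SIZE descriptors from the Tx
 * ring, copy each frame into a freshly allocated skb and send it directly
 * to the bound queue with dev_direct_xmit(). Completions are posted from
 * the skb destructor.
 */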
static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
                            size_t total_len)
{
        u32 max_batch = TX_BATCH_SIZE;
        struct xdp_sock *xs = xdp_sk(sk);
        bool sent_frame = false;
        struct xdp_desc desc;
        struct sk_buff *skb;
        int err = 0;

        mutex_lock(&xs->mutex);

        while (xskq_peek_desc(xs->tx, &desc)) {
                char *buffer;
                u64 addr;
                u32 len;

                if (max_batch-- == 0) {
                        err = -EAGAIN;
                        goto out;
                }

                if (xskq_reserve_addr(xs->umem->cq))
                        goto out;

                if (xs->queue_id >= xs->dev->real_num_tx_queues)
                        goto out;

                len = desc.len;
                skb = sock_alloc_send_skb(sk, len, 1, &err);
                if (unlikely(!skb)) {
                        err = -EAGAIN;
                        goto out;
                }

                skb_put(skb, len);
                addr = desc.addr;
                buffer = xdp_umem_get_data(xs->umem, addr);
                err = skb_store_bits(skb, 0, buffer, len);
                if (unlikely(err)) {
                        kfree_skb(skb);
                        goto out;
                }

                skb->dev = xs->dev;
                skb->priority = sk->sk_priority;
                skb->mark = sk->sk_mark;
                skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
                skb->destructor = xsk_destruct_skb;

                err = dev_direct_xmit(skb, xs->queue_id);
                xskq_discard_desc(xs->tx);
                /* Ignore NET_XMIT_CN as packet might have been sent */
                if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
                        /* SKB completed but not sent */
                        err = -EBUSY;
                        goto out;
                }

                sent_frame = true;
        }

out:
        if (sent_frame)
                sk->sk_write_space(sk);

        mutex_unlock(&xs->mutex);
        return err;
}

static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
        bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);

        if (unlikely(!xs->dev))
                return -ENXIO;
        if (unlikely(!(xs->dev->flags & IFF_UP)))
                return -ENETDOWN;
        if (unlikely(!xs->tx))
                return -ENOBUFS;
        if (need_wait)
                return -EOPNOTSUPP;

        return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len);
}

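/* On top of the usual datagram_poll() events, report POLLIN when the Rx
 * ring has descriptors to read and POLLOUT while the Tx ring still has
 * room for more.
 */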
static unsigned int xsk_poll(struct file *file, struct socket *sock,
                             struct poll_table_struct *wait)
{
        unsigned int mask = datagram_poll(file, sock, wait);
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);

        if (xs->rx && !xskq_empty_desc(xs->rx))
                mask |= POLLIN | POLLRDNORM;
        if (xs->tx && !xskq_full_desc(xs->tx))
                mask |= POLLOUT | POLLWRNORM;

        return mask;
}

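/* Allocate one of the rings. The number of entries must be a power of two,
 * and a given ring can only be created once.
 */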
static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
                          bool umem_queue)
{
        struct xsk_queue *q;

        if (entries == 0 || *queue || !is_power_of_2(entries))
                return -EINVAL;

        q = xskq_create(entries, umem_queue);
        if (!q)
                return -ENOMEM;

        /* Make sure queue is ready before it can be seen by others */
        smp_wmb();
        *queue = q;
        return 0;
}

static int xsk_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        struct net *net;

        if (!sk)
                return 0;

        net = sock_net(sk);

        local_bh_disable();
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        local_bh_enable();

        if (xs->dev) {
                /* Wait for driver to stop using the xdp socket. */
                synchronize_net();
                dev_put(xs->dev);
                xs->dev = NULL;
        }

        sock_orphan(sk);
        sock->sk = NULL;

        sk_refcnt_debug_release(sk);
        sock_put(sk);

        return 0;
}

static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
        struct socket *sock;
        int err;

        sock = sockfd_lookup(fd, &err);
        if (!sock)
                return ERR_PTR(-ENOTSOCK);

        if (sock->sk->sk_family != PF_XDP) {
                sockfd_put(sock);
                return ERR_PTR(-ENOPROTOOPT);
        }

        return sock;
}

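/* Bind the socket to a device/queue pair. With XDP_SHARED_UMEM the umem
 * (and its device binding) is inherited from another AF_XDP socket;
 * otherwise the socket's own umem is registered with the driver, which
 * determines whether zero-copy mode is available.
 */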
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
        struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        struct net_device *dev;
        u32 flags, qid;
        int err = 0;

        if (addr_len < sizeof(struct sockaddr_xdp))
                return -EINVAL;
        if (sxdp->sxdp_family != AF_XDP)
                return -EINVAL;

        mutex_lock(&xs->mutex);
        if (xs->dev) {
                err = -EBUSY;
                goto out_release;
        }

        dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
        if (!dev) {
                err = -ENODEV;
                goto out_release;
        }

        if (!xs->rx && !xs->tx) {
                err = -EINVAL;
                goto out_unlock;
        }

        qid = sxdp->sxdp_queue_id;

        if ((xs->rx && qid >= dev->real_num_rx_queues) ||
            (xs->tx && qid >= dev->real_num_tx_queues)) {
                err = -EINVAL;
                goto out_unlock;
        }

        flags = sxdp->sxdp_flags;

        if (flags & XDP_SHARED_UMEM) {
                struct xdp_sock *umem_xs;
                struct socket *sock;

                if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) {
                        /* Cannot specify flags for shared sockets. */
                        err = -EINVAL;
                        goto out_unlock;
                }

                if (xs->umem) {
                        /* We already have our own. */
                        err = -EINVAL;
                        goto out_unlock;
                }

                sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
                if (IS_ERR(sock)) {
                        err = PTR_ERR(sock);
                        goto out_unlock;
                }

                umem_xs = xdp_sk(sock->sk);
                if (!umem_xs->umem) {
                        /* No umem to inherit. */
                        err = -EBADF;
                        sockfd_put(sock);
                        goto out_unlock;
                } else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
                        err = -EINVAL;
                        sockfd_put(sock);
                        goto out_unlock;
                }

                xdp_get_umem(umem_xs->umem);
                xs->umem = umem_xs->umem;
                sockfd_put(sock);
        } else if (!xs->umem || !xdp_umem_validate_queues(xs->umem)) {
                err = -EINVAL;
                goto out_unlock;
        } else {
                /* This xsk has its own umem. */
                xskq_set_umem(xs->umem->fq, &xs->umem->props);
                xskq_set_umem(xs->umem->cq, &xs->umem->props);

                err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
                if (err)
                        goto out_unlock;
        }

        xs->dev = dev;
        xs->zc = xs->umem->zc;
        xs->queue_id = qid;
        xskq_set_umem(xs->rx, &xs->umem->props);
        xskq_set_umem(xs->tx, &xs->umem->props);
        xdp_add_sk_umem(xs->umem, xs);

out_unlock:
        if (err)
                dev_put(dev);
out_release:
        mutex_unlock(&xs->mutex);
        return err;
}

static int xsk_setsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        int err;

        if (level != SOL_XDP)
                return -ENOPROTOOPT;

        switch (optname) {
        case XDP_RX_RING:
        case XDP_TX_RING:
        {
                struct xsk_queue **q;
                int entries;

                if (optlen < sizeof(entries))
                        return -EINVAL;
                if (copy_from_user(&entries, optval, sizeof(entries)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
                err = xsk_init_queue(entries, q, false);
                mutex_unlock(&xs->mutex);
                return err;
        }
        case XDP_UMEM_REG:
        {
                struct xdp_umem_reg mr;
                struct xdp_umem *umem;

                if (copy_from_user(&mr, optval, sizeof(mr)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                if (xs->umem) {
                        mutex_unlock(&xs->mutex);
                        return -EBUSY;
                }

                umem = xdp_umem_create(&mr);
                if (IS_ERR(umem)) {
                        mutex_unlock(&xs->mutex);
                        return PTR_ERR(umem);
                }

                /* Make sure umem is ready before it can be seen by others */
                smp_wmb();
                xs->umem = umem;
                mutex_unlock(&xs->mutex);
                return 0;
        }
        case XDP_UMEM_FILL_RING:
        case XDP_UMEM_COMPLETION_RING:
        {
                struct xsk_queue **q;
                int entries;

                if (copy_from_user(&entries, optval, sizeof(entries)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                if (!xs->umem) {
                        mutex_unlock(&xs->mutex);
                        return -EINVAL;
                }

                q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq :
                        &xs->umem->cq;
                err = xsk_init_queue(entries, q, true);
                mutex_unlock(&xs->mutex);
                return err;
        }
        default:
                break;
        }

        return -ENOPROTOOPT;
}

static int xsk_getsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        int len;

        if (level != SOL_XDP)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case XDP_STATISTICS:
        {
                struct xdp_statistics stats;

                if (len < sizeof(stats))
                        return -EINVAL;

                mutex_lock(&xs->mutex);
                stats.rx_dropped = xs->rx_dropped;
                stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
                stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
                mutex_unlock(&xs->mutex);

                if (copy_to_user(optval, &stats, sizeof(stats)))
                        return -EFAULT;
                if (put_user(sizeof(stats), optlen))
                        return -EFAULT;

                return 0;
        }
        case XDP_MMAP_OFFSETS:
        {
                struct xdp_mmap_offsets off;

                if (len < sizeof(off))
                        return -EINVAL;

                off.rx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
                off.rx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
                off.rx.desc     = offsetof(struct xdp_rxtx_ring, desc);
                off.tx.producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
                off.tx.consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
                off.tx.desc     = offsetof(struct xdp_rxtx_ring, desc);

                off.fr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
                off.fr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
                off.fr.desc     = offsetof(struct xdp_umem_ring, desc);
                off.cr.producer = offsetof(struct xdp_umem_ring, ptrs.producer);
                off.cr.consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
                off.cr.desc     = offsetof(struct xdp_umem_ring, desc);

                len = sizeof(off);
                if (copy_to_user(optval, &off, len))
                        return -EFAULT;
                if (put_user(len, optlen))
                        return -EFAULT;

                return 0;
        }
        default:
                break;
        }

        return -EOPNOTSUPP;
}

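/* Map one of the rings into userspace. The page offset selects which ring
 * (Rx, Tx, fill or completion) is being mapped, and the requested size may
 * not exceed the ring's allocation.
 */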
static int xsk_mmap(struct file *file, struct socket *sock,
                    struct vm_area_struct *vma)
{
        loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
        unsigned long size = vma->vm_end - vma->vm_start;
        struct xdp_sock *xs = xdp_sk(sock->sk);
        struct xsk_queue *q = NULL;
        struct xdp_umem *umem;
        unsigned long pfn;
        struct page *qpg;

        if (offset == XDP_PGOFF_RX_RING) {
                q = READ_ONCE(xs->rx);
        } else if (offset == XDP_PGOFF_TX_RING) {
                q = READ_ONCE(xs->tx);
        } else {
                umem = READ_ONCE(xs->umem);
                if (!umem)
                        return -EINVAL;

                if (offset == XDP_UMEM_PGOFF_FILL_RING)
                        q = READ_ONCE(umem->fq);
                else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
                        q = READ_ONCE(umem->cq);
        }

        if (!q)
                return -EINVAL;

        qpg = virt_to_head_page(q->ring);
        if (size > (PAGE_SIZE << compound_order(qpg)))
                return -EINVAL;

        pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
        return remap_pfn_range(vma, vma->vm_start, pfn,
                               size, vma->vm_page_prot);
}

static struct proto xsk_proto = {
        .name =         "XDP",
        .owner =        THIS_MODULE,
        .obj_size =     sizeof(struct xdp_sock),
};

static const struct proto_ops xsk_proto_ops = {
        .family         = PF_XDP,
        .owner          = THIS_MODULE,
        .release        = xsk_release,
        .bind           = xsk_bind,
        .connect        = sock_no_connect,
        .socketpair     = sock_no_socketpair,
        .accept         = sock_no_accept,
        .getname        = sock_no_getname,
        .poll           = xsk_poll,
        .ioctl          = sock_no_ioctl,
        .listen         = sock_no_listen,
        .shutdown       = sock_no_shutdown,
        .setsockopt     = xsk_setsockopt,
        .getsockopt     = xsk_getsockopt,
        .sendmsg        = xsk_sendmsg,
        .recvmsg        = sock_no_recvmsg,
        .mmap           = xsk_mmap,
        .sendpage       = sock_no_sendpage,
};

static void xsk_destruct(struct sock *sk)
{
        struct xdp_sock *xs = xdp_sk(sk);

        if (!sock_flag(sk, SOCK_DEAD))
                return;

        xskq_destroy(xs->rx);
        xskq_destroy(xs->tx);
        xdp_del_sk_umem(xs->umem, xs);
        xdp_put_umem(xs->umem);

        sk_refcnt_debug_dec(sk);
}

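/* Create a new AF_XDP socket. Requires CAP_NET_RAW, and only SOCK_RAW with
 * protocol 0 is supported.
 */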
static int xsk_create(struct net *net, struct socket *sock, int protocol,
                      int kern)
{
        struct sock *sk;
        struct xdp_sock *xs;

        if (!ns_capable(net->user_ns, CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_RAW)
                return -ESOCKTNOSUPPORT;

        if (protocol)
                return -EPROTONOSUPPORT;

        sock->state = SS_UNCONNECTED;

        sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
        if (!sk)
                return -ENOBUFS;

        sock->ops = &xsk_proto_ops;

        sock_init_data(sock, sk);

        sk->sk_family = PF_XDP;

        sk->sk_destruct = xsk_destruct;
        sk_refcnt_debug_inc(sk);

        sock_set_flag(sk, SOCK_RCU_FREE);

        xs = xdp_sk(sk);
        mutex_init(&xs->mutex);
        spin_lock_init(&xs->tx_completion_lock);

        local_bh_disable();
        sock_prot_inuse_add(net, &xsk_proto, 1);
        local_bh_enable();

        return 0;
}

static const struct net_proto_family xsk_family_ops = {
        .family = PF_XDP,
        .create = xsk_create,
        .owner  = THIS_MODULE,
};

static int __init xsk_init(void)
{
        int err;

        err = proto_register(&xsk_proto, 0 /* no slab */);
        if (err)
                goto out;

        err = sock_register(&xsk_family_ops);
        if (err)
                goto out_proto;

        return 0;

out_proto:
        proto_unregister(&xsk_proto);
out:
        return err;
}

fs_initcall(xsk_init);