linux/net/ipv4/udp.c
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4 *              operating system.  INET is implemented using the  BSD Socket
   5 *              interface as the means of communication with the user level.
   6 *
   7 *              The User Datagram Protocol (UDP).
   8 *
   9 * Authors:     Ross Biro
  10 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  11 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  12 *              Alan Cox, <alan@lxorguk.ukuu.org.uk>
  13 *              Hirokazu Takahashi, <taka@valinux.co.jp>
  14 *
  15 * Fixes:
  16 *              Alan Cox        :       verify_area() calls
  17 *              Alan Cox        :       stopped close while in use off icmp
  18 *                                      messages. Not a fix but a botch that
  19 *                                      for udp at least is 'valid'.
  20 *              Alan Cox        :       Fixed icmp handling properly
  21 *              Alan Cox        :       Correct error for oversized datagrams
  22 *              Alan Cox        :       Tidied select() semantics.
  23 *              Alan Cox        :       udp_err() fixed properly, also now
  24 *                                      select and read wake correctly on errors
  25 *              Alan Cox        :       udp_send verify_area moved to avoid mem leak
  26 *              Alan Cox        :       UDP can count its memory
  27 *              Alan Cox        :       send to an unknown connection causes
  28 *                                      an ECONNREFUSED off the icmp, but
  29 *                                      does NOT close.
  30 *              Alan Cox        :       Switched to new sk_buff handlers. No more backlog!
  31 *              Alan Cox        :       Using generic datagram code. Even smaller and the PEEK
  32 *                                      bug no longer crashes it.
  33 *              Fred Van Kempen :       Net2e support for sk->broadcast.
  34 *              Alan Cox        :       Uses skb_free_datagram
  35 *              Alan Cox        :       Added get/set sockopt support.
  36 *              Alan Cox        :       Broadcasting without option set returns EACCES.
  37 *              Alan Cox        :       No wakeup calls. Instead we now use the callbacks.
  38 *              Alan Cox        :       Use ip_tos and ip_ttl
  39 *              Alan Cox        :       SNMP Mibs
  40 *              Alan Cox        :       MSG_DONTROUTE, and 0.0.0.0 support.
  41 *              Matt Dillon     :       UDP length checks.
  42 *              Alan Cox        :       Smarter af_inet used properly.
  43 *              Alan Cox        :       Use new kernel side addressing.
  44 *              Alan Cox        :       Incorrect return on truncated datagram receive.
  45 *      Arnt Gulbrandsen        :       New udp_send and stuff
  46 *              Alan Cox        :       Cache last socket
  47 *              Alan Cox        :       Route cache
  48 *              Jon Peatfield   :       Minor efficiency fix to sendto().
  49 *              Mike Shaver     :       RFC1122 checks.
  50 *              Alan Cox        :       Nonblocking error fix.
  51 *      Willy Konynenberg       :       Transparent proxying support.
  52 *              Mike McLagan    :       Routing by source
  53 *              David S. Miller :       New socket lookup architecture.
  54 *                                      Last socket cache retained as it
  55 *                                      does have a high hit rate.
  56 *              Olaf Kirch      :       Don't linearise iovec on sendmsg.
  57 *              Andi Kleen      :       Some cleanups, cache destination entry
  58 *                                      for connect.
  59 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  60 *              Melvin Smith    :       Check msg_name not msg_namelen in sendto(),
  61 *                                      return ENOTCONN for unconnected sockets (POSIX)
  62 *              Janos Farkas    :       don't deliver multi/broadcasts to a different
  63 *                                      bound-to-device socket
  64 *      Hirokazu Takahashi      :       HW checksumming for outgoing UDP
  65 *                                      datagrams.
  66 *      Hirokazu Takahashi      :       sendfile() on UDP works now.
  67 *              Arnaldo C. Melo :       convert /proc/net/udp to seq_file
  68 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  69 *      Alexey Kuznetsov:               allow both IPv4 and IPv6 sockets to bind
  70 *                                      a single port at the same time.
   71 *      Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
  72 *      James Chapman           :       Add L2TP encapsulation type.
  73 */
  74
  75#define pr_fmt(fmt) "UDP: " fmt
  76
  77#include <linux/uaccess.h>
  78#include <asm/ioctls.h>
  79#include <linux/memblock.h>
  80#include <linux/highmem.h>
  81#include <linux/swap.h>
  82#include <linux/types.h>
  83#include <linux/fcntl.h>
  84#include <linux/module.h>
  85#include <linux/socket.h>
  86#include <linux/sockios.h>
  87#include <linux/igmp.h>
  88#include <linux/inetdevice.h>
  89#include <linux/in.h>
  90#include <linux/errno.h>
  91#include <linux/timer.h>
  92#include <linux/mm.h>
  93#include <linux/inet.h>
  94#include <linux/netdevice.h>
  95#include <linux/slab.h>
  96#include <net/tcp_states.h>
  97#include <linux/skbuff.h>
  98#include <linux/proc_fs.h>
  99#include <linux/seq_file.h>
 100#include <net/net_namespace.h>
 101#include <net/icmp.h>
 102#include <net/inet_hashtables.h>
 103#include <net/ip_tunnels.h>
 104#include <net/route.h>
 105#include <net/checksum.h>
 106#include <net/xfrm.h>
 107#include <trace/events/udp.h>
 108#include <linux/static_key.h>
 109#include <linux/btf_ids.h>
 110#include <trace/events/skb.h>
 111#include <net/busy_poll.h>
 112#include "udp_impl.h"
 113#include <net/sock_reuseport.h>
 114#include <net/addrconf.h>
 115#include <net/udp_tunnel.h>
 116#if IS_ENABLED(CONFIG_IPV6)
 117#include <net/ipv6_stubs.h>
 118#endif
 119
 120struct udp_table udp_table __read_mostly;
 121EXPORT_SYMBOL(udp_table);
 122
 123long sysctl_udp_mem[3] __read_mostly;
 124EXPORT_SYMBOL(sysctl_udp_mem);
 125
 126atomic_long_t udp_memory_allocated;
 127EXPORT_SYMBOL(udp_memory_allocated);
 128
 129#define MAX_UDP_PORTS 65536
 130#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
 131
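/* Scan one primary hash chain for conflicts with local port 'num'.
 * With 'bitmap' (used by the random port search in udp_lib_get_port), every
 * port found in use on this chain is marked in the bitmap and 0 is returned;
 * without a bitmap, return 1 if 'num' conflicts with an existing socket and
 * 0 if it is free or can be shared via compatible SO_REUSEPORT.
 */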
 132static int udp_lib_lport_inuse(struct net *net, __u16 num,
 133                               const struct udp_hslot *hslot,
 134                               unsigned long *bitmap,
 135                               struct sock *sk, unsigned int log)
 136{
 137        struct sock *sk2;
 138        kuid_t uid = sock_i_uid(sk);
 139
 140        sk_for_each(sk2, &hslot->head) {
 141                if (net_eq(sock_net(sk2), net) &&
 142                    sk2 != sk &&
 143                    (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
 144                    (!sk2->sk_reuse || !sk->sk_reuse) &&
 145                    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
 146                     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 147                    inet_rcv_saddr_equal(sk, sk2, true)) {
 148                        if (sk2->sk_reuseport && sk->sk_reuseport &&
 149                            !rcu_access_pointer(sk->sk_reuseport_cb) &&
 150                            uid_eq(uid, sock_i_uid(sk2))) {
 151                                if (!bitmap)
 152                                        return 0;
 153                        } else {
 154                                if (!bitmap)
 155                                        return 1;
 156                                __set_bit(udp_sk(sk2)->udp_port_hash >> log,
 157                                          bitmap);
 158                        }
 159                }
 160        }
 161        return 0;
 162}
 163
 164/*
 165 * Note: we still hold spinlock of primary hash chain, so no other writer
 166 * can insert/delete a socket with local_port == num
 167 */
 168static int udp_lib_lport_inuse2(struct net *net, __u16 num,
 169                                struct udp_hslot *hslot2,
 170                                struct sock *sk)
 171{
 172        struct sock *sk2;
 173        kuid_t uid = sock_i_uid(sk);
 174        int res = 0;
 175
 176        spin_lock(&hslot2->lock);
 177        udp_portaddr_for_each_entry(sk2, &hslot2->head) {
 178                if (net_eq(sock_net(sk2), net) &&
 179                    sk2 != sk &&
 180                    (udp_sk(sk2)->udp_port_hash == num) &&
 181                    (!sk2->sk_reuse || !sk->sk_reuse) &&
 182                    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
 183                     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 184                    inet_rcv_saddr_equal(sk, sk2, true)) {
 185                        if (sk2->sk_reuseport && sk->sk_reuseport &&
 186                            !rcu_access_pointer(sk->sk_reuseport_cb) &&
 187                            uid_eq(uid, sock_i_uid(sk2))) {
 188                                res = 0;
 189                        } else {
 190                                res = 1;
 191                        }
 192                        break;
 193                }
 194        }
 195        spin_unlock(&hslot2->lock);
 196        return res;
 197}
 198
 199static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
 200{
 201        struct net *net = sock_net(sk);
 202        kuid_t uid = sock_i_uid(sk);
 203        struct sock *sk2;
 204
 205        sk_for_each(sk2, &hslot->head) {
 206                if (net_eq(sock_net(sk2), net) &&
 207                    sk2 != sk &&
 208                    sk2->sk_family == sk->sk_family &&
 209                    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
 210                    (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
 211                    (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 212                    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
 213                    inet_rcv_saddr_equal(sk, sk2, false)) {
 214                        return reuseport_add_sock(sk, sk2,
 215                                                  inet_rcv_saddr_any(sk));
 216                }
 217        }
 218
 219        return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
 220}
 221
 222/**
 223 *  udp_lib_get_port  -  UDP/-Lite port lookup for IPv4 and IPv6
 224 *
 225 *  @sk:          socket struct in question
 226 *  @snum:        port number to look up
 227 *  @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
 228 *                   with NULL address
 229 */
 230int udp_lib_get_port(struct sock *sk, unsigned short snum,
 231                     unsigned int hash2_nulladdr)
 232{
 233        struct udp_hslot *hslot, *hslot2;
 234        struct udp_table *udptable = sk->sk_prot->h.udp_table;
 235        int    error = 1;
 236        struct net *net = sock_net(sk);
 237
 238        if (!snum) {
 239                int low, high, remaining;
 240                unsigned int rand;
 241                unsigned short first, last;
 242                DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
 243
 244                inet_get_local_port_range(net, &low, &high);
 245                remaining = (high - low) + 1;
 246
 247                rand = prandom_u32();
 248                first = reciprocal_scale(rand, remaining) + low;
 249                /*
 250                 * force rand to be an odd multiple of UDP_HTABLE_SIZE
 251                 */
 252                rand = (rand | 1) * (udptable->mask + 1);
 253                last = first + udptable->mask + 1;
 254                do {
 255                        hslot = udp_hashslot(udptable, net, first);
 256                        bitmap_zero(bitmap, PORTS_PER_CHAIN);
 257                        spin_lock_bh(&hslot->lock);
 258                        udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
 259                                            udptable->log);
 260
 261                        snum = first;
 262                        /*
 263                         * Iterate on all possible values of snum for this hash.
 264                         * Using steps of an odd multiple of UDP_HTABLE_SIZE
  265                         * gives us randomization and full range coverage.
 266                         */
 267                        do {
 268                                if (low <= snum && snum <= high &&
 269                                    !test_bit(snum >> udptable->log, bitmap) &&
 270                                    !inet_is_local_reserved_port(net, snum))
 271                                        goto found;
 272                                snum += rand;
 273                        } while (snum != first);
 274                        spin_unlock_bh(&hslot->lock);
 275                        cond_resched();
 276                } while (++first != last);
 277                goto fail;
 278        } else {
 279                hslot = udp_hashslot(udptable, net, snum);
 280                spin_lock_bh(&hslot->lock);
 281                if (hslot->count > 10) {
 282                        int exist;
 283                        unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
 284
 285                        slot2          &= udptable->mask;
 286                        hash2_nulladdr &= udptable->mask;
 287
 288                        hslot2 = udp_hashslot2(udptable, slot2);
 289                        if (hslot->count < hslot2->count)
 290                                goto scan_primary_hash;
 291
 292                        exist = udp_lib_lport_inuse2(net, snum, hslot2, sk);
 293                        if (!exist && (hash2_nulladdr != slot2)) {
 294                                hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
 295                                exist = udp_lib_lport_inuse2(net, snum, hslot2,
 296                                                             sk);
 297                        }
 298                        if (exist)
 299                                goto fail_unlock;
 300                        else
 301                                goto found;
 302                }
 303scan_primary_hash:
 304                if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0))
 305                        goto fail_unlock;
 306        }
 307found:
 308        inet_sk(sk)->inet_num = snum;
 309        udp_sk(sk)->udp_port_hash = snum;
 310        udp_sk(sk)->udp_portaddr_hash ^= snum;
 311        if (sk_unhashed(sk)) {
 312                if (sk->sk_reuseport &&
 313                    udp_reuseport_add_sock(sk, hslot)) {
 314                        inet_sk(sk)->inet_num = 0;
 315                        udp_sk(sk)->udp_port_hash = 0;
 316                        udp_sk(sk)->udp_portaddr_hash ^= snum;
 317                        goto fail_unlock;
 318                }
 319
 320                sk_add_node_rcu(sk, &hslot->head);
 321                hslot->count++;
 322                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 323
 324                hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
 325                spin_lock(&hslot2->lock);
 326                if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
 327                    sk->sk_family == AF_INET6)
 328                        hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
 329                                           &hslot2->head);
 330                else
 331                        hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
 332                                           &hslot2->head);
 333                hslot2->count++;
 334                spin_unlock(&hslot2->lock);
 335        }
 336        sock_set_flag(sk, SOCK_RCU_FREE);
 337        error = 0;
 338fail_unlock:
 339        spin_unlock_bh(&hslot->lock);
 340fail:
 341        return error;
 342}
 343EXPORT_SYMBOL(udp_lib_get_port);
 344
 345int udp_v4_get_port(struct sock *sk, unsigned short snum)
 346{
 347        unsigned int hash2_nulladdr =
 348                ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
 349        unsigned int hash2_partial =
 350                ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
 351
 352        /* precompute partial secondary hash */
 353        udp_sk(sk)->udp_portaddr_hash = hash2_partial;
 354        return udp_lib_get_port(sk, snum, hash2_nulladdr);
 355}
 356
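/* Score how well 'sk' matches an incoming 4-tuple.  A negative score means
 * no match at all; otherwise an AF_INET socket starts at 2 and a dual-stack
 * AF_INET6 socket at 1, each of a matching connected remote address, a
 * matching connected remote port and a bound device adds 4, and one extra
 * point is given when sk_incoming_cpu equals the current CPU.  The lookup
 * keeps the highest-scoring socket.
 */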
 357static int compute_score(struct sock *sk, struct net *net,
 358                         __be32 saddr, __be16 sport,
 359                         __be32 daddr, unsigned short hnum,
 360                         int dif, int sdif)
 361{
 362        int score;
 363        struct inet_sock *inet;
 364        bool dev_match;
 365
 366        if (!net_eq(sock_net(sk), net) ||
 367            udp_sk(sk)->udp_port_hash != hnum ||
 368            ipv6_only_sock(sk))
 369                return -1;
 370
 371        if (sk->sk_rcv_saddr != daddr)
 372                return -1;
 373
 374        score = (sk->sk_family == PF_INET) ? 2 : 1;
 375
 376        inet = inet_sk(sk);
 377        if (inet->inet_daddr) {
 378                if (inet->inet_daddr != saddr)
 379                        return -1;
 380                score += 4;
 381        }
 382
 383        if (inet->inet_dport) {
 384                if (inet->inet_dport != sport)
 385                        return -1;
 386                score += 4;
 387        }
 388
 389        dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
 390                                        dif, sdif);
 391        if (!dev_match)
 392                return -1;
 393        if (sk->sk_bound_dev_if)
 394                score += 4;
 395
 396        if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
 397                score++;
 398        return score;
 399}
 400
 401static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
 402                       const __u16 lport, const __be32 faddr,
 403                       const __be16 fport)
 404{
 405        static u32 udp_ehash_secret __read_mostly;
 406
 407        net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));
 408
 409        return __inet_ehashfn(laddr, lport, faddr, fport,
 410                              udp_ehash_secret + net_hash_mix(net));
 411}
 412
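/* If 'sk' is part of a SO_REUSEPORT group and not connected, hash the
 * 4-tuple and let the group (optionally via its attached BPF selector)
 * pick the receiving socket.  Returns NULL when group selection does not
 * apply, in which case the caller keeps 'sk' itself.
 */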
 413static struct sock *lookup_reuseport(struct net *net, struct sock *sk,
 414                                     struct sk_buff *skb,
 415                                     __be32 saddr, __be16 sport,
 416                                     __be32 daddr, unsigned short hnum)
 417{
 418        struct sock *reuse_sk = NULL;
 419        u32 hash;
 420
 421        if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) {
 422                hash = udp_ehashfn(net, daddr, hnum, saddr, sport);
 423                reuse_sk = reuseport_select_sock(sk, hash, skb,
 424                                                 sizeof(struct udphdr));
 425        }
 426        return reuse_sk;
 427}
 428
 429/* called with rcu_read_lock() */
 430static struct sock *udp4_lib_lookup2(struct net *net,
 431                                     __be32 saddr, __be16 sport,
 432                                     __be32 daddr, unsigned int hnum,
 433                                     int dif, int sdif,
 434                                     struct udp_hslot *hslot2,
 435                                     struct sk_buff *skb)
 436{
 437        struct sock *sk, *result;
 438        int score, badness;
 439
 440        result = NULL;
 441        badness = 0;
 442        udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
 443                score = compute_score(sk, net, saddr, sport,
 444                                      daddr, hnum, dif, sdif);
 445                if (score > badness) {
 446                        result = lookup_reuseport(net, sk, skb,
 447                                                  saddr, sport, daddr, hnum);
 448                        /* Fall back to scoring if group has connections */
 449                        if (result && !reuseport_has_conns(sk, false))
 450                                return result;
 451
 452                        result = result ? : sk;
 453                        badness = score;
 454                }
 455        }
 456        return result;
 457}
 458
 459static struct sock *udp4_lookup_run_bpf(struct net *net,
 460                                        struct udp_table *udptable,
 461                                        struct sk_buff *skb,
 462                                        __be32 saddr, __be16 sport,
 463                                        __be32 daddr, u16 hnum)
 464{
 465        struct sock *sk, *reuse_sk;
 466        bool no_reuseport;
 467
 468        if (udptable != &udp_table)
 469                return NULL; /* only UDP is supported */
 470
 471        no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP,
 472                                            saddr, sport, daddr, hnum, &sk);
 473        if (no_reuseport || IS_ERR_OR_NULL(sk))
 474                return sk;
 475
 476        reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum);
 477        if (reuse_sk)
 478                sk = reuse_sk;
 479        return sk;
 480}
 481
 482/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
 483 * harder than this. -DaveM
 484 */
 485struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 486                __be16 sport, __be32 daddr, __be16 dport, int dif,
 487                int sdif, struct udp_table *udptable, struct sk_buff *skb)
 488{
 489        unsigned short hnum = ntohs(dport);
 490        unsigned int hash2, slot2;
 491        struct udp_hslot *hslot2;
 492        struct sock *result, *sk;
 493
 494        hash2 = ipv4_portaddr_hash(net, daddr, hnum);
 495        slot2 = hash2 & udptable->mask;
 496        hslot2 = &udptable->hash2[slot2];
 497
 498        /* Lookup connected or non-wildcard socket */
 499        result = udp4_lib_lookup2(net, saddr, sport,
 500                                  daddr, hnum, dif, sdif,
 501                                  hslot2, skb);
 502        if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED)
 503                goto done;
 504
 505        /* Lookup redirect from BPF */
 506        if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
 507                sk = udp4_lookup_run_bpf(net, udptable, skb,
 508                                         saddr, sport, daddr, hnum);
 509                if (sk) {
 510                        result = sk;
 511                        goto done;
 512                }
 513        }
 514
 515        /* Got non-wildcard socket or error on first lookup */
 516        if (result)
 517                goto done;
 518
 519        /* Lookup wildcard sockets */
 520        hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
 521        slot2 = hash2 & udptable->mask;
 522        hslot2 = &udptable->hash2[slot2];
 523
 524        result = udp4_lib_lookup2(net, saddr, sport,
 525                                  htonl(INADDR_ANY), hnum, dif, sdif,
 526                                  hslot2, skb);
 527done:
 528        if (IS_ERR(result))
 529                return NULL;
 530        return result;
 531}
 532EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
 533
 534static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
 535                                                 __be16 sport, __be16 dport,
 536                                                 struct udp_table *udptable)
 537{
 538        const struct iphdr *iph = ip_hdr(skb);
 539
 540        return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
 541                                 iph->daddr, dport, inet_iif(skb),
 542                                 inet_sdif(skb), udptable, skb);
 543}
 544
 545struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
 546                                 __be16 sport, __be16 dport)
 547{
 548        const struct iphdr *iph = ip_hdr(skb);
 549
 550        return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
 551                                 iph->daddr, dport, inet_iif(skb),
 552                                 inet_sdif(skb), &udp_table, NULL);
 553}
 554
 555/* Must be called under rcu_read_lock().
 556 * Does increment socket refcount.
 557 */
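/* Illustrative caller pattern (a sketch only, not taken from an in-tree
 * user): because the lookup takes a reference, the result stays valid after
 * rcu_read_unlock() and must be released with sock_put() when done.
 *
 *	rcu_read_lock();
 *	sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, dif);
 *	rcu_read_unlock();
 *	if (sk) {
 *		... use sk ...
 *		sock_put(sk);
 *	}
 */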
 558#if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
 559struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 560                             __be32 daddr, __be16 dport, int dif)
 561{
 562        struct sock *sk;
 563
 564        sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
 565                               dif, 0, &udp_table, NULL);
 566        if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
 567                sk = NULL;
 568        return sk;
 569}
 570EXPORT_SYMBOL_GPL(udp4_lib_lookup);
 571#endif
 572
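/* Return true if 'sk' may receive a multicast/broadcast datagram sent from
 * rmt_addr:rmt_port to loc_addr:loc_port in this netns: the port hash, any
 * connected remote address/port, any bound local address and the bound
 * device must all be compatible, and the socket's multicast source filters
 * (ip_mc_sf_allow) must accept the sender.
 */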
 573static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
 574                                       __be16 loc_port, __be32 loc_addr,
 575                                       __be16 rmt_port, __be32 rmt_addr,
 576                                       int dif, int sdif, unsigned short hnum)
 577{
 578        struct inet_sock *inet = inet_sk(sk);
 579
 580        if (!net_eq(sock_net(sk), net) ||
 581            udp_sk(sk)->udp_port_hash != hnum ||
 582            (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
 583            (inet->inet_dport != rmt_port && inet->inet_dport) ||
 584            (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
 585            ipv6_only_sock(sk) ||
 586            !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
 587                return false;
 588        if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif))
 589                return false;
 590        return true;
 591}
 592
 593DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
 594void udp_encap_enable(void)
 595{
 596        static_branch_inc(&udp_encap_needed_key);
 597}
 598EXPORT_SYMBOL(udp_encap_enable);
 599
 600void udp_encap_disable(void)
 601{
 602        static_branch_dec(&udp_encap_needed_key);
 603}
 604EXPORT_SYMBOL(udp_encap_disable);
 605
 606/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
 607 * through error handlers in encapsulations looking for a match.
 608 */
 609static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
 610{
 611        int i;
 612
 613        for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
 614                int (*handler)(struct sk_buff *skb, u32 info);
 615                const struct ip_tunnel_encap_ops *encap;
 616
 617                encap = rcu_dereference(iptun_encaps[i]);
 618                if (!encap)
 619                        continue;
 620                handler = encap->err_handler;
 621                if (handler && !handler(skb, info))
 622                        return 0;
 623        }
 624
 625        return -ENOENT;
 626}
 627
 628/* Try to match ICMP errors to UDP tunnels by looking up a socket without
 629 * reversing source and destination port: this will match tunnels that force the
 630 * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
 631 * lwtunnels might actually break this assumption by being configured with
 632 * different destination ports on endpoints, in this case we won't be able to
 633 * trace ICMP messages back to them.
 634 *
 635 * If this doesn't match any socket, probe tunnels with arbitrary destination
 636 * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
 637 * we've sent packets to won't necessarily match the local destination port.
 638 *
 639 * Then ask the tunnel implementation to match the error against a valid
 640 * association.
 641 *
 642 * Return an error if we can't find a match, the socket if we need further
 643 * processing, zero otherwise.
 644 */
 645static struct sock *__udp4_lib_err_encap(struct net *net,
 646                                         const struct iphdr *iph,
 647                                         struct udphdr *uh,
 648                                         struct udp_table *udptable,
 649                                         struct sock *sk,
 650                                         struct sk_buff *skb, u32 info)
 651{
 652        int (*lookup)(struct sock *sk, struct sk_buff *skb);
 653        int network_offset, transport_offset;
 654        struct udp_sock *up;
 655
 656        network_offset = skb_network_offset(skb);
 657        transport_offset = skb_transport_offset(skb);
 658
 659        /* Network header needs to point to the outer IPv4 header inside ICMP */
 660        skb_reset_network_header(skb);
 661
 662        /* Transport header needs to point to the UDP header */
 663        skb_set_transport_header(skb, iph->ihl << 2);
 664
 665        if (sk) {
 666                up = udp_sk(sk);
 667
 668                lookup = READ_ONCE(up->encap_err_lookup);
 669                if (lookup && lookup(sk, skb))
 670                        sk = NULL;
 671
 672                goto out;
 673        }
 674
 675        sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
 676                               iph->saddr, uh->dest, skb->dev->ifindex, 0,
 677                               udptable, NULL);
 678        if (sk) {
 679                up = udp_sk(sk);
 680
 681                lookup = READ_ONCE(up->encap_err_lookup);
 682                if (!lookup || lookup(sk, skb))
 683                        sk = NULL;
 684        }
 685
 686out:
 687        if (!sk)
 688                sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info));
 689
 690        skb_set_transport_header(skb, transport_offset);
 691        skb_set_network_header(skb, network_offset);
 692
 693        return sk;
 694}
 695
 696/*
 697 * This routine is called by the ICMP module when it gets some
 698 * sort of error condition.  If err < 0 then the socket should
 699 * be closed and the error returned to the user.  If err > 0
 700 * it's just the icmp type << 8 | icmp code.
 701 * Header points to the ip header of the error packet. We move
 702 * on past this. Then (as it used to claim before adjustment)
 703 * header points to the first 8 bytes of the udp header.  We need
 704 * to find the appropriate port.
 705 */
 706
 707int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 708{
 709        struct inet_sock *inet;
 710        const struct iphdr *iph = (const struct iphdr *)skb->data;
 711        struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
 712        const int type = icmp_hdr(skb)->type;
 713        const int code = icmp_hdr(skb)->code;
 714        bool tunnel = false;
 715        struct sock *sk;
 716        int harderr;
 717        int err;
 718        struct net *net = dev_net(skb->dev);
 719
 720        sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
 721                               iph->saddr, uh->source, skb->dev->ifindex,
 722                               inet_sdif(skb), udptable, NULL);
 723
 724        if (!sk || udp_sk(sk)->encap_type) {
 725                /* No socket for error: try tunnels before discarding */
 726                if (static_branch_unlikely(&udp_encap_needed_key)) {
 727                        sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb,
 728                                                  info);
 729                        if (!sk)
 730                                return 0;
 731                } else
 732                        sk = ERR_PTR(-ENOENT);
 733
 734                if (IS_ERR(sk)) {
 735                        __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
 736                        return PTR_ERR(sk);
 737                }
 738
 739                tunnel = true;
 740        }
 741
 742        err = 0;
 743        harderr = 0;
 744        inet = inet_sk(sk);
 745
 746        switch (type) {
 747        default:
 748        case ICMP_TIME_EXCEEDED:
 749                err = EHOSTUNREACH;
 750                break;
 751        case ICMP_SOURCE_QUENCH:
 752                goto out;
 753        case ICMP_PARAMETERPROB:
 754                err = EPROTO;
 755                harderr = 1;
 756                break;
 757        case ICMP_DEST_UNREACH:
 758                if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
 759                        ipv4_sk_update_pmtu(skb, sk, info);
 760                        if (inet->pmtudisc != IP_PMTUDISC_DONT) {
 761                                err = EMSGSIZE;
 762                                harderr = 1;
 763                                break;
 764                        }
 765                        goto out;
 766                }
 767                err = EHOSTUNREACH;
 768                if (code <= NR_ICMP_UNREACH) {
 769                        harderr = icmp_err_convert[code].fatal;
 770                        err = icmp_err_convert[code].errno;
 771                }
 772                break;
 773        case ICMP_REDIRECT:
 774                ipv4_sk_redirect(skb, sk);
 775                goto out;
 776        }
 777
 778        /*
 779         *      RFC1122: OK.  Passes ICMP errors back to application, as per
 780         *      4.1.3.3.
 781         */
 782        if (tunnel) {
 783                /* ...not for tunnels though: we don't have a sending socket */
 784                goto out;
 785        }
 786        if (!inet->recverr) {
 787                if (!harderr || sk->sk_state != TCP_ESTABLISHED)
 788                        goto out;
 789        } else
 790                ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
 791
 792        sk->sk_err = err;
 793        sk_error_report(sk);
 794out:
 795        return 0;
 796}
 797
 798int udp_err(struct sk_buff *skb, u32 info)
 799{
 800        return __udp4_lib_err(skb, info, &udp_table);
 801}
 802
 803/*
 804 * Throw away all pending data and cancel the corking. Socket is locked.
 805 */
 806void udp_flush_pending_frames(struct sock *sk)
 807{
 808        struct udp_sock *up = udp_sk(sk);
 809
 810        if (up->pending) {
 811                up->len = 0;
 812                up->pending = 0;
 813                ip_flush_pending_frames(sk);
 814        }
 815}
 816EXPORT_SYMBOL(udp_flush_pending_frames);
 817
 818/**
 819 *      udp4_hwcsum  -  handle outgoing HW checksumming
 820 *      @skb:   sk_buff containing the filled-in UDP header
 821 *              (checksum field must be zeroed out)
 822 *      @src:   source IP address
 823 *      @dst:   destination IP address
 824 */
 825void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
 826{
 827        struct udphdr *uh = udp_hdr(skb);
 828        int offset = skb_transport_offset(skb);
 829        int len = skb->len - offset;
 830        int hlen = len;
 831        __wsum csum = 0;
 832
 833        if (!skb_has_frag_list(skb)) {
 834                /*
 835                 * Only one fragment on the socket.
 836                 */
 837                skb->csum_start = skb_transport_header(skb) - skb->head;
 838                skb->csum_offset = offsetof(struct udphdr, check);
 839                uh->check = ~csum_tcpudp_magic(src, dst, len,
 840                                               IPPROTO_UDP, 0);
 841        } else {
 842                struct sk_buff *frags;
 843
 844                /*
  845                 * HW checksumming can't be used here: the skb has a
  846                 * frag list (two or more fragments on the socket), so
  847                 * the csums of all sk_buffs must be folded together.
 848                 */
 849                skb_walk_frags(skb, frags) {
 850                        csum = csum_add(csum, frags->csum);
 851                        hlen -= frags->len;
 852                }
 853
 854                csum = skb_checksum(skb, offset, hlen, csum);
 855                skb->ip_summed = CHECKSUM_NONE;
 856
 857                uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
 858                if (uh->check == 0)
 859                        uh->check = CSUM_MANGLED_0;
 860        }
 861}
 862EXPORT_SYMBOL_GPL(udp4_hwcsum);
 863
 864/* Function to set UDP checksum for an IPv4 UDP packet. This is intended
  865 * for simple cases, such as setting the checksum for a UDP tunnel.
 866 */
 867void udp_set_csum(bool nocheck, struct sk_buff *skb,
 868                  __be32 saddr, __be32 daddr, int len)
 869{
 870        struct udphdr *uh = udp_hdr(skb);
 871
 872        if (nocheck) {
 873                uh->check = 0;
 874        } else if (skb_is_gso(skb)) {
 875                uh->check = ~udp_v4_check(len, saddr, daddr, 0);
 876        } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
 877                uh->check = 0;
 878                uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb));
 879                if (uh->check == 0)
 880                        uh->check = CSUM_MANGLED_0;
 881        } else {
 882                skb->ip_summed = CHECKSUM_PARTIAL;
 883                skb->csum_start = skb_transport_header(skb) - skb->head;
 884                skb->csum_offset = offsetof(struct udphdr, check);
 885                uh->check = ~udp_v4_check(len, saddr, daddr, 0);
 886        }
 887}
 888EXPORT_SYMBOL(udp_set_csum);
 889
 890static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
 891                        struct inet_cork *cork)
 892{
 893        struct sock *sk = skb->sk;
 894        struct inet_sock *inet = inet_sk(sk);
 895        struct udphdr *uh;
 896        int err;
 897        int is_udplite = IS_UDPLITE(sk);
 898        int offset = skb_transport_offset(skb);
 899        int len = skb->len - offset;
 900        int datalen = len - sizeof(*uh);
 901        __wsum csum = 0;
 902
 903        /*
 904         * Create a UDP header
 905         */
 906        uh = udp_hdr(skb);
 907        uh->source = inet->inet_sport;
 908        uh->dest = fl4->fl4_dport;
 909        uh->len = htons(len);
 910        uh->check = 0;
 911
 912        if (cork->gso_size) {
 913                const int hlen = skb_network_header_len(skb) +
 914                                 sizeof(struct udphdr);
 915
 916                if (hlen + cork->gso_size > cork->fragsize) {
 917                        kfree_skb(skb);
 918                        return -EINVAL;
 919                }
 920                if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) {
 921                        kfree_skb(skb);
 922                        return -EINVAL;
 923                }
 924                if (sk->sk_no_check_tx) {
 925                        kfree_skb(skb);
 926                        return -EINVAL;
 927                }
 928                if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
 929                    dst_xfrm(skb_dst(skb))) {
 930                        kfree_skb(skb);
 931                        return -EIO;
 932                }
 933
 934                if (datalen > cork->gso_size) {
 935                        skb_shinfo(skb)->gso_size = cork->gso_size;
 936                        skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
 937                        skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
 938                                                                 cork->gso_size);
 939                }
 940                goto csum_partial;
 941        }
 942
 943        if (is_udplite)                                  /*     UDP-Lite      */
 944                csum = udplite_csum(skb);
 945
 946        else if (sk->sk_no_check_tx) {                   /* UDP csum off */
 947
 948                skb->ip_summed = CHECKSUM_NONE;
 949                goto send;
 950
 951        } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
 952csum_partial:
 953
 954                udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
 955                goto send;
 956
 957        } else
 958                csum = udp_csum(skb);
 959
 960        /* add protocol-dependent pseudo-header */
 961        uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
 962                                      sk->sk_protocol, csum);
 963        if (uh->check == 0)
 964                uh->check = CSUM_MANGLED_0;
 965
 966send:
 967        err = ip_send_skb(sock_net(sk), skb);
 968        if (err) {
 969                if (err == -ENOBUFS && !inet->recverr) {
 970                        UDP_INC_STATS(sock_net(sk),
 971                                      UDP_MIB_SNDBUFERRORS, is_udplite);
 972                        err = 0;
 973                }
 974        } else
 975                UDP_INC_STATS(sock_net(sk),
 976                              UDP_MIB_OUTDATAGRAMS, is_udplite);
 977        return err;
 978}
 979
 980/*
 981 * Push out all pending data as one UDP datagram. Socket is locked.
 982 */
 983int udp_push_pending_frames(struct sock *sk)
 984{
 985        struct udp_sock  *up = udp_sk(sk);
 986        struct inet_sock *inet = inet_sk(sk);
 987        struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
 988        struct sk_buff *skb;
 989        int err = 0;
 990
 991        skb = ip_finish_skb(sk, fl4);
 992        if (!skb)
 993                goto out;
 994
 995        err = udp_send_skb(skb, fl4, &inet->cork.base);
 996
 997out:
 998        up->len = 0;
 999        up->pending = 0;
1000        return err;
1001}
1002EXPORT_SYMBOL(udp_push_pending_frames);
1003
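/* The only SOL_UDP ancillary message handled below is UDP_SEGMENT, which
 * selects a per-call GSO segment size.  A minimal userspace sketch
 * (illustrative only; fd, buf, buf_len and the 1400 byte segment size are
 * placeholders, with fd assumed to be a connected UDP socket):
 *
 *	char control[CMSG_SPACE(sizeof(uint16_t))] = { 0 };
 *	struct iovec iov = { .iov_base = buf, .iov_len = buf_len };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = control, .msg_controllen = sizeof(control),
 *	};
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *
 *	cm->cmsg_level = SOL_UDP;
 *	cm->cmsg_type = UDP_SEGMENT;
 *	cm->cmsg_len = CMSG_LEN(sizeof(uint16_t));
 *	*(uint16_t *)CMSG_DATA(cm) = 1400;
 *	sendmsg(fd, &msg, 0);
 */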
1004static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size)
1005{
1006        switch (cmsg->cmsg_type) {
1007        case UDP_SEGMENT:
1008                if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16)))
1009                        return -EINVAL;
1010                *gso_size = *(__u16 *)CMSG_DATA(cmsg);
1011                return 0;
1012        default:
1013                return -EINVAL;
1014        }
1015}
1016
1017int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size)
1018{
1019        struct cmsghdr *cmsg;
1020        bool need_ip = false;
1021        int err;
1022
1023        for_each_cmsghdr(cmsg, msg) {
1024                if (!CMSG_OK(msg, cmsg))
1025                        return -EINVAL;
1026
1027                if (cmsg->cmsg_level != SOL_UDP) {
1028                        need_ip = true;
1029                        continue;
1030                }
1031
1032                err = __udp_cmsg_send(cmsg, gso_size);
1033                if (err)
1034                        return err;
1035        }
1036
1037        return need_ip;
1038}
1039EXPORT_SYMBOL_GPL(udp_cmsg_send);
1040
1041int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
1042{
1043        struct inet_sock *inet = inet_sk(sk);
1044        struct udp_sock *up = udp_sk(sk);
1045        DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
1046        struct flowi4 fl4_stack;
1047        struct flowi4 *fl4;
1048        int ulen = len;
1049        struct ipcm_cookie ipc;
1050        struct rtable *rt = NULL;
1051        int free = 0;
1052        int connected = 0;
1053        __be32 daddr, faddr, saddr;
1054        __be16 dport;
1055        u8  tos;
1056        int err, is_udplite = IS_UDPLITE(sk);
1057        int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE;
1058        int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
1059        struct sk_buff *skb;
1060        struct ip_options_data opt_copy;
1061
1062        if (len > 0xFFFF)
1063                return -EMSGSIZE;
1064
1065        /*
1066         *      Check the flags.
1067         */
1068
1069        if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
1070                return -EOPNOTSUPP;
1071
1072        getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
1073
1074        fl4 = &inet->cork.fl.u.ip4;
1075        if (up->pending) {
1076                /*
1077                 * There are pending frames.
1078                 * The socket lock must be held while it's corked.
1079                 */
1080                lock_sock(sk);
1081                if (likely(up->pending)) {
1082                        if (unlikely(up->pending != AF_INET)) {
1083                                release_sock(sk);
1084                                return -EINVAL;
1085                        }
1086                        goto do_append_data;
1087                }
1088                release_sock(sk);
1089        }
1090        ulen += sizeof(struct udphdr);
1091
1092        /*
1093         *      Get and verify the address.
1094         */
1095        if (usin) {
1096                if (msg->msg_namelen < sizeof(*usin))
1097                        return -EINVAL;
1098                if (usin->sin_family != AF_INET) {
1099                        if (usin->sin_family != AF_UNSPEC)
1100                                return -EAFNOSUPPORT;
1101                }
1102
1103                daddr = usin->sin_addr.s_addr;
1104                dport = usin->sin_port;
1105                if (dport == 0)
1106                        return -EINVAL;
1107        } else {
1108                if (sk->sk_state != TCP_ESTABLISHED)
1109                        return -EDESTADDRREQ;
1110                daddr = inet->inet_daddr;
1111                dport = inet->inet_dport;
1112                /* Open fast path for connected socket.
 1113                   Route will not be used if at least one option is set.
1114                 */
1115                connected = 1;
1116        }
1117
1118        ipcm_init_sk(&ipc, inet);
1119        ipc.gso_size = READ_ONCE(up->gso_size);
1120
1121        if (msg->msg_controllen) {
1122                err = udp_cmsg_send(sk, msg, &ipc.gso_size);
1123                if (err > 0)
1124                        err = ip_cmsg_send(sk, msg, &ipc,
1125                                           sk->sk_family == AF_INET6);
1126                if (unlikely(err < 0)) {
1127                        kfree(ipc.opt);
1128                        return err;
1129                }
1130                if (ipc.opt)
1131                        free = 1;
1132                connected = 0;
1133        }
1134        if (!ipc.opt) {
1135                struct ip_options_rcu *inet_opt;
1136
1137                rcu_read_lock();
1138                inet_opt = rcu_dereference(inet->inet_opt);
1139                if (inet_opt) {
1140                        memcpy(&opt_copy, inet_opt,
1141                               sizeof(*inet_opt) + inet_opt->opt.optlen);
1142                        ipc.opt = &opt_copy.opt;
1143                }
1144                rcu_read_unlock();
1145        }
1146
1147        if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) {
1148                err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
1149                                            (struct sockaddr *)usin, &ipc.addr);
1150                if (err)
1151                        goto out_free;
1152                if (usin) {
1153                        if (usin->sin_port == 0) {
1154                                /* BPF program set invalid port. Reject it. */
1155                                err = -EINVAL;
1156                                goto out_free;
1157                        }
1158                        daddr = usin->sin_addr.s_addr;
1159                        dport = usin->sin_port;
1160                }
1161        }
1162
1163        saddr = ipc.addr;
1164        ipc.addr = faddr = daddr;
1165
1166        if (ipc.opt && ipc.opt->opt.srr) {
1167                if (!daddr) {
1168                        err = -EINVAL;
1169                        goto out_free;
1170                }
1171                faddr = ipc.opt->opt.faddr;
1172                connected = 0;
1173        }
1174        tos = get_rttos(&ipc, inet);
1175        if (sock_flag(sk, SOCK_LOCALROUTE) ||
1176            (msg->msg_flags & MSG_DONTROUTE) ||
1177            (ipc.opt && ipc.opt->opt.is_strictroute)) {
1178                tos |= RTO_ONLINK;
1179                connected = 0;
1180        }
1181
1182        if (ipv4_is_multicast(daddr)) {
1183                if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
1184                        ipc.oif = inet->mc_index;
1185                if (!saddr)
1186                        saddr = inet->mc_addr;
1187                connected = 0;
1188        } else if (!ipc.oif) {
1189                ipc.oif = inet->uc_index;
1190        } else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
1191                /* oif is set, packet is to local broadcast and
1192                 * uc_index is set. oif is most likely set
1193                 * by sk_bound_dev_if. If uc_index != oif check if the
1194                 * oif is an L3 master and uc_index is an L3 slave.
1195                 * If so, we want to allow the send using the uc_index.
1196                 */
1197                if (ipc.oif != inet->uc_index &&
1198                    ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
1199                                                              inet->uc_index)) {
1200                        ipc.oif = inet->uc_index;
1201                }
1202        }
1203
1204        if (connected)
1205                rt = (struct rtable *)sk_dst_check(sk, 0);
1206
1207        if (!rt) {
1208                struct net *net = sock_net(sk);
1209                __u8 flow_flags = inet_sk_flowi_flags(sk);
1210
1211                fl4 = &fl4_stack;
1212
1213                flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos,
1214                                   RT_SCOPE_UNIVERSE, sk->sk_protocol,
1215                                   flow_flags,
1216                                   faddr, saddr, dport, inet->inet_sport,
1217                                   sk->sk_uid);
1218
1219                security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
1220                rt = ip_route_output_flow(net, fl4, sk);
1221                if (IS_ERR(rt)) {
1222                        err = PTR_ERR(rt);
1223                        rt = NULL;
1224                        if (err == -ENETUNREACH)
1225                                IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
1226                        goto out;
1227                }
1228
1229                err = -EACCES;
1230                if ((rt->rt_flags & RTCF_BROADCAST) &&
1231                    !sock_flag(sk, SOCK_BROADCAST))
1232                        goto out;
1233                if (connected)
1234                        sk_dst_set(sk, dst_clone(&rt->dst));
1235        }
1236
1237        if (msg->msg_flags&MSG_CONFIRM)
1238                goto do_confirm;
1239back_from_confirm:
1240
1241        saddr = fl4->saddr;
1242        if (!ipc.addr)
1243                daddr = ipc.addr = fl4->daddr;
1244
1245        /* Lockless fast path for the non-corking case. */
1246        if (!corkreq) {
1247                struct inet_cork cork;
1248
1249                skb = ip_make_skb(sk, fl4, getfrag, msg, ulen,
1250                                  sizeof(struct udphdr), &ipc, &rt,
1251                                  &cork, msg->msg_flags);
1252                err = PTR_ERR(skb);
1253                if (!IS_ERR_OR_NULL(skb))
1254                        err = udp_send_skb(skb, fl4, &cork);
1255                goto out;
1256        }
1257
1258        lock_sock(sk);
1259        if (unlikely(up->pending)) {
1260                /* The socket is already corked while preparing it. */
1261                /* ... which is an evident application bug. --ANK */
1262                release_sock(sk);
1263
1264                net_dbg_ratelimited("socket already corked\n");
1265                err = -EINVAL;
1266                goto out;
1267        }
1268        /*
1269         *      Now cork the socket to pend data.
1270         */
1271        fl4 = &inet->cork.fl.u.ip4;
1272        fl4->daddr = daddr;
1273        fl4->saddr = saddr;
1274        fl4->fl4_dport = dport;
1275        fl4->fl4_sport = inet->inet_sport;
1276        up->pending = AF_INET;
1277
1278do_append_data:
1279        up->len += ulen;
1280        err = ip_append_data(sk, fl4, getfrag, msg, ulen,
1281                             sizeof(struct udphdr), &ipc, &rt,
1282                             corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
1283        if (err)
1284                udp_flush_pending_frames(sk);
1285        else if (!corkreq)
1286                err = udp_push_pending_frames(sk);
1287        else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
1288                up->pending = 0;
1289        release_sock(sk);
1290
1291out:
1292        ip_rt_put(rt);
1293out_free:
1294        if (free)
1295                kfree(ipc.opt);
1296        if (!err)
1297                return len;
1298        /*
1299         * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space.  Reporting
1300         * ENOBUFS might not be good (it's not tunable per se), but otherwise
1301         * we don't have a good statistic (IpOutDiscards but it can be too many
1302         * things).  We could add another new stat but at least for now that
1303         * seems like overkill.
1304         */
1305        if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
1306                UDP_INC_STATS(sock_net(sk),
1307                              UDP_MIB_SNDBUFERRORS, is_udplite);
1308        }
1309        return err;
1310
1311do_confirm:
1312        if (msg->msg_flags & MSG_PROBE)
1313                dst_confirm_neigh(&rt->dst, &fl4->daddr);
1314        if (!(msg->msg_flags&MSG_PROBE) || len)
1315                goto back_from_confirm;
1316        err = 0;
1317        goto out;
1318}
1319EXPORT_SYMBOL(udp_sendmsg);
1320
1321int udp_sendpage(struct sock *sk, struct page *page, int offset,
1322                 size_t size, int flags)
1323{
1324        struct inet_sock *inet = inet_sk(sk);
1325        struct udp_sock *up = udp_sk(sk);
1326        int ret;
1327
1328        if (flags & MSG_SENDPAGE_NOTLAST)
1329                flags |= MSG_MORE;
1330
1331        if (!up->pending) {
1332                struct msghdr msg = {   .msg_flags = flags|MSG_MORE };
1333
 1334                /* Call udp_sendmsg to specify the destination address, which
 1335                 * the sendpage interface can't pass.
1336                 * This will succeed only when the socket is connected.
1337                 */
1338                ret = udp_sendmsg(sk, &msg, 0);
1339                if (ret < 0)
1340                        return ret;
1341        }
1342
1343        lock_sock(sk);
1344
1345        if (unlikely(!up->pending)) {
1346                release_sock(sk);
1347
1348                net_dbg_ratelimited("cork failed\n");
1349                return -EINVAL;
1350        }
1351
1352        ret = ip_append_page(sk, &inet->cork.fl.u.ip4,
1353                             page, offset, size, flags);
1354        if (ret == -EOPNOTSUPP) {
1355                release_sock(sk);
1356                return sock_no_sendpage(sk->sk_socket, page, offset,
1357                                        size, flags);
1358        }
1359        if (ret < 0) {
1360                udp_flush_pending_frames(sk);
1361                goto out;
1362        }
1363
1364        up->len += size;
1365        if (!(READ_ONCE(up->corkflag) || (flags&MSG_MORE)))
1366                ret = udp_push_pending_frames(sk);
1367        if (!ret)
1368                ret = size;
1369out:
1370        release_sock(sk);
1371        return ret;
1372}
1373
1374#define UDP_SKB_IS_STATELESS 0x80000000
1375
1376/* all head states (dst, sk, nf conntrack) except skb extensions are
1377 * cleared by udp_rcv().
1378 *
1379 * We need to preserve secpath, if present, to eventually process
1380 * IP_CMSG_PASSSEC at recvmsg() time.
1381 *
1382 * Other extensions can be cleared.
1383 */
1384static bool udp_try_make_stateless(struct sk_buff *skb)
1385{
1386        if (!skb_has_extensions(skb))
1387                return true;
1388
1389        if (!secpath_exists(skb)) {
1390                skb_ext_reset(skb);
1391                return true;
1392        }
1393
1394        return false;
1395}
1396
1397static void udp_set_dev_scratch(struct sk_buff *skb)
1398{
1399        struct udp_dev_scratch *scratch = udp_skb_scratch(skb);
1400
1401        BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long));
1402        scratch->_tsize_state = skb->truesize;
1403#if BITS_PER_LONG == 64
1404        scratch->len = skb->len;
1405        scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
1406        scratch->is_linear = !skb_is_nonlinear(skb);
1407#endif
1408        if (udp_try_make_stateless(skb))
1409                scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
1410}
1411
1412static void udp_skb_csum_unnecessary_set(struct sk_buff *skb)
1413{
1414        /* We come here after udp_lib_checksum_complete() returned 0.
1415         * This means that __skb_checksum_complete() might have
1416         * set skb->csum_valid to 1.
1417         * On 64bit platforms, we can set csum_unnecessary
1418         * to true, but only if the skb is not shared.
1419         */
1420#if BITS_PER_LONG == 64
1421        if (!skb_shared(skb))
1422                udp_skb_scratch(skb)->csum_unnecessary = true;
1423#endif
1424}
1425
1426static int udp_skb_truesize(struct sk_buff *skb)
1427{
1428        return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS;
1429}
1430
1431static bool udp_skb_has_head_state(struct sk_buff *skb)
1432{
1433        return !(udp_skb_scratch(skb)->_tsize_state & UDP_SKB_IS_STATELESS);
1434}
1435
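/* Memory accounting for dequeued skbs is batched: partial releases only
 * accumulate into up->forward_deficit, and the expensive rmem/fwd memory
 * updates below are performed once the deficit grows beyond a quarter of
 * sk_rcvbuf or the reader_queue runs dry.  This keeps the per-packet cost
 * on the fast receive path low.
 */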
1436/* fully reclaim rmem/fwd memory allocated for skb */
1437static void udp_rmem_release(struct sock *sk, int size, int partial,
1438                             bool rx_queue_lock_held)
1439{
1440        struct udp_sock *up = udp_sk(sk);
1441        struct sk_buff_head *sk_queue;
1442        int amt;
1443
1444        if (likely(partial)) {
1445                up->forward_deficit += size;
1446                size = up->forward_deficit;
1447                if (size < (sk->sk_rcvbuf >> 2) &&
1448                    !skb_queue_empty(&up->reader_queue))
1449                        return;
1450        } else {
1451                size += up->forward_deficit;
1452        }
1453        up->forward_deficit = 0;
1454
1455        /* acquire the sk_receive_queue for fwd allocated memory scheduling,
1456         * if the caller doesn't hold it already
1457         */
1458        sk_queue = &sk->sk_receive_queue;
1459        if (!rx_queue_lock_held)
1460                spin_lock(&sk_queue->lock);
1461
1462
1463        sk->sk_forward_alloc += size;
1464        amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
1465        sk->sk_forward_alloc -= amt;
1466
1467        if (amt)
1468                __sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
1469
1470        atomic_sub(size, &sk->sk_rmem_alloc);
1471
1472        /* this can save us from acquiring the rx queue lock on next receive */
1473        skb_queue_splice_tail_init(sk_queue, &up->reader_queue);
1474
1475        if (!rx_queue_lock_held)
1476                spin_unlock(&sk_queue->lock);
1477}
1478
1479/* Note: called with reader_queue.lock held.
1480 * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch
1481 * This avoids a cache line miss while receive_queue lock is held.
1482 * Look at __udp_enqueue_schedule_skb() to find where this copy is done.
1483 */
1484void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
1485{
1486        prefetch(&skb->data);
1487        udp_rmem_release(sk, udp_skb_truesize(skb), 1, false);
1488}
1489EXPORT_SYMBOL(udp_skb_destructor);
1490
1491/* as above, but the caller held the rx queue lock, too */
1492static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
1493{
1494        prefetch(&skb->data);
1495        udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
1496}
1497
1498/* The idea of busylocks is to let producers grab an extra spinlock
1499 * to relieve pressure on the receive_queue spinlock shared with the consumer.
1500 * Under flood, this means that only one producer can be in line
1501 * trying to acquire the receive_queue spinlock.
1502 * These busylocks can be allocated on a per-cpu basis, instead of
1503 * per socket (a per-socket lock would consume a cache line per socket).
1504 */
1505static int udp_busylocks_log __read_mostly;
1506static spinlock_t *udp_busylocks __read_mostly;
1507
1508static spinlock_t *busylock_acquire(void *ptr)
1509{
1510        spinlock_t *busy;
1511
1512        busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log);
1513        spin_lock(busy);
1514        return busy;
1515}
1516
1517static void busylock_release(spinlock_t *busy)
1518{
1519        if (busy)
1520                spin_unlock(busy);
1521}
1522
1523int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
1524{
1525        struct sk_buff_head *list = &sk->sk_receive_queue;
1526        int rmem, delta, amt, err = -ENOMEM;
1527        spinlock_t *busy = NULL;
1528        int size;
1529
1530        /* try to avoid the costly atomic add/sub pair when the receive
1531         * queue is full; always allow at least a packet
1532         */
1533        rmem = atomic_read(&sk->sk_rmem_alloc);
1534        if (rmem > sk->sk_rcvbuf)
1535                goto drop;
1536
1537        /* Under memory pressure, it can be helpful to give udp_recvmsg()
1538         * linear skbs:
1539         * - Reduce memory overhead and thus increase receive queue capacity
1540         * - Fewer cache line misses at copyout() time
1541         * - Less work at consume_skb() (less alien page frag freeing)
1542         */
1543        if (rmem > (sk->sk_rcvbuf >> 1)) {
1544                skb_condense(skb);
1545
1546                busy = busylock_acquire(sk);
1547        }
1548        size = skb->truesize;
1549        udp_set_dev_scratch(skb);
1550
1551        /* we drop only if the receive buf is full and the receive
1552         * queue contains some other skb
1553         */
1554        rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
1555        if (rmem > (size + (unsigned int)sk->sk_rcvbuf))
1556                goto uncharge_drop;
1557
1558        spin_lock(&list->lock);
1559        if (size >= sk->sk_forward_alloc) {
1560                amt = sk_mem_pages(size);
1561                delta = amt << SK_MEM_QUANTUM_SHIFT;
1562                if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
1563                        err = -ENOBUFS;
1564                        spin_unlock(&list->lock);
1565                        goto uncharge_drop;
1566                }
1567
1568                sk->sk_forward_alloc += delta;
1569        }
1570
1571        sk->sk_forward_alloc -= size;
1572
1573        /* no need to setup a destructor, we will explicitly release the
1574         * forward allocated memory on dequeue
1575         */
1576        sock_skb_set_dropcount(sk, skb);
1577
1578        __skb_queue_tail(list, skb);
1579        spin_unlock(&list->lock);
1580
1581        if (!sock_flag(sk, SOCK_DEAD))
1582                sk->sk_data_ready(sk);
1583
1584        busylock_release(busy);
1585        return 0;
1586
1587uncharge_drop:
1588        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1589
1590drop:
1591        atomic_inc(&sk->sk_drops);
1592        busylock_release(busy);
1593        return err;
1594}
1595EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
1596
1597void udp_destruct_sock(struct sock *sk)
1598{
1599        /* completely reclaim the forward allocated memory */
1600        struct udp_sock *up = udp_sk(sk);
1601        unsigned int total = 0;
1602        struct sk_buff *skb;
1603
1604        skb_queue_splice_tail_init(&sk->sk_receive_queue, &up->reader_queue);
1605        while ((skb = __skb_dequeue(&up->reader_queue)) != NULL) {
1606                total += skb->truesize;
1607                kfree_skb(skb);
1608        }
1609        udp_rmem_release(sk, total, 0, true);
1610
1611        inet_sock_destruct(sk);
1612}
1613EXPORT_SYMBOL_GPL(udp_destruct_sock);
1614
1615int udp_init_sock(struct sock *sk)
1616{
1617        skb_queue_head_init(&udp_sk(sk)->reader_queue);
1618        sk->sk_destruct = udp_destruct_sock;
1619        return 0;
1620}
1621EXPORT_SYMBOL_GPL(udp_init_sock);
1622
1623void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
1624{
1625        if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) {
1626                bool slow = lock_sock_fast(sk);
1627
1628                sk_peek_offset_bwd(sk, len);
1629                unlock_sock_fast(sk, slow);
1630        }
1631
1632        if (!skb_unref(skb))
1633                return;
1634
1635        /* In the more common cases we cleared the head states previously,
1636         * see __udp_queue_rcv_skb().
1637         */
1638        if (unlikely(udp_skb_has_head_state(skb)))
1639                skb_release_head_state(skb);
1640        __consume_stateless_skb(skb);
1641}
1642EXPORT_SYMBOL_GPL(skb_consume_udp);
1643
1644static struct sk_buff *__first_packet_length(struct sock *sk,
1645                                             struct sk_buff_head *rcvq,
1646                                             int *total)
1647{
1648        struct sk_buff *skb;
1649
1650        while ((skb = skb_peek(rcvq)) != NULL) {
1651                if (udp_lib_checksum_complete(skb)) {
1652                        __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS,
1653                                        IS_UDPLITE(sk));
1654                        __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
1655                                        IS_UDPLITE(sk));
1656                        atomic_inc(&sk->sk_drops);
1657                        __skb_unlink(skb, rcvq);
1658                        *total += skb->truesize;
1659                        kfree_skb(skb);
1660                } else {
1661                        udp_skb_csum_unnecessary_set(skb);
1662                        break;
1663                }
1664        }
1665        return skb;
1666}
1667
1668/**
1669 *      first_packet_length     - return length of first packet in receive queue
1670 *      @sk: socket
1671 *
1672 *      Drops all frames with bad checksums until a valid one is found.
1673 *      Returns the length of the found skb, or -1 if none is found.
1674 */
1675static int first_packet_length(struct sock *sk)
1676{
1677        struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue;
1678        struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
1679        struct sk_buff *skb;
1680        int total = 0;
1681        int res;
1682
1683        spin_lock_bh(&rcvq->lock);
1684        skb = __first_packet_length(sk, rcvq, &total);
1685        if (!skb && !skb_queue_empty_lockless(sk_queue)) {
1686                spin_lock(&sk_queue->lock);
1687                skb_queue_splice_tail_init(sk_queue, rcvq);
1688                spin_unlock(&sk_queue->lock);
1689
1690                skb = __first_packet_length(sk, rcvq, &total);
1691        }
1692        res = skb ? skb->len : -1;
1693        if (total)
1694                udp_rmem_release(sk, total, 1, false);
1695        spin_unlock_bh(&rcvq->lock);
1696        return res;
1697}
1698
1699/*
1700 *      IOCTL requests applicable to the UDP protocol
1701 */
1702
1703int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
1704{
1705        switch (cmd) {
1706        case SIOCOUTQ:
1707        {
1708                int amount = sk_wmem_alloc_get(sk);
1709
1710                return put_user(amount, (int __user *)arg);
1711        }
1712
1713        case SIOCINQ:
1714        {
1715                int amount = max_t(int, 0, first_packet_length(sk));
1716
1717                return put_user(amount, (int __user *)arg);
1718        }
1719
1720        default:
1721                return -ENOIOCTLCMD;
1722        }
1723
1724        return 0;
1725}
1726EXPORT_SYMBOL(udp_ioctl);
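
/* Userspace-visible behaviour of the above, as a minimal sketch (assuming
 * an open UDP socket 'fd'): SIOCOUTQ reports the amount of unsent data in
 * the socket send queue, while SIOCINQ reports the length of the next
 * pending datagram (not the total queued bytes), or 0 when none is pending.
 *
 *	int inq = 0, outq = 0;
 *
 *	ioctl(fd, SIOCINQ, &inq);
 *	ioctl(fd, SIOCOUTQ, &outq);
 */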
1727
1728struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
1729                               int noblock, int *off, int *err)
1730{
1731        struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
1732        struct sk_buff_head *queue;
1733        struct sk_buff *last;
1734        long timeo;
1735        int error;
1736
1737        queue = &udp_sk(sk)->reader_queue;
1738        flags |= noblock ? MSG_DONTWAIT : 0;
1739        timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
1740        do {
1741                struct sk_buff *skb;
1742
1743                error = sock_error(sk);
1744                if (error)
1745                        break;
1746
1747                error = -EAGAIN;
1748                do {
1749                        spin_lock_bh(&queue->lock);
1750                        skb = __skb_try_recv_from_queue(sk, queue, flags, off,
1751                                                        err, &last);
1752                        if (skb) {
1753                                if (!(flags & MSG_PEEK))
1754                                        udp_skb_destructor(sk, skb);
1755                                spin_unlock_bh(&queue->lock);
1756                                return skb;
1757                        }
1758
1759                        if (skb_queue_empty_lockless(sk_queue)) {
1760                                spin_unlock_bh(&queue->lock);
1761                                goto busy_check;
1762                        }
1763
1764                        /* refill the reader queue and walk it again;
1765                         * keep both queues locked to avoid re-acquiring
1766                         * the sk_receive_queue lock if fwd memory scheduling
1767                         * is needed.
1768                         */
1769                        spin_lock(&sk_queue->lock);
1770                        skb_queue_splice_tail_init(sk_queue, queue);
1771
1772                        skb = __skb_try_recv_from_queue(sk, queue, flags, off,
1773                                                        err, &last);
1774                        if (skb && !(flags & MSG_PEEK))
1775                                udp_skb_dtor_locked(sk, skb);
1776                        spin_unlock(&sk_queue->lock);
1777                        spin_unlock_bh(&queue->lock);
1778                        if (skb)
1779                                return skb;
1780
1781busy_check:
1782                        if (!sk_can_busy_loop(sk))
1783                                break;
1784
1785                        sk_busy_loop(sk, flags & MSG_DONTWAIT);
1786                } while (!skb_queue_empty_lockless(sk_queue));
1787
1788                /* sk_queue is empty, reader_queue may contain peeked packets */
1789        } while (timeo &&
1790                 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
1791                                              &error, &timeo,
1792                                              (struct sk_buff *)sk_queue));
1793
1794        *err = error;
1795        return NULL;
1796}
1797EXPORT_SYMBOL(__skb_recv_udp);
1798
1799int udp_read_sock(struct sock *sk, read_descriptor_t *desc,
1800                  sk_read_actor_t recv_actor)
1801{
1802        int copied = 0;
1803
1804        while (1) {
1805                struct sk_buff *skb;
1806                int err, used;
1807
1808                skb = skb_recv_udp(sk, 0, 1, &err);
1809                if (!skb)
1810                        return err;
1811                used = recv_actor(desc, skb, 0, skb->len);
1812                if (used <= 0) {
1813                        if (!copied)
1814                                copied = used;
1815                        kfree_skb(skb);
1816                        break;
1817                } else if (used <= skb->len) {
1818                        copied += used;
1819                }
1820
1821                kfree_skb(skb);
1822                if (!desc->count)
1823                        break;
1824        }
1825
1826        return copied;
1827}
1828EXPORT_SYMBOL(udp_read_sock);
1829
1830/*
1831 *      This should be easy: if there is something there, we
1832 *      return it; otherwise we block.
1833 */
1834
1835int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
1836                int flags, int *addr_len)
1837{
1838        struct inet_sock *inet = inet_sk(sk);
1839        DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
1840        struct sk_buff *skb;
1841        unsigned int ulen, copied;
1842        int off, err, peeking = flags & MSG_PEEK;
1843        int is_udplite = IS_UDPLITE(sk);
1844        bool checksum_valid = false;
1845
1846        if (flags & MSG_ERRQUEUE)
1847                return ip_recv_error(sk, msg, len, addr_len);
1848
1849try_again:
1850        off = sk_peek_offset(sk, flags);
1851        skb = __skb_recv_udp(sk, flags, noblock, &off, &err);
1852        if (!skb)
1853                return err;
1854
1855        ulen = udp_skb_len(skb);
1856        copied = len;
1857        if (copied > ulen - off)
1858                copied = ulen - off;
1859        else if (copied < ulen)
1860                msg->msg_flags |= MSG_TRUNC;
1861
1862        /*
1863         * If checksum is needed at all, try to do it while copying the
1864         * data.  If the data is truncated, or if we only want a partial
1865         * coverage checksum (UDP-Lite), do it before the copy.
1866         */
1867
1868        if (copied < ulen || peeking ||
1869            (is_udplite && UDP_SKB_CB(skb)->partial_cov)) {
1870                checksum_valid = udp_skb_csum_unnecessary(skb) ||
1871                                !__udp_lib_checksum_complete(skb);
1872                if (!checksum_valid)
1873                        goto csum_copy_err;
1874        }
1875
1876        if (checksum_valid || udp_skb_csum_unnecessary(skb)) {
1877                if (udp_skb_is_linear(skb))
1878                        err = copy_linear_skb(skb, copied, off, &msg->msg_iter);
1879                else
1880                        err = skb_copy_datagram_msg(skb, off, msg, copied);
1881        } else {
1882                err = skb_copy_and_csum_datagram_msg(skb, off, msg);
1883
1884                if (err == -EINVAL)
1885                        goto csum_copy_err;
1886        }
1887
1888        if (unlikely(err)) {
1889                if (!peeking) {
1890                        atomic_inc(&sk->sk_drops);
1891                        UDP_INC_STATS(sock_net(sk),
1892                                      UDP_MIB_INERRORS, is_udplite);
1893                }
1894                kfree_skb(skb);
1895                return err;
1896        }
1897
1898        if (!peeking)
1899                UDP_INC_STATS(sock_net(sk),
1900                              UDP_MIB_INDATAGRAMS, is_udplite);
1901
1902        sock_recv_ts_and_drops(msg, sk, skb);
1903
1904        /* Copy the address. */
1905        if (sin) {
1906                sin->sin_family = AF_INET;
1907                sin->sin_port = udp_hdr(skb)->source;
1908                sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
1909                memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
1910                *addr_len = sizeof(*sin);
1911
1912                BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
1913                                                      (struct sockaddr *)sin);
1914        }
1915
1916        if (udp_sk(sk)->gro_enabled)
1917                udp_cmsg_recv(msg, sk, skb);
1918
1919        if (inet->cmsg_flags)
1920                ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
1921
1922        err = copied;
1923        if (flags & MSG_TRUNC)
1924                err = ulen;
1925
1926        skb_consume_udp(sk, skb, peeking ? -err : err);
1927        return err;
1928
1929csum_copy_err:
1930        if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags,
1931                                 udp_skb_destructor)) {
1932                UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
1933                UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1934        }
1935        kfree_skb(skb);
1936
1937        /* starting over for a new packet, but check if we need to yield */
1938        cond_resched();
1939        msg->msg_flags &= ~MSG_TRUNC;
1940        goto try_again;
1941}
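
/* Note on sizing from userspace, as a minimal sketch (assuming a bound UDP
 * socket 'fd'): since the code above reports the real datagram length when
 * MSG_TRUNC is passed, a peek with MSG_PEEK | MSG_TRUNC learns the size of
 * the next datagram without consuming it, so a right-sized buffer can be
 * used for the real read.
 *
 *	char probe;
 *	ssize_t dlen = recv(fd, &probe, 1, MSG_PEEK | MSG_TRUNC);
 *	// dlen is the full datagram length; allocate and recv() for real
 */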
1942
1943int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
1944{
1945        /* This check is replicated from __ip4_datagram_connect() and is
1946         * intended to prevent the BPF program called below from accessing
1947         * bytes that are out of the bounds specified by the user in addr_len.
1948         */
1949        if (addr_len < sizeof(struct sockaddr_in))
1950                return -EINVAL;
1951
1952        return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr);
1953}
1954EXPORT_SYMBOL(udp_pre_connect);
1955
1956int __udp_disconnect(struct sock *sk, int flags)
1957{
1958        struct inet_sock *inet = inet_sk(sk);
1959        /*
1960         *      1003.1g - break association.
1961         */
1962
1963        sk->sk_state = TCP_CLOSE;
1964        inet->inet_daddr = 0;
1965        inet->inet_dport = 0;
1966        sock_rps_reset_rxhash(sk);
1967        sk->sk_bound_dev_if = 0;
1968        if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
1969                inet_reset_saddr(sk);
1970                if (sk->sk_prot->rehash &&
1971                    (sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1972                        sk->sk_prot->rehash(sk);
1973        }
1974
1975        if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
1976                sk->sk_prot->unhash(sk);
1977                inet->inet_sport = 0;
1978        }
1979        sk_dst_reset(sk);
1980        return 0;
1981}
1982EXPORT_SYMBOL(__udp_disconnect);
1983
1984int udp_disconnect(struct sock *sk, int flags)
1985{
1986        lock_sock(sk);
1987        __udp_disconnect(sk, flags);
1988        release_sock(sk);
1989        return 0;
1990}
1991EXPORT_SYMBOL(udp_disconnect);
1992
1993void udp_lib_unhash(struct sock *sk)
1994{
1995        if (sk_hashed(sk)) {
1996                struct udp_table *udptable = sk->sk_prot->h.udp_table;
1997                struct udp_hslot *hslot, *hslot2;
1998
1999                hslot  = udp_hashslot(udptable, sock_net(sk),
2000                                      udp_sk(sk)->udp_port_hash);
2001                hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
2002
2003                spin_lock_bh(&hslot->lock);
2004                if (rcu_access_pointer(sk->sk_reuseport_cb))
2005                        reuseport_detach_sock(sk);
2006                if (sk_del_node_init_rcu(sk)) {
2007                        hslot->count--;
2008                        inet_sk(sk)->inet_num = 0;
2009                        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
2010
2011                        spin_lock(&hslot2->lock);
2012                        hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
2013                        hslot2->count--;
2014                        spin_unlock(&hslot2->lock);
2015                }
2016                spin_unlock_bh(&hslot->lock);
2017        }
2018}
2019EXPORT_SYMBOL(udp_lib_unhash);
2020
2021/*
2022 * inet_rcv_saddr was changed, we must rehash secondary hash
2023 */
2024void udp_lib_rehash(struct sock *sk, u16 newhash)
2025{
2026        if (sk_hashed(sk)) {
2027                struct udp_table *udptable = sk->sk_prot->h.udp_table;
2028                struct udp_hslot *hslot, *hslot2, *nhslot2;
2029
2030                hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
2031                nhslot2 = udp_hashslot2(udptable, newhash);
2032                udp_sk(sk)->udp_portaddr_hash = newhash;
2033
2034                if (hslot2 != nhslot2 ||
2035                    rcu_access_pointer(sk->sk_reuseport_cb)) {
2036                        hslot = udp_hashslot(udptable, sock_net(sk),
2037                                             udp_sk(sk)->udp_port_hash);
2038                        /* we must lock primary chain too */
2039                        spin_lock_bh(&hslot->lock);
2040                        if (rcu_access_pointer(sk->sk_reuseport_cb))
2041                                reuseport_detach_sock(sk);
2042
2043                        if (hslot2 != nhslot2) {
2044                                spin_lock(&hslot2->lock);
2045                                hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
2046                                hslot2->count--;
2047                                spin_unlock(&hslot2->lock);
2048
2049                                spin_lock(&nhslot2->lock);
2050                                hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
2051                                                         &nhslot2->head);
2052                                nhslot2->count++;
2053                                spin_unlock(&nhslot2->lock);
2054                        }
2055
2056                        spin_unlock_bh(&hslot->lock);
2057                }
2058        }
2059}
2060EXPORT_SYMBOL(udp_lib_rehash);
2061
2062void udp_v4_rehash(struct sock *sk)
2063{
2064        u16 new_hash = ipv4_portaddr_hash(sock_net(sk),
2065                                          inet_sk(sk)->inet_rcv_saddr,
2066                                          inet_sk(sk)->inet_num);
2067        udp_lib_rehash(sk, new_hash);
2068}
2069
2070static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
2071{
2072        int rc;
2073
2074        if (inet_sk(sk)->inet_daddr) {
2075                sock_rps_save_rxhash(sk, skb);
2076                sk_mark_napi_id(sk, skb);
2077                sk_incoming_cpu_update(sk);
2078        } else {
2079                sk_mark_napi_id_once(sk, skb);
2080        }
2081
2082        rc = __udp_enqueue_schedule_skb(sk, skb);
2083        if (rc < 0) {
2084                int is_udplite = IS_UDPLITE(sk);
2085
2086                /* Note that an ENOMEM error is charged twice */
2087                if (rc == -ENOMEM)
2088                        UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
2089                                        is_udplite);
2090                else
2091                        UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS,
2092                                      is_udplite);
2093                UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
2094                kfree_skb(skb);
2095                trace_udp_fail_queue_rcv_skb(rc, sk);
2096                return -1;
2097        }
2098
2099        return 0;
2100}
2101
2102/* returns:
2103 *  -1: error
2104 *   0: success
2105 *  >0: "udp encap" protocol resubmission
2106 *
2107 * Note that in the success and error cases, the skb is assumed to
2108 * have either been requeued or freed.
2109 */
2110static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
2111{
2112        struct udp_sock *up = udp_sk(sk);
2113        int is_udplite = IS_UDPLITE(sk);
2114
2115        /*
2116         *      Charge it to the socket, dropping if the queue is full.
2117         */
2118        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2119                goto drop;
2120        nf_reset_ct(skb);
2121
2122        if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
2123                int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
2124
2125                /*
2126                 * This is an encapsulation socket so pass the skb to
2127                 * the socket's udp_encap_rcv() hook. Otherwise, just
2128                 * fall through and pass this up the UDP socket.
2129                 * up->encap_rcv() returns the following value:
2130                 * =0 if skb was successfully passed to the encap
2131                 *    handler or was discarded by it.
2132                 * >0 if skb should be passed on to UDP.
2133                 * <0 if skb should be resubmitted as proto -N
2134                 */
2135
2136                /* if we're overly short, let UDP handle it */
2137                encap_rcv = READ_ONCE(up->encap_rcv);
2138                if (encap_rcv) {
2139                        int ret;
2140
2141                        /* Verify checksum before giving to encap */
2142                        if (udp_lib_checksum_complete(skb))
2143                                goto csum_error;
2144
2145                        ret = encap_rcv(sk, skb);
2146                        if (ret <= 0) {
2147                                __UDP_INC_STATS(sock_net(sk),
2148                                                UDP_MIB_INDATAGRAMS,
2149                                                is_udplite);
2150                                return -ret;
2151                        }
2152                }
2153
2154                /* FALLTHROUGH -- it's a UDP Packet */
2155        }
2156
2157        /*
2158         *      UDP-Lite specific tests, ignored on UDP sockets
2159         */
2160        if ((up->pcflag & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {
2161
2162                /*
2163                 * MIB statistics other than incrementing the error count are
2164                 * disabled for the following two types of errors: these depend
2165                 * on the application settings, not on the functioning of the
2166                 * protocol stack as such.
2167                 *
2168                 * RFC 3828 here recommends (sec 3.3): "There should also be a
2169                 * way ... to ... at least let the receiving application block
2170                 * delivery of packets with coverage values less than a value
2171                 * provided by the application."
2172                 */
2173                if (up->pcrlen == 0) {          /* full coverage was set  */
2174                        net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n",
2175                                            UDP_SKB_CB(skb)->cscov, skb->len);
2176                        goto drop;
2177                }
2178                /* The next case involves violating the min. coverage requested
2179                 * by the receiver. This is subtle: if the receiver wants x and x is
2180                 * greater than the buffer size/MTU, then the receiver will complain
2181                 * that it wants x while the sender emits packets of smaller size y.
2182                 * Therefore the above ...()->partial_cov statement is essential.
2183                 */
2184                if (UDP_SKB_CB(skb)->cscov  <  up->pcrlen) {
2185                        net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n",
2186                                            UDP_SKB_CB(skb)->cscov, up->pcrlen);
2187                        goto drop;
2188                }
2189        }
2190
2191        prefetch(&sk->sk_rmem_alloc);
2192        if (rcu_access_pointer(sk->sk_filter) &&
2193            udp_lib_checksum_complete(skb))
2194                        goto csum_error;
2195
2196        if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
2197                goto drop;
2198
2199        udp_csum_pull_header(skb);
2200
2201        ipv4_pktinfo_prepare(sk, skb);
2202        return __udp_queue_rcv_skb(sk, skb);
2203
2204csum_error:
2205        __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
2206drop:
2207        __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
2208        atomic_inc(&sk->sk_drops);
2209        kfree_skb(skb);
2210        return -1;
2211}
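
/* The UDP-Lite coverage checks above are driven by userspace socket options;
 * a minimal sketch (the coverage values and variable names are illustrative):
 *
 *	int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDPLITE);
 *	int send_cov = 20;	// sender: cover only the first 20 bytes
 *	int recv_cov = 20;	// receiver: demand at least 20 covered bytes
 *
 *	setsockopt(fd, IPPROTO_UDPLITE, UDPLITE_SEND_CSCOV,
 *		   &send_cov, sizeof(send_cov));
 *	setsockopt(fd, IPPROTO_UDPLITE, UDPLITE_RECV_CSCOV,
 *		   &recv_cov, sizeof(recv_cov));
 */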
2212
2213static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
2214{
2215        struct sk_buff *next, *segs;
2216        int ret;
2217
2218        if (likely(!udp_unexpected_gso(sk, skb)))
2219                return udp_queue_rcv_one_skb(sk, skb);
2220
2221        BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET);
2222        __skb_push(skb, -skb_mac_offset(skb));
2223        segs = udp_rcv_segment(sk, skb, true);
2224        skb_list_walk_safe(segs, skb, next) {
2225                __skb_pull(skb, skb_transport_offset(skb));
2226
2227                udp_post_segment_fix_csum(skb);
2228                ret = udp_queue_rcv_one_skb(sk, skb);
2229                if (ret > 0)
2230                        ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret);
2231        }
2232        return 0;
2233}
2234
2235/* For TCP sockets, sk_rx_dst is protected by the socket lock.
2236 * For UDP, we use xchg() to guard against concurrent changes.
2237 */
2238bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
2239{
2240        struct dst_entry *old;
2241
2242        if (dst_hold_safe(dst)) {
2243                old = xchg(&sk->sk_rx_dst, dst);
2244                dst_release(old);
2245                return old != dst;
2246        }
2247        return false;
2248}
2249EXPORT_SYMBOL(udp_sk_rx_dst_set);
2250
2251/*
2252 *      Multicasts and broadcasts go to each listener.
2253 *
2254 *      Note: called only from the BH handler context.
2255 */
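/* When the primary port hash chain is long (more than 10 sockets), the
 * delivery loop below switches to the {addr, port} hash2 chains instead:
 * it first walks the chain for the packet's destination address and then,
 * if different, the chain for INADDR_ANY, so wildcard-bound listeners are
 * still reached.
 */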
2256static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
2257                                    struct udphdr  *uh,
2258                                    __be32 saddr, __be32 daddr,
2259                                    struct udp_table *udptable,
2260                                    int proto)
2261{
2262        struct sock *sk, *first = NULL;
2263        unsigned short hnum = ntohs(uh->dest);
2264        struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
2265        unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
2266        unsigned int offset = offsetof(typeof(*sk), sk_node);
2267        int dif = skb->dev->ifindex;
2268        int sdif = inet_sdif(skb);
2269        struct hlist_node *node;
2270        struct sk_buff *nskb;
2271
2272        if (use_hash2) {
2273                hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
2274                            udptable->mask;
2275                hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask;
2276start_lookup:
2277                hslot = &udptable->hash2[hash2];
2278                offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
2279        }
2280
2281        sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
2282                if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
2283                                         uh->source, saddr, dif, sdif, hnum))
2284                        continue;
2285
2286                if (!first) {
2287                        first = sk;
2288                        continue;
2289                }
2290                nskb = skb_clone(skb, GFP_ATOMIC);
2291
2292                if (unlikely(!nskb)) {
2293                        atomic_inc(&sk->sk_drops);
2294                        __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
2295                                        IS_UDPLITE(sk));
2296                        __UDP_INC_STATS(net, UDP_MIB_INERRORS,
2297                                        IS_UDPLITE(sk));
2298                        continue;
2299                }
2300                if (udp_queue_rcv_skb(sk, nskb) > 0)
2301                        consume_skb(nskb);
2302        }
2303
2304        /* Also lookup *:port if we are using hash2 and haven't done so yet. */
2305        if (use_hash2 && hash2 != hash2_any) {
2306                hash2 = hash2_any;
2307                goto start_lookup;
2308        }
2309
2310        if (first) {
2311                if (udp_queue_rcv_skb(first, skb) > 0)
2312                        consume_skb(skb);
2313        } else {
2314                kfree_skb(skb);
2315                __UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
2316                                proto == IPPROTO_UDPLITE);
2317        }
2318        return 0;
2319}
2320
2321/* Initialize the UDP checksum. If we exit with zero (success),
2322 * CHECKSUM_UNNECESSARY means that no more checks are required.
2323 * Otherwise, csum completion requires checksumming the packet body,
2324 * including the UDP header, and folding it into skb->csum.
2325 */
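/* Also note: for plain UDP over IPv4 a zero uh->check means "no checksum
 * transmitted" (RFC 768), so skb_checksum_init_zero_check() accepts such
 * packets without further validation.
 */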
2326static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
2327                                 int proto)
2328{
2329        int err;
2330
2331        UDP_SKB_CB(skb)->partial_cov = 0;
2332        UDP_SKB_CB(skb)->cscov = skb->len;
2333
2334        if (proto == IPPROTO_UDPLITE) {
2335                err = udplite_checksum_init(skb, uh);
2336                if (err)
2337                        return err;
2338
2339                if (UDP_SKB_CB(skb)->partial_cov) {
2340                        skb->csum = inet_compute_pseudo(skb, proto);
2341                        return 0;
2342                }
2343        }
2344
2345        /* Note, we are only interested in != 0 or == 0, thus the
2346         * force to int.
2347         */
2348        err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
2349                                                        inet_compute_pseudo);
2350        if (err)
2351                return err;
2352
2353        if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) {
2354                /* If SW calculated the value, we know it's bad */
2355                if (skb->csum_complete_sw)
2356                        return 1;
2357
2358                /* HW says the value is bad. Let's validate that.
2359                 * skb->csum is no longer the full packet checksum,
2360                 * so don't treat it as such.
2361                 */
2362                skb_checksum_complete_unset(skb);
2363        }
2364
2365        return 0;
2366}
2367
2368/* wrapper for udp_queue_rcv_skb taking care of csum conversion and
2369 * return code conversion for IP layer consumption
2370 */
2371static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
2372                               struct udphdr *uh)
2373{
2374        int ret;
2375
2376        if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
2377                skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo);
2378
2379        ret = udp_queue_rcv_skb(sk, skb);
2380
2381        /* a return value > 0 means to resubmit the input, but
2382         * the IP layer wants the return to be -protocol, or 0
2383         */
2384        if (ret > 0)
2385                return -ret;
2386        return 0;
2387}
2388
2389/*
2390 *      All we need to do is get the socket, and then do a checksum.
2391 */
2392
2393int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
2394                   int proto)
2395{
2396        struct sock *sk;
2397        struct udphdr *uh;
2398        unsigned short ulen;
2399        struct rtable *rt = skb_rtable(skb);
2400        __be32 saddr, daddr;
2401        struct net *net = dev_net(skb->dev);
2402        bool refcounted;
2403
2404        /*
2405         *  Validate the packet.
2406         */
2407        if (!pskb_may_pull(skb, sizeof(struct udphdr)))
2408                goto drop;              /* No space for header. */
2409
2410        uh   = udp_hdr(skb);
2411        ulen = ntohs(uh->len);
2412        saddr = ip_hdr(skb)->saddr;
2413        daddr = ip_hdr(skb)->daddr;
2414
2415        if (ulen > skb->len)
2416                goto short_packet;
2417
2418        if (proto == IPPROTO_UDP) {
2419                /* UDP validates ulen. */
2420                if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
2421                        goto short_packet;
2422                uh = udp_hdr(skb);
2423        }
2424
2425        if (udp4_csum_init(skb, uh, proto))
2426                goto csum_error;
2427
2428        sk = skb_steal_sock(skb, &refcounted);
2429        if (sk) {
2430                struct dst_entry *dst = skb_dst(skb);
2431                int ret;
2432
2433                if (unlikely(sk->sk_rx_dst != dst))
2434                        udp_sk_rx_dst_set(sk, dst);
2435
2436                ret = udp_unicast_rcv_skb(sk, skb, uh);
2437                if (refcounted)
2438                        sock_put(sk);
2439                return ret;
2440        }
2441
2442        if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
2443                return __udp4_lib_mcast_deliver(net, skb, uh,
2444                                                saddr, daddr, udptable, proto);
2445
2446        sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
2447        if (sk)
2448                return udp_unicast_rcv_skb(sk, skb, uh);
2449
2450        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2451                goto drop;
2452        nf_reset_ct(skb);
2453
2454        /* No socket. Drop the packet silently if the checksum is wrong */
2455        if (udp_lib_checksum_complete(skb))
2456                goto csum_error;
2457
2458        __UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
2459        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
2460
2461        /*
2462         * Hmm.  We got a UDP packet to a port we
2463         * don't want to listen on.  Ignore it.
2464         */
2465        kfree_skb(skb);
2466        return 0;
2467
2468short_packet:
2469        net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
2470                            proto == IPPROTO_UDPLITE ? "Lite" : "",
2471                            &saddr, ntohs(uh->source),
2472                            ulen, skb->len,
2473                            &daddr, ntohs(uh->dest));
2474        goto drop;
2475
2476csum_error:
2477        /*
2478         * RFC1122: OK.  Discards the bad packet silently (as far as
2479         * the network is concerned, anyway) as per 4.1.3.4 (MUST).
2480         */
2481        net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
2482                            proto == IPPROTO_UDPLITE ? "Lite" : "",
2483                            &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
2484                            ulen);
2485        __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
2486drop:
2487        __UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
2488        kfree_skb(skb);
2489        return 0;
2490}
2491
2492/* We can only early demux multicast if there is a single matching socket.
2493 * If more than one socket is found, return NULL.
2494 */
2495static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
2496                                                  __be16 loc_port, __be32 loc_addr,
2497                                                  __be16 rmt_port, __be32 rmt_addr,
2498                                                  int dif, int sdif)
2499{
2500        struct sock *sk, *result;
2501        unsigned short hnum = ntohs(loc_port);
2502        unsigned int slot = udp_hashfn(net, hnum, udp_table.mask);
2503        struct udp_hslot *hslot = &udp_table.hash[slot];
2504
2505        /* Do not bother scanning a too big list */
2506        if (hslot->count > 10)
2507                return NULL;
2508
2509        result = NULL;
2510        sk_for_each_rcu(sk, &hslot->head) {
2511                if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
2512                                        rmt_port, rmt_addr, dif, sdif, hnum)) {
2513                        if (result)
2514                                return NULL;
2515                        result = sk;
2516                }
2517        }
2518
2519        return result;
2520}
2521
2522/* For unicast we should only early demux connected sockets or we can
2523 * break forwarding setups.  The chains here can be long so only check
2524 * if the first socket is an exact match and if not move on.
2525 */
2526static struct sock *__udp4_lib_demux_lookup(struct net *net,
2527                                            __be16 loc_port, __be32 loc_addr,
2528                                            __be16 rmt_port, __be32 rmt_addr,
2529                                            int dif, int sdif)
2530{
2531        unsigned short hnum = ntohs(loc_port);
2532        unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
2533        unsigned int slot2 = hash2 & udp_table.mask;
2534        struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
2535        INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
2536        const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
2537        struct sock *sk;
2538
2539        udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
2540                if (INET_MATCH(sk, net, acookie, rmt_addr,
2541                               loc_addr, ports, dif, sdif))
2542                        return sk;
2543                /* Only check first socket in chain */
2544                break;
2545        }
2546        return NULL;
2547}
2548
2549int udp_v4_early_demux(struct sk_buff *skb)
2550{
2551        struct net *net = dev_net(skb->dev);
2552        struct in_device *in_dev = NULL;
2553        const struct iphdr *iph;
2554        const struct udphdr *uh;
2555        struct sock *sk = NULL;
2556        struct dst_entry *dst;
2557        int dif = skb->dev->ifindex;
2558        int sdif = inet_sdif(skb);
2559        int ours;
2560
2561        /* validate the packet */
2562        if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
2563                return 0;
2564
2565        iph = ip_hdr(skb);
2566        uh = udp_hdr(skb);
2567
2568        if (skb->pkt_type == PACKET_MULTICAST) {
2569                in_dev = __in_dev_get_rcu(skb->dev);
2570
2571                if (!in_dev)
2572                        return 0;
2573
2574                ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
2575                                       iph->protocol);
2576                if (!ours)
2577                        return 0;
2578
2579                sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
2580                                                   uh->source, iph->saddr,
2581                                                   dif, sdif);
2582        } else if (skb->pkt_type == PACKET_HOST) {
2583                sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
2584                                             uh->source, iph->saddr, dif, sdif);
2585        }
2586
2587        if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
2588                return 0;
2589
2590        skb->sk = sk;
2591        skb->destructor = sock_efree;
2592        dst = READ_ONCE(sk->sk_rx_dst);
2593
2594        if (dst)
2595                dst = dst_check(dst, 0);
2596        if (dst) {
2597                u32 itag = 0;
2598
2599                /* set noref for now.
2600                 * any place which wants to hold dst has to call
2601                 * dst_hold_safe()
2602                 */
2603                skb_dst_set_noref(skb, dst);
2604
2605                /* for unconnected multicast sockets we need to validate
2606                 * the source on each packet
2607                 */
2608                if (!inet_sk(sk)->inet_daddr && in_dev)
2609                        return ip_mc_validate_source(skb, iph->daddr,
2610                                                     iph->saddr,
2611                                                     iph->tos & IPTOS_RT_MASK,
2612                                                     skb->dev, in_dev, &itag);
2613        }
2614        return 0;
2615}
2616
2617int udp_rcv(struct sk_buff *skb)
2618{
2619        return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
2620}
2621
2622void udp_destroy_sock(struct sock *sk)
2623{
2624        struct udp_sock *up = udp_sk(sk);
2625        bool slow = lock_sock_fast(sk);
2626
2627        /* protects from races with udp_abort() */
2628        sock_set_flag(sk, SOCK_DEAD);
2629        udp_flush_pending_frames(sk);
2630        unlock_sock_fast(sk, slow);
2631        if (static_branch_unlikely(&udp_encap_needed_key)) {
2632                if (up->encap_type) {
2633                        void (*encap_destroy)(struct sock *sk);
2634                        encap_destroy = READ_ONCE(up->encap_destroy);
2635                        if (encap_destroy)
2636                                encap_destroy(sk);
2637                }
2638                if (up->encap_enabled)
2639                        static_branch_dec(&udp_encap_needed_key);
2640        }
2641}
2642
2643/*
2644 *      Socket option code for UDP
2645 */
2646int udp_lib_setsockopt(struct sock *sk, int level, int optname,
2647                       sockptr_t optval, unsigned int optlen,
2648                       int (*push_pending_frames)(struct sock *))
2649{
2650        struct udp_sock *up = udp_sk(sk);
2651        int val, valbool;
2652        int err = 0;
2653        int is_udplite = IS_UDPLITE(sk);
2654
2655        if (optlen < sizeof(int))
2656                return -EINVAL;
2657
2658        if (copy_from_sockptr(&val, optval, sizeof(val)))
2659                return -EFAULT;
2660
2661        valbool = val ? 1 : 0;
2662
2663        switch (optname) {
2664        case UDP_CORK:
2665                if (val != 0) {
2666                        WRITE_ONCE(up->corkflag, 1);
2667                } else {
2668                        WRITE_ONCE(up->corkflag, 0);
2669                        lock_sock(sk);
2670                        push_pending_frames(sk);
2671                        release_sock(sk);
2672                }
2673                break;
2674
2675        case UDP_ENCAP:
2676                switch (val) {
2677                case 0:
2678#ifdef CONFIG_XFRM
2679                case UDP_ENCAP_ESPINUDP:
2680                case UDP_ENCAP_ESPINUDP_NON_IKE:
2681#if IS_ENABLED(CONFIG_IPV6)
2682                        if (sk->sk_family == AF_INET6)
2683                                up->encap_rcv = ipv6_stub->xfrm6_udp_encap_rcv;
2684                        else
2685#endif
2686                                up->encap_rcv = xfrm4_udp_encap_rcv;
2687#endif
2688                        fallthrough;
2689                case UDP_ENCAP_L2TPINUDP:
2690                        up->encap_type = val;
2691                        lock_sock(sk);
2692                        udp_tunnel_encap_enable(sk->sk_socket);
2693                        release_sock(sk);
2694                        break;
2695                default:
2696                        err = -ENOPROTOOPT;
2697                        break;
2698                }
2699                break;
2700
2701        case UDP_NO_CHECK6_TX:
2702                up->no_check6_tx = valbool;
2703                break;
2704
2705        case UDP_NO_CHECK6_RX:
2706                up->no_check6_rx = valbool;
2707                break;
2708
2709        case UDP_SEGMENT:
2710                if (val < 0 || val > USHRT_MAX)
2711                        return -EINVAL;
2712                WRITE_ONCE(up->gso_size, val);
2713                break;
2714
2715        case UDP_GRO:
2716                lock_sock(sk);
2717
2718                /* when enabling GRO, accept the related GSO packet type */
2719                if (valbool)
2720                        udp_tunnel_encap_enable(sk->sk_socket);
2721                up->gro_enabled = valbool;
2722                up->accept_udp_l4 = valbool;
2723                release_sock(sk);
2724                break;
2725
2726        /*
2727         *      UDP-Lite's partial checksum coverage (RFC 3828).
2728         */
2729        /* The sender sets the actual checksum coverage length via this option.
2730         * The case coverage > packet length is handled by the send module. */
2731        case UDPLITE_SEND_CSCOV:
2732                if (!is_udplite)         /* Disable the option on UDP sockets */
2733                        return -ENOPROTOOPT;
2734                if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
2735                        val = 8;
2736                else if (val > USHRT_MAX)
2737                        val = USHRT_MAX;
2738                up->pcslen = val;
2739                up->pcflag |= UDPLITE_SEND_CC;
2740                break;
2741
2742        /* The receiver specifies a minimum checksum coverage value. To make
2743         * sense, this should be set to at least 8 (as done below). If zero is
2744         * used, this again means full checksum coverage.                     */
2745        case UDPLITE_RECV_CSCOV:
2746                if (!is_udplite)         /* Disable the option on UDP sockets */
2747                        return -ENOPROTOOPT;
2748                if (val != 0 && val < 8) /* Avoid silly minimal values.       */
2749                        val = 8;
2750                else if (val > USHRT_MAX)
2751                        val = USHRT_MAX;
2752                up->pcrlen = val;
2753                up->pcflag |= UDPLITE_RECV_CC;
2754                break;
2755
2756        default:
2757                err = -ENOPROTOOPT;
2758                break;
2759        }
2760
2761        return err;
2762}
2763EXPORT_SYMBOL(udp_lib_setsockopt);
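
/* A minimal userspace sketch of the segmentation/aggregation knobs handled
 * above (assuming a connected UDP socket 'fd'; the 1400-byte size is just an
 * example value): with UDP_SEGMENT set, one large send() is split by the
 * kernel into multiple wire datagrams of at most gso_size bytes each, and
 * UDP_GRO lets recvmsg() return aggregated payloads.
 *
 *	int gso_size = 1400;
 *	int on = 1;
 *
 *	setsockopt(fd, SOL_UDP, UDP_SEGMENT, &gso_size, sizeof(gso_size));
 *	setsockopt(fd, SOL_UDP, UDP_GRO, &on, sizeof(on));
 */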
2764
2765int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
2766                   unsigned int optlen)
2767{
2768        if (level == SOL_UDP  ||  level == SOL_UDPLITE)
2769                return udp_lib_setsockopt(sk, level, optname,
2770                                          optval, optlen,
2771                                          udp_push_pending_frames);
2772        return ip_setsockopt(sk, level, optname, optval, optlen);
2773}
2774
2775int udp_lib_getsockopt(struct sock *sk, int level, int optname,
2776                       char __user *optval, int __user *optlen)
2777{
2778        struct udp_sock *up = udp_sk(sk);
2779        int val, len;
2780
2781        if (get_user(len, optlen))
2782                return -EFAULT;
2783
2784        len = min_t(unsigned int, len, sizeof(int));
2785
2786        if (len < 0)
2787                return -EINVAL;
2788
2789        switch (optname) {
2790        case UDP_CORK:
2791                val = READ_ONCE(up->corkflag);
2792                break;
2793
2794        case UDP_ENCAP:
2795                val = up->encap_type;
2796                break;
2797
2798        case UDP_NO_CHECK6_TX:
2799                val = up->no_check6_tx;
2800                break;
2801
2802        case UDP_NO_CHECK6_RX:
2803                val = up->no_check6_rx;
2804                break;
2805
2806        case UDP_SEGMENT:
2807                val = READ_ONCE(up->gso_size);
2808                break;
2809
2810        case UDP_GRO:
2811                val = up->gro_enabled;
2812                break;
2813
2814        /* The following two cannot be changed on UDP sockets; the return is
2815         * always 0 (which corresponds to the full checksum coverage of UDP). */
2816        case UDPLITE_SEND_CSCOV:
2817                val = up->pcslen;
2818                break;
2819
2820        case UDPLITE_RECV_CSCOV:
2821                val = up->pcrlen;
2822                break;
2823
2824        default:
2825                return -ENOPROTOOPT;
2826        }
2827
2828        if (put_user(len, optlen))
2829                return -EFAULT;
2830        if (copy_to_user(optval, &val, len))
2831                return -EFAULT;
2832        return 0;
2833}
2834EXPORT_SYMBOL(udp_lib_getsockopt);
2835
2836int udp_getsockopt(struct sock *sk, int level, int optname,
2837                   char __user *optval, int __user *optlen)
2838{
2839        if (level == SOL_UDP  ||  level == SOL_UDPLITE)
2840                return udp_lib_getsockopt(sk, level, optname, optval, optlen);
2841        return ip_getsockopt(sk, level, optname, optval, optlen);
2842}
2843
2844/**
2845 *      udp_poll - wait for a UDP event.
2846 *      @file: - file struct
2847 *      @sock: - socket
2848 *      @wait: - poll table
2849 *
2850 *      This is the same as datagram poll, except for the special case of
2851 *      blocking sockets. If an application is using a blocking fd
2852 *      and a packet with a checksum error is in the queue,
2853 *      it could get a return from select() indicating data is available
2854 *      but then block when reading it. Add special-case code
2855 *      to work around these arguably broken applications.
2856 */
2857__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
2858{
2859        __poll_t mask = datagram_poll(file, sock, wait);
2860        struct sock *sk = sock->sk;
2861
2862        if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
2863                mask |= EPOLLIN | EPOLLRDNORM;
2864
2865        /* Check for false positives due to checksum errors */
2866        if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
2867            !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
2868                mask &= ~(EPOLLIN | EPOLLRDNORM);
2869
2870        /* psock ingress_msg queue should not contain any bad checksum frames */
2871        if (sk_is_readable(sk))
2872                mask |= EPOLLIN | EPOLLRDNORM;
2873        return mask;
2874
2875}
2876EXPORT_SYMBOL(udp_poll);
2877
2878int udp_abort(struct sock *sk, int err)
2879{
2880        lock_sock(sk);
2881
2882        /* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing
2883         * with close()
2884         */
2885        if (sock_flag(sk, SOCK_DEAD))
2886                goto out;
2887
2888        sk->sk_err = err;
2889        sk_error_report(sk);
2890        __udp_disconnect(sk, 0);
2891
2892out:
2893        release_sock(sk);
2894
2895        return 0;
2896}
2897EXPORT_SYMBOL_GPL(udp_abort);
2898
2899struct proto udp_prot = {
2900        .name                   = "UDP",
2901        .owner                  = THIS_MODULE,
2902        .close                  = udp_lib_close,
2903        .pre_connect            = udp_pre_connect,
2904        .connect                = ip4_datagram_connect,
2905        .disconnect             = udp_disconnect,
2906        .ioctl                  = udp_ioctl,
2907        .init                   = udp_init_sock,
2908        .destroy                = udp_destroy_sock,
2909        .setsockopt             = udp_setsockopt,
2910        .getsockopt             = udp_getsockopt,
2911        .sendmsg                = udp_sendmsg,
2912        .recvmsg                = udp_recvmsg,
2913        .sendpage               = udp_sendpage,
2914        .release_cb             = ip4_datagram_release_cb,
2915        .hash                   = udp_lib_hash,
2916        .unhash                 = udp_lib_unhash,
2917        .rehash                 = udp_v4_rehash,
2918        .get_port               = udp_v4_get_port,
2919#ifdef CONFIG_BPF_SYSCALL
2920        .psock_update_sk_prot   = udp_bpf_update_proto,
2921#endif
2922        .memory_allocated       = &udp_memory_allocated,
2923        .sysctl_mem             = sysctl_udp_mem,
2924        .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
2925        .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
2926        .obj_size               = sizeof(struct udp_sock),
2927        .h.udp_table            = &udp_table,
2928        .diag_destroy           = udp_abort,
2929};
2930EXPORT_SYMBOL(udp_prot);
2931
2932/* ------------------------------------------------------------------------ */
2933#ifdef CONFIG_PROC_FS
2934
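/* /proc/net/udp iteration: udp_get_first() scans the hash buckets and returns
 * with the matching bucket's lock held, udp_get_next() keeps walking that
 * bucket and only drops the lock when it moves on to the next one, and
 * udp_seq_stop() releases whatever bucket lock is still held when the
 * sequence ends.
 */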
2935static struct sock *udp_get_first(struct seq_file *seq, int start)
2936{
2937        struct sock *sk;
2938        struct udp_seq_afinfo *afinfo;
2939        struct udp_iter_state *state = seq->private;
2940        struct net *net = seq_file_net(seq);
2941
2942        if (state->bpf_seq_afinfo)
2943                afinfo = state->bpf_seq_afinfo;
2944        else
2945                afinfo = PDE_DATA(file_inode(seq->file));
2946
2947        for (state->bucket = start; state->bucket <= afinfo->udp_table->mask;
2948             ++state->bucket) {
2949                struct udp_hslot *hslot = &afinfo->udp_table->hash[state->bucket];
2950
2951                if (hlist_empty(&hslot->head))
2952                        continue;
2953
2954                spin_lock_bh(&hslot->lock);
2955                sk_for_each(sk, &hslot->head) {
2956                        if (!net_eq(sock_net(sk), net))
2957                                continue;
2958                        if (afinfo->family == AF_UNSPEC ||
2959                            sk->sk_family == afinfo->family)
2960                                goto found;
2961                }
2962                spin_unlock_bh(&hslot->lock);
2963        }
2964        sk = NULL;
2965found:
2966        return sk;
2967}
2968
2969static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
2970{
2971        struct udp_seq_afinfo *afinfo;
2972        struct udp_iter_state *state = seq->private;
2973        struct net *net = seq_file_net(seq);
2974
2975        if (state->bpf_seq_afinfo)
2976                afinfo = state->bpf_seq_afinfo;
2977        else
2978                afinfo = PDE_DATA(file_inode(seq->file));
2979
2980        do {
2981                sk = sk_next(sk);
2982        } while (sk && (!net_eq(sock_net(sk), net) ||
2983                        (afinfo->family != AF_UNSPEC &&
2984                         sk->sk_family != afinfo->family)));
2985
2986        if (!sk) {
2987                if (state->bucket <= afinfo->udp_table->mask)
2988                        spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);
2989                return udp_get_first(seq, state->bucket + 1);
2990        }
2991        return sk;
2992}
2993
2994static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
2995{
2996        struct sock *sk = udp_get_first(seq, 0);
2997
2998        if (sk)
2999                while (pos && (sk = udp_get_next(seq, sk)) != NULL)
3000                        --pos;
3001        return pos ? NULL : sk;
3002}
3003
3004void *udp_seq_start(struct seq_file *seq, loff_t *pos)
3005{
3006        struct udp_iter_state *state = seq->private;
3007        state->bucket = MAX_UDP_PORTS;
3008
3009        return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
3010}
3011EXPORT_SYMBOL(udp_seq_start);
3012
3013void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3014{
3015        struct sock *sk;
3016
3017        if (v == SEQ_START_TOKEN)
3018                sk = udp_get_idx(seq, 0);
3019        else
3020                sk = udp_get_next(seq, v);
3021
3022        ++*pos;
3023        return sk;
3024}
3025EXPORT_SYMBOL(udp_seq_next);
3026
3027void udp_seq_stop(struct seq_file *seq, void *v)
3028{
3029        struct udp_seq_afinfo *afinfo;
3030        struct udp_iter_state *state = seq->private;
3031
3032        if (state->bpf_seq_afinfo)
3033                afinfo = state->bpf_seq_afinfo;
3034        else
3035                afinfo = PDE_DATA(file_inode(seq->file));
3036
3037        if (state->bucket <= afinfo->udp_table->mask)
3038                spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);
3039}
3040EXPORT_SYMBOL(udp_seq_stop);
3041
3042/* ------------------------------------------------------------------------ */
3043static void udp4_format_sock(struct sock *sp, struct seq_file *f,
3044                int bucket)
3045{
3046        struct inet_sock *inet = inet_sk(sp);
3047        __be32 dest = inet->inet_daddr;
3048        __be32 src  = inet->inet_rcv_saddr;
3049        __u16 destp       = ntohs(inet->inet_dport);
3050        __u16 srcp        = ntohs(inet->inet_sport);
3051
3052        seq_printf(f, "%5d: %08X:%04X %08X:%04X"
3053                " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u",
3054                bucket, src, srcp, dest, destp, sp->sk_state,
3055                sk_wmem_alloc_get(sp),
3056                udp_rqueue_get(sp),
3057                0, 0L, 0,
3058                from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
3059                0, sock_i_ino(sp),
3060                refcount_read(&sp->sk_refcnt), sp,
3061                atomic_read(&sp->sk_drops));
3062}
3063
3064int udp4_seq_show(struct seq_file *seq, void *v)
3065{
3066        seq_setwidth(seq, 127);
3067        if (v == SEQ_START_TOKEN)
3068                seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
3069                           "rx_queue tr tm->when retrnsmt   uid  timeout "
3070                           "inode ref pointer drops");
3071        else {
3072                struct udp_iter_state *state = seq->private;
3073
3074                udp4_format_sock(v, seq, state->bucket);
3075        }
3076        seq_pad(seq, '\n');
3077        return 0;
3078}
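
/* The field layout follows the seq_printf() format in udp4_format_sock()
 * above.  A minimal user-space sketch of consuming it, extracting the local
 * port and rx_queue columns (illustrative only, not part of the kernel
 * build):
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		FILE *f = fopen("/proc/net/udp", "r");
 *		char line[256];
 *		unsigned int lport, rx;
 *
 *		if (!f)
 *			return 1;
 *		while (fgets(line, sizeof(line), f))
 *			if (sscanf(line, "%*d: %*x:%x %*x:%*x %*x %*x:%x",
 *				   &lport, &rx) == 2)
 *				printf("port %u rx_queue %u\n", lport, rx);
 *		fclose(f);
 *		return 0;
 *	}
 */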
3079
3080#ifdef CONFIG_BPF_SYSCALL
3081struct bpf_iter__udp {
3082        __bpf_md_ptr(struct bpf_iter_meta *, meta);
3083        __bpf_md_ptr(struct udp_sock *, udp_sk);
3084        uid_t uid __aligned(8);
3085        int bucket __aligned(8);
3086};
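
/* The context above is what a BPF "udp" iterator program sees.  A rough
 * sketch of such a program, along the lines of the bpf_iter selftests; it
 * assumes vmlinux.h provides struct bpf_iter__udp and that the libbpf headers
 * in use provide BPF_SEQ_PRINTF (otherwise call bpf_seq_printf() directly):
 *
 *	#include "vmlinux.h"
 *	#include <bpf/bpf_helpers.h>
 *	#include <bpf/bpf_tracing.h>
 *
 *	char _license[] SEC("license") = "GPL";
 *
 *	SEC("iter/udp")
 *	int dump_udp(struct bpf_iter__udp *ctx)
 *	{
 *		struct seq_file *seq = ctx->meta->seq;
 *		struct udp_sock *udp_sk = ctx->udp_sk;
 *
 *		if (!udp_sk)
 *			return 0;
 *
 *		BPF_SEQ_PRINTF(seq, "bucket %d uid %u\n", ctx->bucket, ctx->uid);
 *		return 0;
 *	}
 */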
3087
3088static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3089                             struct udp_sock *udp_sk, uid_t uid, int bucket)
3090{
3091        struct bpf_iter__udp ctx;
3092
3093        meta->seq_num--;  /* skip SEQ_START_TOKEN */
3094        ctx.meta = meta;
3095        ctx.udp_sk = udp_sk;
3096        ctx.uid = uid;
3097        ctx.bucket = bucket;
3098        return bpf_iter_run_prog(prog, &ctx);
3099}
3100
3101static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
3102{
3103        struct udp_iter_state *state = seq->private;
3104        struct bpf_iter_meta meta;
3105        struct bpf_prog *prog;
3106        struct sock *sk = v;
3107        uid_t uid;
3108
3109        if (v == SEQ_START_TOKEN)
3110                return 0;
3111
3112        uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3113        meta.seq = seq;
3114        prog = bpf_iter_get_info(&meta, false);
3115        return udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
3116}
3117
3118static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
3119{
3120        struct bpf_iter_meta meta;
3121        struct bpf_prog *prog;
3122
3123        if (!v) {
3124                meta.seq = seq;
3125                prog = bpf_iter_get_info(&meta, true);
3126                if (prog)
3127                        (void)udp_prog_seq_show(prog, &meta, v, 0, 0);
3128        }
3129
3130        udp_seq_stop(seq, v);
3131}
3132
3133static const struct seq_operations bpf_iter_udp_seq_ops = {
3134        .start          = udp_seq_start,
3135        .next           = udp_seq_next,
3136        .stop           = bpf_iter_udp_seq_stop,
3137        .show           = bpf_iter_udp_seq_show,
3138};
3139#endif
3140
3141const struct seq_operations udp_seq_ops = {
3142        .start          = udp_seq_start,
3143        .next           = udp_seq_next,
3144        .stop           = udp_seq_stop,
3145        .show           = udp4_seq_show,
3146};
3147EXPORT_SYMBOL(udp_seq_ops);
3148
3149static struct udp_seq_afinfo udp4_seq_afinfo = {
3150        .family         = AF_INET,
3151        .udp_table      = &udp_table,
3152};
3153
3154static int __net_init udp4_proc_init_net(struct net *net)
3155{
3156        if (!proc_create_net_data("udp", 0444, net->proc_net, &udp_seq_ops,
3157                        sizeof(struct udp_iter_state), &udp4_seq_afinfo))
3158                return -ENOMEM;
3159        return 0;
3160}
3161
3162static void __net_exit udp4_proc_exit_net(struct net *net)
3163{
3164        remove_proc_entry("udp", net->proc_net);
3165}
3166
3167static struct pernet_operations udp4_net_ops = {
3168        .init = udp4_proc_init_net,
3169        .exit = udp4_proc_exit_net,
3170};
3171
3172int __init udp4_proc_init(void)
3173{
3174        return register_pernet_subsys(&udp4_net_ops);
3175}
3176
3177void udp4_proc_exit(void)
3178{
3179        unregister_pernet_subsys(&udp4_net_ops);
3180}
3181#endif /* CONFIG_PROC_FS */
3182
3183static __initdata unsigned long uhash_entries;
3184static int __init set_uhash_entries(char *str)
3185{
3186        ssize_t ret;
3187
3188        if (!str)
3189                return 0;
3190
3191        ret = kstrtoul(str, 0, &uhash_entries);
3192        if (ret)
3193                return 0;
3194
3195        if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
3196                uhash_entries = UDP_HTABLE_SIZE_MIN;
3197        return 1;
3198}
3199__setup("uhash_entries=", set_uhash_entries);
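
/* For example, booting with "uhash_entries=65536" on the kernel command line
 * requests a UDP hash table of that many slots from udp_table_init() below
 * (values below UDP_HTABLE_SIZE_MIN are rounded up); when the parameter is
 * absent, alloc_large_system_hash() sizes the table from available memory.
 */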
3200
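/* The primary table (hash) is chained by local port; hash2, carved out of the
 * same allocation right behind it (hence the 2 * sizeof(struct udp_hslot)
 * element size), is a secondary table keyed on local address and port to
 * speed up lookups on busy ports.
 */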
3201void __init udp_table_init(struct udp_table *table, const char *name)
3202{
3203        unsigned int i;
3204
3205        table->hash = alloc_large_system_hash(name,
3206                                              2 * sizeof(struct udp_hslot),
3207                                              uhash_entries,
3208                                              21, /* one slot per 2 MB */
3209                                              0,
3210                                              &table->log,
3211                                              &table->mask,
3212                                              UDP_HTABLE_SIZE_MIN,
3213                                              64 * 1024);
3214
3215        table->hash2 = table->hash + (table->mask + 1);
3216        for (i = 0; i <= table->mask; i++) {
3217                INIT_HLIST_HEAD(&table->hash[i].head);
3218                table->hash[i].count = 0;
3219                spin_lock_init(&table->hash[i].lock);
3220        }
3221        for (i = 0; i <= table->mask; i++) {
3222                INIT_HLIST_HEAD(&table->hash2[i].head);
3223                table->hash2[i].count = 0;
3224                spin_lock_init(&table->hash2[i].lock);
3225        }
3226}
3227
3228u32 udp_flow_hashrnd(void)
3229{
3230        static u32 hashrnd __read_mostly;
3231
3232        net_get_random_once(&hashrnd, sizeof(hashrnd));
3233
3234        return hashrnd;
3235}
3236EXPORT_SYMBOL(udp_flow_hashrnd);
3237
3238static void __udp_sysctl_init(struct net *net)
3239{
3240        net->ipv4.sysctl_udp_rmem_min = SK_MEM_QUANTUM;
3241        net->ipv4.sysctl_udp_wmem_min = SK_MEM_QUANTUM;
3242
3243#ifdef CONFIG_NET_L3_MASTER_DEV
3244        net->ipv4.sysctl_udp_l3mdev_accept = 0;
3245#endif
3246}
3247
3248static int __net_init udp_sysctl_init(struct net *net)
3249{
3250        __udp_sysctl_init(net);
3251        return 0;
3252}
3253
3254static struct pernet_operations __net_initdata udp_sysctl_ops = {
3255        .init   = udp_sysctl_init,
3256};
3257
3258#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3259DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
3260                     struct udp_sock *udp_sk, uid_t uid, int bucket)
3261
3262static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
3263{
3264        struct udp_iter_state *st = priv_data;
3265        struct udp_seq_afinfo *afinfo;
3266        int ret;
3267
3268        afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN);
3269        if (!afinfo)
3270                return -ENOMEM;
3271
3272        afinfo->family = AF_UNSPEC;
3273        afinfo->udp_table = &udp_table;
3274        st->bpf_seq_afinfo = afinfo;
3275        ret = bpf_iter_init_seq_net(priv_data, aux);
3276        if (ret)
3277                kfree(afinfo);
3278        return ret;
3279}
3280
3281static void bpf_iter_fini_udp(void *priv_data)
3282{
3283        struct udp_iter_state *st = priv_data;
3284
3285        kfree(st->bpf_seq_afinfo);
3286        bpf_iter_fini_seq_net(priv_data);
3287}
3288
3289static const struct bpf_iter_seq_info udp_seq_info = {
3290        .seq_ops                = &bpf_iter_udp_seq_ops,
3291        .init_seq_private       = bpf_iter_init_udp,
3292        .fini_seq_private       = bpf_iter_fini_udp,
3293        .seq_priv_size          = sizeof(struct udp_iter_state),
3294};
3295
3296static struct bpf_iter_reg udp_reg_info = {
3297        .target                 = "udp",
3298        .ctx_arg_info_size      = 1,
3299        .ctx_arg_info           = {
3300                { offsetof(struct bpf_iter__udp, udp_sk),
3301                  PTR_TO_BTF_ID_OR_NULL },
3302        },
3303        .seq_info               = &udp_seq_info,
3304};
3305
3306static void __init bpf_iter_register(void)
3307{
3308        udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP];
3309        if (bpf_iter_reg_target(&udp_reg_info))
3310                pr_warn("Warning: could not register bpf iterator udp\n");
3311}
3312#endif
3313
3314void __init udp_init(void)
3315{
3316        unsigned long limit;
3317        unsigned int i;
3318
3319        udp_table_init(&udp_table, "UDP");
3320        limit = nr_free_buffer_pages() / 8;
3321        limit = max(limit, 128UL);
3322        sysctl_udp_mem[0] = limit / 4 * 3;
3323        sysctl_udp_mem[1] = limit;
3324        sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
3325
3326        __udp_sysctl_init(&init_net);
3327
3328        /* 16 spinlocks per cpu */
3329        udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
3330        udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
3331                                GFP_KERNEL);
3332        if (!udp_busylocks)
3333                panic("UDP: failed to alloc udp_busylocks\n");
3334        for (i = 0; i < (1U << udp_busylocks_log); i++)
3335                spin_lock_init(udp_busylocks + i);
3336
3337        if (register_pernet_subsys(&udp_sysctl_ops))
3338                panic("UDP: failed to init sysctl parameters.\n");
3339
3340#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3341        bpf_iter_register();
3342#endif
3343}
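
/* A rough worked example of the sizing above, assuming 4 KiB pages and about
 * 4 GiB of lowmem (~1M buffer pages): limit = 1M / 8 = 128K pages, so
 * udp_mem ends up roughly {96K, 128K, 192K} pages, i.e. about 384 MiB /
 * 512 MiB / 768 MiB.  With 16 possible CPUs, udp_busylocks_log becomes
 * ilog2(16) + 4 = 8, i.e. 256 busylock spinlocks (16 per CPU).
 */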
3344