linux/net/ipv4/udp.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              The User Datagram Protocol (UDP).
   7 *
   8 * Authors:     Ross Biro
   9 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  11 *              Alan Cox, <alan@lxorguk.ukuu.org.uk>
  12 *              Hirokazu Takahashi, <taka@valinux.co.jp>
  13 *
  14 * Fixes:
  15 *              Alan Cox        :       verify_area() calls
  16 *              Alan Cox        :       stopped close while in use off icmp
  17 *                                      messages. Not a fix but a botch that
  18 *                                      for udp at least is 'valid'.
  19 *              Alan Cox        :       Fixed icmp handling properly
  20 *              Alan Cox        :       Correct error for oversized datagrams
  21 *              Alan Cox        :       Tidied select() semantics.
  22 *              Alan Cox        :       udp_err() fixed properly, also now
  23 *                                      select and read wake correctly on errors
  24 *              Alan Cox        :       udp_send verify_area moved to avoid mem leak
  25 *              Alan Cox        :       UDP can count its memory
  26 *              Alan Cox        :       send to an unknown connection causes
  27 *                                      an ECONNREFUSED off the icmp, but
  28 *                                      does NOT close.
  29 *              Alan Cox        :       Switched to new sk_buff handlers. No more backlog!
  30 *              Alan Cox        :       Using generic datagram code. Even smaller and the PEEK
  31 *                                      bug no longer crashes it.
  32 *              Fred Van Kempen :       Net2e support for sk->broadcast.
  33 *              Alan Cox        :       Uses skb_free_datagram
  34 *              Alan Cox        :       Added get/set sockopt support.
  35 *              Alan Cox        :       Broadcasting without option set returns EACCES.
  36 *              Alan Cox        :       No wakeup calls. Instead we now use the callbacks.
  37 *              Alan Cox        :       Use ip_tos and ip_ttl
  38 *              Alan Cox        :       SNMP Mibs
  39 *              Alan Cox        :       MSG_DONTROUTE, and 0.0.0.0 support.
  40 *              Matt Dillon     :       UDP length checks.
  41 *              Alan Cox        :       Smarter af_inet used properly.
  42 *              Alan Cox        :       Use new kernel side addressing.
  43 *              Alan Cox        :       Incorrect return on truncated datagram receive.
  44 *      Arnt Gulbrandsen        :       New udp_send and stuff
  45 *              Alan Cox        :       Cache last socket
  46 *              Alan Cox        :       Route cache
  47 *              Jon Peatfield   :       Minor efficiency fix to sendto().
  48 *              Mike Shaver     :       RFC1122 checks.
  49 *              Alan Cox        :       Nonblocking error fix.
  50 *      Willy Konynenberg       :       Transparent proxying support.
  51 *              Mike McLagan    :       Routing by source
  52 *              David S. Miller :       New socket lookup architecture.
  53 *                                      Last socket cache retained as it
  54 *                                      does have a high hit rate.
  55 *              Olaf Kirch      :       Don't linearise iovec on sendmsg.
  56 *              Andi Kleen      :       Some cleanups, cache destination entry
  57 *                                      for connect.
  58 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  59 *              Melvin Smith    :       Check msg_name not msg_namelen in sendto(),
  60 *                                      return ENOTCONN for unconnected sockets (POSIX)
  61 *              Janos Farkas    :       don't deliver multi/broadcasts to a different
  62 *                                      bound-to-device socket
  63 *      Hirokazu Takahashi      :       HW checksumming for outgoing UDP
  64 *                                      datagrams.
  65 *      Hirokazu Takahashi      :       sendfile() on UDP works now.
  66 *              Arnaldo C. Melo :       convert /proc/net/udp to seq_file
  67 *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  68 *      Alexey Kuznetsov:               allow both IPv4 and IPv6 sockets to bind
  69 *                                      a single port at the same time.
   70 *      Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
  71 *      James Chapman           :       Add L2TP encapsulation type.
  72 *
  73 *
  74 *              This program is free software; you can redistribute it and/or
  75 *              modify it under the terms of the GNU General Public License
  76 *              as published by the Free Software Foundation; either version
  77 *              2 of the License, or (at your option) any later version.
  78 */
  79
  80#include <asm/system.h>
  81#include <asm/uaccess.h>
  82#include <asm/ioctls.h>
  83#include <linux/bootmem.h>
  84#include <linux/highmem.h>
  85#include <linux/swap.h>
  86#include <linux/types.h>
  87#include <linux/fcntl.h>
  88#include <linux/module.h>
  89#include <linux/socket.h>
  90#include <linux/sockios.h>
  91#include <linux/igmp.h>
  92#include <linux/in.h>
  93#include <linux/errno.h>
  94#include <linux/timer.h>
  95#include <linux/mm.h>
  96#include <linux/inet.h>
  97#include <linux/netdevice.h>
  98#include <net/tcp_states.h>
  99#include <linux/skbuff.h>
 100#include <linux/proc_fs.h>
 101#include <linux/seq_file.h>
 102#include <net/net_namespace.h>
 103#include <net/icmp.h>
 104#include <net/route.h>
 105#include <net/checksum.h>
 106#include <net/xfrm.h>
 107#include "udp_impl.h"
 108
 109struct udp_table udp_table;
 110EXPORT_SYMBOL(udp_table);
 111
 112int sysctl_udp_mem[3] __read_mostly;
 113EXPORT_SYMBOL(sysctl_udp_mem);
 114
 115int sysctl_udp_rmem_min __read_mostly;
 116EXPORT_SYMBOL(sysctl_udp_rmem_min);
 117
 118int sysctl_udp_wmem_min __read_mostly;
 119EXPORT_SYMBOL(sysctl_udp_wmem_min);
 120
 121atomic_t udp_memory_allocated;
 122EXPORT_SYMBOL(udp_memory_allocated);
 123
 124#define PORTS_PER_CHAIN (65536 / UDP_HTABLE_SIZE)
 125
 126static int udp_lib_lport_inuse(struct net *net, __u16 num,
 127                               const struct udp_hslot *hslot,
 128                               unsigned long *bitmap,
 129                               struct sock *sk,
 130                               int (*saddr_comp)(const struct sock *sk1,
 131                                                 const struct sock *sk2))
 132{
 133        struct sock *sk2;
 134        struct hlist_nulls_node *node;
 135
 136        sk_nulls_for_each(sk2, node, &hslot->head)
 137                if (net_eq(sock_net(sk2), net)                  &&
 138                    sk2 != sk                                   &&
 139                    (bitmap || sk2->sk_hash == num)             &&
 140                    (!sk2->sk_reuse || !sk->sk_reuse)           &&
 141                    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
 142                        || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 143                    (*saddr_comp)(sk, sk2)) {
 144                        if (bitmap)
 145                                __set_bit(sk2->sk_hash / UDP_HTABLE_SIZE,
 146                                          bitmap);
 147                        else
 148                                return 1;
 149                }
 150        return 0;
 151}
 152
 153/**
 154 *  udp_lib_get_port  -  UDP/-Lite port lookup for IPv4 and IPv6
 155 *
 156 *  @sk:          socket struct in question
 157 *  @snum:        port number to look up
 158 *  @saddr_comp:  AF-dependent comparison of bound local IP addresses
 159 */
 160int udp_lib_get_port(struct sock *sk, unsigned short snum,
 161                       int (*saddr_comp)(const struct sock *sk1,
 162                                         const struct sock *sk2))
 163{
 164        struct udp_hslot *hslot;
 165        struct udp_table *udptable = sk->sk_prot->h.udp_table;
 166        int    error = 1;
 167        struct net *net = sock_net(sk);
 168
 169        if (!snum) {
 170                int low, high, remaining;
 171                unsigned rand;
 172                unsigned short first, last;
 173                DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
 174
 175                inet_get_local_port_range(&low, &high);
 176                remaining = (high - low) + 1;
 177
 178                rand = net_random();
 179                first = (((u64)rand * remaining) >> 32) + low;
 180                /*
 181                 * force rand to be an odd multiple of UDP_HTABLE_SIZE
 182                 */
 183                rand = (rand | 1) * UDP_HTABLE_SIZE;
 184                for (last = first + UDP_HTABLE_SIZE; first != last; first++) {
 185                        hslot = &udptable->hash[udp_hashfn(net, first)];
 186                        bitmap_zero(bitmap, PORTS_PER_CHAIN);
 187                        spin_lock_bh(&hslot->lock);
 188                        udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
 189                                            saddr_comp);
 190
 191                        snum = first;
 192                        /*
 193                         * Iterate on all possible values of snum for this hash.
 194                         * Using steps of an odd multiple of UDP_HTABLE_SIZE
 195                         * give us randomization and full range coverage.
 196                         */
 197                        do {
 198                                if (low <= snum && snum <= high &&
 199                                    !test_bit(snum / UDP_HTABLE_SIZE, bitmap))
 200                                        goto found;
 201                                snum += rand;
 202                        } while (snum != first);
 203                        spin_unlock_bh(&hslot->lock);
 204                }
 205                goto fail;
 206        } else {
 207                hslot = &udptable->hash[udp_hashfn(net, snum)];
 208                spin_lock_bh(&hslot->lock);
 209                if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp))
 210                        goto fail_unlock;
 211        }
 212found:
 213        inet_sk(sk)->num = snum;
 214        sk->sk_hash = snum;
 215        if (sk_unhashed(sk)) {
 216                sk_nulls_add_node_rcu(sk, &hslot->head);
 217                sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 218        }
 219        error = 0;
 220fail_unlock:
 221        spin_unlock_bh(&hslot->lock);
 222fail:
 223        return error;
 224}
 225EXPORT_SYMBOL(udp_lib_get_port);
 226
 227static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
 228{
 229        struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
 230
 231        return  (!ipv6_only_sock(sk2)  &&
 232                 (!inet1->rcv_saddr || !inet2->rcv_saddr ||
 233                   inet1->rcv_saddr == inet2->rcv_saddr));
 234}
 235
 236int udp_v4_get_port(struct sock *sk, unsigned short snum)
 237{
 238        return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal);
 239}
 240
 241static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
 242                         unsigned short hnum,
 243                         __be16 sport, __be32 daddr, __be16 dport, int dif)
 244{
 245        int score = -1;
 246
 247        if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
 248                        !ipv6_only_sock(sk)) {
 249                struct inet_sock *inet = inet_sk(sk);
 250
 251                score = (sk->sk_family == PF_INET ? 1 : 0);
 252                if (inet->rcv_saddr) {
 253                        if (inet->rcv_saddr != daddr)
 254                                return -1;
 255                        score += 2;
 256                }
 257                if (inet->daddr) {
 258                        if (inet->daddr != saddr)
 259                                return -1;
 260                        score += 2;
 261                }
 262                if (inet->dport) {
 263                        if (inet->dport != sport)
 264                                return -1;
 265                        score += 2;
 266                }
 267                if (sk->sk_bound_dev_if) {
 268                        if (sk->sk_bound_dev_if != dif)
 269                                return -1;
 270                        score += 2;
 271                }
 272        }
 273        return score;
 274}
 275
 276/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
 277 * harder than this. -DaveM
 278 */
 279static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
 280                __be16 sport, __be32 daddr, __be16 dport,
 281                int dif, struct udp_table *udptable)
 282{
 283        struct sock *sk, *result;
 284        struct hlist_nulls_node *node;
 285        unsigned short hnum = ntohs(dport);
 286        unsigned int hash = udp_hashfn(net, hnum);
 287        struct udp_hslot *hslot = &udptable->hash[hash];
 288        int score, badness;
 289
 290        rcu_read_lock();
 291begin:
 292        result = NULL;
 293        badness = -1;
 294        sk_nulls_for_each_rcu(sk, node, &hslot->head) {
 295                score = compute_score(sk, net, saddr, hnum, sport,
 296                                      daddr, dport, dif);
 297                if (score > badness) {
 298                        result = sk;
 299                        badness = score;
 300                }
 301        }
 302        /*
 303         * if the nulls value we got at the end of this lookup is
 304         * not the expected one, we must restart lookup.
 305         * We probably met an item that was moved to another chain.
 306         */
 307        if (get_nulls_value(node) != hash)
 308                goto begin;
 309
 310        if (result) {
 311                if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
 312                        result = NULL;
 313                else if (unlikely(compute_score(result, net, saddr, hnum, sport,
 314                                  daddr, dport, dif) < badness)) {
 315                        sock_put(result);
 316                        goto begin;
 317                }
 318        }
 319        rcu_read_unlock();
 320        return result;
 321}
 322
 323static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
 324                                                 __be16 sport, __be16 dport,
 325                                                 struct udp_table *udptable)
 326{
 327        struct sock *sk;
 328        const struct iphdr *iph = ip_hdr(skb);
 329
 330        if (unlikely(sk = skb_steal_sock(skb)))
 331                return sk;
 332        else
 333                return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
 334                                         iph->daddr, dport, inet_iif(skb),
 335                                         udptable);
 336}
 337
 338struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
 339                             __be32 daddr, __be16 dport, int dif)
 340{
 341        return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
 342}
 343EXPORT_SYMBOL_GPL(udp4_lib_lookup);
 344
 345static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
 346                                             __be16 loc_port, __be32 loc_addr,
 347                                             __be16 rmt_port, __be32 rmt_addr,
 348                                             int dif)
 349{
 350        struct hlist_nulls_node *node;
 351        struct sock *s = sk;
 352        unsigned short hnum = ntohs(loc_port);
 353
 354        sk_nulls_for_each_from(s, node) {
 355                struct inet_sock *inet = inet_sk(s);
 356
 357                if (!net_eq(sock_net(s), net)                           ||
 358                    s->sk_hash != hnum                                  ||
 359                    (inet->daddr && inet->daddr != rmt_addr)            ||
 360                    (inet->dport != rmt_port && inet->dport)            ||
 361                    (inet->rcv_saddr && inet->rcv_saddr != loc_addr)    ||
 362                    ipv6_only_sock(s)                                   ||
 363                    (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
 364                        continue;
 365                if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
 366                        continue;
 367                goto found;
 368        }
 369        s = NULL;
 370found:
 371        return s;
 372}
 373
 374/*
 375 * This routine is called by the ICMP module when it gets some
 376 * sort of error condition.  If err < 0 then the socket should
 377 * be closed and the error returned to the user.  If err > 0
 378 * it's just the icmp type << 8 | icmp code.
 379 * Header points to the ip header of the error packet. We move
 380 * on past this. Then (as it used to claim before adjustment)
 381 * header points to the first 8 bytes of the udp header.  We need
 382 * to find the appropriate port.
 383 */
 384
 385void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 386{
 387        struct inet_sock *inet;
 388        struct iphdr *iph = (struct iphdr *)skb->data;
 389        struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
 390        const int type = icmp_hdr(skb)->type;
 391        const int code = icmp_hdr(skb)->code;
 392        struct sock *sk;
 393        int harderr;
 394        int err;
 395        struct net *net = dev_net(skb->dev);
 396
 397        sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
 398                        iph->saddr, uh->source, skb->dev->ifindex, udptable);
 399        if (sk == NULL) {
 400                ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 401                return; /* No socket for error */
 402        }
 403
 404        err = 0;
 405        harderr = 0;
 406        inet = inet_sk(sk);
 407
 408        switch (type) {
 409        default:
 410        case ICMP_TIME_EXCEEDED:
 411                err = EHOSTUNREACH;
 412                break;
 413        case ICMP_SOURCE_QUENCH:
 414                goto out;
 415        case ICMP_PARAMETERPROB:
 416                err = EPROTO;
 417                harderr = 1;
 418                break;
 419        case ICMP_DEST_UNREACH:
 420                if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
 421                        if (inet->pmtudisc != IP_PMTUDISC_DONT) {
 422                                err = EMSGSIZE;
 423                                harderr = 1;
 424                                break;
 425                        }
 426                        goto out;
 427                }
 428                err = EHOSTUNREACH;
 429                if (code <= NR_ICMP_UNREACH) {
 430                        harderr = icmp_err_convert[code].fatal;
 431                        err = icmp_err_convert[code].errno;
 432                }
 433                break;
 434        }
 435
 436        /*
 437         *      RFC1122: OK.  Passes ICMP errors back to application, as per
 438         *      4.1.3.3.
 439         */
 440        if (!inet->recverr) {
 441                if (!harderr || sk->sk_state != TCP_ESTABLISHED)
 442                        goto out;
 443        } else {
 444                ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
 445        }
 446        sk->sk_err = err;
 447        sk->sk_error_report(sk);
 448out:
 449        sock_put(sk);
 450}
 451
 452void udp_err(struct sk_buff *skb, u32 info)
 453{
 454        __udp4_lib_err(skb, info, &udp_table);
 455}
 456
 457/*
 458 * Throw away all pending data and cancel the corking. Socket is locked.
 459 */
 460void udp_flush_pending_frames(struct sock *sk)
 461{
 462        struct udp_sock *up = udp_sk(sk);
 463
 464        if (up->pending) {
 465                up->len = 0;
 466                up->pending = 0;
 467                ip_flush_pending_frames(sk);
 468        }
 469}
 470EXPORT_SYMBOL(udp_flush_pending_frames);
 471
 472/**
 473 *      udp4_hwcsum_outgoing  -  handle outgoing HW checksumming
 474 *      @sk:    socket we are sending on
 475 *      @skb:   sk_buff containing the filled-in UDP header
 476 *              (checksum field must be zeroed out)
 477 */
 478static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
 479                                 __be32 src, __be32 dst, int len)
 480{
 481        unsigned int offset;
 482        struct udphdr *uh = udp_hdr(skb);
 483        __wsum csum = 0;
 484
 485        if (skb_queue_len(&sk->sk_write_queue) == 1) {
 486                /*
 487                 * Only one fragment on the socket.
 488                 */
 489                skb->csum_start = skb_transport_header(skb) - skb->head;
 490                skb->csum_offset = offsetof(struct udphdr, check);
 491                uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0);
 492        } else {
 493                /*
 494                 * HW-checksum won't work as there are two or more
 495                 * fragments on the socket so that all csums of sk_buffs
 496                 * should be together
 497                 */
 498                offset = skb_transport_offset(skb);
 499                skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
 500
 501                skb->ip_summed = CHECKSUM_NONE;
 502
 503                skb_queue_walk(&sk->sk_write_queue, skb) {
 504                        csum = csum_add(csum, skb->csum);
 505                }
 506
 507                uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
 508                if (uh->check == 0)
 509                        uh->check = CSUM_MANGLED_0;
 510        }
 511}
 512
 513/*
 514 * Push out all pending data as one UDP datagram. Socket is locked.
 515 */
 516static int udp_push_pending_frames(struct sock *sk)
 517{
 518        struct udp_sock  *up = udp_sk(sk);
 519        struct inet_sock *inet = inet_sk(sk);
 520        struct flowi *fl = &inet->cork.fl;
 521        struct sk_buff *skb;
 522        struct udphdr *uh;
 523        int err = 0;
 524        int is_udplite = IS_UDPLITE(sk);
 525        __wsum csum = 0;
 526
 527        /* Grab the skbuff where UDP header space exists. */
 528        if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
 529                goto out;
 530
 531        /*
 532         * Create a UDP header
 533         */
 534        uh = udp_hdr(skb);
 535        uh->source = fl->fl_ip_sport;
 536        uh->dest = fl->fl_ip_dport;
 537        uh->len = htons(up->len);
 538        uh->check = 0;
 539
 540        if (is_udplite)                                  /*     UDP-Lite      */
 541                csum  = udplite_csum_outgoing(sk, skb);
 542
 543        else if (sk->sk_no_check == UDP_CSUM_NOXMIT) {   /* UDP csum disabled */
 544
 545                skb->ip_summed = CHECKSUM_NONE;
 546                goto send;
 547
 548        } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
 549
 550                udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len);
 551                goto send;
 552
 553        } else                                           /*   `normal' UDP    */
 554                csum = udp_csum_outgoing(sk, skb);
 555
 556        /* add protocol-dependent pseudo-header */
 557        uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len,
 558                                      sk->sk_protocol, csum);
 559        if (uh->check == 0)
 560                uh->check = CSUM_MANGLED_0;
 561
 562send:
 563        err = ip_push_pending_frames(sk);
 564        if (err) {
 565                if (err == -ENOBUFS && !inet->recverr) {
 566                        UDP_INC_STATS_USER(sock_net(sk),
 567                                           UDP_MIB_SNDBUFERRORS, is_udplite);
 568                        err = 0;
 569                }
 570        } else
 571                UDP_INC_STATS_USER(sock_net(sk),
 572                                   UDP_MIB_OUTDATAGRAMS, is_udplite);
 573out:
 574        up->len = 0;
 575        up->pending = 0;
 576        return err;
 577}
 578
 579int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 580                size_t len)
 581{
 582        struct inet_sock *inet = inet_sk(sk);
 583        struct udp_sock *up = udp_sk(sk);
 584        int ulen = len;
 585        struct ipcm_cookie ipc;
 586        struct rtable *rt = NULL;
 587        int free = 0;
 588        int connected = 0;
 589        __be32 daddr, faddr, saddr;
 590        __be16 dport;
 591        u8  tos;
 592        int err, is_udplite = IS_UDPLITE(sk);
 593        int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
 594        int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
 595
 596        if (len > 0xFFFF)
 597                return -EMSGSIZE;
 598
 599        /*
 600         *      Check the flags.
 601         */
 602
 603        if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
 604                return -EOPNOTSUPP;
 605
 606        ipc.opt = NULL;
 607        ipc.shtx.flags = 0;
 608
 609        if (up->pending) {
 610                /*
 611                 * There are pending frames.
 612                 * The socket lock must be held while it's corked.
 613                 */
 614                lock_sock(sk);
 615                if (likely(up->pending)) {
 616                        if (unlikely(up->pending != AF_INET)) {
 617                                release_sock(sk);
 618                                return -EINVAL;
 619                        }
 620                        goto do_append_data;
 621                }
 622                release_sock(sk);
 623        }
 624        ulen += sizeof(struct udphdr);
 625
 626        /*
 627         *      Get and verify the address.
 628         */
 629        if (msg->msg_name) {
 630                struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name;
 631                if (msg->msg_namelen < sizeof(*usin))
 632                        return -EINVAL;
 633                if (usin->sin_family != AF_INET) {
 634                        if (usin->sin_family != AF_UNSPEC)
 635                                return -EAFNOSUPPORT;
 636                }
 637
 638                daddr = usin->sin_addr.s_addr;
 639                dport = usin->sin_port;
 640                if (dport == 0)
 641                        return -EINVAL;
 642        } else {
 643                if (sk->sk_state != TCP_ESTABLISHED)
 644                        return -EDESTADDRREQ;
 645                daddr = inet->daddr;
 646                dport = inet->dport;
 647                /* Open fast path for connected socket.
 648                   Route will not be used, if at least one option is set.
 649                 */
 650                connected = 1;
 651        }
 652        ipc.addr = inet->saddr;
 653
 654        ipc.oif = sk->sk_bound_dev_if;
 655        err = sock_tx_timestamp(msg, sk, &ipc.shtx);
 656        if (err)
 657                return err;
 658        if (msg->msg_controllen) {
 659                err = ip_cmsg_send(sock_net(sk), msg, &ipc);
 660                if (err)
 661                        return err;
 662                if (ipc.opt)
 663                        free = 1;
 664                connected = 0;
 665        }
 666        if (!ipc.opt)
 667                ipc.opt = inet->opt;
 668
 669        saddr = ipc.addr;
 670        ipc.addr = faddr = daddr;
 671
 672        if (ipc.opt && ipc.opt->srr) {
 673                if (!daddr)
 674                        return -EINVAL;
 675                faddr = ipc.opt->faddr;
 676                connected = 0;
 677        }
 678        tos = RT_TOS(inet->tos);
 679        if (sock_flag(sk, SOCK_LOCALROUTE) ||
 680            (msg->msg_flags & MSG_DONTROUTE) ||
 681            (ipc.opt && ipc.opt->is_strictroute)) {
 682                tos |= RTO_ONLINK;
 683                connected = 0;
 684        }
 685
 686        if (ipv4_is_multicast(daddr)) {
 687                if (!ipc.oif)
 688                        ipc.oif = inet->mc_index;
 689                if (!saddr)
 690                        saddr = inet->mc_addr;
 691                connected = 0;
 692        }
 693
 694        if (connected)
 695                rt = (struct rtable *)sk_dst_check(sk, 0);
 696
 697        if (rt == NULL) {
 698                struct flowi fl = { .oif = ipc.oif,
 699                                    .mark = sk->sk_mark,
 700                                    .nl_u = { .ip4_u =
 701                                              { .daddr = faddr,
 702                                                .saddr = saddr,
 703                                                .tos = tos } },
 704                                    .proto = sk->sk_protocol,
 705                                    .flags = inet_sk_flowi_flags(sk),
 706                                    .uli_u = { .ports =
 707                                               { .sport = inet->sport,
 708                                                 .dport = dport } } };
 709                struct net *net = sock_net(sk);
 710
 711                security_sk_classify_flow(sk, &fl);
 712                err = ip_route_output_flow(net, &rt, &fl, sk, 1);
 713                if (err) {
 714                        if (err == -ENETUNREACH)
 715                                IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
 716                        goto out;
 717                }
 718
 719                err = -EACCES;
 720                if ((rt->rt_flags & RTCF_BROADCAST) &&
 721                    !sock_flag(sk, SOCK_BROADCAST))
 722                        goto out;
 723                if (connected)
 724                        sk_dst_set(sk, dst_clone(&rt->u.dst));
 725        }
 726
 727        if (msg->msg_flags&MSG_CONFIRM)
 728                goto do_confirm;
 729back_from_confirm:
 730
 731        saddr = rt->rt_src;
 732        if (!ipc.addr)
 733                daddr = ipc.addr = rt->rt_dst;
 734
 735        lock_sock(sk);
 736        if (unlikely(up->pending)) {
 737                /* The socket is already corked while preparing it. */
 738                /* ... which is an evident application bug. --ANK */
 739                release_sock(sk);
 740
 741                LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
 742                err = -EINVAL;
 743                goto out;
 744        }
 745        /*
 746         *      Now cork the socket to pend data.
 747         */
 748        inet->cork.fl.fl4_dst = daddr;
 749        inet->cork.fl.fl_ip_dport = dport;
 750        inet->cork.fl.fl4_src = saddr;
 751        inet->cork.fl.fl_ip_sport = inet->sport;
 752        up->pending = AF_INET;
 753
 754do_append_data:
 755        up->len += ulen;
 756        getfrag  =  is_udplite ?  udplite_getfrag : ip_generic_getfrag;
 757        err = ip_append_data(sk, getfrag, msg->msg_iov, ulen,
 758                        sizeof(struct udphdr), &ipc, &rt,
 759                        corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
 760        if (err)
 761                udp_flush_pending_frames(sk);
 762        else if (!corkreq)
 763                err = udp_push_pending_frames(sk);
 764        else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
 765                up->pending = 0;
 766        release_sock(sk);
 767
 768out:
 769        ip_rt_put(rt);
 770        if (free)
 771                kfree(ipc.opt);
 772        if (!err)
 773                return len;
 774        /*
 775         * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space.  Reporting
 776         * ENOBUFS might not be good (it's not tunable per se), but otherwise
 777         * we don't have a good statistic (IpOutDiscards but it can be too many
 778         * things).  We could add another new stat but at least for now that
 779         * seems like overkill.
 780         */
 781        if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
 782                UDP_INC_STATS_USER(sock_net(sk),
 783                                UDP_MIB_SNDBUFERRORS, is_udplite);
 784        }
 785        return err;
 786
 787do_confirm:
 788        dst_confirm(&rt->u.dst);
 789        if (!(msg->msg_flags&MSG_PROBE) || len)
 790                goto back_from_confirm;
 791        err = 0;
 792        goto out;
 793}
 794EXPORT_SYMBOL(udp_sendmsg);
 795
 796int udp_sendpage(struct sock *sk, struct page *page, int offset,
 797                 size_t size, int flags)
 798{
 799        struct udp_sock *up = udp_sk(sk);
 800        int ret;
 801
 802        if (!up->pending) {
 803                struct msghdr msg = {   .msg_flags = flags|MSG_MORE };
 804
 805                /* Call udp_sendmsg to specify destination address which
 806                 * sendpage interface can't pass.
 807                 * This will succeed only when the socket is connected.
 808                 */
 809                ret = udp_sendmsg(NULL, sk, &msg, 0);
 810                if (ret < 0)
 811                        return ret;
 812        }
 813
 814        lock_sock(sk);
 815
 816        if (unlikely(!up->pending)) {
 817                release_sock(sk);
 818
 819                LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n");
 820                return -EINVAL;
 821        }
 822
 823        ret = ip_append_page(sk, page, offset, size, flags);
 824        if (ret == -EOPNOTSUPP) {
 825                release_sock(sk);
 826                return sock_no_sendpage(sk->sk_socket, page, offset,
 827                                        size, flags);
 828        }
 829        if (ret < 0) {
 830                udp_flush_pending_frames(sk);
 831                goto out;
 832        }
 833
 834        up->len += size;
 835        if (!(up->corkflag || (flags&MSG_MORE)))
 836                ret = udp_push_pending_frames(sk);
 837        if (!ret)
 838                ret = size;
 839out:
 840        release_sock(sk);
 841        return ret;
 842}
 843
 844
 845/**
 846 *      first_packet_length     - return length of first packet in receive queue
 847 *      @sk: socket
 848 *
 849 *      Drops all bad checksum frames, until a valid one is found.
 850 *      Returns the length of found skb, or 0 if none is found.
 851 */
 852static unsigned int first_packet_length(struct sock *sk)
 853{
 854        struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue;
 855        struct sk_buff *skb;
 856        unsigned int res;
 857
 858        __skb_queue_head_init(&list_kill);
 859
 860        spin_lock_bh(&rcvq->lock);
 861        while ((skb = skb_peek(rcvq)) != NULL &&
 862                udp_lib_checksum_complete(skb)) {
 863                UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
 864                                 IS_UDPLITE(sk));
 865                __skb_unlink(skb, rcvq);
 866                __skb_queue_tail(&list_kill, skb);
 867        }
 868        res = skb ? skb->len : 0;
 869        spin_unlock_bh(&rcvq->lock);
 870
 871        if (!skb_queue_empty(&list_kill)) {
 872                lock_sock(sk);
 873                __skb_queue_purge(&list_kill);
 874                sk_mem_reclaim_partial(sk);
 875                release_sock(sk);
 876        }
 877        return res;
 878}
 879
 880/*
 881 *      IOCTL requests applicable to the UDP protocol
 882 */
 883
 884int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 885{
 886        switch (cmd) {
 887        case SIOCOUTQ:
 888        {
 889                int amount = sk_wmem_alloc_get(sk);
 890
 891                return put_user(amount, (int __user *)arg);
 892        }
 893
 894        case SIOCINQ:
 895        {
 896                unsigned int amount = first_packet_length(sk);
 897
 898                if (amount)
 899                        /*
 900                         * We will only return the amount
 901                         * of this packet since that is all
 902                         * that will be read.
 903                         */
 904                        amount -= sizeof(struct udphdr);
 905
 906                return put_user(amount, (int __user *)arg);
 907        }
 908
 909        default:
 910                return -ENOIOCTLCMD;
 911        }
 912
 913        return 0;
 914}
 915EXPORT_SYMBOL(udp_ioctl);
 916
 917/*
 918 *      This should be easy, if there is something there we
 919 *      return it, otherwise we block.
 920 */
 921
 922int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 923                size_t len, int noblock, int flags, int *addr_len)
 924{
 925        struct inet_sock *inet = inet_sk(sk);
 926        struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
 927        struct sk_buff *skb;
 928        unsigned int ulen, copied;
 929        int peeked;
 930        int err;
 931        int is_udplite = IS_UDPLITE(sk);
 932
 933        /*
 934         *      Check any passed addresses
 935         */
 936        if (addr_len)
 937                *addr_len = sizeof(*sin);
 938
 939        if (flags & MSG_ERRQUEUE)
 940                return ip_recv_error(sk, msg, len);
 941
 942try_again:
 943        skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
 944                                  &peeked, &err);
 945        if (!skb)
 946                goto out;
 947
 948        ulen = skb->len - sizeof(struct udphdr);
 949        copied = len;
 950        if (copied > ulen)
 951                copied = ulen;
 952        else if (copied < ulen)
 953                msg->msg_flags |= MSG_TRUNC;
 954
 955        /*
 956         * If checksum is needed at all, try to do it while copying the
 957         * data.  If the data is truncated, or if we only want a partial
 958         * coverage checksum (UDP-Lite), do it before the copy.
 959         */
 960
 961        if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
 962                if (udp_lib_checksum_complete(skb))
 963                        goto csum_copy_err;
 964        }
 965
 966        if (skb_csum_unnecessary(skb))
 967                err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
 968                                              msg->msg_iov, copied);
 969        else {
 970                err = skb_copy_and_csum_datagram_iovec(skb,
 971                                                       sizeof(struct udphdr),
 972                                                       msg->msg_iov);
 973
 974                if (err == -EINVAL)
 975                        goto csum_copy_err;
 976        }
 977
 978        if (err)
 979                goto out_free;
 980
 981        if (!peeked)
 982                UDP_INC_STATS_USER(sock_net(sk),
 983                                UDP_MIB_INDATAGRAMS, is_udplite);
 984
 985        sock_recv_timestamp(msg, sk, skb);
 986
 987        /* Copy the address. */
 988        if (sin) {
 989                sin->sin_family = AF_INET;
 990                sin->sin_port = udp_hdr(skb)->source;
 991                sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
 992                memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
 993        }
 994        if (inet->cmsg_flags)
 995                ip_cmsg_recv(msg, skb);
 996
 997        err = copied;
 998        if (flags & MSG_TRUNC)
 999                err = ulen;
1000
1001out_free:
1002        skb_free_datagram_locked(sk, skb);
1003out:
1004        return err;
1005
1006csum_copy_err:
1007        lock_sock(sk);
1008        if (!skb_kill_datagram(sk, skb, flags))
1009                UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1010        release_sock(sk);
1011
1012        if (noblock)
1013                return -EAGAIN;
1014        goto try_again;
1015}
1016
1017
1018int udp_disconnect(struct sock *sk, int flags)
1019{
1020        struct inet_sock *inet = inet_sk(sk);
1021        /*
1022         *      1003.1g - break association.
1023         */
1024
1025        sk->sk_state = TCP_CLOSE;
1026        inet->daddr = 0;
1027        inet->dport = 0;
1028        sk->sk_bound_dev_if = 0;
1029        if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
1030                inet_reset_saddr(sk);
1031
1032        if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
1033                sk->sk_prot->unhash(sk);
1034                inet->sport = 0;
1035        }
1036        sk_dst_reset(sk);
1037        return 0;
1038}
1039EXPORT_SYMBOL(udp_disconnect);
1040
1041void udp_lib_unhash(struct sock *sk)
1042{
1043        if (sk_hashed(sk)) {
1044                struct udp_table *udptable = sk->sk_prot->h.udp_table;
1045                unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash);
1046                struct udp_hslot *hslot = &udptable->hash[hash];
1047
1048                spin_lock_bh(&hslot->lock);
1049                if (sk_nulls_del_node_init_rcu(sk)) {
1050                        inet_sk(sk)->num = 0;
1051                        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
1052                }
1053                spin_unlock_bh(&hslot->lock);
1054        }
1055}
1056EXPORT_SYMBOL(udp_lib_unhash);
1057
1058static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1059{
1060        int is_udplite = IS_UDPLITE(sk);
1061        int rc;
1062
1063        if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) {
1064                /* Note that an ENOMEM error is charged twice */
1065                if (rc == -ENOMEM) {
1066                        UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
1067                                         is_udplite);
1068                        atomic_inc(&sk->sk_drops);
1069                }
1070                goto drop;
1071        }
1072
1073        return 0;
1074
1075drop:
1076        UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1077        kfree_skb(skb);
1078        return -1;
1079}
1080
1081/* returns:
1082 *  -1: error
1083 *   0: success
1084 *  >0: "udp encap" protocol resubmission
1085 *
1086 * Note that in the success and error cases, the skb is assumed to
1087 * have either been requeued or freed.
1088 */
1089int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1090{
1091        struct udp_sock *up = udp_sk(sk);
1092        int rc;
1093        int is_udplite = IS_UDPLITE(sk);
1094
1095        /*
1096         *      Charge it to the socket, dropping if the queue is full.
1097         */
1098        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1099                goto drop;
1100        nf_reset(skb);
1101
1102        if (up->encap_type) {
1103                /*
1104                 * This is an encapsulation socket so pass the skb to
1105                 * the socket's udp_encap_rcv() hook. Otherwise, just
1106                 * fall through and pass this up the UDP socket.
1107                 * up->encap_rcv() returns the following value:
1108                 * =0 if skb was successfully passed to the encap
1109                 *    handler or was discarded by it.
1110                 * >0 if skb should be passed on to UDP.
1111                 * <0 if skb should be resubmitted as proto -N
1112                 */
1113
1114                /* if we're overly short, let UDP handle it */
1115                if (skb->len > sizeof(struct udphdr) &&
1116                    up->encap_rcv != NULL) {
1117                        int ret;
1118
1119                        ret = (*up->encap_rcv)(sk, skb);
1120                        if (ret <= 0) {
1121                                UDP_INC_STATS_BH(sock_net(sk),
1122                                                 UDP_MIB_INDATAGRAMS,
1123                                                 is_udplite);
1124                                return -ret;
1125                        }
1126                }
1127
1128                /* FALLTHROUGH -- it's a UDP Packet */
1129        }
1130
1131        /*
1132         *      UDP-Lite specific tests, ignored on UDP sockets
1133         */
1134        if ((is_udplite & UDPLITE_RECV_CC)  &&  UDP_SKB_CB(skb)->partial_cov) {
1135
1136                /*
1137                 * MIB statistics other than incrementing the error count are
1138                 * disabled for the following two types of errors: these depend
1139                 * on the application settings, not on the functioning of the
1140                 * protocol stack as such.
1141                 *
1142                 * RFC 3828 here recommends (sec 3.3): "There should also be a
1143                 * way ... to ... at least let the receiving application block
1144                 * delivery of packets with coverage values less than a value
1145                 * provided by the application."
1146                 */
1147                if (up->pcrlen == 0) {          /* full coverage was set  */
1148                        LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
1149                                "%d while full coverage %d requested\n",
1150                                UDP_SKB_CB(skb)->cscov, skb->len);
1151                        goto drop;
1152                }
1153                /* The next case involves violating the min. coverage requested
1154                 * by the receiver. This is subtle: if receiver wants x and x is
1155                 * greater than the buffersize/MTU then receiver will complain
1156                 * that it wants x while sender emits packets of smaller size y.
1157                 * Therefore the above ...()->partial_cov statement is essential.
1158                 */
1159                if (UDP_SKB_CB(skb)->cscov  <  up->pcrlen) {
1160                        LIMIT_NETDEBUG(KERN_WARNING
1161                                "UDPLITE: coverage %d too small, need min %d\n",
1162                                UDP_SKB_CB(skb)->cscov, up->pcrlen);
1163                        goto drop;
1164                }
1165        }
1166
1167        if (sk->sk_filter) {
1168                if (udp_lib_checksum_complete(skb))
1169                        goto drop;
1170        }
1171
1172        rc = 0;
1173
1174        bh_lock_sock(sk);
1175        if (!sock_owned_by_user(sk))
1176                rc = __udp_queue_rcv_skb(sk, skb);
1177        else
1178                sk_add_backlog(sk, skb);
1179        bh_unlock_sock(sk);
1180
1181        return rc;
1182
1183drop:
1184        UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
1185        kfree_skb(skb);
1186        return -1;
1187}
1188
1189/*
1190 *      Multicasts and broadcasts go to each listener.
1191 *
1192 *      Note: called only from the BH handler context,
1193 *      so we don't need to lock the hashes.
1194 */
1195static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
1196                                    struct udphdr  *uh,
1197                                    __be32 saddr, __be32 daddr,
1198                                    struct udp_table *udptable)
1199{
1200        struct sock *sk;
1201        struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
1202        int dif;
1203
1204        spin_lock(&hslot->lock);
1205        sk = sk_nulls_head(&hslot->head);
1206        dif = skb->dev->ifindex;
1207        sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
1208        if (sk) {
1209                struct sock *sknext = NULL;
1210
1211                do {
1212                        struct sk_buff *skb1 = skb;
1213
1214                        sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
1215                                                   daddr, uh->source, saddr,
1216                                                   dif);
1217                        if (sknext)
1218                                skb1 = skb_clone(skb, GFP_ATOMIC);
1219
1220                        if (skb1) {
1221                                int ret = udp_queue_rcv_skb(sk, skb1);
1222                                if (ret > 0)
1223                                        /* we should probably re-process instead
1224                                         * of dropping packets here. */
1225                                        kfree_skb(skb1);
1226                        }
1227                        sk = sknext;
1228                } while (sknext);
1229        } else
1230                consume_skb(skb);
1231        spin_unlock(&hslot->lock);
1232        return 0;
1233}
1234
1235/* Initialize UDP checksum. If exited with zero value (success),
1236 * CHECKSUM_UNNECESSARY means, that no more checks are required.
1237 * Otherwise, csum completion requires chacksumming packet body,
1238 * including udp header and folding it to skb->csum.
1239 */
1240static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
1241                                 int proto)
1242{
1243        const struct iphdr *iph;
1244        int err;
1245
1246        UDP_SKB_CB(skb)->partial_cov = 0;
1247        UDP_SKB_CB(skb)->cscov = skb->len;
1248
1249        if (proto == IPPROTO_UDPLITE) {
1250                err = udplite_checksum_init(skb, uh);
1251                if (err)
1252                        return err;
1253        }
1254
1255        iph = ip_hdr(skb);
1256        if (uh->check == 0) {
1257                skb->ip_summed = CHECKSUM_UNNECESSARY;
1258        } else if (skb->ip_summed == CHECKSUM_COMPLETE) {
1259                if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
1260                                      proto, skb->csum))
1261                        skb->ip_summed = CHECKSUM_UNNECESSARY;
1262        }
1263        if (!skb_csum_unnecessary(skb))
1264                skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1265                                               skb->len, proto, 0);
1266        /* Probably, we should checksum udp header (it should be in cache
1267         * in any case) and data in tiny packets (< rx copybreak).
1268         */
1269
1270        return 0;
1271}
1272
1273/*
1274 *      All we need to do is get the socket, and then do a checksum.
1275 */
1276
1277int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
1278                   int proto)
1279{
1280        struct sock *sk;
1281        struct udphdr *uh;
1282        unsigned short ulen;
1283        struct rtable *rt = skb_rtable(skb);
1284        __be32 saddr, daddr;
1285        struct net *net = dev_net(skb->dev);
1286
1287        /*
1288         *  Validate the packet.
1289         */
1290        if (!pskb_may_pull(skb, sizeof(struct udphdr)))
1291                goto drop;              /* No space for header. */
1292
1293        uh   = udp_hdr(skb);
1294        ulen = ntohs(uh->len);
1295        if (ulen > skb->len)
1296                goto short_packet;
1297
1298        if (proto == IPPROTO_UDP) {
1299                /* UDP validates ulen. */
1300                if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
1301                        goto short_packet;
1302                uh = udp_hdr(skb);
1303        }
1304
1305        if (udp4_csum_init(skb, uh, proto))
1306                goto csum_error;
1307
1308        saddr = ip_hdr(skb)->saddr;
1309        daddr = ip_hdr(skb)->daddr;
1310
1311        if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
1312                return __udp4_lib_mcast_deliver(net, skb, uh,
1313                                saddr, daddr, udptable);
1314
1315        sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
1316
1317        if (sk != NULL) {
1318                int ret = udp_queue_rcv_skb(sk, skb);
1319                sock_put(sk);
1320
1321                /* a return value > 0 means to resubmit the input, but
1322                 * it wants the return to be -protocol, or 0
1323                 */
1324                if (ret > 0)
1325                        return -ret;
1326                return 0;
1327        }
1328
1329        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1330                goto drop;
1331        nf_reset(skb);
1332
1333        /* No socket. Drop packet silently, if checksum is wrong */
1334        if (udp_lib_checksum_complete(skb))
1335                goto csum_error;
1336
1337        UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
1338        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
1339
1340        /*
1341         * Hmm.  We got an UDP packet to a port to which we
1342         * don't wanna listen.  Ignore it.
1343         */
1344        kfree_skb(skb);
1345        return 0;
1346
1347short_packet:
1348        LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
1349                       proto == IPPROTO_UDPLITE ? "-Lite" : "",
1350                       &saddr,
1351                       ntohs(uh->source),
1352                       ulen,
1353                       skb->len,
1354                       &daddr,
1355                       ntohs(uh->dest));
1356        goto drop;
1357
1358csum_error:
1359        /*
1360         * RFC1122: OK.  Discards the bad packet silently (as far as
1361         * the network is concerned, anyway) as per 4.1.3.4 (MUST).
1362         */
1363        LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
1364                       proto == IPPROTO_UDPLITE ? "-Lite" : "",
1365                       &saddr,
1366                       ntohs(uh->source),
1367                       &daddr,
1368                       ntohs(uh->dest),
1369                       ulen);
1370drop:
1371        UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
1372        kfree_skb(skb);
1373        return 0;
1374}
1375
1376int udp_rcv(struct sk_buff *skb)
1377{
1378        return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
1379}
1380
/* Discard any frames still pending on @sk, under the socket lock. */
void udp_destroy_sock(struct sock *sk)
{
	lock_sock(sk);

	udp_flush_pending_frames(sk);

	release_sock(sk);
}
1387
1388/*
1389 *      Socket option code for UDP
1390 */
1391int udp_lib_setsockopt(struct sock *sk, int level, int optname,
1392                       char __user *optval, unsigned int optlen,
1393                       int (*push_pending_frames)(struct sock *))
1394{
1395        struct udp_sock *up = udp_sk(sk);
1396        int val;
1397        int err = 0;
1398        int is_udplite = IS_UDPLITE(sk);
1399
1400        if (optlen < sizeof(int))
1401                return -EINVAL;
1402
1403        if (get_user(val, (int __user *)optval))
1404                return -EFAULT;
1405
1406        switch (optname) {
1407        case UDP_CORK:
1408                if (val != 0) {
1409                        up->corkflag = 1;
1410                } else {
1411                        up->corkflag = 0;
1412                        lock_sock(sk);
1413                        (*push_pending_frames)(sk);
1414                        release_sock(sk);
1415                }
1416                break;
1417
1418        case UDP_ENCAP:
1419                switch (val) {
1420                case 0:
1421                case UDP_ENCAP_ESPINUDP:
1422                case UDP_ENCAP_ESPINUDP_NON_IKE:
1423                        up->encap_rcv = xfrm4_udp_encap_rcv;
1424                        /* FALLTHROUGH */
1425                case UDP_ENCAP_L2TPINUDP:
1426                        up->encap_type = val;
1427                        break;
1428                default:
1429                        err = -ENOPROTOOPT;
1430                        break;
1431                }
1432                break;
1433
1434        /*
1435         *      UDP-Lite's partial checksum coverage (RFC 3828).
1436         */
1437        /* The sender sets actual checksum coverage length via this option.
1438         * The case coverage > packet length is handled by send module. */
1439        case UDPLITE_SEND_CSCOV:
1440                if (!is_udplite)         /* Disable the option on UDP sockets */
1441                        return -ENOPROTOOPT;
1442                if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
1443                        val = 8;
1444                else if (val > USHORT_MAX)
1445                        val = USHORT_MAX;
1446                up->pcslen = val;
1447                up->pcflag |= UDPLITE_SEND_CC;
1448                break;
1449
1450        /* The receiver specifies a minimum checksum coverage value. To make
1451         * sense, this should be set to at least 8 (as done below). If zero is
1452         * used, this again means full checksum coverage.                     */
1453        case UDPLITE_RECV_CSCOV:
1454                if (!is_udplite)         /* Disable the option on UDP sockets */
1455                        return -ENOPROTOOPT;
1456                if (val != 0 && val < 8) /* Avoid silly minimal values.       */
1457                        val = 8;
1458                else if (val > USHORT_MAX)
1459                        val = USHORT_MAX;
1460                up->pcrlen = val;
1461                up->pcflag |= UDPLITE_RECV_CC;
1462                break;
1463
1464        default:
1465                err = -ENOPROTOOPT;
1466                break;
1467        }
1468
1469        return err;
1470}
1471EXPORT_SYMBOL(udp_lib_setsockopt);
1472
1473int udp_setsockopt(struct sock *sk, int level, int optname,
1474                   char __user *optval, unsigned int optlen)
1475{
1476        if (level == SOL_UDP  ||  level == SOL_UDPLITE)
1477                return udp_lib_setsockopt(sk, level, optname, optval, optlen,
1478                                          udp_push_pending_frames);
1479        return ip_setsockopt(sk, level, optname, optval, optlen);
1480}
1481
1482#ifdef CONFIG_COMPAT
1483int compat_udp_setsockopt(struct sock *sk, int level, int optname,
1484                          char __user *optval, unsigned int optlen)
1485{
1486        if (level == SOL_UDP  ||  level == SOL_UDPLITE)
1487                return udp_lib_setsockopt(sk, level, optname, optval, optlen,
1488                                          udp_push_pending_frames);
1489        return compat_ip_setsockopt(sk, level, optname, optval, optlen);
1490}
1491#endif
1492
1493int udp_lib_getsockopt(struct sock *sk, int level, int optname,
1494                       char __user *optval, int __user *optlen)
1495{
1496        struct udp_sock *up = udp_sk(sk);
1497        int val, len;
1498
1499        if (get_user(len, optlen))
1500                return -EFAULT;
1501
1502        len = min_t(unsigned int, len, sizeof(int));
1503
1504        if (len < 0)
1505                return -EINVAL;
1506
1507        switch (optname) {
1508        case UDP_CORK:
1509                val = up->corkflag;
1510                break;
1511
1512        case UDP_ENCAP:
1513                val = up->encap_type;
1514                break;
1515
1516        /* The following two cannot be changed on UDP sockets, the return is
1517         * always 0 (which corresponds to the full checksum coverage of UDP). */
1518        case UDPLITE_SEND_CSCOV:
1519                val = up->pcslen;
1520                break;
1521
1522        case UDPLITE_RECV_CSCOV:
1523                val = up->pcrlen;
1524                break;
1525
1526        default:
1527                return -ENOPROTOOPT;
1528        }
1529
1530        if (put_user(len, optlen))
1531                return -EFAULT;
1532        if (copy_to_user(optval, &val, len))
1533                return -EFAULT;
1534        return 0;
1535}
1536EXPORT_SYMBOL(udp_lib_getsockopt);
1537
1538int udp_getsockopt(struct sock *sk, int level, int optname,
1539                   char __user *optval, int __user *optlen)
1540{
1541        if (level == SOL_UDP  ||  level == SOL_UDPLITE)
1542                return udp_lib_getsockopt(sk, level, optname, optval, optlen);
1543        return ip_getsockopt(sk, level, optname, optval, optlen);
1544}
1545
1546#ifdef CONFIG_COMPAT
1547int compat_udp_getsockopt(struct sock *sk, int level, int optname,
1548                                 char __user *optval, int __user *optlen)
1549{
1550        if (level == SOL_UDP  ||  level == SOL_UDPLITE)
1551                return udp_lib_getsockopt(sk, level, optname, optval, optlen);
1552        return compat_ip_getsockopt(sk, level, optname, optval, optlen);
1553}
1554#endif
1555/**
1556 *      udp_poll - wait for a UDP event.
1557 *      @file - file struct
1558 *      @sock - socket
1559 *      @wait - poll table
1560 *
1561 *      This is same as datagram poll, except for the special case of
1562 *      blocking sockets. If application is using a blocking fd
1563 *      and a packet with checksum error is in the queue;
1564 *      then it could get return from select indicating data available
1565 *      but then block when reading it. Add special case code
1566 *      to work around these arguably broken applications.
1567 */
1568unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
1569{
1570        unsigned int mask = datagram_poll(file, sock, wait);
1571        struct sock *sk = sock->sk;
1572
1573        /* Check for false positives due to checksum errors */
1574        if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
1575            !(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk))
1576                mask &= ~(POLLIN | POLLRDNORM);
1577
1578        return mask;
1579
1580}
1581EXPORT_SYMBOL(udp_poll);
1582
1583struct proto udp_prot = {
1584        .name              = "UDP",
1585        .owner             = THIS_MODULE,
1586        .close             = udp_lib_close,
1587        .connect           = ip4_datagram_connect,
1588        .disconnect        = udp_disconnect,
1589        .ioctl             = udp_ioctl,
1590        .destroy           = udp_destroy_sock,
1591        .setsockopt        = udp_setsockopt,
1592        .getsockopt        = udp_getsockopt,
1593        .sendmsg           = udp_sendmsg,
1594        .recvmsg           = udp_recvmsg,
1595        .sendpage          = udp_sendpage,
1596        .backlog_rcv       = __udp_queue_rcv_skb,
1597        .hash              = udp_lib_hash,
1598        .unhash            = udp_lib_unhash,
1599        .get_port          = udp_v4_get_port,
1600        .memory_allocated  = &udp_memory_allocated,
1601        .sysctl_mem        = sysctl_udp_mem,
1602        .sysctl_wmem       = &sysctl_udp_wmem_min,
1603        .sysctl_rmem       = &sysctl_udp_rmem_min,
1604        .obj_size          = sizeof(struct udp_sock),
1605        .slab_flags        = SLAB_DESTROY_BY_RCU,
1606        .h.udp_table       = &udp_table,
1607#ifdef CONFIG_COMPAT
1608        .compat_setsockopt = compat_udp_setsockopt,
1609        .compat_getsockopt = compat_udp_getsockopt,
1610#endif
1611};
1612EXPORT_SYMBOL(udp_prot);
1613
1614/* ------------------------------------------------------------------------ */
1615#ifdef CONFIG_PROC_FS
1616
1617static struct sock *udp_get_first(struct seq_file *seq, int start)
1618{
1619        struct sock *sk;
1620        struct udp_iter_state *state = seq->private;
1621        struct net *net = seq_file_net(seq);
1622
1623        for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
1624                struct hlist_nulls_node *node;
1625                struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
1626                spin_lock_bh(&hslot->lock);
1627                sk_nulls_for_each(sk, node, &hslot->head) {
1628                        if (!net_eq(sock_net(sk), net))
1629                                continue;
1630                        if (sk->sk_family == state->family)
1631                                goto found;
1632                }
1633                spin_unlock_bh(&hslot->lock);
1634        }
1635        sk = NULL;
1636found:
1637        return sk;
1638}
1639
1640static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
1641{
1642        struct udp_iter_state *state = seq->private;
1643        struct net *net = seq_file_net(seq);
1644
1645        do {
1646                sk = sk_nulls_next(sk);
1647        } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
1648
1649        if (!sk) {
1650                if (state->bucket < UDP_HTABLE_SIZE)
1651                        spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
1652                return udp_get_first(seq, state->bucket + 1);
1653        }
1654        return sk;
1655}
1656
1657static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
1658{
1659        struct sock *sk = udp_get_first(seq, 0);
1660
1661        if (sk)
1662                while (pos && (sk = udp_get_next(seq, sk)) != NULL)
1663                        --pos;
1664        return pos ? NULL : sk;
1665}
1666
1667static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
1668{
1669        struct udp_iter_state *state = seq->private;
1670        state->bucket = UDP_HTABLE_SIZE;
1671
1672        return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
1673}
1674
1675static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1676{
1677        struct sock *sk;
1678
1679        if (v == SEQ_START_TOKEN)
1680                sk = udp_get_idx(seq, 0);
1681        else
1682                sk = udp_get_next(seq, v);
1683
1684        ++*pos;
1685        return sk;
1686}
1687
1688static void udp_seq_stop(struct seq_file *seq, void *v)
1689{
1690        struct udp_iter_state *state = seq->private;
1691
1692        if (state->bucket < UDP_HTABLE_SIZE)
1693                spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
1694}
1695
1696static int udp_seq_open(struct inode *inode, struct file *file)
1697{
1698        struct udp_seq_afinfo *afinfo = PDE(inode)->data;
1699        struct udp_iter_state *s;
1700        int err;
1701
1702        err = seq_open_net(inode, file, &afinfo->seq_ops,
1703                           sizeof(struct udp_iter_state));
1704        if (err < 0)
1705                return err;
1706
1707        s = ((struct seq_file *)file->private_data)->private;
1708        s->family               = afinfo->family;
1709        s->udp_table            = afinfo->udp_table;
1710        return err;
1711}
1712
1713/* ------------------------------------------------------------------------ */
1714int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
1715{
1716        struct proc_dir_entry *p;
1717        int rc = 0;
1718
1719        afinfo->seq_fops.open           = udp_seq_open;
1720        afinfo->seq_fops.read           = seq_read;
1721        afinfo->seq_fops.llseek         = seq_lseek;
1722        afinfo->seq_fops.release        = seq_release_net;
1723
1724        afinfo->seq_ops.start           = udp_seq_start;
1725        afinfo->seq_ops.next            = udp_seq_next;
1726        afinfo->seq_ops.stop            = udp_seq_stop;
1727
1728        p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
1729                             &afinfo->seq_fops, afinfo);
1730        if (!p)
1731                rc = -ENOMEM;
1732        return rc;
1733}
1734EXPORT_SYMBOL(udp_proc_register);
1735
1736void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo)
1737{
1738        proc_net_remove(net, afinfo->name);
1739}
1740EXPORT_SYMBOL(udp_proc_unregister);
1741
1742/* ------------------------------------------------------------------------ */
1743static void udp4_format_sock(struct sock *sp, struct seq_file *f,
1744                int bucket, int *len)
1745{
1746        struct inet_sock *inet = inet_sk(sp);
1747        __be32 dest = inet->daddr;
1748        __be32 src  = inet->rcv_saddr;
1749        __u16 destp       = ntohs(inet->dport);
1750        __u16 srcp        = ntohs(inet->sport);
1751
1752        seq_printf(f, "%4d: %08X:%04X %08X:%04X"
1753                " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n",
1754                bucket, src, srcp, dest, destp, sp->sk_state,
1755                sk_wmem_alloc_get(sp),
1756                sk_rmem_alloc_get(sp),
1757                0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
1758                atomic_read(&sp->sk_refcnt), sp,
1759                atomic_read(&sp->sk_drops), len);
1760}
1761
1762int udp4_seq_show(struct seq_file *seq, void *v)
1763{
1764        if (v == SEQ_START_TOKEN)
1765                seq_printf(seq, "%-127s\n",
1766                           "  sl  local_address rem_address   st tx_queue "
1767                           "rx_queue tr tm->when retrnsmt   uid  timeout "
1768                           "inode ref pointer drops");
1769        else {
1770                struct udp_iter_state *state = seq->private;
1771                int len;
1772
1773                udp4_format_sock(v, seq, state->bucket, &len);
1774                seq_printf(seq, "%*s\n", 127 - len, "");
1775        }
1776        return 0;
1777}
1778
1779/* ------------------------------------------------------------------------ */
1780static struct udp_seq_afinfo udp4_seq_afinfo = {
1781        .name           = "udp",
1782        .family         = AF_INET,
1783        .udp_table      = &udp_table,
1784        .seq_fops       = {
1785                .owner  =       THIS_MODULE,
1786        },
1787        .seq_ops        = {
1788                .show           = udp4_seq_show,
1789        },
1790};
1791
1792static int udp4_proc_init_net(struct net *net)
1793{
1794        return udp_proc_register(net, &udp4_seq_afinfo);
1795}
1796
1797static void udp4_proc_exit_net(struct net *net)
1798{
1799        udp_proc_unregister(net, &udp4_seq_afinfo);
1800}
1801
1802static struct pernet_operations udp4_net_ops = {
1803        .init = udp4_proc_init_net,
1804        .exit = udp4_proc_exit_net,
1805};
1806
1807int __init udp4_proc_init(void)
1808{
1809        return register_pernet_subsys(&udp4_net_ops);
1810}
1811
1812void udp4_proc_exit(void)
1813{
1814        unregister_pernet_subsys(&udp4_net_ops);
1815}
1816#endif /* CONFIG_PROC_FS */
1817
1818void __init udp_table_init(struct udp_table *table)
1819{
1820        int i;
1821
1822        for (i = 0; i < UDP_HTABLE_SIZE; i++) {
1823                INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
1824                spin_lock_init(&table->hash[i].lock);
1825        }
1826}
1827
1828void __init udp_init(void)
1829{
1830        unsigned long nr_pages, limit;
1831
1832        udp_table_init(&udp_table);
1833        /* Set the pressure threshold up by the same strategy of TCP. It is a
1834         * fraction of global memory that is up to 1/2 at 256 MB, decreasing
1835         * toward zero with the amount of memory, with a floor of 128 pages.
1836         */
1837        nr_pages = totalram_pages - totalhigh_pages;
1838        limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
1839        limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
1840        limit = max(limit, 128UL);
1841        sysctl_udp_mem[0] = limit / 4 * 3;
1842        sysctl_udp_mem[1] = limit;
1843        sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
1844
1845        sysctl_udp_rmem_min = SK_MEM_QUANTUM;
1846        sysctl_udp_wmem_min = SK_MEM_QUANTUM;
1847}
1848
1849int udp4_ufo_send_check(struct sk_buff *skb)
1850{
1851        const struct iphdr *iph;
1852        struct udphdr *uh;
1853
1854        if (!pskb_may_pull(skb, sizeof(*uh)))
1855                return -EINVAL;
1856
1857        iph = ip_hdr(skb);
1858        uh = udp_hdr(skb);
1859
1860        uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
1861                                       IPPROTO_UDP, 0);
1862        skb->csum_start = skb_transport_header(skb) - skb->head;
1863        skb->csum_offset = offsetof(struct udphdr, check);
1864        skb->ip_summed = CHECKSUM_PARTIAL;
1865        return 0;
1866}
1867
1868struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features)
1869{
1870        struct sk_buff *segs = ERR_PTR(-EINVAL);
1871        unsigned int mss;
1872        int offset;
1873        __wsum csum;
1874
1875        mss = skb_shinfo(skb)->gso_size;
1876        if (unlikely(skb->len <= mss))
1877                goto out;
1878
1879        if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
1880                /* Packet is from an untrusted source, reset gso_segs. */
1881                int type = skb_shinfo(skb)->gso_type;
1882
1883                if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) ||
1884                             !(type & (SKB_GSO_UDP))))
1885                        goto out;
1886
1887                skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
1888
1889                segs = NULL;
1890                goto out;
1891        }
1892
1893        /* Do software UFO. Complete and fill in the UDP checksum as HW cannot
1894         * do checksum of UDP packets sent as multiple IP fragments.
1895         */
1896        offset = skb->csum_start - skb_headroom(skb);
1897        csum = skb_checksum(skb, offset, skb->len - offset, 0);
1898        offset += skb->csum_offset;
1899        *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1900        skb->ip_summed = CHECKSUM_NONE;
1901
1902        /* Fragment the skb. IP headers of the fragments are updated in
1903         * inet_gso_segment()
1904         */
1905        segs = skb_segment(skb, features);
1906out:
1907        return segs;
1908}
1909
1910