linux/net/ipv4/ip_input.c
<<
>>
Prefs
   1/*
   2 * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3 *              operating system.  INET is implemented using the  BSD Socket
   4 *              interface as the means of communication with the user level.
   5 *
   6 *              The Internet Protocol (IP) module.
   7 *
   8 * Authors:     Ross Biro
   9 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10 *              Donald Becker, <becker@super.org>
  11 *              Alan Cox, <alan@lxorguk.ukuu.org.uk>
  12 *              Richard Underwood
  13 *              Stefan Becker, <stefanb@yello.ping.de>
  14 *              Jorge Cwik, <jorge@laser.satlink.net>
  15 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  16 *
  17 *
  18 * Fixes:
  19 *              Alan Cox        :       Commented a couple of minor bits of surplus code
  20 *              Alan Cox        :       Undefining IP_FORWARD doesn't include the code
  21 *                                      (just stops a compiler warning).
  22 *              Alan Cox        :       Frames with >=MAX_ROUTE record routes, strict routes or loose routes
  23 *                                      are junked rather than corrupting things.
  24 *              Alan Cox        :       Frames to bad broadcast subnets are dumped
  25 *                                      We used to process them non broadcast and
  26 *                                      boy could that cause havoc.
  27 *              Alan Cox        :       ip_forward sets the free flag on the
  28 *                                      new frame it queues. Still crap because
  29 *                                      it copies the frame but at least it
  30 *                                      doesn't eat memory too.
  31 *              Alan Cox        :       Generic queue code and memory fixes.
  32 *              Fred Van Kempen :       IP fragment support (borrowed from NET2E)
  33 *              Gerhard Koerting:       Forward fragmented frames correctly.
  34 *              Gerhard Koerting:       Fixes to my fix of the above 8-).
  35 *              Gerhard Koerting:       IP interface addressing fix.
  36 *              Linus Torvalds  :       More robustness checks
  37 *              Alan Cox        :       Even more checks: Still not as robust as it ought to be
  38 *              Alan Cox        :       Save IP header pointer for later
  39 *              Alan Cox        :       ip option setting
  40 *              Alan Cox        :       Use ip_tos/ip_ttl settings
  41 *              Alan Cox        :       Fragmentation bogosity removed
  42 *                                      (Thanks to Mark.Bush@prg.ox.ac.uk)
  43 *              Dmitry Gorodchanin :    Send of a raw packet crash fix.
  44 *              Alan Cox        :       Silly ip bug when an overlength
  45 *                                      fragment turns up. Now frees the
  46 *                                      queue.
  47 *              Linus Torvalds/ :       Memory leakage on fragmentation
  48 *              Alan Cox        :       handling.
  49 *              Gerhard Koerting:       Forwarding uses IP priority hints
  50 *              Teemu Rantanen  :       Fragment problems.
  51 *              Alan Cox        :       General cleanup, comments and reformat
  52 *              Alan Cox        :       SNMP statistics
  53 *              Alan Cox        :       BSD address rule semantics. Also see
  54 *                                      UDP as there is a nasty checksum issue
  55 *                                      if you do things the wrong way.
  56 *              Alan Cox        :       Always defrag, moved IP_FORWARD to the config.in file
  57 *              Alan Cox        :       IP options adjust sk->priority.
  58 *              Pedro Roque     :       Fix mtu/length error in ip_forward.
  59 *              Alan Cox        :       Avoid ip_chk_addr when possible.
  60 *      Richard Underwood       :       IP multicasting.
  61 *              Alan Cox        :       Cleaned up multicast handlers.
  62 *              Alan Cox        :       RAW sockets demultiplex in the BSD style.
  63 *              Gunther Mayer   :       Fix the SNMP reporting typo
  64 *              Alan Cox        :       Always in group 224.0.0.1
  65 *      Pauline Middelink       :       Fast ip_checksum update when forwarding
  66 *                                      Masquerading support.
  67 *              Alan Cox        :       Multicast loopback error for 224.0.0.1
  68 *              Alan Cox        :       IP_MULTICAST_LOOP option.
  69 *              Alan Cox        :       Use notifiers.
  70 *              Bjorn Ekwall    :       Removed ip_csum (from slhc.c too)
  71 *              Bjorn Ekwall    :       Moved ip_fast_csum to ip.h (inline!)
  72 *              Stefan Becker   :       Send out ICMP HOST REDIRECT
  73 *      Arnt Gulbrandsen        :       ip_build_xmit
  74 *              Alan Cox        :       Per socket routing cache
  75 *              Alan Cox        :       Fixed routing cache, added header cache.
  76 *              Alan Cox        :       Loopback didn't work right in original ip_build_xmit - fixed it.
  77 *              Alan Cox        :       Only send ICMP_REDIRECT if src/dest are the same net.
  78 *              Alan Cox        :       Incoming IP option handling.
  79 *              Alan Cox        :       Set saddr on raw output frames as per BSD.
  80 *              Alan Cox        :       Stopped broadcast source route explosions.
  81 *              Alan Cox        :       Can disable source routing
  82 *              Takeshi Sone    :       Masquerading didn't work.
  83 *      Dave Bonn,Alan Cox      :       Faster IP forwarding whenever possible.
  84 *              Alan Cox        :       Memory leaks, tramples, misc debugging.
  85 *              Alan Cox        :       Fixed multicast (by popular demand 8))
  86 *              Alan Cox        :       Fixed forwarding (by even more popular demand 8))
  87 *              Alan Cox        :       Fixed SNMP statistics [I think]
  88 *      Gerhard Koerting        :       IP fragmentation forwarding fix
  89 *              Alan Cox        :       Device lock against page fault.
  90 *              Alan Cox        :       IP_HDRINCL facility.
  91 *      Werner Almesberger      :       Zero fragment bug
  92 *              Alan Cox        :       RAW IP frame length bug
  93 *              Alan Cox        :       Outgoing firewall on build_xmit
  94 *              A.N.Kuznetsov   :       IP_OPTIONS support throughout the kernel
  95 *              Alan Cox        :       Multicast routing hooks
  96 *              Jos Vos         :       Do accounting *before* call_in_firewall
  97 *      Willy Konynenberg       :       Transparent proxying support
  98 *
  99 *
 100 *
 101 * To Fix:
 102 *              IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
 103 *              and could be made very efficient with the addition of some virtual memory hacks to permit
 104 *              the allocation of a buffer that can then be 'grown' by twiddling page tables.
 105 *              Output fragmentation wants updating along with the buffer management to use a single
 106 *              interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
 107 *              output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
 108 *              fragmentation anyway.
 109 *
 110 *              This program is free software; you can redistribute it and/or
 111 *              modify it under the terms of the GNU General Public License
 112 *              as published by the Free Software Foundation; either version
 113 *              2 of the License, or (at your option) any later version.
 114 */
 115
 116#include <asm/system.h>
 117#include <linux/module.h>
 118#include <linux/types.h>
 119#include <linux/kernel.h>
 120#include <linux/string.h>
 121#include <linux/errno.h>
 122
 123#include <linux/net.h>
 124#include <linux/socket.h>
 125#include <linux/sockios.h>
 126#include <linux/in.h>
 127#include <linux/inet.h>
 128#include <linux/inetdevice.h>
 129#include <linux/netdevice.h>
 130#include <linux/etherdevice.h>
 131
 132#include <net/snmp.h>
 133#include <net/ip.h>
 134#include <net/protocol.h>
 135#include <net/route.h>
 136#include <linux/skbuff.h>
 137#include <net/sock.h>
 138#include <net/arp.h>
 139#include <net/icmp.h>
 140#include <net/raw.h>
 141#include <net/checksum.h>
 142#include <linux/netfilter_ipv4.h>
 143#include <net/xfrm.h>
 144#include <linux/mroute.h>
 145#include <linux/netlink.h>
 146
 147/*
 148 *      Process Router Attention IP option
 149 */
 150int ip_call_ra_chain(struct sk_buff *skb)
 151{
 152        struct ip_ra_chain *ra;
 153        u8 protocol = ip_hdr(skb)->protocol;
 154        struct sock *last = NULL;
 155        struct net_device *dev = skb->dev;
 156
 157        read_lock(&ip_ra_lock);
 158        for (ra = ip_ra_chain; ra; ra = ra->next) {
 159                struct sock *sk = ra->sk;
 160
 161                /* If socket is bound to an interface, only report
 162                 * the packet if it came  from that interface.
 163                 */
 164                if (sk && inet_sk(sk)->num == protocol &&
 165                    (!sk->sk_bound_dev_if ||
 166                     sk->sk_bound_dev_if == dev->ifindex) &&
 167                    sock_net(sk) == dev_net(dev)) {
 168                        if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
 169                                if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) {
 170                                        read_unlock(&ip_ra_lock);
 171                                        return 1;
 172                                }
 173                        }
 174                        if (last) {
 175                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 176                                if (skb2)
 177                                        raw_rcv(last, skb2);
 178                        }
 179                        last = sk;
 180                }
 181        }
 182
 183        if (last) {
 184                raw_rcv(last, skb);
 185                read_unlock(&ip_ra_lock);
 186                return 1;
 187        }
 188        read_unlock(&ip_ra_lock);
 189        return 0;
 190}
 191
 192static int ip_local_deliver_finish(struct sk_buff *skb)
 193{
 194        struct net *net = dev_net(skb->dev);
 195
 196        __skb_pull(skb, ip_hdrlen(skb));
 197
 198        /* Point into the IP datagram, just past the header. */
 199        skb_reset_transport_header(skb);
 200
 201        rcu_read_lock();
 202        {
 203                int protocol = ip_hdr(skb)->protocol;
 204                int hash, raw;
 205                const struct net_protocol *ipprot;
 206
 207        resubmit:
 208                raw = raw_local_deliver(skb, protocol);
 209
 210                hash = protocol & (MAX_INET_PROTOS - 1);
 211                ipprot = rcu_dereference(inet_protos[hash]);
 212                if (ipprot != NULL) {
 213                        int ret;
 214
 215                        if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
 216                                if (net_ratelimit())
 217                                        printk("%s: proto %d isn't netns-ready\n",
 218                                                __func__, protocol);
 219                                kfree_skb(skb);
 220                                goto out;
 221                        }
 222
 223                        if (!ipprot->no_policy) {
 224                                if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
 225                                        kfree_skb(skb);
 226                                        goto out;
 227                                }
 228                                nf_reset(skb);
 229                        }
 230                        ret = ipprot->handler(skb);
 231                        if (ret < 0) {
 232                                protocol = -ret;
 233                                goto resubmit;
 234                        }
 235                        IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
 236                } else {
 237                        if (!raw) {
 238                                if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
 239                                        IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
 240                                        icmp_send(skb, ICMP_DEST_UNREACH,
 241                                                  ICMP_PROT_UNREACH, 0);
 242                                }
 243                        } else
 244                                IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
 245                        kfree_skb(skb);
 246                }
 247        }
 248 out:
 249        rcu_read_unlock();
 250
 251        return 0;
 252}
 253
 254/*
 255 *      Deliver IP Packets to the higher protocol layers.
 256 */
 257int ip_local_deliver(struct sk_buff *skb)
 258{
 259        /*
 260         *      Reassemble IP fragments.
 261         */
 262
 263        if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
 264                if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
 265                        return 0;
 266        }
 267
 268        return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
 269                       ip_local_deliver_finish);
 270}
 271
 272static inline int ip_rcv_options(struct sk_buff *skb)
 273{
 274        struct ip_options *opt;
 275        struct iphdr *iph;
 276        struct net_device *dev = skb->dev;
 277
 278        /* It looks as overkill, because not all
 279           IP options require packet mangling.
 280           But it is the easiest for now, especially taking
 281           into account that combination of IP options
 282           and running sniffer is extremely rare condition.
 283                                              --ANK (980813)
 284        */
 285        if (skb_cow(skb, skb_headroom(skb))) {
 286                IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
 287                goto drop;
 288        }
 289
 290        iph = ip_hdr(skb);
 291        opt = &(IPCB(skb)->opt);
 292        opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
 293
 294        if (ip_options_compile(dev_net(dev), opt, skb)) {
 295                IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
 296                goto drop;
 297        }
 298
 299        if (unlikely(opt->srr)) {
 300                struct in_device *in_dev = in_dev_get(dev);
 301                if (in_dev) {
 302                        if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
 303                                if (IN_DEV_LOG_MARTIANS(in_dev) &&
 304                                    net_ratelimit())
 305                                        printk(KERN_INFO "source route option %pI4 -> %pI4\n",
 306                                               &iph->saddr, &iph->daddr);
 307                                in_dev_put(in_dev);
 308                                goto drop;
 309                        }
 310
 311                        in_dev_put(in_dev);
 312                }
 313
 314                if (ip_options_rcv_srr(skb))
 315                        goto drop;
 316        }
 317
 318        return 0;
 319drop:
 320        return -1;
 321}
 322
 323static int ip_rcv_finish(struct sk_buff *skb)
 324{
 325        const struct iphdr *iph = ip_hdr(skb);
 326        struct rtable *rt;
 327
 328        /*
 329         *      Initialise the virtual path cache for the packet. It describes
 330         *      how the packet travels inside Linux networking.
 331         */
 332        if (skb_dst(skb) == NULL) {
 333                int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
 334                                         skb->dev);
 335                if (unlikely(err)) {
 336                        if (err == -EHOSTUNREACH)
 337                                IP_INC_STATS_BH(dev_net(skb->dev),
 338                                                IPSTATS_MIB_INADDRERRORS);
 339                        else if (err == -ENETUNREACH)
 340                                IP_INC_STATS_BH(dev_net(skb->dev),
 341                                                IPSTATS_MIB_INNOROUTES);
 342                        goto drop;
 343                }
 344        }
 345
 346#ifdef CONFIG_NET_CLS_ROUTE
 347        if (unlikely(skb_dst(skb)->tclassid)) {
 348                struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
 349                u32 idx = skb_dst(skb)->tclassid;
 350                st[idx&0xFF].o_packets++;
 351                st[idx&0xFF].o_bytes += skb->len;
 352                st[(idx>>16)&0xFF].i_packets++;
 353                st[(idx>>16)&0xFF].i_bytes += skb->len;
 354        }
 355#endif
 356
 357        if (iph->ihl > 5 && ip_rcv_options(skb))
 358                goto drop;
 359
 360        rt = skb_rtable(skb);
 361        if (rt->rt_type == RTN_MULTICAST) {
 362                IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCAST,
 363                                skb->len);
 364        } else if (rt->rt_type == RTN_BROADCAST)
 365                IP_UPD_PO_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCAST,
 366                                skb->len);
 367
 368        return dst_input(skb);
 369
 370drop:
 371        kfree_skb(skb);
 372        return NET_RX_DROP;
 373}
 374
 375/*
 376 *      Main IP Receive routine.
 377 */
 378int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
 379{
 380        struct iphdr *iph;
 381        u32 len;
 382
 383        /* When the interface is in promisc. mode, drop all the crap
 384         * that it receives, do not try to analyse it.
 385         */
 386        if (skb->pkt_type == PACKET_OTHERHOST)
 387                goto drop;
 388
 389
 390        IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
 391
 392        if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
 393                IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
 394                goto out;
 395        }
 396
 397        if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 398                goto inhdr_error;
 399
 400        iph = ip_hdr(skb);
 401
 402        /*
 403         *      RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
 404         *
 405         *      Is the datagram acceptable?
 406         *
 407         *      1.      Length at least the size of an ip header
 408         *      2.      Version of 4
 409         *      3.      Checksums correctly. [Speed optimisation for later, skip loopback checksums]
 410         *      4.      Doesn't have a bogus length
 411         */
 412
 413        if (iph->ihl < 5 || iph->version != 4)
 414                goto inhdr_error;
 415
 416        if (!pskb_may_pull(skb, iph->ihl*4))
 417                goto inhdr_error;
 418
 419        iph = ip_hdr(skb);
 420
 421        if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
 422                goto inhdr_error;
 423
 424        len = ntohs(iph->tot_len);
 425        if (skb->len < len) {
 426                IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
 427                goto drop;
 428        } else if (len < (iph->ihl*4))
 429                goto inhdr_error;
 430
 431        /* Our transport medium may have padded the buffer out. Now we know it
 432         * is IP we can trim to the true length of the frame.
 433         * Note this now means skb->len holds ntohs(iph->tot_len).
 434         */
 435        if (pskb_trim_rcsum(skb, len)) {
 436                IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
 437                goto drop;
 438        }
 439
 440        /* Remove any debris in the socket control block */
 441        memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
 442
 443        /* Must drop socket now because of tproxy. */
 444        skb_orphan(skb);
 445
 446        return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
 447                       ip_rcv_finish);
 448
 449inhdr_error:
 450        IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
 451drop:
 452        kfree_skb(skb);
 453out:
 454        return NET_RX_DROP;
 455}
 456