linux/net/ipv4/ip_output.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              The Internet Protocol (IP) output module.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Donald Becker, <becker@super.org>
 *              Alan Cox, <Alan.Cox@linux.org>
 *              Richard Underwood
 *              Stefan Becker, <stefanb@yello.ping.de>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *      See ip_input.c for the original change log.
 *
 *      Fixes:
 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
 *              Bradford Johnson:       Fix faulty handling of some frames when
 *                                      no route is found.
 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
 *                                      (in case the packet is not accepted by
 *                                      output firewall rules).
 *              Mike McLagan    :       Routing by source.
 *              Alexey Kuznetsov:       Use the new route cache.
 *              Andi Kleen      :       Fix broken PMTU recovery and remove
 *                                      some redundant tests.
 *              Vitaly E. Lavrov:       Transparent proxy revived after a year in a coma.
 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
 *              Andi Kleen      :       Split fast and slow ip_build_xmit paths
 *                                      for decreased register pressure on x86
 *                                      and more readability.
 *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
 *                                      silently drop the skb instead of failing with -EPERM.
 *              Detlev Wengorz  :       Copy protocol for fragments.
 *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
 *                                      datagrams.
 *              Hirokazu Takahashi:     sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
EXPORT_SYMBOL(ip_send_check);
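
/*
 * Note: iph->check covers the entire header (iph->ihl 32-bit words), so
 * any code that later rewrites a header field, e.g. a forwarding path
 * doing
 *
 *      iph->ttl--;
 *      ip_send_check(iph);
 *
 * must recompute it. Hot paths often use the incremental helper
 * ip_decrease_ttl() instead of a full recomputation.
 */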

int __ip_local_out(struct sk_buff *skb)
{
        struct iphdr *iph = ip_hdr(skb);

        iph->tot_len = htons(skb->len);
        ip_send_check(iph);
        return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
                       skb_dst(skb)->dev, dst_output);
}

int ip_local_out(struct sk_buff *skb)
{
        int err;

        err = __ip_local_out(skb);
        if (likely(err == 1))
                err = dst_output(skb);

        return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);
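
/*
 * __ip_local_out() returns the netfilter verdict: nf_hook() returns 1
 * when the LOCAL_OUT hook chain accepted the packet without queueing
 * or stealing it, in which case the caller must hand the skb to
 * dst_output() itself; any other value (0 or a negative errno) means
 * the hook chain already consumed the packet.
 */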

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
        skb_reset_mac_header(newskb);
        __skb_pull(newskb, skb_network_offset(newskb));
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        WARN_ON(!skb_dst(newskb));
        netif_rx_ni(newskb);
        return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
        int ttl = inet->uc_ttl;

        if (ttl < 0)
                ttl = ip4_dst_hoplimit(dst);
        return ttl;
}

/*
 *              Add an IP header to an skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                          __be32 saddr, __be32 daddr, struct ip_options *opt)
{
        struct inet_sock *inet = inet_sk(sk);
        struct rtable *rt = skb_rtable(skb);
        struct iphdr *iph;

        /* Build the IP header. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        iph->version  = 4;
        iph->ihl      = 5;
        iph->tos      = inet->tos;
        if (ip_dont_fragment(sk, &rt->dst))
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->dst);
        iph->daddr    = rt->rt_dst;
        iph->saddr    = rt->rt_src;
        iph->protocol = sk->sk_protocol;
        ip_select_ident(iph, &rt->dst, sk);

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, daddr, rt, 0);
        }

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        /* Send it out. */
        return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
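
/*
 * ip_build_and_send_pkt() is meant for callers that hold a routed skb
 * with the transport header already in place but no IP header yet; the
 * TCP SYN/ACK path is the typical user. Note that daddr is only
 * consulted when building IP options - the addresses in the header
 * itself come from the route.
 */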

static inline int ip_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct rtable *rt = (struct rtable *)dst;
        struct net_device *dev = dst->dev;
        unsigned int hh_len = LL_RESERVED_SPACE(dev);

        if (rt->rt_type == RTN_MULTICAST) {
                IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
        } else if (rt->rt_type == RTN_BROADCAST)
                IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

        /* Be paranoid, rather than too clever. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                if (skb2 == NULL) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                kfree_skb(skb);
                skb = skb2;
        }

        if (dst->hh)
                return neigh_hh_output(dst->hh, skb);
        else if (dst->neighbour)
                return dst->neighbour->output(skb);

        if (net_ratelimit())
                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
        kfree_skb(skb);
        return -EINVAL;
}

static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
        struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

        return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
               skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}
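
/*
 * With IP_PMTUDISC_PROBE the socket is probing for the path MTU, so
 * packet size is limited only by the device MTU rather than by the
 * (possibly smaller) cached path MTU of the route.
 */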

static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm != NULL) {
                IPCB(skb)->flags |= IPSKB_REROUTED;
                return dst_output(skb);
        }
#endif
        if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
                return ip_fragment(skb, ip_finish_output2);
        else
                return ip_finish_output2(skb);
}
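
/*
 * GSO packets are deliberately exempted from fragmentation here: they
 * may exceed the MTU, but they are segmented into MTU-sized packets
 * further down the stack, just before being handed to the driver.
 */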

int ip_mc_output(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = rt->dst.dev;

        /*
         *      If the indicated interface is up and running, send the packet.
         */
        IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        /*
         *      Multicasts are looped back for other local users.
         */

        if (rt->rt_flags&RTCF_MULTICAST) {
                if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loop back non-local frames
                   that returned after forwarding; they will be dropped
                   by ip_mr_input in any case.
                   Note that local frames are looped back to be delivered
                   to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                    &&
                    ((rt->rt_flags & RTCF_LOCAL) ||
                     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
                   ) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
                                        newskb, NULL, newskb->dev,
                                        ip_dev_loopback_xmit);
                }

                /* Multicasts with ttl 0 must not go beyond the host. */

                if (ip_hdr(skb)->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
                                NULL, newskb->dev, ip_dev_loopback_xmit);
        }

        return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
                            skb->dev, ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_output(struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;

        IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
                            ip_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_queue_xmit(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = inet->opt;
        struct rtable *rt;
        struct iphdr *iph;
        int res;

        /* Skip all of this if the packet is already routed,
         * e.g. by something like SCTP.
         */
        rcu_read_lock();
        rt = skb_rtable(skb);
        if (rt != NULL)
                goto packet_routed;

        /* Make sure we can route this packet. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt == NULL) {
                __be32 daddr;

                /* Use the correct destination address if we have options. */
                daddr = inet->inet_daddr;
                if (opt && opt->srr)
                        daddr = opt->faddr;

                {
                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
                                            .mark = sk->sk_mark,
                                            .fl4_dst = daddr,
                                            .fl4_src = inet->inet_saddr,
                                            .fl4_tos = RT_CONN_FLAGS(sk),
                                            .proto = sk->sk_protocol,
                                            .flags = inet_sk_flowi_flags(sk),
                                            .fl_ip_sport = inet->inet_sport,
                                            .fl_ip_dport = inet->inet_dport };

                        /* If this fails, the transport layer's retransmit
                         * mechanism will keep trying until a route appears
                         * or the connection times itself out.
                         */
                        security_sk_classify_flow(sk, &fl);
                        if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
                                goto no_route;
                }
                sk_setup_caps(sk, &rt->dst);
        }
        skb_dst_set_noref(skb, &rt->dst);

packet_routed:
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
                goto no_route;

        /* OK, we know where to send it, allocate and build the IP header. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
        if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->dst);
        iph->protocol = sk->sk_protocol;
        iph->saddr    = rt->rt_src;
        iph->daddr    = rt->rt_dst;
        /* The transport layer has already set the transport header. */

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
        }

        ip_select_ident_more(iph, &rt->dst, sk,
                             (skb_shinfo(skb)->gso_segs ?: 1) - 1);

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        res = ip_local_out(skb);
        rcu_read_unlock();
        return res;

no_route:
        rcu_read_unlock();
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EHOSTUNREACH;
}
EXPORT_SYMBOL(ip_queue_xmit);
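
/*
 * The *((__be16 *)iph) store above writes version, ihl and tos in one
 * 16-bit access: version 4 in the top nibble, header length 5 (i.e.
 * 5 * 4 = 20 bytes, no options yet) in the next, then the 8-bit TOS.
 * ip_select_ident_more() is told how many additional segments a GSO
 * skb will become, so each resulting packet gets its own IP ID.
 */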


static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_copy(to, from);
        to->dev = from->dev;
        to->mark = from->mark;

        /* Copy the flags to each fragment. */
        IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
        to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
        to->ipvs_property = from->ipvs_property;
#endif
        skb_copy_secmark(to, from);
}

/*
 *      This IP datagram is too large to be sent in one piece.  Break it up
 *      into smaller pieces (each of size equal to the IP header plus a block
 *      of the data of the original IP data part) that will still fit in a
 *      single device frame, and queue such a frame for sending.
 */
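/*
 * Two strategies below: if the skb already carries a frag_list whose
 * geometry matches what fragmentation would produce (each chunk fits
 * the MTU, all but the last are multiples of 8 bytes, enough headroom
 * for an IP header), the chunks are sent out as fragments directly;
 * otherwise the slow path copies the payload into freshly allocated
 * fragment skbs.
 */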

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct iphdr *iph;
        int ptr;
        struct net_device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len, ll_rs;
        int offset;
        __be16 not_last_frag;
        struct rtable *rt = skb_rtable(skb);
        int err = 0;

        dev = rt->dst.dev;

        /*
         *      Point into the IP datagram header.
         */

        iph = ip_hdr(skb);

        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(ip_skb_dst_mtu(skb)));
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        /*
         *      Setup starting values.
         */

        hlen = iph->ihl * 4;
        mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
        if (skb->nf_bridge)
                mtu -= nf_bridge_mtu_reduction(skb);
#endif
        IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

        /* When a frag_list is given, use it. First, check its validity:
         * some transformers could create a wrong frag_list or break an
         * existing one; that is not prohibited. In this case fall back
         * to copying.
         *
         * LATER: this step can be merged into the real generation of
         * fragments; we can switch to copying when we see the first bad
         * fragment.
         */
        if (skb_has_frag_list(skb)) {
                struct sk_buff *frag, *frag2;
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
                    skb_cloned(skb))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                /* Everything is OK. Generate! */

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                iph->tot_len = htons(first_len);
                iph->frag_off = htons(IP_MF);
                ip_send_check(iph);

                for (;;) {
                        /* Prepare the header of the next frame,
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), iph, hlen);
                                iph = ip_hdr(frag);
                                iph->tot_len = htons(frag->len);
                                ip_copy_metadata(frag, skb);
                                if (offset == 0)
                                        ip_options_fragment(frag);
                                offset += skb->len - hlen;
                                iph->frag_off = htons(offset>>3);
                                if (frag->next != NULL)
                                        iph->frag_off |= htons(IP_MF);
                                /* Ready, complete the checksum */
                                ip_send_check(iph);
                        }

                        err = output(skb);

                        if (!err)
                                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                if (err == 0) {
                        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }
                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;             /* Where to start from */

        /* For bridged IP traffic encapsulated inside e.g. a vlan header,
         * we need to make room for the encapsulating header.
         */
        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));

        /*
         *      Fragment the datagram.
         */

        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *      Keep copying data until we run out.
         */

        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight-byte boundary */
                if (len < left) {
                        len &= ~7;
                }
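                /* The 13-bit fragment offset field counts in units of
                 * 8 bytes, so every fragment except the last must carry
                 * a multiple of 8 bytes of payload; masking with ~7
                 * guarantees that.
                 */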
                /*
                 *      Allocate buffer.
                 */

                if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on the packet.
                 */

                ip_copy_metadata(skb2, skb);
                skb_reserve(skb2, ll_rs);
                skb_put(skb2, len + hlen);
                skb_reset_network_header(skb2);
                skb2->transport_header = skb2->network_header + hlen;

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess.
                 */

                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */

                skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
                        BUG();
                left -= len;

                /*
                 *      Fill in the new header fields.
                 */
                iph = ip_hdr(skb2);
                iph->frag_off = htons((offset >> 3));

                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
                 * options are already fixed) and do it ONCE on the initial
                 * skb, so that all the following fragments will inherit
                 * fixed options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb);

                /*
                 *      Added AC: if we are fragmenting a fragment that's not
                 *                the last fragment then keep MF on each
                 *                fragment.
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                err = output(skb2);
                if (err)
                        goto fail;

                IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
        }
        kfree_skb(skb);
        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
        return err;
}
EXPORT_SYMBOL(ip_fragment);
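
/*
 * Worked example (hypothetical numbers): a 4000-byte datagram with a
 * 20-byte header leaving a 1500-byte-MTU device has 3980 payload bytes
 * and a data space of 1480. The slow path emits fragments carrying
 * 1480, 1480 and 1020 bytes at offsets 0, 1480 and 2960 (185 and 370
 * in the 8-byte units actually stored in frag_off), with MF set on all
 * but the last.
 */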

int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
        struct iovec *iov = from;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                if (memcpy_fromiovecend(to, iov, offset, len) < 0)
                        return -EFAULT;
        } else {
                __wsum csum = 0;
                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, odd);
        }
        return 0;
}
EXPORT_SYMBOL(ip_generic_getfrag);

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
        char *kaddr;
        __wsum csum;
        kaddr = kmap(page);
        csum = csum_partial(kaddr + offset, copy, 0);
        kunmap(page);
        return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags)
{
        struct sk_buff *skb;
        int err;

        /* The network device supports UDP fragmentation offload, so
         * create a single skb containing the complete UDP datagram.
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);

                if (skb == NULL)
                        return err;

                /* Reserve space for the hardware header. */
                skb_reserve(skb, hh_len);

                /* Create space for the UDP/IP header. */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* Initialize the network header pointer. */
                skb_reset_network_header(skb);

                /* Initialize the protocol header pointer. */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum = 0;
                sk->sk_sndmsg_off = 0;

                /* Specify the length of each IP datagram fragment. */
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                __skb_queue_tail(&sk->sk_write_queue, skb);
        }

        return skb_append_datato_frags(sk, skb, getfrag, from,
                                       (length - transhdrlen));
}
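
/*
 * With UFO the oversized skb is queued as-is; the device (or the
 * software GSO fallback) later splits it into on-the-wire IP fragments
 * of gso_size payload bytes each, which is why gso_size is set to the
 * data space (mtu minus the IP header) rather than the full MTU.
 */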

/*
 *      ip_append_data() and ip_append_page() can make one large IP datagram
 *      from many pieces of data. Each piece will be held on the socket
 *      until ip_push_pending_frames() is called. Each piece can be a page
 *      or non-page data.
 *
 *      Transport protocols other than UDP - e.g. raw sockets - can
 *      potentially use this interface as well.
 *
 *      LATER: length must be adjusted by pad at tail, when it is required.
 */
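/*
 * Typical usage, sketched schematically after the UDP sendmsg path
 * (udp_sendmsg() in net/ipv4/udp.c is the canonical caller; details
 * there differ slightly):
 *
 *      err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov,
 *                           len, sizeof(struct udphdr), &ipc, &rt,
 *                           msg->msg_flags);
 *      if (!err && !(msg->msg_flags & MSG_MORE))
 *              err = udp_push_pending_frames(sk);
 *
 * where udp_push_pending_frames() fills in the UDP header and then
 * calls ip_push_pending_frames(). With MSG_MORE (or UDP_CORK) the
 * datagram keeps accumulating on sk_write_queue across calls until it
 * is pushed.
 */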
int ip_append_data(struct sock *sk,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
                   struct ipcm_cookie *ipc, struct rtable **rtp,
                   unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        struct ip_options *opt = NULL;
        int hh_len;
        int exthdrlen;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;
        struct rtable *rt;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * Setup for corking.
                 */
                opt = ipc->opt;
                if (opt) {
                        if (inet->cork.opt == NULL) {
                                inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
                                if (unlikely(inet->cork.opt == NULL))
                                        return -ENOBUFS;
                        }
                        memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
                        inet->cork.flags |= IPCORK_OPT;
                        inet->cork.addr = ipc->addr;
                }
                rt = *rtp;
                if (unlikely(!rt))
                        return -EFAULT;
                /*
                 * We steal a reference to this route; the caller should
                 * not release it.
                 */
                *rtp = NULL;
                inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
                                            rt->dst.dev->mtu :
                                            dst_mtu(rt->dst.path);
                inet->cork.dst = &rt->dst;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                exthdrlen = rt->dst.header_len;
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                rt = (struct rtable *)inet->cork.dst;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = inet->cork.opt;

                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }
        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
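
        /* maxfraglen rounds the per-fragment data space down to a
         * multiple of 8 and adds the header back, so every full
         * fragment built here satisfies the on-the-wire requirement
         * that non-final fragments carry 8-byte multiples of payload.
         * E.g. mtu 1500 and a 20-byte header give maxfraglen 1500;
         * mtu 1006 would give (986 & ~7) + 20 = 1004.
         */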

        if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
                               mtu-exthdrlen);
                return -EMSGSIZE;
        }

        /*
         * transhdrlen > 0 means that this is the first fragment and we
         * wish it not to be fragmented later.
         */
        if (transhdrlen &&
            length + fragheaderlen <= mtu &&
            rt->dst.dev->features & NETIF_F_V4_CSUM &&
            !exthdrlen)
                csummode = CHECKSUM_PARTIAL;

        skb = skb_peek_tail(&sk->sk_write_queue);

        inet->cork.length += length;
        if (((length > mtu) || (skb && skb_is_gso(skb))) &&
            (sk->sk_protocol == IPPROTO_UDP) &&
            (rt->dst.dev->features & NETIF_F_UFO)) {
                err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
                                         fragheaderlen, transhdrlen, mtu,
                                         flags);
                if (err)
                        goto error;
                return 0;
        }

        /* So, what's going on in the loop below?
         *
         * We use the calculated fragment length to generate a chain of
         * skbs; each segment is an IP fragment ready to be sent to the
         * network after an appropriate IP header has been added.
         */

        if (!skb)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into the current packet. */
                copy = mtu - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;
                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If the remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > mtu - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;
                        fraglen = datalen + fragheaderlen;

                        if ((flags & MSG_MORE) &&
                            !(rt->dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = fraglen;

                        /* The last fragment gets additional space at the
                         * tail. Note, with MSG_MORE we overallocate on
                         * fragments, because we have no idea which fragment
                         * will be the last.
                         */
                        if (datalen == length + fraggap) {
                                alloclen += rt->dst.trailer_len;
                                /* Make sure the mtu is not exceeded. */
                                if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
                                        datalen -= ALIGN(rt->dst.trailer_len, 8);
                        }
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len + 15,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len + 15, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                                else
                                        /* Only the initial fragment is
                                           time stamped. */
                                        ipc->tx_flags = 0;
                        }
                        if (skb == NULL)
                                goto error;

                        /*
                         *      Fill in the control structures.
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);
                        skb_shinfo(skb)->tx_flags = ipc->tx_flags;

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fraglen);
                        skb_set_network_header(skb, exthdrlen);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        data += fragheaderlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        copy = datalen - transhdrlen - fraggap;
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                        offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL)  {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        atomic_add(copy, &sk->sk_wmem_alloc);
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error:
        inet->cork.length -= length;
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
        return err;
}

ssize_t ip_append_page(struct sock *sk, struct page *page,
                       int offset, size_t size, int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;
        struct rtable *rt;
        struct ip_options *opt = NULL;
        int hh_len;
        int mtu;
        int len;
        int err;
        unsigned int maxfraglen, fragheaderlen, fraggap;

        if (inet->hdrincl)
                return -EPERM;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue))
                return -EINVAL;

        rt = (struct rtable *)inet->cork.dst;
        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (!(rt->dst.dev->features&NETIF_F_SG))
                return -EOPNOTSUPP;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);
        mtu = inet->cork.fragsize;

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
                return -EMSGSIZE;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                return -EINVAL;

        inet->cork.length += size;
        if ((size + skb->len > mtu) &&
            (sk->sk_protocol == IPPROTO_UDP) &&
            (rt->dst.dev->features & NETIF_F_UFO)) {
                skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        }

        while (size > 0) {
                int i;

                if (skb_is_gso(skb))
                        len = size;
                else {
                        /* Check if the remaining data fits into the
                         * current packet. */
                        len = mtu - skb->len;
                        if (len < size)
                                len = maxfraglen - skb->len;
                }
                if (len <= 0) {
                        struct sk_buff *skb_prev;
                        int alloclen;

                        skb_prev = skb;
                        fraggap = skb_prev->len - maxfraglen;

                        alloclen = fragheaderlen + hh_len + fraggap + 15;
                        skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
                        if (unlikely(!skb)) {
                                err = -ENOBUFS;
                                goto error;
                        }

                        /*
                         *      Fill in the control structures.
                         */
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        skb_put(skb, fragheaderlen + fraggap);
                        skb_reset_network_header(skb);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(skb_prev,
                                                                   maxfraglen,
                                                    skb_transport_header(skb),
                                                                   fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                i = skb_shinfo(skb)->nr_frags;
                if (len > size)
                        len = size;
                if (skb_can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += len;
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, len);
                } else {
                        err = -EMSGSIZE;
                        goto error;
                }

                if (skb->ip_summed == CHECKSUM_NONE) {
                        __wsum csum;
                        csum = csum_page(page, offset, len);
                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
                }

                skb->len += len;
                skb->data_len += len;
                skb->truesize += len;
                atomic_add(len, &sk->sk_wmem_alloc);
                offset += len;
                size -= len;
        }
        return 0;

error:
        inet->cork.length -= size;
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
        return err;
}

static void ip_cork_release(struct inet_sock *inet)
{
        inet->cork.flags &= ~IPCORK_OPT;
        kfree(inet->cork.opt);
        inet->cork.opt = NULL;
        dst_release(inet->cork.dst);
        inet->cork.dst = NULL;
}

/*
 *      Combine all pending IP fragments on the socket into one IP datagram
 *      and push them out.
 */
int ip_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct inet_sock *inet = inet_sk(sk);
        struct net *net = sock_net(sk);
        struct ip_options *opt = NULL;
        struct rtable *rt = (struct rtable *)inet->cork.dst;
        struct iphdr *iph;
        __be16 df = 0;
        __u8 ttl;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* Move skb->data to the IP header, away from any ext header. */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO),
         * we allow the frame generated here to be fragmented. No matter
         * how transforms change the size of the packet, it will come out.
         */
        if (inet->pmtudisc < IP_PMTUDISC_DO)
                skb->local_df = 1;

        /* The DF bit is set when we want to see DF on outgoing frames.
         * If local_df is set too, we still allow this frame to be
         * fragmented locally. */
        if (inet->pmtudisc >= IP_PMTUDISC_DO ||
            (skb->len <= dst_mtu(&rt->dst) &&
             ip_dont_fragment(sk, &rt->dst)))
                df = htons(IP_DF);

        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (rt->rt_type == RTN_MULTICAST)
                ttl = inet->mc_ttl;
        else
                ttl = ip_select_ttl(inet, &rt->dst);

        iph = (struct iphdr *)skb->data;
        iph->version = 4;
        iph->ihl = 5;
        if (opt) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, inet->cork.addr, rt, 0);
        }
        iph->tos = inet->tos;
        iph->frag_off = df;
        ip_select_ident(iph, &rt->dst, sk);
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;
        /*
         * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
         * on the dst refcount.
         */
        inet->cork.dst = NULL;
        skb_dst_set(skb, &rt->dst);

        if (iph->protocol == IPPROTO_ICMP)
                icmp_out_count(net, ((struct icmphdr *)
                        skb_transport_header(skb))->type);

        /* Netfilter gets the whole, not yet fragmented skb. */
        err = ip_local_out(skb);
        if (err) {
                if (err > 0)
                        err = net_xmit_errno(err);
                if (err)
                        goto error;
        }

out:
        ip_cork_release(inet);
        return err;

error:
        IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
        goto out;
}

/*
 *      Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
                kfree_skb(skb);

        ip_cork_release(inet_sk(sk));
}


/*
 *      Fetch data from kernel space and fill in the checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
                              int len, int odd, struct sk_buff *skb)
{
        __wsum csum;

        csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
        skb->csum = csum_block_add(skb->csum, csum, odd);
        return 0;
}

/*
 *      Generic function to send a packet as a reply to another packet.
 *      Used to send TCP resets so far. ICMP should use this function too.
 *
 *      Should run single threaded per socket because it uses the sock
 *      structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
                   unsigned int len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct {
                struct ip_options       opt;
                char                    data[40];
        } replyopts;
        struct ipcm_cookie ipc;
        __be32 daddr;
        struct rtable *rt = skb_rtable(skb);

        if (ip_options_echo(&replyopts.opt, skb))
                return;

        daddr = ipc.addr = rt->rt_src;
        ipc.opt = NULL;
        ipc.tx_flags = 0;

        if (replyopts.opt.optlen) {
                ipc.opt = &replyopts.opt;

                if (ipc.opt->srr)
                        daddr = replyopts.opt.faddr;
        }

        {
                struct flowi fl = { .oif = arg->bound_dev_if,
                                    .fl4_dst = daddr,
                                    .fl4_src = rt->rt_spec_dst,
                                    .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
                                    .fl_ip_sport = tcp_hdr(skb)->dest,
                                    .fl_ip_dport = tcp_hdr(skb)->source,
                                    .proto = sk->sk_protocol,
                                    .flags = ip_reply_arg_flowi_flags(arg) };
                security_skb_classify_flow(skb, &fl);
                if (ip_route_output_key(sock_net(sk), &rt, &fl))
                        return;
        }

        /* And let IP do all the hard work.
         *
         * This chunk is not reentrant, hence the spinlock. Note that it
         * relies on the fact that this function is called with BHs
         * locally disabled and that sk cannot already be spinlocked.
         */
        bh_lock_sock(sk);
        inet->tos = ip_hdr(skb)->tos;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = ip_hdr(skb)->protocol;
        sk->sk_bound_dev_if = arg->bound_dev_if;
        ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                       &ipc, &rt, MSG_DONTWAIT);
        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                if (arg->csumoffset >= 0)
                        *((__sum16 *)skb_transport_header(skb) +
                          arg->csumoffset) = csum_fold(csum_add(skb->csum,
                                                                arg->csum));
                skb->ip_summed = CHECKSUM_NONE;
                ip_push_pending_frames(sk);
        }

        bh_unlock_sock(sk);

        ip_rt_put(rt);
}

void __init ip_init(void)
{
        ip_rt_init();
        inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
        igmp_mc_proc_init();
#endif
}