linux/net/ipv6/ip6_output.c
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
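
/*
 * Note on the two helpers above: a payload_len of 0 is the on-wire
 * convention for packets whose payload exceeds IPV6_MAXPLEN (65535)
 * and therefore carries the real length in a Jumbo Payload hop-by-hop
 * option (RFC 2675).  nf_hook() returns 1 when the LOCAL_OUT hook
 * accepts the packet, which is why ip6_local_out() only calls
 * dst_output() on that value; any other return means netfilter stole
 * or dropped the skb.
 */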

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}

static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		 * MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
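	/*
	 * The first 32 bits of the IPv6 header pack the version (4 bits,
	 * always 6), the traffic class (8 bits) and the flow label
	 * (20 bits).  For example, tclass 0x28 gives a first word of
	 * htonl(0x60000000 | 0x28 << 20) == htonl(0x62800000), into
	 * which the flow label (fl6_flowlabel, kept in network byte
	 * order) is then OR-ed.
	 */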

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
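
/*
 * Illustrative sketch (not part of the original file): how a transport
 * protocol might hand a fully built segment to ip6_xmit().  The caller
 * name is hypothetical, fl is assumed to be set up already, and error
 * handling is elided.
 */
#if 0
static int example_transport_xmit(struct sock *sk, struct sk_buff *skb,
				  struct flowi *fl)
{
	/* skb contains only the transport header and payload; ip6_xmit()
	 * pushes the extension headers (from opt) and the IPv6 header. */
	return ip6_xmit(sk, skb, fl, inet6_sk(sk)->opt);
}
#endif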

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine.  It's code duplication, but I really want to avoid
 *	extra checks, since ipv6_build_header is used by TCP (which
 *	is performance critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
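
/*
 * Delivery pattern above: every matching router-alert socket except the
 * last receives a clone, and the final match consumes the original skb,
 * so the buffer is handed off exactly once.  A return of 1 tells
 * ip6_forward() that the packet was delivered and must not also be
 * forwarded.
 */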

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Unicast neighbour discovery messages destined
			 * to the proxied address are passed to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
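
/*
 * Return convention for the helper above: 1 means the proxied ND packet
 * should be delivered locally via ip6_input(), 0 means forward it as
 * usual, and -1 means a link failure was signalled and the caller must
 * discard the packet.
 */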

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We do NOT process RA packets; we push them to user level
	 *	AS IS, without any warranty that the application will be
	 *	able to interpret them.  The reason is that we cannot do
	 *	anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP, we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	 * cannot send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same:
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Decrementing the hop limit is delayed until after the skb COW
	 * above, so we never modify a shared header. */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}
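
/*
 * The helper above returns the length of the unfragmentable part: per
 * RFC 2460 the fragment header must be inserted after the hop-by-hop,
 * routing, and (for Mobile IPv6) HAO-carrying destination options
 * headers.  For a plain TCP or UDP packet with no extension headers it
 * simply returns sizeof(struct ipv6hdr) == 40, with *nexthdr pointing
 * at the basic header's nexthdr field so the caller can overwrite it
 * with NEXTHDR_FRAGMENT.
 */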

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (!skb->local_df && skb->len > mtu) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);
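	/*
	 * Worked example: with a 1500-byte link MTU and a 40-byte
	 * unfragmentable part (hlen, the bare IPv6 header), the payload
	 * budget per fragment becomes 1500 - 40 - 8 = 1452 bytes; the
	 * slow path below further rounds non-final fragments down to
	 * 1448, since fragment offsets are expressed in 8-octet units.
	 */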

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		 * then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->dst.dev),
				      GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

/* Returns nonzero (route mismatch) when the flow address neither matches
 * a host route's key nor the socket's cached last-used address;
 * ip6_sk_dst_check() then drops the cached dst.
 */
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the non-connected case is not
	 * very simple.  Take into account that we do not support routing
	 * by source, TOS, and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route, check that
	 *    the cached destination is current.  If it is a network
	 *    route, we still may check its validity using the saved
	 *    pointer to the last used address: daddr_cache.  We do not
	 *    want to save the whole address now (because the main
	 *    consumer of this service is TCP, which does not have this
	 *    problem), so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we've looked up has a neighbour entry that
	 * is not in the VALID state and the source address from the
	 * flow is marked as OPTIMISTIC, we release the found dst entry
	 * and replace it with the dst entry of the nexthop router.
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
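
/*
 * Illustrative sketch (not part of the original file): a typical
 * datagram send path over the two lookup helpers above.  The function
 * name is hypothetical and error handling is abbreviated.
 */
#if 0
static int example_datagram_route(struct sock *sk, struct flowi *fl)
{
	struct dst_entry *dst;
	int err;

	/* Reuse the socket's cached route when it is still valid,
	 * otherwise fall back to a fresh routing lookup. */
	err = ip6_sk_dst_lookup(sk, &dst, fl);
	if (err)
		return err;

	/* dst now holds a reference; the caller typically passes it to
	 * ip6_append_data() via its rt6_info and releases it later. */
	dst_release(dst);
	return 0;
}
#endif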

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the
	 * complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);

	return err;
}
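
/*
 * gso_size example for the helper above: with mtu 1500 and a 40-byte
 * fragmentable-part header (fragheaderlen), gso_size becomes
 * (1500 - 40 - 8) & ~7 = 1448, so the device splits the jumbo UDP skb
 * into 1448-byte chunks, each carried behind an 8-byte fragment header.
 */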

/* Duplicate an extension header; (hdrlen + 1) * 8 is its on-wire size. */
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above --miyazawa */
		}
		dst_hold(&rt->dst);
		inet->cork.dst = &rt->dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);
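	/*
	 * maxfraglen example: mtu 1500 and fragheaderlen 40 (the bare
	 * IPv6 header) give ((1500 - 40) & ~7) + 40 - 8 = 1488 bytes per
	 * fragment skb.  Each non-final fragment then carries 1448
	 * payload bytes (a multiple of 8), and once the 8-byte fragment
	 * header is inserted at transmit time the frame is 1496 bytes,
	 * still within the 1500-byte MTU.
	 */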

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use the MTU if the total length of the message fits into it.
	 * Otherwise, we need to reserve the fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into the current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If the remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at the
			 * tail.  Note: we overallocate on fragments with
			 * MSG_MORE because we have no idea whether we're
			 * the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			/*
			 * We just reserve space for the fragment header.
			 * Note: this may be an overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 &&
				   getfrag(from, data + transhdrlen, offset,
					   copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    page_address(frag->page) + frag->page_offset + frag->size,
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data from the extension header to the IP header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
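
/*
 * Illustrative sketch (not part of the original file): how a datagram
 * protocol drives the corking API above.  getfrag, its cookie and the
 * route are assumed to be set up by the caller; names are hypothetical.
 */
#if 0
static int example_corked_send(struct sock *sk, struct flowi *fl,
			       struct rt6_info *rt, void *data, int len,
			       int getfrag(void *from, char *to, int offset,
					   int len, int odd,
					   struct sk_buff *skb))
{
	int err;

	lock_sock(sk);
	/* Queue the payload; repeated calls with MSG_MORE would keep
	 * appending to the same pending queue. */
	err = ip6_append_data(sk, getfrag, data, len, 0, -1, 0, NULL,
			      fl, rt, 0, 0);
	if (err)
		ip6_flush_pending_frames(sk);	/* drop what was queued */
	else
		err = ip6_push_pending_frames(sk); /* build hdr and send */
	release_sock(sk);
	return err;
}
#endif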

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}