/* linux/net/ipv6/ip6_output.c */
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
  28
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

/* Finalize a locally generated packet and run it through the
 * NF_INET_LOCAL_OUT netfilter hook on its way to dst_output().
 *
 * Fills in the IPv6 header's payload_len from skb->len.  A payload
 * larger than IPV6_MAXPLEN cannot be represented in the 16-bit
 * field, so 0 is stored instead (jumbogram case; the real length is
 * then carried in a hop-by-hop option).
 *
 * Returns the nf_hook() verdict: 1 means "accepted, caller must
 * continue with dst_output()"; other values mean the packet was
 * queued, stolen or dropped by netfilter.
 */
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;	/* jumbogram: 16-bit field cannot hold it */
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}
  71
/* Send a locally generated packet.
 *
 * __ip6_local_out() returns 1 when the LOCAL_OUT hook accepted the
 * packet without consuming it; only then do we continue with
 * dst_output().  Any other value (stolen, queued, or an error code)
 * is propagated to the caller unchanged.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
  83
/* Final transmit step: resolve (or create) the neighbour entry for
 * the route's next hop and hand the skb to the device via
 * dst_neigh_output().
 *
 * Multicast destinations are special-cased first:
 *  - a clone is looped back to the local stack when the sending
 *    socket has multicast loopback enabled and either a multicast
 *    routing socket or a local group member wants the packet;
 *  - a packet with hop_limit 0 is dropped after the loopback copy
 *    (it was never meant to leave the host);
 *  - node-local scoped packets are dropped on non-loopback devices.
 *
 * Returns the neighbour output result, 0 for the multicast drop
 * cases above, or -EINVAL (after counting OUTNOROUTES) when no
 * neighbour entry could be created.
 */
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy to local listeners: either a multicast
		 * router socket needs to see this not-yet-forwarded
		 * packet, or a local member of the group exists.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* hop_limit 0: deliverable locally only, never
			 * transmitted on the wire.
			 */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);

		/* Node-local scope must never leave the host. */
		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* Neighbour lookup/creation and the transmit itself run inside
	 * one rcu_read_lock_bh() section so the noref neighbour cannot
	 * go away under us.
	 */
	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		ret = dst_neigh_output(dst, neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
 149
 150static int ip6_finish_output(struct sk_buff *skb)
 151{
 152        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 153            dst_allfrag(skb_dst(skb)))
 154                return ip6_fragment(skb, ip6_finish_output2);
 155        else
 156                return ip6_finish_output2(skb);
 157}
 158
/* dst_output() entry point for IPv6.
 *
 * Drops the packet (counting OUTDISCARDS) when IPv6 is
 * administratively disabled on the output device; otherwise runs the
 * NF_INET_POST_ROUTING hook before ip6_finish_output().  The hook is
 * skipped for packets flagged IP6SKB_REROUTED, which have already
 * traversed it once.
 */
int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
 174
/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

/* Build the IPv6 header (and any extension headers from @opt) in
 * front of the transport payload already in @skb, then send it via
 * the NF_INET_LOCAL_OUT hook.
 *
 * @sk:     sending socket (supplies priority, mark, hop limit).
 * @fl6:    flow describing saddr/daddr, flowlabel and protocol.
 * @opt:    optional ipv6_txoptions; may force a headroom realloc.
 * @tclass: traffic class for the flow header.
 *
 * On success returns the netfilter/dst_output result.  Returns
 * -ENOBUFS when headroom reallocation fails, or -EMSGSIZE (after
 * reporting EMSGSIZE to the socket) when the packet exceeds the
 * path MTU and neither local_df nor GSO applies.  The skb is always
 * consumed, including on error.
 */
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			/* The copy succeeded: drop the original and take
			 * write ownership of the new skb for the socket.
			 */
			consume_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		/* Fragmentable options go after a routing header (if
		 * any); non-fragmentable ones may rewrite first_hop.
		 */
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);	/* socket did not pin one */

	ip6_flow_hdr(hdr, tclass, fl6->flowlabel);

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	/* Too big and not allowed to fragment: tell the socket. */
	skb->dev = dst->dev;
	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
 261
/* Deliver a Router Alert packet to every raw socket registered on
 * ip6_ra_chain whose selector matches @sel (and whose bound device,
 * if any, matches the receiving device).
 *
 * Each matching socket except the last receives a clone; the last
 * one consumes the original skb, avoiding one clone in the common
 * single-listener case.
 *
 * Returns 1 when the skb was delivered (and thus consumed), 0 when
 * no socket matched and the caller still owns the skb.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			/* Previous match gets a clone; the original is
			 * saved for the final matching socket.
			 */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
 290
/* Classify a packet whose destination is an address we proxy
 * (proxy NDP) during forwarding.
 *
 * Returns:
 *   1  - unicast neighbour-discovery message for the proxied
 *        address; caller should hand it to local input.
 *   0  - ordinary traffic; caller may forward it.
 *  -1  - destination is link-local and cannot be proxied; the
 *        sender has been signalled via dst_link_failure() and the
 *        caller should drop the packet.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Walk any extension headers to find the transport header. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Need at least the ICMPv6 type byte in linear data. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
 342
/* Final step of forwarding, invoked after the NF_INET_FORWARD hook:
 * hand the packet to the route's output function.
 */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
 347
/* Forward a received IPv6 packet towards its next hop.
 *
 * Performs the router-side checks in order: forwarding enabled, no
 * LRO-merged skb, XFRM forward policy, hop-limit, proxy NDP,
 * XFRM rerouting, ICMPv6 redirect generation, source-address
 * sanity, and path-MTU enforcement; then decrements hop_limit
 * (after skb_cow) and passes the packet through NF_INET_FORWARD.
 *
 * Returns 0 on success or when the packet was diverted (RA chain,
 * local input for proxied NDP); a negative errno when the packet
 * was dropped.  The skb is always consumed.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* Only forward frames that were addressed to us at layer 2. */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without ane WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);	/* NDP for proxied addr: local input */
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);	/* xfrm may have replaced the route */

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;	/* never report less than the IPv6 minimum */

	if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
	    (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);	/* skb_cow() may have moved the header */

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
 505
/* Copy per-packet metadata from @from to a freshly allocated
 * fragment @to: packet type, priority, protocol, route reference,
 * device, mark, traffic-control index, netfilter state and the
 * security mark.  Used by ip6_fragment() so every fragment carries
 * the same bookkeeping as the original packet.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Replace any dst on @to with a new reference to @from's. */
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}
 525
/* Split an oversized IPv6 packet into fragments and hand each one
 * to @output.
 *
 * Two strategies:
 *  - fast path: when the skb carries a frag_list whose geometry
 *    already fits the MTU (each piece 8-byte aligned, enough
 *    headroom), the existing buffers are converted to fragments in
 *    place, avoiding data copies;
 *  - slow path: otherwise a new skb is allocated per fragment and
 *    the payload is copied with skb_copy_bits().
 *
 * If the packet may not be fragmented (local_df clear and len > mtu,
 * or a reassembly limit recorded in frag_max_size is exceeded), an
 * ICMPV6_PKT_TOOBIG is emitted and -EMSGSIZE returned.  The skb is
 * always consumed.
 */
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err=0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	/* hlen = length of the unfragmentable part; prevhdr points at
	 * the nexthdr byte that must become NEXTHDR_FRAGMENT.
	 */
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (unlikely(!skb->local_df && skb->len > mtu) ||
		     (IP6CB(skb)->frag_max_size &&
		      IP6CB(skb)->frag_max_size > mtu)) {
		if (skb->sk && dst_allfrag(skb_dst(skb)))
			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* Honour a smaller per-socket fragment size if configured. */
	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	/* From here on, mtu is the per-fragment payload budget. */
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		/* Fast path only if the head piece fits and is aligned,
		 * and the skb is not shared with anyone else.
		 */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			/* Take over wmem accounting for each piece. */
			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		/* Make room for the fragment header in the first piece. */
		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);	/* paired with ip6_rt_put() below */

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if(!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			ip6_rt_put(rt);
			return 0;
		}

		/* Error: free every fragment not yet handed to output(). */
		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		ip6_rt_put(rt);
		return err;

slow_path_clean:
		/* Undo the ownership transfer done in the geometry walk. */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
	    skb_checksum_help(skb))
		goto fail;

	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while(left > 0)	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      hroom + troom, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		/* First fragment picks the identification; the rest reuse it. */
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
 811
 812static inline int ip6_rt_check(const struct rt6key *rt_key,
 813                               const struct in6_addr *fl_addr,
 814                               const struct in6_addr *addr_cache)
 815{
 816        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 817                (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
 818}
 819
 820static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 821                                          struct dst_entry *dst,
 822                                          const struct flowi6 *fl6)
 823{
 824        struct ipv6_pinfo *np = inet6_sk(sk);
 825        struct rt6_info *rt;
 826
 827        if (!dst)
 828                goto out;
 829
 830        if (dst->ops->family != AF_INET6) {
 831                dst_release(dst);
 832                return NULL;
 833        }
 834
 835        rt = (struct rt6_info *)dst;
 836        /* Yes, checking route validity in not connected
 837         * case is not very simple. Take into account,
 838         * that we do not support routing by source, TOS,
 839         * and MSG_DONTROUTE            --ANK (980726)
 840         *
 841         * 1. ip6_rt_check(): If route was host route,
 842         *    check that cached destination is current.
 843         *    If it is network route, we still may
 844         *    check its validity using saved pointer
 845         *    to the last used address: daddr_cache.
 846         *    We do not want to save whole address now,
 847         *    (because main consumer of this service
 848         *    is tcp, which has not this problem),
 849         *    so that the last trick works only on connected
 850         *    sockets.
 851         * 2. oif also should be the same.
 852         */
 853        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 854#ifdef CONFIG_IPV6_SUBTREES
 855            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 856#endif
 857            (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
 858                dst_release(dst);
 859                dst = NULL;
 860        }
 861
 862out:
 863        return dst;
 864}
 865
/*
 * Common tail of the dst-lookup helpers below.
 *
 * Resolves *dst for @fl6: performs a routing lookup when the caller did
 * not supply a dst, fills in fl6->saddr via source address selection when
 * it is still unspecified, and (under CONFIG_IPV6_OPTIMISTIC_DAD) falls
 * back to the default router's dst when the chosen source address is
 * optimistic and the looked-up route has no valid neighbour entry.
 *
 * Returns 0 on success.  On failure *dst is released, set to NULL, and a
 * negative errno is returned.
 */
static int ip6_dst_lookup_tail(struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
        struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;

        if (*dst == NULL)
                *dst = ip6_route_output(net, sk, fl6);

        /* The lookup result is dereferenced unconditionally here; lookup
         * failures are reported through dst->error rather than NULL. */
        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl6->saddr)) {
                struct rt6_info *rt = (struct rt6_info *) *dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;
        }

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                /* Redirect only when the source address is optimistic. */
                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        /* Re-run the lookup with an unspecified daddr so the
                         * default route (gateway) is selected. */
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        if ((err = (*dst)->error))
                                goto out_err_release;
                }
        }
#endif

        return 0;

out_err_release:
        if (err == -ENETUNREACH)
                IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        dst_release(*dst);
        *dst = NULL;
        return err;
}
 942
 943/**
 944 *      ip6_dst_lookup - perform route lookup on flow
 945 *      @sk: socket which provides route info
 946 *      @dst: pointer to dst_entry * for result
 947 *      @fl6: flow to lookup
 948 *
 949 *      This function performs a route lookup on the given flow.
 950 *
 951 *      It returns zero on success, or a standard errno code on error.
 952 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
        /* Discard any caller-supplied dst so the tail helper always does a
         * fresh route lookup. */
        *dst = NULL;
        return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
 959
 960/**
 961 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 962 *      @sk: socket which provides route info
 963 *      @fl6: flow to lookup
 964 *      @final_dst: final destination address for ipsec lookup
 965 *      @can_sleep: we are in a sleepable context
 966 *
 967 *      This function performs a route lookup on the given flow.
 968 *
 969 *      It returns a valid dst pointer on success, or a pointer encoded
 970 *      error code.
 971 */
 972struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
 973                                      const struct in6_addr *final_dst,
 974                                      bool can_sleep)
 975{
 976        struct dst_entry *dst = NULL;
 977        int err;
 978
 979        err = ip6_dst_lookup_tail(sk, &dst, fl6);
 980        if (err)
 981                return ERR_PTR(err);
 982        if (final_dst)
 983                fl6->daddr = *final_dst;
 984        if (can_sleep)
 985                fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
 986
 987        return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
 988}
 989EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
 990
 991/**
 992 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 993 *      @sk: socket which provides the dst cache and route info
 994 *      @fl6: flow to lookup
 995 *      @final_dst: final destination address for ipsec lookup
 996 *      @can_sleep: we are in a sleepable context
 997 *
 998 *      This function performs a route lookup on the given flow with the
 999 *      possibility of using the cached route in the socket if it is valid.
1000 *      It will take the socket dst lock when operating on the dst cache.
1001 *      As a result, this function can only be used in process context.
1002 *
1003 *      It returns a valid dst pointer on success, or a pointer encoded
1004 *      error code.
1005 */
1006struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1007                                         const struct in6_addr *final_dst,
1008                                         bool can_sleep)
1009{
1010        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1011        int err;
1012
1013        dst = ip6_sk_dst_check(sk, dst, fl6);
1014
1015        err = ip6_dst_lookup_tail(sk, &dst, fl6);
1016        if (err)
1017                return ERR_PTR(err);
1018        if (final_dst)
1019                fl6->daddr = *final_dst;
1020        if (can_sleep)
1021                fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1022
1023        return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1024}
1025EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1026
1027static inline int ip6_ufo_append_data(struct sock *sk,
1028                        int getfrag(void *from, char *to, int offset, int len,
1029                        int odd, struct sk_buff *skb),
1030                        void *from, int length, int hh_len, int fragheaderlen,
1031                        int transhdrlen, int mtu,unsigned int flags,
1032                        struct rt6_info *rt)
1033
1034{
1035        struct sk_buff *skb;
1036        int err;
1037
1038        /* There is support for UDP large send offload by network
1039         * device, so create one single skb packet containing complete
1040         * udp datagram
1041         */
1042        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1043                skb = sock_alloc_send_skb(sk,
1044                        hh_len + fragheaderlen + transhdrlen + 20,
1045                        (flags & MSG_DONTWAIT), &err);
1046                if (skb == NULL)
1047                        return err;
1048
1049                /* reserve space for Hardware header */
1050                skb_reserve(skb, hh_len);
1051
1052                /* create space for UDP/IP header */
1053                skb_put(skb,fragheaderlen + transhdrlen);
1054
1055                /* initialize network header pointer */
1056                skb_reset_network_header(skb);
1057
1058                /* initialize protocol header pointer */
1059                skb->transport_header = skb->network_header + fragheaderlen;
1060
1061                skb->protocol = htons(ETH_P_IPV6);
1062                skb->ip_summed = CHECKSUM_PARTIAL;
1063                skb->csum = 0;
1064        }
1065
1066        err = skb_append_datato_frags(sk,skb, getfrag, from,
1067                                      (length - transhdrlen));
1068        if (!err) {
1069                struct frag_hdr fhdr;
1070
1071                /* Specify the length of each IPv6 datagram fragment.
1072                 * It has to be a multiple of 8.
1073                 */
1074                skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1075                                             sizeof(struct frag_hdr)) & ~7;
1076                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1077                ipv6_select_ident(&fhdr, rt);
1078                skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1079                __skb_queue_tail(&sk->sk_write_queue, skb);
1080
1081                return 0;
1082        }
1083        /* There is not enough support do UPD LSO,
1084         * so follow normal path
1085         */
1086        kfree_skb(skb);
1087
1088        return err;
1089}
1090
1091static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1092                                               gfp_t gfp)
1093{
1094        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1095}
1096
1097static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1098                                                gfp_t gfp)
1099{
1100        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1101}
1102
1103static void ip6_append_data_mtu(unsigned int *mtu,
1104                                int *maxfraglen,
1105                                unsigned int fragheaderlen,
1106                                struct sk_buff *skb,
1107                                struct rt6_info *rt,
1108                                bool pmtuprobe)
1109{
1110        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1111                if (skb == NULL) {
1112                        /* first fragment, reserve header_len */
1113                        *mtu = *mtu - rt->dst.header_len;
1114
1115                } else {
1116                        /*
1117                         * this fragment is not first, the headers
1118                         * space is regarded as data space.
1119                         */
1120                        *mtu = min(*mtu, pmtuprobe ?
1121                                   rt->dst.dev->mtu :
1122                                   dst_mtu(rt->dst.path));
1123                }
1124                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1125                              + fragheaderlen - sizeof(struct frag_hdr);
1126        }
1127}
1128
/*
 *	ip6_append_data - queue data on a corked IPv6 socket
 *
 *	Appends @length bytes (pulled via @getfrag from @from) to the
 *	socket write queue, building skbs of at most the cork MTU and
 *	reserving room for extension headers and a fragment header.  The
 *	first call on an empty write queue captures @opt, @fl6, @rt,
 *	@hlimit, @tclass and the MTU into the per-socket cork state; later
 *	calls reuse that state and ignore the corresponding arguments.
 *	Queued packets are completed by ip6_push_pending_frames() or
 *	discarded by ip6_flush_pending_frames().
 *
 *	Returns 0 on success or a negative errno.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;

	if (flags&MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			/* NOTE(review): if one of the dups below fails, the
			 * options copied so far remain in np->cork.opt and
			 * appear to be reclaimed only by a later
			 * ip6_cork_release() — confirm callers always flush
			 * on error. */
			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		/* MTU selection: XFRM tunnel routes use the route itself,
		 * otherwise the path's MTU; IPV6_PMTUDISC_PROBE always uses
		 * the device MTU. */
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		/* A smaller per-socket frag_size overrides the path MTU. */
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		/* Fragmentable extension headers ride along with the first
		 * chunk of payload. */
		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	} else {
		/* Subsequent append: resume from the state captured when
		 * corking started; caller-supplied rt/fl6/opt are ignored. */
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* Per-fragment header overhead: IPv6 header, per-nexthop headers
	 * and non-fragmentable extension headers. */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	/* Reject datagrams that cannot fit the 16-bit payload length. */
	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM)
		sock_tx_timestamp(sk, &tx_flags);

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}

		/* Over-MTU UDP on a UFO-capable device: build one large skb
		 * and let the hardware segment it. */
		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    np->pmtudisc ==
						    IPV6_PMTUDISC_PROBE);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				/* Non-first fragment: only allocate while the
				 * socket send buffer has headroom. */
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the 8-byte-alignment overhang from the
				 * previous skb into this one, keeping the
				 * previous skb's checksum consistent. */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			/* Transport header and extension headers are only
			 * present in the first fragment. */
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* No scatter-gather: copy into the linear area. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter-gather: append to the socket's page frag,
			 * coalescing with the last fragment when possible. */
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	/* Roll back the optimistic length accounting done above. */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_append_data);
1475
1476static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1477{
1478        if (np->cork.opt) {
1479                kfree(np->cork.opt->dst0opt);
1480                kfree(np->cork.opt->dst1opt);
1481                kfree(np->cork.opt->hopopt);
1482                kfree(np->cork.opt->srcrt);
1483                kfree(np->cork.opt);
1484                np->cork.opt = NULL;
1485        }
1486
1487        if (inet->cork.base.dst) {
1488                dst_release(inet->cork.base.dst);
1489                inet->cork.base.dst = NULL;
1490                inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1491        }
1492        memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1493}
1494
/*
 *	ip6_push_pending_frames - finish and transmit the corked packet
 *
 *	Collapses every skb queued by ip6_append_data() into one packet
 *	(trailing skbs become the frag_list of the first), prepends the
 *	saved extension headers and the IPv6 header built from the cork
 *	state, updates SNMP counters, and hands the result to
 *	ip6_local_out().  The cork state is released on all paths.
 *
 *	Returns 0 on success or a negative errno.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining queued skbs onto the first skb's frag_list,
	 * transferring their memory accounting away from the socket. */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	/* Push the saved extension headers; a routing header may rewrite
	 * the packet's destination, so work on a copy of fl6->daddr. */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		/* Positive values are congestion-notification codes;
		 * translate them into an errno (or success). */
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1577
1578void ip6_flush_pending_frames(struct sock *sk)
1579{
1580        struct sk_buff *skb;
1581
1582        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1583                if (skb_dst(skb))
1584                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1585                                      IPSTATS_MIB_OUTDISCARDS);
1586                kfree_skb(skb);
1587        }
1588
1589        ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1590}
1591EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1592