linux/net/ipv6/ip6_output.c
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

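/* Final transmit step after netfilter POST_ROUTING: loop multicast
 * packets back to local listeners when needed, hand the skb off to a
 * lightweight tunnel if the route has one attached, then resolve the
 * nexthop neighbour and queue the frame on the output device.
 */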
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct in6_addr *nexthop;
        int ret;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

                if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
                    IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
        neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
        if (!IS_ERR(neigh)) {
                sock_confirm_neigh(skb, neigh);
                ret = neigh_output(neigh, skb);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

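/* Run the cgroup BPF egress hook, then fragment the packet if it
 * exceeds the path MTU (and is not GSO) before handing it to
 * ip6_finish_output2().
 */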
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        if (ret) {
                kfree_skb(skb);
                return ret;
        }

        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}

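/* Entry point for locally generated packets leaving the IP layer:
 * drops the packet if IPv6 is administratively disabled on the egress
 * device, otherwise passes it through the NF_INET_POST_ROUTING hook,
 * skipping the hook for packets already rerouted by netfilter.
 */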
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the socket
 * might still be modified by calls to skb_set_owner_w() and
 * ipv6_local_error(), which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        if (opt) {
                unsigned int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (!skb2) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        consume_skb(skb);
                        skb = skb2;
                        /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
                         * so it is safe to call in our context (socket lock not held)
                         */
                        skb_set_owner_w(skb, (struct sock *)sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                                     np->autoflowlabel, fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = sk->sk_priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                 IPSTATS_MIB_OUT, skb->len);

                /* if the egress device is enslaved to an L3 master device,
                 * pass the skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume the socket lock is held;
                 * we promote our socket to non-const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dst->dev,
                               dst_output);
        }

        skb->dev = dst->dev;
        /* ipv6_local_error() does not require the socket lock,
         * so we promote our socket to non-const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

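/* Deliver a Router Alert packet to every raw socket registered for
 * this alert value (via the IPV6_ROUTER_ALERT sockopt). Returns 1 if
 * at least one socket consumed the skb, 0 otherwise.
 */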
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

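/* Decide what to do with a packet whose destination we proxy (NDP
 * proxying): return 1 to divert unicast neighbour discovery messages
 * to local input, -1 to drop packets for link-local destinations, and
 * 0 to forward normally.
 */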
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* If this is a reaction involving a unicast neighbor
                         * discovery message destined to the proxied address,
                         * pass it to the input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        return dst_output(net, sk, skb);
}

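/* MTU to honour on the forwarding path: a route-locked RTAX_MTU metric
 * takes precedence; otherwise fall back to the egress device's IPv6
 * MTU, or IPV6_MIN_MTU when the device has no inet6_dev.
 */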
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
        unsigned int mtu;
        struct inet6_dev *idev;

        if (dst_metric_locked(dst, RTAX_MTU)) {
                mtu = dst_metric_raw(dst, RTAX_MTU);
                if (mtu)
                        return mtu;
        }

        mtu = IPV6_MIN_MTU;
        rcu_read_lock();
        idev = __in6_dev_get(dst->dev);
        if (idev)
                mtu = idev->cnf.mtu6;
        rcu_read_unlock();

        return mtu;
}

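/* A packet is "too big" to forward when it exceeds the MTU and neither
 * the conntrack-defrag marking (frag_max_size within the MTU), the
 * ignore_df flag, nor GSO segments that fit the MTU let it through.
 */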
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
                return false;

        return true;
}

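/* The IPv6 forwarding path: validate the packet (forwarding enabled,
 * hop limit, xfrm policy), divert Router Alert and proxied NDP
 * traffic, emit redirects when the packet leaves on the interface it
 * arrived on, enforce the path MTU, decrement the hop limit and pass
 * the packet to the NF_INET_FORWARD hook.
 */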
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We do not do any processing on RA packets, pushing them
         *      to user level AS IS without any guarantee that the
         *      application will be able to interpret them. The reason
         *      is that we cannot make anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP we cannot do anything with it.
         *      Defragmentation would also be a mistake; RA packets
         *      cannot be fragmented, because there is no guarantee
         *      that different fragments will go along one path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force the OUTPUT device to be used for the source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                        IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* The IPv6 specs say nothing about it, but it is clear that we
           cannot send redirects for source-routed frames.
           We also don't send redirects for frames decapsulated from IPsec.
         */
        if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same;
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_forward(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force the OUTPUT device to be used for the source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling the hop limit is delayed until after the skb COW */

        hdr->hop_limit--;

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_copy_secmark(to, from);
}

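/* Fragment an IPv6 packet that exceeds the path MTU. The fast path
 * reuses an existing, well-formed frag list and only prepends a
 * fragment header to each piece; otherwise the slow path allocates a
 * fresh skb per fragment and copies the data. 'output' is invoked for
 * every fragment produced.
 */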
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        int hroom, troom;
        __be32 frag_id;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        err = -ENOMEM;
                        goto fail;
                }
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);

                __skb_pull(skb, hlen);
                fh = __skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                fh->identification = frag_id;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = __skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                kfree_skb_list(frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        troom = rt->dst.dev->needed_tailroom;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                u8 *fragnexthdr_offset;

                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end,
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }

                /* Allocate buffer */
                frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                 hroom + troom, GFP_ATOMIC);
                if (!frag) {
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                fragnexthdr_offset = skb_network_header(frag);
                fragnexthdr_offset += prevhdr - skb_network_header(skb);
                *fragnexthdr_offset = NEXTHDR_FRAGMENT;

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
                                     len));
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the unconnected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If the route was a host route,
         *    check that the cached destination is current.
         *    If it is a network route, we still may
         *    check its validity using the saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now
         *    (because the main consumer of this service
         *    is tcp, which does not have this problem),
         *    so the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
              (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

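/* Core of the output route lookup: pick a source address if the flow
 * left it unspecified, perform the routing table lookup, and (with
 * optimistic DAD) fall back to the default router's dst entry while
 * our tentative source address is still being validated.
 */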
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * If the dst entry we've looked up has a neighbour entry that
         * is in the INCOMPLETE state and the src address from the flow
         * is marked as OPTIMISTIC, we release the found dst entry and
         * replace it with the dst entry of the nexthop router instead.
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace in which to perform the lookup
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer-encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      It returns a valid dst pointer on success, or a pointer-encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (!dst)
                dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

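/* Append datagram data as one large GSO (UFO) skb instead of building
 * individual fragments; the device will segment it on transmit using
 * the gso_size computed below.
 */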
static inline int ip6_ufo_append_data(struct sock *sk,
                        struct sk_buff_head *queue,
                        int getfrag(void *from, char *to, int offset, int len,
                        int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int exthdrlen, int transhdrlen, int mtu,
                        unsigned int flags, const struct flowi6 *fl6)

{
        struct sk_buff *skb;
        int err;

        /* The network device supports UDP large send offload, so create
         * one single skb containing the complete UDP datagram.
         */
        skb = skb_peek_tail(queue);
        if (!skb) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (!skb)
                        return err;

                /* reserve space for the hardware header */
                skb_reserve(skb, hh_len);

                /* create space for the UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize the network header pointer */
                skb_set_network_header(skb, exthdrlen);

                /* initialize the protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->protocol = htons(ETH_P_IPV6);
                skb->csum = 0;

                if (flags & MSG_CONFIRM)
                        skb_set_dst_pending_confirm(skb, 1);

                __skb_queue_tail(queue, skb);
        } else if (skb_is_gso(skb)) {
                goto append;
        }

        skb->ip_summed = CHECKSUM_PARTIAL;
        /* Specify the length of each IPv6 datagram fragment.
         * It has to be a multiple of 8.
         */
        skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
                                     sizeof(struct frag_hdr)) & ~7;
        skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
                                                         &fl6->daddr,
                                                         &fl6->saddr);

append:
        return skb_append_datato_frags(sk, skb, getfrag, from,
                                       (length - transhdrlen));
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

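/* Recompute the usable MTU and maximum fragment length when a new
 * fragment is started on a non-XFRM-tunnel route; only the first
 * fragment has to reserve the dst's header_len.
 */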
static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not the first one; the header
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

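/* Set up the cork for a corked-send sequence: duplicate the transmit
 * options so they outlive the caller, take a reference on the route,
 * and record the hop limit, traffic class and fragment size used by
 * __ip6_append_data().
 */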
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *opt = ipc6->opt;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = opt->tot_len;
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above --miyazawa */
        }
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      rt->dst.dev->mtu : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        cork->base.fragsize = mtu;
        if (dst_allfrag(rt->dst.path))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        return 0;
}

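/* Workhorse behind ip6_append_data(): append 'length' bytes supplied
 * by 'getfrag' to the queue, growing the tail skb where possible and
 * allocating new fragment-sized skbs when it is full, while keeping
 * every fragment's payload a multiple of 8 bytes.
 */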
1280static int __ip6_append_data(struct sock *sk,
1281                             struct flowi6 *fl6,
1282                             struct sk_buff_head *queue,
1283                             struct inet_cork *cork,
1284                             struct inet6_cork *v6_cork,
1285                             struct page_frag *pfrag,
1286                             int getfrag(void *from, char *to, int offset,
1287                                         int len, int odd, struct sk_buff *skb),
1288                             void *from, int length, int transhdrlen,
1289                             unsigned int flags, struct ipcm6_cookie *ipc6,
1290                             const struct sockcm_cookie *sockc)
1291{
1292        struct sk_buff *skb, *skb_prev = NULL;
1293        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1294        int exthdrlen = 0;
1295        int dst_exthdrlen = 0;
1296        int hh_len;
1297        int copy;
1298        int err;
1299        int offset = 0;
1300        __u8 tx_flags = 0;
1301        u32 tskey = 0;
1302        struct rt6_info *rt = (struct rt6_info *)cork->dst;
1303        struct ipv6_txoptions *opt = v6_cork->opt;
1304        int csummode = CHECKSUM_NONE;
1305        unsigned int maxnonfragsize, headersize;
1306
1307        skb = skb_peek_tail(queue);
1308        if (!skb) {
1309                exthdrlen = opt ? opt->opt_flen : 0;
1310                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1311        }
1312
1313        mtu = cork->fragsize;
1314        orig_mtu = mtu;
1315
1316        hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1317
1318        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1319                        (opt ? opt->opt_nflen : 0);
1320        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1321                     sizeof(struct frag_hdr);
1322
1323        headersize = sizeof(struct ipv6hdr) +
1324                     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1325                     (dst_allfrag(&rt->dst) ?
1326                      sizeof(struct frag_hdr) : 0) +
1327                     rt->rt6i_nfheader_len;
1328
1329        if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1330            (sk->sk_protocol == IPPROTO_UDP ||
1331             sk->sk_protocol == IPPROTO_RAW)) {
1332                ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1333                                sizeof(struct ipv6hdr));
1334                goto emsgsize;
1335        }
1336
1337        if (ip6_sk_ignore_df(sk))
1338                maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1339        else
1340                maxnonfragsize = mtu;
1341
1342        if (cork->length + length > maxnonfragsize - headersize) {
1343emsgsize:
1344                ipv6_local_error(sk, EMSGSIZE, fl6,
1345                                 mtu - headersize +
1346                                 sizeof(struct ipv6hdr));
1347                return -EMSGSIZE;
1348        }
1349
1350        /* CHECKSUM_PARTIAL only with no extension headers and when
1351         * we are not going to fragment
1352         */
1353        if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1354            headersize == sizeof(struct ipv6hdr) &&
1355            length <= mtu - headersize &&
1356            !(flags & MSG_MORE) &&
1357            rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1358                csummode = CHECKSUM_PARTIAL;
1359
1360        if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1361                sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1362                if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1363                    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1364                        tskey = sk->sk_tskey++;
1365        }
1366
1367        /*
1368         * Let's try using as much space as possible.
1369         * Use MTU if total length of the message fits into the MTU.
1370         * Otherwise, we need to reserve fragment header and
1371         * fragment alignment (= 8-15 octects, in total).
1372         *
1373         * Note that we may need to "move" the data from the tail of
1374         * of the buffer to the new fragment when we split
1375         * the message.
1376         *
1377         * FIXME: It may be fragmented into multiple chunks
1378         *        at once if non-fragmentable extension headers
1379         *        are too large.
1380         * --yoshfuji
1381         */
1382
1383        cork->length += length;
1384        if ((skb && skb_is_gso(skb)) ||
1385            (((length + (skb ? skb->len : headersize)) > mtu) &&
1386            (skb_queue_len(queue) <= 1) &&
1387            (sk->sk_protocol == IPPROTO_UDP) &&
1388            (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
1389            (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk))) {
1390                err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1391                                          hh_len, fragheaderlen, exthdrlen,
1392                                          transhdrlen, mtu, flags, fl6);
1393                if (err)
1394                        goto error;
1395                return 0;
1396        }
1397
1398        if (!skb)
1399                goto alloc_new_skb;
1400
1401        while (length > 0) {
1402                /* Check if the remaining data fits into current packet. */
1403                copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1404                if (copy < length)
1405                        copy = maxfraglen - skb->len;
1406
1407                if (copy <= 0) {
1408                        char *data;
1409                        unsigned int datalen;
1410                        unsigned int fraglen;
1411                        unsigned int fraggap;
1412                        unsigned int alloclen;
1413alloc_new_skb:
1414                        /* There's no room in the current skb */
1415                        if (skb)
1416                                fraggap = skb->len - maxfraglen;
1417                        else
1418                                fraggap = 0;
1419                        /* update mtu and maxfraglen if necessary */
1420                        if (!skb || !skb_prev)
1421                                ip6_append_data_mtu(&mtu, &maxfraglen,
1422                                                    fragheaderlen, skb, rt,
1423                                                    orig_mtu);
1424
1425                        skb_prev = skb;
1426
1427                        /*
1428                         * If remaining data exceeds the mtu,
1429                         * we know we need more fragment(s).
1430                         */
1431                        datalen = length + fraggap;
1432
1433                        if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1434                                datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1435                        if ((flags & MSG_MORE) &&
1436                            !(rt->dst.dev->features & NETIF_F_SG))
1437                                alloclen = mtu;
1438                        else
1439                                alloclen = datalen + fragheaderlen;
1440
1441                        alloclen += dst_exthdrlen;
1442
1443                        if (datalen != length + fraggap) {
1444                                /*
1445                                 * This is not the last fragment; the trailer
1446                                 * space is regarded as data space.
1447                                 */
1448                                datalen += rt->dst.trailer_len;
1449                        }
1450
1451                        alloclen += rt->dst.trailer_len;
1452                        fraglen = datalen + fragheaderlen;
1453
1454                        /*
1455                         * We just reserve space for the fragment header.
1456                         * Note: this may overallocate if the message
1457                         * (without MSG_MORE) fits into the MTU.
1458                         */
1459                        alloclen += sizeof(struct frag_hdr);
1460
1461                        copy = datalen - transhdrlen - fraggap;
1462                        if (copy < 0) {
1463                                err = -EINVAL;
1464                                goto error;
1465                        }
1466                        if (transhdrlen) {
1467                                skb = sock_alloc_send_skb(sk,
1468                                                alloclen + hh_len,
1469                                                (flags & MSG_DONTWAIT), &err);
1470                        } else {
1471                                skb = NULL;
1472                                if (refcount_read(&sk->sk_wmem_alloc) <=
1473                                    2 * sk->sk_sndbuf)
1474                                        skb = sock_wmalloc(sk,
1475                                                           alloclen + hh_len, 1,
1476                                                           sk->sk_allocation);
1477                                if (unlikely(!skb))
1478                                        err = -ENOBUFS;
1479                        }
1480                        if (!skb)
1481                                goto error;
1482                        /*
1483                         *      Fill in the control structures
1484                         */
1485                        skb->protocol = htons(ETH_P_IPV6);
1486                        skb->ip_summed = csummode;
1487                        skb->csum = 0;
1488                        /* reserve for fragmentation and ipsec header */
1489                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1490                                    dst_exthdrlen);
1491
1492                        /* Only the initial fragment is time stamped */
1493                        skb_shinfo(skb)->tx_flags = tx_flags;
1494                        tx_flags = 0;
1495                        skb_shinfo(skb)->tskey = tskey;
1496                        tskey = 0;
1497
1498                        /*
1499                         *      Find where to start putting bytes
1500                         */
1501                        data = skb_put(skb, fraglen);
1502                        skb_set_network_header(skb, exthdrlen);
1503                        data += fragheaderlen;
1504                        skb->transport_header = (skb->network_header +
1505                                                 fragheaderlen);
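                            /*
                             * If the previous skb ran past maxfraglen,
                             * move the overhang into this fragment,
                             * fold its checksum out of the old skb and
                             * trim that skb back to maxfraglen.
                             */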
1506                        if (fraggap) {
1507                                skb->csum = skb_copy_and_csum_bits(
1508                                        skb_prev, maxfraglen,
1509                                        data + transhdrlen, fraggap, 0);
1510                                skb_prev->csum = csum_sub(skb_prev->csum,
1511                                                          skb->csum);
1512                                data += fraggap;
1513                                pskb_trim_unique(skb_prev, maxfraglen);
1514                        }
1515                        if (copy > 0 &&
1516                            getfrag(from, data + transhdrlen, offset,
1517                                    copy, fraggap, skb) < 0) {
1518                                err = -EFAULT;
1519                                kfree_skb(skb);
1520                                goto error;
1521                        }
1522
1523                        offset += copy;
1524                        length -= datalen - fraggap;
1525                        transhdrlen = 0;
1526                        exthdrlen = 0;
1527                        dst_exthdrlen = 0;
1528
1529                        if ((flags & MSG_CONFIRM) && !skb_prev)
1530                                skb_set_dst_pending_confirm(skb, 1);
1531
1532                        /*
1533                         * Put the packet on the pending queue
1534                         */
1535                        __skb_queue_tail(queue, skb);
1536                        continue;
1537                }
1538
1539                if (copy > length)
1540                        copy = length;
1541
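                /*
                 * The data fits in the current skb: copy into the
                 * linear area when the device cannot do scatter-gather,
                 * otherwise append to page fragments, coalescing with
                 * the last fragment when possible.
                 */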
1542                if (!(rt->dst.dev->features & NETIF_F_SG)) {
1543                        unsigned int off;
1544
1545                        off = skb->len;
1546                        if (getfrag(from, skb_put(skb, copy),
1547                                                offset, copy, off, skb) < 0) {
1548                                __skb_trim(skb, off);
1549                                err = -EFAULT;
1550                                goto error;
1551                        }
1552                } else {
1553                        int i = skb_shinfo(skb)->nr_frags;
1554
1555                        err = -ENOMEM;
1556                        if (!sk_page_frag_refill(sk, pfrag))
1557                                goto error;
1558
1559                        if (!skb_can_coalesce(skb, i, pfrag->page,
1560                                              pfrag->offset)) {
1561                                err = -EMSGSIZE;
1562                                if (i == MAX_SKB_FRAGS)
1563                                        goto error;
1564
1565                                __skb_fill_page_desc(skb, i, pfrag->page,
1566                                                     pfrag->offset, 0);
1567                                skb_shinfo(skb)->nr_frags = ++i;
1568                                get_page(pfrag->page);
1569                        }
1570                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
1571                        if (getfrag(from,
1572                                    page_address(pfrag->page) + pfrag->offset,
1573                                    offset, copy, skb->len, skb) < 0)
1574                                goto error_efault;
1575
1576                        pfrag->offset += copy;
1577                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1578                        skb->len += copy;
1579                        skb->data_len += copy;
1580                        skb->truesize += copy;
1581                        refcount_add(copy, &sk->sk_wmem_alloc);
1582                }
1583                offset += copy;
1584                length -= copy;
1585        }
1586
1587        return 0;
1588
1589error_efault:
1590        err = -EFAULT;
1591error:
1592        cork->length -= length;
1593        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1594        return err;
1595}
1596
1597int ip6_append_data(struct sock *sk,
1598                    int getfrag(void *from, char *to, int offset, int len,
1599                                int odd, struct sk_buff *skb),
1600                    void *from, int length, int transhdrlen,
1601                    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1602                    struct rt6_info *rt, unsigned int flags,
1603                    const struct sockcm_cookie *sockc)
1604{
1605        struct inet_sock *inet = inet_sk(sk);
1606        struct ipv6_pinfo *np = inet6_sk(sk);
1607        int exthdrlen;
1608        int err;
1609
1610        if (flags & MSG_PROBE)
1611                return 0;
1612        if (skb_queue_empty(&sk->sk_write_queue)) {
1613                /*
1614                 * set up for corking
1615                 */
1616                err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1617                                     ipc6, rt, fl6);
1618                if (err)
1619                        return err;
1620
1621                exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1622                length += exthdrlen;
1623                transhdrlen += exthdrlen;
1624        } else {
1625                fl6 = &inet->cork.fl.u.ip6;
1626                transhdrlen = 0;
1627        }
1628
1629        return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1630                                 &np->cork, sk_page_frag(sk), getfrag,
1631                                 from, length, transhdrlen, flags, ipc6, sockc);
1632}
1633EXPORT_SYMBOL_GPL(ip6_append_data);
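    /*
     * Typical corked-datagram usage, as an illustrative sketch only
     * (real callers such as udpv6_sendmsg() wrap this in
     * protocol-specific setup; "corking" below is a stand-in for the
     * caller's MSG_MORE/UDP_CORK state):
     *
     *	err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
     *	                      &ipc6, &fl6, rt, msg->msg_flags, &sockc);
     *	if (err)
     *	        ip6_flush_pending_frames(sk);
     *	else if (!corking)
     *	        err = ip6_push_pending_frames(sk);
     */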
1634
1635static void ip6_cork_release(struct inet_cork_full *cork,
1636                             struct inet6_cork *v6_cork)
1637{
1638        if (v6_cork->opt) {
1639                kfree(v6_cork->opt->dst0opt);
1640                kfree(v6_cork->opt->dst1opt);
1641                kfree(v6_cork->opt->hopopt);
1642                kfree(v6_cork->opt->srcrt);
1643                kfree(v6_cork->opt);
1644                v6_cork->opt = NULL;
1645        }
1646
1647        if (cork->base.dst) {
1648                dst_release(cork->base.dst);
1649                cork->base.dst = NULL;
1650                cork->base.flags &= ~IPCORK_ALLFRAG;
1651        }
1652        memset(&cork->fl, 0, sizeof(cork->fl));
1653}
1654
1655struct sk_buff *__ip6_make_skb(struct sock *sk,
1656                               struct sk_buff_head *queue,
1657                               struct inet_cork_full *cork,
1658                               struct inet6_cork *v6_cork)
1659{
1660        struct sk_buff *skb, *tmp_skb;
1661        struct sk_buff **tail_skb;
1662        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1663        struct ipv6_pinfo *np = inet6_sk(sk);
1664        struct net *net = sock_net(sk);
1665        struct ipv6hdr *hdr;
1666        struct ipv6_txoptions *opt = v6_cork->opt;
1667        struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1668        struct flowi6 *fl6 = &cork->fl.u.ip6;
1669        unsigned char proto = fl6->flowi6_proto;
1670
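            /*
             * Collapse the per-fragment skbs queued by
             * __ip6_append_data() into one packet: the first becomes
             * the head, the rest are chained on its frag_list, and the
             * IPv6 header plus any extension headers are pushed onto
             * the head below.
             */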
1671        skb = __skb_dequeue(queue);
1672        if (!skb)
1673                goto out;
1674        tail_skb = &(skb_shinfo(skb)->frag_list);
1675
1676        /* move skb->data from the ext header area up to the IP header */
1677        if (skb->data < skb_network_header(skb))
1678                __skb_pull(skb, skb_network_offset(skb));
1679        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1680                __skb_pull(tmp_skb, skb_network_header_len(skb));
1681                *tail_skb = tmp_skb;
1682                tail_skb = &(tmp_skb->next);
1683                skb->len += tmp_skb->len;
1684                skb->data_len += tmp_skb->len;
1685                skb->truesize += tmp_skb->truesize;
1686                tmp_skb->destructor = NULL;
1687                tmp_skb->sk = NULL;
1688        }
1689
1690        /* Allow local fragmentation. */
1691        skb->ignore_df = ip6_sk_ignore_df(sk);
1692
1693        *final_dst = fl6->daddr;
1694        __skb_pull(skb, skb_network_header_len(skb));
1695        if (opt && opt->opt_flen)
1696                ipv6_push_frag_opts(skb, opt, &proto);
1697        if (opt && opt->opt_nflen)
1698                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1699
1700        skb_push(skb, sizeof(struct ipv6hdr));
1701        skb_reset_network_header(skb);
1702        hdr = ipv6_hdr(skb);
1703
1704        ip6_flow_hdr(hdr, v6_cork->tclass,
1705                     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1706                                        np->autoflowlabel, fl6));
1707        hdr->hop_limit = v6_cork->hop_limit;
1708        hdr->nexthdr = proto;
1709        hdr->saddr = fl6->saddr;
1710        hdr->daddr = *final_dst;
1711
1712        skb->priority = sk->sk_priority;
1713        skb->mark = sk->sk_mark;
1714
1715        skb_dst_set(skb, dst_clone(&rt->dst));
1716        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1717        if (proto == IPPROTO_ICMPV6) {
1718                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1719
1720                ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1721                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1722        }
1723
1724        ip6_cork_release(cork, v6_cork);
1725out:
1726        return skb;
1727}
1728
1729int ip6_send_skb(struct sk_buff *skb)
1730{
1731        struct net *net = sock_net(skb->sk);
1732        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1733        int err;
1734
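            /*
             * ip6_local_out() may return a positive NET_XMIT_* code
             * from the qdisc layer; net_xmit_errno() maps it to an
             * errno, treating NET_XMIT_CN (congestion notified) as
             * success.
             */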
1735        err = ip6_local_out(net, skb->sk, skb);
1736        if (err) {
1737                if (err > 0)
1738                        err = net_xmit_errno(err);
1739                if (err)
1740                        IP6_INC_STATS(net, rt->rt6i_idev,
1741                                      IPSTATS_MIB_OUTDISCARDS);
1742        }
1743
1744        return err;
1745}
1746
1747int ip6_push_pending_frames(struct sock *sk)
1748{
1749        struct sk_buff *skb;
1750
1751        skb = ip6_finish_skb(sk);
1752        if (!skb)
1753                return 0;
1754
1755        return ip6_send_skb(skb);
1756}
1757EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1758
1759static void __ip6_flush_pending_frames(struct sock *sk,
1760                                       struct sk_buff_head *queue,
1761                                       struct inet_cork_full *cork,
1762                                       struct inet6_cork *v6_cork)
1763{
1764        struct sk_buff *skb;
1765
1766        while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1767                if (skb_dst(skb))
1768                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1769                                      IPSTATS_MIB_OUTDISCARDS);
1770                kfree_skb(skb);
1771        }
1772
1773        ip6_cork_release(cork, v6_cork);
1774}
1775
1776void ip6_flush_pending_frames(struct sock *sk)
1777{
1778        __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1779                                   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1780}
1781EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1782
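    /*
     * ip6_make_skb() is the uncorked fast path: it appends and
     * finalizes a single datagram on a private queue and cork, never
     * touching sk->sk_write_queue, so no corked state can leak between
     * callers. The resulting skb is typically passed to ip6_send_skb().
     */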
1783struct sk_buff *ip6_make_skb(struct sock *sk,
1784                             int getfrag(void *from, char *to, int offset,
1785                                         int len, int odd, struct sk_buff *skb),
1786                             void *from, int length, int transhdrlen,
1787                             struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1788                             struct rt6_info *rt, unsigned int flags,
1789                             const struct sockcm_cookie *sockc)
1790{
1791        struct inet_cork_full cork;
1792        struct inet6_cork v6_cork;
1793        struct sk_buff_head queue;
1794        int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1795        int err;
1796
1797        if (flags & MSG_PROBE)
1798                return NULL;
1799
1800        __skb_queue_head_init(&queue);
1801
1802        cork.base.flags = 0;
1803        cork.base.addr = 0;
1804        cork.base.opt = NULL;
1805        v6_cork.opt = NULL;
1806        err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1807        if (err)
1808                return ERR_PTR(err);
1809
1810        if (ipc6->dontfrag < 0)
1811                ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1812
1813        err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1814                                &current->task_frag, getfrag, from,
1815                                length + exthdrlen, transhdrlen + exthdrlen,
1816                                flags, ipc6, sockc);
1817        if (err) {
1818                __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1819                return ERR_PTR(err);
1820        }
1821
1822        return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1823}
1824