linux/net/ipv6/ip6_output.c
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct in6_addr *nexthop;
        int ret;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                         * is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

                if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
                    IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
        neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
        if (!IS_ERR(neigh)) {
                sock_confirm_neigh(skb, neigh);
                ret = neigh_output(neigh, skb);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        if (ret) {
                kfree_skb(skb);
                return ret;
        }

        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}

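/* Locally generated packets come back to ip6_output() through
 * dst_output() once ip6_xmit() has passed them to the NF_INET_LOCAL_OUT
 * hook; ip6_output() then runs NF_INET_POST_ROUTING and hands the skb to
 * ip6_finish_output(), which fragments when needed before
 * ip6_finish_output2() resolves the neighbour and transmits.
 */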
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 * Transmit an sk_buff (used by TCP, SCTP and DCCP).
 * Note: the socket lock is not held for SYNACK packets, but the socket
 * may still be modified by calls to skb_set_owner_w() and
 * ipv6_local_error(), which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        if (opt) {
                unsigned int head_room;

                /* First: extension headers may take lots of space (~8K for
                 * now); MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (!skb2) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        consume_skb(skb);
                        skb = skb2;
                        /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
                         * so it is safe to call in our context (socket lock not held).
                         */
                        skb_set_owner_w(skb, (struct sock *)sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                                     np->autoflowlabel, fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = sk->sk_priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_OUT, skb->len);

                /* if the egress device is enslaved to an L3 master device,
                 * pass the skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* Hooks should never assume the socket lock is held, so we
                 * promote our socket to non-const.
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dst->dev,
                               dst_output);
        }

        skb->dev = dst->dev;
        /* ipv6_local_error() does not require the socket lock, so we
         * promote our socket to non-const.
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
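
/* Typical call, sketched from how a transport like TCP drives this path
 * (the local variable names below are illustrative, not from this file):
 *
 *	struct flowi6 fl6;
 *	struct ipv6_txoptions *opt = ...;	// may be NULL
 *
 *	memset(&fl6, 0, sizeof(fl6));
 *	fl6.flowi6_proto = IPPROTO_TCP;
 *	fl6.daddr = ...;			// routed destination
 *	fl6.saddr = ...;			// chosen source address
 *	skb_dst_set(skb, dst);			// route attached beforehand
 *	err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass);
 *
 * The caller owns route lookup and dst attachment; ip6_xmit() only
 * prepends extension headers plus the IPv6 header and runs LOCAL_OUT.
 */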
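
/* Raw sockets join ip6_ra_chain via the IPV6_ROUTER_ALERT socket option
 * (maintained by ip6_ra_control()); ip6_forward() hands Router Alert
 * packets to this chain before making any forwarding decision.
 */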
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* Unicast neighbour discovery messages destined
                         * to the proxied address must be passed to the
                         * input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        return dst_output(net, sk, skb);
}

static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
        unsigned int mtu;
        struct inet6_dev *idev;

        if (dst_metric_locked(dst, RTAX_MTU)) {
                mtu = dst_metric_raw(dst, RTAX_MTU);
                if (mtu)
                        return mtu;
        }

        mtu = IPV6_MIN_MTU;
        rcu_read_lock();
        idev = __in6_dev_get(dst->dev);
        if (idev)
                mtu = idev->cnf.mtu6;
        rcu_read_unlock();

        return mtu;
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
                return false;

        return true;
}

int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We do NOT do any processing on Router Alert (RA) packets;
         *      they are pushed to user level AS IS, without any warranty
         *      that the application will be able to interpret them. The
         *      reason is that we cannot do anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP we cannot do anything with it.
         *      Defragmentation would also be a mistake: RA packets
         *      must not be fragmented, because there is no warranty
         *      that different fragments will go along one path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force the OUTPUT device to be used for the source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                        IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* The IPv6 specs say nothing about it, but it is clear that we
         * cannot send redirects for source-routed frames.
         * We also don't send redirects for frames decapsulated from IPsec.
         */
        if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same:
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                 * and by source (inside ndisc_send_redirect).
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_forward(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

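        /* IPV6_MIN_MTU is 1280 octets, the minimum link MTU every IPv6
         * link must support (RFC 2460, section 5), so the forwarding path
         * never reports less than that in a Packet Too Big message.
         */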
        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force the OUTPUT device to be used for the source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Decrementing the hop limit is delayed until after the skb COW above */

        hdr->hop_limit--;

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_copy_secmark(to, from);
}

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        int hroom, troom;
        __be32 frag_id;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb was not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);
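        /* From here on, 'mtu' is the payload budget of one fragment, not
         * the link MTU. For example, with a 1500-byte link MTU and a plain
         * 40-byte IPv6 header (hlen = 40), mtu becomes 1500 - 40 - 8 = 1452;
         * the slow path below then rounds each non-final fragment down to a
         * multiple of 8, i.e. 1448 bytes of fragmentable data per fragment.
         */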

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);

                __skb_pull(skb, hlen);
                fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                fh->identification = frag_id;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                dst_hold(&rt->dst);

                for (;;) {
                        /* Prepare the header of the next frame,
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        ip6_rt_put(rt);
                        return 0;
                }

                kfree_skb_list(frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                ip6_rt_put(rt);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        troom = rt->dst.dev->needed_tailroom;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                u8 *fragnexthdr_offset;

                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                 * then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }

                /* Allocate buffer */
                frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                 hroom + troom, GFP_ATOMIC);
                if (!frag) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                fragnexthdr_offset = skb_network_header(frag);
                fragnexthdr_offset += prevhdr - skb_network_header(skb);
                *fragnexthdr_offset = NEXTHDR_FRAGMENT;

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
                                     len));
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

        skb->dev = skb_dst(skb)->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

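/* Returns non-zero when the cached route does NOT match the flow: i.e.
 * the key is neither a host route for fl_addr, nor did fl_addr match the
 * last destination that validated this route (addr_cache).
 */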
static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the not connected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If the route was a host route,
         *    check that the cached destination is current.
         *    If it is a network route, we still may
         *    check its validity using the saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now
         *    (because the main consumer of this service
         *    is tcp, which does not have this problem),
         *    so the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
              (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * If the dst entry we've looked up has a neighbour entry that
         * is in the INCOMPLETE state, and the source address from the
         * flow is marked as OPTIMISTIC, we release the found dst entry
         * and replace it instead with the dst entry of the nexthop
         * router.
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace to look the route up in
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer-encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
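
/* Typical datagram-socket usage, sketched (the variable names are
 * illustrative, not from this file):
 *
 *	struct flowi6 fl6;
 *	struct dst_entry *dst;
 *
 *	memset(&fl6, 0, sizeof(fl6));
 *	fl6.flowi6_proto = sk->sk_protocol;
 *	fl6.daddr = ...;
 *	fl6.flowi6_oif = sk->sk_bound_dev_if;
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 *
 * Because the error is pointer-encoded, callers must test with
 * IS_ERR()/PTR_ERR() rather than comparing against NULL.
 */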

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      It returns a valid dst pointer on success, or a pointer-encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (!dst)
                dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline int ip6_ufo_append_data(struct sock *sk,
                        struct sk_buff_head *queue,
                        int getfrag(void *from, char *to, int offset, int len,
                        int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int exthdrlen, int transhdrlen, int mtu,
                        unsigned int flags, const struct flowi6 *fl6)

{
        struct sk_buff *skb;
        int err;

        /* The network device supports UDP large send offload, so create
         * one single skb packet containing the complete udp datagram.
         */
        skb = skb_peek_tail(queue);
        if (!skb) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (!skb)
                        return err;

                /* reserve space for the hardware header */
                skb_reserve(skb, hh_len);

                /* create space for the UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize the network header pointer */
                skb_set_network_header(skb, exthdrlen);

                /* initialize the protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->protocol = htons(ETH_P_IPV6);
                skb->csum = 0;

                if (flags & MSG_CONFIRM)
                        skb_set_dst_pending_confirm(skb, 1);

                __skb_queue_tail(queue, skb);
        } else if (skb_is_gso(skb)) {
                goto append;
        }

        skb->ip_summed = CHECKSUM_PARTIAL;
        /* Specify the length of each IPv6 datagram fragment.
         * It has to be a multiple of 8.
         */
        skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
                                     sizeof(struct frag_hdr)) & ~7;
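        /* For example, with mtu = 1500 and fragheaderlen = 40, this gives
         * gso_size = (1500 - 40 - 8) & ~7 = 1448, so UFO emits fragments
         * whose fragmentable part is 1448 bytes, matching what
         * ip6_fragment() would produce for the same MTU.
         */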
        skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
                                                         &fl6->daddr,
                                                         &fl6->saddr);

append:
        return skb_append_datato_frags(sk, skb, getfrag, from,
                                       (length - transhdrlen));
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not the first; the header
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}
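
/* A worked example of the maxfraglen formula above: with mtu = 1500 and
 * fragheaderlen = 40, (1500 - 40) & ~7 = 1456, so maxfraglen =
 * 1456 + 40 - 8 = 1488. A fragment built to that length carries 1448
 * bytes of fragmentable data and, once the 8-byte fragment header is
 * added at transmit time, fits the 1500-byte MTU (40 + 8 + 1448 = 1496).
 */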

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *opt = ipc6->opt;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = opt->tot_len;
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above miyazawa */
        }
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      rt->dst.dev->mtu : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        cork->base.fragsize = mtu;
        if (dst_allfrag(rt->dst.path))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        return 0;
}

static int __ip6_append_data(struct sock *sk,
                             struct flowi6 *fl6,
                             struct sk_buff_head *queue,
                             struct inet_cork *cork,
                             struct inet6_cork *v6_cork,
                             struct page_frag *pfrag,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
                             unsigned int flags, struct ipcm6_cookie *ipc6,
                             const struct sockcm_cookie *sockc)
{
        struct sk_buff *skb, *skb_prev = NULL;
        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
        int exthdrlen = 0;
        int dst_exthdrlen = 0;
        int hh_len;
        int copy;
        int err;
        int offset = 0;
        __u8 tx_flags = 0;
        u32 tskey = 0;
        struct rt6_info *rt = (struct rt6_info *)cork->dst;
        struct ipv6_txoptions *opt = v6_cork->opt;
        int csummode = CHECKSUM_NONE;
        unsigned int maxnonfragsize, headersize;

        skb = skb_peek_tail(queue);
        if (!skb) {
                exthdrlen = opt ? opt->opt_flen : 0;
                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
        }

        mtu = cork->fragsize;
        orig_mtu = mtu;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
                     sizeof(struct frag_hdr);

        headersize = sizeof(struct ipv6hdr) +
                     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
                     (dst_allfrag(&rt->dst) ?
                      sizeof(struct frag_hdr) : 0) +
                     rt->rt6i_nfheader_len;
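        /* For a plain socket (no extension headers, no allfrag), headersize
         * is just sizeof(struct ipv6hdr) = 40; maxnonfragsize below then
         * becomes 40 + IPV6_MAXPLEN (65535) when ip6_sk_ignore_df() holds,
         * or the path MTU otherwise.
         */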
1337
1338        if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1339            (sk->sk_protocol == IPPROTO_UDP ||
1340             sk->sk_protocol == IPPROTO_RAW)) {
1341                ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1342                                sizeof(struct ipv6hdr));
1343                goto emsgsize;
1344        }
1345
1346        if (ip6_sk_ignore_df(sk))
1347                maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1348        else
1349                maxnonfragsize = mtu;
1350
1351        if (cork->length + length > maxnonfragsize - headersize) {
1352emsgsize:
1353                ipv6_local_error(sk, EMSGSIZE, fl6,
1354                                 mtu - headersize +
1355                                 sizeof(struct ipv6hdr));
1356                return -EMSGSIZE;
1357        }
1358
1359        /* CHECKSUM_PARTIAL only with no extension headers and when
1360         * we are not going to fragment
1361         */
1362        if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1363            headersize == sizeof(struct ipv6hdr) &&
1364            length <= mtu - headersize &&
1365            !(flags & MSG_MORE) &&
1366            rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1367                csummode = CHECKSUM_PARTIAL;
1368
1369        if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1370                sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1371                if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1372                    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1373                        tskey = sk->sk_tskey++;
1374        }
1375
1376        /*
1377         * Let's try using as much space as possible.
1378         * Use MTU if total length of the message fits into the MTU.
1379         * Otherwise, we need to reserve fragment header and
1380         * fragment alignment (= 8-15 octects, in total).
1381         *
1382         * Note that we may need to "move" the data from the tail of
1383         * of the buffer to the new fragment when we split
1384         * the message.
1385         *
1386         * FIXME: It may be fragmented into multiple chunks
1387         *        at once if non-fragmentable extension headers
1388         *        are too large.
1389         * --yoshfuji
1390         */
1391
1392        cork->length += length;
1393        if ((((length + (skb ? skb->len : headersize)) > mtu) ||
1394             (skb && skb_is_gso(skb))) &&
1395            (sk->sk_protocol == IPPROTO_UDP) &&
1396            (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
1397            (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1398                err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1399                                          hh_len, fragheaderlen, exthdrlen,
1400                                          transhdrlen, mtu, flags, fl6);
1401                if (err)
1402                        goto error;
1403                return 0;
1404        }
1405
1406        if (!skb)
1407                goto alloc_new_skb;
1408
1409        while (length > 0) {
1410                /* Check if the remaining data fits into current packet. */
1411                copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1412                if (copy < length)
1413                        copy = maxfraglen - skb->len;
1414
1415                if (copy <= 0) {
1416                        char *data;
1417                        unsigned int datalen;
1418                        unsigned int fraglen;
1419                        unsigned int fraggap;
1420                        unsigned int alloclen;
1421alloc_new_skb:
1422                        /* There's no room in the current skb */
1423                        if (skb)
1424                                fraggap = skb->len - maxfraglen;
1425                        else
1426                                fraggap = 0;
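                            /* fraggap is the overhang past maxfraglen in the
                             * previous skb; those bytes must move into the new
                             * fragment (see skb_copy_and_csum_bits() below).
                             */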
1427                        /* update mtu and maxfraglen if necessary */
1428                        if (!skb || !skb_prev)
1429                                ip6_append_data_mtu(&mtu, &maxfraglen,
1430                                                    fragheaderlen, skb, rt,
1431                                                    orig_mtu);
1432
1433                        skb_prev = skb;
1434
1435                        /*
1436                         * If the remaining data exceeds the MTU,
1437                         * we know we need more fragment(s).
1438                         */
1439                        datalen = length + fraggap;
1440
1441                        if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1442                                datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1443                        if ((flags & MSG_MORE) &&
1444                            !(rt->dst.dev->features&NETIF_F_SG))
1445                                alloclen = mtu;
1446                        else
1447                                alloclen = datalen + fragheaderlen;
1448
1449                        alloclen += dst_exthdrlen;
1450
1451                        if (datalen != length + fraggap) {
1452                                /*
1453                                 * This is not the last fragment; the trailer
1454                                 * space is regarded as data space.
1455                                 */
1456                                datalen += rt->dst.trailer_len;
1457                        }
1458
1459                        alloclen += rt->dst.trailer_len;
1460                        fraglen = datalen + fragheaderlen;
1461
1462                        /*
1463                         * We just reserve space for the fragment header.
1464                         * Note: this may be an overallocation if the message
1465                         * (without MSG_MORE) fits into the MTU.
1466                         */
1467                        alloclen += sizeof(struct frag_hdr);
1468
1469                        copy = datalen - transhdrlen - fraggap;
1470                        if (copy < 0) {
1471                                err = -EINVAL;
1472                                goto error;
1473                        }
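                            /* The first fragment (transhdrlen != 0) may block
                             * waiting for sndbuf space; later fragments may
                             * overshoot the send buffer up to 2 * sk_sndbuf
                             * via sock_wmalloc() so a corked datagram can
                             * still be completed.
                             */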
1474                        if (transhdrlen) {
1475                                skb = sock_alloc_send_skb(sk,
1476                                                alloclen + hh_len,
1477                                                (flags & MSG_DONTWAIT), &err);
1478                        } else {
1479                                skb = NULL;
1480                                if (atomic_read(&sk->sk_wmem_alloc) <=
1481                                    2 * sk->sk_sndbuf)
1482                                        skb = sock_wmalloc(sk,
1483                                                           alloclen + hh_len, 1,
1484                                                           sk->sk_allocation);
1485                                if (unlikely(!skb))
1486                                        err = -ENOBUFS;
1487                        }
1488                        if (!skb)
1489                                goto error;
1490                        /*
1491                         *      Fill in the control structures
1492                         */
1493                        skb->protocol = htons(ETH_P_IPV6);
1494                        skb->ip_summed = csummode;
1495                        skb->csum = 0;
1496                        /* reserve room for the fragment header and IPsec header */
1497                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1498                                    dst_exthdrlen);
1499
1500                        /* Only the initial fragment is time stamped */
1501                        skb_shinfo(skb)->tx_flags = tx_flags;
1502                        tx_flags = 0;
1503                        skb_shinfo(skb)->tskey = tskey;
1504                        tskey = 0;
1505
1506                        /*
1507                         *      Find where to start putting bytes
1508                         */
1509                        data = skb_put(skb, fraglen);
1510                        skb_set_network_header(skb, exthdrlen);
1511                        data += fragheaderlen;
1512                        skb->transport_header = (skb->network_header +
1513                                                 fragheaderlen);
1514                        if (fraggap) {
1515                                skb->csum = skb_copy_and_csum_bits(
1516                                        skb_prev, maxfraglen,
1517                                        data + transhdrlen, fraggap, 0);
1518                                skb_prev->csum = csum_sub(skb_prev->csum,
1519                                                          skb->csum);
1520                                data += fraggap;
1521                                pskb_trim_unique(skb_prev, maxfraglen);
1522                        }
1523                        if (copy > 0 &&
1524                            getfrag(from, data + transhdrlen, offset,
1525                                    copy, fraggap, skb) < 0) {
1526                                err = -EFAULT;
1527                                kfree_skb(skb);
1528                                goto error;
1529                        }
1530
1531                        offset += copy;
1532                        length -= datalen - fraggap;
1533                        transhdrlen = 0;
1534                        exthdrlen = 0;
1535                        dst_exthdrlen = 0;
1536
1537                        if ((flags & MSG_CONFIRM) && !skb_prev)
1538                                skb_set_dst_pending_confirm(skb, 1);
1539
1540                        /*
1541                         * Put the packet on the pending queue
1542                         */
1543                        __skb_queue_tail(queue, skb);
1544                        continue;
1545                }
1546
1547                if (copy > length)
1548                        copy = length;
1549
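                    /* Without scatter-gather, copy into the skb's linear
                     * area; with NETIF_F_SG, land the bytes in the socket's
                     * page fragment and hang it off the skb as a paged
                     * frag, coalescing with the last frag when possible.
                     */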
1550                if (!(rt->dst.dev->features&NETIF_F_SG)) {
1551                        unsigned int off;
1552
1553                        off = skb->len;
1554                        if (getfrag(from, skb_put(skb, copy),
1555                                                offset, copy, off, skb) < 0) {
1556                                __skb_trim(skb, off);
1557                                err = -EFAULT;
1558                                goto error;
1559                        }
1560                } else {
1561                        int i = skb_shinfo(skb)->nr_frags;
1562
1563                        err = -ENOMEM;
1564                        if (!sk_page_frag_refill(sk, pfrag))
1565                                goto error;
1566
1567                        if (!skb_can_coalesce(skb, i, pfrag->page,
1568                                              pfrag->offset)) {
1569                                err = -EMSGSIZE;
1570                                if (i == MAX_SKB_FRAGS)
1571                                        goto error;
1572
1573                                __skb_fill_page_desc(skb, i, pfrag->page,
1574                                                     pfrag->offset, 0);
1575                                skb_shinfo(skb)->nr_frags = ++i;
1576                                get_page(pfrag->page);
1577                        }
1578                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
1579                        if (getfrag(from,
1580                                    page_address(pfrag->page) + pfrag->offset,
1581                                    offset, copy, skb->len, skb) < 0)
1582                                goto error_efault;
1583
1584                        pfrag->offset += copy;
1585                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1586                        skb->len += copy;
1587                        skb->data_len += copy;
1588                        skb->truesize += copy;
1589                        atomic_add(copy, &sk->sk_wmem_alloc);
1590                }
1591                offset += copy;
1592                length -= copy;
1593        }
1594
1595        return 0;
1596
1597error_efault:
1598        err = -EFAULT;
1599error:
1600        cork->length -= length;
1601        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1602        return err;
1603}
1604
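    /* Editor's sketch, not part of the original source: a typical caller,
     * loosely modeled on udpv6_sendmsg(), appends and then pushes:
     *
     *     err = ip6_append_data(sk, getfrag, msg, len,
     *                           sizeof(struct udphdr), &ipc6, &fl6, rt,
     *                           msg->msg_flags, &sockc);
     *     if (err)
     *             ip6_flush_pending_frames(sk);
     *     else if (!(msg->msg_flags & MSG_MORE))
     *             err = ip6_push_pending_frames(sk);
     */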
1605int ip6_append_data(struct sock *sk,
1606                    int getfrag(void *from, char *to, int offset, int len,
1607                                int odd, struct sk_buff *skb),
1608                    void *from, int length, int transhdrlen,
1609                    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1610                    struct rt6_info *rt, unsigned int flags,
1611                    const struct sockcm_cookie *sockc)
1612{
1613        struct inet_sock *inet = inet_sk(sk);
1614        struct ipv6_pinfo *np = inet6_sk(sk);
1615        int exthdrlen;
1616        int err;
1617
1618        if (flags&MSG_PROBE)
1619                return 0;
1620        if (skb_queue_empty(&sk->sk_write_queue)) {
1621                /*
1622                 * set up for corking
1623                 */
1624                err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1625                                     ipc6, rt, fl6);
1626                if (err)
1627                        return err;
1628
1629                exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1630                length += exthdrlen;
1631                transhdrlen += exthdrlen;
1632        } else {
1633                fl6 = &inet->cork.fl.u.ip6;
1634                transhdrlen = 0;
1635        }
1636
1637        return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1638                                 &np->cork, sk_page_frag(sk), getfrag,
1639                                 from, length, transhdrlen, flags, ipc6, sockc);
1640}
1641EXPORT_SYMBOL_GPL(ip6_append_data);
1642
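    /* Release per-cork state: free any duplicated tx options, drop the
     * cached dst and clear the saved flow so the next corking cycle
     * starts clean.
     */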
1643static void ip6_cork_release(struct inet_cork_full *cork,
1644                             struct inet6_cork *v6_cork)
1645{
1646        if (v6_cork->opt) {
1647                kfree(v6_cork->opt->dst0opt);
1648                kfree(v6_cork->opt->dst1opt);
1649                kfree(v6_cork->opt->hopopt);
1650                kfree(v6_cork->opt->srcrt);
1651                kfree(v6_cork->opt);
1652                v6_cork->opt = NULL;
1653        }
1654
1655        if (cork->base.dst) {
1656                dst_release(cork->base.dst);
1657                cork->base.dst = NULL;
1658                cork->base.flags &= ~IPCORK_ALLFRAG;
1659        }
1660        memset(&cork->fl, 0, sizeof(cork->fl));
1661}
1662
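    /* Collapse the queued fragments into one skb (chained via frag_list),
     * push the extension headers and the IPv6 header, and return a
     * packet ready for ip6_send_skb().
     */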
1663struct sk_buff *__ip6_make_skb(struct sock *sk,
1664                               struct sk_buff_head *queue,
1665                               struct inet_cork_full *cork,
1666                               struct inet6_cork *v6_cork)
1667{
1668        struct sk_buff *skb, *tmp_skb;
1669        struct sk_buff **tail_skb;
1670        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1671        struct ipv6_pinfo *np = inet6_sk(sk);
1672        struct net *net = sock_net(sk);
1673        struct ipv6hdr *hdr;
1674        struct ipv6_txoptions *opt = v6_cork->opt;
1675        struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1676        struct flowi6 *fl6 = &cork->fl.u.ip6;
1677        unsigned char proto = fl6->flowi6_proto;
1678
1679        skb = __skb_dequeue(queue);
1680        if (!skb)
1681                goto out;
1682        tail_skb = &(skb_shinfo(skb)->frag_list);
1683
1684        /* move skb->data from the ext header to the IP header */
1685        if (skb->data < skb_network_header(skb))
1686                __skb_pull(skb, skb_network_offset(skb));
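            /* Chain the remaining queued skbs onto the first skb's
             * frag_list, transferring their length and truesize accounting
             * and dropping their socket ownership (destructor/sk).
             */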
1687        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1688                __skb_pull(tmp_skb, skb_network_header_len(skb));
1689                *tail_skb = tmp_skb;
1690                tail_skb = &(tmp_skb->next);
1691                skb->len += tmp_skb->len;
1692                skb->data_len += tmp_skb->len;
1693                skb->truesize += tmp_skb->truesize;
1694                tmp_skb->destructor = NULL;
1695                tmp_skb->sk = NULL;
1696        }
1697
1698        /* Allow local fragmentation. */
1699        skb->ignore_df = ip6_sk_ignore_df(sk);
1700
1701        *final_dst = fl6->daddr;
1702        __skb_pull(skb, skb_network_header_len(skb));
1703        if (opt && opt->opt_flen)
1704                ipv6_push_frag_opts(skb, opt, &proto);
1705        if (opt && opt->opt_nflen)
1706                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1707
1708        skb_push(skb, sizeof(struct ipv6hdr));
1709        skb_reset_network_header(skb);
1710        hdr = ipv6_hdr(skb);
1711
1712        ip6_flow_hdr(hdr, v6_cork->tclass,
1713                     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1714                                        np->autoflowlabel, fl6));
1715        hdr->hop_limit = v6_cork->hop_limit;
1716        hdr->nexthdr = proto;
1717        hdr->saddr = fl6->saddr;
1718        hdr->daddr = *final_dst;
1719
1720        skb->priority = sk->sk_priority;
1721        skb->mark = sk->sk_mark;
1722
1723        skb_dst_set(skb, dst_clone(&rt->dst));
1724        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1725        if (proto == IPPROTO_ICMPV6) {
1726                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1727
1728                ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1729                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1730        }
1731
1732        ip6_cork_release(cork, v6_cork);
1733out:
1734        return skb;
1735}
1736
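    /* Hand a finished packet to the output path; positive qdisc return
     * codes from ip6_local_out() are mapped to errno values via
     * net_xmit_errno(), and real failures are counted as OutDiscards.
     */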
1737int ip6_send_skb(struct sk_buff *skb)
1738{
1739        struct net *net = sock_net(skb->sk);
1740        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1741        int err;
1742
1743        err = ip6_local_out(net, skb->sk, skb);
1744        if (err) {
1745                if (err > 0)
1746                        err = net_xmit_errno(err);
1747                if (err)
1748                        IP6_INC_STATS(net, rt->rt6i_idev,
1749                                      IPSTATS_MIB_OUTDISCARDS);
1750        }
1751
1752        return err;
1753}
1754
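    /* Build and transmit whatever has been corked on sk_write_queue. */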
1755int ip6_push_pending_frames(struct sock *sk)
1756{
1757        struct sk_buff *skb;
1758
1759        skb = ip6_finish_skb(sk);
1760        if (!skb)
1761                return 0;
1762
1763        return ip6_send_skb(skb);
1764}
1765EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1766
1767static void __ip6_flush_pending_frames(struct sock *sk,
1768                                       struct sk_buff_head *queue,
1769                                       struct inet_cork_full *cork,
1770                                       struct inet6_cork *v6_cork)
1771{
1772        struct sk_buff *skb;
1773
1774        while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1775                if (skb_dst(skb))
1776                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1777                                      IPSTATS_MIB_OUTDISCARDS);
1778                kfree_skb(skb);
1779        }
1780
1781        ip6_cork_release(cork, v6_cork);
1782}
1783
1784void ip6_flush_pending_frames(struct sock *sk)
1785{
1786        __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1787                                   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1788}
1789EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1790
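    /* One-shot variant of append + push: cork state lives on the stack and
     * a private queue is used, so the datagram is built in a single call
     * without touching sk_write_queue.
     */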
1791struct sk_buff *ip6_make_skb(struct sock *sk,
1792                             int getfrag(void *from, char *to, int offset,
1793                                         int len, int odd, struct sk_buff *skb),
1794                             void *from, int length, int transhdrlen,
1795                             struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1796                             struct rt6_info *rt, unsigned int flags,
1797                             const struct sockcm_cookie *sockc)
1798{
1799        struct inet_cork_full cork;
1800        struct inet6_cork v6_cork;
1801        struct sk_buff_head queue;
1802        int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1803        int err;
1804
1805        if (flags & MSG_PROBE)
1806                return NULL;
1807
1808        __skb_queue_head_init(&queue);
1809
1810        cork.base.flags = 0;
1811        cork.base.addr = 0;
1812        cork.base.opt = NULL;
1813        v6_cork.opt = NULL;
1814        err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1815        if (err)
1816                return ERR_PTR(err);
1817
1818        if (ipc6->dontfrag < 0)
1819                ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1820
1821        err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1822                                &current->task_frag, getfrag, from,
1823                                length + exthdrlen, transhdrlen + exthdrlen,
1824                                flags, ipc6, sockc);
1825        if (err) {
1826                __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1827                return ERR_PTR(err);
1828        }
1829
1830        return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1831}
1832