linux/net/ipv6/ip6_output.c
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct in6_addr *nexthop;
        int ret;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_is_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

                if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
                    IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
        neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
        if (!IS_ERR(neigh)) {
                sock_confirm_neigh(skb, neigh);
                ret = neigh_output(neigh, skb);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}
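
/* Editor's sketch (illustrative, not upstream code): the unicast tail of
 * the function above resolves the L3 next hop to an L2 neighbour roughly as
 *
 *      nexthop = rt6_nexthop(rt, &daddr);   - the gateway, or daddr itself
 *      neigh = lookup in nd_tbl, or __neigh_create() on a cache miss
 *      neigh_output(neigh, skb);            - emit, or queue and send an NS
 *
 * If no neighbour entry can be created, the packet is charged to
 * OUTNOROUTES and dropped with -EINVAL, as in the error path above.
 */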

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        if (ret) {
                kfree_skb(skb);
                return ret;
        }

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm) {
                IPCB(skb)->flags |= IPSKB_REROUTED;
                return dst_output(net, sk, skb);
        }
#endif

        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}
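
/* Hedged summary: ip6_fragment() is taken when the packet exceeds the path
 * MTU and is not GSO (GSO packets are segmented further down the stack),
 * when the route demands fragmenting everything (dst_allfrag), or when
 * conntrack defrag recorded a smaller incoming fragment size in
 * IP6CB(skb)->frag_max_size; everything else goes straight to
 * ip6_finish_output2().
 */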

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
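
/* Note: NF_HOOK_COND() runs the NF_INET_POST_ROUTING hook only while
 * IP6SKB_REROUTED is clear; a packet re-entering after an xfrm re-route
 * skips the hook and proceeds directly to ip6_finish_output().
 */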

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
        if (!np->autoflowlabel_set)
                return ip6_default_np_autolabel(net);
        else
                return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the socket
 * might be modified by calls to skb_set_owner_w() and ipv6_local_error(),
 * which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        if (opt) {
                unsigned int head_room;

                /* First: extension headers may take lots of space
                   (~8K for now); MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (!skb2) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        consume_skb(skb);
                        skb = skb2;
                        /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
                         * so it is safe to call in our context (socket lock not held)
                         */
                        skb_set_owner_w(skb, (struct sock *)sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                ip6_autoflowlabel(net, np), fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = sk->sk_priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_OUT, skb->len);

                /* if the egress device is enslaved to an L3 master device,
                 * pass the skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume the socket lock is held;
                 * we promote our socket to non-const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dst->dev,
                               dst_output);
        }

        skb->dev = dst->dev;
        /* ipv6_local_error() does not require the socket lock,
         * so we promote our socket to non-const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
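
/* Usage sketch (illustrative only; the local names are assumptions):
 * a connection-oriented transport calls this with an already-routed skb:
 *
 *      struct flowi6 fl6;
 *      - fill fl6 (daddr, saddr, flowi6_proto) and attach a dst to skb
 *      err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, tclass);
 *
 * ip6_xmit() only prepends the extension and IPv6 headers; the route
 * lookup must have been done by the caller beforehand.
 */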

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}
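
/* Note: the last matching socket receives the original skb while every
 * earlier match gets a clone, which avoids a clone in the common
 * single-listener case. A return of 1 tells ip6_forward() the packet
 * was consumed by a Router Alert listener.
 */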

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* For a reaction involving a unicast neighbour
                         * discovery message destined to the proxied address,
                         * pass it to the input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

        return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
                return false;

        return true;
}
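
/* Worked example (editor's illustration): with mtu = 1280, a 1500 byte
 * non-GSO packet with ignore_df unset returns true, so ip6_forward()
 * answers with ICMPV6_PKT_TOOBIG. A GSO packet of the same length is
 * let through if skb_gso_validate_network_len() confirms that every
 * resulting segment fits within the 1280 byte MTU.
 */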

int ip6_forward(struct sk_buff *skb)
{
        struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We DO NOT do any processing on
         *      RA packets; we push them to user level AS IS
         *      without any WARRANTY that the application will
         *      be able to interpret them. The reason is that we
         *      cannot make anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP we cannot do anything with it.
         *      Defragmentation would also be a mistake; RA packets
         *      cannot be fragmented, because there is no guarantee
         *      that different fragments will go along one path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force the outgoing device to be used for the source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* The IPv6 specs say nothing about it, but it is clear that we
           cannot send redirects to source-routed frames.
           We don't send redirects to frames decapsulated from IPsec.
         */
        if (IP6CB(skb)->iif == dst->dev->ifindex &&
            opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same;
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_forward(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force the outgoing device to be used for the
                 * source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling the hop limit is delayed until after the skb COW */

        hdr->hop_limit--;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}
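
/* Hedged summary of the checks above, in order: forwarding enabled ->
 * PACKET_HOST only -> not locally owned -> no LRO -> xfrm FWD policy ->
 * Router Alert delivery -> hop limit > 1 -> NDP proxy -> xfrm re-route ->
 * redirect or source-address sanity check -> MTU check -> headroom COW ->
 * decrement hop_limit and hand off to the NF_INET_FORWARD hook.
 */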

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

        skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_copy_secmark(to, from);
}

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        int hroom, troom;
        __be32 frag_id;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb was not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        err = -ENOMEM;
                        goto fail;
                }
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);

                __skb_pull(skb, hlen);
                fh = __skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                fh->identification = frag_id;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = __skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                kfree_skb_list(frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        troom = rt->dst.dev->needed_tailroom;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0)        {
                u8 *fragnexthdr_offset;

                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end,
                   then align the next start on an eight-byte boundary */
                if (len < left) {
                        len &= ~7;
                }

                /* Allocate buffer */
                frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                 hroom + troom, GFP_ATOMIC);
                if (!frag) {
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                fragnexthdr_offset = skb_network_header(frag);
                fragnexthdr_offset += prevhdr - skb_network_header(skb);
                *fragnexthdr_offset = NEXTHDR_FRAGMENT;

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
                                     len));
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}
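
/* Worked example (editor's illustration, no extension headers): with a
 * 1280 byte MTU and hlen = sizeof(struct ipv6hdr) = 40, the slow path
 * computes a per-fragment payload budget of
 *
 *      mtu - hlen - sizeof(struct frag_hdr) = 1280 - 40 - 8 = 1232
 *
 * which is already a multiple of 8. A 3000 byte payload thus becomes
 * fragments carrying 1232, 1232 and 536 bytes at byte offsets 0, 1232
 * and 2464. Because each offset is a multiple of 8, it can be stored
 * directly in fh->frag_off, whose low three bits carry the flags
 * (IP6_MF on every fragment but the last).
 */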

static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the non-connected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * or MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If the route was a host route,
         *    check that the cached destination is current.
         *    If it is a network route, we still may
         *    check its validity using the saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now
         *    (because the main consumer of this service
         *    is TCP, which does not have this problem),
         *    so this last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
              (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                struct fib6_info *from;
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

                rcu_read_lock();
                from = rt ? rcu_dereference(rt->from) : NULL;
                err = ip6_route_get_saddr(net, from, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                rcu_read_unlock();

                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here, if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router.
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}
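
/* Note: the final v4-mapped test above rejects mixed-family flows (an
 * IPv4-mapped source paired with a genuine IPv6 destination) with
 * -EAFNOSUPPORT, since such a packet could not be routed coherently.
 */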

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace the lookup is performed in
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
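
/* Usage sketch (illustrative; error handling abbreviated): callers that
 * need an xfrm-aware route typically do
 *
 *      dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *      if (IS_ERR(dst))
 *              return PTR_ERR(dst);
 *
 * i.e. the result is never NULL; failures come back as an ERR_PTR()
 * encoded errno, which is why IS_ERR() is used rather than a NULL check.
 */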

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *      @connected: whether @sk is connected or not
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      In addition, for a connected socket, cache the dst in the socket
 *      if the current cache is not valid.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool connected)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (dst)
                return dst;

        dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
        if (connected && !IS_ERR(dst))
                ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
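
/* Note: for a connected datagram socket this is the fast path --
 * sk_dst_check() plus ip6_sk_dst_check() reuse the cached route while it
 * stays valid, and only a cache miss falls back to the full
 * ip6_dst_lookup_flow() followed by ip6_sk_dst_store_flow().
 */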

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not the first; the header
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *opt = ipc6->opt;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = sizeof(*opt);
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above --miyazawa */
        }
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                        READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < IPV6_MIN_MTU)
                return -EINVAL;
        cork->base.fragsize = mtu;
        cork->base.gso_size = sk->sk_type == SOCK_DGRAM &&
                              sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0;

        if (dst_allfrag(xfrm_dst_path(&rt->dst)))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        return 0;
}
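
/* Note: the corked MTU is pinned here once per ip6_append_data() cycle.
 * For an xfrm tunnel route the tunnel dst's own MTU applies; otherwise the
 * MTU of the path below any transformation (xfrm_dst_path()) is used, so
 * subsequent appends fragment consistently even if the route changes
 * mid-message. IPV6_PMTUDISC_PROBE selects the raw device MTU over the
 * path MTU.
 */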

static int __ip6_append_data(struct sock *sk,
                             struct flowi6 *fl6,
                             struct sk_buff_head *queue,
                             struct inet_cork *cork,
                             struct inet6_cork *v6_cork,
                             struct page_frag *pfrag,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
                             unsigned int flags, struct ipcm6_cookie *ipc6,
                             const struct sockcm_cookie *sockc)
{
        struct sk_buff *skb, *skb_prev = NULL;
        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
        int exthdrlen = 0;
        int dst_exthdrlen = 0;
        int hh_len;
        int copy;
        int err;
        int offset = 0;
        __u8 tx_flags = 0;
        u32 tskey = 0;
        struct rt6_info *rt = (struct rt6_info *)cork->dst;
        struct ipv6_txoptions *opt = v6_cork->opt;
        int csummode = CHECKSUM_NONE;
        unsigned int maxnonfragsize, headersize;
        unsigned int wmem_alloc_delta = 0;
        bool paged;

        skb = skb_peek_tail(queue);
        if (!skb) {
                exthdrlen = opt ? opt->opt_flen : 0;
                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
        }

        paged = !!cork->gso_size;
        mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
        orig_mtu = mtu;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
                     sizeof(struct frag_hdr);

        headersize = sizeof(struct ipv6hdr) +
                     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
                     (dst_allfrag(&rt->dst) ?
                      sizeof(struct frag_hdr) : 0) +
                     rt->rt6i_nfheader_len;

        /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
         * in the first fragment
         */
        if (headersize + transhdrlen > mtu)
                goto emsgsize;

        if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
            (sk->sk_protocol == IPPROTO_UDP ||
             sk->sk_protocol == IPPROTO_RAW)) {
                ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
                                sizeof(struct ipv6hdr));
                goto emsgsize;
        }

        if (ip6_sk_ignore_df(sk))
                maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
        else
                maxnonfragsize = mtu;

        if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
                pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
                ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
                return -EMSGSIZE;
        }

        /* CHECKSUM_PARTIAL only with no extension headers and when
         * we are not going to fragment
         */
        if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
            headersize == sizeof(struct ipv6hdr) &&
            length <= mtu - headersize &&
            (!(flags & MSG_MORE) || cork->gso_size) &&
            rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
                csummode = CHECKSUM_PARTIAL;

        if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
                sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
                if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
                    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
                        tskey = sk->sk_tskey++;
        }

        /*
         * Let's try using as much space as possible.
         * Use the MTU if the total length of the message fits into it.
         * Otherwise, we need to reserve the fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */
1344
1345        cork->length += length;
1346        if (!skb)
1347                goto alloc_new_skb;
1348
1349        while (length > 0) {
1350                /* Check if the remaining data fits into current packet. */
1351                copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1352                if (copy < length)
1353                        copy = maxfraglen - skb->len;
1354
1355                if (copy <= 0) {
1356                        char *data;
1357                        unsigned int datalen;
1358                        unsigned int fraglen;
1359                        unsigned int fraggap;
1360                        unsigned int alloclen;
1361                        unsigned int pagedlen = 0;
1362alloc_new_skb:
1363                        /* There's no room in the current skb */
1364                        if (skb)
1365                                fraggap = skb->len - maxfraglen;
1366                        else
1367                                fraggap = 0;
1368                        /* update mtu and maxfraglen if necessary */
1369                        if (!skb || !skb_prev)
1370                                ip6_append_data_mtu(&mtu, &maxfraglen,
1371                                                    fragheaderlen, skb, rt,
1372                                                    orig_mtu);
1373
1374                        skb_prev = skb;
1375
1376                        /*
1377                         * If remaining data exceeds the mtu,
1378                         * we know we need more fragment(s).
1379                         */
1380                        datalen = length + fraggap;
1381
1382                        if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1383                                datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1384                        fraglen = datalen + fragheaderlen;
1385
1386                        if ((flags & MSG_MORE) &&
1387                            !(rt->dst.dev->features&NETIF_F_SG))
1388                                alloclen = mtu;
1389                        else if (!paged)
1390                                alloclen = fraglen;
1391                        else {
1392                                alloclen = min_t(int, fraglen, MAX_HEADER);
1393                                pagedlen = fraglen - alloclen;
1394                        }
1395
1396                        alloclen += dst_exthdrlen;
1397
1398                        if (datalen != length + fraggap) {
1399                                /*
1400                                 * this is not the last fragment, the trailer
1401                                 * space is regarded as data space.
1402                                 */
1403                                datalen += rt->dst.trailer_len;
1404                        }
1405
1406                        alloclen += rt->dst.trailer_len;
1407                        fraglen = datalen + fragheaderlen;
1408
1409                        /*
1410                         * We just reserve space for the fragment header.
1411                         * Note: this may be an overallocation if the message
1412                         * (without MSG_MORE) fits into the MTU.
1413                         */
1414                        alloclen += sizeof(struct frag_hdr);
1415
1416                        copy = datalen - transhdrlen - fraggap - pagedlen;
1417                        if (copy < 0) {
1418                                err = -EINVAL;
1419                                goto error;
1420                        }
1421                        if (transhdrlen) {
1422                                skb = sock_alloc_send_skb(sk,
1423                                                alloclen + hh_len,
1424                                                (flags & MSG_DONTWAIT), &err);
1425                        } else {
1426                                skb = NULL;
1427                                if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1428                                    2 * sk->sk_sndbuf)
1429                                        skb = alloc_skb(alloclen + hh_len,
1430                                                        sk->sk_allocation);
1431                                if (unlikely(!skb))
1432                                        err = -ENOBUFS;
1433                        }
1434                        if (!skb)
1435                                goto error;
1436                        /*
1437                         *      Fill in the control structures
1438                         */
1439                        skb->protocol = htons(ETH_P_IPV6);
1440                        skb->ip_summed = csummode;
1441                        skb->csum = 0;
1442                        /* reserve for fragmentation and IPsec headers */
1443                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1444                                    dst_exthdrlen);
1445
1446                        /* Only the initial fragment is time-stamped */
1447                        skb_shinfo(skb)->tx_flags = tx_flags;
1448                        tx_flags = 0;
1449                        skb_shinfo(skb)->tskey = tskey;
1450                        tskey = 0;
1451
1452                        /*
1453                         *      Find where to start putting bytes
1454                         */
1455                        data = skb_put(skb, fraglen - pagedlen);
1456                        skb_set_network_header(skb, exthdrlen);
1457                        data += fragheaderlen;
1458                        skb->transport_header = (skb->network_header +
1459                                                 fragheaderlen);
1460                        if (fraggap) {
1461                                skb->csum = skb_copy_and_csum_bits(
1462                                        skb_prev, maxfraglen,
1463                                        data + transhdrlen, fraggap, 0);
1464                                skb_prev->csum = csum_sub(skb_prev->csum,
1465                                                          skb->csum);
1466                                data += fraggap;
1467                                pskb_trim_unique(skb_prev, maxfraglen);
1468                        }
1469                        if (copy > 0 &&
1470                            getfrag(from, data + transhdrlen, offset,
1471                                    copy, fraggap, skb) < 0) {
1472                                err = -EFAULT;
1473                                kfree_skb(skb);
1474                                goto error;
1475                        }
1476
1477                        offset += copy;
1478                        length -= copy + transhdrlen;
1479                        transhdrlen = 0;
1480                        exthdrlen = 0;
1481                        dst_exthdrlen = 0;
1482
1483                        if ((flags & MSG_CONFIRM) && !skb_prev)
1484                                skb_set_dst_pending_confirm(skb, 1);
1485
1486                        /*
1487                         * Put the packet on the pending queue
1488                         */
1489                        if (!skb->destructor) {
1490                                skb->destructor = sock_wfree;
1491                                skb->sk = sk;
1492                                wmem_alloc_delta += skb->truesize;
1493                        }
1494                        __skb_queue_tail(queue, skb);
1495                        continue;
1496                }
1497
1498                if (copy > length)
1499                        copy = length;
1500
1501                if (!(rt->dst.dev->features&NETIF_F_SG) &&
1502                    skb_tailroom(skb) >= copy) {
1503                        unsigned int off;
1504
1505                        off = skb->len;
1506                        if (getfrag(from, skb_put(skb, copy),
1507                                                offset, copy, off, skb) < 0) {
1508                                __skb_trim(skb, off);
1509                                err = -EFAULT;
1510                                goto error;
1511                        }
1512                } else {
1513                        int i = skb_shinfo(skb)->nr_frags;
1514
1515                        err = -ENOMEM;
1516                        if (!sk_page_frag_refill(sk, pfrag))
1517                                goto error;
1518
1519                        if (!skb_can_coalesce(skb, i, pfrag->page,
1520                                              pfrag->offset)) {
1521                                err = -EMSGSIZE;
1522                                if (i == MAX_SKB_FRAGS)
1523                                        goto error;
1524
1525                                __skb_fill_page_desc(skb, i, pfrag->page,
1526                                                     pfrag->offset, 0);
1527                                skb_shinfo(skb)->nr_frags = ++i;
1528                                get_page(pfrag->page);
1529                        }
1530                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
1531                        if (getfrag(from,
1532                                    page_address(pfrag->page) + pfrag->offset,
1533                                    offset, copy, skb->len, skb) < 0)
1534                                goto error_efault;
1535
1536                        pfrag->offset += copy;
1537                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1538                        skb->len += copy;
1539                        skb->data_len += copy;
1540                        skb->truesize += copy;
1541                        wmem_alloc_delta += copy;
1542                }
1543                offset += copy;
1544                length -= copy;
1545        }
1546
1547        if (wmem_alloc_delta)
1548                refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1549        return 0;
1550
1551error_efault:
1552        err = -EFAULT;
1553error:
1554        cork->length -= length;
1555        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1556        refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1557        return err;
1558}
1559
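/*
 * Editorial sketch, not part of the original file: a minimal getfrag
 * callback of the shape __ip6_append_data() expects.  It copies from a
 * plain kernel buffer passed as the opaque 'from' cookie and folds the
 * copied bytes into skb->csum when the stack is doing software
 * checksums ('odd' is the byte offset of 'to' within the packet, used
 * to realign the 16-bit sum).  Real callers normally use
 * ip_generic_getfrag() for user iovecs; the name below is hypothetical.
 */
static int example_getfrag(void *from, char *to, int offset, int len,
                           int odd, struct sk_buff *skb)
{
        const char *buf = from;

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                /* hardware will checksum later; a plain copy is enough */
                memcpy(to, buf + offset, len);
        } else {
                __wsum csum;

                csum = csum_partial_copy_nocheck(buf + offset, to, len, 0);
                skb->csum = csum_block_add(skb->csum, csum, odd);
        }
        return 0;
}
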
1560int ip6_append_data(struct sock *sk,
1561                    int getfrag(void *from, char *to, int offset, int len,
1562                                int odd, struct sk_buff *skb),
1563                    void *from, int length, int transhdrlen,
1564                    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1565                    struct rt6_info *rt, unsigned int flags,
1566                    const struct sockcm_cookie *sockc)
1567{
1568        struct inet_sock *inet = inet_sk(sk);
1569        struct ipv6_pinfo *np = inet6_sk(sk);
1570        int exthdrlen;
1571        int err;
1572
1573        if (flags&MSG_PROBE)
1574                return 0;
1575        if (skb_queue_empty(&sk->sk_write_queue)) {
1576                /*
1577                 * setup for corking
1578                 */
1579                err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1580                                     ipc6, rt, fl6);
1581                if (err)
1582                        return err;
1583
1584                exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1585                length += exthdrlen;
1586                transhdrlen += exthdrlen;
1587        } else {
1588                fl6 = &inet->cork.fl.u.ip6;
1589                transhdrlen = 0;
1590        }
1591
1592        return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1593                                 &np->cork, sk_page_frag(sk), getfrag,
1594                                 from, length, transhdrlen, flags, ipc6, sockc);
1595}
1596EXPORT_SYMBOL_GPL(ip6_append_data);
1597
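/*
 * Editorial sketch, not part of the original file: how a datagram
 * protocol typically drives the corked path (compare udpv6_sendmsg()).
 * ip6_append_data() queues data on sk->sk_write_queue; unless MSG_MORE
 * keeps the cork in place, ip6_push_pending_frames() then builds and
 * sends the packet.  A real protocol would first write its transport
 * header into the reserved transhdrlen bytes (UDP does this in
 * udp_v6_push_pending_frames()).  Routing and cmsg setup are assumed
 * done by the caller; the function name is hypothetical.
 */
static int example_send_corked(struct sock *sk, struct msghdr *msg,
                               size_t len, struct ipcm6_cookie *ipc6,
                               struct flowi6 *fl6, struct rt6_info *rt,
                               const struct sockcm_cookie *sockc)
{
        int err;

        lock_sock(sk);
        err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
                              sizeof(struct udphdr), ipc6, fl6, rt,
                              msg->msg_flags, sockc);
        if (err)
                ip6_flush_pending_frames(sk);
        else if (!(msg->msg_flags & MSG_MORE))
                err = ip6_push_pending_frames(sk);
        release_sock(sk);
        return err;
}
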
1598static void ip6_cork_release(struct inet_cork_full *cork,
1599                             struct inet6_cork *v6_cork)
1600{
1601        if (v6_cork->opt) {
1602                kfree(v6_cork->opt->dst0opt);
1603                kfree(v6_cork->opt->dst1opt);
1604                kfree(v6_cork->opt->hopopt);
1605                kfree(v6_cork->opt->srcrt);
1606                kfree(v6_cork->opt);
1607                v6_cork->opt = NULL;
1608        }
1609
1610        if (cork->base.dst) {
1611                dst_release(cork->base.dst);
1612                cork->base.dst = NULL;
1613                cork->base.flags &= ~IPCORK_ALLFRAG;
1614        }
1615        memset(&cork->fl, 0, sizeof(cork->fl));
1616}
1617
1618struct sk_buff *__ip6_make_skb(struct sock *sk,
1619                               struct sk_buff_head *queue,
1620                               struct inet_cork_full *cork,
1621                               struct inet6_cork *v6_cork)
1622{
1623        struct sk_buff *skb, *tmp_skb;
1624        struct sk_buff **tail_skb;
1625        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1626        struct ipv6_pinfo *np = inet6_sk(sk);
1627        struct net *net = sock_net(sk);
1628        struct ipv6hdr *hdr;
1629        struct ipv6_txoptions *opt = v6_cork->opt;
1630        struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1631        struct flowi6 *fl6 = &cork->fl.u.ip6;
1632        unsigned char proto = fl6->flowi6_proto;
1633
1634        skb = __skb_dequeue(queue);
1635        if (!skb)
1636                goto out;
1637        tail_skb = &(skb_shinfo(skb)->frag_list);
1638
1639        /* move skb->data from the ext header back to the IP header */
1640        if (skb->data < skb_network_header(skb))
1641                __skb_pull(skb, skb_network_offset(skb));
1642        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1643                __skb_pull(tmp_skb, skb_network_header_len(skb));
1644                *tail_skb = tmp_skb;
1645                tail_skb = &(tmp_skb->next);
1646                skb->len += tmp_skb->len;
1647                skb->data_len += tmp_skb->len;
1648                skb->truesize += tmp_skb->truesize;
1649                tmp_skb->destructor = NULL;
1650                tmp_skb->sk = NULL;
1651        }
1652
1653        /* Allow local fragmentation. */
1654        skb->ignore_df = ip6_sk_ignore_df(sk);
1655
1656        *final_dst = fl6->daddr;
1657        __skb_pull(skb, skb_network_header_len(skb));
1658        if (opt && opt->opt_flen)
1659                ipv6_push_frag_opts(skb, opt, &proto);
1660        if (opt && opt->opt_nflen)
1661                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1662
1663        skb_push(skb, sizeof(struct ipv6hdr));
1664        skb_reset_network_header(skb);
1665        hdr = ipv6_hdr(skb);
1666
1667        ip6_flow_hdr(hdr, v6_cork->tclass,
1668                     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1669                                        ip6_autoflowlabel(net, np), fl6));
1670        hdr->hop_limit = v6_cork->hop_limit;
1671        hdr->nexthdr = proto;
1672        hdr->saddr = fl6->saddr;
1673        hdr->daddr = *final_dst;
1674
1675        skb->priority = sk->sk_priority;
1676        skb->mark = sk->sk_mark;
1677
1678        skb_dst_set(skb, dst_clone(&rt->dst));
1679        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1680        if (proto == IPPROTO_ICMPV6) {
1681                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1682
1683                ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1684                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1685        }
1686
1687        ip6_cork_release(cork, v6_cork);
1688out:
1689        return skb;
1690}
1691
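/*
 * Editorial note, not in the original source: in __ip6_make_skb() above,
 * opt_flen covers the options that belong in the fragmentable part of
 * the packet (e.g. a trailing destination options header) and opt_nflen
 * the non-fragmentable ones (hop-by-hop, dst0, routing).  They are
 * pushed innermost-first, so after the final skb_push() the headers
 * read, front to back:
 *
 *   [ipv6hdr][hop-by-hop][dst0][routing][dst1][transport + payload]
 *
 * Each push also rewrites *proto, chaining every nexthdr field to the
 * header behind it, and a routing header may replace the wire
 * destination address (hence final_dst).
 */
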
1692int ip6_send_skb(struct sk_buff *skb)
1693{
1694        struct net *net = sock_net(skb->sk);
1695        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1696        int err;
1697
1698        err = ip6_local_out(net, skb->sk, skb);
1699        if (err) {
1700                if (err > 0)
1701                        err = net_xmit_errno(err);
1702                if (err)
1703                        IP6_INC_STATS(net, rt->rt6i_idev,
1704                                      IPSTATS_MIB_OUTDISCARDS);
1705        }
1706
1707        return err;
1708}
1709
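/*
 * Editorial note, not in the original source: ip6_local_out() can
 * return positive NET_XMIT_* codes from the queueing layer.  Assuming
 * the usual net_xmit_errno() mapping, NET_XMIT_CN (congestion
 * notification; the packet may still have been sent) becomes 0, and the
 * other nonzero codes become -ENOBUFS, so in ip6_send_skb() above only
 * genuine failures bump IPSTATS_MIB_OUTDISCARDS and reach the caller.
 */
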
1710int ip6_push_pending_frames(struct sock *sk)
1711{
1712        struct sk_buff *skb;
1713
1714        skb = ip6_finish_skb(sk);
1715        if (!skb)
1716                return 0;
1717
1718        return ip6_send_skb(skb);
1719}
1720EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1721
1722static void __ip6_flush_pending_frames(struct sock *sk,
1723                                       struct sk_buff_head *queue,
1724                                       struct inet_cork_full *cork,
1725                                       struct inet6_cork *v6_cork)
1726{
1727        struct sk_buff *skb;
1728
1729        while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1730                if (skb_dst(skb))
1731                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1732                                      IPSTATS_MIB_OUTDISCARDS);
1733                kfree_skb(skb);
1734        }
1735
1736        ip6_cork_release(cork, v6_cork);
1737}
1738
1739void ip6_flush_pending_frames(struct sock *sk)
1740{
1741        __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1742                                   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1743}
1744EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1745
1746struct sk_buff *ip6_make_skb(struct sock *sk,
1747                             int getfrag(void *from, char *to, int offset,
1748                                         int len, int odd, struct sk_buff *skb),
1749                             void *from, int length, int transhdrlen,
1750                             struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1751                             struct rt6_info *rt, unsigned int flags,
1752                             struct inet_cork_full *cork,
1753                             const struct sockcm_cookie *sockc)
1754{
1755        struct inet6_cork v6_cork;
1756        struct sk_buff_head queue;
1757        int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1758        int err;
1759
1760        if (flags & MSG_PROBE)
1761                return NULL;
1762
1763        __skb_queue_head_init(&queue);
1764
1765        cork->base.flags = 0;
1766        cork->base.addr = 0;
1767        cork->base.opt = NULL;
1768        cork->base.dst = NULL;
1769        v6_cork.opt = NULL;
1770        err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1771        if (err) {
1772                ip6_cork_release(cork, &v6_cork);
1773                return ERR_PTR(err);
1774        }
1775        if (ipc6->dontfrag < 0)
1776                ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1777
1778        err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1779                                &current->task_frag, getfrag, from,
1780                                length + exthdrlen, transhdrlen + exthdrlen,
1781                                flags, ipc6, sockc);
1782        if (err) {
1783                __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1784                return ERR_PTR(err);
1785        }
1786
1787        return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1788}
1789
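/*
 * Editorial sketch, not part of the original file: the lockless
 * one-shot path built on ip6_make_skb().  Because the data is appended
 * to a private queue and collapsed into a single skb, no socket lock is
 * taken (compare the UDPv6 fast path in udpv6_sendmsg() and
 * udp_v6_send_skb()).  As above, a real protocol would fill in its
 * transport header before sending; the function name is hypothetical.
 */
static int example_send_oneshot(struct sock *sk, struct msghdr *msg,
                                size_t len, struct ipcm6_cookie *ipc6,
                                struct flowi6 *fl6, struct rt6_info *rt,
                                struct inet_cork_full *cork,
                                const struct sockcm_cookie *sockc)
{
        struct sk_buff *skb;

        skb = ip6_make_skb(sk, ip_generic_getfrag, msg, len,
                           sizeof(struct udphdr), ipc6, fl6, rt,
                           msg->msg_flags, cork, sockc);
        if (IS_ERR_OR_NULL(skb))
                return PTR_ERR(skb);

        return ip6_send_skb(skb);
}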