linux/net/ipv6/ip6_output.c
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

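/* Final transmission step: loop multicast packets back locally when
 * required, honour lwtunnel xmit redirects, then resolve the nexthop
 * neighbour and hand the skb to neigh_output().
 */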
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct in6_addr *nexthop;
        int ret;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_is_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

                if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
                    IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
        neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
        if (!IS_ERR(neigh)) {
                sock_confirm_neigh(skb, neigh);
                ret = neigh_output(neigh, skb);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

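/* Run the egress cgroup BPF program, re-run dst_output() when SNAT gave
 * us a new xfrm policy, and fragment the skb when it exceeds the MTU.
 */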
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        if (ret) {
                kfree_skb(skb);
                return ret;
        }

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm) {
                IPCB(skb)->flags |= IPSKB_REROUTED;
                return dst_output(net, sk, skb);
        }
#endif

        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}

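/* Output entry point for locally generated packets: traverses the
 * NF_INET_POST_ROUTING hook unless the skb was already rerouted.
 */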
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
        if (!np->autoflowlabel_set)
                return ip6_default_np_autolabel(net);
        else
                return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the skb might
 * be modified by calls to skb_set_owner_w() and ipv6_local_error(),
 * which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        if (opt) {
                unsigned int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                 * MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (!skb2) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        consume_skb(skb);
                        skb = skb2;
                        /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
                         * so it is safe to call in our context (socket lock not held)
                         */
                        skb_set_owner_w(skb, (struct sock *)sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                ip6_autoflowlabel(net, np), fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = sk->sk_priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_OUT, skb->len);

                /* if the egress device is enslaved to an L3 master device,
                 * pass the skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume the socket lock is held,
                 * so we promote our socket to non-const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dst->dev,
                               dst_output);
        }

        skb->dev = dst->dev;
        /* ipv6_local_error() does not require the socket lock,
         * so we promote our socket to non-const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

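/* Deliver a Router Alert packet to every raw socket registered on the
 * matching ip6_ra_chain selector.  Returns 1 if at least one socket
 * consumed the skb, 0 otherwise.
 */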
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

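/* Decide what to do with a packet destined to a proxied address:
 * 1 - pass it to the input path (NDISC messages), 0 - forward it,
 * -1 - discard it (link-local destination).
 */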
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* Unicast neighbour discovery messages destined
                         * to the proxied address are passed to the input
                         * function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

        return dst_output(net, sk, skb);
}

unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
        unsigned int mtu;
        struct inet6_dev *idev;

        if (dst_metric_locked(dst, RTAX_MTU)) {
                mtu = dst_metric_raw(dst, RTAX_MTU);
                if (mtu)
                        return mtu;
        }

        mtu = IPV6_MIN_MTU;
        rcu_read_lock();
        idev = __in6_dev_get(dst->dev);
        if (idev)
                mtu = idev->cnf.mtu6;
        rcu_read_unlock();

        return mtu;
}
EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
                return false;

        return true;
}

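/* Forwarding path proper: validate the packet, decrement the hop limit,
 * emit redirects when appropriate and pass the skb through the
 * NF_INET_FORWARD hook to ip6_forward_finish().
 */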
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We do NOT do any processing on router alert (RA)
         *      packets; we push them to user level AS IS, with no
         *      guarantee that the application will be able to
         *      interpret them. The reason is that we cannot do
         *      anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP we cannot do anything with it either.
         *      Defragmentation would also be a mistake: RA packets
         *      cannot be fragmented, because there is no guarantee
         *      that different fragments will follow the same path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      check and decrement the hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force the OUTPUT device to be used for the source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                        IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* The IPv6 specs say nothing about it, but it is clear that we
         * cannot send redirects to source routed frames.
         * We don't send redirects to frames decapsulated from IPsec.
         */
        if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same,
                 *      so send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                 * and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_forward(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force the OUTPUT device to be used for the
                 * source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling the hop limit is delayed until after the skb COW */

        hdr->hop_limit--;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

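/* Propagate per-packet metadata from the original skb to a fragment. */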
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_copy_secmark(to, from);
}

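/* Split an oversized skb into fragments and pass each one to @output.
 * The fast path reuses an existing frag list when its geometry already
 * matches; otherwise the slow path copies the payload into new skbs.
 */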
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        int hroom, troom;
        __be32 frag_id;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        err = -ENOMEM;
                        goto fail;
                }
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);

                __skb_pull(skb, hlen);
                fh = __skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                fh->identification = frag_id;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = __skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                kfree_skb_list(frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        troom = rt->dst.dev->needed_tailroom;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0)        {
                u8 *fragnexthdr_offset;

                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end,
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }

                /* Allocate buffer */
                frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                 hroom + troom, GFP_ATOMIC);
                if (!frag) {
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                fragnexthdr_offset = skb_network_header(frag);
                fragnexthdr_offset += prevhdr - skb_network_header(skb);
                *fragnexthdr_offset = NEXTHDR_FRAGMENT;

                /*
                 *      Build the fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
                                     len));
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the not connected
         * case is not very simple. Take into account that we
         * do not support routing by source, TOS, and
         * MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If the route was a host route,
         *    check that the cached destination is current.
         *    If it is a network route, we still may
         *    check its validity using the saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now
         *    (because the main consumer of this service
         *    is tcp, which does not have this problem),
         *    so this last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
              (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace the lookup is done in
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer-encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *      @connected: whether @sk is connected or not
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      In addition, for a connected socket, cache the dst in the socket
 *      if the current cache is not valid.
 *
 *      It returns a valid dst pointer on success, or a pointer-encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool connected)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (dst)
                return dst;

        dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
        if (connected && !IS_ERR(dst))
                ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

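/* Recompute mtu/maxfraglen for ip6_append_data(): only the first
 * fragment has to reserve rt->dst.header_len.
 */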
static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not the first one; the
                         * header space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

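/* Set up cork state for ip6_append_data(): duplicate the tx options,
 * take a reference on the route and derive the corked MTU.
 */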
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *opt = ipc6->opt;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = sizeof(*opt);
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above -- miyazawa */
        }
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                        READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < IPV6_MIN_MTU)
                return -EINVAL;
        cork->base.fragsize = mtu;
        if (dst_allfrag(xfrm_dst_path(&rt->dst)))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        return 0;
}

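/* Core of ip6_append_data(): append @length bytes obtained via @getfrag
 * to the queued skbs, growing or allocating packets so that each one
 * (except possibly the last) fills a fragment-aligned MTU.
 */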
1248static int __ip6_append_data(struct sock *sk,
1249                             struct flowi6 *fl6,
1250                             struct sk_buff_head *queue,
1251                             struct inet_cork *cork,
1252                             struct inet6_cork *v6_cork,
1253                             struct page_frag *pfrag,
1254                             int getfrag(void *from, char *to, int offset,
1255                                         int len, int odd, struct sk_buff *skb),
1256                             void *from, int length, int transhdrlen,
1257                             unsigned int flags, struct ipcm6_cookie *ipc6,
1258                             const struct sockcm_cookie *sockc)
1259{
1260        struct sk_buff *skb, *skb_prev = NULL;
1261        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1262        int exthdrlen = 0;
1263        int dst_exthdrlen = 0;
1264        int hh_len;
1265        int copy;
1266        int err;
1267        int offset = 0;
1268        __u8 tx_flags = 0;
1269        u32 tskey = 0;
1270        struct rt6_info *rt = (struct rt6_info *)cork->dst;
1271        struct ipv6_txoptions *opt = v6_cork->opt;
1272        int csummode = CHECKSUM_NONE;
1273        unsigned int maxnonfragsize, headersize;
1274        unsigned int wmem_alloc_delta = 0;
1275
1276        skb = skb_peek_tail(queue);
1277        if (!skb) {
1278                exthdrlen = opt ? opt->opt_flen : 0;
1279                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1280        }
1281
1282        mtu = cork->fragsize;
1283        orig_mtu = mtu;
1284
1285        hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1286
1287        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1288                        (opt ? opt->opt_nflen : 0);
1289        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1290                     sizeof(struct frag_hdr);
1291
1292        headersize = sizeof(struct ipv6hdr) +
1293                     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1294                     (dst_allfrag(&rt->dst) ?
1295                      sizeof(struct frag_hdr) : 0) +
1296                     rt->rt6i_nfheader_len;
1297
1298        /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1299         * the first fragment
1300         */
1301        if (headersize + transhdrlen > mtu)
1302                goto emsgsize;
1303
1304        if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1305            (sk->sk_protocol == IPPROTO_UDP ||
1306             sk->sk_protocol == IPPROTO_RAW)) {
1307                ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1308                                sizeof(struct ipv6hdr));
1309                goto emsgsize;
1310        }
1311
1312        if (ip6_sk_ignore_df(sk))
1313                maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1314        else
1315                maxnonfragsize = mtu;
1316
1317        if (cork->length + length > maxnonfragsize - headersize) {
1318emsgsize:
1319                pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1320                ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1321                return -EMSGSIZE;
1322        }
1323
1324        /* CHECKSUM_PARTIAL only with no extension headers and when
1325         * we are not going to fragment
1326         */
1327        if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1328            headersize == sizeof(struct ipv6hdr) &&
1329            length <= mtu - headersize &&
1330            !(flags & MSG_MORE) &&
1331            rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1332                csummode = CHECKSUM_PARTIAL;
1333
1334        if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1335                sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1336                if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1337                    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1338                        tskey = sk->sk_tskey++;
1339        }
1340
1341        /*
1342         * Let's try using as much space as possible.
1343         * Use MTU if total length of the message fits into the MTU.
1344         * Otherwise, we need to reserve fragment header and
1345         * fragment alignment (= 8-15 octects, in total).
1346         *
1347         * Note that we may need to "move" the data from the tail of
1348         * of the buffer to the new fragment when we split
1349         * the message.
1350         *
1351         * FIXME: It may be fragmented into multiple chunks
1352         *        at once if non-fragmentable extension headers
1353         *        are too large.
1354         * --yoshfuji
1355         */
1356
1357        cork->length += length;
1358        if (!skb)
1359                goto alloc_new_skb;
1360
1361        while (length > 0) {
1362                /* Check if the remaining data fits into current packet. */
1363                copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1364                if (copy < length)
1365                        copy = maxfraglen - skb->len;
1366
1367                if (copy <= 0) {
1368                        char *data;
1369                        unsigned int datalen;
1370                        unsigned int fraglen;
1371                        unsigned int fraggap;
1372                        unsigned int alloclen;
1373alloc_new_skb:
1374                        /* There's no room in the current skb */
1375                        if (skb)
1376                                fraggap = skb->len - maxfraglen;
1377                        else
1378                                fraggap = 0;
1379                        /* update mtu and maxfraglen if necessary */
1380                        if (!skb || !skb_prev)
1381                                ip6_append_data_mtu(&mtu, &maxfraglen,
1382                                                    fragheaderlen, skb, rt,
1383                                                    orig_mtu);
1384
1385                        skb_prev = skb;
1386
1387                        /*
1388                         * If remaining data exceeds the mtu,
1389                         * we know we need more fragment(s).
1390                         */
1391                        datalen = length + fraggap;
1392
1393                        if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1394                                datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1395                        if ((flags & MSG_MORE) &&
1396                            !(rt->dst.dev->features&NETIF_F_SG))
1397                                alloclen = mtu;
1398                        else
1399                                alloclen = datalen + fragheaderlen;
1400
1401                        alloclen += dst_exthdrlen;
1402
1403                        if (datalen != length + fraggap) {
1404                                /*
1405                                 * this is not the last fragment, the trailer
1406                                 * space is regarded as data space.
1407                                 */
1408                                datalen += rt->dst.trailer_len;
1409                        }
1410
1411                        alloclen += rt->dst.trailer_len;
1412                        fraglen = datalen + fragheaderlen;
1413
1414                        /*
1415                         * We just reserve space for the fragment header.
1416                         * Note: this may be an overallocation if the
1417                         * message (without MSG_MORE) fits into the MTU.
1418                         */
1419                        alloclen += sizeof(struct frag_hdr);
1420
1421                        copy = datalen - transhdrlen - fraggap;
1422                        if (copy < 0) {
1423                                err = -EINVAL;
1424                                goto error;
1425                        }
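                            /*
                             * (Editorial note.)  Two allocation paths: the
                             * first skb (transhdrlen != 0) is charged to the
                             * socket by sock_alloc_send_skb() and may sleep
                             * waiting for sndbuf space; later fragments use
                             * plain alloc_skb() and are only accounted in
                             * wmem_alloc_delta, capped at twice sk_sndbuf.
                             */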
1426                        if (transhdrlen) {
1427                                skb = sock_alloc_send_skb(sk,
1428                                                alloclen + hh_len,
1429                                                (flags & MSG_DONTWAIT), &err);
1430                        } else {
1431                                skb = NULL;
1432                                if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1433                                    2 * sk->sk_sndbuf)
1434                                        skb = alloc_skb(alloclen + hh_len,
1435                                                        sk->sk_allocation);
1436                                if (unlikely(!skb))
1437                                        err = -ENOBUFS;
1438                        }
1439                        if (!skb)
1440                                goto error;
1441                        /*
1442                         *      Fill in the control structures
1443                         */
1444                        skb->protocol = htons(ETH_P_IPV6);
1445                        skb->ip_summed = csummode;
1446                        skb->csum = 0;
1447                        /* reserve room for the fragment header and IPsec headers */
1448                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1449                                    dst_exthdrlen);
1450
1451                        /* Only the initial fragment is time stamped */
1452                        skb_shinfo(skb)->tx_flags = tx_flags;
1453                        tx_flags = 0;
1454                        skb_shinfo(skb)->tskey = tskey;
1455                        tskey = 0;
1456
1457                        /*
1458                         *      Find where to start putting bytes
1459                         */
1460                        data = skb_put(skb, fraglen);
1461                        skb_set_network_header(skb, exthdrlen);
1462                        data += fragheaderlen;
1463                        skb->transport_header = (skb->network_header +
1464                                                 fragheaderlen);
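                            /*
                             * (Editorial note.)  fraggap is whatever spilled
                             * past maxfraglen in the previous skb; it is
                             * copied, together with its checksum, into the
                             * head of this fragment and trimmed off the old
                             * skb, keeping every fragment boundary 8-byte
                             * aligned.
                             */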
1465                        if (fraggap) {
1466                                skb->csum = skb_copy_and_csum_bits(
1467                                        skb_prev, maxfraglen,
1468                                        data + transhdrlen, fraggap, 0);
1469                                skb_prev->csum = csum_sub(skb_prev->csum,
1470                                                          skb->csum);
1471                                data += fraggap;
1472                                pskb_trim_unique(skb_prev, maxfraglen);
1473                        }
1474                        if (copy > 0 &&
1475                            getfrag(from, data + transhdrlen, offset,
1476                                    copy, fraggap, skb) < 0) {
1477                                err = -EFAULT;
1478                                kfree_skb(skb);
1479                                goto error;
1480                        }
1481
1482                        offset += copy;
1483                        length -= datalen - fraggap;
1484                        transhdrlen = 0;
1485                        exthdrlen = 0;
1486                        dst_exthdrlen = 0;
1487
1488                        if ((flags & MSG_CONFIRM) && !skb_prev)
1489                                skb_set_dst_pending_confirm(skb, 1);
1490
1491                        /*
1492                         * Put the packet on the pending queue
1493                         */
1494                        if (!skb->destructor) {
1495                                skb->destructor = sock_wfree;
1496                                skb->sk = sk;
1497                                wmem_alloc_delta += skb->truesize;
1498                        }
1499                        __skb_queue_tail(queue, skb);
1500                        continue;
1501                }
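                    /*
                     * (Editorial note.)  The data fits in the current skb:
                     * copy into linear tailroom when the device cannot do
                     * scatter-gather, otherwise append to the per-socket
                     * page fragment and hang it off the skb as a page frag.
                     */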
1502
1503                if (copy > length)
1504                        copy = length;
1505
1506                if (!(rt->dst.dev->features&NETIF_F_SG) &&
1507                    skb_tailroom(skb) >= copy) {
1508                        unsigned int off;
1509
1510                        off = skb->len;
1511                        if (getfrag(from, skb_put(skb, copy),
1512                                                offset, copy, off, skb) < 0) {
1513                                __skb_trim(skb, off);
1514                                err = -EFAULT;
1515                                goto error;
1516                        }
1517                } else {
1518                        int i = skb_shinfo(skb)->nr_frags;
1519
1520                        err = -ENOMEM;
1521                        if (!sk_page_frag_refill(sk, pfrag))
1522                                goto error;
1523
1524                        if (!skb_can_coalesce(skb, i, pfrag->page,
1525                                              pfrag->offset)) {
1526                                err = -EMSGSIZE;
1527                                if (i == MAX_SKB_FRAGS)
1528                                        goto error;
1529
1530                                __skb_fill_page_desc(skb, i, pfrag->page,
1531                                                     pfrag->offset, 0);
1532                                skb_shinfo(skb)->nr_frags = ++i;
1533                                get_page(pfrag->page);
1534                        }
1535                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
1536                        if (getfrag(from,
1537                                    page_address(pfrag->page) + pfrag->offset,
1538                                    offset, copy, skb->len, skb) < 0)
1539                                goto error_efault;
1540
1541                        pfrag->offset += copy;
1542                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1543                        skb->len += copy;
1544                        skb->data_len += copy;
1545                        skb->truesize += copy;
1546                        wmem_alloc_delta += copy;
1547                }
1548                offset += copy;
1549                length -= copy;
1550        }
1551
1552        if (wmem_alloc_delta)
1553                refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1554        return 0;
1555
1556error_efault:
1557        err = -EFAULT;
1558error:
1559        cork->length -= length;
1560        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1561        refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1562        return err;
1563}
1564
1565int ip6_append_data(struct sock *sk,
1566                    int getfrag(void *from, char *to, int offset, int len,
1567                                int odd, struct sk_buff *skb),
1568                    void *from, int length, int transhdrlen,
1569                    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1570                    struct rt6_info *rt, unsigned int flags,
1571                    const struct sockcm_cookie *sockc)
1572{
1573        struct inet_sock *inet = inet_sk(sk);
1574        struct ipv6_pinfo *np = inet6_sk(sk);
1575        int exthdrlen;
1576        int err;
1577
1578        if (flags&MSG_PROBE)
1579                return 0;
1580        if (skb_queue_empty(&sk->sk_write_queue)) {
1581                /*
1582                 * set up for corking
1583                 */
1584                err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1585                                     ipc6, rt, fl6);
1586                if (err)
1587                        return err;
1588
1589                exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1590                length += exthdrlen;
1591                transhdrlen += exthdrlen;
1592        } else {
1593                fl6 = &inet->cork.fl.u.ip6;
1594                transhdrlen = 0;
1595        }
1596
1597        return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1598                                 &np->cork, sk_page_frag(sk), getfrag,
1599                                 from, length, transhdrlen, flags, ipc6, sockc);
1600}
1601EXPORT_SYMBOL_GPL(ip6_append_data);
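
    /*
     * Editorial example, not part of the original file: a minimal sketch of
     * how a datagram sender typically drives the corking API above, modelled
     * on the rawv6/udpv6 callers.  ip_generic_getfrag() (from <net/ip.h>) is
     * the stock getfrag for copying from a struct msghdr; routing, cmsg and
     * error paths are elided, so treat this as illustration only.
     */
    static int example_ip6_sendmsg_tail(struct sock *sk, struct msghdr *msg,
                                        size_t len, struct ipcm6_cookie *ipc6,
                                        struct flowi6 *fl6, struct rt6_info *rt,
                                        const struct sockcm_cookie *sockc)
    {
            int err;

            lock_sock(sk);
            err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
                                  0 /* no transport header of our own */,
                                  ipc6, fl6, rt, msg->msg_flags, sockc);
            if (err)
                    /* drop everything queued so far and release the cork */
                    ip6_flush_pending_frames(sk);
            else if (!(msg->msg_flags & MSG_MORE))
                    /* coalesce the queue into one skb and transmit it */
                    err = ip6_push_pending_frames(sk);
            release_sock(sk);

            return err;
    }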
1602
1603static void ip6_cork_release(struct inet_cork_full *cork,
1604                             struct inet6_cork *v6_cork)
1605{
1606        if (v6_cork->opt) {
1607                kfree(v6_cork->opt->dst0opt);
1608                kfree(v6_cork->opt->dst1opt);
1609                kfree(v6_cork->opt->hopopt);
1610                kfree(v6_cork->opt->srcrt);
1611                kfree(v6_cork->opt);
1612                v6_cork->opt = NULL;
1613        }
1614
1615        if (cork->base.dst) {
1616                dst_release(cork->base.dst);
1617                cork->base.dst = NULL;
1618                cork->base.flags &= ~IPCORK_ALLFRAG;
1619        }
1620        memset(&cork->fl, 0, sizeof(cork->fl));
1621}
1622
1623struct sk_buff *__ip6_make_skb(struct sock *sk,
1624                               struct sk_buff_head *queue,
1625                               struct inet_cork_full *cork,
1626                               struct inet6_cork *v6_cork)
1627{
1628        struct sk_buff *skb, *tmp_skb;
1629        struct sk_buff **tail_skb;
1630        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1631        struct ipv6_pinfo *np = inet6_sk(sk);
1632        struct net *net = sock_net(sk);
1633        struct ipv6hdr *hdr;
1634        struct ipv6_txoptions *opt = v6_cork->opt;
1635        struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1636        struct flowi6 *fl6 = &cork->fl.u.ip6;
1637        unsigned char proto = fl6->flowi6_proto;
1638
1639        skb = __skb_dequeue(queue);
1640        if (!skb)
1641                goto out;
1642        tail_skb = &(skb_shinfo(skb)->frag_list);
1643
1644        /* move skb->data back from the extension headers to the IP header */
1645        if (skb->data < skb_network_header(skb))
1646                __skb_pull(skb, skb_network_offset(skb));
1647        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1648                __skb_pull(tmp_skb, skb_network_header_len(skb));
1649                *tail_skb = tmp_skb;
1650                tail_skb = &(tmp_skb->next);
1651                skb->len += tmp_skb->len;
1652                skb->data_len += tmp_skb->len;
1653                skb->truesize += tmp_skb->truesize;
1654                tmp_skb->destructor = NULL;
1655                tmp_skb->sk = NULL;
1656        }
1657
1658        /* Allow local fragmentation. */
1659        skb->ignore_df = ip6_sk_ignore_df(sk);
1660
1661        *final_dst = fl6->daddr;
1662        __skb_pull(skb, skb_network_header_len(skb));
1663        if (opt && opt->opt_flen)
1664                ipv6_push_frag_opts(skb, opt, &proto);
1665        if (opt && opt->opt_nflen)
1666                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1667
1668        skb_push(skb, sizeof(struct ipv6hdr));
1669        skb_reset_network_header(skb);
1670        hdr = ipv6_hdr(skb);
1671
1672        ip6_flow_hdr(hdr, v6_cork->tclass,
1673                     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1674                                        ip6_autoflowlabel(net, np), fl6));
1675        hdr->hop_limit = v6_cork->hop_limit;
1676        hdr->nexthdr = proto;
1677        hdr->saddr = fl6->saddr;
1678        hdr->daddr = *final_dst;
1679
1680        skb->priority = sk->sk_priority;
1681        skb->mark = sk->sk_mark;
1682
1683        skb_dst_set(skb, dst_clone(&rt->dst));
1684        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1685        if (proto == IPPROTO_ICMPV6) {
1686                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1687
1688                ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1689                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1690        }
1691
1692        ip6_cork_release(cork, v6_cork);
1693out:
1694        return skb;
1695}
1696
1697int ip6_send_skb(struct sk_buff *skb)
1698{
1699        struct net *net = sock_net(skb->sk);
1700        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1701        int err;
1702
1703        err = ip6_local_out(net, skb->sk, skb);
1704        if (err) {
1705                if (err > 0)
1706                        err = net_xmit_errno(err);
1707                if (err)
1708                        IP6_INC_STATS(net, rt->rt6i_idev,
1709                                      IPSTATS_MIB_OUTDISCARDS);
1710        }
1711
1712        return err;
1713}
1714
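    /*
     * Editorial note: ip6_push_pending_frames() below relies on the
     * ip6_finish_skb() helper from include/net/ipv6.h, which simply runs
     * __ip6_make_skb() on the socket's own write queue and cork state,
     * roughly:
     *
     *      static inline struct sk_buff *ip6_finish_skb(struct sock *sk)
     *      {
     *              return __ip6_make_skb(sk, &sk->sk_write_queue,
     *                                    &inet_sk(sk)->cork,
     *                                    &inet6_sk(sk)->cork);
     *      }
     */
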
1715int ip6_push_pending_frames(struct sock *sk)
1716{
1717        struct sk_buff *skb;
1718
1719        skb = ip6_finish_skb(sk);
1720        if (!skb)
1721                return 0;
1722
1723        return ip6_send_skb(skb);
1724}
1725EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1726
1727static void __ip6_flush_pending_frames(struct sock *sk,
1728                                       struct sk_buff_head *queue,
1729                                       struct inet_cork_full *cork,
1730                                       struct inet6_cork *v6_cork)
1731{
1732        struct sk_buff *skb;
1733
1734        while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1735                if (skb_dst(skb))
1736                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1737                                      IPSTATS_MIB_OUTDISCARDS);
1738                kfree_skb(skb);
1739        }
1740
1741        ip6_cork_release(cork, v6_cork);
1742}
1743
1744void ip6_flush_pending_frames(struct sock *sk)
1745{
1746        __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1747                                   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1748}
1749EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1750
1751struct sk_buff *ip6_make_skb(struct sock *sk,
1752                             int getfrag(void *from, char *to, int offset,
1753                                         int len, int odd, struct sk_buff *skb),
1754                             void *from, int length, int transhdrlen,
1755                             struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1756                             struct rt6_info *rt, unsigned int flags,
1757                             const struct sockcm_cookie *sockc)
1758{
1759        struct inet_cork_full cork;
1760        struct inet6_cork v6_cork;
1761        struct sk_buff_head queue;
1762        int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1763        int err;
1764
1765        if (flags & MSG_PROBE)
1766                return NULL;
1767
1768        __skb_queue_head_init(&queue);
1769
1770        cork.base.flags = 0;
1771        cork.base.addr = 0;
1772        cork.base.opt = NULL;
1773        cork.base.dst = NULL;
1774        v6_cork.opt = NULL;
1775        err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1776        if (err) {
1777                ip6_cork_release(&cork, &v6_cork);
1778                return ERR_PTR(err);
1779        }
1780        if (ipc6->dontfrag < 0)
1781                ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1782
1783        err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1784                                &current->task_frag, getfrag, from,
1785                                length + exthdrlen, transhdrlen + exthdrlen,
1786                                flags, ipc6, sockc);
1787        if (err) {
1788                __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1789                return ERR_PTR(err);
1790        }
1791
1792        return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1793}
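
    /*
     * Editorial example, not part of the original file: the uncorked fast
     * path as a UDP-style caller might use it - build the whole datagram in
     * one shot, never touching sk->sk_write_queue, then hand it to
     * ip6_send_skb() above.  Assumes ip_generic_getfrag() from <net/ip.h>;
     * error paths and routing setup are elided.
     */
    static int example_ip6_send_oneshot(struct sock *sk, struct msghdr *msg,
                                        size_t len, struct ipcm6_cookie *ipc6,
                                        struct flowi6 *fl6, struct rt6_info *rt,
                                        const struct sockcm_cookie *sockc)
    {
            struct sk_buff *skb;

            skb = ip6_make_skb(sk, ip_generic_getfrag, msg, len, 0,
                               ipc6, fl6, rt, msg->msg_flags, sockc);
            if (IS_ERR_OR_NULL(skb))
                    return PTR_ERR_OR_ZERO(skb);    /* NULL on MSG_PROBE */

            /* consumes the skb; bumps OUTDISCARDS itself on error */
            return ip6_send_skb(skb);
    }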
1794