linux/net/ipv6/ip6_output.c
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetics in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct in6_addr *nexthop;
        int ret;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_is_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                         * is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

                if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
                    IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
        neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
        if (!IS_ERR(neigh)) {
                sock_confirm_neigh(skb, neigh);
                ret = neigh_output(neigh, skb);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}
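
/*
 * A rough sketch of what rt6_nexthop() boils down to for simple routes
 * (cached/cloned routes add a further case; details vary by kernel
 * version), to make the neighbour lookup above easier to follow:
 *
 *      if (rt->rt6i_flags & RTF_GATEWAY)
 *              nh = &rt->rt6i_gateway;          // off-link: resolve router
 *      else
 *              nh = &ipv6_hdr(skb)->daddr;      // on-link: resolve dest
 *
 * __ipv6_neigh_lookup_noref() then finds, or __neigh_create() creates,
 * the neighbour entry for that address under rcu_read_lock_bh().
 */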

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        if (ret) {
                kfree_skb(skb);
                return ret;
        }

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm) {
                IPCB(skb)->flags |= IPSKB_REROUTED;
                return dst_output(net, sk, skb);
        }
#endif

        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}
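
/*
 * The fragmentation decision above fires in three cases: the packet is
 * longer than the destination MTU and is not GSO (e.g. a 1500 byte skb
 * against a 1280 byte route), the route has the allfrag feature (the
 * path MTU fell below the 1280 byte IPv6 minimum, so every packet must
 * carry a fragment header), or IPv6 conntrack defragmentation recorded
 * a smaller incoming fragment size in IP6CB(skb)->frag_max_size that
 * the reassembled packet now exceeds.
 */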

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
        if (!np->autoflowlabel_set)
                return ip6_default_np_autolabel(net);
        else
                return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the socket
 * might still be modified by calls to skb_set_owner_w() and
 * ipv6_local_error(), which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        unsigned int head_room;
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
        if (opt)
                head_room += opt->opt_nflen + opt->opt_flen;

        if (unlikely(skb_headroom(skb) < head_room)) {
                struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                if (!skb2) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_OUTDISCARDS);
                        kfree_skb(skb);
                        return -ENOBUFS;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                consume_skb(skb);
                skb = skb2;
        }

        if (opt) {
                seg_len += opt->opt_nflen + opt->opt_flen;

                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);

                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                ip6_autoflowlabel(net, np), fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = sk->sk_priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_OUT, skb->len);

                /* If the egress device is enslaved to an L3 master device,
                 * pass the skb to its handler for processing.
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* Hooks should never assume the socket lock is held;
                 * we promote our socket to non-const.
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dst->dev,
                               dst_output);
        }

        skb->dev = dst->dev;
        /* ipv6_local_error() does not require the socket lock;
         * we promote our socket to non-const.
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
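
/*
 * A rough usage sketch for ip6_xmit(), loosely modelled on how the TCP
 * IPv6 code sends a fully built control packet; the variable names and
 * error handling here are illustrative assumptions, not code from this
 * file:
 *
 *      struct flowi6 fl6;
 *      struct dst_entry *dst;
 *
 *      memset(&fl6, 0, sizeof(fl6));
 *      fl6.flowi6_proto = IPPROTO_TCP;
 *      fl6.daddr = *remote_addr;               // hypothetical
 *      fl6.saddr = *local_addr;                // hypothetical
 *
 *      dst = ip6_dst_lookup_flow(sk, &fl6, NULL);
 *      if (IS_ERR(dst))
 *              return PTR_ERR(dst);
 *      skb_dst_set(skb, dst);
 *      return ip6_xmit(sk, skb, &fl6, sk->sk_mark, NULL, 0);
 *
 * The caller owns routing (skb_dst must be set before the call);
 * ip6_xmit() only builds the IPv6 header, pushes any extension headers,
 * and hands the packet to the LOCAL_OUT hook, failing with -EMSGSIZE
 * when the packet exceeds the path MTU and may not be fragmented.
 */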

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        struct ipv6_pinfo *np = inet6_sk(sk);

                        if (np && np->rtalert_isolate &&
                            !net_eq(sock_net(sk), dev_net(skb->dev))) {
                                continue;
                        }
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}
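
/*
 * Delivery pattern worth noting in ip6_call_ra_chain(): every matching
 * Router Alert socket except the last gets a clone of the skb, and the
 * last one consumes the original, so N listeners cost only N - 1
 * clones.  With three matching sockets A, B and C, A and B each receive
 * a clone via rawv6_rcv() while C receives the original skb.
 */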

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* Unicast neighbour discovery messages destined
                         * to the proxied address are passed to the input
                         * function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
        if (skb->offload_l3_fwd_mark) {
                consume_skb(skb);
                return 0;
        }
#endif

        skb->tstamp = 0;
        return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
                return false;

        return true;
}
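
/*
 * Note the GSO case in ip6_pkt_too_big(): a GSO skb may be far larger
 * than the MTU here, but as long as skb_gso_validate_network_len()
 * confirms each resulting segment would fit (e.g. a 64 KB TSO skb with
 * gso_size 1400 against a 1500 byte MTU), the packet is not "too big",
 * because segmentation happens later on the transmit path.
 */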

int ip6_forward(struct sk_buff *skb)
{
        struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We do NOT process RA packets; we push them to user level
         *      AS IS, without any warranty that the application will be
         *      able to interpret them. The reason is that we cannot make
         *      anything clever here.
         *
         *      We are not the end node, so if the packet contains AH/ESP
         *      we cannot do anything. Defragmentation would also be a
         *      mistake; RA packets must not be fragmented, because there
         *      is no guarantee that different fragments will follow the
         *      same path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* The IPv6 specs say nothing about it, but it is clear that we
         * cannot send redirects for source routed frames.
         * We don't send redirects for frames decapsulated from IPsec.
         */
        if (IP6CB(skb)->iif == dst->dev->ifindex &&
            opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                 * and by source (inside ndisc_send_redirect).
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_forward(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling hops number delayed to point after skb COW */

        hdr->hop_limit--;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}
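
/*
 * When ip6_pkt_too_big() trips in ip6_forward(), the sender receives an
 * ICMPv6 Packet Too Big carrying the egress MTU: a 1500 byte packet
 * routed onto a 1400 byte link is dropped and the origin is told
 * mtu = 1400.  Because of the clamp above, the advertised value is
 * never below IPV6_MIN_MTU (1280).  IPv6 routers never fragment in
 * transit, so path MTU discovery is the sender's only recourse.
 */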

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

        skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_ext_copy(to, from);
        skb_copy_secmark(to, from);
}

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len, nexthdr_offset;
        int hroom, troom;
        __be32 frag_id;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;
        nexthdr_offset = prevhdr - skb_network_header(skb);

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        prevhdr = skb_network_header(skb) + nexthdr_offset;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        err = -ENOMEM;
                        goto fail;
                }
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);

                __skb_pull(skb, hlen);
                fh = __skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                fh->identification = frag_id;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one goes down.
                         */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = __skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb_mark_not_on_list(skb);
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                kfree_skb_list(frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        troom = rt->dst.dev->needed_tailroom;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                u8 *fragnexthdr_offset;

                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                 * then align the next start on an eight byte boundary.
                 */
                if (len < left) {
                        len &= ~7;
                }

                /* Allocate buffer */
                frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                 hroom + troom, GFP_ATOMIC);
                if (!frag) {
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                fragnexthdr_offset = skb_network_header(frag);
                fragnexthdr_offset += prevhdr - skb_network_header(skb);
                *fragnexthdr_offset = NEXTHDR_FRAGMENT;

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
                                     len));
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}
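
/*
 * Worked example of the slow-path arithmetic above, assuming a 1500
 * byte path MTU and a 40 byte unfragmentable header (hlen): the
 * per-fragment payload budget becomes mtu = 1500 - 40 - 8 = 1452,
 * rounded down to 1448 by "len &= ~7" for every fragment but the last.
 * Fragments then carry byte offsets 0, 1448, 2896, ...; since each
 * offset is a multiple of 8, storing it directly in fh->frag_off
 * leaves the low three bits clear for the IP6_MF flag, which matches
 * the wire format's 13-bit offset expressed in 8-octet units.
 */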

static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the unconnected case is not
         * very simple. Take into account that we do not support routing
         * by source, TOS, and MSG_DONTROUTE       --ANK (980726)
         *
         * 1. ip6_rt_check(): If the route was a host route,
         *    check that the cached destination is current.
         *    If it is a network route, we still may
         *    check its validity using the saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now,
         *    (because the main consumer of this service
         *    is tcp, which does not have this problem),
         *    so the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
              (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}
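
/*
 * One way to read ip6_rt_check(): the cached route is declared stale
 * only when both escape hatches fail.  For a /128 host route to, say,
 * 2001:db8::1, a flow toward that address passes the first clause; for
 * a network route (plen < 128), a connected socket whose daddr_cache
 * matches fl6->daddr passes the second.  An unconnected socket sending
 * to a new address over a network route therefore drops its cached dst
 * and triggers a fresh lookup.
 */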

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                struct fib6_info *from;
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

                rcu_read_lock();
                from = rt ? rcu_dereference(rt->from) : NULL;
                err = ip6_route_get_saddr(net, from, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                rcu_read_unlock();

                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}
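
/*
 * The two-pass structure above is easy to miss: the first
 * ip6_route_output() call may fail for source-specific routes because
 * fl6->saddr is still "any".  Once ip6_route_get_saddr() has filled in
 * a concrete source address, the failed result is released and the
 * second lookup via ip6_route_output_flags() runs with that saddr (and
 * RT6_LOOKUP_F_IFACE when an oif was given), which can now match.
 */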

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace to perform the lookup in
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer-encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
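
/*
 * Note the @final_dst handling above: the route is resolved toward
 * whatever fl6->daddr currently holds (for example the first hop of a
 * routing header), and only afterwards is daddr replaced with the true
 * final destination for xfrm_lookup_route(), so the IPsec policy check
 * sees the real endpoint.  Errors come back pointer-encoded; callers
 * must test with IS_ERR()/PTR_ERR() rather than checking for NULL.
 */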

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *      @connected: whether @sk is connected or not
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      In addition, for a connected socket, cache the dst in the socket
 *      if the current cache is not valid.
 *
 *      It returns a valid dst pointer on success, or a pointer-encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool connected)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (dst)
                return dst;

        dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
        if (connected && !IS_ERR(dst))
                ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not the first one, so the
                         * header space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}
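
/*
 * Worked example of the maxfraglen formula, assuming mtu = 1500 and
 * fragheaderlen = 40 (a bare IPv6 header): the fragmentable part
 * (1500 - 40 = 1460) is rounded down to a multiple of 8 (1456), then
 * the 8 byte fragment header is carved out of it, so
 * maxfraglen = 1456 + 40 - 8 = 1488 is the largest skb length that
 * still leaves every fragment 8-byte aligned within the MTU.
 */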

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *opt = ipc6->opt;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = sizeof(*opt);
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above miyazawa */
        }
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                        READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < IPV6_MIN_MTU)
                return -EINVAL;
        cork->base.fragsize = mtu;
        cork->base.gso_size = ipc6->gso_size;
        cork->base.tx_flags = 0;
        sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

        if (dst_allfrag(xfrm_dst_path(&rt->dst)))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        cork->base.transmit_time = ipc6->sockc.transmit_time;

        return 0;
}
1251
1252static int __ip6_append_data(struct sock *sk,
1253                             struct flowi6 *fl6,
1254                             struct sk_buff_head *queue,
1255                             struct inet_cork *cork,
1256                             struct inet6_cork *v6_cork,
1257                             struct page_frag *pfrag,
1258                             int getfrag(void *from, char *to, int offset,
1259                                         int len, int odd, struct sk_buff *skb),
1260                             void *from, int length, int transhdrlen,
1261                             unsigned int flags, struct ipcm6_cookie *ipc6)
1262{
1263        struct sk_buff *skb, *skb_prev = NULL;
1264        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1265        struct ubuf_info *uarg = NULL;
1266        int exthdrlen = 0;
1267        int dst_exthdrlen = 0;
1268        int hh_len;
1269        int copy;
1270        int err;
1271        int offset = 0;
1272        u32 tskey = 0;
1273        struct rt6_info *rt = (struct rt6_info *)cork->dst;
1274        struct ipv6_txoptions *opt = v6_cork->opt;
1275        int csummode = CHECKSUM_NONE;
1276        unsigned int maxnonfragsize, headersize;
1277        unsigned int wmem_alloc_delta = 0;
1278        bool paged, extra_uref;
1279
1280        skb = skb_peek_tail(queue);
1281        if (!skb) {
1282                exthdrlen = opt ? opt->opt_flen : 0;
1283                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1284        }
1285
1286        paged = !!cork->gso_size;
1287        mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1288        orig_mtu = mtu;
1289
1290        if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1291            sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1292                tskey = sk->sk_tskey++;
1293
1294        hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1295
1296        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1297                        (opt ? opt->opt_nflen : 0);
1298        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1299                     sizeof(struct frag_hdr);
1300
1301        headersize = sizeof(struct ipv6hdr) +
1302                     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1303                     (dst_allfrag(&rt->dst) ?
1304                      sizeof(struct frag_hdr) : 0) +
1305                     rt->rt6i_nfheader_len;
1306
1307        /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1308         * the first fragment
1309         */
1310        if (headersize + transhdrlen > mtu)
1311                goto emsgsize;
1312
1313        if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1314            (sk->sk_protocol == IPPROTO_UDP ||
1315             sk->sk_protocol == IPPROTO_RAW)) {
1316                ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1317                                sizeof(struct ipv6hdr));
1318                goto emsgsize;
1319        }
1320
1321        if (ip6_sk_ignore_df(sk))
1322                maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1323        else
1324                maxnonfragsize = mtu;
1325
1326        if (cork->length + length > maxnonfragsize - headersize) {
1327emsgsize:
1328                pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1329                ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1330                return -EMSGSIZE;
1331        }
1332
1333        /* CHECKSUM_PARTIAL only with no extension headers and when
1334         * we are not going to fragment
1335         */
1336        if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1337            headersize == sizeof(struct ipv6hdr) &&
1338            length <= mtu - headersize &&
1339            (!(flags & MSG_MORE) || cork->gso_size) &&
1340            rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1341                csummode = CHECKSUM_PARTIAL;
1342
1343        if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1344                uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
1345                if (!uarg)
1346                        return -ENOBUFS;
1347                extra_uref = true;
1348                if (rt->dst.dev->features & NETIF_F_SG &&
1349                    csummode == CHECKSUM_PARTIAL) {
1350                        paged = true;
1351                } else {
1352                        uarg->zerocopy = 0;
1353                        skb_zcopy_set(skb, uarg, &extra_uref);
1354                }
1355        }
1356
1357        /*
1358         * Let's try using as much space as possible.
1359         * Use MTU if total length of the message fits into the MTU.
1360         * Otherwise, we need to reserve fragment header and
1361         * fragment alignment (= 8-15 octects, in total).
1362         *
1363         * Note that we may need to "move" the data from the tail of
1364         * of the buffer to the new fragment when we split
1365         * the message.
1366         *
1367         * FIXME: It may be fragmented into multiple chunks
1368         *        at once if non-fragmentable extension headers
1369         *        are too large.
1370         * --yoshfuji
1371         */
1372
1373        cork->length += length;
1374        if (!skb)
1375                goto alloc_new_skb;
1376
1377        while (length > 0) {
1378                /* Check if the remaining data fits into current packet. */
1379                copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1380                if (copy < length)
1381                        copy = maxfraglen - skb->len;
1382
1383                if (copy <= 0) {
1384                        char *data;
1385                        unsigned int datalen;
1386                        unsigned int fraglen;
1387                        unsigned int fraggap;
1388                        unsigned int alloclen;
1389                        unsigned int pagedlen;
1390alloc_new_skb:
1391                        /* There's no room in the current skb */
1392                        if (skb)
1393                                fraggap = skb->len - maxfraglen;
1394                        else
1395                                fraggap = 0;
1396                        /* update mtu and maxfraglen if necessary */
1397                        if (!skb || !skb_prev)
1398                                ip6_append_data_mtu(&mtu, &maxfraglen,
1399                                                    fragheaderlen, skb, rt,
1400                                                    orig_mtu);
1401
1402                        skb_prev = skb;
1403
1404                        /*
1405                         * If remaining data exceeds the mtu,
1406                         * we know we need more fragment(s).
1407                         */
1408                        datalen = length + fraggap;
1409
1410                        if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1411                                datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1412                        fraglen = datalen + fragheaderlen;
1413                        pagedlen = 0;
1414
1415                        if ((flags & MSG_MORE) &&
1416                            !(rt->dst.dev->features&NETIF_F_SG))
1417                                alloclen = mtu;
1418                        else if (!paged)
1419                                alloclen = fraglen;
1420                        else {
1421                                alloclen = min_t(int, fraglen, MAX_HEADER);
1422                                pagedlen = fraglen - alloclen;
1423                        }
1424
1425                        alloclen += dst_exthdrlen;
1426
1427                        if (datalen != length + fraggap) {
1428                                /*
1429                                 * this is not the last fragment; the trailer
1430                                 * space is regarded as data space.
1431                                 */
1432                                datalen += rt->dst.trailer_len;
1433                        }
1434
1435                        alloclen += rt->dst.trailer_len;
1436                        fraglen = datalen + fragheaderlen;
1437
1438                        /*
1439                         * We just reserve space for the fragment header.
1440                         * Note: this may be an over-allocation if the
1441                         * message (without MSG_MORE) fits into the MTU.
1442                         */
1443                        alloclen += sizeof(struct frag_hdr);
1444
1445                        copy = datalen - transhdrlen - fraggap - pagedlen;
1446                        if (copy < 0) {
1447                                err = -EINVAL;
1448                                goto error;
1449                        }
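                            /* Only the first skb carries the transport
                             * header (transhdrlen != 0); it is charged
                             * to the socket by sock_alloc_send_skb() and
                             * may sleep.  Later fragments use a bare
                             * alloc_skb(), bounded by twice sk_sndbuf,
                             * with the truesize charge batched into
                             * wmem_alloc_delta.
                             */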
1450                        if (transhdrlen) {
1451                                skb = sock_alloc_send_skb(sk,
1452                                                alloclen + hh_len,
1453                                                (flags & MSG_DONTWAIT), &err);
1454                        } else {
1455                                skb = NULL;
1456                                if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1457                                    2 * sk->sk_sndbuf)
1458                                        skb = alloc_skb(alloclen + hh_len,
1459                                                        sk->sk_allocation);
1460                                if (unlikely(!skb))
1461                                        err = -ENOBUFS;
1462                        }
1463                        if (!skb)
1464                                goto error;
1465                        /*
1466                         *      Fill in the control structures
1467                         */
1468                        skb->protocol = htons(ETH_P_IPV6);
1469                        skb->ip_summed = csummode;
1470                        skb->csum = 0;
1471                        /* reserve for fragmentation and ipsec header */
1472                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1473                                    dst_exthdrlen);
1474
1475                        /*
1476                         *      Find where to start putting bytes
1477                         */
1478                        data = skb_put(skb, fraglen - pagedlen);
1479                        skb_set_network_header(skb, exthdrlen);
1480                        data += fragheaderlen;
1481                        skb->transport_header = (skb->network_header +
1482                                                 fragheaderlen);
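                            /* Pull the overhang beyond the fragment
                             * boundary out of the previous skb, fix up
                             * both software checksums, and trim the
                             * previous skb back to maxfraglen.
                             */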
1483                        if (fraggap) {
1484                                skb->csum = skb_copy_and_csum_bits(
1485                                        skb_prev, maxfraglen,
1486                                        data + transhdrlen, fraggap, 0);
1487                                skb_prev->csum = csum_sub(skb_prev->csum,
1488                                                          skb->csum);
1489                                data += fraggap;
1490                                pskb_trim_unique(skb_prev, maxfraglen);
1491                        }
1492                        if (copy > 0 &&
1493                            getfrag(from, data + transhdrlen, offset,
1494                                    copy, fraggap, skb) < 0) {
1495                                err = -EFAULT;
1496                                kfree_skb(skb);
1497                                goto error;
1498                        }
1499
1500                        offset += copy;
1501                        length -= copy + transhdrlen;
1502                        transhdrlen = 0;
1503                        exthdrlen = 0;
1504                        dst_exthdrlen = 0;
1505
1506                        /* Only the initial fragment is timestamped */
1507                        skb_shinfo(skb)->tx_flags = cork->tx_flags;
1508                        cork->tx_flags = 0;
1509                        skb_shinfo(skb)->tskey = tskey;
1510                        tskey = 0;
1511                        skb_zcopy_set(skb, uarg, &extra_uref);
1512
1513                        if ((flags & MSG_CONFIRM) && !skb_prev)
1514                                skb_set_dst_pending_confirm(skb, 1);
1515
1516                        /*
1517                         * Put the packet on the pending queue
1518                         */
1519                        if (!skb->destructor) {
1520                                skb->destructor = sock_wfree;
1521                                skb->sk = sk;
1522                                wmem_alloc_delta += skb->truesize;
1523                        }
1524                        __skb_queue_tail(queue, skb);
1525                        continue;
1526                }
1527
1528                if (copy > length)
1529                        copy = length;
1530
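                    /* Three ways to append into the current skb: copy
                     * into linear tailroom (device lacks SG), coalesce
                     * into the socket's page fragment, or hand pages
                     * over via the zerocopy iterator.
                     */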
1531                if (!(rt->dst.dev->features&NETIF_F_SG) &&
1532                    skb_tailroom(skb) >= copy) {
1533                        unsigned int off;
1534
1535                        off = skb->len;
1536                        if (getfrag(from, skb_put(skb, copy),
1537                                                offset, copy, off, skb) < 0) {
1538                                __skb_trim(skb, off);
1539                                err = -EFAULT;
1540                                goto error;
1541                        }
1542                } else if (!uarg || !uarg->zerocopy) {
1543                        int i = skb_shinfo(skb)->nr_frags;
1544
1545                        err = -ENOMEM;
1546                        if (!sk_page_frag_refill(sk, pfrag))
1547                                goto error;
1548
1549                        if (!skb_can_coalesce(skb, i, pfrag->page,
1550                                              pfrag->offset)) {
1551                                err = -EMSGSIZE;
1552                                if (i == MAX_SKB_FRAGS)
1553                                        goto error;
1554
1555                                __skb_fill_page_desc(skb, i, pfrag->page,
1556                                                     pfrag->offset, 0);
1557                                skb_shinfo(skb)->nr_frags = ++i;
1558                                get_page(pfrag->page);
1559                        }
1560                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
1561                        if (getfrag(from,
1562                                    page_address(pfrag->page) + pfrag->offset,
1563                                    offset, copy, skb->len, skb) < 0)
1564                                goto error_efault;
1565
1566                        pfrag->offset += copy;
1567                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1568                        skb->len += copy;
1569                        skb->data_len += copy;
1570                        skb->truesize += copy;
1571                        wmem_alloc_delta += copy;
1572                } else {
1573                        err = skb_zerocopy_iter_dgram(skb, from, copy);
1574                        if (err < 0)
1575                                goto error;
1576                }
1577                offset += copy;
1578                length -= copy;
1579        }
1580
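            /* Apply the batched truesize charge with a single atomic
             * add instead of one per fragment.
             */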
1581        if (wmem_alloc_delta)
1582                refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1583        return 0;
1584
1585error_efault:
1586        err = -EFAULT;
1587error:
1588        if (uarg)
1589                sock_zerocopy_put_abort(uarg, extra_uref);
1590        cork->length -= length;
1591        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1592        refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1593        return err;
1594}
1595
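    /*
     *      ip6_append_data - queue datagram payload on a corked socket.
     *
     *      Datagram paths (UDP, raw sockets, ping and ICMPv6 replies)
     *      call this, possibly repeatedly under MSG_MORE, to buffer
     *      payload on sk->sk_write_queue.  The first call sets up the
     *      cork from ipc6, rt and fl6; later calls reuse the corked
     *      flow.  The queue is turned into a datagram by
     *      ip6_push_pending_frames() or dropped by
     *      ip6_flush_pending_frames().
     */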
1596int ip6_append_data(struct sock *sk,
1597                    int getfrag(void *from, char *to, int offset, int len,
1598                                int odd, struct sk_buff *skb),
1599                    void *from, int length, int transhdrlen,
1600                    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1601                    struct rt6_info *rt, unsigned int flags)
1602{
1603        struct inet_sock *inet = inet_sk(sk);
1604        struct ipv6_pinfo *np = inet6_sk(sk);
1605        int exthdrlen;
1606        int err;
1607
1608        if (flags&MSG_PROBE)
1609                return 0;
1610        if (skb_queue_empty(&sk->sk_write_queue)) {
1611                /*
1612                 * setup for corking
1613                 */
1614                err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1615                                     ipc6, rt, fl6);
1616                if (err)
1617                        return err;
1618
1619                exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1620                length += exthdrlen;
1621                transhdrlen += exthdrlen;
1622        } else {
1623                fl6 = &inet->cork.fl.u.ip6;
1624                transhdrlen = 0;
1625        }
1626
1627        return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1628                                 &np->cork, sk_page_frag(sk), getfrag,
1629                                 from, length, transhdrlen, flags, ipc6);
1630}
1631EXPORT_SYMBOL_GPL(ip6_append_data);
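    /* A minimal sketch of the corked calling pattern, modelled on
     * udpv6_sendmsg() (error paths trimmed; real UDP pushes through its
     * own wrapper so the UDP header gets filled in first):
     *
     *      lock_sock(sk);
     *      err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
     *                            sizeof(struct udphdr), &ipc6, &fl6,
     *                            (struct rt6_info *)dst, msg->msg_flags);
     *      if (err)
     *              ip6_flush_pending_frames(sk);
     *      else if (!(msg->msg_flags & MSG_MORE))
     *              err = ip6_push_pending_frames(sk);
     *      release_sock(sk);
     */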
1632
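    /* Undo everything the cork pinned: the duplicated extension-header
     * options, the held route (and its ALLFRAG flag) and the saved flow.
     */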
1633static void ip6_cork_release(struct inet_cork_full *cork,
1634                             struct inet6_cork *v6_cork)
1635{
1636        if (v6_cork->opt) {
1637                kfree(v6_cork->opt->dst0opt);
1638                kfree(v6_cork->opt->dst1opt);
1639                kfree(v6_cork->opt->hopopt);
1640                kfree(v6_cork->opt->srcrt);
1641                kfree(v6_cork->opt);
1642                v6_cork->opt = NULL;
1643        }
1644
1645        if (cork->base.dst) {
1646                dst_release(cork->base.dst);
1647                cork->base.dst = NULL;
1648                cork->base.flags &= ~IPCORK_ALLFRAG;
1649        }
1650        memset(&cork->fl, 0, sizeof(cork->fl));
1651}
1652
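    /* Collapse the queued skbs into one datagram: the head skb keeps the
     * headers, the rest are chained onto its frag_list, and the IPv6
     * header plus any extension headers are pushed in front.  ignore_df
     * is set from the socket so ip6_fragment() may re-split the result
     * on output.
     */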
1653struct sk_buff *__ip6_make_skb(struct sock *sk,
1654                               struct sk_buff_head *queue,
1655                               struct inet_cork_full *cork,
1656                               struct inet6_cork *v6_cork)
1657{
1658        struct sk_buff *skb, *tmp_skb;
1659        struct sk_buff **tail_skb;
1660        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1661        struct ipv6_pinfo *np = inet6_sk(sk);
1662        struct net *net = sock_net(sk);
1663        struct ipv6hdr *hdr;
1664        struct ipv6_txoptions *opt = v6_cork->opt;
1665        struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1666        struct flowi6 *fl6 = &cork->fl.u.ip6;
1667        unsigned char proto = fl6->flowi6_proto;
1668
1669        skb = __skb_dequeue(queue);
1670        if (!skb)
1671                goto out;
1672        tail_skb = &(skb_shinfo(skb)->frag_list);
1673
1674        /* move skb->data from the ext header to the IP header */
1675        if (skb->data < skb_network_header(skb))
1676                __skb_pull(skb, skb_network_offset(skb));
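            /* Chain the remaining skbs onto the head's frag_list; their
             * byte counts migrate into the head, and their destructors
             * are cleared so the head owns all of the memory accounting.
             */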
1677        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1678                __skb_pull(tmp_skb, skb_network_header_len(skb));
1679                *tail_skb = tmp_skb;
1680                tail_skb = &(tmp_skb->next);
1681                skb->len += tmp_skb->len;
1682                skb->data_len += tmp_skb->len;
1683                skb->truesize += tmp_skb->truesize;
1684                tmp_skb->destructor = NULL;
1685                tmp_skb->sk = NULL;
1686        }
1687
1688        /* Allow local fragmentation. */
1689        skb->ignore_df = ip6_sk_ignore_df(sk);
1690
1691        *final_dst = fl6->daddr;
1692        __skb_pull(skb, skb_network_header_len(skb));
1693        if (opt && opt->opt_flen)
1694                ipv6_push_frag_opts(skb, opt, &proto);
1695        if (opt && opt->opt_nflen)
1696                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1697
1698        skb_push(skb, sizeof(struct ipv6hdr));
1699        skb_reset_network_header(skb);
1700        hdr = ipv6_hdr(skb);
1701
1702        ip6_flow_hdr(hdr, v6_cork->tclass,
1703                     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1704                                        ip6_autoflowlabel(net, np), fl6));
1705        hdr->hop_limit = v6_cork->hop_limit;
1706        hdr->nexthdr = proto;
1707        hdr->saddr = fl6->saddr;
1708        hdr->daddr = *final_dst;
1709
1710        skb->priority = sk->sk_priority;
1711        skb->mark = sk->sk_mark;
1712
1713        skb->tstamp = cork->base.transmit_time;
1714
1715        skb_dst_set(skb, dst_clone(&rt->dst));
1716        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1717        if (proto == IPPROTO_ICMPV6) {
1718                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1719
1720                ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1721                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1722        }
1723
1724        ip6_cork_release(cork, v6_cork);
1725out:
1726        return skb;
1727}
1728
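    /* Hand one finished datagram to the output path.  Positive return
     * codes from the queueing layer (congestion notifications) are
     * mapped to errnos by net_xmit_errno(), and failures count as
     * output discards.
     */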
1729int ip6_send_skb(struct sk_buff *skb)
1730{
1731        struct net *net = sock_net(skb->sk);
1732        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1733        int err;
1734
1735        err = ip6_local_out(net, skb->sk, skb);
1736        if (err) {
1737                if (err > 0)
1738                        err = net_xmit_errno(err);
1739                if (err)
1740                        IP6_INC_STATS(net, rt->rt6i_idev,
1741                                      IPSTATS_MIB_OUTDISCARDS);
1742        }
1743
1744        return err;
1745}
1746
1747int ip6_push_pending_frames(struct sock *sk)
1748{
1749        struct sk_buff *skb;
1750
1751        skb = ip6_finish_skb(sk);
1752        if (!skb)
1753                return 0;
1754
1755        return ip6_send_skb(skb);
1756}
1757EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1758
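    /* Throw away whatever is still queued on the cork, counting each
     * routed skb as an output discard, then release the cork state.
     */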
1759static void __ip6_flush_pending_frames(struct sock *sk,
1760                                       struct sk_buff_head *queue,
1761                                       struct inet_cork_full *cork,
1762                                       struct inet6_cork *v6_cork)
1763{
1764        struct sk_buff *skb;
1765
1766        while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1767                if (skb_dst(skb))
1768                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1769                                      IPSTATS_MIB_OUTDISCARDS);
1770                kfree_skb(skb);
1771        }
1772
1773        ip6_cork_release(cork, v6_cork);
1774}
1775
1776void ip6_flush_pending_frames(struct sock *sk)
1777{
1778        __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1779                                   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1780}
1781EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1782
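    /* Single-shot variant of append + make: the whole datagram is built
     * on a private queue with caller-provided cork state, so nothing
     * touches sk->sk_write_queue.  Used on the uncorked fast path (e.g.
     * plain UDP sends without MSG_MORE).
     */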
1783struct sk_buff *ip6_make_skb(struct sock *sk,
1784                             int getfrag(void *from, char *to, int offset,
1785                                         int len, int odd, struct sk_buff *skb),
1786                             void *from, int length, int transhdrlen,
1787                             struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1788                             struct rt6_info *rt, unsigned int flags,
1789                             struct inet_cork_full *cork)
1790{
1791        struct inet6_cork v6_cork;
1792        struct sk_buff_head queue;
1793        int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1794        int err;
1795
1796        if (flags & MSG_PROBE)
1797                return NULL;
1798
1799        __skb_queue_head_init(&queue);
1800
1801        cork->base.flags = 0;
1802        cork->base.addr = 0;
1803        cork->base.opt = NULL;
1804        cork->base.dst = NULL;
1805        v6_cork.opt = NULL;
1806        err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1807        if (err) {
1808                ip6_cork_release(cork, &v6_cork);
1809                return ERR_PTR(err);
1810        }
1811        if (ipc6->dontfrag < 0)
1812                ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1813
1814        err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1815                                &current->task_frag, getfrag, from,
1816                                length + exthdrlen, transhdrlen + exthdrlen,
1817                                flags, ipc6);
1818        if (err) {
1819                __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1820                return ERR_PTR(err);
1821        }
1822
1823        return __ip6_make_skb(sk, &queue, cork, &v6_cork);
1824}
1825
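    /* A minimal sketch of the uncorked fast path, again modelled on
     * udpv6_sendmsg(); the cork lives on the caller's stack, and
     * udp_v6_send_skb() stands in for whatever finisher the caller uses
     * to add its header and transmit the skb:
     *
     *      struct inet_cork_full cork;
     *      struct sk_buff *skb;
     *
     *      skb = ip6_make_skb(sk, ip_generic_getfrag, msg, len,
     *                         sizeof(struct udphdr), &ipc6, &fl6,
     *                         (struct rt6_info *)dst, msg->msg_flags,
     *                         &cork);
     *      err = PTR_ERR(skb);
     *      if (!IS_ERR_OR_NULL(skb))
     *              err = udp_v6_send_skb(skb, &fl6, &cork.base);
     */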