linux/net/ipv6/ip6_output.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct inet6_dev *idev = ip6_dst_idev(dst);
        unsigned int hh_len = LL_RESERVED_SPACE(dev);
        const struct in6_addr *daddr, *nexthop;
        struct ipv6hdr *hdr;
        struct neighbour *neigh;
        int ret;

        /* Be paranoid, rather than too clever. */
        if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
                skb = skb_expand_head(skb, hh_len);
                if (!skb) {
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                        return -ENOMEM;
                }
        }

        hdr = ipv6_hdr(skb);
        daddr = &hdr->daddr;
        if (ipv6_addr_is_multicast(daddr)) {
                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_is_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                         * is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (hdr->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
                if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
        neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

        if (unlikely(IS_ERR_OR_NULL(neigh))) {
                if (unlikely(!neigh))
                        neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
                if (IS_ERR(neigh)) {
                        rcu_read_unlock_bh();
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
                        kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
                        return -EINVAL;
                }
        }
        sock_confirm_neigh(skb, neigh);
        ret = neigh_output(neigh, skb, false);
        rcu_read_unlock_bh();
        return ret;
}

static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
                                    struct sk_buff *skb, unsigned int mtu)
{
        struct sk_buff *segs, *nskb;
        netdev_features_t features;
        int ret = 0;

        /* Please see corresponding comment in ip_finish_output_gso
         * describing the cases where GSO segment length exceeds the
         * egress MTU.
         */
        features = netif_skb_features(skb);
        segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
        if (IS_ERR_OR_NULL(segs)) {
                kfree_skb(skb);
                return -ENOMEM;
        }

        consume_skb(skb);

        skb_list_walk_safe(segs, segs, nskb) {
                int err;

                skb_mark_not_on_list(segs);
                err = ip6_fragment(net, sk, segs, ip6_finish_output2);
                if (err && ret == 0)
                        ret = err;
        }

        return ret;
}

static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm) {
                IP6CB(skb)->flags |= IP6SKB_REROUTED;
                return dst_output(net, sk, skb);
        }
#endif

        mtu = ip6_skb_dst_mtu(skb);
        if (skb_is_gso(skb) &&
            !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
            !skb_gso_validate_network_len(skb, mtu))
                return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

        if ((skb->len > mtu && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        switch (ret) {
        case NET_XMIT_SUCCESS:
        case NET_XMIT_CN:
                return __ip6_finish_output(net, sk, skb) ? : ret;
        default:
                kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
                return ret;
        }
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, indev, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);
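
/* Usage note (editor's addition): ip6_output() is normally reached through
 * dst_output() rather than called directly; the IPv6 routing code installs
 * it as the dst output callback. A minimal sketch of the dispatch, assuming
 * an rt6_info prepared by the route layer:
 *
 *      rt->dst.output = ip6_output;
 *      ...
 *      err = dst_output(net, sk, skb);    (dispatches to ip6_output)
 *
 * NF_HOOK_COND() above traverses the NF_INET_POST_ROUTING hook only when the
 * packet has not already been rerouted (IP6SKB_REROUTED); a rerouted packet
 * goes straight to ip6_finish_output().
 */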

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
        if (!np->autoflowlabel_set)
                return ip6_default_np_autolabel(net);
        else
                return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the socket may
 * still be modified by calls to skb_set_owner_w() and ipv6_local_error(),
 * which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct inet6_dev *idev = ip6_dst_idev(dst);
        struct hop_jumbo_hdr *hop_jumbo;
        int hoplen = sizeof(*hop_jumbo);
        unsigned int head_room;
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
        if (opt)
                head_room += opt->opt_nflen + opt->opt_flen;

        if (unlikely(head_room > skb_headroom(skb))) {
                skb = skb_expand_head(skb, head_room);
                if (!skb) {
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                        return -ENOBUFS;
                }
        }

        if (opt) {
                seg_len += opt->opt_nflen + opt->opt_flen;

                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);

                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        if (unlikely(seg_len > IPV6_MAXPLEN)) {
                hop_jumbo = skb_push(skb, hoplen);

                hop_jumbo->nexthdr = proto;
                hop_jumbo->hdrlen = 0;
                hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
                hop_jumbo->tlv_len = 4;
                hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);

                proto = IPPROTO_HOPOPTS;
                seg_len = 0;
                IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                ip6_autoflowlabel(net, np), fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

                /* if egress device is enslaved to an L3 master device pass the
                 * skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume socket lock is held.
                 * we promote our socket to non const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dev,
                               dst_output);
        }

        skb->dev = dev;
        /* ipv6_local_error() does not require socket lock,
         * we promote our socket to non const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
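
/* Example (editor's sketch): a transport protocol typically attaches a
 * routed dst to the skb and hands it to ip6_xmit(); the lines below are
 * illustrative only, see tcp_v6_send_synack() in net/ipv6/tcp_ipv6.c for a
 * real call site.
 *
 *      skb_dst_set(skb, dst);          (dst from a prior route lookup)
 *      err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, tclass, priority);
 *      err = net_xmit_eval(err);       (NET_XMIT_CN still counts as success)
 */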

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;

                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        struct ipv6_pinfo *np = inet6_sk(sk);

                        if (np && np->rtalert_isolate &&
                            !net_eq(sock_net(sk), dev_net(skb->dev))) {
                                continue;
                        }
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else {
                offset = sizeof(struct ipv6hdr);
        }

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* For a unicast neighbour discovery message destined
                         * to the proxied address, pass it to the input
                         * function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /* The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
        if (skb->offload_l3_fwd_mark) {
                consume_skb(skb);
                return 0;
        }
#endif

        skb_clear_tstamp(skb);
        return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
                return false;

        return true;
}
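
/* Editor's summary of ip6_pkt_too_big(): a packet is "too big" only when
 * skb->len exceeds the MTU and no exemption applies. Conntrack-defragmented
 * packets are judged by the largest fragment originally received
 * (frag_max_size), packets with ignore_df set pass, and GSO packets pass
 * when every resulting segment fits the MTU.
 */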

int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        struct inet6_dev *idev;
        SKB_DR(reason);
        u32 mtu;

        idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!net->ipv6.devconf_all->disable_policy &&
            (!idev || !idev->cnf.disable_policy) &&
            !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /* We do NOT do any processing on RA packets; we push them to user
         * level AS IS, without any warranty that the application will be
         * able to interpret them, because we cannot do anything clever here.
         *
         * We are not an end node, so if the packet contains AH/ESP we
         * cannot do anything with it. Defragmentation would also be a
         * mistake: RA packets must not be fragmented, because there is no
         * guarantee that different fragments will follow the same path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      Check and decrement the hop limit.
         */
        if (hdr->hop_limit <= 1) {
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

                kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);

                if (proxied > 0) {
                        hdr->hop_limit--;
                        return ip6_input(skb);
                } else if (proxied < 0) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                SKB_DR_SET(reason, XFRM_POLICY);
                goto drop;
        }
        dst = skb_dst(skb);

        /* The IPv6 specs say nothing about it, but it is clear that we
         * cannot send redirects for source-routed frames. We also do not
         * send redirects for frames decapsulated from IPsec.
         */
        if (IP6CB(skb)->iif == dst->dev->ifindex &&
            opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /* The incoming and outgoing devices are the same:
                 * send a redirect.
                 */
                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                 * and by source (inside ndisc_send_redirect).
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_maybe_forward(dst, true);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling the hop limit is delayed until after the skb COW. */
        hdr->hop_limit--;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
        SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
        kfree_skb_reason(skb, reason);
        return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

        skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_ext_copy(to, from);
        skb_copy_secmark(to, from);
}

int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
                      u8 nexthdr, __be32 frag_id,
                      struct ip6_fraglist_iter *iter)
{
        unsigned int first_len;
        struct frag_hdr *fh;

        /* BUILD HEADER */
        *prevhdr = NEXTHDR_FRAGMENT;
        iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
        if (!iter->tmp_hdr)
                return -ENOMEM;

        iter->frag = skb_shinfo(skb)->frag_list;
        skb_frag_list_init(skb);

        iter->offset = 0;
        iter->hlen = hlen;
        iter->frag_id = frag_id;
        iter->nexthdr = nexthdr;

        __skb_pull(skb, hlen);
        fh = __skb_push(skb, sizeof(struct frag_hdr));
        __skb_push(skb, hlen);
        skb_reset_network_header(skb);
        memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

        fh->nexthdr = nexthdr;
        fh->reserved = 0;
        fh->frag_off = htons(IP6_MF);
        fh->identification = frag_id;

        first_len = skb_pagelen(skb);
        skb->data_len = first_len - skb_headlen(skb);
        skb->len = first_len;
        ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

        return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

void ip6_fraglist_prepare(struct sk_buff *skb,
                          struct ip6_fraglist_iter *iter)
{
        struct sk_buff *frag = iter->frag;
        unsigned int hlen = iter->hlen;
        struct frag_hdr *fh;

        frag->ip_summed = CHECKSUM_NONE;
        skb_reset_transport_header(frag);
        fh = __skb_push(frag, sizeof(struct frag_hdr));
        __skb_push(frag, hlen);
        skb_reset_network_header(frag);
        memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
        iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
        fh->nexthdr = iter->nexthdr;
        fh->reserved = 0;
        fh->frag_off = htons(iter->offset);
        if (frag->next)
                fh->frag_off |= htons(IP6_MF);
        fh->identification = iter->frag_id;
        ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
        ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);
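
/* Example (editor's sketch): the fast path of ip6_fragment() below drives
 * the fraglist iterator as follows; the names mirror that caller.
 *
 *      struct ip6_fraglist_iter iter;
 *
 *      err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, &iter);
 *      for (;;) {
 *              if (iter.frag)
 *                      ip6_fraglist_prepare(skb, &iter);
 *              err = output(net, sk, skb);
 *              if (err || !iter.frag)
 *                      break;
 *              skb = ip6_fraglist_next(&iter);
 *      }
 *      kfree(iter.tmp_hdr);
 */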

void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
                   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
                   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
        state->prevhdr = prevhdr;
        state->nexthdr = nexthdr;
        state->frag_id = frag_id;

        state->hlen = hlen;
        state->mtu = mtu;

        state->left = skb->len - hlen;  /* Space per frame */
        state->ptr = hlen;              /* Where to start from */

        state->hroom = hdr_room;
        state->troom = needed_tailroom;

        state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
        u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
        struct sk_buff *frag;
        struct frag_hdr *fh;
        unsigned int len;

        len = state->left;
        /* IF: it doesn't fit, use 'mtu' - the data space left */
        if (len > state->mtu)
                len = state->mtu;
        /* IF: we are not sending up to and including the packet end
         * then align the next start on an eight byte boundary
         */
        if (len < state->left)
                len &= ~7;

        /* Allocate buffer */
        frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
                         state->hroom + state->troom, GFP_ATOMIC);
        if (!frag)
                return ERR_PTR(-ENOMEM);

        /*
         *      Set up data on packet
         */
        ip6_copy_metadata(frag, skb);
        skb_reserve(frag, state->hroom);
        skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
        skb_reset_network_header(frag);
        fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
        frag->transport_header = (frag->network_header + state->hlen +
                                  sizeof(struct frag_hdr));

        /*
         *      Charge the memory for the fragment to any owner
         *      it might possess
         */
        if (skb->sk)
                skb_set_owner_w(frag, skb->sk);

        /*
         *      Copy the packet header into the new buffer.
         */
        skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

        fragnexthdr_offset = skb_network_header(frag);
        fragnexthdr_offset += prevhdr - skb_network_header(skb);
        *fragnexthdr_offset = NEXTHDR_FRAGMENT;

        /*
         *      Build fragment header.
         */
        fh->nexthdr = state->nexthdr;
        fh->reserved = 0;
        fh->identification = state->frag_id;

        /*
         *      Copy a block of the IP datagram.
         */
        BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
                             len));
        state->left -= len;

        fh->frag_off = htons(state->offset);
        if (state->left > 0)
                fh->frag_off |= htons(IP6_MF);
        ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

        state->ptr += len;
        state->offset += len;

        return frag;
}
EXPORT_SYMBOL(ip6_frag_next);
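
/* Example (editor's sketch): the slow path of ip6_fragment() below consumes
 * this state machine; each call returns a freshly allocated fragment to
 * transmit while the original skb serves only as the data source.
 *
 *      struct ip6_frag_state state;
 *
 *      ip6_frag_init(skb, hlen, mtu, tailroom, hroom, prevhdr, nexthdr,
 *                    frag_id, &state);
 *      while (state.left > 0) {
 *              frag = ip6_frag_next(skb, &state);
 *              if (IS_ERR(frag))
 *                      return PTR_ERR(frag);
 *              err = output(net, sk, frag);
 *      }
 *      consume_skb(skb);
 */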

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        bool mono_delivery_time = skb->mono_delivery_time;
        struct ip6_frag_state state;
        unsigned int mtu, hlen, nexthdr_offset;
        ktime_t tstamp = skb->tstamp;
        int hroom, err = 0;
        __be32 frag_id;
        u8 *prevhdr, nexthdr = 0;

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;
        nexthdr_offset = prevhdr - skb_network_header(skb);

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        prevhdr = skb_network_header(skb) + nexthdr_offset;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct ip6_fraglist_iter iter;
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
                                        &iter);
                if (err < 0)
                        goto fail;

                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one has gone down.
                         */
                        if (iter.frag)
                                ip6_fraglist_prepare(skb, &iter);

                        skb_set_delivery_time(skb, tstamp, mono_delivery_time);
                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !iter.frag)
                                break;

                        skb = ip6_fraglist_next(&iter);
                }

                kfree(iter.tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                kfree_skb_list(iter.frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        /*
         *      Fragment the datagram.
         */
        ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
                      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
                      &state);

        /*
         *      Keep copying data until we run out.
         */
        while (state.left > 0) {
                frag = ip6_frag_next(skb, &state);
                if (IS_ERR(frag)) {
                        err = PTR_ERR(frag);
                        goto fail;
                }

                /*
                 *      Put this fragment into the sending queue.
                 */
                skb_set_delivery_time(frag, tstamp, mono_delivery_time);
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_gso_disable(skb->sk);

        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}
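
/* Worked example (editor's addition) of the sizing arithmetic above: with a
 * 1500-byte path MTU and a bare 40-byte IPv6 header (hlen == 40), the
 * per-fragment payload budget is 1500 - 40 - 8 = 1452 bytes, 8 being
 * sizeof(struct frag_hdr). ip6_frag_next() then rounds every non-final
 * fragment down to a multiple of 8 (len &= ~7), so each full fragment
 * carries 1448 bytes of payload.
 */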

static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the unconnected case is not
         * very simple. Take into account that we do not support routing
         * by source, TOS, and MSG_DONTROUTE.          --ANK (980726)
         *
         * 1. ip6_rt_check(): If the route was a host route, check that
         *    the cached destination is current. If it is a network route,
         *    we still may check its validity using the saved pointer to
         *    the last used address: daddr_cache. We do not want to save
         *    the whole address now (the main consumer of this service is
         *    TCP, which does not have this problem), so this last trick
         *    works only on connected sockets.
         * 2. oif should also be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr)) {
                struct fib6_info *from;
                struct rt6_info *rt;

                *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

                rcu_read_lock();
                from = rt ? rcu_dereference(rt->from) : NULL;
                err = ip6_route_get_saddr(net, from, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                rcu_read_unlock();

                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if ((*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /* If the dst entry we have looked up has a neighbour entry in the
         * INCOMPLETE state and the src address from the flow is marked as
         * OPTIMISTIC, release the found dst entry and replace it with the
         * dst entry of the nexthop router instead.
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /* We need to get the dst entry for the
                         * default router instead.
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: Network namespace to perform lookup in
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
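
/* Example (editor's sketch, hypothetical caller): the returned reference is
 * consumed here by skb_dst_set(); only the calls shown are real APIs.
 *
 *      struct dst_entry *dst;
 *      int err;
 *
 *      err = ip6_dst_lookup(net, sk, &dst, &fl6);
 *      if (err)
 *              return err;
 *      skb_dst_set(skb, dst);
 */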

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @net: Network namespace to perform lookup in
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
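
/* Example (editor's sketch): unlike ip6_dst_lookup(), failure comes back as
 * an ERR_PTR()-encoded pointer, so callers test with IS_ERR(); compare
 * tcp_v6_connect() in net/ipv6/tcp_ipv6.c.
 *
 *      dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
 *      if (IS_ERR(dst))
 *              return PTR_ERR(dst);
 */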

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *      @connected: whether @sk is connected or not
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      In addition, for a connected socket, cache the dst in the socket
 *      if the current cache is not valid.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool connected)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (dst)
                return dst;

        dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
        if (connected && !IS_ERR(dst))
                ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
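
/* Example (editor's sketch): the datagram path uses this helper roughly as
 * below; see udpv6_sendmsg() in net/ipv6/udp.c for a real call site.
 *
 *      dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, connected);
 *      if (IS_ERR(dst)) {
 *              err = PTR_ERR(dst);
 *              dst = NULL;
 *              goto out;
 *      }
 */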

/**
 *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
 *      @skb: Packet for which lookup is done
 *      @dev: Tunnel device
 *      @net: Network namespace of tunnel device
 *      @sock: Socket which provides route info
 *      @saddr: Memory to store the src ip address
 *      @info: Tunnel information
 *      @protocol: IP protocol
 *      @use_cache: Flag to enable cache usage
 *
 *      This function performs a route lookup on a tunnel.
 *
 *      It returns a valid dst pointer and stores the src address to be
 *      used in the tunnel in @saddr on success, else a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
                                        struct net_device *dev,
                                        struct net *net,
                                        struct socket *sock,
                                        struct in6_addr *saddr,
                                        const struct ip_tunnel_info *info,
                                        u8 protocol,
                                        bool use_cache)
{
        struct dst_entry *dst = NULL;
#ifdef CONFIG_DST_CACHE
        struct dst_cache *dst_cache;
#endif
        struct flowi6 fl6;
        __u8 prio;

#ifdef CONFIG_DST_CACHE
        dst_cache = (struct dst_cache *)&info->dst_cache;
        if (use_cache) {
                dst = dst_cache_get_ip6(dst_cache, saddr);
                if (dst)
                        return dst;
        }
#endif
        memset(&fl6, 0, sizeof(fl6));
        fl6.flowi6_mark = skb->mark;
        fl6.flowi6_proto = protocol;
        fl6.daddr = info->key.u.ipv6.dst;
        fl6.saddr = info->key.u.ipv6.src;
        prio = info->key.tos;
        fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
                                          info->key.label);

        dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
                                              NULL);
        if (IS_ERR(dst)) {
                netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
                return ERR_PTR(-ENETUNREACH);
        }
        if (dst->dev == dev) { /* is this necessary? */
                netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
                dst_release(dst);
                return ERR_PTR(-ELOOP);
        }
#ifdef CONFIG_DST_CACHE
        if (use_cache)
                dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
#endif
        *saddr = fl6.saddr;
        return dst;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
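
/* Example (editor's sketch): UDP tunnel drivers call this on their transmit
 * path; sock, tun_net and use_cache are hypothetical locals here, and @info
 * normally comes from the skb's tunnel metadata. See the bareudp driver for
 * a real call site.
 *
 *      dst = ip6_dst_lookup_tunnel(skb, dev, tun_net, sock, &saddr, info,
 *                                  IPPROTO_UDP, use_cache);
 *      if (IS_ERR(dst))
 *              return PTR_ERR(dst);
 */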

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /* This fragment is not the first; the headers
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *nopt, *opt = ipc6->opt;

        /* callers pass dst together with a reference, set it first so
         * ip6_cork_release() can put it down even in case of an error.
         */
        cork->base.dst = &rt->dst;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
                if (unlikely(!nopt))
                        return -ENOBUFS;

                nopt->tot_len = sizeof(*opt);
                nopt->opt_flen = opt->opt_flen;
                nopt->opt_nflen = opt->opt_nflen;

                nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
                if (opt->dst0opt && !nopt->dst0opt)
                        return -ENOBUFS;

                nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
                if (opt->dst1opt && !nopt->dst1opt)
                        return -ENOBUFS;

                nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
                if (opt->hopopt && !nopt->hopopt)
                        return -ENOBUFS;

                nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
                if (opt->srcrt && !nopt->srcrt)
                        return -ENOBUFS;

                /* need source address above miyazawa */
        }
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                        READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        cork->base.fragsize = mtu;
        cork->base.gso_size = ipc6->gso_size;
        cork->base.tx_flags = 0;
        cork->base.mark = ipc6->sockc.mark;
        sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

        if (dst_allfrag(xfrm_dst_path(&rt->dst)))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        cork->base.transmit_time = ipc6->sockc.transmit_time;

        return 0;
}
1445
1446static int __ip6_append_data(struct sock *sk,
1447                             struct sk_buff_head *queue,
1448                             struct inet_cork_full *cork_full,
1449                             struct inet6_cork *v6_cork,
1450                             struct page_frag *pfrag,
1451                             int getfrag(void *from, char *to, int offset,
1452                                         int len, int odd, struct sk_buff *skb),
1453                             void *from, size_t length, int transhdrlen,
1454                             unsigned int flags, struct ipcm6_cookie *ipc6)
1455{
1456        struct sk_buff *skb, *skb_prev = NULL;
1457        struct inet_cork *cork = &cork_full->base;
1458        struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1459        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1460        struct ubuf_info *uarg = NULL;
1461        int exthdrlen = 0;
1462        int dst_exthdrlen = 0;
1463        int hh_len;
1464        int copy;
1465        int err;
1466        int offset = 0;
1467        u32 tskey = 0;
1468        struct rt6_info *rt = (struct rt6_info *)cork->dst;
1469        struct ipv6_txoptions *opt = v6_cork->opt;
1470        int csummode = CHECKSUM_NONE;
1471        unsigned int maxnonfragsize, headersize;
1472        unsigned int wmem_alloc_delta = 0;
1473        bool paged, extra_uref = false;
1474
1475        skb = skb_peek_tail(queue);
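            /* Only the first skb of a cork carries the extension-header
             * space (opt_flen) and any xfrm/tunnel header space; on later
             * calls the queue is non-empty and both stay zero.
             */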
1476        if (!skb) {
1477                exthdrlen = opt ? opt->opt_flen : 0;
1478                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1479        }
1480
1481        paged = !!cork->gso_size;
1482        mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1483        orig_mtu = mtu;
1484
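            /* With SOF_TIMESTAMPING_OPT_ID, tag this send with the next
             * per-socket key so tx timestamps can be matched to payloads.
             */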
1485        if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1486            sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1487                tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1488
1489        hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1490
1491        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1492                        (opt ? opt->opt_nflen : 0);
1493
1494        headersize = sizeof(struct ipv6hdr) +
1495                     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1496                     (dst_allfrag(&rt->dst) ?
1497                      sizeof(struct frag_hdr) : 0) +
1498                     rt->rt6i_nfheader_len;
1499
1500        if (mtu <= fragheaderlen ||
1501            ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1502                goto emsgsize;
1503
1504        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1505                     sizeof(struct frag_hdr);
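            /* Worked example (assumed numbers): with mtu = 1280 and no
             * extension headers, fragheaderlen = 40, so
             * maxfraglen = ((1280 - 40) & ~7) + 40 - 8 = 1272; the on-wire
             * fragment is then 40 + 8 + 1232 = 1280 bytes and the 1232-byte
             * fragmentable chunk stays a multiple of 8.
             */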
1506
1507        /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1508         * in the first fragment
1509         */
1510        if (headersize + transhdrlen > mtu)
1511                goto emsgsize;
1512
1513        if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1514            (sk->sk_protocol == IPPROTO_UDP ||
1515             sk->sk_protocol == IPPROTO_ICMPV6 ||
1516             sk->sk_protocol == IPPROTO_RAW)) {
1517                ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1518                                sizeof(struct ipv6hdr));
1519                goto emsgsize;
1520        }
1521
1522        if (ip6_sk_ignore_df(sk))
1523                maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1524        else
1525                maxnonfragsize = mtu;
1526
1527        if (cork->length + length > maxnonfragsize - headersize) {
1528emsgsize:
1529                pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1530                ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1531                return -EMSGSIZE;
1532        }
1533
1534        /* Use CHECKSUM_PARTIAL only with no extension headers and when
1535         * we are not going to fragment.
1536         */
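            /* Concretely: first call for a UDP packet (transhdrlen != 0),
             * plain IPv6 header, payload that will not be fragmented (or
             * that GSO will segment), and a device able to checksum IPv6.
             */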
1537        if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1538            headersize == sizeof(struct ipv6hdr) &&
1539            length <= mtu - headersize &&
1540            (!(flags & MSG_MORE) || cork->gso_size) &&
1541            rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1542                csummode = CHECKSUM_PARTIAL;
1543
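            /* MSG_ZEROCOPY: pin the user pages and send them as frags if the
             * device supports SG with checksum offload; otherwise fall back
             * to copying, but keep the uarg so the completion notification
             * is still delivered.
             */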
1544        if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1545                uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1546                if (!uarg)
1547                        return -ENOBUFS;
1548                extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1549                if (rt->dst.dev->features & NETIF_F_SG &&
1550                    csummode == CHECKSUM_PARTIAL) {
1551                        paged = true;
1552                } else {
1553                        uarg->zerocopy = 0;
1554                        skb_zcopy_set(skb, uarg, &extra_uref);
1555                }
1556        }
1557
1558        /*
1559         * Let's try using as much space as possible.
1560         * Use MTU if total length of the message fits into the MTU.
1561         * Otherwise, we need to reserve fragment header and
1562         * fragment alignment (= 8-15 octets, in total).
1563         *
1564         * Note that we may need to "move" the data from the tail
1565         * of the buffer to the new fragment when we split
1566         * the message.
1567         *
1568         * FIXME: It may be fragmented into multiple chunks
1569         *        at once if non-fragmentable extension headers
1570         *        are too large.
1571         * --yoshfuji
1572         */
1573
1574        cork->length += length;
1575        if (!skb)
1576                goto alloc_new_skb;
1577
1578        while (length > 0) {
1580                /* Check if the remaining data fits into the current packet. */
1580                copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1581                if (copy < length)
1582                        copy = maxfraglen - skb->len;
1583
1584                if (copy <= 0) {
1585                        char *data;
1586                        unsigned int datalen;
1587                        unsigned int fraglen;
1588                        unsigned int fraggap;
1589                        unsigned int alloclen, alloc_extra;
1590                        unsigned int pagedlen;
1591alloc_new_skb:
1592                        /* There's no room in the current skb */
1593                        if (skb)
1594                                fraggap = skb->len - maxfraglen;
1595                        else
1596                                fraggap = 0;
1597                        /* update mtu and maxfraglen if necessary */
1598                        if (!skb || !skb_prev)
1599                                ip6_append_data_mtu(&mtu, &maxfraglen,
1600                                                    fragheaderlen, skb, rt,
1601                                                    orig_mtu);
1602
1603                        skb_prev = skb;
1604
1605                        /*
1606                         * If remaining data exceeds the mtu,
1607                         * we know we need more fragment(s).
1608                         */
1609                        datalen = length + fraggap;
1610
1611                        if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1612                                datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1613                        fraglen = datalen + fragheaderlen;
1614                        pagedlen = 0;
1615
1616                        alloc_extra = hh_len;
1617                        alloc_extra += dst_exthdrlen;
1618                        alloc_extra += rt->dst.trailer_len;
1619
1620                        /* We just reserve space for the fragment header.
1621                         * Note: this may be an overallocation if the message
1622                         * (without MSG_MORE) fits into the MTU.
1623                         */
1624                        alloc_extra += sizeof(struct frag_hdr);
1625
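                            /* Three sizing strategies: no SG under MSG_MORE
                             * means later appends must land in this linear
                             * buffer, so allocate a full mtu; when not in
                             * paged mode and the fragment fits a reasonable
                             * linear allocation (or the device lacks SG),
                             * allocate exactly fraglen; otherwise cap the
                             * linear part at MAX_HEADER and put the
                             * remaining pagedlen bytes into page frags.
                             */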
1626                        if ((flags & MSG_MORE) &&
1627                            !(rt->dst.dev->features&NETIF_F_SG))
1628                                alloclen = mtu;
1629                        else if (!paged &&
1630                                 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1631                                  !(rt->dst.dev->features & NETIF_F_SG)))
1632                                alloclen = fraglen;
1633                        else {
1634                                alloclen = min_t(int, fraglen, MAX_HEADER);
1635                                pagedlen = fraglen - alloclen;
1636                        }
1637                        alloclen += alloc_extra;
1638
1639                        if (datalen != length + fraggap) {
1640                                /*
1641                                 * This is not the last fragment; the trailer
1642                                 * space is regarded as data space.
1643                                 */
1644                                datalen += rt->dst.trailer_len;
1645                        }
1646
1647                        fraglen = datalen + fragheaderlen;
1648
1649                        copy = datalen - transhdrlen - fraggap - pagedlen;
1650                        if (copy < 0) {
1651                                err = -EINVAL;
1652                                goto error;
1653                        }
1654                        if (transhdrlen) {
1655                                skb = sock_alloc_send_skb(sk, alloclen,
1656                                                (flags & MSG_DONTWAIT), &err);
1657                        } else {
1658                                skb = NULL;
1659                                if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1660                                    2 * sk->sk_sndbuf)
1661                                        skb = alloc_skb(alloclen,
1662                                                        sk->sk_allocation);
1663                                if (unlikely(!skb))
1664                                        err = -ENOBUFS;
1665                        }
1666                        if (!skb)
1667                                goto error;
1668                        /*
1669                         *      Fill in the control structures
1670                         */
1671                        skb->protocol = htons(ETH_P_IPV6);
1672                        skb->ip_summed = csummode;
1673                        skb->csum = 0;
1674                        /* reserve for fragmentation and ipsec header */
1675                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1676                                    dst_exthdrlen);
1677
1678                        /*
1679                         *      Find where to start putting bytes
1680                         */
1681                        data = skb_put(skb, fraglen - pagedlen);
1682                        skb_set_network_header(skb, exthdrlen);
1683                        data += fragheaderlen;
1684                        skb->transport_header = (skb->network_header +
1685                                                 fragheaderlen);
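                            /* fraggap: bytes sticking out past the fragment
                             * boundary of the previous skb; move them (and
                             * their checksum) into this skb so every
                             * fragment except the last stays a multiple of
                             * 8 bytes long.
                             */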
1686                        if (fraggap) {
1687                                skb->csum = skb_copy_and_csum_bits(
1688                                        skb_prev, maxfraglen,
1689                                        data + transhdrlen, fraggap);
1690                                skb_prev->csum = csum_sub(skb_prev->csum,
1691                                                          skb->csum);
1692                                data += fraggap;
1693                                pskb_trim_unique(skb_prev, maxfraglen);
1694                        }
1695                        if (copy > 0 &&
1696                            getfrag(from, data + transhdrlen, offset,
1697                                    copy, fraggap, skb) < 0) {
1698                                err = -EFAULT;
1699                                kfree_skb(skb);
1700                                goto error;
1701                        }
1702
1703                        offset += copy;
1704                        length -= copy + transhdrlen;
1705                        transhdrlen = 0;
1706                        exthdrlen = 0;
1707                        dst_exthdrlen = 0;
1708
1709                        /* Only the initial fragment is timestamped */
1710                        skb_shinfo(skb)->tx_flags = cork->tx_flags;
1711                        cork->tx_flags = 0;
1712                        skb_shinfo(skb)->tskey = tskey;
1713                        tskey = 0;
1714                        skb_zcopy_set(skb, uarg, &extra_uref);
1715
1716                        if ((flags & MSG_CONFIRM) && !skb_prev)
1717                                skb_set_dst_pending_confirm(skb, 1);
1718
1719                        /*
1720                         * Put the packet on the pending queue
1721                         */
1722                        if (!skb->destructor) {
1723                                skb->destructor = sock_wfree;
1724                                skb->sk = sk;
1725                                wmem_alloc_delta += skb->truesize;
1726                        }
1727                        __skb_queue_tail(queue, skb);
1728                        continue;
1729                }
1730
1731                if (copy > length)
1732                        copy = length;
1733
1734                if (!(rt->dst.dev->features&NETIF_F_SG) &&
1735                    skb_tailroom(skb) >= copy) {
1736                        unsigned int off;
1737
1738                        off = skb->len;
1739                        if (getfrag(from, skb_put(skb, copy),
1740                                                offset, copy, off, skb) < 0) {
1741                                __skb_trim(skb, off);
1742                                err = -EFAULT;
1743                                goto error;
1744                        }
1745                } else if (!uarg || !uarg->zerocopy) {
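                            /* Copy path: refill the per-socket page frag,
                             * coalesce with the last frag when the page and
                             * offset line up, and account truesize only for
                             * the bytes actually consumed from the shared
                             * page.
                             */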
1746                        int i = skb_shinfo(skb)->nr_frags;
1747
1748                        err = -ENOMEM;
1749                        if (!sk_page_frag_refill(sk, pfrag))
1750                                goto error;
1751
1752                        if (!skb_can_coalesce(skb, i, pfrag->page,
1753                                              pfrag->offset)) {
1754                                err = -EMSGSIZE;
1755                                if (i == MAX_SKB_FRAGS)
1756                                        goto error;
1757
1758                                __skb_fill_page_desc(skb, i, pfrag->page,
1759                                                     pfrag->offset, 0);
1760                                skb_shinfo(skb)->nr_frags = ++i;
1761                                get_page(pfrag->page);
1762                        }
1763                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
1764                        if (getfrag(from,
1765                                    page_address(pfrag->page) + pfrag->offset,
1766                                    offset, copy, skb->len, skb) < 0)
1767                                goto error_efault;
1768
1769                        pfrag->offset += copy;
1770                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1771                        skb->len += copy;
1772                        skb->data_len += copy;
1773                        skb->truesize += copy;
1774                        wmem_alloc_delta += copy;
1775                } else {
1776                        err = skb_zerocopy_iter_dgram(skb, from, copy);
1777                        if (err < 0)
1778                                goto error;
1779                }
1780                offset += copy;
1781                length -= copy;
1782        }
1783
1784        if (wmem_alloc_delta)
1785                refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1786        return 0;
1787
1788error_efault:
1789        err = -EFAULT;
1790error:
1791        net_zcopy_put_abort(uarg, extra_uref);
1792        cork->length -= length;
1793        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1794        refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1795        return err;
1796}
1797
1798int ip6_append_data(struct sock *sk,
1799                    int getfrag(void *from, char *to, int offset, int len,
1800                                int odd, struct sk_buff *skb),
1801                    void *from, size_t length, int transhdrlen,
1802                    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1803                    struct rt6_info *rt, unsigned int flags)
1804{
1805        struct inet_sock *inet = inet_sk(sk);
1806        struct ipv6_pinfo *np = inet6_sk(sk);
1807        int exthdrlen;
1808        int err;
1809
1810        if (flags&MSG_PROBE)
1811                return 0;
1812        if (skb_queue_empty(&sk->sk_write_queue)) {
1813                /*
1814                 * setup for corking
1815                 */
1816                dst_hold(&rt->dst);
1817                err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1818                                     ipc6, rt);
1819                if (err)
1820                        return err;
1821
1822                inet->cork.fl.u.ip6 = *fl6;
1823                exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1824                length += exthdrlen;
1825                transhdrlen += exthdrlen;
1826        } else {
1827                transhdrlen = 0;
1828        }
1829
1830        return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1831                                 &np->cork, sk_page_frag(sk), getfrag,
1832                                 from, length, transhdrlen, flags, ipc6);
1833}
1834EXPORT_SYMBOL_GPL(ip6_append_data);
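    /*
     * Illustrative caller sketch (not part of this file): a datagram
     * protocol typically drives the corking API under the socket lock,
     * appending with ip_generic_getfrag() and then pushing or flushing.
     * Setup of fl6, ipc6 and the route is elided; see udpv6_sendmsg()
     * for a complete caller.
     *
     *    lock_sock(sk);
     *    err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
     *                          sizeof(struct udphdr), &ipc6, &fl6, rt,
     *                          msg->msg_flags);
     *    if (err)
     *            ip6_flush_pending_frames(sk);
     *    else if (!(msg->msg_flags & MSG_MORE))
     *            err = ip6_push_pending_frames(sk);
     *    release_sock(sk);
     */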
1835
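    /* Transfer the cork's dst reference to the finished skb (and clear the
     * ALLFRAG hint) so ip6_cork_release() will not put the dst a second
     * time.
     */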
1836static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1837{
1838        struct dst_entry *dst = cork->base.dst;
1839
1840        cork->base.dst = NULL;
1841        cork->base.flags &= ~IPCORK_ALLFRAG;
1842        skb_dst_set(skb, dst);
1843}
1844
1845static void ip6_cork_release(struct inet_cork_full *cork,
1846                             struct inet6_cork *v6_cork)
1847{
1848        if (v6_cork->opt) {
1849                struct ipv6_txoptions *opt = v6_cork->opt;
1850
1851                kfree(opt->dst0opt);
1852                kfree(opt->dst1opt);
1853                kfree(opt->hopopt);
1854                kfree(opt->srcrt);
1855                kfree(opt);
1856                v6_cork->opt = NULL;
1857        }
1858
1859        if (cork->base.dst) {
1860                dst_release(cork->base.dst);
1861                cork->base.dst = NULL;
1862                cork->base.flags &= ~IPCORK_ALLFRAG;
1863        }
1864}
1865
1866struct sk_buff *__ip6_make_skb(struct sock *sk,
1867                               struct sk_buff_head *queue,
1868                               struct inet_cork_full *cork,
1869                               struct inet6_cork *v6_cork)
1870{
1871        struct sk_buff *skb, *tmp_skb;
1872        struct sk_buff **tail_skb;
1873        struct in6_addr *final_dst;
1874        struct ipv6_pinfo *np = inet6_sk(sk);
1875        struct net *net = sock_net(sk);
1876        struct ipv6hdr *hdr;
1877        struct ipv6_txoptions *opt = v6_cork->opt;
1878        struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1879        struct flowi6 *fl6 = &cork->fl.u.ip6;
1880        unsigned char proto = fl6->flowi6_proto;
1881
1882        skb = __skb_dequeue(queue);
1883        if (!skb)
1884                goto out;
1885        tail_skb = &(skb_shinfo(skb)->frag_list);
1886
1887        /* move skb->data up to the IP header, past the ext header space */
1888        if (skb->data < skb_network_header(skb))
1889                __skb_pull(skb, skb_network_offset(skb));
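            /* Chain any remaining queued skbs onto the first one via
             * frag_list, stripping their per-skb network headers; the head
             * skb then represents the entire corked payload.
             */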
1890        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1891                __skb_pull(tmp_skb, skb_network_header_len(skb));
1892                *tail_skb = tmp_skb;
1893                tail_skb = &(tmp_skb->next);
1894                skb->len += tmp_skb->len;
1895                skb->data_len += tmp_skb->len;
1896                skb->truesize += tmp_skb->truesize;
1897                tmp_skb->destructor = NULL;
1898                tmp_skb->sk = NULL;
1899        }
1900
1901        /* Allow local fragmentation. */
1902        skb->ignore_df = ip6_sk_ignore_df(sk);
1903        __skb_pull(skb, skb_network_header_len(skb));
1904
1905        final_dst = &fl6->daddr;
1906        if (opt && opt->opt_flen)
1907                ipv6_push_frag_opts(skb, opt, &proto);
1908        if (opt && opt->opt_nflen)
1909                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1910
1911        skb_push(skb, sizeof(struct ipv6hdr));
1912        skb_reset_network_header(skb);
1913        hdr = ipv6_hdr(skb);
1914
1915        ip6_flow_hdr(hdr, v6_cork->tclass,
1916                     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1917                                        ip6_autoflowlabel(net, np), fl6));
1918        hdr->hop_limit = v6_cork->hop_limit;
1919        hdr->nexthdr = proto;
1920        hdr->saddr = fl6->saddr;
1921        hdr->daddr = *final_dst;
1922
1923        skb->priority = sk->sk_priority;
1924        skb->mark = cork->base.mark;
1925        skb->tstamp = cork->base.transmit_time;
1926
1927        ip6_cork_steal_dst(skb, cork);
1928        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1929        if (proto == IPPROTO_ICMPV6) {
1930                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1931
1932                ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1933                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1934        }
1935
1936        ip6_cork_release(cork, v6_cork);
1937out:
1938        return skb;
1939}
1940
1941int ip6_send_skb(struct sk_buff *skb)
1942{
1943        struct net *net = sock_net(skb->sk);
1944        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1945        int err;
1946
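            /* ip6_local_out() can return a positive NET_XMIT_* code from the
             * queueing layer: net_xmit_errno() folds NET_XMIT_CN into
             * success, anything else becomes an error and is counted as an
             * output discard.
             */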
1947        err = ip6_local_out(net, skb->sk, skb);
1948        if (err) {
1949                if (err > 0)
1950                        err = net_xmit_errno(err);
1951                if (err)
1952                        IP6_INC_STATS(net, rt->rt6i_idev,
1953                                      IPSTATS_MIB_OUTDISCARDS);
1954        }
1955
1956        return err;
1957}
1958
1959int ip6_push_pending_frames(struct sock *sk)
1960{
1961        struct sk_buff *skb;
1962
1963        skb = ip6_finish_skb(sk);
1964        if (!skb)
1965                return 0;
1966
1967        return ip6_send_skb(skb);
1968}
1969EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1970
1971static void __ip6_flush_pending_frames(struct sock *sk,
1972                                       struct sk_buff_head *queue,
1973                                       struct inet_cork_full *cork,
1974                                       struct inet6_cork *v6_cork)
1975{
1976        struct sk_buff *skb;
1977
1978        while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1979                if (skb_dst(skb))
1980                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1981                                      IPSTATS_MIB_OUTDISCARDS);
1982                kfree_skb(skb);
1983        }
1984
1985        ip6_cork_release(cork, v6_cork);
1986}
1987
1988void ip6_flush_pending_frames(struct sock *sk)
1989{
1990        __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1991                                   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1992}
1993EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1994
1995struct sk_buff *ip6_make_skb(struct sock *sk,
1996                             int getfrag(void *from, char *to, int offset,
1997                                         int len, int odd, struct sk_buff *skb),
1998                             void *from, size_t length, int transhdrlen,
1999                             struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2000                             unsigned int flags, struct inet_cork_full *cork)
2001{
2002        struct inet6_cork v6_cork;
2003        struct sk_buff_head queue;
2004        int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2005        int err;
2006
2007        if (flags & MSG_PROBE) {
2008                dst_release(&rt->dst);
2009                return NULL;
2010        }
2011
2012        __skb_queue_head_init(&queue);
2013
2014        cork->base.flags = 0;
2015        cork->base.addr = 0;
2016        cork->base.opt = NULL;
2017        v6_cork.opt = NULL;
2018        err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2019        if (err) {
2020                ip6_cork_release(cork, &v6_cork);
2021                return ERR_PTR(err);
2022        }
2023        if (ipc6->dontfrag < 0)
2024                ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2025
2026        err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2027                                &current->task_frag, getfrag, from,
2028                                length + exthdrlen, transhdrlen + exthdrlen,
2029                                flags, ipc6);
2030        if (err) {
2031                __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2032                return ERR_PTR(err);
2033        }
2034
2035        return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2036}
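    /*
     * Illustrative sketch (not part of this file): the corkless fast path
     * keeps the cork on the caller's stack and pairs ip6_make_skb() with
     * ip6_send_skb(), or with a protocol wrapper that fills the transport
     * header first:
     *
     *    struct inet_cork_full cork;
     *    struct sk_buff *skb;
     *
     *    skb = ip6_make_skb(sk, ip_generic_getfrag, msg, len,
     *                       sizeof(struct udphdr), &ipc6, rt,
     *                       msg->msg_flags, &cork);
     *    err = PTR_ERR_OR_ZERO(skb);
     *    if (!IS_ERR_OR_NULL(skb))
     *            err = ip6_send_skb(skb);
     */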
2037