linux/net/ipv6/ip6_output.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct inet6_dev *idev = ip6_dst_idev(dst);
        unsigned int hh_len = LL_RESERVED_SPACE(dev);
        const struct in6_addr *daddr, *nexthop;
        struct ipv6hdr *hdr;
        struct neighbour *neigh;
        int ret;

        /* Be paranoid, rather than too clever. */
        if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
                skb = skb_expand_head(skb, hh_len);
                if (!skb) {
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                        return -ENOMEM;
                }
        }

        hdr = ipv6_hdr(skb);
        daddr = &hdr->daddr;
        if (ipv6_addr_is_multicast(daddr)) {
                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_is_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                         * is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (hdr->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
                if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
        neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
        if (!IS_ERR(neigh)) {
                sock_confirm_neigh(skb, neigh);
                ret = neigh_output(neigh, skb, false);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}
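
/*
 * Illustrative note (paraphrased sketch, not the verbatim helper):
 * neigh_output() above reduces to roughly
 *
 *      if (!skip_cache && (nud_state & NUD_CONNECTED) &&
 *          READ_ONCE(neigh->hh.hh_len))
 *              return neigh_hh_output(&neigh->hh, skb); // cached L2 header
 *      return neigh->output(neigh, skb); // may queue behind ND resolution
 *
 * so ip6_finish_output2() is the last IPv6-specific step before the
 * packet reaches the device layer.
 */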

static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
                                    struct sk_buff *skb, unsigned int mtu)
{
        struct sk_buff *segs, *nskb;
        netdev_features_t features;
        int ret = 0;

        /* Please see corresponding comment in ip_finish_output_gso
         * describing the cases where GSO segment length exceeds the
         * egress MTU.
         */
        features = netif_skb_features(skb);
        segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
        if (IS_ERR_OR_NULL(segs)) {
                kfree_skb(skb);
                return -ENOMEM;
        }

        consume_skb(skb);

        skb_list_walk_safe(segs, segs, nskb) {
                int err;

                skb_mark_not_on_list(segs);
                err = ip6_fragment(net, sk, segs, ip6_finish_output2);
                if (err && ret == 0)
                        ret = err;
        }

        return ret;
}

static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
        /* Policy lookup after SNAT yielded a new policy */
        if (skb_dst(skb)->xfrm) {
                IP6CB(skb)->flags |= IP6SKB_REROUTED;
                return dst_output(net, sk, skb);
        }
#endif

        mtu = ip6_skb_dst_mtu(skb);
        if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
                return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

        if ((skb->len > mtu && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int ret;

        ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
        switch (ret) {
        case NET_XMIT_SUCCESS:
                return __ip6_finish_output(net, sk, skb);
        case NET_XMIT_CN:
                return __ip6_finish_output(net, sk, skb) ? : ret;
        default:
                kfree_skb(skb);
                return ret;
        }
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, indev, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);
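
/*
 * Illustrative note: ip6_output() is normally reached indirectly as the
 * dst's output handler rather than being called by name. A hypothetical
 * caller that already holds a routed skb would do:
 *
 *      skb_dst_set(skb, dst);          // dst from ip6_route_output()
 *      err = dst_output(net, sk, skb); // invokes ip6_output() via dst->output
 */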

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
        if (!np->autoflowlabel_set)
                return ip6_default_np_autolabel(net);
        else
                return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct inet6_dev *idev = ip6_dst_idev(dst);
        unsigned int head_room;
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
        if (opt)
                head_room += opt->opt_nflen + opt->opt_flen;

        if (unlikely(head_room > skb_headroom(skb))) {
                skb = skb_expand_head(skb, head_room);
                if (!skb) {
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                        return -ENOBUFS;
                }
        }

        if (opt) {
                seg_len += opt->opt_nflen + opt->opt_flen;

                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);

                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
                                             &fl6->saddr);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                ip6_autoflowlabel(net, np), fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = priority;
        skb->mark = mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

                /* if egress device is enslaved to an L3 master device pass the
                 * skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume socket lock is held.
                 * we promote our socket to non const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dev,
                               dst_output);
        }

        skb->dev = dev;
        /* ipv6_local_error() does not require socket lock,
         * we promote our socket to non const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
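
/*
 * Example (simplified, hypothetical caller in the spirit of TCP's use of
 * ip6_xmit(); RCU locking and error paths are elided):
 *
 *      struct ipv6_pinfo *np = inet6_sk(sk);
 *      struct flowi6 fl6 = {
 *              .flowi6_proto = IPPROTO_TCP,
 *              .daddr = sk->sk_v6_daddr,
 *              .saddr = np->saddr,
 *              .flowi6_oif = sk->sk_bound_dev_if,
 *      };
 *      struct dst_entry *dst;
 *
 *      dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, NULL);
 *      if (IS_ERR(dst))
 *              return PTR_ERR(dst);
 *      skb_dst_set(skb, dst);
 *      return ip6_xmit(sk, skb, &fl6, sk->sk_mark,
 *                      rcu_dereference(np->opt), np->tclass,
 *                      sk->sk_priority);
 */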

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;

                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        struct ipv6_pinfo *np = inet6_sk(sk);

                        if (np && np->rtalert_isolate &&
                            !net_eq(sock_net(sk), dev_net(skb->dev))) {
                                continue;
                        }
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* For a unicast neighbour discovery message destined
                         * to the proxied address, pass it to the input
                         * function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
        if (skb->offload_l3_fwd_mark) {
                consume_skb(skb);
                return 0;
        }
#endif

        skb->tstamp = 0;
        return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
                return false;

        return true;
}
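
/*
 * Illustrative example (hypothetical numbers): a 1400-byte packet that
 * conntrack reassembled from 1280-byte fragments arrives with
 * frag_max_size = 1280 and ignore_df set. Against a 1280-byte MTU,
 * skb->len > mtu but frag_max_size <= mtu, so the ignore_df test makes
 * ip6_pkt_too_big() return false: the packet is refragmented on output
 * instead of triggering a Packet Too Big error.
 */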

int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        struct inet6_dev *idev;
        u32 mtu;

        idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!net->ipv6.devconf_all->disable_policy &&
            !idev->cnf.disable_policy &&
            !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We do NOT process RA packets; we push them to user level
         *      AS IS, without any warranty that the application will be
         *      able to interpret them. The reason is that we cannot do
         *      anything clever here.
         *
         *      We are not the end node, so if the packet contains
         *      AH/ESP we cannot do anything with it.
         *      Defragmentation would also be a mistake; RA packets
         *      cannot be fragmented, because there is no guarantee
         *      that different fragments will follow the same path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);

                if (proxied > 0) {
                        hdr->hop_limit--;
                        return ip6_input(skb);
                } else if (proxied < 0) {
                        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* IPv6 specs say nothing about it, but it is clear that we cannot
         * send redirects to source routed frames.
         * We don't send redirects to frames decapsulated from IPsec.
         */
        if (IP6CB(skb)->iif == dst->dev->ifindex &&
            opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same;
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                 * and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_maybe_forward(dst, true);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Decrementing the hop limit is delayed until after the skb COW */

        hdr->hop_limit--;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

        skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_ext_copy(to, from);
        skb_copy_secmark(to, from);
}

int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
                      u8 nexthdr, __be32 frag_id,
                      struct ip6_fraglist_iter *iter)
{
        unsigned int first_len;
        struct frag_hdr *fh;

        /* BUILD HEADER */
        *prevhdr = NEXTHDR_FRAGMENT;
        iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
        if (!iter->tmp_hdr)
                return -ENOMEM;

        iter->frag = skb_shinfo(skb)->frag_list;
        skb_frag_list_init(skb);

        iter->offset = 0;
        iter->hlen = hlen;
        iter->frag_id = frag_id;
        iter->nexthdr = nexthdr;

        __skb_pull(skb, hlen);
        fh = __skb_push(skb, sizeof(struct frag_hdr));
        __skb_push(skb, hlen);
        skb_reset_network_header(skb);
        memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

        fh->nexthdr = nexthdr;
        fh->reserved = 0;
        fh->frag_off = htons(IP6_MF);
        fh->identification = frag_id;

        first_len = skb_pagelen(skb);
        skb->data_len = first_len - skb_headlen(skb);
        skb->len = first_len;
        ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

        return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

void ip6_fraglist_prepare(struct sk_buff *skb,
                          struct ip6_fraglist_iter *iter)
{
        struct sk_buff *frag = iter->frag;
        unsigned int hlen = iter->hlen;
        struct frag_hdr *fh;

        frag->ip_summed = CHECKSUM_NONE;
        skb_reset_transport_header(frag);
        fh = __skb_push(frag, sizeof(struct frag_hdr));
        __skb_push(frag, hlen);
        skb_reset_network_header(frag);
        memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
        iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
        fh->nexthdr = iter->nexthdr;
        fh->reserved = 0;
        fh->frag_off = htons(iter->offset);
        if (frag->next)
                fh->frag_off |= htons(IP6_MF);
        fh->identification = iter->frag_id;
        ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
        ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);
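
/*
 * Example (illustrative sketch): the intended pairing of
 * ip6_fraglist_init()/ip6_fraglist_prepare()/ip6_fraglist_next(),
 * mirroring the fast path of ip6_fragment() below. Error handling and
 * stats are elided; "output" is the caller's transmit hook.
 *
 *      struct ip6_fraglist_iter iter;
 *      int err;
 *
 *      err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, &iter);
 *      while (!err) {
 *              if (iter.frag)
 *                      ip6_fraglist_prepare(skb, &iter);
 *              err = output(net, sk, skb);
 *              if (err || !iter.frag)
 *                      break;
 *              skb = ip6_fraglist_next(&iter);
 *      }
 *      kfree(iter.tmp_hdr);
 */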

void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
                   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
                   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
        state->prevhdr = prevhdr;
        state->nexthdr = nexthdr;
        state->frag_id = frag_id;

        state->hlen = hlen;
        state->mtu = mtu;

        state->left = skb->len - hlen;  /* Space per frame */
        state->ptr = hlen;              /* Where to start from */

        state->hroom = hdr_room;
        state->troom = needed_tailroom;

        state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
        u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
        struct sk_buff *frag;
        struct frag_hdr *fh;
        unsigned int len;

        len = state->left;
        /* IF: it doesn't fit, use 'mtu' - the data space left */
        if (len > state->mtu)
                len = state->mtu;
        /* IF: we are not sending up to and including the packet end
         * then align the next start on an eight byte boundary
         */
        if (len < state->left)
                len &= ~7;

        /* Allocate buffer */
        frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
                         state->hroom + state->troom, GFP_ATOMIC);
        if (!frag)
                return ERR_PTR(-ENOMEM);

        /*
         *      Set up data on packet
         */

        ip6_copy_metadata(frag, skb);
        skb_reserve(frag, state->hroom);
        skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
        skb_reset_network_header(frag);
        fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
        frag->transport_header = (frag->network_header + state->hlen +
                                  sizeof(struct frag_hdr));

        /*
         *      Charge the memory for the fragment to any owner
         *      it might possess
         */
        if (skb->sk)
                skb_set_owner_w(frag, skb->sk);

        /*
         *      Copy the packet header into the new buffer.
         */
        skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

        fragnexthdr_offset = skb_network_header(frag);
        fragnexthdr_offset += prevhdr - skb_network_header(skb);
        *fragnexthdr_offset = NEXTHDR_FRAGMENT;

        /*
         *      Build fragment header.
         */
        fh->nexthdr = state->nexthdr;
        fh->reserved = 0;
        fh->identification = state->frag_id;

        /*
         *      Copy a block of the IP datagram.
         */
        BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
                             len));
        state->left -= len;

        fh->frag_off = htons(state->offset);
        if (state->left > 0)
                fh->frag_off |= htons(IP6_MF);
        ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

        state->ptr += len;
        state->offset += len;

        return frag;
}
EXPORT_SYMBOL(ip6_frag_next);
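
/*
 * Example (illustrative sketch): the slow-path pairing of ip6_frag_init()
 * and ip6_frag_next(), as used by ip6_fragment() below. Each returned
 * frag is a freshly allocated skb ready for transmission.
 *
 *      struct ip6_frag_state state;
 *
 *      ip6_frag_init(skb, hlen, mtu, tailroom, hroom, prevhdr, nexthdr,
 *                    frag_id, &state);
 *      while (state.left > 0) {
 *              struct sk_buff *frag = ip6_frag_next(skb, &state);
 *
 *              if (IS_ERR(frag))
 *                      return PTR_ERR(frag);
 *              err = output(net, sk, frag);
 *      }
 */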

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ip6_frag_state state;
        unsigned int mtu, hlen, nexthdr_offset;
        ktime_t tstamp = skb->tstamp;
        int hroom, err = 0;
        __be32 frag_id;
        u8 *prevhdr, nexthdr = 0;

        err = ip6_find_1stfragopt(skb, &prevhdr);
        if (err < 0)
                goto fail;
        hlen = err;
        nexthdr = *prevhdr;
        nexthdr_offset = prevhdr - skb_network_header(skb);

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb was not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        prevhdr = skb_network_header(skb) + nexthdr_offset;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                unsigned int first_len = skb_pagelen(skb);
                struct ip6_fraglist_iter iter;
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
                                        &iter);
                if (err < 0)
                        goto fail;

                for (;;) {
                        /* Prepare the header of the next frame,
                         * before the previous one goes down.
                         */
                        if (iter.frag)
                                ip6_fraglist_prepare(skb, &iter);

                        skb->tstamp = tstamp;
                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !iter.frag)
                                break;

                        skb = ip6_fraglist_next(&iter);
                }

                kfree(iter.tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                kfree_skb_list(iter.frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        /*
         *      Fragment the datagram.
         */

        ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
                      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
                      &state);

        /*
         *      Keep copying data until we run out.
         */

        while (state.left > 0) {
                frag = ip6_frag_next(skb, &state);
                if (IS_ERR(frag)) {
                        err = PTR_ERR(frag);
                        goto fail;
                }

                /*
                 *      Put this fragment into the sending queue.
                 */
                frag->tstamp = tstamp;
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the unconnected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * or MSG_DONTROUTE             --ANK (980726)
         *
         * 1. ip6_rt_check(): if the route was a host route,
         *    check that the cached destination is current.
         *    If it is a network route, we may still
         *    check its validity using a saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now
         *    (because the main consumer of this service
         *    is tcp, which does not have this problem),
         *    so the last trick works only on connected
         *    sockets.
         * 2. oif should also be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
              (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr)) {
                struct fib6_info *from;
                struct rt6_info *rt;

                *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

                rcu_read_lock();
                from = rt ? rcu_dereference(rt->from) : NULL;
                err = ip6_route_get_saddr(net, from, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                rcu_read_unlock();

                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if ((*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif
        if (ipv6_addr_v4mapped(&fl6->saddr) &&
            !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
                err = -EAFNOSUPPORT;
                goto out_err_release;
        }

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: Network namespace to perform lookup in
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
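
/*
 * Example (illustrative, hypothetical values): a minimal datagram-style
 * lookup with ip6_dst_lookup().
 *
 *      struct flowi6 fl6 = {
 *              .flowi6_proto = IPPROTO_UDP,
 *              .daddr = *daddr,
 *      };
 *      struct dst_entry *dst;
 *      int err;
 *
 *      err = ip6_dst_lookup(net, sk, &dst, &fl6);
 *      if (err)
 *              return err;
 *      skb_dst_set(skb, dst);
 */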

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @net: Network namespace to perform lookup in
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
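
/*
 * Illustrative note: unlike ip6_dst_lookup(), errors come back encoded
 * in the pointer, so callers check with IS_ERR():
 *
 *      dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
 *      if (IS_ERR(dst))
 *              return PTR_ERR(dst);
 */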

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *      @connected: whether @sk is connected or not
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      In addition, for a connected socket, cache the dst in the socket
 *      if the current cache is not valid.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool connected)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (dst)
                return dst;

        dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
        if (connected && !IS_ERR(dst))
                ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
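
/*
 * Example (simplified sketch of a UDP-style sendmsg path; a real caller
 * also handles flow labels and cork state):
 *
 *      bool connected = (sk->sk_state == TCP_ESTABLISHED);
 *
 *      dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, connected);
 *      if (IS_ERR(dst)) {
 *              err = PTR_ERR(dst);
 *              dst = NULL;
 *              goto out;
 *      }
 */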

/**
 *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
 *      @skb: Packet for which lookup is done
 *      @dev: Tunnel device
 *      @net: Network namespace of tunnel device
 *      @sock: Socket which provides route info
 *      @saddr: Memory to store the src ip address
 *      @info: Tunnel information
 *      @protocol: IP protocol
 *      @use_cache: Flag to enable cache usage
 *
 *      This function performs a route lookup on a tunnel.
 *
 *      It returns a valid dst pointer and stores the src address to be
 *      used in the tunnel in param saddr on success, else a pointer
 *      encoded error code.
 */
struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
                                        struct net_device *dev,
                                        struct net *net,
                                        struct socket *sock,
                                        struct in6_addr *saddr,
                                        const struct ip_tunnel_info *info,
                                        u8 protocol,
                                        bool use_cache)
{
        struct dst_entry *dst = NULL;
#ifdef CONFIG_DST_CACHE
        struct dst_cache *dst_cache;
#endif
        struct flowi6 fl6;
        __u8 prio;

#ifdef CONFIG_DST_CACHE
        dst_cache = (struct dst_cache *)&info->dst_cache;
        if (use_cache) {
                dst = dst_cache_get_ip6(dst_cache, saddr);
                if (dst)
                        return dst;
        }
#endif
        memset(&fl6, 0, sizeof(fl6));
        fl6.flowi6_mark = skb->mark;
        fl6.flowi6_proto = protocol;
        fl6.daddr = info->key.u.ipv6.dst;
        fl6.saddr = info->key.u.ipv6.src;
        prio = info->key.tos;
        fl6.flowlabel = ip6_make_flowinfo(RT_TOS(prio),
                                          info->key.label);

        dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
                                              NULL);
        if (IS_ERR(dst)) {
                netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
                return ERR_PTR(-ENETUNREACH);
        }
        if (dst->dev == dev) { /* is this necessary? */
                netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
                dst_release(dst);
                return ERR_PTR(-ELOOP);
        }
#ifdef CONFIG_DST_CACHE
        if (use_cache)
                dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
#endif
        *saddr = fl6.saddr;
        return dst;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
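
/*
 * Example (illustrative sketch of a tunnel transmit path, in the spirit
 * of UDP tunnel drivers; variable names are hypothetical):
 *
 *      struct in6_addr saddr;
 *      struct dst_entry *dst;
 *
 *      dst = ip6_dst_lookup_tunnel(skb, dev, net, sock, &saddr, info,
 *                                  IPPROTO_UDP, use_cache);
 *      if (IS_ERR(dst))
 *              return PTR_ERR(dst);
 *      // transmit with "saddr" as the tunnel source address
 */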

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not the first; the header
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *opt = ipc6->opt;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = sizeof(*opt);
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above --miyazawa */
        }
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < IPV6_MIN_MTU)
                return -EINVAL;
        cork->base.fragsize = mtu;
        cork->base.gso_size = ipc6->gso_size;
        cork->base.tx_flags = 0;
        cork->base.mark = ipc6->sockc.mark;
        sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

        if (dst_allfrag(xfrm_dst_path(&rt->dst)))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        cork->base.transmit_time = ipc6->sockc.transmit_time;

        return 0;
}
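
/*
 * Illustrative note: ip6_setup_cork() is half of the corked-send cycle.
 * A hypothetical datagram sender pairs it (via ip6_append_data(), which
 * calls it when the write queue is empty) with ip6_push_pending_frames():
 *
 *      err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
 *                            &ipc6, &fl6, rt, flags);
 *      if (err)
 *              ip6_flush_pending_frames(sk);
 *      else if (!(flags & MSG_MORE))
 *              err = ip6_push_pending_frames(sk);
 */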
1427
1428static int __ip6_append_data(struct sock *sk,
1429                             struct flowi6 *fl6,
1430                             struct sk_buff_head *queue,
1431                             struct inet_cork *cork,
1432                             struct inet6_cork *v6_cork,
1433                             struct page_frag *pfrag,
1434                             int getfrag(void *from, char *to, int offset,
1435                                         int len, int odd, struct sk_buff *skb),
1436                             void *from, int length, int transhdrlen,
1437                             unsigned int flags, struct ipcm6_cookie *ipc6)
1438{
1439        struct sk_buff *skb, *skb_prev = NULL;
1440        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1441        struct ubuf_info *uarg = NULL;
1442        int exthdrlen = 0;
1443        int dst_exthdrlen = 0;
1444        int hh_len;
1445        int copy;
1446        int err;
1447        int offset = 0;
1448        u32 tskey = 0;
1449        struct rt6_info *rt = (struct rt6_info *)cork->dst;
1450        struct ipv6_txoptions *opt = v6_cork->opt;
1451        int csummode = CHECKSUM_NONE;
1452        unsigned int maxnonfragsize, headersize;
1453        unsigned int wmem_alloc_delta = 0;
1454        bool paged, extra_uref = false;
1455
1456        skb = skb_peek_tail(queue);
1457        if (!skb) {
1458                exthdrlen = opt ? opt->opt_flen : 0;
1459                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1460        }
1461
1462        paged = !!cork->gso_size;
1463        mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1464        orig_mtu = mtu;
1465
1466        if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
1467            sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1468                tskey = sk->sk_tskey++;
1469
1470        hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1471
1472        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1473                        (opt ? opt->opt_nflen : 0);
1474        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1475                     sizeof(struct frag_hdr);
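            /* Worked example: with mtu == 1500 and no extension headers,
             * fragheaderlen == 40 and maxfraglen == ((1460 & ~7) + 40 - 8)
             * == 1488.  A non-final fragment thus carries 1448 bytes of
             * payload (a multiple of 8); with the 8-byte fragment header
             * added the wire packet is 40 + 8 + 1448 == 1496 <= mtu.
             */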
1476
1477        headersize = sizeof(struct ipv6hdr) +
1478                     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1479                     (dst_allfrag(&rt->dst) ?
1480                      sizeof(struct frag_hdr) : 0) +
1481                     rt->rt6i_nfheader_len;
1482
1483        /* As per RFC 7112 section 5, the entire IPv6 Header Chain must
1484         * fit in the first fragment.
1485         */
1486        if (headersize + transhdrlen > mtu)
1487                goto emsgsize;
1488
1489        if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1490            (sk->sk_protocol == IPPROTO_UDP ||
1491             sk->sk_protocol == IPPROTO_RAW)) {
1492                ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1493                                sizeof(struct ipv6hdr));
1494                goto emsgsize;
1495        }
1496
1497        if (ip6_sk_ignore_df(sk))
1498                maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1499        else
1500                maxnonfragsize = mtu;
1501
1502        if (cork->length + length > maxnonfragsize - headersize) {
1503emsgsize:
1504                pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1505                ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1506                return -EMSGSIZE;
1507        }
1508
1509        /* CHECKSUM_PARTIAL only with no extension headers and when
1510         * we are not going to fragment
1511         */
1512        if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1513            headersize == sizeof(struct ipv6hdr) &&
1514            length <= mtu - headersize &&
1515            (!(flags & MSG_MORE) || cork->gso_size) &&
1516            rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1517                csummode = CHECKSUM_PARTIAL;
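            /* The device then fills in the transport checksum; that is only
             * possible when the checksum covers a single packet (no
             * fragmentation) and no extension headers sit between the IPv6
             * header and the transport header.
             */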
1518
1519        if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
1520                uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1521                if (!uarg)
1522                        return -ENOBUFS;
1523                extra_uref = !skb_zcopy(skb);   /* only ref on new uarg */
1524                if (rt->dst.dev->features & NETIF_F_SG &&
1525                    csummode == CHECKSUM_PARTIAL) {
1526                        paged = true;
1527                } else {
1528                        uarg->zerocopy = 0;
1529                        skb_zcopy_set(skb, uarg, &extra_uref);
1530                }
1531        }
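            /* Without scatter-gather plus checksum offload the data would
             * be copied (and checksummed) by the CPU anyway, so zerocopy is
             * degraded to an ordinary copy for this send.
             */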
1532
1533        /*
1534         * Let's try using as much space as possible.
1535         * Use MTU if total length of the message fits into the MTU.
1536         * Otherwise, we need to reserve a fragment header and
1537         * fragment alignment (= 8-15 octets, in total).
1538         *
1539         * Note that we may need to "move" the data from the tail
1540         * of the buffer to the new fragment when we split
1541         * the message.
1542         *
1543         * FIXME: It may be fragmented into multiple chunks
1544         *        at once if non-fragmentable extension headers
1545         *        are too large.
1546         * --yoshfuji
1547         */
1548
1549        cork->length += length;
1550        if (!skb)
1551                goto alloc_new_skb;
1552
1553        while (length > 0) {
1554                /* Check if the remaining data fits into the current packet. */
1555                copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1556                if (copy < length)
1557                        copy = maxfraglen - skb->len;
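                /* While everything sent so far still fits in one packet the
                 * skb may grow to the full mtu; once fragmentation is
                 * needed, each fragment is capped at maxfraglen so its
                 * payload stays a multiple of 8.
                 */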
1558
1559                if (copy <= 0) {
1560                        char *data;
1561                        unsigned int datalen;
1562                        unsigned int fraglen;
1563                        unsigned int fraggap;
1564                        unsigned int alloclen, alloc_extra;
1565                        unsigned int pagedlen;
1566alloc_new_skb:
1567                        /* There's no room in the current skb */
1568                        if (skb)
1569                                fraggap = skb->len - maxfraglen;
1570                        else
1571                                fraggap = 0;
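                        /* fraggap is how far the previous skb overshoots
                         * maxfraglen; those tail bytes belong in the new
                         * fragment so that the previous fragment's payload
                         * length remains a multiple of 8.
                         */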
1572                        /* update mtu and maxfraglen if necessary */
1573                        if (!skb || !skb_prev)
1574                                ip6_append_data_mtu(&mtu, &maxfraglen,
1575                                                    fragheaderlen, skb, rt,
1576                                                    orig_mtu);
1577
1578                        skb_prev = skb;
1579
1580                        /*
1581                         * If remaining data exceeds the mtu,
1582                         * we know we need more fragment(s).
1583                         */
1584                        datalen = length + fraggap;
1585
1586                        if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1587                                datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1588                        fraglen = datalen + fragheaderlen;
1589                        pagedlen = 0;
1590
1591                        alloc_extra = hh_len;
1592                        alloc_extra += dst_exthdrlen;
1593                        alloc_extra += rt->dst.trailer_len;
1594
1595                        /* We just reserve space for the fragment header.
1596                         * Note: this may be an overallocation if the
1597                         * message (without MSG_MORE) fits into the MTU.
1598                         */
1599                        alloc_extra += sizeof(struct frag_hdr);
1600
1601                        if ((flags & MSG_MORE) &&
1602                            !(rt->dst.dev->features&NETIF_F_SG))
1603                                alloclen = mtu;
1604                        else if (!paged &&
1605                                 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1606                                  !(rt->dst.dev->features & NETIF_F_SG)))
1607                                alloclen = fraglen;
1608                        else {
1609                                alloclen = min_t(int, fraglen, MAX_HEADER);
1610                                pagedlen = fraglen - alloclen;
1611                        }
1612                        alloclen += alloc_extra;
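                        /* Three allocation strategies: a full-mtu linear skb
                         * when the device cannot do scatter-gather and more
                         * data may follow; a linear skb of exactly fraglen
                         * when that is a reasonable allocation; otherwise
                         * (paged/GSO) a small linear part of at most
                         * MAX_HEADER with the rest going into page frags.
                         */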
1613
1614                        if (datalen != length + fraggap) {
1615                                /*
1616                                 * This is not the last fragment; the
1617                                 * trailer space is regarded as data space.
1618                                 */
1619                                datalen += rt->dst.trailer_len;
1620                        }
1621
1622                        fraglen = datalen + fragheaderlen;
1623
1624                        copy = datalen - transhdrlen - fraggap - pagedlen;
1625                        if (copy < 0) {
1626                                err = -EINVAL;
1627                                goto error;
1628                        }
1629                        if (transhdrlen) {
1630                                skb = sock_alloc_send_skb(sk, alloclen,
1631                                                (flags & MSG_DONTWAIT), &err);
1632                        } else {
1633                                skb = NULL;
1634                                if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1635                                    2 * sk->sk_sndbuf)
1636                                        skb = alloc_skb(alloclen,
1637                                                        sk->sk_allocation);
1638                                if (unlikely(!skb))
1639                                        err = -ENOBUFS;
1640                        }
1641                        if (!skb)
1642                                goto error;
1643                        /*
1644                         *      Fill in the control structures
1645                         */
1646                        skb->protocol = htons(ETH_P_IPV6);
1647                        skb->ip_summed = csummode;
1648                        skb->csum = 0;
1649                        /* reserve space for the fragment and IPsec headers */
1650                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1651                                    dst_exthdrlen);
1652
1653                        /*
1654                         *      Find where to start putting bytes
1655                         */
1656                        data = skb_put(skb, fraglen - pagedlen);
1657                        skb_set_network_header(skb, exthdrlen);
1658                        data += fragheaderlen;
1659                        skb->transport_header = (skb->network_header +
1660                                                 fragheaderlen);
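                        /* If the previous skb overshot maxfraglen, move the
                         * overshoot (fraggap) and its checksum into this skb
                         * and trim the previous skb back to maxfraglen.
                         */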
1661                        if (fraggap) {
1662                                skb->csum = skb_copy_and_csum_bits(
1663                                        skb_prev, maxfraglen,
1664                                        data + transhdrlen, fraggap);
1665                                skb_prev->csum = csum_sub(skb_prev->csum,
1666                                                          skb->csum);
1667                                data += fraggap;
1668                                pskb_trim_unique(skb_prev, maxfraglen);
1669                        }
1670                        if (copy > 0 &&
1671                            getfrag(from, data + transhdrlen, offset,
1672                                    copy, fraggap, skb) < 0) {
1673                                err = -EFAULT;
1674                                kfree_skb(skb);
1675                                goto error;
1676                        }
1677
1678                        offset += copy;
1679                        length -= copy + transhdrlen;
1680                        transhdrlen = 0;
1681                        exthdrlen = 0;
1682                        dst_exthdrlen = 0;
1683
1684                        /* Only the initial fragment is timestamped */
1685                        skb_shinfo(skb)->tx_flags = cork->tx_flags;
1686                        cork->tx_flags = 0;
1687                        skb_shinfo(skb)->tskey = tskey;
1688                        tskey = 0;
1689                        skb_zcopy_set(skb, uarg, &extra_uref);
1690
1691                        if ((flags & MSG_CONFIRM) && !skb_prev)
1692                                skb_set_dst_pending_confirm(skb, 1);
1693
1694                        /*
1695                         * Put the packet on the pending queue
1696                         */
1697                        if (!skb->destructor) {
1698                                skb->destructor = sock_wfree;
1699                                skb->sk = sk;
1700                                wmem_alloc_delta += skb->truesize;
1701                        }
1702                        __skb_queue_tail(queue, skb);
1703                        continue;
1704                }
1705
1706                if (copy > length)
1707                        copy = length;
1708
1709                if (!(rt->dst.dev->features&NETIF_F_SG) &&
1710                    skb_tailroom(skb) >= copy) {
1711                        unsigned int off;
1712
1713                        off = skb->len;
1714                        if (getfrag(from, skb_put(skb, copy),
1715                                                offset, copy, off, skb) < 0) {
1716                                __skb_trim(skb, off);
1717                                err = -EFAULT;
1718                                goto error;
1719                        }
1720                } else if (!uarg || !uarg->zerocopy) {
1721                        int i = skb_shinfo(skb)->nr_frags;
1722
1723                        err = -ENOMEM;
1724                        if (!sk_page_frag_refill(sk, pfrag))
1725                                goto error;
1726
1727                        if (!skb_can_coalesce(skb, i, pfrag->page,
1728                                              pfrag->offset)) {
1729                                err = -EMSGSIZE;
1730                                if (i == MAX_SKB_FRAGS)
1731                                        goto error;
1732
1733                                __skb_fill_page_desc(skb, i, pfrag->page,
1734                                                     pfrag->offset, 0);
1735                                skb_shinfo(skb)->nr_frags = ++i;
1736                                get_page(pfrag->page);
1737                        }
1738                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
1739                        if (getfrag(from,
1740                                    page_address(pfrag->page) + pfrag->offset,
1741                                    offset, copy, skb->len, skb) < 0)
1742                                goto error_efault;
1743
1744                        pfrag->offset += copy;
1745                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1746                        skb->len += copy;
1747                        skb->data_len += copy;
1748                        skb->truesize += copy;
1749                        wmem_alloc_delta += copy;
1750                } else {
1751                        err = skb_zerocopy_iter_dgram(skb, from, copy);
1752                        if (err < 0)
1753                                goto error;
1754                }
1755                offset += copy;
1756                length -= copy;
1757        }
1758
1759        if (wmem_alloc_delta)
1760                refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1761        return 0;
1762
1763error_efault:
1764        err = -EFAULT;
1765error:
1766        net_zcopy_put_abort(uarg, extra_uref);
1767        cork->length -= length;
1768        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1769        refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1770        return err;
1771}
1772
1773int ip6_append_data(struct sock *sk,
1774                    int getfrag(void *from, char *to, int offset, int len,
1775                                int odd, struct sk_buff *skb),
1776                    void *from, int length, int transhdrlen,
1777                    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1778                    struct rt6_info *rt, unsigned int flags)
1779{
1780        struct inet_sock *inet = inet_sk(sk);
1781        struct ipv6_pinfo *np = inet6_sk(sk);
1782        int exthdrlen;
1783        int err;
1784
1785        if (flags&MSG_PROBE)
1786                return 0;
1787        if (skb_queue_empty(&sk->sk_write_queue)) {
1788                /*
1789                 * Set up for corking.
1790                 */
1791                err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1792                                     ipc6, rt, fl6);
1793                if (err)
1794                        return err;
1795
1796                exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1797                length += exthdrlen;
1798                transhdrlen += exthdrlen;
1799        } else {
1800                fl6 = &inet->cork.fl.u.ip6;
1801                transhdrlen = 0;
1802        }
1803
1804        return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1805                                 &np->cork, sk_page_frag(sk), getfrag,
1806                                 from, length, transhdrlen, flags, ipc6);
1807}
1808EXPORT_SYMBOL_GPL(ip6_append_data);
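    /*
     * Illustrative calling pattern for a corked datagram send, loosely
     * modelled on the UDPv6 send path; setup of fl6, rt, ipc6 and the
     * getfrag callback is assumed to have been done by the caller:
     *
     *	lock_sock(sk);
     *	err = ip6_append_data(sk, getfrag, msg, len,
     *			      sizeof(struct udphdr), &ipc6, &fl6, rt,
     *			      msg->msg_flags);
     *	if (err)
     *		ip6_flush_pending_frames(sk);
     *	else if (!(msg->msg_flags & MSG_MORE))
     *		err = ip6_push_pending_frames(sk);
     *	release_sock(sk);
     */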
1809
1810static void ip6_cork_release(struct inet_cork_full *cork,
1811                             struct inet6_cork *v6_cork)
1812{
1813        if (v6_cork->opt) {
1814                kfree(v6_cork->opt->dst0opt);
1815                kfree(v6_cork->opt->dst1opt);
1816                kfree(v6_cork->opt->hopopt);
1817                kfree(v6_cork->opt->srcrt);
1818                kfree(v6_cork->opt);
1819                v6_cork->opt = NULL;
1820        }
1821
1822        if (cork->base.dst) {
1823                dst_release(cork->base.dst);
1824                cork->base.dst = NULL;
1825                cork->base.flags &= ~IPCORK_ALLFRAG;
1826        }
1827        memset(&cork->fl, 0, sizeof(cork->fl));
1828}
1829
1830struct sk_buff *__ip6_make_skb(struct sock *sk,
1831                               struct sk_buff_head *queue,
1832                               struct inet_cork_full *cork,
1833                               struct inet6_cork *v6_cork)
1834{
1835        struct sk_buff *skb, *tmp_skb;
1836        struct sk_buff **tail_skb;
1837        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1838        struct ipv6_pinfo *np = inet6_sk(sk);
1839        struct net *net = sock_net(sk);
1840        struct ipv6hdr *hdr;
1841        struct ipv6_txoptions *opt = v6_cork->opt;
1842        struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1843        struct flowi6 *fl6 = &cork->fl.u.ip6;
1844        unsigned char proto = fl6->flowi6_proto;
1845
1846        skb = __skb_dequeue(queue);
1847        if (!skb)
1848                goto out;
1849        tail_skb = &(skb_shinfo(skb)->frag_list);
1850
1851        /* move skb->data from the extension header to the IP header */
1852        if (skb->data < skb_network_header(skb))
1853                __skb_pull(skb, skb_network_offset(skb));
1854        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1855                __skb_pull(tmp_skb, skb_network_header_len(skb));
1856                *tail_skb = tmp_skb;
1857                tail_skb = &(tmp_skb->next);
1858                skb->len += tmp_skb->len;
1859                skb->data_len += tmp_skb->len;
1860                skb->truesize += tmp_skb->truesize;
1861                tmp_skb->destructor = NULL;
1862                tmp_skb->sk = NULL;
1863        }
1864
1865        /* Allow local fragmentation. */
1866        skb->ignore_df = ip6_sk_ignore_df(sk);
1867
1868        *final_dst = fl6->daddr;
1869        __skb_pull(skb, skb_network_header_len(skb));
1870        if (opt && opt->opt_flen)
1871                ipv6_push_frag_opts(skb, opt, &proto);
1872        if (opt && opt->opt_nflen)
1873                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1874
1875        skb_push(skb, sizeof(struct ipv6hdr));
1876        skb_reset_network_header(skb);
1877        hdr = ipv6_hdr(skb);
1878
1879        ip6_flow_hdr(hdr, v6_cork->tclass,
1880                     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1881                                        ip6_autoflowlabel(net, np), fl6));
1882        hdr->hop_limit = v6_cork->hop_limit;
1883        hdr->nexthdr = proto;
1884        hdr->saddr = fl6->saddr;
1885        hdr->daddr = *final_dst;
1886
1887        skb->priority = sk->sk_priority;
1888        skb->mark = cork->base.mark;
1889
1890        skb->tstamp = cork->base.transmit_time;
1891
1892        skb_dst_set(skb, dst_clone(&rt->dst));
1893        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1894        if (proto == IPPROTO_ICMPV6) {
1895                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1896
1897                ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1898                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1899        }
1900
1901        ip6_cork_release(cork, v6_cork);
1902out:
1903        return skb;
1904}
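    /*
     * The per-fragment skbs queued by __ip6_append_data() are collapsed
     * here into a single skb whose tail fragments hang off
     * skb_shinfo(skb)->frag_list; ip6_fragment() can later reuse that
     * list directly as its fast fragmentation path.
     */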
1905
1906int ip6_send_skb(struct sk_buff *skb)
1907{
1908        struct net *net = sock_net(skb->sk);
1909        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1910        int err;
1911
1912        err = ip6_local_out(net, skb->sk, skb);
1913        if (err) {
1914                if (err > 0)
1915                        err = net_xmit_errno(err);
1916                if (err)
1917                        IP6_INC_STATS(net, rt->rt6i_idev,
1918                                      IPSTATS_MIB_OUTDISCARDS);
1919        }
1920
1921        return err;
1922}
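    /*
     * ip6_local_out() may return a positive NET_XMIT_* code from the
     * queueing layer; net_xmit_errno() maps those to errnos, with
     * NET_XMIT_CN (congestion notification) treated as success.
     */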
1923
1924int ip6_push_pending_frames(struct sock *sk)
1925{
1926        struct sk_buff *skb;
1927
1928        skb = ip6_finish_skb(sk);
1929        if (!skb)
1930                return 0;
1931
1932        return ip6_send_skb(skb);
1933}
1934EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1935
1936static void __ip6_flush_pending_frames(struct sock *sk,
1937                                       struct sk_buff_head *queue,
1938                                       struct inet_cork_full *cork,
1939                                       struct inet6_cork *v6_cork)
1940{
1941        struct sk_buff *skb;
1942
1943        while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1944                if (skb_dst(skb))
1945                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1946                                      IPSTATS_MIB_OUTDISCARDS);
1947                kfree_skb(skb);
1948        }
1949
1950        ip6_cork_release(cork, v6_cork);
1951}
1952
1953void ip6_flush_pending_frames(struct sock *sk)
1954{
1955        __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1956                                   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1957}
1958EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1959
1960struct sk_buff *ip6_make_skb(struct sock *sk,
1961                             int getfrag(void *from, char *to, int offset,
1962                                         int len, int odd, struct sk_buff *skb),
1963                             void *from, int length, int transhdrlen,
1964                             struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1965                             struct rt6_info *rt, unsigned int flags,
1966                             struct inet_cork_full *cork)
1967{
1968        struct inet6_cork v6_cork;
1969        struct sk_buff_head queue;
1970        int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1971        int err;
1972
1973        if (flags & MSG_PROBE)
1974                return NULL;
1975
1976        __skb_queue_head_init(&queue);
1977
1978        cork->base.flags = 0;
1979        cork->base.addr = 0;
1980        cork->base.opt = NULL;
1981        cork->base.dst = NULL;
1982        v6_cork.opt = NULL;
1983        err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
1984        if (err) {
1985                ip6_cork_release(cork, &v6_cork);
1986                return ERR_PTR(err);
1987        }
1988        if (ipc6->dontfrag < 0)
1989                ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1990
1991        err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
1992                                &current->task_frag, getfrag, from,
1993                                length + exthdrlen, transhdrlen + exthdrlen,
1994                                flags, ipc6);
1995        if (err) {
1996                __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
1997                return ERR_PTR(err);
1998        }
1999
2000        return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2001}
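    /*
     * Unlike the ip6_append_data()/ip6_push_pending_frames() pair, this
     * builds the complete datagram on a private queue with a
     * caller-supplied cork, so no append state is left on the socket
     * between calls; UDPv6 uses it for un-corked sends.
     */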
2002