linux/net/ipv6/ip6_output.c
/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

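/*
 * Local output is a three-stage pipeline: ip6_output() runs the
 * NF_INET_POST_ROUTING netfilter hook, ip6_finish_output() decides
 * whether the packet must be fragmented, and ip6_finish_output2()
 * resolves the next-hop neighbour and hands the skb to the device
 * layer.
 */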
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct in6_addr *nexthop;
        int ret;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
                    ((mroute6_socket(net, skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        net, sk, newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(net, idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

                if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
                    IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (lwtunnel_xmit_redirect(dst->lwtstate)) {
                int res = lwtunnel_xmit(skb);

                if (res < 0 || res == LWTUNNEL_XMIT_DONE)
                        return res;
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
        neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
        if (!IS_ERR(neigh)) {
                ret = dst_neigh_output(dst, neigh, skb);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

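/*
 * Fragmentation is needed when the packet exceeds the path MTU and is
 * not GSO (GSO packets are segmented to MTU size later), when the
 * route has RTAX_FEATURE_ALLFRAG set (the peer reported a link MTU
 * below IPV6_MIN_MTU, so every packet needs a fragment header), or
 * when conntrack defragmentation recorded a smaller incoming fragment
 * size in IP6CB(skb)->frag_max_size.
 */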
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(net, sk, skb, ip6_finish_output2);
        else
                return ip6_finish_output2(net, sk, skb);
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the socket
 * might still be modified by calls to skb_set_owner_w() and
 * ipv6_local_error(), which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        const struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        if (opt) {
                unsigned int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (!skb2) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        consume_skb(skb);
                        skb = skb2;
                        /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
                         * it is safe to call in our context (socket lock not held)
                         */
                        skb_set_owner_w(skb, (struct sock *)sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                                     np->autoflowlabel, fl6));

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_OUT, skb->len);

                /* if egress device is enslaved to an L3 master device pass the
                 * skb to its handler for processing
                 */
                skb = l3mdev_ip6_out((struct sock *)sk, skb);
                if (unlikely(!skb))
                        return 0;

                /* hooks should never assume socket lock is held.
                 * we promote our socket to non const
                 */
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                               net, (struct sock *)sk, skb, NULL, dst->dev,
                               dst_output);
        }

        skb->dev = dst->dev;
        /* ipv6_local_error() does not require socket lock,
         * we promote our socket to non const
         */
        ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
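/*
 * Hypothetical usage sketch (not from the original file): a transport
 * protocol that owns a route typically fills a flowi6 and calls
 * ip6_xmit() directly, along the lines of
 *
 *      struct flowi6 fl6 = { .flowi6_proto = IPPROTO_TCP,
 *                            .daddr = sk->sk_v6_daddr };
 *      err = ip6_xmit(sk, skb, &fl6, opt, np->tclass);
 *
 * The skb must already carry a dst entry (skb_dst()); ip6_xmit() reads
 * the MTU and egress device from it and performs no route lookup.
 */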
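/*
 * Sockets register for Router Alert delivery (IPV6_ROUTER_ALERT) on
 * the global ip6_ra_chain; ip6_call_ra_chain() clones the packet to
 * every matching raw socket but the last, which consumes the original.
 */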
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* A unicast neighbour discovery message destined
                         * to the proxied address must be processed locally,
                         * so pass it to the input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        return dst_output(net, sk, skb);
}

static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
        unsigned int mtu;
        struct inet6_dev *idev;

        if (dst_metric_locked(dst, RTAX_MTU)) {
                mtu = dst_metric_raw(dst, RTAX_MTU);
                if (mtu)
                        return mtu;
        }

        mtu = IPV6_MIN_MTU;
        rcu_read_lock();
        idev = __in6_dev_get(dst->dev);
        if (idev)
                mtu = idev->cnf.mtu6;
        rcu_read_unlock();

        return mtu;
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu)
                return false;

        /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb->ignore_df)
                return false;

        if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
                return false;

        return true;
}

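/*
 * ip6_forward() applies its checks in order: forwarding must be
 * enabled, the packet must be addressed to this host at L2
 * (PACKET_HOST) but must not belong to a local socket, it must pass
 * the XFRM forward policy, Router Alert packets are diverted to
 * interested raw sockets, the hop limit must allow another hop, NDP
 * proxying may redirect the packet to local input, a redirect is sent
 * when the packet would exit its ingress interface, and the packet
 * must fit the outgoing path MTU. Only then is the hop limit
 * decremented and the NF_INET_FORWARD hook run.
 */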
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        if (unlikely(skb->sk))
                goto drop;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We DO NOT do any processing on RA packets;
         *      we push them to user level AS IS, without any
         *      warranty that the application will be able to
         *      interpret them. The reason is that we cannot
         *      do anything clever here.
         *
         *      We are not the end node, so if the packet contains
         *      AH/ESP we cannot do anything with it either.
         *      Defragmentation would also be a mistake: RA packets
         *      must not be fragmented, because there is no warranty
         *      that different fragments will go along one path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                        IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* IPv6 specs say nothing about it, but it is clear that we cannot
           send redirects to source routed frames.
           We don't send redirects to frames decapsulated from IPsec.
         */
        if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_forward(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_INTOOBIGERRORS);
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                __IP6_INC_STATS(net, ip6_dst_idev(dst),
                                IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Decrementing the hop limit is delayed until after the skb COW */

        hdr->hop_limit--;

        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
        skb_copy_secmark(to, from);
}

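/*
 * Fragmentation arithmetic: once the unfragmentable header block of
 * hlen bytes and the 8-byte fragment header are accounted for, each
 * fragment may carry at most mtu - hlen - 8 bytes of payload, rounded
 * down to a multiple of 8 for every fragment but the last (the
 * fragment offset field counts 8-octet units). E.g. with a 1500 byte
 * path MTU and a plain 40 byte IPv6 header, the payload budget is
 * 1452 bytes, so non-final fragments carry 1448 bytes each.
 *
 * The fast path below re-uses an existing frag_list when its geometry
 * already matches; otherwise the slow path copies the data into newly
 * allocated fragment skbs.
 */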
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                                inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        int hroom, troom;
        __be32 frag_id;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;

        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->ignore_df && skb->len > mtu))
                goto fail_toobig;

        if (IP6CB(skb)->frag_max_size) {
                if (IP6CB(skb)->frag_max_size > mtu)
                        goto fail_toobig;

                /* don't send fragments larger than what we received */
                mtu = IP6CB(skb)->frag_max_size;
                if (mtu < IPV6_MIN_MTU)
                        mtu = IPV6_MIN_MTU;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        if (mtu < hlen + sizeof(struct frag_hdr) + 8)
                goto fail_toobig;
        mtu -= hlen + sizeof(struct frag_hdr);

        frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                                    &ipv6_hdr(skb)->saddr);

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            (err = skb_checksum_help(skb)))
                goto fail;

        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        if (skb_has_frag_list(skb)) {
                int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb) ||
                    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);

                __skb_pull(skb, hlen);
                fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                fh->identification = frag_id;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                dst_hold(&rt->dst);

                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(net, sk, skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        ip6_rt_put(rt);
                        return 0;
                }

                kfree_skb_list(frag);

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                ip6_rt_put(rt);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;
        troom = rt->dst.dev->needed_tailroom;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }

                /* Allocate buffer */
                frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                 hroom + troom, GFP_ATOMIC);
                if (!frag) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
                                     len));
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(net, sk, frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail_toobig:
        if (skb->sk && dst_allfrag(skb_dst(skb)))
                sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

        skb->dev = skb_dst(skb)->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        err = -EMSGSIZE;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}
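/*
 * ip6_rt_check() returns nonzero when the cached route can no longer
 * be trusted for fl_addr: the route is not a host route (/128) for
 * that exact address, and the socket's cached peer address (if any)
 * does not match either.
 */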
static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the unconnected
         * case is not very simple. Take into account that
         * we do not support routing by source, TOS, and
         * MSG_DONTROUTE          --ANK (980726)
         *
         * 1. ip6_rt_check(): If route was host route,
         *    check that cached destination is current.
         *    If it is network route, we still may
         *    check its validity using saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save whole address now,
         *    (because main consumer of this service
         *    is tcp, which does not have this problem),
         *    so that the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
           (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
              (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;
        int flags = 0;

        /* The correct way to handle this would be to do
         * ip6_route_get_saddr, and then ip6_route_output; however,
         * the route-specific preferred source forces the
         * ip6_route_output call _before_ ip6_route_get_saddr.
         *
         * In source specific routing (no src=any default route),
         * ip6_route_output will fail given src=any saddr, though, so
         * that's why we try it again later.
         */
        if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
                struct rt6_info *rt;
                bool had_dst = *dst != NULL;

                if (!had_dst)
                        *dst = ip6_route_output(net, sk, fl6);
                rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;

                /* If we had an erroneous initial result, pretend it
                 * never existed and let the SA-enabled version take
                 * over.
                 */
                if (!had_dst && (*dst)->error) {
                        dst_release(*dst);
                        *dst = NULL;
                }

                if (fl6->flowi6_oif)
                        flags |= RT6_LOOKUP_F_IFACE;
        }

        if (!*dst)
                *dst = ip6_route_output_flags(net, sk, fl6, flags);

        err = (*dst)->error;
        if (err)
                goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                      rt6_nexthop(rt, &fl6->daddr));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        err = (*dst)->error;
                        if (err)
                                goto out_err_release;
                }
        }
#endif

        return 0;

out_err_release:
        dst_release(*dst);
        *dst = NULL;

        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @net: network namespace of the lookup
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
                   struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
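/*
 * Hypothetical usage sketch (not from the original file): callers must
 * honour the pointer-encoded error convention documented above, e.g.
 *
 *      dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *      if (IS_ERR(dst))
 *              return PTR_ERR(dst);
 *      skb_dst_set(skb, dst);
 *
 * The result is either a valid dst or ERR_PTR(err), never NULL.
 */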

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        dst = ip6_sk_dst_check(sk, dst, fl6);
        if (!dst)
                dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

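/*
 * UFO path: with UDP fragmentation offload the whole datagram is
 * queued as a single GSO skb and segmented later. gso_size must be a
 * multiple of 8 because the IPv6 fragment offset counts 8-octet units;
 * e.g. with a 1500 byte MTU and a 40 byte fragheaderlen,
 * (1500 - 40 - 8) & ~7 = 1448 bytes of payload per fragment.
 */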
static inline int ip6_ufo_append_data(struct sock *sk,
                        struct sk_buff_head *queue,
                        int getfrag(void *from, char *to, int offset, int len,
                        int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int exthdrlen, int transhdrlen, int mtu,
                        unsigned int flags, const struct flowi6 *fl6)
{
        struct sk_buff *skb;
        int err;

        /* The network device supports UDP large send offload, so create
         * one single skb containing the complete UDP datagram.
         */
        skb = skb_peek_tail(queue);
        if (!skb) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (!skb)
                        return err;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb_set_network_header(skb, exthdrlen);

                /* initialize protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->protocol = htons(ETH_P_IPV6);
                skb->csum = 0;

                __skb_queue_tail(queue, skb);
        } else if (skb_is_gso(skb)) {
                goto append;
        }

        skb->ip_summed = CHECKSUM_PARTIAL;
        /* Specify the length of each IPv6 datagram fragment.
         * It has to be a multiple of 8.
         */
        skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
                                     sizeof(struct frag_hdr)) & ~7;
        skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
                                                         &fl6->daddr,
                                                         &fl6->saddr);

append:
        return skb_append_datato_frags(sk, skb, getfrag, from,
                                       (length - transhdrlen));
}
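/*
 * An IPv6 extension header encodes its length in 8-octet units, not
 * counting the first 8 octets, so the byte size duplicated by the
 * helpers below is (hdrlen + 1) * 8.
 */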
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                unsigned int orig_mtu)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (!skb) {
                        /* first fragment, reserve header_len */
                        *mtu = orig_mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not the first, so header
                         * space is regarded as data space.
                         */
                        *mtu = orig_mtu;
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

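/*
 * Corking keeps per-datagram transmit state alive across multiple
 * sendmsg() calls, so ip6_setup_cork() deep-copies the tx options
 * (hop-by-hop, destination options, routing header) with the socket's
 * allocation mode; the caller's originals may be freed before the
 * cork is flushed.
 */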
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
                          struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
                          struct rt6_info *rt, struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        unsigned int mtu;
        struct ipv6_txoptions *opt = ipc6->opt;

        /*
         * setup for corking
         */
        if (opt) {
                if (WARN_ON(v6_cork->opt))
                        return -EINVAL;

                v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
                if (unlikely(!v6_cork->opt))
                        return -ENOBUFS;

                v6_cork->opt->tot_len = opt->tot_len;
                v6_cork->opt->opt_flen = opt->opt_flen;
                v6_cork->opt->opt_nflen = opt->opt_nflen;

                v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                    sk->sk_allocation);
                if (opt->dst0opt && !v6_cork->opt->dst0opt)
                        return -ENOBUFS;

                v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                    sk->sk_allocation);
                if (opt->dst1opt && !v6_cork->opt->dst1opt)
                        return -ENOBUFS;

                v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                   sk->sk_allocation);
                if (opt->hopopt && !v6_cork->opt->hopopt)
                        return -ENOBUFS;

                v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                    sk->sk_allocation);
                if (opt->srcrt && !v6_cork->opt->srcrt)
                        return -ENOBUFS;

                /* need source address above miyazawa */
        }
        dst_hold(&rt->dst);
        cork->base.dst = &rt->dst;
        cork->fl.u.ip6 = *fl6;
        v6_cork->hop_limit = ipc6->hlimit;
        v6_cork->tclass = ipc6->tclass;
        if (rt->dst.flags & DST_XFRM_TUNNEL)
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      rt->dst.dev->mtu : dst_mtu(&rt->dst);
        else
                mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
        if (np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        cork->base.fragsize = mtu;
        if (dst_allfrag(rt->dst.path))
                cork->base.flags |= IPCORK_ALLFRAG;
        cork->base.length = 0;

        return 0;
}
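/*
 * __ip6_append_data() grows a queue of skbs that together form one
 * logical datagram; cork->length tracks the running total across
 * MSG_MORE calls. maxfraglen below is the largest fragment length
 * that keeps the fragmentable payload a multiple of 8: with
 * mtu = 1500 and fragheaderlen = 40,
 * ((1500 - 40) & ~7) + 40 - 8 = 1488.
 */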
static int __ip6_append_data(struct sock *sk,
                             struct flowi6 *fl6,
                             struct sk_buff_head *queue,
                             struct inet_cork *cork,
                             struct inet6_cork *v6_cork,
                             struct page_frag *pfrag,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
                             unsigned int flags, struct ipcm6_cookie *ipc6,
                             const struct sockcm_cookie *sockc)
{
        struct sk_buff *skb, *skb_prev = NULL;
        unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
        int exthdrlen = 0;
        int dst_exthdrlen = 0;
        int hh_len;
        int copy;
        int err;
        int offset = 0;
        __u8 tx_flags = 0;
        u32 tskey = 0;
        struct rt6_info *rt = (struct rt6_info *)cork->dst;
        struct ipv6_txoptions *opt = v6_cork->opt;
        int csummode = CHECKSUM_NONE;
        unsigned int maxnonfragsize, headersize;

        skb = skb_peek_tail(queue);
        if (!skb) {
                exthdrlen = opt ? opt->opt_flen : 0;
                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
        }

        mtu = cork->fragsize;
        orig_mtu = mtu;

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
                     sizeof(struct frag_hdr);

        headersize = sizeof(struct ipv6hdr) +
                     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
                     (dst_allfrag(&rt->dst) ?
                      sizeof(struct frag_hdr) : 0) +
                     rt->rt6i_nfheader_len;

        if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
            (sk->sk_protocol == IPPROTO_UDP ||
             sk->sk_protocol == IPPROTO_RAW)) {
                ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
                                sizeof(struct ipv6hdr));
                goto emsgsize;
        }

        if (ip6_sk_ignore_df(sk))
                maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
        else
                maxnonfragsize = mtu;

        if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
                ipv6_local_error(sk, EMSGSIZE, fl6,
                                 mtu - headersize +
                                 sizeof(struct ipv6hdr));
                return -EMSGSIZE;
        }

        /* CHECKSUM_PARTIAL only with no extension headers and when
         * we are not going to fragment
         */
        if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
            headersize == sizeof(struct ipv6hdr) &&
            length < mtu - headersize &&
            !(flags & MSG_MORE) &&
            rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
                csummode = CHECKSUM_PARTIAL;

        if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
                sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
                if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
                    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
                        tskey = sk->sk_tskey++;
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */
1364
1365        cork->length += length;
1366        if (((length > mtu) ||
1367             (skb && skb_is_gso(skb))) &&
1368            (sk->sk_protocol == IPPROTO_UDP) &&
1369            (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
1370            (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1371                err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1372                                          hh_len, fragheaderlen, exthdrlen,
1373                                          transhdrlen, mtu, flags, fl6);
1374                if (err)
1375                        goto error;
1376                return 0;
1377        }
1378
1379        if (!skb)
1380                goto alloc_new_skb;
1381
1382        while (length > 0) {
1383                /* Check if the remaining data fits into current packet. */
1384                copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1385                if (copy < length)
1386                        copy = maxfraglen - skb->len;
1387
1388                if (copy <= 0) {
1389                        char *data;
1390                        unsigned int datalen;
1391                        unsigned int fraglen;
1392                        unsigned int fraggap;
1393                        unsigned int alloclen;
1394alloc_new_skb:
1395                        /* There's no room in the current skb */
1396                        if (skb)
1397                                fraggap = skb->len - maxfraglen;
1398                        else
1399                                fraggap = 0;
                        /* update mtu and maxfraglen if necessary */
                        if (!skb || !skb_prev)
                                ip6_append_data_mtu(&mtu, &maxfraglen,
                                                    fragheaderlen, skb, rt,
                                                    orig_mtu);

                        skb_prev = skb;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;

                        if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
                        if ((flags & MSG_MORE) &&
                            !(rt->dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        alloclen += dst_exthdrlen;

                        if (datalen != length + fraggap) {
                                /*
                                 * this is not the last fragment, the trailer
                                 * space is regarded as data space.
                                 */
                                datalen += rt->dst.trailer_len;
                        }

                        alloclen += rt->dst.trailer_len;
                        fraglen = datalen + fragheaderlen;

                        /*
                         * We just reserve space for the fragment header.
                         * Note: this may be an overallocation if the message
                         * (without MSG_MORE) fits into the MTU.
                         */
                        alloclen += sizeof(struct frag_hdr);

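                        /* The first skb of a message (transhdrlen != 0) may
                         * block waiting for memory; later fragments are
                         * charged to the socket, capped at twice sk_sndbuf.
                         */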
                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len, 1,
                                                           sk->sk_allocation);
                                if (unlikely(!skb))
                                        err = -ENOBUFS;
                        }
                        if (!skb)
                                goto error;
                        /*
                         *      Fill in the control structures
                         */
                        skb->protocol = htons(ETH_P_IPV6);
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        /* reserve for fragmentation and ipsec header */
                        skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
                                    dst_exthdrlen);

                        /* Only the initial fragment is timestamped */
                        skb_shinfo(skb)->tx_flags = tx_flags;
                        tx_flags = 0;
                        skb_shinfo(skb)->tskey = tskey;
                        tskey = 0;

                        /*
                         *      Find where to start putting bytes
                         */
                        data = skb_put(skb, fraglen);
                        skb_set_network_header(skb, exthdrlen);
                        data += fragheaderlen;
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }
                        copy = datalen - transhdrlen - fraggap;

                        if (copy < 0) {
                                err = -EINVAL;
                                kfree_skb(skb);
                                goto error;
                        } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        dst_exthdrlen = 0;

                        /*
                         * Put the packet on the pending queue
                         */
                        __skb_queue_tail(queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

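                /* Room remains in the current skb: copy linearly when the
                 * device cannot take scatter-gather, otherwise append to a
                 * per-socket page fragment.
                 */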
                if (!(rt->dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                                offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;

                        err = -ENOMEM;
                        if (!sk_page_frag_refill(sk, pfrag))
                                goto error;

                        if (!skb_can_coalesce(skb, i, pfrag->page,
                                              pfrag->offset)) {
                                err = -EMSGSIZE;
                                if (i == MAX_SKB_FRAGS)
                                        goto error;

                                __skb_fill_page_desc(skb, i, pfrag->page,
                                                     pfrag->offset, 0);
                                skb_shinfo(skb)->nr_frags = ++i;
                                get_page(pfrag->page);
                        }
                        copy = min_t(int, copy, pfrag->size - pfrag->offset);
                        if (getfrag(from,
                                    page_address(pfrag->page) + pfrag->offset,
                                    offset, copy, skb->len, skb) < 0)
                                goto error_efault;

                        pfrag->offset += copy;
                        skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
                        skb->len += copy;
                        skb->data_len += copy;
                        skb->truesize += copy;
                        atomic_add(copy, &sk->sk_wmem_alloc);
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error_efault:
        err = -EFAULT;
error:
        cork->length -= length;
        IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
        return err;
}

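/*
 * ip6_append_data() queues user data on sk->sk_write_queue, setting up
 * cork state on first use; ip6_push_pending_frames() later turns the
 * queue into a single IPv6 packet (or fragment train) and sends it.
 */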
int ip6_append_data(struct sock *sk,
                    int getfrag(void *from, char *to, int offset, int len,
                                int odd, struct sk_buff *skb),
                    void *from, int length, int transhdrlen,
                    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
                    struct rt6_info *rt, unsigned int flags,
                    const struct sockcm_cookie *sockc)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        int exthdrlen;
        int err;

        if (flags&MSG_PROBE)
                return 0;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * Set up for corking.
                 */
                err = ip6_setup_cork(sk, &inet->cork, &np->cork,
                                     ipc6, rt, fl6);
                if (err)
                        return err;

                exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
        } else {
                fl6 = &inet->cork.fl.u.ip6;
                transhdrlen = 0;
        }

        return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
                                 &np->cork, sk_page_frag(sk), getfrag,
                                 from, length, transhdrlen, flags, ipc6, sockc);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
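
/*
 * A minimal calling sketch (illustrative only; real callers such as
 * rawv6_sendmsg() and udpv6_sendmsg() also build their transport header
 * and handle corked sends):
 *
 *	err = ip6_append_data(sk, getfrag, from, len, 0, &ipc6, &fl6,
 *			      rt, msg->msg_flags, &sockc);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 */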

static void ip6_cork_release(struct inet_cork_full *cork,
                             struct inet6_cork *v6_cork)
{
        if (v6_cork->opt) {
                kfree(v6_cork->opt->dst0opt);
                kfree(v6_cork->opt->dst1opt);
                kfree(v6_cork->opt->hopopt);
                kfree(v6_cork->opt->srcrt);
                kfree(v6_cork->opt);
                v6_cork->opt = NULL;
        }

        if (cork->base.dst) {
                dst_release(cork->base.dst);
                cork->base.dst = NULL;
                cork->base.flags &= ~IPCORK_ALLFRAG;
        }
        memset(&cork->fl, 0, sizeof(cork->fl));
}

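/*
 * __ip6_make_skb() collapses the queued skbs into one (extra skbs become
 * frag_list members), pushes the extension headers and the IPv6 header,
 * fills in addressing and stats, and releases the cork.
 */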
struct sk_buff *__ip6_make_skb(struct sock *sk,
                               struct sk_buff_head *queue,
                               struct inet_cork_full *cork,
                               struct inet6_cork *v6_cork)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct net *net = sock_net(sk);
        struct ipv6hdr *hdr;
        struct ipv6_txoptions *opt = v6_cork->opt;
        struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
        struct flowi6 *fl6 = &cork->fl.u.ip6;
        unsigned char proto = fl6->flowi6_proto;

        skb = __skb_dequeue(queue);
        if (!skb)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data from the ext header space up to the IP header */
        if (skb->data < skb_network_header(skb))
                __skb_pull(skb, skb_network_offset(skb));
        while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
                __skb_pull(tmp_skb, skb_network_header_len(skb));
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Allow local fragmentation. */
        skb->ignore_df = ip6_sk_ignore_df(sk);

        *final_dst = fl6->daddr;
        __skb_pull(skb, skb_network_header_len(skb));
        if (opt && opt->opt_flen)
                ipv6_push_frag_opts(skb, opt, &proto);
        if (opt && opt->opt_nflen)
                ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        ip6_flow_hdr(hdr, v6_cork->tclass,
                     ip6_make_flowlabel(net, skb, fl6->flowlabel,
                                        np->autoflowlabel, fl6));
        hdr->hop_limit = v6_cork->hop_limit;
        hdr->nexthdr = proto;
        hdr->saddr = fl6->saddr;
        hdr->daddr = *final_dst;

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        skb_dst_set(skb, dst_clone(&rt->dst));
        IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
        if (proto == IPPROTO_ICMPV6) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
        }

        ip6_cork_release(cork, v6_cork);
out:
        return skb;
}

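/* Hand a finished skb to the local output path, counting a discard on
 * error.
 */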
int ip6_send_skb(struct sk_buff *skb)
{
        struct net *net = sock_net(skb->sk);
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        int err;

        err = ip6_local_out(net, skb->sk, skb);
        if (err) {
                if (err > 0)
                        err = net_xmit_errno(err);
                if (err)
                        IP6_INC_STATS(net, rt->rt6i_idev,
                                      IPSTATS_MIB_OUTDISCARDS);
        }

        return err;
}

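/* Finalize whatever is pending on sk->sk_write_queue and transmit it. */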
int ip6_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb;

        skb = ip6_finish_skb(sk);
        if (!skb)
                return 0;

        return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

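/* Abort a corked send: drop everything still queued, counting each skb
 * as a discard, then release the cork state.
 */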
static void __ip6_flush_pending_frames(struct sock *sk,
                                       struct sk_buff_head *queue,
                                       struct inet_cork_full *cork,
                                       struct inet6_cork *v6_cork)
{
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(queue)) != NULL) {
                if (skb_dst(skb))
                        IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
        }

        ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
        __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
                                   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

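/*
 * ip6_make_skb() is the uncorked variant of the append/push pair: cork
 * state and the packet queue live on the stack, so the whole datagram is
 * built in a single call without touching sk->sk_write_queue.
 */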
struct sk_buff *ip6_make_skb(struct sock *sk,
                             int getfrag(void *from, char *to, int offset,
                                         int len, int odd, struct sk_buff *skb),
                             void *from, int length, int transhdrlen,
                             struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
                             struct rt6_info *rt, unsigned int flags,
                             const struct sockcm_cookie *sockc)
{
        struct inet_cork_full cork;
        struct inet6_cork v6_cork;
        struct sk_buff_head queue;
        int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
        int err;

        if (flags & MSG_PROBE)
                return NULL;

        __skb_queue_head_init(&queue);

        cork.base.flags = 0;
        cork.base.addr = 0;
        cork.base.opt = NULL;
        v6_cork.opt = NULL;
        err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
        if (err)
                return ERR_PTR(err);

        if (ipc6->dontfrag < 0)
                ipc6->dontfrag = inet6_sk(sk)->dontfrag;

        err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
                                &current->task_frag, getfrag, from,
                                length + exthdrlen, transhdrlen + exthdrlen,
                                flags, ipc6, sockc);
        if (err) {
                __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
                return ERR_PTR(err);
        }

        return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}