linux/net/ipv6/route.c
<<
>>
Prefs
   1/*
   2 *      Linux INET6 implementation
   3 *      FIB front-end.
   4 *
   5 *      Authors:
   6 *      Pedro Roque             <roque@di.fc.ul.pt>
   7 *
   8 *      This program is free software; you can redistribute it and/or
   9 *      modify it under the terms of the GNU General Public License
  10 *      as published by the Free Software Foundation; either version
  11 *      2 of the License, or (at your option) any later version.
  12 */
  13
  14/*      Changes:
  15 *
  16 *      YOSHIFUJI Hideaki @USAGI
  17 *              reworked default router selection.
  18 *              - respect outgoing interface
  19 *              - select from (probably) reachable routers (i.e.
  20 *              routers in REACHABLE, STALE, DELAY or PROBE states).
  21 *              - always select the same router if it is (probably)
  22 *              reachable.  otherwise, round-robin the list.
  23 *      Ville Nuorvala
  24 *              Fixed routing subtrees.
  25 */
  26
  27#define pr_fmt(fmt) "IPv6: " fmt
  28
  29#include <linux/capability.h>
  30#include <linux/errno.h>
  31#include <linux/export.h>
  32#include <linux/types.h>
  33#include <linux/times.h>
  34#include <linux/socket.h>
  35#include <linux/sockios.h>
  36#include <linux/net.h>
  37#include <linux/route.h>
  38#include <linux/netdevice.h>
  39#include <linux/in6.h>
  40#include <linux/mroute6.h>
  41#include <linux/init.h>
  42#include <linux/if_arp.h>
  43#include <linux/proc_fs.h>
  44#include <linux/seq_file.h>
  45#include <linux/nsproxy.h>
  46#include <linux/slab.h>
  47#include <net/net_namespace.h>
  48#include <net/snmp.h>
  49#include <net/ipv6.h>
  50#include <net/ip6_fib.h>
  51#include <net/ip6_route.h>
  52#include <net/ndisc.h>
  53#include <net/addrconf.h>
  54#include <net/tcp.h>
  55#include <linux/rtnetlink.h>
  56#include <net/dst.h>
  57#include <net/xfrm.h>
  58#include <net/netevent.h>
  59#include <net/netlink.h>
  60
  61#include <asm/uaccess.h>
  62
  63#ifdef CONFIG_SYSCTL
  64#include <linux/sysctl.h>
  65#endif
  66
  67static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
  68                                    const struct in6_addr *dest);
  69static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
  70static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
  71static unsigned int      ip6_mtu(const struct dst_entry *dst);
  72static struct dst_entry *ip6_negative_advice(struct dst_entry *);
  73static void             ip6_dst_destroy(struct dst_entry *);
  74static void             ip6_dst_ifdown(struct dst_entry *,
  75                                       struct net_device *dev, int how);
  76static int               ip6_dst_gc(struct dst_ops *ops);
  77
  78static int              ip6_pkt_discard(struct sk_buff *skb);
  79static int              ip6_pkt_discard_out(struct sk_buff *skb);
  80static void             ip6_link_failure(struct sk_buff *skb);
  81static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
  82
  83#ifdef CONFIG_IPV6_ROUTE_INFO
  84static struct rt6_info *rt6_add_route_info(struct net *net,
  85                                           const struct in6_addr *prefix, int prefixlen,
  86                                           const struct in6_addr *gwaddr, int ifindex,
  87                                           unsigned int pref);
  88static struct rt6_info *rt6_get_route_info(struct net *net,
  89                                           const struct in6_addr *prefix, int prefixlen,
  90                                           const struct in6_addr *gwaddr, int ifindex);
  91#endif
  92
  93static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
  94{
  95        struct rt6_info *rt = (struct rt6_info *) dst;
  96        struct inet_peer *peer;
  97        u32 *p = NULL;
  98
  99        if (!(rt->dst.flags & DST_HOST))
 100                return NULL;
 101
 102        if (!rt->rt6i_peer)
 103                rt6_bind_peer(rt, 1);
 104
 105        peer = rt->rt6i_peer;
 106        if (peer) {
 107                u32 *old_p = __DST_METRICS_PTR(old);
 108                unsigned long prev, new;
 109
 110                p = peer->metrics;
 111                if (inet_metrics_new(peer))
 112                        memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 113
 114                new = (unsigned long) p;
 115                prev = cmpxchg(&dst->_metrics, old, new);
 116
 117                if (prev != old) {
 118                        p = __DST_METRICS_PTR(prev);
 119                        if (prev & DST_METRICS_READ_ONLY)
 120                                p = NULL;
 121                }
 122        }
 123        return p;
 124}
 125
 126static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
 127{
 128        struct in6_addr *p = &rt->rt6i_gateway;
 129
 130        if (!ipv6_addr_any(p))
 131                return (const void *) p;
 132        return daddr;
 133}
 134
 135static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
 136{
 137        struct rt6_info *rt = (struct rt6_info *) dst;
 138        struct neighbour *n;
 139
 140        daddr = choose_neigh_daddr(rt, daddr);
 141        n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
 142        if (n)
 143                return n;
 144        return neigh_create(&nd_tbl, daddr, dst->dev);
 145}
 146
 147static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
 148{
 149        struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
 150        if (!n) {
 151                n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
 152                if (IS_ERR(n))
 153                        return PTR_ERR(n);
 154        }
 155        dst_set_neighbour(&rt->dst, n);
 156
 157        return 0;
 158}
 159
 160static struct dst_ops ip6_dst_ops_template = {
 161        .family                 =       AF_INET6,
 162        .protocol               =       cpu_to_be16(ETH_P_IPV6),
 163        .gc                     =       ip6_dst_gc,
 164        .gc_thresh              =       1024,
 165        .check                  =       ip6_dst_check,
 166        .default_advmss         =       ip6_default_advmss,
 167        .mtu                    =       ip6_mtu,
 168        .cow_metrics            =       ipv6_cow_metrics,
 169        .destroy                =       ip6_dst_destroy,
 170        .ifdown                 =       ip6_dst_ifdown,
 171        .negative_advice        =       ip6_negative_advice,
 172        .link_failure           =       ip6_link_failure,
 173        .update_pmtu            =       ip6_rt_update_pmtu,
 174        .local_out              =       __ip6_local_out,
 175        .neigh_lookup           =       ip6_neigh_lookup,
 176};
 177
 178static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
 179{
 180        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
 181
 182        return mtu ? : dst->dev->mtu;
 183}
 184
 185static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
 186{
 187}
 188
 189static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
 190                                         unsigned long old)
 191{
 192        return NULL;
 193}
 194
 195static struct dst_ops ip6_dst_blackhole_ops = {
 196        .family                 =       AF_INET6,
 197        .protocol               =       cpu_to_be16(ETH_P_IPV6),
 198        .destroy                =       ip6_dst_destroy,
 199        .check                  =       ip6_dst_check,
 200        .mtu                    =       ip6_blackhole_mtu,
 201        .default_advmss         =       ip6_default_advmss,
 202        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
 203        .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
 204        .neigh_lookup           =       ip6_neigh_lookup,
 205};
 206
 207static const u32 ip6_template_metrics[RTAX_MAX] = {
 208        [RTAX_HOPLIMIT - 1] = 255,
 209};
 210
 211static struct rt6_info ip6_null_entry_template = {
 212        .dst = {
 213                .__refcnt       = ATOMIC_INIT(1),
 214                .__use          = 1,
 215                .obsolete       = -1,
 216                .error          = -ENETUNREACH,
 217                .input          = ip6_pkt_discard,
 218                .output         = ip6_pkt_discard_out,
 219        },
 220        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 221        .rt6i_protocol  = RTPROT_KERNEL,
 222        .rt6i_metric    = ~(u32) 0,
 223        .rt6i_ref       = ATOMIC_INIT(1),
 224};
 225
 226#ifdef CONFIG_IPV6_MULTIPLE_TABLES
 227
 228static int ip6_pkt_prohibit(struct sk_buff *skb);
 229static int ip6_pkt_prohibit_out(struct sk_buff *skb);
 230
 231static struct rt6_info ip6_prohibit_entry_template = {
 232        .dst = {
 233                .__refcnt       = ATOMIC_INIT(1),
 234                .__use          = 1,
 235                .obsolete       = -1,
 236                .error          = -EACCES,
 237                .input          = ip6_pkt_prohibit,
 238                .output         = ip6_pkt_prohibit_out,
 239        },
 240        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 241        .rt6i_protocol  = RTPROT_KERNEL,
 242        .rt6i_metric    = ~(u32) 0,
 243        .rt6i_ref       = ATOMIC_INIT(1),
 244};
 245
 246static struct rt6_info ip6_blk_hole_entry_template = {
 247        .dst = {
 248                .__refcnt       = ATOMIC_INIT(1),
 249                .__use          = 1,
 250                .obsolete       = -1,
 251                .error          = -EINVAL,
 252                .input          = dst_discard,
 253                .output         = dst_discard,
 254        },
 255        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 256        .rt6i_protocol  = RTPROT_KERNEL,
 257        .rt6i_metric    = ~(u32) 0,
 258        .rt6i_ref       = ATOMIC_INIT(1),
 259};
 260
 261#endif
 262
 263/* allocate dst with ip6_dst_ops */
 264static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
 265                                             struct net_device *dev,
 266                                             int flags)
 267{
 268        struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
 269
 270        if (rt)
 271                memset(&rt->rt6i_table, 0,
 272                       sizeof(*rt) - sizeof(struct dst_entry));
 273
 274        return rt;
 275}
 276
 277static void ip6_dst_destroy(struct dst_entry *dst)
 278{
 279        struct rt6_info *rt = (struct rt6_info *)dst;
 280        struct inet6_dev *idev = rt->rt6i_idev;
 281        struct inet_peer *peer = rt->rt6i_peer;
 282
 283        if (!(rt->dst.flags & DST_HOST))
 284                dst_destroy_metrics_generic(dst);
 285
 286        if (idev) {
 287                rt->rt6i_idev = NULL;
 288                in6_dev_put(idev);
 289        }
 290
 291        if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
 292                dst_release(dst->from);
 293
 294        if (peer) {
 295                rt->rt6i_peer = NULL;
 296                inet_putpeer(peer);
 297        }
 298}
 299
 300static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
 301
 302static u32 rt6_peer_genid(void)
 303{
 304        return atomic_read(&__rt6_peer_genid);
 305}
 306
 307void rt6_bind_peer(struct rt6_info *rt, int create)
 308{
 309        struct inet_peer *peer;
 310
 311        peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
 312        if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
 313                inet_putpeer(peer);
 314        else
 315                rt->rt6i_peer_genid = rt6_peer_genid();
 316}
 317
 318static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 319                           int how)
 320{
 321        struct rt6_info *rt = (struct rt6_info *)dst;
 322        struct inet6_dev *idev = rt->rt6i_idev;
 323        struct net_device *loopback_dev =
 324                dev_net(dev)->loopback_dev;
 325
 326        if (dev != loopback_dev && idev && idev->dev == dev) {
 327                struct inet6_dev *loopback_idev =
 328                        in6_dev_get(loopback_dev);
 329                if (loopback_idev) {
 330                        rt->rt6i_idev = loopback_idev;
 331                        in6_dev_put(idev);
 332                }
 333        }
 334}
 335
 336static bool rt6_check_expired(const struct rt6_info *rt)
 337{
 338        struct rt6_info *ort = NULL;
 339
 340        if (rt->rt6i_flags & RTF_EXPIRES) {
 341                if (time_after(jiffies, rt->dst.expires))
 342                        return true;
 343        } else if (rt->dst.from) {
 344                ort = (struct rt6_info *) rt->dst.from;
 345                return (ort->rt6i_flags & RTF_EXPIRES) &&
 346                        time_after(jiffies, ort->dst.expires);
 347        }
 348        return false;
 349}
 350
 351static bool rt6_need_strict(const struct in6_addr *daddr)
 352{
 353        return ipv6_addr_type(daddr) &
 354                (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 355}
 356
 357/*
 358 *      Route lookup. Any table->tb6_lock is implied.
 359 */
 360
 361static inline struct rt6_info *rt6_device_match(struct net *net,
 362                                                    struct rt6_info *rt,
 363                                                    const struct in6_addr *saddr,
 364                                                    int oif,
 365                                                    int flags)
 366{
 367        struct rt6_info *local = NULL;
 368        struct rt6_info *sprt;
 369
 370        if (!oif && ipv6_addr_any(saddr))
 371                goto out;
 372
 373        for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
 374                struct net_device *dev = sprt->dst.dev;
 375
 376                if (oif) {
 377                        if (dev->ifindex == oif)
 378                                return sprt;
 379                        if (dev->flags & IFF_LOOPBACK) {
 380                                if (!sprt->rt6i_idev ||
 381                                    sprt->rt6i_idev->dev->ifindex != oif) {
 382                                        if (flags & RT6_LOOKUP_F_IFACE && oif)
 383                                                continue;
 384                                        if (local && (!oif ||
 385                                                      local->rt6i_idev->dev->ifindex == oif))
 386                                                continue;
 387                                }
 388                                local = sprt;
 389                        }
 390                } else {
 391                        if (ipv6_chk_addr(net, saddr, dev,
 392                                          flags & RT6_LOOKUP_F_IFACE))
 393                                return sprt;
 394                }
 395        }
 396
 397        if (oif) {
 398                if (local)
 399                        return local;
 400
 401                if (flags & RT6_LOOKUP_F_IFACE)
 402                        return net->ipv6.ip6_null_entry;
 403        }
 404out:
 405        return rt;
 406}
 407
 408#ifdef CONFIG_IPV6_ROUTER_PREF
 409static void rt6_probe(struct rt6_info *rt)
 410{
 411        struct neighbour *neigh;
 412        /*
 413         * Okay, this does not seem to be appropriate
 414         * for now, however, we need to check if it
 415         * is really so; aka Router Reachability Probing.
 416         *
 417         * Router Reachability Probe MUST be rate-limited
 418         * to no more than one per minute.
 419         */
 420        rcu_read_lock();
 421        neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
 422        if (!neigh || (neigh->nud_state & NUD_VALID))
 423                goto out;
 424        read_lock_bh(&neigh->lock);
 425        if (!(neigh->nud_state & NUD_VALID) &&
 426            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
 427                struct in6_addr mcaddr;
 428                struct in6_addr *target;
 429
 430                neigh->updated = jiffies;
 431                read_unlock_bh(&neigh->lock);
 432
 433                target = (struct in6_addr *)&neigh->primary_key;
 434                addrconf_addr_solict_mult(target, &mcaddr);
 435                ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
 436        } else {
 437                read_unlock_bh(&neigh->lock);
 438        }
 439out:
 440        rcu_read_unlock();
 441}
 442#else
 443static inline void rt6_probe(struct rt6_info *rt)
 444{
 445}
 446#endif
 447
 448/*
 449 * Default Router Selection (RFC 2461 6.3.6)
 450 */
 451static inline int rt6_check_dev(struct rt6_info *rt, int oif)
 452{
 453        struct net_device *dev = rt->dst.dev;
 454        if (!oif || dev->ifindex == oif)
 455                return 2;
 456        if ((dev->flags & IFF_LOOPBACK) &&
 457            rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
 458                return 1;
 459        return 0;
 460}
 461
 462static inline int rt6_check_neigh(struct rt6_info *rt)
 463{
 464        struct neighbour *neigh;
 465        int m;
 466
 467        rcu_read_lock();
 468        neigh = dst_get_neighbour_noref(&rt->dst);
 469        if (rt->rt6i_flags & RTF_NONEXTHOP ||
 470            !(rt->rt6i_flags & RTF_GATEWAY))
 471                m = 1;
 472        else if (neigh) {
 473                read_lock_bh(&neigh->lock);
 474                if (neigh->nud_state & NUD_VALID)
 475                        m = 2;
 476#ifdef CONFIG_IPV6_ROUTER_PREF
 477                else if (neigh->nud_state & NUD_FAILED)
 478                        m = 0;
 479#endif
 480                else
 481                        m = 1;
 482                read_unlock_bh(&neigh->lock);
 483        } else
 484                m = 0;
 485        rcu_read_unlock();
 486        return m;
 487}
 488
 489static int rt6_score_route(struct rt6_info *rt, int oif,
 490                           int strict)
 491{
 492        int m, n;
 493
 494        m = rt6_check_dev(rt, oif);
 495        if (!m && (strict & RT6_LOOKUP_F_IFACE))
 496                return -1;
 497#ifdef CONFIG_IPV6_ROUTER_PREF
 498        m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
 499#endif
 500        n = rt6_check_neigh(rt);
 501        if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
 502                return -1;
 503        return m;
 504}
 505
 506static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 507                                   int *mpri, struct rt6_info *match)
 508{
 509        int m;
 510
 511        if (rt6_check_expired(rt))
 512                goto out;
 513
 514        m = rt6_score_route(rt, oif, strict);
 515        if (m < 0)
 516                goto out;
 517
 518        if (m > *mpri) {
 519                if (strict & RT6_LOOKUP_F_REACHABLE)
 520                        rt6_probe(match);
 521                *mpri = m;
 522                match = rt;
 523        } else if (strict & RT6_LOOKUP_F_REACHABLE) {
 524                rt6_probe(rt);
 525        }
 526
 527out:
 528        return match;
 529}
 530
 531static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 532                                     struct rt6_info *rr_head,
 533                                     u32 metric, int oif, int strict)
 534{
 535        struct rt6_info *rt, *match;
 536        int mpri = -1;
 537
 538        match = NULL;
 539        for (rt = rr_head; rt && rt->rt6i_metric == metric;
 540             rt = rt->dst.rt6_next)
 541                match = find_match(rt, oif, strict, &mpri, match);
 542        for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
 543             rt = rt->dst.rt6_next)
 544                match = find_match(rt, oif, strict, &mpri, match);
 545
 546        return match;
 547}
 548
 549static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
 550{
 551        struct rt6_info *match, *rt0;
 552        struct net *net;
 553
 554        rt0 = fn->rr_ptr;
 555        if (!rt0)
 556                fn->rr_ptr = rt0 = fn->leaf;
 557
 558        match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
 559
 560        if (!match &&
 561            (strict & RT6_LOOKUP_F_REACHABLE)) {
 562                struct rt6_info *next = rt0->dst.rt6_next;
 563
 564                /* no entries matched; do round-robin */
 565                if (!next || next->rt6i_metric != rt0->rt6i_metric)
 566                        next = fn->leaf;
 567
 568                if (next != rt0)
 569                        fn->rr_ptr = next;
 570        }
 571
 572        net = dev_net(rt0->dst.dev);
 573        return match ? match : net->ipv6.ip6_null_entry;
 574}
 575
 576#ifdef CONFIG_IPV6_ROUTE_INFO
 577int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 578                  const struct in6_addr *gwaddr)
 579{
 580        struct net *net = dev_net(dev);
 581        struct route_info *rinfo = (struct route_info *) opt;
 582        struct in6_addr prefix_buf, *prefix;
 583        unsigned int pref;
 584        unsigned long lifetime;
 585        struct rt6_info *rt;
 586
 587        if (len < sizeof(struct route_info)) {
 588                return -EINVAL;
 589        }
 590
 591        /* Sanity check for prefix_len and length */
 592        if (rinfo->length > 3) {
 593                return -EINVAL;
 594        } else if (rinfo->prefix_len > 128) {
 595                return -EINVAL;
 596        } else if (rinfo->prefix_len > 64) {
 597                if (rinfo->length < 2) {
 598                        return -EINVAL;
 599                }
 600        } else if (rinfo->prefix_len > 0) {
 601                if (rinfo->length < 1) {
 602                        return -EINVAL;
 603                }
 604        }
 605
 606        pref = rinfo->route_pref;
 607        if (pref == ICMPV6_ROUTER_PREF_INVALID)
 608                return -EINVAL;
 609
 610        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
 611
 612        if (rinfo->length == 3)
 613                prefix = (struct in6_addr *)rinfo->prefix;
 614        else {
 615                /* this function is safe */
 616                ipv6_addr_prefix(&prefix_buf,
 617                                 (struct in6_addr *)rinfo->prefix,
 618                                 rinfo->prefix_len);
 619                prefix = &prefix_buf;
 620        }
 621
 622        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
 623                                dev->ifindex);
 624
 625        if (rt && !lifetime) {
 626                ip6_del_rt(rt);
 627                rt = NULL;
 628        }
 629
 630        if (!rt && lifetime)
 631                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
 632                                        pref);
 633        else if (rt)
 634                rt->rt6i_flags = RTF_ROUTEINFO |
 635                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 636
 637        if (rt) {
 638                if (!addrconf_finite_timeout(lifetime))
 639                        rt6_clean_expires(rt);
 640                else
 641                        rt6_set_expires(rt, jiffies + HZ * lifetime);
 642
 643                dst_release(&rt->dst);
 644        }
 645        return 0;
 646}
 647#endif
 648
  /*
   * BACKTRACK(__net, saddr) - retry a lookup further up the fib6 tree.
   *
   * Does nothing unless the enclosing function's "rt" is __net's null entry.
   * Otherwise it climbs from the enclosing function's "fn":
   *   - at a node flagged RTN_TL_ROOT it gives up and jumps to "out";
   *   - if the parent has a subtree (FIB6_SUBTREE) that is not the node we
   *     came from, "fn" becomes the result of looking saddr up in it;
   *   - otherwise "fn" becomes the parent;
   *   - as soon as "fn" is flagged RTN_RTINFO it jumps to "restart".
   *
   * The macro is not self-contained: the caller must provide the variables
   * "rt" and "fn" and the labels "out" and "restart".
   */
  649#define BACKTRACK(__net, saddr)                 \
  650do { \
  651        if (rt == __net->ipv6.ip6_null_entry) { \
  652                struct fib6_node *pn; \
  653                while (1) { \
  654                        if (fn->fn_flags & RTN_TL_ROOT) \
  655                                goto out; \
  656                        pn = fn->parent; \
  657                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
  658                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
  659                        else \
  660                                fn = pn; \
  661                        if (fn->fn_flags & RTN_RTINFO) \
  662                                goto restart; \
  663                } \
  664        } \
  665} while (0)
 666
 667static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 668                                             struct fib6_table *table,
 669                                             struct flowi6 *fl6, int flags)
 670{
 671        struct fib6_node *fn;
 672        struct rt6_info *rt;
 673
 674        read_lock_bh(&table->tb6_lock);
 675        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 676restart:
 677        rt = fn->leaf;
 678        rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
 679        BACKTRACK(net, &fl6->saddr);
 680out:
 681        dst_use(&rt->dst, jiffies);
 682        read_unlock_bh(&table->tb6_lock);
 683        return rt;
 684
 685}
 686
 687struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
 688                                    int flags)
 689{
 690        return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
 691}
 692EXPORT_SYMBOL_GPL(ip6_route_lookup);
 693
 694struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
 695                            const struct in6_addr *saddr, int oif, int strict)
 696{
 697        struct flowi6 fl6 = {
 698                .flowi6_oif = oif,
 699                .daddr = *daddr,
 700        };
 701        struct dst_entry *dst;
 702        int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
 703
 704        if (saddr) {
 705                memcpy(&fl6.saddr, saddr, sizeof(*saddr));
 706                flags |= RT6_LOOKUP_F_HAS_SADDR;
 707        }
 708
 709        dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
 710        if (dst->error == 0)
 711                return (struct rt6_info *) dst;
 712
 713        dst_release(dst);
 714
 715        return NULL;
 716}
 717
 718EXPORT_SYMBOL(rt6_lookup);
 719
 720/* ip6_ins_rt is called with FREE table->tb6_lock.
 721   It takes new route entry, the addition fails by any reason the
 722   route is freed. In any case, if caller does not hold it, it may
 723   be destroyed.
 724 */
 725
 726static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
 727{
 728        int err;
 729        struct fib6_table *table;
 730
 731        table = rt->rt6i_table;
 732        write_lock_bh(&table->tb6_lock);
 733        err = fib6_add(&table->tb6_root, rt, info);
 734        write_unlock_bh(&table->tb6_lock);
 735
 736        return err;
 737}
 738
 739int ip6_ins_rt(struct rt6_info *rt)
 740{
 741        struct nl_info info = {
 742                .nl_net = dev_net(rt->dst.dev),
 743        };
 744        return __ip6_ins_rt(rt, &info);
 745}
 746
 747static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
 748                                      const struct in6_addr *daddr,
 749                                      const struct in6_addr *saddr)
 750{
 751        struct rt6_info *rt;
 752
 753        /*
 754         *      Clone the route.
 755         */
 756
 757        rt = ip6_rt_copy(ort, daddr);
 758
 759        if (rt) {
 760                int attempts = !in_softirq();
 761
 762                if (!(rt->rt6i_flags & RTF_GATEWAY)) {
 763                        if (ort->rt6i_dst.plen != 128 &&
 764                            ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
 765                                rt->rt6i_flags |= RTF_ANYCAST;
 766                        rt->rt6i_gateway = *daddr;
 767                }
 768
 769                rt->rt6i_flags |= RTF_CACHE;
 770
 771#ifdef CONFIG_IPV6_SUBTREES
 772                if (rt->rt6i_src.plen && saddr) {
 773                        rt->rt6i_src.addr = *saddr;
 774                        rt->rt6i_src.plen = 128;
 775                }
 776#endif
 777
 778        retry:
 779                if (rt6_bind_neighbour(rt, rt->dst.dev)) {
 780                        struct net *net = dev_net(rt->dst.dev);
 781                        int saved_rt_min_interval =
 782                                net->ipv6.sysctl.ip6_rt_gc_min_interval;
 783                        int saved_rt_elasticity =
 784                                net->ipv6.sysctl.ip6_rt_gc_elasticity;
 785
 786                        if (attempts-- > 0) {
 787                                net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
 788                                net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
 789
 790                                ip6_dst_gc(&net->ipv6.ip6_dst_ops);
 791
 792                                net->ipv6.sysctl.ip6_rt_gc_elasticity =
 793                                        saved_rt_elasticity;
 794                                net->ipv6.sysctl.ip6_rt_gc_min_interval =
 795                                        saved_rt_min_interval;
 796                                goto retry;
 797                        }
 798
 799                        net_warn_ratelimited("Neighbour table overflow\n");
 800                        dst_free(&rt->dst);
 801                        return NULL;
 802                }
 803        }
 804
 805        return rt;
 806}
 807
 808static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
 809                                        const struct in6_addr *daddr)
 810{
 811        struct rt6_info *rt = ip6_rt_copy(ort, daddr);
 812
 813        if (rt) {
 814                rt->rt6i_flags |= RTF_CACHE;
 815                dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
 816        }
 817        return rt;
 818}
 819
  /*
   * Core per-table route resolution shared by the input and output paths.
   *
   * @net:   network namespace
   * @table: fib6 table to search
   * @oif:   interface index handed to rt6_select() (the callers in this
   *         file pass the flow's iif for input and oif for output)
   * @fl6:   flow being routed
   * @flags: RT6_LOOKUP_F_* flags; only RT6_LOOKUP_F_IFACE is used here
   *
   * The first pass restricts selection with RT6_LOOKUP_F_REACHABLE unless
   * devconf_all->forwarding is set.  A lookup that ends on the null entry
   * or on an RTF_CACHE route while that restriction is still active drops
   * the restriction and repeats the lookup once.
   *
   * A non-cache result is turned into a cache entry outside the table lock
   * (rt6_alloc_cow() or rt6_alloc_clone()) and inserted with ip6_ins_rt().
   * If that fails the whole lookup is redone, bounded by "attempts".
   *
   * Returns a route with a reference held for the caller; lastuse and
   * __use of the returned dst are updated.
   *
   * The labels "restart" and "out" and the variables "rt" and "fn" are
   * required by BACKTRACK.
   */
  820static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
  821                                      struct flowi6 *fl6, int flags)
  822{
  823        struct fib6_node *fn;
  824        struct rt6_info *rt, *nrt;
  825        int strict = 0;
             /* upper bound on clone-and-insert rounds, see --attempts below */
  826        int attempts = 3;
  827        int err;
  828        int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
  829
  830        strict |= flags & RT6_LOOKUP_F_IFACE;
  831
  832relookup:
  833        read_lock_bh(&table->tb6_lock);
  834
  835restart_2:
  836        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
  837
  838restart:
  839        rt = rt6_select(fn, oif, strict | reachable);
  840
  841        BACKTRACK(net, &fl6->saddr);
             /* nothing to clone for the null entry or an existing cache route */
  842        if (rt == net->ipv6.ip6_null_entry ||
  843            rt->rt6i_flags & RTF_CACHE)
  844                goto out;
  845
             /* pin the route so it can be used after the table lock is dropped */
  846        dst_hold(&rt->dst);
  847        read_unlock_bh(&table->tb6_lock);
  848
             /*
              * No neighbour bound yet (and a next hop is expected): clone and
              * bind one.  Otherwise non-host routes get a plain clone, and
              * host routes are returned as they are.
              */
  849        if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
  850                nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
  851        else if (!(rt->dst.flags & DST_HOST))
  852                nrt = rt6_alloc_clone(rt, &fl6->daddr);
  853        else
  854                goto out2;
  855
             /* switch from the original to the clone, or to the null entry */
  856        dst_release(&rt->dst);
  857        rt = nrt ? : net->ipv6.ip6_null_entry;
  858
  859        dst_hold(&rt->dst);
  860        if (nrt) {
  861                err = ip6_ins_rt(nrt);
  862                if (!err)
  863                        goto out2;
  864        }
  865
             /* allocation or insertion failed: give up once attempts run out */
  866        if (--attempts <= 0)
  867                goto out2;
  868
  869        /*
  870         * Race condition! In the gap, when table->tb6_lock was
  871         * released someone could insert this route.  Relookup.
  872         */
  873        dst_release(&rt->dst);
  874        goto relookup;
  875
  876out:
             /* still restricted to reachable routers: retry once without */
  877        if (reachable) {
  878                reachable = 0;
  879                goto restart_2;
  880        }
  881        dst_hold(&rt->dst);
  882        read_unlock_bh(&table->tb6_lock);
  883out2:
  884        rt->dst.lastuse = jiffies;
  885        rt->dst.__use++;
  886
  887        return rt;
  888}
 889
 890static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
 891                                            struct flowi6 *fl6, int flags)
 892{
 893        return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
 894}
 895
 896static struct dst_entry *ip6_route_input_lookup(struct net *net,
 897                                                struct net_device *dev,
 898                                                struct flowi6 *fl6, int flags)
 899{
 900        if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
 901                flags |= RT6_LOOKUP_F_IFACE;
 902
 903        return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
 904}
 905
 906void ip6_route_input(struct sk_buff *skb)
 907{
 908        const struct ipv6hdr *iph = ipv6_hdr(skb);
 909        struct net *net = dev_net(skb->dev);
 910        int flags = RT6_LOOKUP_F_HAS_SADDR;
 911        struct flowi6 fl6 = {
 912                .flowi6_iif = skb->dev->ifindex,
 913                .daddr = iph->daddr,
 914                .saddr = iph->saddr,
 915                .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
 916                .flowi6_mark = skb->mark,
 917                .flowi6_proto = iph->nexthdr,
 918        };
 919
 920        skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
 921}
 922
 923static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
 924                                             struct flowi6 *fl6, int flags)
 925{
 926        return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
 927}
 928
 929struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
 930                                    struct flowi6 *fl6)
 931{
 932        int flags = 0;
 933
 934        if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
 935                flags |= RT6_LOOKUP_F_IFACE;
 936
 937        if (!ipv6_addr_any(&fl6->saddr))
 938                flags |= RT6_LOOKUP_F_HAS_SADDR;
 939        else if (sk)
 940                flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
 941
 942        return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
 943}
 944
 945EXPORT_SYMBOL(ip6_route_output);
 946
 /*
  * Build a "blackhole" copy of @dst_orig: a dst entry allocated from
  * ip6_dst_blackhole_ops whose input and output handlers are both
  * dst_discard, carrying over the original's device, metrics, idev,
  * gateway, flags and destination (and, with CONFIG_IPV6_SUBTREES, source)
  * keys.
  *
  * The reference on @dst_orig is always dropped, on success and on failure.
  *
  * Returns the new dst entry, or ERR_PTR(-ENOMEM) if allocation failed.
  */
 947struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
 948{
 949        struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
 950        struct dst_entry *new = NULL;
 951
 952        rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
 953        if (rt) {
                /*
                 * Zero everything from rt6i_table to the end of the structure.
                 * NOTE(review): the length assumes rt6i_table is the first
                 * member after the embedded dst_entry - confirm against the
                 * struct rt6_info layout.
                 */
 954                memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
 955
 956                new = &rt->dst;
 957
 958                new->__use = 1;
 959                new->input = dst_discard;
 960                new->output = dst_discard;
 961
                /* share read-only metrics, otherwise take a private copy */
 962                if (dst_metrics_read_only(&ort->dst))
 963                        new->_metrics = ort->dst._metrics;
 964                else
 965                        dst_copy_metrics(new, &ort->dst);
                /* the copy gets its own reference on the inet6 device */
 966                rt->rt6i_idev = ort->rt6i_idev;
 967                if (rt->rt6i_idev)
 968                        in6_dev_hold(rt->rt6i_idev);
 969
 970                rt->rt6i_gateway = ort->rt6i_gateway;
 971                rt->rt6i_flags = ort->rt6i_flags;
 972                rt6_clean_expires(rt);
 973                rt->rt6i_metric = 0;
 974
 975                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
 976#ifdef CONFIG_IPV6_SUBTREES
 977                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
 978#endif
 979
                /*
                 * NOTE(review): dst_free() is called on an entry that is
                 * still returned to the caller; presumably this only queues
                 * it for destruction once the last reference is gone -
                 * confirm against dst_free().
                 */
 980                dst_free(new);
 981        }
 982
 983        dst_release(dst_orig);
 984        return new ? new : ERR_PTR(-ENOMEM);
 985}
 986
 987/*
 988 *      Destination cache support functions
 989 */
 990
 991static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 992{
 993        struct rt6_info *rt;
 994
 995        rt = (struct rt6_info *) dst;
 996
 997        if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
 998                if (rt->rt6i_peer_genid != rt6_peer_genid()) {
 999                        if (!rt->rt6i_peer)
1000                                rt6_bind_peer(rt, 0);
1001                        rt->rt6i_peer_genid = rt6_peer_genid();
1002                }
1003                return dst;
1004        }
1005        return NULL;
1006}
1007
1008static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1009{
1010        struct rt6_info *rt = (struct rt6_info *) dst;
1011
1012        if (rt) {
1013                if (rt->rt6i_flags & RTF_CACHE) {
1014                        if (rt6_check_expired(rt)) {
1015                                ip6_del_rt(rt);
1016                                dst = NULL;
1017                        }
1018                } else {
1019                        dst_release(dst);
1020                        dst = NULL;
1021                }
1022        }
1023        return dst;
1024}
1025
1026static void ip6_link_failure(struct sk_buff *skb)
1027{
1028        struct rt6_info *rt;
1029
1030        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1031
1032        rt = (struct rt6_info *) skb_dst(skb);
1033        if (rt) {
1034                if (rt->rt6i_flags & RTF_CACHE)
1035                        rt6_update_expires(rt, 0);
1036                else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1037                        rt->rt6i_node->fn_sernum = -1;
1038        }
1039}
1040
1041static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1042{
1043        struct rt6_info *rt6 = (struct rt6_info*)dst;
1044
1045        if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1046                rt6->rt6i_flags |= RTF_MODIFIED;
1047                if (mtu < IPV6_MIN_MTU) {
1048                        u32 features = dst_metric(dst, RTAX_FEATURES);
1049                        mtu = IPV6_MIN_MTU;
1050                        features |= RTAX_FEATURE_ALLFRAG;
1051                        dst_metric_set(dst, RTAX_FEATURES, features);
1052                }
1053                dst_metric_set(dst, RTAX_MTU, mtu);
1054        }
1055}
1056
1057static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1058{
1059        struct net_device *dev = dst->dev;
1060        unsigned int mtu = dst_mtu(dst);
1061        struct net *net = dev_net(dev);
1062
1063        mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1064
1065        if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1066                mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1067
1068        /*
1069         * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1070         * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1071         * IPV6_MAXPLEN is also valid and means: "any MSS,
1072         * rely only on pmtu discovery"
1073         */
1074        if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1075                mtu = IPV6_MAXPLEN;
1076        return mtu;
1077}
1078
1079static unsigned int ip6_mtu(const struct dst_entry *dst)
1080{
1081        struct inet6_dev *idev;
1082        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1083
1084        if (mtu)
1085                return mtu;
1086
1087        mtu = IPV6_MIN_MTU;
1088
1089        rcu_read_lock();
1090        idev = __in6_dev_get(dst->dev);
1091        if (idev)
1092                mtu = idev->cnf.mtu6;
1093        rcu_read_unlock();
1094
1095        return mtu;
1096}
1097
/*
 * Singly linked list (chained through dst->next) of the dst entries created
 * by icmp6_dst_alloc(); icmp6_dst_gc() and icmp6_clean_all() unlink and free
 * entries from it.
 */
1098static struct dst_entry *icmp6_dst_gc_list;
/* Serialises every walk and modification of icmp6_dst_gc_list. */
1099static DEFINE_SPINLOCK(icmp6_dst_lock);
1100
/*
 * Allocate a stand-alone host dst entry towards fl6->daddr on @dev.
 *
 * @dev:   outgoing device; a reference on its inet6_dev is taken and stored
 *         in the new route.
 * @neigh: neighbour to attach, or NULL to look one up for fl6->daddr.  A
 *         caller-supplied neighbour gets an extra reference.
 * @fl6:   flow describing the destination; also handed to xfrm_lookup().
 *
 * The entry is not inserted into a FIB table.  It is linked onto
 * icmp6_dst_gc_list instead, and icmp6_dst_gc() frees it once its
 * reference count has dropped to zero.
 *
 * Returns the dst produced by xfrm_lookup(), or an ERR_PTR: -ENODEV when
 * @dev has no inet6_dev, -ENOMEM when allocation fails, or the error from
 * the neighbour lookup.
 */
1101struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1102                                  struct neighbour *neigh,
1103                                  struct flowi6 *fl6)
1104{
1105        struct dst_entry *dst;
1106        struct rt6_info *rt;
1107        struct inet6_dev *idev = in6_dev_get(dev);
1108        struct net *net = dev_net(dev);
1109
1110        if (unlikely(!idev))
1111                return ERR_PTR(-ENODEV);
1112
1113        rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1114        if (unlikely(!rt)) {
1115                in6_dev_put(idev);
1116                dst = ERR_PTR(-ENOMEM);
1117                goto out;
1118        }
1119
1120        if (neigh)
1121                neigh_hold(neigh);
1122        else {
1123                neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1124                if (IS_ERR(neigh)) {
                        /* undo both the idev reference and the allocation */
1125                        in6_dev_put(idev);
1126                        dst_free(&rt->dst);
1127                        return ERR_CAST(neigh);
1128                }
1129        }
1130
1131        rt->dst.flags |= DST_HOST;
1132        rt->dst.output  = ip6_output;
1133        dst_set_neighbour(&rt->dst, neigh);
1134        atomic_set(&rt->dst.__refcnt, 1);
1135        rt->rt6i_dst.addr = fl6->daddr;
1136        rt->rt6i_dst.plen = 128;
1137        rt->rt6i_idev     = idev;
1138        dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1139
        /* publish on the GC list so icmp6_dst_gc() can reclaim it later */
1140        spin_lock_bh(&icmp6_dst_lock);
1141        rt->dst.next = icmp6_dst_gc_list;
1142        icmp6_dst_gc_list = &rt->dst;
1143        spin_unlock_bh(&icmp6_dst_lock);
1144
1145        fib6_force_start_gc(net);
1146
1147        dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1148
1149out:
1150        return dst;
1151}
1152
1153int icmp6_dst_gc(void)
1154{
1155        struct dst_entry *dst, **pprev;
1156        int more = 0;
1157
1158        spin_lock_bh(&icmp6_dst_lock);
1159        pprev = &icmp6_dst_gc_list;
1160
1161        while ((dst = *pprev) != NULL) {
1162                if (!atomic_read(&dst->__refcnt)) {
1163                        *pprev = dst->next;
1164                        dst_free(dst);
1165                } else {
1166                        pprev = &dst->next;
1167                        ++more;
1168                }
1169        }
1170
1171        spin_unlock_bh(&icmp6_dst_lock);
1172
1173        return more;
1174}
1175
1176static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1177                            void *arg)
1178{
1179        struct dst_entry *dst, **pprev;
1180
1181        spin_lock_bh(&icmp6_dst_lock);
1182        pprev = &icmp6_dst_gc_list;
1183        while ((dst = *pprev) != NULL) {
1184                struct rt6_info *rt = (struct rt6_info *) dst;
1185                if (func(rt, arg)) {
1186                        *pprev = dst->next;
1187                        dst_free(dst);
1188                } else {
1189                        pprev = &dst->next;
1190                }
1191        }
1192        spin_unlock_bh(&icmp6_dst_lock);
1193}
1194
1195static int ip6_dst_gc(struct dst_ops *ops)
1196{
1197        unsigned long now = jiffies;
1198        struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1199        int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1200        int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1201        int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1202        int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1203        unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1204        int entries;
1205
1206        entries = dst_entries_get_fast(ops);
1207        if (time_after(rt_last_gc + rt_min_interval, now) &&
1208            entries <= rt_max_size)
1209                goto out;
1210
1211        net->ipv6.ip6_rt_gc_expire++;
1212        fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1213        net->ipv6.ip6_rt_last_gc = now;
1214        entries = dst_entries_get_slow(ops);
1215        if (entries < ops->gc_thresh)
1216                net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1217out:
1218        net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1219        return entries > rt_max_size;
1220}
1221
1222/* Clean host part of a prefix. Not necessary in radix tree,
1223   but results in cleaner routing tables.
1224
1225   Remove it only when all the things will work!
1226 */
1227
1228int ip6_dst_hoplimit(struct dst_entry *dst)
1229{
1230        int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1231        if (hoplimit == 0) {
1232                struct net_device *dev = dst->dev;
1233                struct inet6_dev *idev;
1234
1235                rcu_read_lock();
1236                idev = __in6_dev_get(dev);
1237                if (idev)
1238                        hoplimit = idev->cnf.hop_limit;
1239                else
1240                        hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1241                rcu_read_unlock();
1242        }
1243        return hoplimit;
1244}
1245EXPORT_SYMBOL(ip6_dst_hoplimit);
1246
1247/*
1248 *
1249 */
1250
/*
 * Create a route from the configuration in @cfg and insert it into the FIB.
 *
 * Steps, in order:
 *  1. validate prefix lengths (a source prefix is only accepted with
 *     CONFIG_IPV6_SUBTREES);
 *  2. take references on the device / inet6_dev named by fc_ifindex, if any;
 *  3. find or create the target table;
 *  4. allocate the route and fill in expiry, protocol, input/output
 *     handlers, destination (and source) prefix and metric;
 *  5. turn RTF_REJECT routes, and non-local routes via a loopback device,
 *     into reject routes on the namespace's loopback device;
 *  6. validate the gateway, possibly deriving the device from the route
 *     towards it;
 *  7. validate the preferred source address and bind a neighbour when the
 *     flags require one;
 *  8. apply metrics supplied as netlink attributes and insert the route.
 *
 * On success the dev/idev references taken here are stored in the route and
 * the result of __ip6_ins_rt() is returned.  On failure (label "out") those
 * references are dropped, the route is freed and a negative errno is
 * returned.
 *
 * Side effects on @cfg: fc_metric and fc_protocol are defaulted when unset,
 * and fc_nlinfo.nl_net is overwritten with the device's namespace.
 */
1251int ip6_route_add(struct fib6_config *cfg)
1252{
1253        int err;
1254        struct net *net = cfg->fc_nlinfo.nl_net;
1255        struct rt6_info *rt = NULL;
1256        struct net_device *dev = NULL;
1257        struct inet6_dev *idev = NULL;
1258        struct fib6_table *table;
1259        int addr_type;
1260
1261        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1262                return -EINVAL;
1263#ifndef CONFIG_IPV6_SUBTREES
1264        if (cfg->fc_src_len)
1265                return -EINVAL;
1266#endif
1267        if (cfg->fc_ifindex) {
1268                err = -ENODEV;
1269                dev = dev_get_by_index(net, cfg->fc_ifindex);
1270                if (!dev)
1271                        goto out;
1272                idev = in6_dev_get(dev);
1273                if (!idev)
1274                        goto out;
1275        }
1276
1277        if (cfg->fc_metric == 0)
1278                cfg->fc_metric = IP6_RT_PRIO_USER;
1279
        /*
         * A netlink request without NLM_F_CREATE first tries the existing
         * table; if there is none, a warning is logged and the table is
         * created anyway.  All other requests create the table on demand.
         */
1280        err = -ENOBUFS;
1281        if (cfg->fc_nlinfo.nlh &&
1282            !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1283                table = fib6_get_table(net, cfg->fc_table);
1284                if (!table) {
1285                        pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1286                        table = fib6_new_table(net, cfg->fc_table);
1287                }
1288        } else {
1289                table = fib6_new_table(net, cfg->fc_table);
1290        }
1291
1292        if (!table)
1293                goto out;
1294
1295        rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1296
1297        if (!rt) {
1298                err = -ENOMEM;
1299                goto out;
1300        }
1301
1302        rt->dst.obsolete = -1;
1303
1304        if (cfg->fc_flags & RTF_EXPIRES)
1305                rt6_set_expires(rt, jiffies +
1306                                clock_t_to_jiffies(cfg->fc_expires));
1307        else
1308                rt6_clean_expires(rt);
1309
1310        if (cfg->fc_protocol == RTPROT_UNSPEC)
1311                cfg->fc_protocol = RTPROT_BOOT;
1312        rt->rt6i_protocol = cfg->fc_protocol;
1313
1314        addr_type = ipv6_addr_type(&cfg->fc_dst);
1315
        /* pick the receive handler from the destination type / local flag */
1316        if (addr_type & IPV6_ADDR_MULTICAST)
1317                rt->dst.input = ip6_mc_input;
1318        else if (cfg->fc_flags & RTF_LOCAL)
1319                rt->dst.input = ip6_input;
1320        else
1321                rt->dst.input = ip6_forward;
1322
1323        rt->dst.output = ip6_output;
1324
1325        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1326        rt->rt6i_dst.plen = cfg->fc_dst_len;
1327        if (rt->rt6i_dst.plen == 128)
1328               rt->dst.flags |= DST_HOST;
1329
        /* non-host routes with user metrics get a private metrics array */
1330        if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1331                u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1332                if (!metrics) {
1333                        err = -ENOMEM;
1334                        goto out;
1335                }
1336                dst_init_metrics(&rt->dst, metrics, 0);
1337        }
1338#ifdef CONFIG_IPV6_SUBTREES
1339        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1340        rt->rt6i_src.plen = cfg->fc_src_len;
1341#endif
1342
1343        rt->rt6i_metric = cfg->fc_metric;
1344
1345        /* We cannot add true routes via loopback here,
1346           they would result in kernel looping; promote them to reject routes
1347         */
1348        if ((cfg->fc_flags & RTF_REJECT) ||
1349            (dev && (dev->flags & IFF_LOOPBACK) &&
1350             !(addr_type & IPV6_ADDR_LOOPBACK) &&
1351             !(cfg->fc_flags & RTF_LOCAL))) {
1352                /* hold loopback dev/idev if we haven't done so. */
1353                if (dev != net->loopback_dev) {
1354                        if (dev) {
1355                                dev_put(dev);
1356                                in6_dev_put(idev);
1357                        }
1358                        dev = net->loopback_dev;
1359                        dev_hold(dev);
1360                        idev = in6_dev_get(dev);
1361                        if (!idev) {
1362                                err = -ENODEV;
1363                                goto out;
1364                        }
1365                }
1366                rt->dst.output = ip6_pkt_discard_out;
1367                rt->dst.input = ip6_pkt_discard;
1368                rt->dst.error = -ENETUNREACH;
1369                rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1370                goto install_route;
1371        }
1372
1373        if (cfg->fc_flags & RTF_GATEWAY) {
1374                const struct in6_addr *gw_addr;
1375                int gwa_type;
1376
1377                gw_addr = &cfg->fc_gateway;
1378                rt->rt6i_gateway = *gw_addr;
1379                gwa_type = ipv6_addr_type(gw_addr);
1380
1381                if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1382                        struct rt6_info *grt;
1383
1384                        /* IPv6 strictly inhibits using not link-local
1385                           addresses as nexthop address.
1386                           Otherwise, router will not able to send redirects.
1387                           It is very good, but in some (rare!) circumstances
1388                           (SIT, PtP, NBMA NOARP links) it is handy to allow
1389                           some exceptions. --ANK
1390                         */
1391                        err = -EINVAL;
1392                        if (!(gwa_type & IPV6_ADDR_UNICAST))
1393                                goto out;
1394
1395                        grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1396
1397                        err = -EHOSTUNREACH;
1398                        if (!grt)
1399                                goto out;
1400                        if (dev) {
1401                                if (dev != grt->dst.dev) {
1402                                        dst_release(&grt->dst);
1403                                        goto out;
1404                                }
1405                        } else {
                                /* no device given: adopt the one the gateway is reached through */
1406                                dev = grt->dst.dev;
1407                                idev = grt->rt6i_idev;
1408                                dev_hold(dev);
1409                                in6_dev_hold(grt->rt6i_idev);
1410                        }
                        /* err stays -EHOSTUNREACH if the gateway is itself only reachable via a gateway */
1411                        if (!(grt->rt6i_flags & RTF_GATEWAY))
1412                                err = 0;
1413                        dst_release(&grt->dst);
1414
1415                        if (err)
1416                                goto out;
1417                }
1418                err = -EINVAL;
1419                if (!dev || (dev->flags & IFF_LOOPBACK))
1420                        goto out;
1421        }
1422
1423        err = -ENODEV;
1424        if (!dev)
1425                goto out;
1426
1427        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1428                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1429                        err = -EINVAL;
1430                        goto out;
1431                }
1432                rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1433                rt->rt6i_prefsrc.plen = 128;
1434        } else
1435                rt->rt6i_prefsrc.plen = 0;
1436
1437        if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1438                err = rt6_bind_neighbour(rt, dev);
1439                if (err)
1440                        goto out;
1441        }
1442
1443        rt->rt6i_flags = cfg->fc_flags;
1444
1445install_route:
        /* apply user-supplied metrics; attribute type 0 is skipped */
1446        if (cfg->fc_mx) {
1447                struct nlattr *nla;
1448                int remaining;
1449
1450                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1451                        int type = nla_type(nla);
1452
1453                        if (type) {
1454                                if (type > RTAX_MAX) {
1455                                        err = -EINVAL;
1456                                        goto out;
1457                                }
1458
1459                                dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1460                        }
1461                }
1462        }
1463
        /* the dev/idev references taken above now belong to the route */
1464        rt->dst.dev = dev;
1465        rt->rt6i_idev = idev;
1466        rt->rt6i_table = table;
1467
1468        cfg->fc_nlinfo.nl_net = dev_net(dev);
1469
1470        return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1471
1472out:
1473        if (dev)
1474                dev_put(dev);
1475        if (idev)
1476                in6_dev_put(idev);
1477        if (rt)
1478                dst_free(&rt->dst);
1479        return err;
1480}
1481
1482static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1483{
1484        int err;
1485        struct fib6_table *table;
1486        struct net *net = dev_net(rt->dst.dev);
1487
1488        if (rt == net->ipv6.ip6_null_entry)
1489                return -ENOENT;
1490
1491        table = rt->rt6i_table;
1492        write_lock_bh(&table->tb6_lock);
1493
1494        err = fib6_del(rt, info);
1495        dst_release(&rt->dst);
1496
1497        write_unlock_bh(&table->tb6_lock);
1498
1499        return err;
1500}
1501
1502int ip6_del_rt(struct rt6_info *rt)
1503{
1504        struct nl_info info = {
1505                .nl_net = dev_net(rt->dst.dev),
1506        };
1507        return __ip6_del_rt(rt, &info);
1508}
1509
1510static int ip6_route_del(struct fib6_config *cfg)
1511{
1512        struct fib6_table *table;
1513        struct fib6_node *fn;
1514        struct rt6_info *rt;
1515        int err = -ESRCH;
1516
1517        table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1518        if (!table)
1519                return err;
1520
1521        read_lock_bh(&table->tb6_lock);
1522
1523        fn = fib6_locate(&table->tb6_root,
1524                         &cfg->fc_dst, cfg->fc_dst_len,
1525                         &cfg->fc_src, cfg->fc_src_len);
1526
1527        if (fn) {
1528                for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1529                        if (cfg->fc_ifindex &&
1530                            (!rt->dst.dev ||
1531                             rt->dst.dev->ifindex != cfg->fc_ifindex))
1532                                continue;
1533                        if (cfg->fc_flags & RTF_GATEWAY &&
1534                            !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1535                                continue;
1536                        if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1537                                continue;
1538                        dst_hold(&rt->dst);
1539                        read_unlock_bh(&table->tb6_lock);
1540
1541                        return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1542                }
1543        }
1544        read_unlock_bh(&table->tb6_lock);
1545
1546        return err;
1547}
1548
1549/*
1550 *      Handle redirects
1551 */
/*
 * Flow key for redirect validation: a regular flowi6 extended with the
 * address of the router the redirect came from.
 */
1552struct ip6rd_flowi {
        /* must stay the first member: __ip6_route_redirect() casts its flowi6 pointer back to this struct */
1553        struct flowi6 fl6;
        /* router address each candidate route's gateway is compared with */
1554        struct in6_addr gateway;
1555};
1556
1557static struct rt6_info *__ip6_route_redirect(struct net *net,
1558                                             struct fib6_table *table,
1559                                             struct flowi6 *fl6,
1560                                             int flags)
1561{
1562        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1563        struct rt6_info *rt;
1564        struct fib6_node *fn;
1565
1566        /*
1567         * Get the "current" route for this destination and
1568         * check if the redirect has come from approriate router.
1569         *
1570         * RFC 2461 specifies that redirects should only be
1571         * accepted if they come from the nexthop to the target.
1572         * Due to the way the routes are chosen, this notion
1573         * is a bit fuzzy and one might need to check all possible
1574         * routes.
1575         */
1576
1577        read_lock_bh(&table->tb6_lock);
1578        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1579restart:
1580        for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1581                /*
1582                 * Current route is on-link; redirect is always invalid.
1583                 *
1584                 * Seems, previous statement is not true. It could
1585                 * be node, which looks for us as on-link (f.e. proxy ndisc)
1586                 * But then router serving it might decide, that we should
1587                 * know truth 8)8) --ANK (980726).
1588                 */
1589                if (rt6_check_expired(rt))
1590                        continue;
1591                if (!(rt->rt6i_flags & RTF_GATEWAY))
1592                        continue;
1593                if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1594                        continue;
1595                if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1596                        continue;
1597                break;
1598        }
1599
1600        if (!rt)
1601                rt = net->ipv6.ip6_null_entry;
1602        BACKTRACK(net, &fl6->saddr);
1603out:
1604        dst_hold(&rt->dst);
1605
1606        read_unlock_bh(&table->tb6_lock);
1607
1608        return rt;
1609};
1610
1611static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1612                                           const struct in6_addr *src,
1613                                           const struct in6_addr *gateway,
1614                                           struct net_device *dev)
1615{
1616        int flags = RT6_LOOKUP_F_HAS_SADDR;
1617        struct net *net = dev_net(dev);
1618        struct ip6rd_flowi rdfl = {
1619                .fl6 = {
1620                        .flowi6_oif = dev->ifindex,
1621                        .daddr = *dest,
1622                        .saddr = *src,
1623                },
1624        };
1625
1626        rdfl.gateway = *gateway;
1627
1628        if (rt6_need_strict(dest))
1629                flags |= RT6_LOOKUP_F_IFACE;
1630
1631        return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1632                                                   flags, __ip6_route_redirect);
1633}
1634
/*
 * Process a redirect for destination @dest.
 *
 * @dest:    destination the redirect applies to.
 * @src:     source address used for the route lookup.
 * @saddr:   passed to ip6_route_redirect() as the gateway that the current
 *           route must go through for the redirect to be accepted.
 * @neigh:   neighbour entry of the new next hop.
 * @lladdr:  link-layer address to record for @neigh.
 * @on_link: non-zero when the destination is itself the new next hop; the
 *           new route is then created without RTF_GATEWAY.
 *
 * If the redirect is acceptable, the neighbour entry is updated and a
 * cached host route to @dest via @neigh is inserted.  A previously cached
 * route for the destination is deleted afterwards.
 */
1635void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1636                  const struct in6_addr *saddr,
1637                  struct neighbour *neigh, u8 *lladdr, int on_link)
1638{
1639        struct rt6_info *rt, *nrt = NULL;
1640        struct netevent_redirect netevent;
1641        struct net *net = dev_net(neigh->dev);
1642
1643        rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1644
1645        if (rt == net->ipv6.ip6_null_entry) {
1646                net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1647                goto out;
1648        }
1649
1650        /*
1651         *      We have finally decided to accept it.
1652         */
1653
1654        neigh_update(neigh, lladdr, NUD_STALE,
1655                     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1656                     NEIGH_UPDATE_F_OVERRIDE|
1657                     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1658                                     NEIGH_UPDATE_F_ISROUTER))
1659                     );
1660
1661        /*
1662         * Redirect received -> path was valid.
1663         * Look, redirects are sent only in response to data packets,
1664         * so that this nexthop apparently is reachable. --ANK
1665         */
1666        dst_confirm(&rt->dst);
1667
1668        /* Duplicate redirect: silently ignore. */
1669        if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1670                goto out;
1671
        /* build the replacement: a cached copy of rt for this destination via neigh */
1672        nrt = ip6_rt_copy(rt, dest);
1673        if (!nrt)
1674                goto out;
1675
1676        nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1677        if (on_link)
1678                nrt->rt6i_flags &= ~RTF_GATEWAY;
1679
1680        nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1681        dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1682
1683        if (ip6_ins_rt(nrt))
1684                goto out;
1685
1686        netevent.old = &rt->dst;
1687        netevent.new = &nrt->dst;
1688        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1689
        /*
         * An old cached route is superseded by nrt: delete it.  The return
         * skips the dst_release() below because the reference is handed to
         * ip6_del_rt().
         */
1690        if (rt->rt6i_flags & RTF_CACHE) {
1691                ip6_del_rt(rt);
1692                return;
1693        }
1694
1695out:
1696        dst_release(&rt->dst);
1697}
1698
1699/*
1700 *      Handle ICMP "packet too big" messages
1701 *      i.e. Path MTU discovery
1702 */
1703
/*
 * Apply a path MTU of @pmtu, learned from a "packet too big" message, to
 * the route used for @daddr / @saddr.
 *
 * @ifindex is passed to rt6_lookup() as the interface to look up with
 * (the caller in this file uses 0 for one of its two calls).
 *
 * Expired routes found by the lookup are deleted and the lookup is
 * retried.  A @pmtu that is not smaller than the route's current MTU is
 * ignored.  A @pmtu below IPV6_MIN_MTU is clamped to IPV6_MIN_MTU and
 * RTAX_FEATURE_ALLFRAG is set.  A cached route is updated in place; for
 * any other route a cached copy carrying the new MTU is inserted, with
 * its expiry set from ip6_rt_mtu_expires.
 */
1704static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1705                             struct net *net, u32 pmtu, int ifindex)
1706{
1707        struct rt6_info *rt, *nrt;
1708        int allfrag = 0;
1709again:
1710        rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1711        if (!rt)
1712                return;
1713
        /* drop stale entries and look again until a live route (or none) is found */
1714        if (rt6_check_expired(rt)) {
1715                ip6_del_rt(rt);
1716                goto again;
1717        }
1718
1719        if (pmtu >= dst_mtu(&rt->dst))
1720                goto out;
1721
1722        if (pmtu < IPV6_MIN_MTU) {
1723                /*
1724                 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1725                 * MTU (1280) and a fragment header should always be included
1726                 * after a node receiving Too Big message reporting PMTU is
1727                 * less than the IPv6 Minimum Link MTU.
1728                 */
1729                pmtu = IPV6_MIN_MTU;
1730                allfrag = 1;
1731        }
1732
1733        /* New mtu received -> path was valid.
1734           They are sent only in response to data packets,
1735           so that this nexthop apparently is reachable. --ANK
1736         */
1737        dst_confirm(&rt->dst);
1738
1739        /* Host route. If it is static, it would be better
1740           not to override it, but add new one, so that
1741           when cache entry will expire old pmtu
1742           would return automatically.
1743         */
1744        if (rt->rt6i_flags & RTF_CACHE) {
1745                dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1746                if (allfrag) {
1747                        u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1748                        features |= RTAX_FEATURE_ALLFRAG;
1749                        dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1750                }
1751                rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1752                rt->rt6i_flags |= RTF_MODIFIED;
1753                goto out;
1754        }
1755
1756        /* Network route.
1757           Two cases are possible:
1758           1. It is connected route. Action: COW
1759           2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1760         */
1761        if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1762                nrt = rt6_alloc_cow(rt, daddr, saddr);
1763        else
1764                nrt = rt6_alloc_clone(rt, daddr);
1765
1766        if (nrt) {
1767                dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1768                if (allfrag) {
1769                        u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1770                        features |= RTAX_FEATURE_ALLFRAG;
1771                        dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1772                }
1773
1774                /* According to RFC 1981, an attempt to detect a PMTU increase
1775                 * must not happen within 5 minutes of the decrease; the
1776                 * recommended timer is 10 minutes.  The expiry of this route
1777                 * is therefore set to ip6_rt_mtu_expires, which is 10 minutes:
1778                 * once it expires the reduced PMTU is forgotten and an
1779                 * increase is detected automatically.
1780                 */
1780                rt6_update_expires(nrt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1781                nrt->rt6i_flags |= RTF_DYNAMIC;
1782                ip6_ins_rt(nrt);
1783        }
1784out:
1785        dst_release(&rt->dst);
1786}
1787
1788void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1789                        struct net_device *dev, u32 pmtu)
1790{
1791        struct net *net = dev_net(dev);
1792
1793        /*
1794         * RFC 1981 states that a node "MUST reduce the size of the packets it
1795         * is sending along the path" that caused the Packet Too Big message.
1796         * Since it's not possible in the general case to determine which
1797         * interface was used to send the original packet, we update the MTU
1798         * on the interface that will be used to send future packets. We also
1799         * update the MTU on the interface that received the Packet Too Big in
1800         * case the original packet was forced out that interface with
1801         * SO_BINDTODEVICE or similar. This is the next best thing to the
1802         * correct behaviour, which would be to update the MTU on all
1803         * interfaces.
1804         */
1805        rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1806        rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1807}
1808
1809/*
1810 *      Misc support functions
1811 */
1812
1813static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1814                                    const struct in6_addr *dest)
1815{
1816        struct net *net = dev_net(ort->dst.dev);
1817        struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1818                                            ort->dst.dev, 0);
1819
1820        if (rt) {
1821                rt->dst.input = ort->dst.input;
1822                rt->dst.output = ort->dst.output;
1823                rt->dst.flags |= DST_HOST;
1824
1825                rt->rt6i_dst.addr = *dest;
1826                rt->rt6i_dst.plen = 128;
1827                dst_copy_metrics(&rt->dst, &ort->dst);
1828                rt->dst.error = ort->dst.error;
1829                rt->rt6i_idev = ort->rt6i_idev;
1830                if (rt->rt6i_idev)
1831                        in6_dev_hold(rt->rt6i_idev);
1832                rt->dst.lastuse = jiffies;
1833
1834                rt->rt6i_gateway = ort->rt6i_gateway;
1835                rt->rt6i_flags = ort->rt6i_flags;
1836                if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1837                    (RTF_DEFAULT | RTF_ADDRCONF))
1838                        rt6_set_from(rt, ort);
1839                else
1840                        rt6_clean_expires(rt);
1841                rt->rt6i_metric = 0;
1842
1843#ifdef CONFIG_IPV6_SUBTREES
1844                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1845#endif
1846                memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1847                rt->rt6i_table = ort->rt6i_table;
1848        }
1849        return rt;
1850}
1851
1852#ifdef CONFIG_IPV6_ROUTE_INFO
1853static struct rt6_info *rt6_get_route_info(struct net *net,
1854                                           const struct in6_addr *prefix, int prefixlen,
1855                                           const struct in6_addr *gwaddr, int ifindex)
1856{
1857        struct fib6_node *fn;
1858        struct rt6_info *rt = NULL;
1859        struct fib6_table *table;
1860
1861        table = fib6_get_table(net, RT6_TABLE_INFO);
1862        if (!table)
1863                return NULL;
1864
1865        write_lock_bh(&table->tb6_lock);
1866        fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1867        if (!fn)
1868                goto out;
1869
1870        for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1871                if (rt->dst.dev->ifindex != ifindex)
1872                        continue;
1873                if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1874                        continue;
1875                if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1876                        continue;
1877                dst_hold(&rt->dst);
1878                break;
1879        }
1880out:
1881        write_unlock_bh(&table->tb6_lock);
1882        return rt;
1883}
1884
1885static struct rt6_info *rt6_add_route_info(struct net *net,
1886                                           const struct in6_addr *prefix, int prefixlen,
1887                                           const struct in6_addr *gwaddr, int ifindex,
1888                                           unsigned int pref)
1889{
1890        struct fib6_config cfg = {
1891                .fc_table       = RT6_TABLE_INFO,
1892                .fc_metric      = IP6_RT_PRIO_USER,
1893                .fc_ifindex     = ifindex,
1894                .fc_dst_len     = prefixlen,
1895                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1896                                  RTF_UP | RTF_PREF(pref),
1897                .fc_nlinfo.pid = 0,
1898                .fc_nlinfo.nlh = NULL,
1899                .fc_nlinfo.nl_net = net,
1900        };
1901
1902        cfg.fc_dst = *prefix;
1903        cfg.fc_gateway = *gwaddr;
1904
1905        /* We should treat it as a default route if prefix length is 0. */
1906        if (!prefixlen)
1907                cfg.fc_flags |= RTF_DEFAULT;
1908
1909        ip6_route_add(&cfg);
1910
1911        return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1912}
1913#endif
1914
1915struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1916{
1917        struct rt6_info *rt;
1918        struct fib6_table *table;
1919
1920        table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1921        if (!table)
1922                return NULL;
1923
1924        write_lock_bh(&table->tb6_lock);
1925        for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1926                if (dev == rt->dst.dev &&
1927                    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1928                    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1929                        break;
1930        }
1931        if (rt)
1932                dst_hold(&rt->dst);
1933        write_unlock_bh(&table->tb6_lock);
1934        return rt;
1935}
1936
1937struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1938                                     struct net_device *dev,
1939                                     unsigned int pref)
1940{
1941        struct fib6_config cfg = {
1942                .fc_table       = RT6_TABLE_DFLT,
1943                .fc_metric      = IP6_RT_PRIO_USER,
1944                .fc_ifindex     = dev->ifindex,
1945                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1946                                  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1947                .fc_nlinfo.pid = 0,
1948                .fc_nlinfo.nlh = NULL,
1949                .fc_nlinfo.nl_net = dev_net(dev),
1950        };
1951
1952        cfg.fc_gateway = *gwaddr;
1953
1954        ip6_route_add(&cfg);
1955
1956        return rt6_get_dflt_router(gwaddr, dev);
1957}
1958
1959void rt6_purge_dflt_routers(struct net *net)
1960{
1961        struct rt6_info *rt;
1962        struct fib6_table *table;
1963
1964        /* NOTE: Keep consistent with rt6_get_dflt_router */
1965        table = fib6_get_table(net, RT6_TABLE_DFLT);
1966        if (!table)
1967                return;
1968
1969restart:
1970        read_lock_bh(&table->tb6_lock);
1971        for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1972                if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1973                        dst_hold(&rt->dst);
1974                        read_unlock_bh(&table->tb6_lock);
1975                        ip6_del_rt(rt);
1976                        goto restart;
1977                }
1978        }
1979        read_unlock_bh(&table->tb6_lock);
1980}
1981
1982static void rtmsg_to_fib6_config(struct net *net,
1983                                 struct in6_rtmsg *rtmsg,
1984                                 struct fib6_config *cfg)
1985{
1986        memset(cfg, 0, sizeof(*cfg));
1987
1988        cfg->fc_table = RT6_TABLE_MAIN;
1989        cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1990        cfg->fc_metric = rtmsg->rtmsg_metric;
1991        cfg->fc_expires = rtmsg->rtmsg_info;
1992        cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1993        cfg->fc_src_len = rtmsg->rtmsg_src_len;
1994        cfg->fc_flags = rtmsg->rtmsg_flags;
1995
1996        cfg->fc_nlinfo.nl_net = net;
1997
1998        cfg->fc_dst = rtmsg->rtmsg_dst;
1999        cfg->fc_src = rtmsg->rtmsg_src;
2000        cfg->fc_gateway = rtmsg->rtmsg_gateway;
2001}
2002
2003int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2004{
2005        struct fib6_config cfg;
2006        struct in6_rtmsg rtmsg;
2007        int err;
2008
2009        switch(cmd) {
2010        case SIOCADDRT:         /* Add a route */
2011        case SIOCDELRT:         /* Delete a route */
2012                if (!capable(CAP_NET_ADMIN))
2013                        return -EPERM;
2014                err = copy_from_user(&rtmsg, arg,
2015                                     sizeof(struct in6_rtmsg));
2016                if (err)
2017                        return -EFAULT;
2018
2019                rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2020
2021                rtnl_lock();
2022                switch (cmd) {
2023                case SIOCADDRT:
2024                        err = ip6_route_add(&cfg);
2025                        break;
2026                case SIOCDELRT:
2027                        err = ip6_route_del(&cfg);
2028                        break;
2029                default:
2030                        err = -EINVAL;
2031                }
2032                rtnl_unlock();
2033
2034                return err;
2035        }
2036
2037        return -EINVAL;
2038}
2039
2040/*
2041 *      Drop the packet on the floor
2042 */
2043
2044static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2045{
2046        int type;
2047        struct dst_entry *dst = skb_dst(skb);
2048        switch (ipstats_mib_noroutes) {
2049        case IPSTATS_MIB_INNOROUTES:
2050                type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2051                if (type == IPV6_ADDR_ANY) {
2052                        IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2053                                      IPSTATS_MIB_INADDRERRORS);
2054                        break;
2055                }
2056                /* FALLTHROUGH */
2057        case IPSTATS_MIB_OUTNOROUTES:
2058                IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2059                              ipstats_mib_noroutes);
2060                break;
2061        }
2062        icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2063        kfree_skb(skb);
2064        return 0;
2065}
2066
2067static int ip6_pkt_discard(struct sk_buff *skb)
2068{
2069        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2070}
2071
2072static int ip6_pkt_discard_out(struct sk_buff *skb)
2073{
2074        skb->dev = skb_dst(skb)->dev;
2075        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2076}
2077
2078#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2079
2080static int ip6_pkt_prohibit(struct sk_buff *skb)
2081{
2082        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2083}
2084
2085static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2086{
2087        skb->dev = skb_dst(skb)->dev;
2088        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2089}
2090
2091#endif
2092
2093/*
2094 *      Allocate a dst for local (unicast / anycast) address.
2095 */
2096
2097struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2098                                    const struct in6_addr *addr,
2099                                    bool anycast)
2100{
2101        struct net *net = dev_net(idev->dev);
2102        struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2103                                            net->loopback_dev, 0);
2104        int err;
2105
2106        if (!rt) {
2107                net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2108                return ERR_PTR(-ENOMEM);
2109        }
2110
2111        in6_dev_hold(idev);
2112
2113        rt->dst.flags |= DST_HOST;
2114        rt->dst.input = ip6_input;
2115        rt->dst.output = ip6_output;
2116        rt->rt6i_idev = idev;
2117        rt->dst.obsolete = -1;
2118
2119        rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2120        if (anycast)
2121                rt->rt6i_flags |= RTF_ANYCAST;
2122        else
2123                rt->rt6i_flags |= RTF_LOCAL;
2124        err = rt6_bind_neighbour(rt, rt->dst.dev);
2125        if (err) {
2126                dst_free(&rt->dst);
2127                return ERR_PTR(err);
2128        }
2129
2130        rt->rt6i_dst.addr = *addr;
2131        rt->rt6i_dst.plen = 128;
2132        rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2133
2134        atomic_set(&rt->dst.__refcnt, 1);
2135
2136        return rt;
2137}
2138
2139int ip6_route_get_saddr(struct net *net,
2140                        struct rt6_info *rt,
2141                        const struct in6_addr *daddr,
2142                        unsigned int prefs,
2143                        struct in6_addr *saddr)
2144{
2145        struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2146        int err = 0;
2147        if (rt->rt6i_prefsrc.plen)
2148                *saddr = rt->rt6i_prefsrc.addr;
2149        else
2150                err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2151                                         daddr, prefs, saddr);
2152        return err;
2153}
2154
2155/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device filter; NULL matches any device */
	struct net *net;		/* namespace whose ip6_null_entry is skipped */
	struct in6_addr *addr;		/* address compared against each route's prefsrc */
};
2161
2162static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2163{
2164        struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2165        struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2166        struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2167
2168        if (((void *)rt->dst.dev == dev || !dev) &&
2169            rt != net->ipv6.ip6_null_entry &&
2170            ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2171                /* remove prefsrc entry */
2172                rt->rt6i_prefsrc.plen = 0;
2173        }
2174        return 0;
2175}
2176
2177void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2178{
2179        struct net *net = dev_net(ifp->idev->dev);
2180        struct arg_dev_net_ip adni = {
2181                .dev = ifp->idev->dev,
2182                .net = net,
2183                .addr = &ifp->addr,
2184        };
2185        fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2186}
2187
struct arg_dev_net {
	struct net_device *dev;		/* device filter; NULL matches any device */
	struct net *net;		/* namespace whose ip6_null_entry is spared */
};
2192
2193static int fib6_ifdown(struct rt6_info *rt, void *arg)
2194{
2195        const struct arg_dev_net *adn = arg;
2196        const struct net_device *dev = adn->dev;
2197
2198        if ((rt->dst.dev == dev || !dev) &&
2199            rt != adn->net->ipv6.ip6_null_entry)
2200                return -1;
2201
2202        return 0;
2203}
2204
2205void rt6_ifdown(struct net *net, struct net_device *dev)
2206{
2207        struct arg_dev_net adn = {
2208                .dev = dev,
2209                .net = net,
2210        };
2211
2212        fib6_clean_all(net, fib6_ifdown, 0, &adn);
2213        icmp6_clean_all(fib6_ifdown, &adn);
2214}
2215
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU value */
};
2220
2221static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2222{
2223        struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2224        struct inet6_dev *idev;
2225
2226        /* In IPv6 pmtu discovery is not optional,
2227           so that RTAX_MTU lock cannot disable it.
2228           We still use this lock to block changes
2229           caused by addrconf/ndisc.
2230        */
2231
2232        idev = __in6_dev_get(arg->dev);
2233        if (!idev)
2234                return 0;
2235
2236        /* For administrative MTU increase, there is no way to discover
2237           IPv6 PMTU increase, so PMTU increase should be updated here.
2238           Since RFC 1981 doesn't include administrative MTU increase
2239           update PMTU increase is a MUST. (i.e. jumbo frame)
2240         */
2241        /*
2242           If new MTU is less than route PMTU, this new MTU will be the
2243           lowest MTU in the path, update the route PMTU to reflect PMTU
2244           decreases; if new MTU is greater than route PMTU, and the
2245           old MTU is the lowest MTU in the path, update the route PMTU
2246           to reflect the increase. In this case if the other nodes' MTU
2247           also have the lowest MTU, TOO BIG MESSAGE will be lead to
2248           PMTU discouvery.
2249         */
2250        if (rt->dst.dev == arg->dev &&
2251            !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2252            (dst_mtu(&rt->dst) >= arg->mtu ||
2253             (dst_mtu(&rt->dst) < arg->mtu &&
2254              dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2255                dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2256        }
2257        return 0;
2258}
2259
2260void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2261{
2262        struct rt6_mtu_change_arg arg = {
2263                .dev = dev,
2264                .mtu = mtu,
2265        };
2266
2267        fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2268}
2269
2270static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2271        [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2272        [RTA_OIF]               = { .type = NLA_U32 },
2273        [RTA_IIF]               = { .type = NLA_U32 },
2274        [RTA_PRIORITY]          = { .type = NLA_U32 },
2275        [RTA_METRICS]           = { .type = NLA_NESTED },
2276};
2277
2278static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2279                              struct fib6_config *cfg)
2280{
2281        struct rtmsg *rtm;
2282        struct nlattr *tb[RTA_MAX+1];
2283        int err;
2284
2285        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2286        if (err < 0)
2287                goto errout;
2288
2289        err = -EINVAL;
2290        rtm = nlmsg_data(nlh);
2291        memset(cfg, 0, sizeof(*cfg));
2292
2293        cfg->fc_table = rtm->rtm_table;
2294        cfg->fc_dst_len = rtm->rtm_dst_len;
2295        cfg->fc_src_len = rtm->rtm_src_len;
2296        cfg->fc_flags = RTF_UP;
2297        cfg->fc_protocol = rtm->rtm_protocol;
2298
2299        if (rtm->rtm_type == RTN_UNREACHABLE)
2300                cfg->fc_flags |= RTF_REJECT;
2301
2302        if (rtm->rtm_type == RTN_LOCAL)
2303                cfg->fc_flags |= RTF_LOCAL;
2304
2305        cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2306        cfg->fc_nlinfo.nlh = nlh;
2307        cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2308
2309        if (tb[RTA_GATEWAY]) {
2310                nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2311                cfg->fc_flags |= RTF_GATEWAY;
2312        }
2313
2314        if (tb[RTA_DST]) {
2315                int plen = (rtm->rtm_dst_len + 7) >> 3;
2316
2317                if (nla_len(tb[RTA_DST]) < plen)
2318                        goto errout;
2319
2320                nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2321        }
2322
2323        if (tb[RTA_SRC]) {
2324                int plen = (rtm->rtm_src_len + 7) >> 3;
2325
2326                if (nla_len(tb[RTA_SRC]) < plen)
2327                        goto errout;
2328
2329                nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2330        }
2331
2332        if (tb[RTA_PREFSRC])
2333                nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2334
2335        if (tb[RTA_OIF])
2336                cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2337
2338        if (tb[RTA_PRIORITY])
2339                cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2340
2341        if (tb[RTA_METRICS]) {
2342                cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2343                cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2344        }
2345
2346        if (tb[RTA_TABLE])
2347                cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2348
2349        err = 0;
2350errout:
2351        return err;
2352}
2353
2354static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2355{
2356        struct fib6_config cfg;
2357        int err;
2358
2359        err = rtm_to_fib6_config(skb, nlh, &cfg);
2360        if (err < 0)
2361                return err;
2362
2363        return ip6_route_del(&cfg);
2364}
2365
2366static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2367{
2368        struct fib6_config cfg;
2369        int err;
2370
2371        err = rtm_to_fib6_config(skb, nlh, &cfg);
2372        if (err < 0)
2373                return err;
2374
2375        return ip6_route_add(&cfg);
2376}
2377
2378static inline size_t rt6_nlmsg_size(void)
2379{
2380        return NLMSG_ALIGN(sizeof(struct rtmsg))
2381               + nla_total_size(16) /* RTA_SRC */
2382               + nla_total_size(16) /* RTA_DST */
2383               + nla_total_size(16) /* RTA_GATEWAY */
2384               + nla_total_size(16) /* RTA_PREFSRC */
2385               + nla_total_size(4) /* RTA_TABLE */
2386               + nla_total_size(4) /* RTA_IIF */
2387               + nla_total_size(4) /* RTA_OIF */
2388               + nla_total_size(4) /* RTA_PRIORITY */
2389               + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2390               + nla_total_size(sizeof(struct rta_cacheinfo));
2391}
2392
2393static int rt6_fill_node(struct net *net,
2394                         struct sk_buff *skb, struct rt6_info *rt,
2395                         struct in6_addr *dst, struct in6_addr *src,
2396                         int iif, int type, u32 pid, u32 seq,
2397                         int prefix, int nowait, unsigned int flags)
2398{
2399        const struct inet_peer *peer;
2400        struct rtmsg *rtm;
2401        struct nlmsghdr *nlh;
2402        long expires;
2403        u32 table;
2404        struct neighbour *n;
2405        u32 ts, tsage;
2406
2407        if (prefix) {   /* user wants prefix routes only */
2408                if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2409                        /* success since this is not a prefix route */
2410                        return 1;
2411                }
2412        }
2413
2414        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2415        if (!nlh)
2416                return -EMSGSIZE;
2417
2418        rtm = nlmsg_data(nlh);
2419        rtm->rtm_family = AF_INET6;
2420        rtm->rtm_dst_len = rt->rt6i_dst.plen;
2421        rtm->rtm_src_len = rt->rt6i_src.plen;
2422        rtm->rtm_tos = 0;
2423        if (rt->rt6i_table)
2424                table = rt->rt6i_table->tb6_id;
2425        else
2426                table = RT6_TABLE_UNSPEC;
2427        rtm->rtm_table = table;
2428        if (nla_put_u32(skb, RTA_TABLE, table))
2429                goto nla_put_failure;
2430        if (rt->rt6i_flags & RTF_REJECT)
2431                rtm->rtm_type = RTN_UNREACHABLE;
2432        else if (rt->rt6i_flags & RTF_LOCAL)
2433                rtm->rtm_type = RTN_LOCAL;
2434        else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2435                rtm->rtm_type = RTN_LOCAL;
2436        else
2437                rtm->rtm_type = RTN_UNICAST;
2438        rtm->rtm_flags = 0;
2439        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2440        rtm->rtm_protocol = rt->rt6i_protocol;
2441        if (rt->rt6i_flags & RTF_DYNAMIC)
2442                rtm->rtm_protocol = RTPROT_REDIRECT;
2443        else if (rt->rt6i_flags & RTF_ADDRCONF)
2444                rtm->rtm_protocol = RTPROT_KERNEL;
2445        else if (rt->rt6i_flags & RTF_DEFAULT)
2446                rtm->rtm_protocol = RTPROT_RA;
2447
2448        if (rt->rt6i_flags & RTF_CACHE)
2449                rtm->rtm_flags |= RTM_F_CLONED;
2450
2451        if (dst) {
2452                if (nla_put(skb, RTA_DST, 16, dst))
2453                        goto nla_put_failure;
2454                rtm->rtm_dst_len = 128;
2455        } else if (rtm->rtm_dst_len)
2456                if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2457                        goto nla_put_failure;
2458#ifdef CONFIG_IPV6_SUBTREES
2459        if (src) {
2460                if (nla_put(skb, RTA_SRC, 16, src))
2461                        goto nla_put_failure;
2462                rtm->rtm_src_len = 128;
2463        } else if (rtm->rtm_src_len &&
2464                   nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2465                goto nla_put_failure;
2466#endif
2467        if (iif) {
2468#ifdef CONFIG_IPV6_MROUTE
2469                if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2470                        int err = ip6mr_get_route(net, skb, rtm, nowait);
2471                        if (err <= 0) {
2472                                if (!nowait) {
2473                                        if (err == 0)
2474                                                return 0;
2475                                        goto nla_put_failure;
2476                                } else {
2477                                        if (err == -EMSGSIZE)
2478                                                goto nla_put_failure;
2479                                }
2480                        }
2481                } else
2482#endif
2483                        if (nla_put_u32(skb, RTA_IIF, iif))
2484                                goto nla_put_failure;
2485        } else if (dst) {
2486                struct in6_addr saddr_buf;
2487                if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2488                    nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2489                        goto nla_put_failure;
2490        }
2491
2492        if (rt->rt6i_prefsrc.plen) {
2493                struct in6_addr saddr_buf;
2494                saddr_buf = rt->rt6i_prefsrc.addr;
2495                if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2496                        goto nla_put_failure;
2497        }
2498
2499        if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2500                goto nla_put_failure;
2501
2502        rcu_read_lock();
2503        n = dst_get_neighbour_noref(&rt->dst);
2504        if (n) {
2505                if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2506                        rcu_read_unlock();
2507                        goto nla_put_failure;
2508                }
2509        }
2510        rcu_read_unlock();
2511
2512        if (rt->dst.dev &&
2513            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2514                goto nla_put_failure;
2515        if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2516                goto nla_put_failure;
2517        if (!(rt->rt6i_flags & RTF_EXPIRES))
2518                expires = 0;
2519        else if (rt->dst.expires - jiffies < INT_MAX)
2520                expires = rt->dst.expires - jiffies;
2521        else
2522                expires = INT_MAX;
2523
2524        peer = rt->rt6i_peer;
2525        ts = tsage = 0;
2526        if (peer && peer->tcp_ts_stamp) {
2527                ts = peer->tcp_ts;
2528                tsage = get_seconds() - peer->tcp_ts_stamp;
2529        }
2530
2531        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2532                               expires, rt->dst.error) < 0)
2533                goto nla_put_failure;
2534
2535        return nlmsg_end(skb, nlh);
2536
2537nla_put_failure:
2538        nlmsg_cancel(skb, nlh);
2539        return -EMSGSIZE;
2540}
2541
2542int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2543{
2544        struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2545        int prefix;
2546
2547        if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2548                struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2549                prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2550        } else
2551                prefix = 0;
2552
2553        return rt6_fill_node(arg->net,
2554                     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2555                     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2556                     prefix, 0, NLM_F_MULTI);
2557}
2558
2559static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2560{
2561        struct net *net = sock_net(in_skb->sk);
2562        struct nlattr *tb[RTA_MAX+1];
2563        struct rt6_info *rt;
2564        struct sk_buff *skb;
2565        struct rtmsg *rtm;
2566        struct flowi6 fl6;
2567        int err, iif = 0, oif = 0;
2568
2569        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2570        if (err < 0)
2571                goto errout;
2572
2573        err = -EINVAL;
2574        memset(&fl6, 0, sizeof(fl6));
2575
2576        if (tb[RTA_SRC]) {
2577                if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2578                        goto errout;
2579
2580                fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2581        }
2582
2583        if (tb[RTA_DST]) {
2584                if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2585                        goto errout;
2586
2587                fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2588        }
2589
2590        if (tb[RTA_IIF])
2591                iif = nla_get_u32(tb[RTA_IIF]);
2592
2593        if (tb[RTA_OIF])
2594                oif = nla_get_u32(tb[RTA_OIF]);
2595
2596        if (iif) {
2597                struct net_device *dev;
2598                int flags = 0;
2599
2600                dev = __dev_get_by_index(net, iif);
2601                if (!dev) {
2602                        err = -ENODEV;
2603                        goto errout;
2604                }
2605
2606                fl6.flowi6_iif = iif;
2607
2608                if (!ipv6_addr_any(&fl6.saddr))
2609                        flags |= RT6_LOOKUP_F_HAS_SADDR;
2610
2611                rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2612                                                               flags);
2613        } else {
2614                fl6.flowi6_oif = oif;
2615
2616                rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2617        }
2618
2619        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2620        if (!skb) {
2621                dst_release(&rt->dst);
2622                err = -ENOBUFS;
2623                goto errout;
2624        }
2625
2626        /* Reserve room for dummy headers, this skb can pass
2627           through good chunk of routing engine.
2628         */
2629        skb_reset_mac_header(skb);
2630        skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2631
2632        skb_dst_set(skb, &rt->dst);
2633
2634        err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2635                            RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2636                            nlh->nlmsg_seq, 0, 0, 0);
2637        if (err < 0) {
2638                kfree_skb(skb);
2639                goto errout;
2640        }
2641
2642        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2643errout:
2644        return err;
2645}
2646
2647void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2648{
2649        struct sk_buff *skb;
2650        struct net *net = info->nl_net;
2651        u32 seq;
2652        int err;
2653
2654        err = -ENOBUFS;
2655        seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2656
2657        skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2658        if (!skb)
2659                goto errout;
2660
2661        err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2662                                event, info->pid, seq, 0, 0, 0);
2663        if (err < 0) {
2664                /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2665                WARN_ON(err == -EMSGSIZE);
2666                kfree_skb(skb);
2667                goto errout;
2668        }
2669        rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2670                    info->nlh, gfp_any());
2671        return;
2672errout:
2673        if (err < 0)
2674                rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2675}
2676
2677static int ip6_route_dev_notify(struct notifier_block *this,
2678                                unsigned long event, void *data)
2679{
2680        struct net_device *dev = (struct net_device *)data;
2681        struct net *net = dev_net(dev);
2682
2683        if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2684                net->ipv6.ip6_null_entry->dst.dev = dev;
2685                net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2686#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2687                net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2688                net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2689                net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2690                net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2691#endif
2692        }
2693
2694        return NOTIFY_OK;
2695}
2696
2697/*
2698 *      /proc
2699 */
2700
2701#ifdef CONFIG_PROC_FS
2702
/*
 * NOTE(review): the seq_file based /proc handlers in this part of the file
 * do not reference this structure; presumably it is a leftover from an
 * older buffer-based /proc implementation -- confirm before removing.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2711
2712static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2713{
2714        struct seq_file *m = p_arg;
2715        struct neighbour *n;
2716
2717        seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2718
2719#ifdef CONFIG_IPV6_SUBTREES
2720        seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2721#else
2722        seq_puts(m, "00000000000000000000000000000000 00 ");
2723#endif
2724        rcu_read_lock();
2725        n = dst_get_neighbour_noref(&rt->dst);
2726        if (n) {
2727                seq_printf(m, "%pi6", n->primary_key);
2728        } else {
2729                seq_puts(m, "00000000000000000000000000000000");
2730        }
2731        rcu_read_unlock();
2732        seq_printf(m, " %08x %08x %08x %08x %8s\n",
2733                   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2734                   rt->dst.__use, rt->rt6i_flags,
2735                   rt->dst.dev ? rt->dst.dev->name : "");
2736        return 0;
2737}
2738
2739static int ipv6_route_show(struct seq_file *m, void *v)
2740{
2741        struct net *net = (struct net *)m->private;
2742        fib6_clean_all_ro(net, rt6_info_route, 0, m);
2743        return 0;
2744}
2745
2746static int ipv6_route_open(struct inode *inode, struct file *file)
2747{
2748        return single_open_net(inode, file, ipv6_route_show);
2749}
2750
2751static const struct file_operations ipv6_route_proc_fops = {
2752        .owner          = THIS_MODULE,
2753        .open           = ipv6_route_open,
2754        .read           = seq_read,
2755        .llseek         = seq_lseek,
2756        .release        = single_release_net,
2757};
2758
2759static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2760{
2761        struct net *net = (struct net *)seq->private;
2762        seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2763                   net->ipv6.rt6_stats->fib_nodes,
2764                   net->ipv6.rt6_stats->fib_route_nodes,
2765                   net->ipv6.rt6_stats->fib_rt_alloc,
2766                   net->ipv6.rt6_stats->fib_rt_entries,
2767                   net->ipv6.rt6_stats->fib_rt_cache,
2768                   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2769                   net->ipv6.rt6_stats->fib_discarded_routes);
2770
2771        return 0;
2772}
2773
2774static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2775{
2776        return single_open_net(inode, file, rt6_stats_seq_show);
2777}
2778
2779static const struct file_operations rt6_stats_seq_fops = {
2780        .owner   = THIS_MODULE,
2781        .open    = rt6_stats_seq_open,
2782        .read    = seq_read,
2783        .llseek  = seq_lseek,
2784        .release = single_release_net,
2785};
2786#endif  /* CONFIG_PROC_FS */
2787
2788#ifdef CONFIG_SYSCTL
2789
2790static
2791int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2792                              void __user *buffer, size_t *lenp, loff_t *ppos)
2793{
2794        struct net *net;
2795        int delay;
2796        if (!write)
2797                return -EINVAL;
2798
2799        net = (struct net *)ctl->extra1;
2800        delay = net->ipv6.sysctl.flush_delay;
2801        proc_dointvec(ctl, write, buffer, lenp, ppos);
2802        fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2803        return 0;
2804}
2805
2806ctl_table ipv6_route_table_template[] = {
2807        {
2808                .procname       =       "flush",
2809                .data           =       &init_net.ipv6.sysctl.flush_delay,
2810                .maxlen         =       sizeof(int),
2811                .mode           =       0200,
2812                .proc_handler   =       ipv6_sysctl_rtcache_flush
2813        },
2814        {
2815                .procname       =       "gc_thresh",
2816                .data           =       &ip6_dst_ops_template.gc_thresh,
2817                .maxlen         =       sizeof(int),
2818                .mode           =       0644,
2819                .proc_handler   =       proc_dointvec,
2820        },
2821        {
2822                .procname       =       "max_size",
2823                .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2824                .maxlen         =       sizeof(int),
2825                .mode           =       0644,
2826                .proc_handler   =       proc_dointvec,
2827        },
2828        {
2829                .procname       =       "gc_min_interval",
2830                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2831                .maxlen         =       sizeof(int),
2832                .mode           =       0644,
2833                .proc_handler   =       proc_dointvec_jiffies,
2834        },
2835        {
2836                .procname       =       "gc_timeout",
2837                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2838                .maxlen         =       sizeof(int),
2839                .mode           =       0644,
2840                .proc_handler   =       proc_dointvec_jiffies,
2841        },
2842        {
2843                .procname       =       "gc_interval",
2844                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2845                .maxlen         =       sizeof(int),
2846                .mode           =       0644,
2847                .proc_handler   =       proc_dointvec_jiffies,
2848        },
2849        {
2850                .procname       =       "gc_elasticity",
2851                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2852                .maxlen         =       sizeof(int),
2853                .mode           =       0644,
2854                .proc_handler   =       proc_dointvec,
2855        },
2856        {
2857                .procname       =       "mtu_expires",
2858                .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2859                .maxlen         =       sizeof(int),
2860                .mode           =       0644,
2861                .proc_handler   =       proc_dointvec_jiffies,
2862        },
2863        {
2864                .procname       =       "min_adv_mss",
2865                .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2866                .maxlen         =       sizeof(int),
2867                .mode           =       0644,
2868                .proc_handler   =       proc_dointvec,
2869        },
2870        {
2871                .procname       =       "gc_min_interval_ms",
2872                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2873                .maxlen         =       sizeof(int),
2874                .mode           =       0644,
2875                .proc_handler   =       proc_dointvec_ms_jiffies,
2876        },
2877        { }
2878};
2879
2880struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2881{
2882        struct ctl_table *table;
2883
2884        table = kmemdup(ipv6_route_table_template,
2885                        sizeof(ipv6_route_table_template),
2886                        GFP_KERNEL);
2887
2888        if (table) {
2889                table[0].data = &net->ipv6.sysctl.flush_delay;
2890                table[0].extra1 = net;
2891                table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2892                table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2893                table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2894                table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2895                table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2896                table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2897                table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2898                table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2899                table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2900        }
2901
2902        return table;
2903}
2904#endif
2905
2906static int __net_init ip6_route_net_init(struct net *net)
2907{
2908        int ret = -ENOMEM;
2909
2910        memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2911               sizeof(net->ipv6.ip6_dst_ops));
2912
2913        if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2914                goto out_ip6_dst_ops;
2915
2916        net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2917                                           sizeof(*net->ipv6.ip6_null_entry),
2918                                           GFP_KERNEL);
2919        if (!net->ipv6.ip6_null_entry)
2920                goto out_ip6_dst_entries;
2921        net->ipv6.ip6_null_entry->dst.path =
2922                (struct dst_entry *)net->ipv6.ip6_null_entry;
2923        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2924        dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2925                         ip6_template_metrics, true);
2926
2927#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2928        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2929                                               sizeof(*net->ipv6.ip6_prohibit_entry),
2930                                               GFP_KERNEL);
2931        if (!net->ipv6.ip6_prohibit_entry)
2932                goto out_ip6_null_entry;
2933        net->ipv6.ip6_prohibit_entry->dst.path =
2934                (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2935        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2936        dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2937                         ip6_template_metrics, true);
2938
2939        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2940                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
2941                                               GFP_KERNEL);
2942        if (!net->ipv6.ip6_blk_hole_entry)
2943                goto out_ip6_prohibit_entry;
2944        net->ipv6.ip6_blk_hole_entry->dst.path =
2945                (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2946        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2947        dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2948                         ip6_template_metrics, true);
2949#endif
2950
2951        net->ipv6.sysctl.flush_delay = 0;
2952        net->ipv6.sysctl.ip6_rt_max_size = 4096;
2953        net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2954        net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2955        net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2956        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2957        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2958        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2959
2960        net->ipv6.ip6_rt_gc_expire = 30*HZ;
2961
2962        ret = 0;
2963out:
2964        return ret;
2965
2966#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2967out_ip6_prohibit_entry:
2968        kfree(net->ipv6.ip6_prohibit_entry);
2969out_ip6_null_entry:
2970        kfree(net->ipv6.ip6_null_entry);
2971#endif
2972out_ip6_dst_entries:
2973        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2974out_ip6_dst_ops:
2975        goto out;
2976}
2977
2978static void __net_exit ip6_route_net_exit(struct net *net)
2979{
2980        kfree(net->ipv6.ip6_null_entry);
2981#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2982        kfree(net->ipv6.ip6_prohibit_entry);
2983        kfree(net->ipv6.ip6_blk_hole_entry);
2984#endif
2985        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2986}
2987
2988static int __net_init ip6_route_net_init_late(struct net *net)
2989{
2990#ifdef CONFIG_PROC_FS
2991        proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2992        proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2993#endif
2994        return 0;
2995}
2996
2997static void __net_exit ip6_route_net_exit_late(struct net *net)
2998{
2999#ifdef CONFIG_PROC_FS
3000        proc_net_remove(net, "ipv6_route");
3001        proc_net_remove(net, "rt6_stats");
3002#endif
3003}
3004
3005static struct pernet_operations ip6_route_net_ops = {
3006        .init = ip6_route_net_init,
3007        .exit = ip6_route_net_exit,
3008};
3009
3010static struct pernet_operations ip6_route_net_late_ops = {
3011        .init = ip6_route_net_init_late,
3012        .exit = ip6_route_net_exit_late,
3013};
3014
3015static struct notifier_block ip6_route_dev_notifier = {
3016        .notifier_call = ip6_route_dev_notify,
3017        .priority = 0,
3018};
3019
3020int __init ip6_route_init(void)
3021{
3022        int ret;
3023
3024        ret = -ENOMEM;
3025        ip6_dst_ops_template.kmem_cachep =
3026                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3027                                  SLAB_HWCACHE_ALIGN, NULL);
3028        if (!ip6_dst_ops_template.kmem_cachep)
3029                goto out;
3030
3031        ret = dst_entries_init(&ip6_dst_blackhole_ops);
3032        if (ret)
3033                goto out_kmem_cache;
3034
3035        ret = register_pernet_subsys(&ip6_route_net_ops);
3036        if (ret)
3037                goto out_dst_entries;
3038
3039        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3040
3041        /* Registering of the loopback is done before this portion of code,
3042         * the loopback reference in rt6_info will not be taken, do it
3043         * manually for init_net */
3044        init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3045        init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3046  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3047        init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3048        init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3049        init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3050        init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3051  #endif
3052        ret = fib6_init();
3053        if (ret)
3054                goto out_register_subsys;
3055
3056        ret = xfrm6_init();
3057        if (ret)
3058                goto out_fib6_init;
3059
3060        ret = fib6_rules_init();
3061        if (ret)
3062                goto xfrm6_init;
3063
3064        ret = register_pernet_subsys(&ip6_route_net_late_ops);
3065        if (ret)
3066                goto fib6_rules_init;
3067
3068        ret = -ENOBUFS;
3069        if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3070            __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3071            __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3072                goto out_register_late_subsys;
3073
3074        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3075        if (ret)
3076                goto out_register_late_subsys;
3077
3078out:
3079        return ret;
3080
3081out_register_late_subsys:
3082        unregister_pernet_subsys(&ip6_route_net_late_ops);
3083fib6_rules_init:
3084        fib6_rules_cleanup();
3085xfrm6_init:
3086        xfrm6_fini();
3087out_fib6_init:
3088        fib6_gc_cleanup();
3089out_register_subsys:
3090        unregister_pernet_subsys(&ip6_route_net_ops);
3091out_dst_entries:
3092        dst_entries_destroy(&ip6_dst_blackhole_ops);
3093out_kmem_cache:
3094        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3095        goto out;
3096}
3097
3098void ip6_route_cleanup(void)
3099{
3100        unregister_netdevice_notifier(&ip6_route_dev_notifier);
3101        unregister_pernet_subsys(&ip6_route_net_late_ops);
3102        fib6_rules_cleanup();
3103        xfrm6_fini();
3104        fib6_gc_cleanup();
3105        unregister_pernet_subsys(&ip6_route_net_ops);
3106        dst_entries_destroy(&ip6_dst_blackhole_ops);
3107        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3108}
3109