linux/net/ipv6/route.c
<<
>>
Prefs
   1/*
   2 *      Linux INET6 implementation
   3 *      FIB front-end.
   4 *
   5 *      Authors:
   6 *      Pedro Roque             <roque@di.fc.ul.pt>
   7 *
   8 *      This program is free software; you can redistribute it and/or
   9 *      modify it under the terms of the GNU General Public License
  10 *      as published by the Free Software Foundation; either version
  11 *      2 of the License, or (at your option) any later version.
  12 */
  13
  14/*      Changes:
  15 *
  16 *      YOSHIFUJI Hideaki @USAGI
  17 *              reworked default router selection.
  18 *              - respect outgoing interface
  19 *              - select from (probably) reachable routers (i.e.
  20 *              routers in REACHABLE, STALE, DELAY or PROBE states).
  21 *              - always select the same router if it is (probably)
  22 *              reachable.  otherwise, round-robin the list.
  23 *      Ville Nuorvala
  24 *              Fixed routing subtrees.
  25 */
  26
  27#include <linux/capability.h>
  28#include <linux/errno.h>
  29#include <linux/export.h>
  30#include <linux/types.h>
  31#include <linux/times.h>
  32#include <linux/socket.h>
  33#include <linux/sockios.h>
  34#include <linux/net.h>
  35#include <linux/route.h>
  36#include <linux/netdevice.h>
  37#include <linux/in6.h>
  38#include <linux/mroute6.h>
  39#include <linux/init.h>
  40#include <linux/if_arp.h>
  41#include <linux/proc_fs.h>
  42#include <linux/seq_file.h>
  43#include <linux/nsproxy.h>
  44#include <linux/slab.h>
  45#include <net/net_namespace.h>
  46#include <net/snmp.h>
  47#include <net/ipv6.h>
  48#include <net/ip6_fib.h>
  49#include <net/ip6_route.h>
  50#include <net/ndisc.h>
  51#include <net/addrconf.h>
  52#include <net/tcp.h>
  53#include <linux/rtnetlink.h>
  54#include <net/dst.h>
  55#include <net/xfrm.h>
  56#include <net/netevent.h>
  57#include <net/netlink.h>
  58
  59#include <asm/uaccess.h>
  60
  61#ifdef CONFIG_SYSCTL
  62#include <linux/sysctl.h>
  63#endif
  64
  65static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
  66                                    const struct in6_addr *dest);
  67static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
  68static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
  69static unsigned int      ip6_mtu(const struct dst_entry *dst);
  70static struct dst_entry *ip6_negative_advice(struct dst_entry *);
  71static void             ip6_dst_destroy(struct dst_entry *);
  72static void             ip6_dst_ifdown(struct dst_entry *,
  73                                       struct net_device *dev, int how);
  74static int               ip6_dst_gc(struct dst_ops *ops);
  75
  76static int              ip6_pkt_discard(struct sk_buff *skb);
  77static int              ip6_pkt_discard_out(struct sk_buff *skb);
  78static void             ip6_link_failure(struct sk_buff *skb);
  79static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
  80
  81#ifdef CONFIG_IPV6_ROUTE_INFO
  82static struct rt6_info *rt6_add_route_info(struct net *net,
  83                                           const struct in6_addr *prefix, int prefixlen,
  84                                           const struct in6_addr *gwaddr, int ifindex,
  85                                           unsigned pref);
  86static struct rt6_info *rt6_get_route_info(struct net *net,
  87                                           const struct in6_addr *prefix, int prefixlen,
  88                                           const struct in6_addr *gwaddr, int ifindex);
  89#endif
  90
  91static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
  92{
  93        struct rt6_info *rt = (struct rt6_info *) dst;
  94        struct inet_peer *peer;
  95        u32 *p = NULL;
  96
  97        if (!(rt->dst.flags & DST_HOST))
  98                return NULL;
  99
 100        if (!rt->rt6i_peer)
 101                rt6_bind_peer(rt, 1);
 102
 103        peer = rt->rt6i_peer;
 104        if (peer) {
 105                u32 *old_p = __DST_METRICS_PTR(old);
 106                unsigned long prev, new;
 107
 108                p = peer->metrics;
 109                if (inet_metrics_new(peer))
 110                        memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 111
 112                new = (unsigned long) p;
 113                prev = cmpxchg(&dst->_metrics, old, new);
 114
 115                if (prev != old) {
 116                        p = __DST_METRICS_PTR(prev);
 117                        if (prev & DST_METRICS_READ_ONLY)
 118                                p = NULL;
 119                }
 120        }
 121        return p;
 122}
 123
 124static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
 125{
 126        struct in6_addr *p = &rt->rt6i_gateway;
 127
 128        if (!ipv6_addr_any(p))
 129                return (const void *) p;
 130        return daddr;
 131}
 132
 133static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
 134{
 135        struct rt6_info *rt = (struct rt6_info *) dst;
 136        struct neighbour *n;
 137
 138        daddr = choose_neigh_daddr(rt, daddr);
 139        n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
 140        if (n)
 141                return n;
 142        return neigh_create(&nd_tbl, daddr, dst->dev);
 143}
 144
 145static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
 146{
 147        struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
 148        if (!n) {
 149                n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
 150                if (IS_ERR(n))
 151                        return PTR_ERR(n);
 152        }
 153        dst_set_neighbour(&rt->dst, n);
 154
 155        return 0;
 156}
 157
 158static struct dst_ops ip6_dst_ops_template = {
 159        .family                 =       AF_INET6,
 160        .protocol               =       cpu_to_be16(ETH_P_IPV6),
 161        .gc                     =       ip6_dst_gc,
 162        .gc_thresh              =       1024,
 163        .check                  =       ip6_dst_check,
 164        .default_advmss         =       ip6_default_advmss,
 165        .mtu                    =       ip6_mtu,
 166        .cow_metrics            =       ipv6_cow_metrics,
 167        .destroy                =       ip6_dst_destroy,
 168        .ifdown                 =       ip6_dst_ifdown,
 169        .negative_advice        =       ip6_negative_advice,
 170        .link_failure           =       ip6_link_failure,
 171        .update_pmtu            =       ip6_rt_update_pmtu,
 172        .local_out              =       __ip6_local_out,
 173        .neigh_lookup           =       ip6_neigh_lookup,
 174};
 175
 176static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
 177{
 178        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
 179
 180        return mtu ? : dst->dev->mtu;
 181}
 182
 183static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
 184{
 185}
 186
 187static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
 188                                         unsigned long old)
 189{
 190        return NULL;
 191}
 192
 193static struct dst_ops ip6_dst_blackhole_ops = {
 194        .family                 =       AF_INET6,
 195        .protocol               =       cpu_to_be16(ETH_P_IPV6),
 196        .destroy                =       ip6_dst_destroy,
 197        .check                  =       ip6_dst_check,
 198        .mtu                    =       ip6_blackhole_mtu,
 199        .default_advmss         =       ip6_default_advmss,
 200        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
 201        .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
 202        .neigh_lookup           =       ip6_neigh_lookup,
 203};
 204
 205static const u32 ip6_template_metrics[RTAX_MAX] = {
 206        [RTAX_HOPLIMIT - 1] = 255,
 207};
 208
 209static struct rt6_info ip6_null_entry_template = {
 210        .dst = {
 211                .__refcnt       = ATOMIC_INIT(1),
 212                .__use          = 1,
 213                .obsolete       = -1,
 214                .error          = -ENETUNREACH,
 215                .input          = ip6_pkt_discard,
 216                .output         = ip6_pkt_discard_out,
 217        },
 218        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 219        .rt6i_protocol  = RTPROT_KERNEL,
 220        .rt6i_metric    = ~(u32) 0,
 221        .rt6i_ref       = ATOMIC_INIT(1),
 222};
 223
 224#ifdef CONFIG_IPV6_MULTIPLE_TABLES
 225
 226static int ip6_pkt_prohibit(struct sk_buff *skb);
 227static int ip6_pkt_prohibit_out(struct sk_buff *skb);
 228
 229static struct rt6_info ip6_prohibit_entry_template = {
 230        .dst = {
 231                .__refcnt       = ATOMIC_INIT(1),
 232                .__use          = 1,
 233                .obsolete       = -1,
 234                .error          = -EACCES,
 235                .input          = ip6_pkt_prohibit,
 236                .output         = ip6_pkt_prohibit_out,
 237        },
 238        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 239        .rt6i_protocol  = RTPROT_KERNEL,
 240        .rt6i_metric    = ~(u32) 0,
 241        .rt6i_ref       = ATOMIC_INIT(1),
 242};
 243
 244static struct rt6_info ip6_blk_hole_entry_template = {
 245        .dst = {
 246                .__refcnt       = ATOMIC_INIT(1),
 247                .__use          = 1,
 248                .obsolete       = -1,
 249                .error          = -EINVAL,
 250                .input          = dst_discard,
 251                .output         = dst_discard,
 252        },
 253        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 254        .rt6i_protocol  = RTPROT_KERNEL,
 255        .rt6i_metric    = ~(u32) 0,
 256        .rt6i_ref       = ATOMIC_INIT(1),
 257};
 258
 259#endif
 260
 261/* allocate dst with ip6_dst_ops */
 262static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
 263                                             struct net_device *dev,
 264                                             int flags)
 265{
 266        struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
 267
 268        if (rt)
 269                memset(&rt->rt6i_table, 0,
 270                       sizeof(*rt) - sizeof(struct dst_entry));
 271
 272        return rt;
 273}
 274
 275static void ip6_dst_destroy(struct dst_entry *dst)
 276{
 277        struct rt6_info *rt = (struct rt6_info *)dst;
 278        struct inet6_dev *idev = rt->rt6i_idev;
 279        struct inet_peer *peer = rt->rt6i_peer;
 280
 281        if (!(rt->dst.flags & DST_HOST))
 282                dst_destroy_metrics_generic(dst);
 283
 284        if (idev) {
 285                rt->rt6i_idev = NULL;
 286                in6_dev_put(idev);
 287        }
 288
 289        if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
 290                dst_release(dst->from);
 291
 292        if (peer) {
 293                rt->rt6i_peer = NULL;
 294                inet_putpeer(peer);
 295        }
 296}
 297
 298static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
 299
 300static u32 rt6_peer_genid(void)
 301{
 302        return atomic_read(&__rt6_peer_genid);
 303}
 304
 305void rt6_bind_peer(struct rt6_info *rt, int create)
 306{
 307        struct inet_peer *peer;
 308
 309        peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
 310        if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
 311                inet_putpeer(peer);
 312        else
 313                rt->rt6i_peer_genid = rt6_peer_genid();
 314}
 315
 316static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 317                           int how)
 318{
 319        struct rt6_info *rt = (struct rt6_info *)dst;
 320        struct inet6_dev *idev = rt->rt6i_idev;
 321        struct net_device *loopback_dev =
 322                dev_net(dev)->loopback_dev;
 323
 324        if (dev != loopback_dev && idev && idev->dev == dev) {
 325                struct inet6_dev *loopback_idev =
 326                        in6_dev_get(loopback_dev);
 327                if (loopback_idev) {
 328                        rt->rt6i_idev = loopback_idev;
 329                        in6_dev_put(idev);
 330                }
 331        }
 332}
 333
 334static __inline__ int rt6_check_expired(const struct rt6_info *rt)
 335{
 336        struct rt6_info *ort = NULL;
 337
 338        if (rt->rt6i_flags & RTF_EXPIRES) {
 339                if (time_after(jiffies, rt->dst.expires))
 340                        return 1;
 341        } else if (rt->dst.from) {
 342                ort = (struct rt6_info *) rt->dst.from;
 343                return (ort->rt6i_flags & RTF_EXPIRES) &&
 344                        time_after(jiffies, ort->dst.expires);
 345        }
 346        return 0;
 347}
 348
 349static inline int rt6_need_strict(const struct in6_addr *daddr)
 350{
 351        return ipv6_addr_type(daddr) &
 352                (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 353}
 354
 355/*
 356 *      Route lookup. Any table->tb6_lock is implied.
 357 */
 358
 359static inline struct rt6_info *rt6_device_match(struct net *net,
 360                                                    struct rt6_info *rt,
 361                                                    const struct in6_addr *saddr,
 362                                                    int oif,
 363                                                    int flags)
 364{
 365        struct rt6_info *local = NULL;
 366        struct rt6_info *sprt;
 367
 368        if (!oif && ipv6_addr_any(saddr))
 369                goto out;
 370
 371        for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
 372                struct net_device *dev = sprt->dst.dev;
 373
 374                if (oif) {
 375                        if (dev->ifindex == oif)
 376                                return sprt;
 377                        if (dev->flags & IFF_LOOPBACK) {
 378                                if (!sprt->rt6i_idev ||
 379                                    sprt->rt6i_idev->dev->ifindex != oif) {
 380                                        if (flags & RT6_LOOKUP_F_IFACE && oif)
 381                                                continue;
 382                                        if (local && (!oif ||
 383                                                      local->rt6i_idev->dev->ifindex == oif))
 384                                                continue;
 385                                }
 386                                local = sprt;
 387                        }
 388                } else {
 389                        if (ipv6_chk_addr(net, saddr, dev,
 390                                          flags & RT6_LOOKUP_F_IFACE))
 391                                return sprt;
 392                }
 393        }
 394
 395        if (oif) {
 396                if (local)
 397                        return local;
 398
 399                if (flags & RT6_LOOKUP_F_IFACE)
 400                        return net->ipv6.ip6_null_entry;
 401        }
 402out:
 403        return rt;
 404}
 405
 406#ifdef CONFIG_IPV6_ROUTER_PREF
 407static void rt6_probe(struct rt6_info *rt)
 408{
 409        struct neighbour *neigh;
 410        /*
 411         * Okay, this does not seem to be appropriate
 412         * for now, however, we need to check if it
 413         * is really so; aka Router Reachability Probing.
 414         *
 415         * Router Reachability Probe MUST be rate-limited
 416         * to no more than one per minute.
 417         */
 418        rcu_read_lock();
 419        neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
 420        if (!neigh || (neigh->nud_state & NUD_VALID))
 421                goto out;
 422        read_lock_bh(&neigh->lock);
 423        if (!(neigh->nud_state & NUD_VALID) &&
 424            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
 425                struct in6_addr mcaddr;
 426                struct in6_addr *target;
 427
 428                neigh->updated = jiffies;
 429                read_unlock_bh(&neigh->lock);
 430
 431                target = (struct in6_addr *)&neigh->primary_key;
 432                addrconf_addr_solict_mult(target, &mcaddr);
 433                ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
 434        } else {
 435                read_unlock_bh(&neigh->lock);
 436        }
 437out:
 438        rcu_read_unlock();
 439}
 440#else
 441static inline void rt6_probe(struct rt6_info *rt)
 442{
 443}
 444#endif
 445
 446/*
 447 * Default Router Selection (RFC 2461 6.3.6)
 448 */
 449static inline int rt6_check_dev(struct rt6_info *rt, int oif)
 450{
 451        struct net_device *dev = rt->dst.dev;
 452        if (!oif || dev->ifindex == oif)
 453                return 2;
 454        if ((dev->flags & IFF_LOOPBACK) &&
 455            rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
 456                return 1;
 457        return 0;
 458}
 459
 460static inline int rt6_check_neigh(struct rt6_info *rt)
 461{
 462        struct neighbour *neigh;
 463        int m;
 464
 465        rcu_read_lock();
 466        neigh = dst_get_neighbour_noref(&rt->dst);
 467        if (rt->rt6i_flags & RTF_NONEXTHOP ||
 468            !(rt->rt6i_flags & RTF_GATEWAY))
 469                m = 1;
 470        else if (neigh) {
 471                read_lock_bh(&neigh->lock);
 472                if (neigh->nud_state & NUD_VALID)
 473                        m = 2;
 474#ifdef CONFIG_IPV6_ROUTER_PREF
 475                else if (neigh->nud_state & NUD_FAILED)
 476                        m = 0;
 477#endif
 478                else
 479                        m = 1;
 480                read_unlock_bh(&neigh->lock);
 481        } else
 482                m = 0;
 483        rcu_read_unlock();
 484        return m;
 485}
 486
 487static int rt6_score_route(struct rt6_info *rt, int oif,
 488                           int strict)
 489{
 490        int m, n;
 491
 492        m = rt6_check_dev(rt, oif);
 493        if (!m && (strict & RT6_LOOKUP_F_IFACE))
 494                return -1;
 495#ifdef CONFIG_IPV6_ROUTER_PREF
 496        m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
 497#endif
 498        n = rt6_check_neigh(rt);
 499        if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
 500                return -1;
 501        return m;
 502}
 503
 504static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 505                                   int *mpri, struct rt6_info *match)
 506{
 507        int m;
 508
 509        if (rt6_check_expired(rt))
 510                goto out;
 511
 512        m = rt6_score_route(rt, oif, strict);
 513        if (m < 0)
 514                goto out;
 515
 516        if (m > *mpri) {
 517                if (strict & RT6_LOOKUP_F_REACHABLE)
 518                        rt6_probe(match);
 519                *mpri = m;
 520                match = rt;
 521        } else if (strict & RT6_LOOKUP_F_REACHABLE) {
 522                rt6_probe(rt);
 523        }
 524
 525out:
 526        return match;
 527}
 528
 529static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 530                                     struct rt6_info *rr_head,
 531                                     u32 metric, int oif, int strict)
 532{
 533        struct rt6_info *rt, *match;
 534        int mpri = -1;
 535
 536        match = NULL;
 537        for (rt = rr_head; rt && rt->rt6i_metric == metric;
 538             rt = rt->dst.rt6_next)
 539                match = find_match(rt, oif, strict, &mpri, match);
 540        for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
 541             rt = rt->dst.rt6_next)
 542                match = find_match(rt, oif, strict, &mpri, match);
 543
 544        return match;
 545}
 546
 547static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
 548{
 549        struct rt6_info *match, *rt0;
 550        struct net *net;
 551
 552        rt0 = fn->rr_ptr;
 553        if (!rt0)
 554                fn->rr_ptr = rt0 = fn->leaf;
 555
 556        match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
 557
 558        if (!match &&
 559            (strict & RT6_LOOKUP_F_REACHABLE)) {
 560                struct rt6_info *next = rt0->dst.rt6_next;
 561
 562                /* no entries matched; do round-robin */
 563                if (!next || next->rt6i_metric != rt0->rt6i_metric)
 564                        next = fn->leaf;
 565
 566                if (next != rt0)
 567                        fn->rr_ptr = next;
 568        }
 569
 570        net = dev_net(rt0->dst.dev);
 571        return match ? match : net->ipv6.ip6_null_entry;
 572}
 573
 574#ifdef CONFIG_IPV6_ROUTE_INFO
 575int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 576                  const struct in6_addr *gwaddr)
 577{
 578        struct net *net = dev_net(dev);
 579        struct route_info *rinfo = (struct route_info *) opt;
 580        struct in6_addr prefix_buf, *prefix;
 581        unsigned int pref;
 582        unsigned long lifetime;
 583        struct rt6_info *rt;
 584
 585        if (len < sizeof(struct route_info)) {
 586                return -EINVAL;
 587        }
 588
 589        /* Sanity check for prefix_len and length */
 590        if (rinfo->length > 3) {
 591                return -EINVAL;
 592        } else if (rinfo->prefix_len > 128) {
 593                return -EINVAL;
 594        } else if (rinfo->prefix_len > 64) {
 595                if (rinfo->length < 2) {
 596                        return -EINVAL;
 597                }
 598        } else if (rinfo->prefix_len > 0) {
 599                if (rinfo->length < 1) {
 600                        return -EINVAL;
 601                }
 602        }
 603
 604        pref = rinfo->route_pref;
 605        if (pref == ICMPV6_ROUTER_PREF_INVALID)
 606                return -EINVAL;
 607
 608        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
 609
 610        if (rinfo->length == 3)
 611                prefix = (struct in6_addr *)rinfo->prefix;
 612        else {
 613                /* this function is safe */
 614                ipv6_addr_prefix(&prefix_buf,
 615                                 (struct in6_addr *)rinfo->prefix,
 616                                 rinfo->prefix_len);
 617                prefix = &prefix_buf;
 618        }
 619
 620        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
 621                                dev->ifindex);
 622
 623        if (rt && !lifetime) {
 624                ip6_del_rt(rt);
 625                rt = NULL;
 626        }
 627
 628        if (!rt && lifetime)
 629                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
 630                                        pref);
 631        else if (rt)
 632                rt->rt6i_flags = RTF_ROUTEINFO |
 633                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 634
 635        if (rt) {
 636                if (!addrconf_finite_timeout(lifetime))
 637                        rt6_clean_expires(rt);
 638                else
 639                        rt6_set_expires(rt, jiffies + HZ * lifetime);
 640
 641                dst_release(&rt->dst);
 642        }
 643        return 0;
 644}
 645#endif
 646
 647#define BACKTRACK(__net, saddr)                 \
 648do { \
 649        if (rt == __net->ipv6.ip6_null_entry) { \
 650                struct fib6_node *pn; \
 651                while (1) { \
 652                        if (fn->fn_flags & RTN_TL_ROOT) \
 653                                goto out; \
 654                        pn = fn->parent; \
 655                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
 656                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
 657                        else \
 658                                fn = pn; \
 659                        if (fn->fn_flags & RTN_RTINFO) \
 660                                goto restart; \
 661                } \
 662        } \
 663} while (0)
 664
 665static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 666                                             struct fib6_table *table,
 667                                             struct flowi6 *fl6, int flags)
 668{
 669        struct fib6_node *fn;
 670        struct rt6_info *rt;
 671
 672        read_lock_bh(&table->tb6_lock);
 673        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 674restart:
 675        rt = fn->leaf;
 676        rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
 677        BACKTRACK(net, &fl6->saddr);
 678out:
 679        dst_use(&rt->dst, jiffies);
 680        read_unlock_bh(&table->tb6_lock);
 681        return rt;
 682
 683}
 684
 685struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
 686                                    int flags)
 687{
 688        return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
 689}
 690EXPORT_SYMBOL_GPL(ip6_route_lookup);
 691
 692struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
 693                            const struct in6_addr *saddr, int oif, int strict)
 694{
 695        struct flowi6 fl6 = {
 696                .flowi6_oif = oif,
 697                .daddr = *daddr,
 698        };
 699        struct dst_entry *dst;
 700        int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
 701
 702        if (saddr) {
 703                memcpy(&fl6.saddr, saddr, sizeof(*saddr));
 704                flags |= RT6_LOOKUP_F_HAS_SADDR;
 705        }
 706
 707        dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
 708        if (dst->error == 0)
 709                return (struct rt6_info *) dst;
 710
 711        dst_release(dst);
 712
 713        return NULL;
 714}
 715
 716EXPORT_SYMBOL(rt6_lookup);
 717
/*
 * ip6_ins_rt() is called with table->tb6_lock NOT held.
 * It takes a new route entry; if the addition fails for any reason, the
 * route is freed.  In any case, if the caller does not hold the route,
 * it may be destroyed.
 */
 723
 724static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
 725{
 726        int err;
 727        struct fib6_table *table;
 728
 729        table = rt->rt6i_table;
 730        write_lock_bh(&table->tb6_lock);
 731        err = fib6_add(&table->tb6_root, rt, info);
 732        write_unlock_bh(&table->tb6_lock);
 733
 734        return err;
 735}
 736
 737int ip6_ins_rt(struct rt6_info *rt)
 738{
 739        struct nl_info info = {
 740                .nl_net = dev_net(rt->dst.dev),
 741        };
 742        return __ip6_ins_rt(rt, &info);
 743}
 744
 745static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
 746                                      const struct in6_addr *daddr,
 747                                      const struct in6_addr *saddr)
 748{
 749        struct rt6_info *rt;
 750
 751        /*
 752         *      Clone the route.
 753         */
 754
 755        rt = ip6_rt_copy(ort, daddr);
 756
 757        if (rt) {
 758                int attempts = !in_softirq();
 759
 760                if (!(rt->rt6i_flags & RTF_GATEWAY)) {
 761                        if (ort->rt6i_dst.plen != 128 &&
 762                            ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
 763                                rt->rt6i_flags |= RTF_ANYCAST;
 764                        rt->rt6i_gateway = *daddr;
 765                }
 766
 767                rt->rt6i_flags |= RTF_CACHE;
 768
 769#ifdef CONFIG_IPV6_SUBTREES
 770                if (rt->rt6i_src.plen && saddr) {
 771                        rt->rt6i_src.addr = *saddr;
 772                        rt->rt6i_src.plen = 128;
 773                }
 774#endif
 775
 776        retry:
 777                if (rt6_bind_neighbour(rt, rt->dst.dev)) {
 778                        struct net *net = dev_net(rt->dst.dev);
 779                        int saved_rt_min_interval =
 780                                net->ipv6.sysctl.ip6_rt_gc_min_interval;
 781                        int saved_rt_elasticity =
 782                                net->ipv6.sysctl.ip6_rt_gc_elasticity;
 783
 784                        if (attempts-- > 0) {
 785                                net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
 786                                net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
 787
 788                                ip6_dst_gc(&net->ipv6.ip6_dst_ops);
 789
 790                                net->ipv6.sysctl.ip6_rt_gc_elasticity =
 791                                        saved_rt_elasticity;
 792                                net->ipv6.sysctl.ip6_rt_gc_min_interval =
 793                                        saved_rt_min_interval;
 794                                goto retry;
 795                        }
 796
 797                        if (net_ratelimit())
 798                                printk(KERN_WARNING
 799                                       "ipv6: Neighbour table overflow.\n");
 800                        dst_free(&rt->dst);
 801                        return NULL;
 802                }
 803        }
 804
 805        return rt;
 806}
 807
 808static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
 809                                        const struct in6_addr *daddr)
 810{
 811        struct rt6_info *rt = ip6_rt_copy(ort, daddr);
 812
 813        if (rt) {
 814                rt->rt6i_flags |= RTF_CACHE;
 815                dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
 816        }
 817        return rt;
 818}
 819
 820static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
 821                                      struct flowi6 *fl6, int flags)
 822{
 823        struct fib6_node *fn;
 824        struct rt6_info *rt, *nrt;
 825        int strict = 0;
 826        int attempts = 3;
 827        int err;
 828        int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
 829
 830        strict |= flags & RT6_LOOKUP_F_IFACE;
 831
 832relookup:
 833        read_lock_bh(&table->tb6_lock);
 834
 835restart_2:
 836        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 837
 838restart:
 839        rt = rt6_select(fn, oif, strict | reachable);
 840
 841        BACKTRACK(net, &fl6->saddr);
 842        if (rt == net->ipv6.ip6_null_entry ||
 843            rt->rt6i_flags & RTF_CACHE)
 844                goto out;
 845
 846        dst_hold(&rt->dst);
 847        read_unlock_bh(&table->tb6_lock);
 848
 849        if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
 850                nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
 851        else if (!(rt->dst.flags & DST_HOST))
 852                nrt = rt6_alloc_clone(rt, &fl6->daddr);
 853        else
 854                goto out2;
 855
 856        dst_release(&rt->dst);
 857        rt = nrt ? : net->ipv6.ip6_null_entry;
 858
 859        dst_hold(&rt->dst);
 860        if (nrt) {
 861                err = ip6_ins_rt(nrt);
 862                if (!err)
 863                        goto out2;
 864        }
 865
 866        if (--attempts <= 0)
 867                goto out2;
 868
 869        /*
 870         * Race condition! In the gap, when table->tb6_lock was
 871         * released someone could insert this route.  Relookup.
 872         */
 873        dst_release(&rt->dst);
 874        goto relookup;
 875
 876out:
 877        if (reachable) {
 878                reachable = 0;
 879                goto restart_2;
 880        }
 881        dst_hold(&rt->dst);
 882        read_unlock_bh(&table->tb6_lock);
 883out2:
 884        rt->dst.lastuse = jiffies;
 885        rt->dst.__use++;
 886
 887        return rt;
 888}
 889
 890static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
 891                                            struct flowi6 *fl6, int flags)
 892{
 893        return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
 894}
 895
 896static struct dst_entry *ip6_route_input_lookup(struct net *net,
 897                                                struct net_device *dev,
 898                                                struct flowi6 *fl6, int flags)
 899{
 900        if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
 901                flags |= RT6_LOOKUP_F_IFACE;
 902
 903        return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
 904}
 905
 906void ip6_route_input(struct sk_buff *skb)
 907{
 908        const struct ipv6hdr *iph = ipv6_hdr(skb);
 909        struct net *net = dev_net(skb->dev);
 910        int flags = RT6_LOOKUP_F_HAS_SADDR;
 911        struct flowi6 fl6 = {
 912                .flowi6_iif = skb->dev->ifindex,
 913                .daddr = iph->daddr,
 914                .saddr = iph->saddr,
 915                .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
 916                .flowi6_mark = skb->mark,
 917                .flowi6_proto = iph->nexthdr,
 918        };
 919
 920        skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
 921}
 922
 923static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
 924                                             struct flowi6 *fl6, int flags)
 925{
 926        return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
 927}
 928
 929struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
 930                                    struct flowi6 *fl6)
 931{
 932        int flags = 0;
 933
 934        if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
 935                flags |= RT6_LOOKUP_F_IFACE;
 936
 937        if (!ipv6_addr_any(&fl6->saddr))
 938                flags |= RT6_LOOKUP_F_HAS_SADDR;
 939        else if (sk)
 940                flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
 941
 942        return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
 943}
 944
 945EXPORT_SYMBOL(ip6_route_output);
 946
 947struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
 948{
 949        struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
 950        struct dst_entry *new = NULL;
 951
 952        rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
 953        if (rt) {
 954                memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
 955
 956                new = &rt->dst;
 957
 958                new->__use = 1;
 959                new->input = dst_discard;
 960                new->output = dst_discard;
 961
 962                if (dst_metrics_read_only(&ort->dst))
 963                        new->_metrics = ort->dst._metrics;
 964                else
 965                        dst_copy_metrics(new, &ort->dst);
 966                rt->rt6i_idev = ort->rt6i_idev;
 967                if (rt->rt6i_idev)
 968                        in6_dev_hold(rt->rt6i_idev);
 969
 970                rt->rt6i_gateway = ort->rt6i_gateway;
 971                rt->rt6i_flags = ort->rt6i_flags;
 972                rt6_clean_expires(rt);
 973                rt->rt6i_metric = 0;
 974
 975                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
 976#ifdef CONFIG_IPV6_SUBTREES
 977                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
 978#endif
 979
 980                dst_free(new);
 981        }
 982
 983        dst_release(dst_orig);
 984        return new ? new : ERR_PTR(-ENOMEM);
 985}
 986
 987/*
 988 *      Destination cache support functions
 989 */
 990
 991static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 992{
 993        struct rt6_info *rt;
 994
 995        rt = (struct rt6_info *) dst;
 996
 997        if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
 998                if (rt->rt6i_peer_genid != rt6_peer_genid()) {
 999                        if (!rt->rt6i_peer)
1000                                rt6_bind_peer(rt, 0);
1001                        rt->rt6i_peer_genid = rt6_peer_genid();
1002                }
1003                return dst;
1004        }
1005        return NULL;
1006}
1007
1008static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1009{
1010        struct rt6_info *rt = (struct rt6_info *) dst;
1011
1012        if (rt) {
1013                if (rt->rt6i_flags & RTF_CACHE) {
1014                        if (rt6_check_expired(rt)) {
1015                                ip6_del_rt(rt);
1016                                dst = NULL;
1017                        }
1018                } else {
1019                        dst_release(dst);
1020                        dst = NULL;
1021                }
1022        }
1023        return dst;
1024}
1025
1026static void ip6_link_failure(struct sk_buff *skb)
1027{
1028        struct rt6_info *rt;
1029
1030        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1031
1032        rt = (struct rt6_info *) skb_dst(skb);
1033        if (rt) {
1034                if (rt->rt6i_flags & RTF_CACHE)
1035                        rt6_update_expires(rt, 0);
1036                else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1037                        rt->rt6i_node->fn_sernum = -1;
1038        }
1039}
1040
1041static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1042{
1043        struct rt6_info *rt6 = (struct rt6_info*)dst;
1044
1045        if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1046                rt6->rt6i_flags |= RTF_MODIFIED;
1047                if (mtu < IPV6_MIN_MTU) {
1048                        u32 features = dst_metric(dst, RTAX_FEATURES);
1049                        mtu = IPV6_MIN_MTU;
1050                        features |= RTAX_FEATURE_ALLFRAG;
1051                        dst_metric_set(dst, RTAX_FEATURES, features);
1052                }
1053                dst_metric_set(dst, RTAX_MTU, mtu);
1054        }
1055}
1056
1057static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1058{
1059        struct net_device *dev = dst->dev;
1060        unsigned int mtu = dst_mtu(dst);
1061        struct net *net = dev_net(dev);
1062
1063        mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1064
1065        if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1066                mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1067
1068        /*
1069         * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1070         * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1071         * IPV6_MAXPLEN is also valid and means: "any MSS,
1072         * rely only on pmtu discovery"
1073         */
1074        if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1075                mtu = IPV6_MAXPLEN;
1076        return mtu;
1077}
1078
1079static unsigned int ip6_mtu(const struct dst_entry *dst)
1080{
1081        struct inet6_dev *idev;
1082        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1083
1084        if (mtu)
1085                return mtu;
1086
1087        mtu = IPV6_MIN_MTU;
1088
1089        rcu_read_lock();
1090        idev = __in6_dev_get(dst->dev);
1091        if (idev)
1092                mtu = idev->cnf.mtu6;
1093        rcu_read_unlock();
1094
1095        return mtu;
1096}
1097
/*
 * Head of the singly linked list (chained through dst->next) of the dst
 * entries created by icmp6_dst_alloc().  icmp6_dst_gc() and
 * icmp6_clean_all() unlink and free entries from it.
 */
static struct dst_entry *icmp6_dst_gc_list;
/* Held (with bottom halves disabled) around every walk or update of the list. */
static DEFINE_SPINLOCK(icmp6_dst_lock);
1100
1101struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1102                                  struct neighbour *neigh,
1103                                  struct flowi6 *fl6)
1104{
1105        struct dst_entry *dst;
1106        struct rt6_info *rt;
1107        struct inet6_dev *idev = in6_dev_get(dev);
1108        struct net *net = dev_net(dev);
1109
1110        if (unlikely(!idev))
1111                return ERR_PTR(-ENODEV);
1112
1113        rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1114        if (unlikely(!rt)) {
1115                in6_dev_put(idev);
1116                dst = ERR_PTR(-ENOMEM);
1117                goto out;
1118        }
1119
1120        if (neigh)
1121                neigh_hold(neigh);
1122        else {
1123                neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1124                if (IS_ERR(neigh)) {
1125                        in6_dev_put(idev);
1126                        dst_free(&rt->dst);
1127                        return ERR_CAST(neigh);
1128                }
1129        }
1130
1131        rt->dst.flags |= DST_HOST;
1132        rt->dst.output  = ip6_output;
1133        dst_set_neighbour(&rt->dst, neigh);
1134        atomic_set(&rt->dst.__refcnt, 1);
1135        rt->rt6i_dst.addr = fl6->daddr;
1136        rt->rt6i_dst.plen = 128;
1137        rt->rt6i_idev     = idev;
1138        dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1139
1140        spin_lock_bh(&icmp6_dst_lock);
1141        rt->dst.next = icmp6_dst_gc_list;
1142        icmp6_dst_gc_list = &rt->dst;
1143        spin_unlock_bh(&icmp6_dst_lock);
1144
1145        fib6_force_start_gc(net);
1146
1147        dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1148
1149out:
1150        return dst;
1151}
1152
1153int icmp6_dst_gc(void)
1154{
1155        struct dst_entry *dst, **pprev;
1156        int more = 0;
1157
1158        spin_lock_bh(&icmp6_dst_lock);
1159        pprev = &icmp6_dst_gc_list;
1160
1161        while ((dst = *pprev) != NULL) {
1162                if (!atomic_read(&dst->__refcnt)) {
1163                        *pprev = dst->next;
1164                        dst_free(dst);
1165                } else {
1166                        pprev = &dst->next;
1167                        ++more;
1168                }
1169        }
1170
1171        spin_unlock_bh(&icmp6_dst_lock);
1172
1173        return more;
1174}
1175
1176static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1177                            void *arg)
1178{
1179        struct dst_entry *dst, **pprev;
1180
1181        spin_lock_bh(&icmp6_dst_lock);
1182        pprev = &icmp6_dst_gc_list;
1183        while ((dst = *pprev) != NULL) {
1184                struct rt6_info *rt = (struct rt6_info *) dst;
1185                if (func(rt, arg)) {
1186                        *pprev = dst->next;
1187                        dst_free(dst);
1188                } else {
1189                        pprev = &dst->next;
1190                }
1191        }
1192        spin_unlock_bh(&icmp6_dst_lock);
1193}
1194
1195static int ip6_dst_gc(struct dst_ops *ops)
1196{
1197        unsigned long now = jiffies;
1198        struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1199        int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1200        int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1201        int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1202        int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1203        unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1204        int entries;
1205
1206        entries = dst_entries_get_fast(ops);
1207        if (time_after(rt_last_gc + rt_min_interval, now) &&
1208            entries <= rt_max_size)
1209                goto out;
1210
1211        net->ipv6.ip6_rt_gc_expire++;
1212        fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1213        net->ipv6.ip6_rt_last_gc = now;
1214        entries = dst_entries_get_slow(ops);
1215        if (entries < ops->gc_thresh)
1216                net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1217out:
1218        net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1219        return entries > rt_max_size;
1220}
1221
1222/* Clean host part of a prefix. Not necessary in radix tree,
1223   but results in cleaner routing tables.
1224
1225   Remove it only when all the things will work!
1226 */
1227
1228int ip6_dst_hoplimit(struct dst_entry *dst)
1229{
1230        int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1231        if (hoplimit == 0) {
1232                struct net_device *dev = dst->dev;
1233                struct inet6_dev *idev;
1234
1235                rcu_read_lock();
1236                idev = __in6_dev_get(dev);
1237                if (idev)
1238                        hoplimit = idev->cnf.hop_limit;
1239                else
1240                        hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1241                rcu_read_unlock();
1242        }
1243        return hoplimit;
1244}
1245EXPORT_SYMBOL(ip6_dst_hoplimit);
1246
1247/*
1248 *
1249 */
1250
1251int ip6_route_add(struct fib6_config *cfg)
1252{
1253        int err;
1254        struct net *net = cfg->fc_nlinfo.nl_net;
1255        struct rt6_info *rt = NULL;
1256        struct net_device *dev = NULL;
1257        struct inet6_dev *idev = NULL;
1258        struct fib6_table *table;
1259        int addr_type;
1260
1261        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1262                return -EINVAL;
1263#ifndef CONFIG_IPV6_SUBTREES
1264        if (cfg->fc_src_len)
1265                return -EINVAL;
1266#endif
1267        if (cfg->fc_ifindex) {
1268                err = -ENODEV;
1269                dev = dev_get_by_index(net, cfg->fc_ifindex);
1270                if (!dev)
1271                        goto out;
1272                idev = in6_dev_get(dev);
1273                if (!idev)
1274                        goto out;
1275        }
1276
1277        if (cfg->fc_metric == 0)
1278                cfg->fc_metric = IP6_RT_PRIO_USER;
1279
1280        err = -ENOBUFS;
1281        if (cfg->fc_nlinfo.nlh &&
1282            !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1283                table = fib6_get_table(net, cfg->fc_table);
1284                if (!table) {
1285                        printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1286                        table = fib6_new_table(net, cfg->fc_table);
1287                }
1288        } else {
1289                table = fib6_new_table(net, cfg->fc_table);
1290        }
1291
1292        if (!table)
1293                goto out;
1294
1295        rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1296
1297        if (!rt) {
1298                err = -ENOMEM;
1299                goto out;
1300        }
1301
1302        rt->dst.obsolete = -1;
1303
1304        if (cfg->fc_flags & RTF_EXPIRES)
1305                rt6_set_expires(rt, jiffies +
1306                                clock_t_to_jiffies(cfg->fc_expires));
1307        else
1308                rt6_clean_expires(rt);
1309
1310        if (cfg->fc_protocol == RTPROT_UNSPEC)
1311                cfg->fc_protocol = RTPROT_BOOT;
1312        rt->rt6i_protocol = cfg->fc_protocol;
1313
1314        addr_type = ipv6_addr_type(&cfg->fc_dst);
1315
1316        if (addr_type & IPV6_ADDR_MULTICAST)
1317                rt->dst.input = ip6_mc_input;
1318        else if (cfg->fc_flags & RTF_LOCAL)
1319                rt->dst.input = ip6_input;
1320        else
1321                rt->dst.input = ip6_forward;
1322
1323        rt->dst.output = ip6_output;
1324
1325        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1326        rt->rt6i_dst.plen = cfg->fc_dst_len;
1327        if (rt->rt6i_dst.plen == 128)
1328               rt->dst.flags |= DST_HOST;
1329
1330        if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1331                u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1332                if (!metrics) {
1333                        err = -ENOMEM;
1334                        goto out;
1335                }
1336                dst_init_metrics(&rt->dst, metrics, 0);
1337        }
1338#ifdef CONFIG_IPV6_SUBTREES
1339        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1340        rt->rt6i_src.plen = cfg->fc_src_len;
1341#endif
1342
1343        rt->rt6i_metric = cfg->fc_metric;
1344
1345        /* We cannot add true routes via loopback here,
1346           they would result in kernel looping; promote them to reject routes
1347         */
1348        if ((cfg->fc_flags & RTF_REJECT) ||
1349            (dev && (dev->flags & IFF_LOOPBACK) &&
1350             !(addr_type & IPV6_ADDR_LOOPBACK) &&
1351             !(cfg->fc_flags & RTF_LOCAL))) {
1352                /* hold loopback dev/idev if we haven't done so. */
1353                if (dev != net->loopback_dev) {
1354                        if (dev) {
1355                                dev_put(dev);
1356                                in6_dev_put(idev);
1357                        }
1358                        dev = net->loopback_dev;
1359                        dev_hold(dev);
1360                        idev = in6_dev_get(dev);
1361                        if (!idev) {
1362                                err = -ENODEV;
1363                                goto out;
1364                        }
1365                }
1366                rt->dst.output = ip6_pkt_discard_out;
1367                rt->dst.input = ip6_pkt_discard;
1368                rt->dst.error = -ENETUNREACH;
1369                rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1370                goto install_route;
1371        }
1372
1373        if (cfg->fc_flags & RTF_GATEWAY) {
1374                const struct in6_addr *gw_addr;
1375                int gwa_type;
1376
1377                gw_addr = &cfg->fc_gateway;
1378                rt->rt6i_gateway = *gw_addr;
1379                gwa_type = ipv6_addr_type(gw_addr);
1380
1381                if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1382                        struct rt6_info *grt;
1383
1384                        /* IPv6 strictly inhibits using not link-local
1385                           addresses as nexthop address.
1386                           Otherwise, router will not able to send redirects.
1387                           It is very good, but in some (rare!) circumstances
1388                           (SIT, PtP, NBMA NOARP links) it is handy to allow
1389                           some exceptions. --ANK
1390                         */
1391                        err = -EINVAL;
1392                        if (!(gwa_type & IPV6_ADDR_UNICAST))
1393                                goto out;
1394
1395                        grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1396
1397                        err = -EHOSTUNREACH;
1398                        if (!grt)
1399                                goto out;
1400                        if (dev) {
1401                                if (dev != grt->dst.dev) {
1402                                        dst_release(&grt->dst);
1403                                        goto out;
1404                                }
1405                        } else {
1406                                dev = grt->dst.dev;
1407                                idev = grt->rt6i_idev;
1408                                dev_hold(dev);
1409                                in6_dev_hold(grt->rt6i_idev);
1410                        }
1411                        if (!(grt->rt6i_flags & RTF_GATEWAY))
1412                                err = 0;
1413                        dst_release(&grt->dst);
1414
1415                        if (err)
1416                                goto out;
1417                }
1418                err = -EINVAL;
1419                if (!dev || (dev->flags & IFF_LOOPBACK))
1420                        goto out;
1421        }
1422
1423        err = -ENODEV;
1424        if (!dev)
1425                goto out;
1426
1427        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1428                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1429                        err = -EINVAL;
1430                        goto out;
1431                }
1432                rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1433                rt->rt6i_prefsrc.plen = 128;
1434        } else
1435                rt->rt6i_prefsrc.plen = 0;
1436
1437        if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1438                err = rt6_bind_neighbour(rt, dev);
1439                if (err)
1440                        goto out;
1441        }
1442
1443        rt->rt6i_flags = cfg->fc_flags;
1444
1445install_route:
1446        if (cfg->fc_mx) {
1447                struct nlattr *nla;
1448                int remaining;
1449
1450                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1451                        int type = nla_type(nla);
1452
1453                        if (type) {
1454                                if (type > RTAX_MAX) {
1455                                        err = -EINVAL;
1456                                        goto out;
1457                                }
1458
1459                                dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1460                        }
1461                }
1462        }
1463
1464        rt->dst.dev = dev;
1465        rt->rt6i_idev = idev;
1466        rt->rt6i_table = table;
1467
1468        cfg->fc_nlinfo.nl_net = dev_net(dev);
1469
1470        return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1471
1472out:
1473        if (dev)
1474                dev_put(dev);
1475        if (idev)
1476                in6_dev_put(idev);
1477        if (rt)
1478                dst_free(&rt->dst);
1479        return err;
1480}
1481
1482static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1483{
1484        int err;
1485        struct fib6_table *table;
1486        struct net *net = dev_net(rt->dst.dev);
1487
1488        if (rt == net->ipv6.ip6_null_entry)
1489                return -ENOENT;
1490
1491        table = rt->rt6i_table;
1492        write_lock_bh(&table->tb6_lock);
1493
1494        err = fib6_del(rt, info);
1495        dst_release(&rt->dst);
1496
1497        write_unlock_bh(&table->tb6_lock);
1498
1499        return err;
1500}
1501
1502int ip6_del_rt(struct rt6_info *rt)
1503{
1504        struct nl_info info = {
1505                .nl_net = dev_net(rt->dst.dev),
1506        };
1507        return __ip6_del_rt(rt, &info);
1508}
1509
1510static int ip6_route_del(struct fib6_config *cfg)
1511{
1512        struct fib6_table *table;
1513        struct fib6_node *fn;
1514        struct rt6_info *rt;
1515        int err = -ESRCH;
1516
1517        table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1518        if (!table)
1519                return err;
1520
1521        read_lock_bh(&table->tb6_lock);
1522
1523        fn = fib6_locate(&table->tb6_root,
1524                         &cfg->fc_dst, cfg->fc_dst_len,
1525                         &cfg->fc_src, cfg->fc_src_len);
1526
1527        if (fn) {
1528                for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1529                        if (cfg->fc_ifindex &&
1530                            (!rt->dst.dev ||
1531                             rt->dst.dev->ifindex != cfg->fc_ifindex))
1532                                continue;
1533                        if (cfg->fc_flags & RTF_GATEWAY &&
1534                            !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1535                                continue;
1536                        if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1537                                continue;
1538                        dst_hold(&rt->dst);
1539                        read_unlock_bh(&table->tb6_lock);
1540
1541                        return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1542                }
1543        }
1544        read_unlock_bh(&table->tb6_lock);
1545
1546        return err;
1547}
1548
1549/*
1550 *      Handle redirects
1551 */
/*
 * Flow key for redirect lookups: a flowi6 with the redirecting gateway's
 * address tacked on.  fl6 must stay the first member, because
 * __ip6_route_redirect() casts the struct flowi6 pointer it receives back
 * to a struct ip6rd_flowi pointer.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;	/* router the redirect claims to come from */
};
1556
1557static struct rt6_info *__ip6_route_redirect(struct net *net,
1558                                             struct fib6_table *table,
1559                                             struct flowi6 *fl6,
1560                                             int flags)
1561{
1562        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1563        struct rt6_info *rt;
1564        struct fib6_node *fn;
1565
1566        /*
1567         * Get the "current" route for this destination and
1568         * check if the redirect has come from approriate router.
1569         *
1570         * RFC 2461 specifies that redirects should only be
1571         * accepted if they come from the nexthop to the target.
1572         * Due to the way the routes are chosen, this notion
1573         * is a bit fuzzy and one might need to check all possible
1574         * routes.
1575         */
1576
1577        read_lock_bh(&table->tb6_lock);
1578        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1579restart:
1580        for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1581                /*
1582                 * Current route is on-link; redirect is always invalid.
1583                 *
1584                 * Seems, previous statement is not true. It could
1585                 * be node, which looks for us as on-link (f.e. proxy ndisc)
1586                 * But then router serving it might decide, that we should
1587                 * know truth 8)8) --ANK (980726).
1588                 */
1589                if (rt6_check_expired(rt))
1590                        continue;
1591                if (!(rt->rt6i_flags & RTF_GATEWAY))
1592                        continue;
1593                if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1594                        continue;
1595                if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1596                        continue;
1597                break;
1598        }
1599
1600        if (!rt)
1601                rt = net->ipv6.ip6_null_entry;
1602        BACKTRACK(net, &fl6->saddr);
1603out:
1604        dst_hold(&rt->dst);
1605
1606        read_unlock_bh(&table->tb6_lock);
1607
1608        return rt;
1609};
1610
1611static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1612                                           const struct in6_addr *src,
1613                                           const struct in6_addr *gateway,
1614                                           struct net_device *dev)
1615{
1616        int flags = RT6_LOOKUP_F_HAS_SADDR;
1617        struct net *net = dev_net(dev);
1618        struct ip6rd_flowi rdfl = {
1619                .fl6 = {
1620                        .flowi6_oif = dev->ifindex,
1621                        .daddr = *dest,
1622                        .saddr = *src,
1623                },
1624        };
1625
1626        rdfl.gateway = *gateway;
1627
1628        if (rt6_need_strict(dest))
1629                flags |= RT6_LOOKUP_F_IFACE;
1630
1631        return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1632                                                   flags, __ip6_route_redirect);
1633}
1634
1635void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1636                  const struct in6_addr *saddr,
1637                  struct neighbour *neigh, u8 *lladdr, int on_link)
1638{
1639        struct rt6_info *rt, *nrt = NULL;
1640        struct netevent_redirect netevent;
1641        struct net *net = dev_net(neigh->dev);
1642
1643        rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1644
1645        if (rt == net->ipv6.ip6_null_entry) {
1646                if (net_ratelimit())
1647                        printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1648                               "for redirect target\n");
1649                goto out;
1650        }
1651
1652        /*
1653         *      We have finally decided to accept it.
1654         */
1655
1656        neigh_update(neigh, lladdr, NUD_STALE,
1657                     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1658                     NEIGH_UPDATE_F_OVERRIDE|
1659                     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1660                                     NEIGH_UPDATE_F_ISROUTER))
1661                     );
1662
1663        /*
1664         * Redirect received -> path was valid.
1665         * Look, redirects are sent only in response to data packets,
1666         * so that this nexthop apparently is reachable. --ANK
1667         */
1668        dst_confirm(&rt->dst);
1669
1670        /* Duplicate redirect: silently ignore. */
1671        if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1672                goto out;
1673
1674        nrt = ip6_rt_copy(rt, dest);
1675        if (!nrt)
1676                goto out;
1677
1678        nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1679        if (on_link)
1680                nrt->rt6i_flags &= ~RTF_GATEWAY;
1681
1682        nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1683        dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1684
1685        if (ip6_ins_rt(nrt))
1686                goto out;
1687
1688        netevent.old = &rt->dst;
1689        netevent.new = &nrt->dst;
1690        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1691
1692        if (rt->rt6i_flags & RTF_CACHE) {
1693                ip6_del_rt(rt);
1694                return;
1695        }
1696
1697out:
1698        dst_release(&rt->dst);
1699}
1700
1701/*
1702 *      Handle ICMP "packet too big" messages
1703 *      i.e. Path MTU discovery
1704 */
1705
1706static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1707                             struct net *net, u32 pmtu, int ifindex)
1708{
1709        struct rt6_info *rt, *nrt;
1710        int allfrag = 0;
1711again:
1712        rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1713        if (!rt)
1714                return;
1715
1716        if (rt6_check_expired(rt)) {
1717                ip6_del_rt(rt);
1718                goto again;
1719        }
1720
1721        if (pmtu >= dst_mtu(&rt->dst))
1722                goto out;
1723
1724        if (pmtu < IPV6_MIN_MTU) {
1725                /*
1726                 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1727                 * MTU (1280) and a fragment header should always be included
1728                 * after a node receiving Too Big message reporting PMTU is
1729                 * less than the IPv6 Minimum Link MTU.
1730                 */
1731                pmtu = IPV6_MIN_MTU;
1732                allfrag = 1;
1733        }
1734
1735        /* New mtu received -> path was valid.
1736           They are sent only in response to data packets,
1737           so that this nexthop apparently is reachable. --ANK
1738         */
1739        dst_confirm(&rt->dst);
1740
1741        /* Host route. If it is static, it would be better
1742           not to override it, but add new one, so that
1743           when cache entry will expire old pmtu
1744           would return automatically.
1745         */
1746        if (rt->rt6i_flags & RTF_CACHE) {
1747                dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1748                if (allfrag) {
1749                        u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1750                        features |= RTAX_FEATURE_ALLFRAG;
1751                        dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1752                }
1753                rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1754                rt->rt6i_flags |= RTF_MODIFIED;
1755                goto out;
1756        }
1757
1758        /* Network route.
1759           Two cases are possible:
1760           1. It is connected route. Action: COW
1761           2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1762         */
1763        if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1764                nrt = rt6_alloc_cow(rt, daddr, saddr);
1765        else
1766                nrt = rt6_alloc_clone(rt, daddr);
1767
1768        if (nrt) {
1769                dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1770                if (allfrag) {
1771                        u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1772                        features |= RTAX_FEATURE_ALLFRAG;
1773                        dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1774                }
1775
1776                /* According to RFC 1981, detecting PMTU increase shouldn't be
1777                 * happened within 5 mins, the recommended timer is 10 mins.
1778                 * Here this route expiration time is set to ip6_rt_mtu_expires
1779                 * which is 10 mins. After 10 mins the decreased pmtu is expired
1780                 * and detecting PMTU increase will be automatically happened.
1781                 */
1782                rt6_update_expires(nrt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1783                nrt->rt6i_flags |= RTF_DYNAMIC;
1784                ip6_ins_rt(nrt);
1785        }
1786out:
1787        dst_release(&rt->dst);
1788}
1789
1790void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1791                        struct net_device *dev, u32 pmtu)
1792{
1793        struct net *net = dev_net(dev);
1794
1795        /*
1796         * RFC 1981 states that a node "MUST reduce the size of the packets it
1797         * is sending along the path" that caused the Packet Too Big message.
1798         * Since it's not possible in the general case to determine which
1799         * interface was used to send the original packet, we update the MTU
1800         * on the interface that will be used to send future packets. We also
1801         * update the MTU on the interface that received the Packet Too Big in
1802         * case the original packet was forced out that interface with
1803         * SO_BINDTODEVICE or similar. This is the next best thing to the
1804         * correct behaviour, which would be to update the MTU on all
1805         * interfaces.
1806         */
1807        rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1808        rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1809}
1810
1811/*
1812 *      Misc support functions
1813 */
1814
1815static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1816                                    const struct in6_addr *dest)
1817{
1818        struct net *net = dev_net(ort->dst.dev);
1819        struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1820                                            ort->dst.dev, 0);
1821
1822        if (rt) {
1823                rt->dst.input = ort->dst.input;
1824                rt->dst.output = ort->dst.output;
1825                rt->dst.flags |= DST_HOST;
1826
1827                rt->rt6i_dst.addr = *dest;
1828                rt->rt6i_dst.plen = 128;
1829                dst_copy_metrics(&rt->dst, &ort->dst);
1830                rt->dst.error = ort->dst.error;
1831                rt->rt6i_idev = ort->rt6i_idev;
1832                if (rt->rt6i_idev)
1833                        in6_dev_hold(rt->rt6i_idev);
1834                rt->dst.lastuse = jiffies;
1835
1836                rt->rt6i_gateway = ort->rt6i_gateway;
1837                rt->rt6i_flags = ort->rt6i_flags;
1838                if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1839                    (RTF_DEFAULT | RTF_ADDRCONF))
1840                        rt6_set_from(rt, ort);
1841                else
1842                        rt6_clean_expires(rt);
1843                rt->rt6i_metric = 0;
1844
1845#ifdef CONFIG_IPV6_SUBTREES
1846                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1847#endif
1848                memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1849                rt->rt6i_table = ort->rt6i_table;
1850        }
1851        return rt;
1852}
1853
1854#ifdef CONFIG_IPV6_ROUTE_INFO
1855static struct rt6_info *rt6_get_route_info(struct net *net,
1856                                           const struct in6_addr *prefix, int prefixlen,
1857                                           const struct in6_addr *gwaddr, int ifindex)
1858{
1859        struct fib6_node *fn;
1860        struct rt6_info *rt = NULL;
1861        struct fib6_table *table;
1862
1863        table = fib6_get_table(net, RT6_TABLE_INFO);
1864        if (!table)
1865                return NULL;
1866
1867        write_lock_bh(&table->tb6_lock);
1868        fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1869        if (!fn)
1870                goto out;
1871
1872        for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1873                if (rt->dst.dev->ifindex != ifindex)
1874                        continue;
1875                if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1876                        continue;
1877                if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1878                        continue;
1879                dst_hold(&rt->dst);
1880                break;
1881        }
1882out:
1883        write_unlock_bh(&table->tb6_lock);
1884        return rt;
1885}
1886
1887static struct rt6_info *rt6_add_route_info(struct net *net,
1888                                           const struct in6_addr *prefix, int prefixlen,
1889                                           const struct in6_addr *gwaddr, int ifindex,
1890                                           unsigned pref)
1891{
1892        struct fib6_config cfg = {
1893                .fc_table       = RT6_TABLE_INFO,
1894                .fc_metric      = IP6_RT_PRIO_USER,
1895                .fc_ifindex     = ifindex,
1896                .fc_dst_len     = prefixlen,
1897                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1898                                  RTF_UP | RTF_PREF(pref),
1899                .fc_nlinfo.pid = 0,
1900                .fc_nlinfo.nlh = NULL,
1901                .fc_nlinfo.nl_net = net,
1902        };
1903
1904        cfg.fc_dst = *prefix;
1905        cfg.fc_gateway = *gwaddr;
1906
1907        /* We should treat it as a default route if prefix length is 0. */
1908        if (!prefixlen)
1909                cfg.fc_flags |= RTF_DEFAULT;
1910
1911        ip6_route_add(&cfg);
1912
1913        return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1914}
1915#endif
1916
1917struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1918{
1919        struct rt6_info *rt;
1920        struct fib6_table *table;
1921
1922        table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1923        if (!table)
1924                return NULL;
1925
1926        write_lock_bh(&table->tb6_lock);
1927        for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1928                if (dev == rt->dst.dev &&
1929                    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1930                    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1931                        break;
1932        }
1933        if (rt)
1934                dst_hold(&rt->dst);
1935        write_unlock_bh(&table->tb6_lock);
1936        return rt;
1937}
1938
1939struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1940                                     struct net_device *dev,
1941                                     unsigned int pref)
1942{
1943        struct fib6_config cfg = {
1944                .fc_table       = RT6_TABLE_DFLT,
1945                .fc_metric      = IP6_RT_PRIO_USER,
1946                .fc_ifindex     = dev->ifindex,
1947                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1948                                  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1949                .fc_nlinfo.pid = 0,
1950                .fc_nlinfo.nlh = NULL,
1951                .fc_nlinfo.nl_net = dev_net(dev),
1952        };
1953
1954        cfg.fc_gateway = *gwaddr;
1955
1956        ip6_route_add(&cfg);
1957
1958        return rt6_get_dflt_router(gwaddr, dev);
1959}
1960
1961void rt6_purge_dflt_routers(struct net *net)
1962{
1963        struct rt6_info *rt;
1964        struct fib6_table *table;
1965
1966        /* NOTE: Keep consistent with rt6_get_dflt_router */
1967        table = fib6_get_table(net, RT6_TABLE_DFLT);
1968        if (!table)
1969                return;
1970
1971restart:
1972        read_lock_bh(&table->tb6_lock);
1973        for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1974                if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1975                        dst_hold(&rt->dst);
1976                        read_unlock_bh(&table->tb6_lock);
1977                        ip6_del_rt(rt);
1978                        goto restart;
1979                }
1980        }
1981        read_unlock_bh(&table->tb6_lock);
1982}
1983
1984static void rtmsg_to_fib6_config(struct net *net,
1985                                 struct in6_rtmsg *rtmsg,
1986                                 struct fib6_config *cfg)
1987{
1988        memset(cfg, 0, sizeof(*cfg));
1989
1990        cfg->fc_table = RT6_TABLE_MAIN;
1991        cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1992        cfg->fc_metric = rtmsg->rtmsg_metric;
1993        cfg->fc_expires = rtmsg->rtmsg_info;
1994        cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1995        cfg->fc_src_len = rtmsg->rtmsg_src_len;
1996        cfg->fc_flags = rtmsg->rtmsg_flags;
1997
1998        cfg->fc_nlinfo.nl_net = net;
1999
2000        cfg->fc_dst = rtmsg->rtmsg_dst;
2001        cfg->fc_src = rtmsg->rtmsg_src;
2002        cfg->fc_gateway = rtmsg->rtmsg_gateway;
2003}
2004
2005int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2006{
2007        struct fib6_config cfg;
2008        struct in6_rtmsg rtmsg;
2009        int err;
2010
2011        switch(cmd) {
2012        case SIOCADDRT:         /* Add a route */
2013        case SIOCDELRT:         /* Delete a route */
2014                if (!capable(CAP_NET_ADMIN))
2015                        return -EPERM;
2016                err = copy_from_user(&rtmsg, arg,
2017                                     sizeof(struct in6_rtmsg));
2018                if (err)
2019                        return -EFAULT;
2020
2021                rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2022
2023                rtnl_lock();
2024                switch (cmd) {
2025                case SIOCADDRT:
2026                        err = ip6_route_add(&cfg);
2027                        break;
2028                case SIOCDELRT:
2029                        err = ip6_route_del(&cfg);
2030                        break;
2031                default:
2032                        err = -EINVAL;
2033                }
2034                rtnl_unlock();
2035
2036                return err;
2037        }
2038
2039        return -EINVAL;
2040}
2041
2042/*
2043 *      Drop the packet on the floor
2044 */
2045
2046static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2047{
2048        int type;
2049        struct dst_entry *dst = skb_dst(skb);
2050        switch (ipstats_mib_noroutes) {
2051        case IPSTATS_MIB_INNOROUTES:
2052                type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2053                if (type == IPV6_ADDR_ANY) {
2054                        IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2055                                      IPSTATS_MIB_INADDRERRORS);
2056                        break;
2057                }
2058                /* FALLTHROUGH */
2059        case IPSTATS_MIB_OUTNOROUTES:
2060                IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2061                              ipstats_mib_noroutes);
2062                break;
2063        }
2064        icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2065        kfree_skb(skb);
2066        return 0;
2067}
2068
2069static int ip6_pkt_discard(struct sk_buff *skb)
2070{
2071        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2072}
2073
2074static int ip6_pkt_discard_out(struct sk_buff *skb)
2075{
2076        skb->dev = skb_dst(skb)->dev;
2077        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2078}
2079
2080#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2081
2082static int ip6_pkt_prohibit(struct sk_buff *skb)
2083{
2084        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2085}
2086
2087static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2088{
2089        skb->dev = skb_dst(skb)->dev;
2090        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2091}
2092
2093#endif
2094
2095/*
2096 *      Allocate a dst for local (unicast / anycast) address.
2097 */
2098
2099struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2100                                    const struct in6_addr *addr,
2101                                    bool anycast)
2102{
2103        struct net *net = dev_net(idev->dev);
2104        struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2105                                            net->loopback_dev, 0);
2106        int err;
2107
2108        if (!rt) {
2109                if (net_ratelimit())
2110                        pr_warning("IPv6:  Maximum number of routes reached,"
2111                                   " consider increasing route/max_size.\n");
2112                return ERR_PTR(-ENOMEM);
2113        }
2114
2115        in6_dev_hold(idev);
2116
2117        rt->dst.flags |= DST_HOST;
2118        rt->dst.input = ip6_input;
2119        rt->dst.output = ip6_output;
2120        rt->rt6i_idev = idev;
2121        rt->dst.obsolete = -1;
2122
2123        rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2124        if (anycast)
2125                rt->rt6i_flags |= RTF_ANYCAST;
2126        else
2127                rt->rt6i_flags |= RTF_LOCAL;
2128        err = rt6_bind_neighbour(rt, rt->dst.dev);
2129        if (err) {
2130                dst_free(&rt->dst);
2131                return ERR_PTR(err);
2132        }
2133
2134        rt->rt6i_dst.addr = *addr;
2135        rt->rt6i_dst.plen = 128;
2136        rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2137
2138        atomic_set(&rt->dst.__refcnt, 1);
2139
2140        return rt;
2141}
2142
2143int ip6_route_get_saddr(struct net *net,
2144                        struct rt6_info *rt,
2145                        const struct in6_addr *daddr,
2146                        unsigned int prefs,
2147                        struct in6_addr *saddr)
2148{
2149        struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2150        int err = 0;
2151        if (rt->rt6i_prefsrc.plen)
2152                *saddr = rt->rt6i_prefsrc.addr;
2153        else
2154                err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2155                                         daddr, prefs, saddr);
2156        return err;
2157}
2158
/*
 * remove deleted ip from prefsrc entries
 *
 * Cookie handed through fib6_clean_all() to fib6_remove_prefsrc().
 */
struct arg_dev_net_ip {
	struct net_device	*dev;	/* only routes on this device; NULL matches any */
	struct net		*net;	/* namespace whose null entry is skipped */
	struct in6_addr		*addr;	/* address compared against each route's prefsrc */
};
2165
2166static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2167{
2168        struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2169        struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2170        struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2171
2172        if (((void *)rt->dst.dev == dev || !dev) &&
2173            rt != net->ipv6.ip6_null_entry &&
2174            ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2175                /* remove prefsrc entry */
2176                rt->rt6i_prefsrc.plen = 0;
2177        }
2178        return 0;
2179}
2180
2181void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2182{
2183        struct net *net = dev_net(ifp->idev->dev);
2184        struct arg_dev_net_ip adni = {
2185                .dev = ifp->idev->dev,
2186                .net = net,
2187                .addr = &ifp->addr,
2188        };
2189        fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2190}
2191
/* Cookie handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device	*dev;	/* device to match; NULL matches any */
	struct net		*net;	/* namespace whose null entry is skipped */
};
2196
2197static int fib6_ifdown(struct rt6_info *rt, void *arg)
2198{
2199        const struct arg_dev_net *adn = arg;
2200        const struct net_device *dev = adn->dev;
2201
2202        if ((rt->dst.dev == dev || !dev) &&
2203            rt != adn->net->ipv6.ip6_null_entry)
2204                return -1;
2205
2206        return 0;
2207}
2208
2209void rt6_ifdown(struct net *net, struct net_device *dev)
2210{
2211        struct arg_dev_net adn = {
2212                .dev = dev,
2213                .net = net,
2214        };
2215
2216        fib6_clean_all(net, fib6_ifdown, 0, &adn);
2217        icmp6_clean_all(fib6_ifdown, &adn);
2218}
2219
/* Cookie handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device	*dev;	/* device whose MTU changed */
	unsigned int		mtu;	/* the new MTU */
};
2225
2226static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2227{
2228        struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2229        struct inet6_dev *idev;
2230
2231        /* In IPv6 pmtu discovery is not optional,
2232           so that RTAX_MTU lock cannot disable it.
2233           We still use this lock to block changes
2234           caused by addrconf/ndisc.
2235        */
2236
2237        idev = __in6_dev_get(arg->dev);
2238        if (!idev)
2239                return 0;
2240
2241        /* For administrative MTU increase, there is no way to discover
2242           IPv6 PMTU increase, so PMTU increase should be updated here.
2243           Since RFC 1981 doesn't include administrative MTU increase
2244           update PMTU increase is a MUST. (i.e. jumbo frame)
2245         */
2246        /*
2247           If new MTU is less than route PMTU, this new MTU will be the
2248           lowest MTU in the path, update the route PMTU to reflect PMTU
2249           decreases; if new MTU is greater than route PMTU, and the
2250           old MTU is the lowest MTU in the path, update the route PMTU
2251           to reflect the increase. In this case if the other nodes' MTU
2252           also have the lowest MTU, TOO BIG MESSAGE will be lead to
2253           PMTU discouvery.
2254         */
2255        if (rt->dst.dev == arg->dev &&
2256            !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2257            (dst_mtu(&rt->dst) >= arg->mtu ||
2258             (dst_mtu(&rt->dst) < arg->mtu &&
2259              dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2260                dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2261        }
2262        return 0;
2263}
2264
2265void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2266{
2267        struct rt6_mtu_change_arg arg = {
2268                .dev = dev,
2269                .mtu = mtu,
2270        };
2271
2272        fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2273}
2274
2275static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2276        [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2277        [RTA_OIF]               = { .type = NLA_U32 },
2278        [RTA_IIF]               = { .type = NLA_U32 },
2279        [RTA_PRIORITY]          = { .type = NLA_U32 },
2280        [RTA_METRICS]           = { .type = NLA_NESTED },
2281};
2282
2283static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2284                              struct fib6_config *cfg)
2285{
2286        struct rtmsg *rtm;
2287        struct nlattr *tb[RTA_MAX+1];
2288        int err;
2289
2290        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2291        if (err < 0)
2292                goto errout;
2293
2294        err = -EINVAL;
2295        rtm = nlmsg_data(nlh);
2296        memset(cfg, 0, sizeof(*cfg));
2297
2298        cfg->fc_table = rtm->rtm_table;
2299        cfg->fc_dst_len = rtm->rtm_dst_len;
2300        cfg->fc_src_len = rtm->rtm_src_len;
2301        cfg->fc_flags = RTF_UP;
2302        cfg->fc_protocol = rtm->rtm_protocol;
2303
2304        if (rtm->rtm_type == RTN_UNREACHABLE)
2305                cfg->fc_flags |= RTF_REJECT;
2306
2307        if (rtm->rtm_type == RTN_LOCAL)
2308                cfg->fc_flags |= RTF_LOCAL;
2309
2310        cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2311        cfg->fc_nlinfo.nlh = nlh;
2312        cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2313
2314        if (tb[RTA_GATEWAY]) {
2315                nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2316                cfg->fc_flags |= RTF_GATEWAY;
2317        }
2318
2319        if (tb[RTA_DST]) {
2320                int plen = (rtm->rtm_dst_len + 7) >> 3;
2321
2322                if (nla_len(tb[RTA_DST]) < plen)
2323                        goto errout;
2324
2325                nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2326        }
2327
2328        if (tb[RTA_SRC]) {
2329                int plen = (rtm->rtm_src_len + 7) >> 3;
2330
2331                if (nla_len(tb[RTA_SRC]) < plen)
2332                        goto errout;
2333
2334                nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2335        }
2336
2337        if (tb[RTA_PREFSRC])
2338                nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2339
2340        if (tb[RTA_OIF])
2341                cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2342
2343        if (tb[RTA_PRIORITY])
2344                cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2345
2346        if (tb[RTA_METRICS]) {
2347                cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2348                cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2349        }
2350
2351        if (tb[RTA_TABLE])
2352                cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2353
2354        err = 0;
2355errout:
2356        return err;
2357}
2358
2359static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2360{
2361        struct fib6_config cfg;
2362        int err;
2363
2364        err = rtm_to_fib6_config(skb, nlh, &cfg);
2365        if (err < 0)
2366                return err;
2367
2368        return ip6_route_del(&cfg);
2369}
2370
2371static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2372{
2373        struct fib6_config cfg;
2374        int err;
2375
2376        err = rtm_to_fib6_config(skb, nlh, &cfg);
2377        if (err < 0)
2378                return err;
2379
2380        return ip6_route_add(&cfg);
2381}
2382
2383static inline size_t rt6_nlmsg_size(void)
2384{
2385        return NLMSG_ALIGN(sizeof(struct rtmsg))
2386               + nla_total_size(16) /* RTA_SRC */
2387               + nla_total_size(16) /* RTA_DST */
2388               + nla_total_size(16) /* RTA_GATEWAY */
2389               + nla_total_size(16) /* RTA_PREFSRC */
2390               + nla_total_size(4) /* RTA_TABLE */
2391               + nla_total_size(4) /* RTA_IIF */
2392               + nla_total_size(4) /* RTA_OIF */
2393               + nla_total_size(4) /* RTA_PRIORITY */
2394               + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2395               + nla_total_size(sizeof(struct rta_cacheinfo));
2396}
2397
2398static int rt6_fill_node(struct net *net,
2399                         struct sk_buff *skb, struct rt6_info *rt,
2400                         struct in6_addr *dst, struct in6_addr *src,
2401                         int iif, int type, u32 pid, u32 seq,
2402                         int prefix, int nowait, unsigned int flags)
2403{
2404        const struct inet_peer *peer;
2405        struct rtmsg *rtm;
2406        struct nlmsghdr *nlh;
2407        long expires;
2408        u32 table;
2409        struct neighbour *n;
2410        u32 ts, tsage;
2411
2412        if (prefix) {   /* user wants prefix routes only */
2413                if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2414                        /* success since this is not a prefix route */
2415                        return 1;
2416                }
2417        }
2418
2419        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2420        if (!nlh)
2421                return -EMSGSIZE;
2422
2423        rtm = nlmsg_data(nlh);
2424        rtm->rtm_family = AF_INET6;
2425        rtm->rtm_dst_len = rt->rt6i_dst.plen;
2426        rtm->rtm_src_len = rt->rt6i_src.plen;
2427        rtm->rtm_tos = 0;
2428        if (rt->rt6i_table)
2429                table = rt->rt6i_table->tb6_id;
2430        else
2431                table = RT6_TABLE_UNSPEC;
2432        rtm->rtm_table = table;
2433        NLA_PUT_U32(skb, RTA_TABLE, table);
2434        if (rt->rt6i_flags & RTF_REJECT)
2435                rtm->rtm_type = RTN_UNREACHABLE;
2436        else if (rt->rt6i_flags & RTF_LOCAL)
2437                rtm->rtm_type = RTN_LOCAL;
2438        else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2439                rtm->rtm_type = RTN_LOCAL;
2440        else
2441                rtm->rtm_type = RTN_UNICAST;
2442        rtm->rtm_flags = 0;
2443        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2444        rtm->rtm_protocol = rt->rt6i_protocol;
2445        if (rt->rt6i_flags & RTF_DYNAMIC)
2446                rtm->rtm_protocol = RTPROT_REDIRECT;
2447        else if (rt->rt6i_flags & RTF_ADDRCONF)
2448                rtm->rtm_protocol = RTPROT_KERNEL;
2449        else if (rt->rt6i_flags & RTF_DEFAULT)
2450                rtm->rtm_protocol = RTPROT_RA;
2451
2452        if (rt->rt6i_flags & RTF_CACHE)
2453                rtm->rtm_flags |= RTM_F_CLONED;
2454
2455        if (dst) {
2456                NLA_PUT(skb, RTA_DST, 16, dst);
2457                rtm->rtm_dst_len = 128;
2458        } else if (rtm->rtm_dst_len)
2459                NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2460#ifdef CONFIG_IPV6_SUBTREES
2461        if (src) {
2462                NLA_PUT(skb, RTA_SRC, 16, src);
2463                rtm->rtm_src_len = 128;
2464        } else if (rtm->rtm_src_len)
2465                NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2466#endif
2467        if (iif) {
2468#ifdef CONFIG_IPV6_MROUTE
2469                if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2470                        int err = ip6mr_get_route(net, skb, rtm, nowait);
2471                        if (err <= 0) {
2472                                if (!nowait) {
2473                                        if (err == 0)
2474                                                return 0;
2475                                        goto nla_put_failure;
2476                                } else {
2477                                        if (err == -EMSGSIZE)
2478                                                goto nla_put_failure;
2479                                }
2480                        }
2481                } else
2482#endif
2483                        NLA_PUT_U32(skb, RTA_IIF, iif);
2484        } else if (dst) {
2485                struct in6_addr saddr_buf;
2486                if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2487                        NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2488        }
2489
2490        if (rt->rt6i_prefsrc.plen) {
2491                struct in6_addr saddr_buf;
2492                saddr_buf = rt->rt6i_prefsrc.addr;
2493                NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2494        }
2495
2496        if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2497                goto nla_put_failure;
2498
2499        rcu_read_lock();
2500        n = dst_get_neighbour_noref(&rt->dst);
2501        if (n) {
2502                if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2503                        rcu_read_unlock();
2504                        goto nla_put_failure;
2505                }
2506        }
2507        rcu_read_unlock();
2508
2509        if (rt->dst.dev)
2510                NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2511
2512        NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2513
2514        if (!(rt->rt6i_flags & RTF_EXPIRES))
2515                expires = 0;
2516        else if (rt->dst.expires - jiffies < INT_MAX)
2517                expires = rt->dst.expires - jiffies;
2518        else
2519                expires = INT_MAX;
2520
2521        peer = rt->rt6i_peer;
2522        ts = tsage = 0;
2523        if (peer && peer->tcp_ts_stamp) {
2524                ts = peer->tcp_ts;
2525                tsage = get_seconds() - peer->tcp_ts_stamp;
2526        }
2527
2528        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2529                               expires, rt->dst.error) < 0)
2530                goto nla_put_failure;
2531
2532        return nlmsg_end(skb, nlh);
2533
2534nla_put_failure:
2535        nlmsg_cancel(skb, nlh);
2536        return -EMSGSIZE;
2537}
2538
2539int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2540{
2541        struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2542        int prefix;
2543
2544        if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2545                struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2546                prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2547        } else
2548                prefix = 0;
2549
2550        return rt6_fill_node(arg->net,
2551                     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2552                     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2553                     prefix, 0, NLM_F_MULTI);
2554}
2555
2556static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2557{
2558        struct net *net = sock_net(in_skb->sk);
2559        struct nlattr *tb[RTA_MAX+1];
2560        struct rt6_info *rt;
2561        struct sk_buff *skb;
2562        struct rtmsg *rtm;
2563        struct flowi6 fl6;
2564        int err, iif = 0, oif = 0;
2565
2566        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2567        if (err < 0)
2568                goto errout;
2569
2570        err = -EINVAL;
2571        memset(&fl6, 0, sizeof(fl6));
2572
2573        if (tb[RTA_SRC]) {
2574                if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2575                        goto errout;
2576
2577                fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2578        }
2579
2580        if (tb[RTA_DST]) {
2581                if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2582                        goto errout;
2583
2584                fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2585        }
2586
2587        if (tb[RTA_IIF])
2588                iif = nla_get_u32(tb[RTA_IIF]);
2589
2590        if (tb[RTA_OIF])
2591                oif = nla_get_u32(tb[RTA_OIF]);
2592
2593        if (iif) {
2594                struct net_device *dev;
2595                int flags = 0;
2596
2597                dev = __dev_get_by_index(net, iif);
2598                if (!dev) {
2599                        err = -ENODEV;
2600                        goto errout;
2601                }
2602
2603                fl6.flowi6_iif = iif;
2604
2605                if (!ipv6_addr_any(&fl6.saddr))
2606                        flags |= RT6_LOOKUP_F_HAS_SADDR;
2607
2608                rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2609                                                               flags);
2610        } else {
2611                fl6.flowi6_oif = oif;
2612
2613                rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2614        }
2615
2616        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2617        if (!skb) {
2618                err = -ENOBUFS;
2619                goto errout;
2620        }
2621
2622        /* Reserve room for dummy headers, this skb can pass
2623           through good chunk of routing engine.
2624         */
2625        skb_reset_mac_header(skb);
2626        skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2627
2628        skb_dst_set(skb, &rt->dst);
2629
2630        err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2631                            RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2632                            nlh->nlmsg_seq, 0, 0, 0);
2633        if (err < 0) {
2634                kfree_skb(skb);
2635                goto errout;
2636        }
2637
2638        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2639errout:
2640        return err;
2641}
2642
2643void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2644{
2645        struct sk_buff *skb;
2646        struct net *net = info->nl_net;
2647        u32 seq;
2648        int err;
2649
2650        err = -ENOBUFS;
2651        seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2652
2653        skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2654        if (!skb)
2655                goto errout;
2656
2657        err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2658                                event, info->pid, seq, 0, 0, 0);
2659        if (err < 0) {
2660                /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2661                WARN_ON(err == -EMSGSIZE);
2662                kfree_skb(skb);
2663                goto errout;
2664        }
2665        rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2666                    info->nlh, gfp_any());
2667        return;
2668errout:
2669        if (err < 0)
2670                rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2671}
2672
2673static int ip6_route_dev_notify(struct notifier_block *this,
2674                                unsigned long event, void *data)
2675{
2676        struct net_device *dev = (struct net_device *)data;
2677        struct net *net = dev_net(dev);
2678
2679        if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2680                net->ipv6.ip6_null_entry->dst.dev = dev;
2681                net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2682#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2683                net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2684                net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2685                net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2686                net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2687#endif
2688        }
2689
2690        return NOTIFY_OK;
2691}
2692
2693/*
2694 *      /proc
2695 */
2696
2697#ifdef CONFIG_PROC_FS
2698
/*
 * Bookkeeping for a buffer-based reader.
 *
 * NOTE(review): nothing in the visible part of this file references this
 * structure (the /proc handlers below use seq_file); presumably a leftover
 * from an older read interface -- confirm before relying on it.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2707
2708static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2709{
2710        struct seq_file *m = p_arg;
2711        struct neighbour *n;
2712
2713        seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2714
2715#ifdef CONFIG_IPV6_SUBTREES
2716        seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2717#else
2718        seq_puts(m, "00000000000000000000000000000000 00 ");
2719#endif
2720        rcu_read_lock();
2721        n = dst_get_neighbour_noref(&rt->dst);
2722        if (n) {
2723                seq_printf(m, "%pi6", n->primary_key);
2724        } else {
2725                seq_puts(m, "00000000000000000000000000000000");
2726        }
2727        rcu_read_unlock();
2728        seq_printf(m, " %08x %08x %08x %08x %8s\n",
2729                   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2730                   rt->dst.__use, rt->rt6i_flags,
2731                   rt->dst.dev ? rt->dst.dev->name : "");
2732        return 0;
2733}
2734
2735static int ipv6_route_show(struct seq_file *m, void *v)
2736{
2737        struct net *net = (struct net *)m->private;
2738        fib6_clean_all_ro(net, rt6_info_route, 0, m);
2739        return 0;
2740}
2741
2742static int ipv6_route_open(struct inode *inode, struct file *file)
2743{
2744        return single_open_net(inode, file, ipv6_route_show);
2745}
2746
2747static const struct file_operations ipv6_route_proc_fops = {
2748        .owner          = THIS_MODULE,
2749        .open           = ipv6_route_open,
2750        .read           = seq_read,
2751        .llseek         = seq_lseek,
2752        .release        = single_release_net,
2753};
2754
2755static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2756{
2757        struct net *net = (struct net *)seq->private;
2758        seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2759                   net->ipv6.rt6_stats->fib_nodes,
2760                   net->ipv6.rt6_stats->fib_route_nodes,
2761                   net->ipv6.rt6_stats->fib_rt_alloc,
2762                   net->ipv6.rt6_stats->fib_rt_entries,
2763                   net->ipv6.rt6_stats->fib_rt_cache,
2764                   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2765                   net->ipv6.rt6_stats->fib_discarded_routes);
2766
2767        return 0;
2768}
2769
2770static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2771{
2772        return single_open_net(inode, file, rt6_stats_seq_show);
2773}
2774
2775static const struct file_operations rt6_stats_seq_fops = {
2776        .owner   = THIS_MODULE,
2777        .open    = rt6_stats_seq_open,
2778        .read    = seq_read,
2779        .llseek  = seq_lseek,
2780        .release = single_release_net,
2781};
2782#endif  /* CONFIG_PROC_FS */
2783
2784#ifdef CONFIG_SYSCTL
2785
2786static
2787int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2788                              void __user *buffer, size_t *lenp, loff_t *ppos)
2789{
2790        struct net *net;
2791        int delay;
2792        if (!write)
2793                return -EINVAL;
2794
2795        net = (struct net *)ctl->extra1;
2796        delay = net->ipv6.sysctl.flush_delay;
2797        proc_dointvec(ctl, write, buffer, lenp, ppos);
2798        fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2799        return 0;
2800}
2801
2802ctl_table ipv6_route_table_template[] = {
2803        {
2804                .procname       =       "flush",
2805                .data           =       &init_net.ipv6.sysctl.flush_delay,
2806                .maxlen         =       sizeof(int),
2807                .mode           =       0200,
2808                .proc_handler   =       ipv6_sysctl_rtcache_flush
2809        },
2810        {
2811                .procname       =       "gc_thresh",
2812                .data           =       &ip6_dst_ops_template.gc_thresh,
2813                .maxlen         =       sizeof(int),
2814                .mode           =       0644,
2815                .proc_handler   =       proc_dointvec,
2816        },
2817        {
2818                .procname       =       "max_size",
2819                .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2820                .maxlen         =       sizeof(int),
2821                .mode           =       0644,
2822                .proc_handler   =       proc_dointvec,
2823        },
2824        {
2825                .procname       =       "gc_min_interval",
2826                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2827                .maxlen         =       sizeof(int),
2828                .mode           =       0644,
2829                .proc_handler   =       proc_dointvec_jiffies,
2830        },
2831        {
2832                .procname       =       "gc_timeout",
2833                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2834                .maxlen         =       sizeof(int),
2835                .mode           =       0644,
2836                .proc_handler   =       proc_dointvec_jiffies,
2837        },
2838        {
2839                .procname       =       "gc_interval",
2840                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2841                .maxlen         =       sizeof(int),
2842                .mode           =       0644,
2843                .proc_handler   =       proc_dointvec_jiffies,
2844        },
2845        {
2846                .procname       =       "gc_elasticity",
2847                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2848                .maxlen         =       sizeof(int),
2849                .mode           =       0644,
2850                .proc_handler   =       proc_dointvec,
2851        },
2852        {
2853                .procname       =       "mtu_expires",
2854                .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2855                .maxlen         =       sizeof(int),
2856                .mode           =       0644,
2857                .proc_handler   =       proc_dointvec_jiffies,
2858        },
2859        {
2860                .procname       =       "min_adv_mss",
2861                .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2862                .maxlen         =       sizeof(int),
2863                .mode           =       0644,
2864                .proc_handler   =       proc_dointvec,
2865        },
2866        {
2867                .procname       =       "gc_min_interval_ms",
2868                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2869                .maxlen         =       sizeof(int),
2870                .mode           =       0644,
2871                .proc_handler   =       proc_dointvec_ms_jiffies,
2872        },
2873        { }
2874};
2875
2876struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2877{
2878        struct ctl_table *table;
2879
2880        table = kmemdup(ipv6_route_table_template,
2881                        sizeof(ipv6_route_table_template),
2882                        GFP_KERNEL);
2883
2884        if (table) {
2885                table[0].data = &net->ipv6.sysctl.flush_delay;
2886                table[0].extra1 = net;
2887                table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2888                table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2889                table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2890                table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2891                table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2892                table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2893                table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2894                table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2895                table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2896        }
2897
2898        return table;
2899}
2900#endif
2901
2902static int __net_init ip6_route_net_init(struct net *net)
2903{
2904        int ret = -ENOMEM;
2905
2906        memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2907               sizeof(net->ipv6.ip6_dst_ops));
2908
2909        if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2910                goto out_ip6_dst_ops;
2911
2912        net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2913                                           sizeof(*net->ipv6.ip6_null_entry),
2914                                           GFP_KERNEL);
2915        if (!net->ipv6.ip6_null_entry)
2916                goto out_ip6_dst_entries;
2917        net->ipv6.ip6_null_entry->dst.path =
2918                (struct dst_entry *)net->ipv6.ip6_null_entry;
2919        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2920        dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2921                         ip6_template_metrics, true);
2922
2923#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2924        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2925                                               sizeof(*net->ipv6.ip6_prohibit_entry),
2926                                               GFP_KERNEL);
2927        if (!net->ipv6.ip6_prohibit_entry)
2928                goto out_ip6_null_entry;
2929        net->ipv6.ip6_prohibit_entry->dst.path =
2930                (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2931        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2932        dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2933                         ip6_template_metrics, true);
2934
2935        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2936                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
2937                                               GFP_KERNEL);
2938        if (!net->ipv6.ip6_blk_hole_entry)
2939                goto out_ip6_prohibit_entry;
2940        net->ipv6.ip6_blk_hole_entry->dst.path =
2941                (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2942        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2943        dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2944                         ip6_template_metrics, true);
2945#endif
2946
2947        net->ipv6.sysctl.flush_delay = 0;
2948        net->ipv6.sysctl.ip6_rt_max_size = 4096;
2949        net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2950        net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2951        net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2952        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2953        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2954        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2955
2956#ifdef CONFIG_PROC_FS
2957        proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2958        proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2959#endif
2960        net->ipv6.ip6_rt_gc_expire = 30*HZ;
2961
2962        ret = 0;
2963out:
2964        return ret;
2965
2966#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2967out_ip6_prohibit_entry:
2968        kfree(net->ipv6.ip6_prohibit_entry);
2969out_ip6_null_entry:
2970        kfree(net->ipv6.ip6_null_entry);
2971#endif
2972out_ip6_dst_entries:
2973        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2974out_ip6_dst_ops:
2975        goto out;
2976}
2977
2978static void __net_exit ip6_route_net_exit(struct net *net)
2979{
2980#ifdef CONFIG_PROC_FS
2981        proc_net_remove(net, "ipv6_route");
2982        proc_net_remove(net, "rt6_stats");
2983#endif
2984        kfree(net->ipv6.ip6_null_entry);
2985#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2986        kfree(net->ipv6.ip6_prohibit_entry);
2987        kfree(net->ipv6.ip6_blk_hole_entry);
2988#endif
2989        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2990}
2991
2992static struct pernet_operations ip6_route_net_ops = {
2993        .init = ip6_route_net_init,
2994        .exit = ip6_route_net_exit,
2995};
2996
2997static struct notifier_block ip6_route_dev_notifier = {
2998        .notifier_call = ip6_route_dev_notify,
2999        .priority = 0,
3000};
3001
3002int __init ip6_route_init(void)
3003{
3004        int ret;
3005
3006        ret = -ENOMEM;
3007        ip6_dst_ops_template.kmem_cachep =
3008                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3009                                  SLAB_HWCACHE_ALIGN, NULL);
3010        if (!ip6_dst_ops_template.kmem_cachep)
3011                goto out;
3012
3013        ret = dst_entries_init(&ip6_dst_blackhole_ops);
3014        if (ret)
3015                goto out_kmem_cache;
3016
3017        ret = register_pernet_subsys(&ip6_route_net_ops);
3018        if (ret)
3019                goto out_dst_entries;
3020
3021        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3022
3023        /* Registering of the loopback is done before this portion of code,
3024         * the loopback reference in rt6_info will not be taken, do it
3025         * manually for init_net */
3026        init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3027        init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3028  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3029        init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3030        init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3031        init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3032        init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3033  #endif
3034        ret = fib6_init();
3035        if (ret)
3036                goto out_register_subsys;
3037
3038        ret = xfrm6_init();
3039        if (ret)
3040                goto out_fib6_init;
3041
3042        ret = fib6_rules_init();
3043        if (ret)
3044                goto xfrm6_init;
3045
3046        ret = -ENOBUFS;
3047        if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3048            __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3049            __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3050                goto fib6_rules_init;
3051
3052        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3053        if (ret)
3054                goto fib6_rules_init;
3055
3056out:
3057        return ret;
3058
3059fib6_rules_init:
3060        fib6_rules_cleanup();
3061xfrm6_init:
3062        xfrm6_fini();
3063out_fib6_init:
3064        fib6_gc_cleanup();
3065out_register_subsys:
3066        unregister_pernet_subsys(&ip6_route_net_ops);
3067out_dst_entries:
3068        dst_entries_destroy(&ip6_dst_blackhole_ops);
3069out_kmem_cache:
3070        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3071        goto out;
3072}
3073
3074void ip6_route_cleanup(void)
3075{
3076        unregister_netdevice_notifier(&ip6_route_dev_notifier);
3077        fib6_rules_cleanup();
3078        xfrm6_fini();
3079        fib6_gc_cleanup();
3080        unregister_pernet_subsys(&ip6_route_net_ops);
3081        dst_entries_destroy(&ip6_dst_blackhole_ops);
3082        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3083}
3084