linux/net/ipv6/route.c
<<
>>
Prefs
   1/*
   2 *      Linux INET6 implementation
   3 *      FIB front-end.
   4 *
   5 *      Authors:
   6 *      Pedro Roque             <roque@di.fc.ul.pt>
   7 *
   8 *      This program is free software; you can redistribute it and/or
   9 *      modify it under the terms of the GNU General Public License
  10 *      as published by the Free Software Foundation; either version
  11 *      2 of the License, or (at your option) any later version.
  12 */
  13
  14/*      Changes:
  15 *
  16 *      YOSHIFUJI Hideaki @USAGI
  17 *              reworked default router selection.
  18 *              - respect outgoing interface
  19 *              - select from (probably) reachable routers (i.e.
  20 *              routers in REACHABLE, STALE, DELAY or PROBE states).
  21 *              - always select the same router if it is (probably)
  22 *              reachable.  otherwise, round-robin the list.
  23 *      Ville Nuorvala
  24 *              Fixed routing subtrees.
  25 */
  26
  27#include <linux/capability.h>
  28#include <linux/errno.h>
  29#include <linux/types.h>
  30#include <linux/times.h>
  31#include <linux/socket.h>
  32#include <linux/sockios.h>
  33#include <linux/net.h>
  34#include <linux/route.h>
  35#include <linux/netdevice.h>
  36#include <linux/in6.h>
  37#include <linux/mroute6.h>
  38#include <linux/init.h>
  39#include <linux/if_arp.h>
  40#include <linux/proc_fs.h>
  41#include <linux/seq_file.h>
  42#include <linux/nsproxy.h>
  43#include <linux/slab.h>
  44#include <net/net_namespace.h>
  45#include <net/snmp.h>
  46#include <net/ipv6.h>
  47#include <net/ip6_fib.h>
  48#include <net/ip6_route.h>
  49#include <net/ndisc.h>
  50#include <net/addrconf.h>
  51#include <net/tcp.h>
  52#include <linux/rtnetlink.h>
  53#include <net/dst.h>
  54#include <net/xfrm.h>
  55#include <net/netevent.h>
  56#include <net/netlink.h>
  57
  58#include <asm/uaccess.h>
  59
  60#ifdef CONFIG_SYSCTL
  61#include <linux/sysctl.h>
  62#endif
  63
  64/* Set to 3 to get tracing. */
  65#define RT6_DEBUG 2
  66
  67#if RT6_DEBUG >= 3
  68#define RDBG(x) printk x
  69#define RT6_TRACE(x...) printk(KERN_DEBUG x)
  70#else
  71#define RDBG(x)
  72#define RT6_TRACE(x...) do { ; } while (0)
  73#endif
  74
  75static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
  76static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
  77static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
  78static unsigned int      ip6_default_mtu(const struct dst_entry *dst);
  79static struct dst_entry *ip6_negative_advice(struct dst_entry *);
  80static void             ip6_dst_destroy(struct dst_entry *);
  81static void             ip6_dst_ifdown(struct dst_entry *,
  82                                       struct net_device *dev, int how);
  83static int               ip6_dst_gc(struct dst_ops *ops);
  84
  85static int              ip6_pkt_discard(struct sk_buff *skb);
  86static int              ip6_pkt_discard_out(struct sk_buff *skb);
  87static void             ip6_link_failure(struct sk_buff *skb);
  88static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
  89
  90#ifdef CONFIG_IPV6_ROUTE_INFO
  91static struct rt6_info *rt6_add_route_info(struct net *net,
  92                                           struct in6_addr *prefix, int prefixlen,
  93                                           struct in6_addr *gwaddr, int ifindex,
  94                                           unsigned pref);
  95static struct rt6_info *rt6_get_route_info(struct net *net,
  96                                           struct in6_addr *prefix, int prefixlen,
  97                                           struct in6_addr *gwaddr, int ifindex);
  98#endif
  99
 100static struct dst_ops ip6_dst_ops_template = {
 101        .family                 =       AF_INET6,
 102        .protocol               =       cpu_to_be16(ETH_P_IPV6),
 103        .gc                     =       ip6_dst_gc,
 104        .gc_thresh              =       1024,
 105        .check                  =       ip6_dst_check,
 106        .default_advmss         =       ip6_default_advmss,
 107        .default_mtu            =       ip6_default_mtu,
 108        .destroy                =       ip6_dst_destroy,
 109        .ifdown                 =       ip6_dst_ifdown,
 110        .negative_advice        =       ip6_negative_advice,
 111        .link_failure           =       ip6_link_failure,
 112        .update_pmtu            =       ip6_rt_update_pmtu,
 113        .local_out              =       __ip6_local_out,
 114};
 115
 116static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
 117{
 118        return 0;
 119}
 120
 121static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
 122{
 123}
 124
 125static struct dst_ops ip6_dst_blackhole_ops = {
 126        .family                 =       AF_INET6,
 127        .protocol               =       cpu_to_be16(ETH_P_IPV6),
 128        .destroy                =       ip6_dst_destroy,
 129        .check                  =       ip6_dst_check,
 130        .default_mtu            =       ip6_blackhole_default_mtu,
 131        .default_advmss         =       ip6_default_advmss,
 132        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
 133};
 134
 135static struct rt6_info ip6_null_entry_template = {
 136        .dst = {
 137                .__refcnt       = ATOMIC_INIT(1),
 138                .__use          = 1,
 139                .obsolete       = -1,
 140                .error          = -ENETUNREACH,
 141                .input          = ip6_pkt_discard,
 142                .output         = ip6_pkt_discard_out,
 143        },
 144        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 145        .rt6i_protocol  = RTPROT_KERNEL,
 146        .rt6i_metric    = ~(u32) 0,
 147        .rt6i_ref       = ATOMIC_INIT(1),
 148};
 149
 150#ifdef CONFIG_IPV6_MULTIPLE_TABLES
 151
 152static int ip6_pkt_prohibit(struct sk_buff *skb);
 153static int ip6_pkt_prohibit_out(struct sk_buff *skb);
 154
 155static struct rt6_info ip6_prohibit_entry_template = {
 156        .dst = {
 157                .__refcnt       = ATOMIC_INIT(1),
 158                .__use          = 1,
 159                .obsolete       = -1,
 160                .error          = -EACCES,
 161                .input          = ip6_pkt_prohibit,
 162                .output         = ip6_pkt_prohibit_out,
 163        },
 164        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 165        .rt6i_protocol  = RTPROT_KERNEL,
 166        .rt6i_metric    = ~(u32) 0,
 167        .rt6i_ref       = ATOMIC_INIT(1),
 168};
 169
 170static struct rt6_info ip6_blk_hole_entry_template = {
 171        .dst = {
 172                .__refcnt       = ATOMIC_INIT(1),
 173                .__use          = 1,
 174                .obsolete       = -1,
 175                .error          = -EINVAL,
 176                .input          = dst_discard,
 177                .output         = dst_discard,
 178        },
 179        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 180        .rt6i_protocol  = RTPROT_KERNEL,
 181        .rt6i_metric    = ~(u32) 0,
 182        .rt6i_ref       = ATOMIC_INIT(1),
 183};
 184
 185#endif
 186
 187/* allocate dst with ip6_dst_ops */
 188static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
 189{
 190        return (struct rt6_info *)dst_alloc(ops);
 191}
 192
 193static void ip6_dst_destroy(struct dst_entry *dst)
 194{
 195        struct rt6_info *rt = (struct rt6_info *)dst;
 196        struct inet6_dev *idev = rt->rt6i_idev;
 197        struct inet_peer *peer = rt->rt6i_peer;
 198
 199        if (idev != NULL) {
 200                rt->rt6i_idev = NULL;
 201                in6_dev_put(idev);
 202        }
 203        if (peer) {
 204                rt->rt6i_peer = NULL;
 205                inet_putpeer(peer);
 206        }
 207}
 208
 209void rt6_bind_peer(struct rt6_info *rt, int create)
 210{
 211        struct inet_peer *peer;
 212
 213        peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
 214        if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
 215                inet_putpeer(peer);
 216}
 217
 218static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 219                           int how)
 220{
 221        struct rt6_info *rt = (struct rt6_info *)dst;
 222        struct inet6_dev *idev = rt->rt6i_idev;
 223        struct net_device *loopback_dev =
 224                dev_net(dev)->loopback_dev;
 225
 226        if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
 227                struct inet6_dev *loopback_idev =
 228                        in6_dev_get(loopback_dev);
 229                if (loopback_idev != NULL) {
 230                        rt->rt6i_idev = loopback_idev;
 231                        in6_dev_put(idev);
 232                }
 233        }
 234}
 235
 236static __inline__ int rt6_check_expired(const struct rt6_info *rt)
 237{
 238        return (rt->rt6i_flags & RTF_EXPIRES) &&
 239                time_after(jiffies, rt->rt6i_expires);
 240}
 241
 242static inline int rt6_need_strict(struct in6_addr *daddr)
 243{
 244        return ipv6_addr_type(daddr) &
 245                (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 246}
 247
 248/*
 249 *      Route lookup. Any table->tb6_lock is implied.
 250 */
 251
 252static inline struct rt6_info *rt6_device_match(struct net *net,
 253                                                    struct rt6_info *rt,
 254                                                    struct in6_addr *saddr,
 255                                                    int oif,
 256                                                    int flags)
 257{
 258        struct rt6_info *local = NULL;
 259        struct rt6_info *sprt;
 260
 261        if (!oif && ipv6_addr_any(saddr))
 262                goto out;
 263
 264        for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
 265                struct net_device *dev = sprt->rt6i_dev;
 266
 267                if (oif) {
 268                        if (dev->ifindex == oif)
 269                                return sprt;
 270                        if (dev->flags & IFF_LOOPBACK) {
 271                                if (sprt->rt6i_idev == NULL ||
 272                                    sprt->rt6i_idev->dev->ifindex != oif) {
 273                                        if (flags & RT6_LOOKUP_F_IFACE && oif)
 274                                                continue;
 275                                        if (local && (!oif ||
 276                                                      local->rt6i_idev->dev->ifindex == oif))
 277                                                continue;
 278                                }
 279                                local = sprt;
 280                        }
 281                } else {
 282                        if (ipv6_chk_addr(net, saddr, dev,
 283                                          flags & RT6_LOOKUP_F_IFACE))
 284                                return sprt;
 285                }
 286        }
 287
 288        if (oif) {
 289                if (local)
 290                        return local;
 291
 292                if (flags & RT6_LOOKUP_F_IFACE)
 293                        return net->ipv6.ip6_null_entry;
 294        }
 295out:
 296        return rt;
 297}
 298
 299#ifdef CONFIG_IPV6_ROUTER_PREF
 300static void rt6_probe(struct rt6_info *rt)
 301{
 302        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
 303        /*
 304         * Okay, this does not seem to be appropriate
 305         * for now, however, we need to check if it
 306         * is really so; aka Router Reachability Probing.
 307         *
 308         * Router Reachability Probe MUST be rate-limited
 309         * to no more than one per minute.
 310         */
 311        if (!neigh || (neigh->nud_state & NUD_VALID))
 312                return;
 313        read_lock_bh(&neigh->lock);
 314        if (!(neigh->nud_state & NUD_VALID) &&
 315            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
 316                struct in6_addr mcaddr;
 317                struct in6_addr *target;
 318
 319                neigh->updated = jiffies;
 320                read_unlock_bh(&neigh->lock);
 321
 322                target = (struct in6_addr *)&neigh->primary_key;
 323                addrconf_addr_solict_mult(target, &mcaddr);
 324                ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
 325        } else
 326                read_unlock_bh(&neigh->lock);
 327}
 328#else
 329static inline void rt6_probe(struct rt6_info *rt)
 330{
 331}
 332#endif
 333
 334/*
 335 * Default Router Selection (RFC 2461 6.3.6)
 336 */
 337static inline int rt6_check_dev(struct rt6_info *rt, int oif)
 338{
 339        struct net_device *dev = rt->rt6i_dev;
 340        if (!oif || dev->ifindex == oif)
 341                return 2;
 342        if ((dev->flags & IFF_LOOPBACK) &&
 343            rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
 344                return 1;
 345        return 0;
 346}
 347
 348static inline int rt6_check_neigh(struct rt6_info *rt)
 349{
 350        struct neighbour *neigh = rt->rt6i_nexthop;
 351        int m;
 352        if (rt->rt6i_flags & RTF_NONEXTHOP ||
 353            !(rt->rt6i_flags & RTF_GATEWAY))
 354                m = 1;
 355        else if (neigh) {
 356                read_lock_bh(&neigh->lock);
 357                if (neigh->nud_state & NUD_VALID)
 358                        m = 2;
 359#ifdef CONFIG_IPV6_ROUTER_PREF
 360                else if (neigh->nud_state & NUD_FAILED)
 361                        m = 0;
 362#endif
 363                else
 364                        m = 1;
 365                read_unlock_bh(&neigh->lock);
 366        } else
 367                m = 0;
 368        return m;
 369}
 370
 371static int rt6_score_route(struct rt6_info *rt, int oif,
 372                           int strict)
 373{
 374        int m, n;
 375
 376        m = rt6_check_dev(rt, oif);
 377        if (!m && (strict & RT6_LOOKUP_F_IFACE))
 378                return -1;
 379#ifdef CONFIG_IPV6_ROUTER_PREF
 380        m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
 381#endif
 382        n = rt6_check_neigh(rt);
 383        if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
 384                return -1;
 385        return m;
 386}
 387
 388static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 389                                   int *mpri, struct rt6_info *match)
 390{
 391        int m;
 392
 393        if (rt6_check_expired(rt))
 394                goto out;
 395
 396        m = rt6_score_route(rt, oif, strict);
 397        if (m < 0)
 398                goto out;
 399
 400        if (m > *mpri) {
 401                if (strict & RT6_LOOKUP_F_REACHABLE)
 402                        rt6_probe(match);
 403                *mpri = m;
 404                match = rt;
 405        } else if (strict & RT6_LOOKUP_F_REACHABLE) {
 406                rt6_probe(rt);
 407        }
 408
 409out:
 410        return match;
 411}
 412
 413static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 414                                     struct rt6_info *rr_head,
 415                                     u32 metric, int oif, int strict)
 416{
 417        struct rt6_info *rt, *match;
 418        int mpri = -1;
 419
 420        match = NULL;
 421        for (rt = rr_head; rt && rt->rt6i_metric == metric;
 422             rt = rt->dst.rt6_next)
 423                match = find_match(rt, oif, strict, &mpri, match);
 424        for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
 425             rt = rt->dst.rt6_next)
 426                match = find_match(rt, oif, strict, &mpri, match);
 427
 428        return match;
 429}
 430
 431static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
 432{
 433        struct rt6_info *match, *rt0;
 434        struct net *net;
 435
 436        RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
 437                  __func__, fn->leaf, oif);
 438
 439        rt0 = fn->rr_ptr;
 440        if (!rt0)
 441                fn->rr_ptr = rt0 = fn->leaf;
 442
 443        match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
 444
 445        if (!match &&
 446            (strict & RT6_LOOKUP_F_REACHABLE)) {
 447                struct rt6_info *next = rt0->dst.rt6_next;
 448
 449                /* no entries matched; do round-robin */
 450                if (!next || next->rt6i_metric != rt0->rt6i_metric)
 451                        next = fn->leaf;
 452
 453                if (next != rt0)
 454                        fn->rr_ptr = next;
 455        }
 456
 457        RT6_TRACE("%s() => %p\n",
 458                  __func__, match);
 459
 460        net = dev_net(rt0->rt6i_dev);
 461        return match ? match : net->ipv6.ip6_null_entry;
 462}
 463
 464#ifdef CONFIG_IPV6_ROUTE_INFO
 465int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 466                  struct in6_addr *gwaddr)
 467{
 468        struct net *net = dev_net(dev);
 469        struct route_info *rinfo = (struct route_info *) opt;
 470        struct in6_addr prefix_buf, *prefix;
 471        unsigned int pref;
 472        unsigned long lifetime;
 473        struct rt6_info *rt;
 474
 475        if (len < sizeof(struct route_info)) {
 476                return -EINVAL;
 477        }
 478
 479        /* Sanity check for prefix_len and length */
 480        if (rinfo->length > 3) {
 481                return -EINVAL;
 482        } else if (rinfo->prefix_len > 128) {
 483                return -EINVAL;
 484        } else if (rinfo->prefix_len > 64) {
 485                if (rinfo->length < 2) {
 486                        return -EINVAL;
 487                }
 488        } else if (rinfo->prefix_len > 0) {
 489                if (rinfo->length < 1) {
 490                        return -EINVAL;
 491                }
 492        }
 493
 494        pref = rinfo->route_pref;
 495        if (pref == ICMPV6_ROUTER_PREF_INVALID)
 496                return -EINVAL;
 497
 498        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
 499
 500        if (rinfo->length == 3)
 501                prefix = (struct in6_addr *)rinfo->prefix;
 502        else {
 503                /* this function is safe */
 504                ipv6_addr_prefix(&prefix_buf,
 505                                 (struct in6_addr *)rinfo->prefix,
 506                                 rinfo->prefix_len);
 507                prefix = &prefix_buf;
 508        }
 509
 510        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
 511                                dev->ifindex);
 512
 513        if (rt && !lifetime) {
 514                ip6_del_rt(rt);
 515                rt = NULL;
 516        }
 517
 518        if (!rt && lifetime)
 519                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
 520                                        pref);
 521        else if (rt)
 522                rt->rt6i_flags = RTF_ROUTEINFO |
 523                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 524
 525        if (rt) {
 526                if (!addrconf_finite_timeout(lifetime)) {
 527                        rt->rt6i_flags &= ~RTF_EXPIRES;
 528                } else {
 529                        rt->rt6i_expires = jiffies + HZ * lifetime;
 530                        rt->rt6i_flags |= RTF_EXPIRES;
 531                }
 532                dst_release(&rt->dst);
 533        }
 534        return 0;
 535}
 536#endif
 537
 538#define BACKTRACK(__net, saddr)                 \
 539do { \
 540        if (rt == __net->ipv6.ip6_null_entry) { \
 541                struct fib6_node *pn; \
 542                while (1) { \
 543                        if (fn->fn_flags & RTN_TL_ROOT) \
 544                                goto out; \
 545                        pn = fn->parent; \
 546                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
 547                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
 548                        else \
 549                                fn = pn; \
 550                        if (fn->fn_flags & RTN_RTINFO) \
 551                                goto restart; \
 552                } \
 553        } \
 554} while(0)
 555
 556static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 557                                             struct fib6_table *table,
 558                                             struct flowi *fl, int flags)
 559{
 560        struct fib6_node *fn;
 561        struct rt6_info *rt;
 562
 563        read_lock_bh(&table->tb6_lock);
 564        fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
 565restart:
 566        rt = fn->leaf;
 567        rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
 568        BACKTRACK(net, &fl->fl6_src);
 569out:
 570        dst_use(&rt->dst, jiffies);
 571        read_unlock_bh(&table->tb6_lock);
 572        return rt;
 573
 574}
 575
 576struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
 577                            const struct in6_addr *saddr, int oif, int strict)
 578{
 579        struct flowi fl = {
 580                .oif = oif,
 581                .fl6_dst = *daddr,
 582        };
 583        struct dst_entry *dst;
 584        int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
 585
 586        if (saddr) {
 587                memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
 588                flags |= RT6_LOOKUP_F_HAS_SADDR;
 589        }
 590
 591        dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
 592        if (dst->error == 0)
 593                return (struct rt6_info *) dst;
 594
 595        dst_release(dst);
 596
 597        return NULL;
 598}
 599
 600EXPORT_SYMBOL(rt6_lookup);
 601
 602/* ip6_ins_rt is called with FREE table->tb6_lock.
 603   It takes new route entry, the addition fails by any reason the
 604   route is freed. In any case, if caller does not hold it, it may
 605   be destroyed.
 606 */
 607
 608static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
 609{
 610        int err;
 611        struct fib6_table *table;
 612
 613        table = rt->rt6i_table;
 614        write_lock_bh(&table->tb6_lock);
 615        err = fib6_add(&table->tb6_root, rt, info);
 616        write_unlock_bh(&table->tb6_lock);
 617
 618        return err;
 619}
 620
 621int ip6_ins_rt(struct rt6_info *rt)
 622{
 623        struct nl_info info = {
 624                .nl_net = dev_net(rt->rt6i_dev),
 625        };
 626        return __ip6_ins_rt(rt, &info);
 627}
 628
 629static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
 630                                      struct in6_addr *saddr)
 631{
 632        struct rt6_info *rt;
 633
 634        /*
 635         *      Clone the route.
 636         */
 637
 638        rt = ip6_rt_copy(ort);
 639
 640        if (rt) {
 641                struct neighbour *neigh;
 642                int attempts = !in_softirq();
 643
 644                if (!(rt->rt6i_flags&RTF_GATEWAY)) {
 645                        if (rt->rt6i_dst.plen != 128 &&
 646                            ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
 647                                rt->rt6i_flags |= RTF_ANYCAST;
 648                        ipv6_addr_copy(&rt->rt6i_gateway, daddr);
 649                }
 650
 651                ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
 652                rt->rt6i_dst.plen = 128;
 653                rt->rt6i_flags |= RTF_CACHE;
 654                rt->dst.flags |= DST_HOST;
 655
 656#ifdef CONFIG_IPV6_SUBTREES
 657                if (rt->rt6i_src.plen && saddr) {
 658                        ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
 659                        rt->rt6i_src.plen = 128;
 660                }
 661#endif
 662
 663        retry:
 664                neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
 665                if (IS_ERR(neigh)) {
 666                        struct net *net = dev_net(rt->rt6i_dev);
 667                        int saved_rt_min_interval =
 668                                net->ipv6.sysctl.ip6_rt_gc_min_interval;
 669                        int saved_rt_elasticity =
 670                                net->ipv6.sysctl.ip6_rt_gc_elasticity;
 671
 672                        if (attempts-- > 0) {
 673                                net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
 674                                net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
 675
 676                                ip6_dst_gc(&net->ipv6.ip6_dst_ops);
 677
 678                                net->ipv6.sysctl.ip6_rt_gc_elasticity =
 679                                        saved_rt_elasticity;
 680                                net->ipv6.sysctl.ip6_rt_gc_min_interval =
 681                                        saved_rt_min_interval;
 682                                goto retry;
 683                        }
 684
 685                        if (net_ratelimit())
 686                                printk(KERN_WARNING
 687                                       "ipv6: Neighbour table overflow.\n");
 688                        dst_free(&rt->dst);
 689                        return NULL;
 690                }
 691                rt->rt6i_nexthop = neigh;
 692
 693        }
 694
 695        return rt;
 696}
 697
 698static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
 699{
 700        struct rt6_info *rt = ip6_rt_copy(ort);
 701        if (rt) {
 702                ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
 703                rt->rt6i_dst.plen = 128;
 704                rt->rt6i_flags |= RTF_CACHE;
 705                rt->dst.flags |= DST_HOST;
 706                rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
 707        }
 708        return rt;
 709}
 710
 711static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
 712                                      struct flowi *fl, int flags)
 713{
 714        struct fib6_node *fn;
 715        struct rt6_info *rt, *nrt;
 716        int strict = 0;
 717        int attempts = 3;
 718        int err;
 719        int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
 720
 721        strict |= flags & RT6_LOOKUP_F_IFACE;
 722
 723relookup:
 724        read_lock_bh(&table->tb6_lock);
 725
 726restart_2:
 727        fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
 728
 729restart:
 730        rt = rt6_select(fn, oif, strict | reachable);
 731
 732        BACKTRACK(net, &fl->fl6_src);
 733        if (rt == net->ipv6.ip6_null_entry ||
 734            rt->rt6i_flags & RTF_CACHE)
 735                goto out;
 736
 737        dst_hold(&rt->dst);
 738        read_unlock_bh(&table->tb6_lock);
 739
 740        if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
 741                nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
 742        else if (!(rt->dst.flags & DST_HOST))
 743                nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
 744        else
 745                goto out2;
 746
 747        dst_release(&rt->dst);
 748        rt = nrt ? : net->ipv6.ip6_null_entry;
 749
 750        dst_hold(&rt->dst);
 751        if (nrt) {
 752                err = ip6_ins_rt(nrt);
 753                if (!err)
 754                        goto out2;
 755        }
 756
 757        if (--attempts <= 0)
 758                goto out2;
 759
 760        /*
 761         * Race condition! In the gap, when table->tb6_lock was
 762         * released someone could insert this route.  Relookup.
 763         */
 764        dst_release(&rt->dst);
 765        goto relookup;
 766
 767out:
 768        if (reachable) {
 769                reachable = 0;
 770                goto restart_2;
 771        }
 772        dst_hold(&rt->dst);
 773        read_unlock_bh(&table->tb6_lock);
 774out2:
 775        rt->dst.lastuse = jiffies;
 776        rt->dst.__use++;
 777
 778        return rt;
 779}
 780
 781static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
 782                                            struct flowi *fl, int flags)
 783{
 784        return ip6_pol_route(net, table, fl->iif, fl, flags);
 785}
 786
 787void ip6_route_input(struct sk_buff *skb)
 788{
 789        struct ipv6hdr *iph = ipv6_hdr(skb);
 790        struct net *net = dev_net(skb->dev);
 791        int flags = RT6_LOOKUP_F_HAS_SADDR;
 792        struct flowi fl = {
 793                .iif = skb->dev->ifindex,
 794                .fl6_dst = iph->daddr,
 795                .fl6_src = iph->saddr,
 796                .fl6_flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
 797                .mark = skb->mark,
 798                .proto = iph->nexthdr,
 799        };
 800
 801        if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
 802                flags |= RT6_LOOKUP_F_IFACE;
 803
 804        skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
 805}
 806
 807static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
 808                                             struct flowi *fl, int flags)
 809{
 810        return ip6_pol_route(net, table, fl->oif, fl, flags);
 811}
 812
 813struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
 814                                    struct flowi *fl)
 815{
 816        int flags = 0;
 817
 818        if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
 819                flags |= RT6_LOOKUP_F_IFACE;
 820
 821        if (!ipv6_addr_any(&fl->fl6_src))
 822                flags |= RT6_LOOKUP_F_HAS_SADDR;
 823        else if (sk)
 824                flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
 825
 826        return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
 827}
 828
 829EXPORT_SYMBOL(ip6_route_output);
 830
 831int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
 832{
 833        struct rt6_info *ort = (struct rt6_info *) *dstp;
 834        struct rt6_info *rt = (struct rt6_info *)
 835                dst_alloc(&ip6_dst_blackhole_ops);
 836        struct dst_entry *new = NULL;
 837
 838        if (rt) {
 839                new = &rt->dst;
 840
 841                atomic_set(&new->__refcnt, 1);
 842                new->__use = 1;
 843                new->input = dst_discard;
 844                new->output = dst_discard;
 845
 846                dst_copy_metrics(new, &ort->dst);
 847                new->dev = ort->dst.dev;
 848                if (new->dev)
 849                        dev_hold(new->dev);
 850                rt->rt6i_idev = ort->rt6i_idev;
 851                if (rt->rt6i_idev)
 852                        in6_dev_hold(rt->rt6i_idev);
 853                rt->rt6i_expires = 0;
 854
 855                ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
 856                rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
 857                rt->rt6i_metric = 0;
 858
 859                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
 860#ifdef CONFIG_IPV6_SUBTREES
 861                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
 862#endif
 863
 864                dst_free(new);
 865        }
 866
 867        dst_release(*dstp);
 868        *dstp = new;
 869        return new ? 0 : -ENOMEM;
 870}
 871EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
 872
 873/*
 874 *      Destination cache support functions
 875 */
 876
 877static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 878{
 879        struct rt6_info *rt;
 880
 881        rt = (struct rt6_info *) dst;
 882
 883        if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
 884                return dst;
 885
 886        return NULL;
 887}
 888
 889static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
 890{
 891        struct rt6_info *rt = (struct rt6_info *) dst;
 892
 893        if (rt) {
 894                if (rt->rt6i_flags & RTF_CACHE) {
 895                        if (rt6_check_expired(rt)) {
 896                                ip6_del_rt(rt);
 897                                dst = NULL;
 898                        }
 899                } else {
 900                        dst_release(dst);
 901                        dst = NULL;
 902                }
 903        }
 904        return dst;
 905}
 906
 907static void ip6_link_failure(struct sk_buff *skb)
 908{
 909        struct rt6_info *rt;
 910
 911        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
 912
 913        rt = (struct rt6_info *) skb_dst(skb);
 914        if (rt) {
 915                if (rt->rt6i_flags&RTF_CACHE) {
 916                        dst_set_expires(&rt->dst, 0);
 917                        rt->rt6i_flags |= RTF_EXPIRES;
 918                } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
 919                        rt->rt6i_node->fn_sernum = -1;
 920        }
 921}
 922
 923static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 924{
 925        struct rt6_info *rt6 = (struct rt6_info*)dst;
 926
 927        if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
 928                rt6->rt6i_flags |= RTF_MODIFIED;
 929                if (mtu < IPV6_MIN_MTU) {
 930                        u32 features = dst_metric(dst, RTAX_FEATURES);
 931                        mtu = IPV6_MIN_MTU;
 932                        features |= RTAX_FEATURE_ALLFRAG;
 933                        dst_metric_set(dst, RTAX_FEATURES, features);
 934                }
 935                dst_metric_set(dst, RTAX_MTU, mtu);
 936                call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
 937        }
 938}
 939
 940static unsigned int ip6_default_advmss(const struct dst_entry *dst)
 941{
 942        struct net_device *dev = dst->dev;
 943        unsigned int mtu = dst_mtu(dst);
 944        struct net *net = dev_net(dev);
 945
 946        mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
 947
 948        if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
 949                mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
 950
 951        /*
 952         * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
 953         * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
 954         * IPV6_MAXPLEN is also valid and means: "any MSS,
 955         * rely only on pmtu discovery"
 956         */
 957        if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
 958                mtu = IPV6_MAXPLEN;
 959        return mtu;
 960}
 961
 962static unsigned int ip6_default_mtu(const struct dst_entry *dst)
 963{
 964        unsigned int mtu = IPV6_MIN_MTU;
 965        struct inet6_dev *idev;
 966
 967        rcu_read_lock();
 968        idev = __in6_dev_get(dst->dev);
 969        if (idev)
 970                mtu = idev->cnf.mtu6;
 971        rcu_read_unlock();
 972
 973        return mtu;
 974}
 975
 976static struct dst_entry *icmp6_dst_gc_list;
 977static DEFINE_SPINLOCK(icmp6_dst_lock);
 978
 979struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
 980                                  struct neighbour *neigh,
 981                                  const struct in6_addr *addr)
 982{
 983        struct rt6_info *rt;
 984        struct inet6_dev *idev = in6_dev_get(dev);
 985        struct net *net = dev_net(dev);
 986
 987        if (unlikely(idev == NULL))
 988                return NULL;
 989
 990        rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
 991        if (unlikely(rt == NULL)) {
 992                in6_dev_put(idev);
 993                goto out;
 994        }
 995
 996        dev_hold(dev);
 997        if (neigh)
 998                neigh_hold(neigh);
 999        else {
1000                neigh = ndisc_get_neigh(dev, addr);
1001                if (IS_ERR(neigh))
1002                        neigh = NULL;
1003        }
1004
1005        rt->rt6i_dev      = dev;
1006        rt->rt6i_idev     = idev;
1007        rt->rt6i_nexthop  = neigh;
1008        atomic_set(&rt->dst.__refcnt, 1);
1009        dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1010        rt->dst.output  = ip6_output;
1011
1012#if 0   /* there's no chance to use these for ndisc */
1013        rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1014                                ? DST_HOST
1015                                : 0;
1016        ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1017        rt->rt6i_dst.plen = 128;
1018#endif
1019
1020        spin_lock_bh(&icmp6_dst_lock);
1021        rt->dst.next = icmp6_dst_gc_list;
1022        icmp6_dst_gc_list = &rt->dst;
1023        spin_unlock_bh(&icmp6_dst_lock);
1024
1025        fib6_force_start_gc(net);
1026
1027out:
1028        return &rt->dst;
1029}
1030
1031int icmp6_dst_gc(void)
1032{
1033        struct dst_entry *dst, *next, **pprev;
1034        int more = 0;
1035
1036        next = NULL;
1037
1038        spin_lock_bh(&icmp6_dst_lock);
1039        pprev = &icmp6_dst_gc_list;
1040
1041        while ((dst = *pprev) != NULL) {
1042                if (!atomic_read(&dst->__refcnt)) {
1043                        *pprev = dst->next;
1044                        dst_free(dst);
1045                } else {
1046                        pprev = &dst->next;
1047                        ++more;
1048                }
1049        }
1050
1051        spin_unlock_bh(&icmp6_dst_lock);
1052
1053        return more;
1054}
1055
1056static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1057                            void *arg)
1058{
1059        struct dst_entry *dst, **pprev;
1060
1061        spin_lock_bh(&icmp6_dst_lock);
1062        pprev = &icmp6_dst_gc_list;
1063        while ((dst = *pprev) != NULL) {
1064                struct rt6_info *rt = (struct rt6_info *) dst;
1065                if (func(rt, arg)) {
1066                        *pprev = dst->next;
1067                        dst_free(dst);
1068                } else {
1069                        pprev = &dst->next;
1070                }
1071        }
1072        spin_unlock_bh(&icmp6_dst_lock);
1073}
1074
1075static int ip6_dst_gc(struct dst_ops *ops)
1076{
1077        unsigned long now = jiffies;
1078        struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1079        int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1080        int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1081        int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1082        int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1083        unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1084        int entries;
1085
1086        entries = dst_entries_get_fast(ops);
1087        if (time_after(rt_last_gc + rt_min_interval, now) &&
1088            entries <= rt_max_size)
1089                goto out;
1090
1091        net->ipv6.ip6_rt_gc_expire++;
1092        fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1093        net->ipv6.ip6_rt_last_gc = now;
1094        entries = dst_entries_get_slow(ops);
1095        if (entries < ops->gc_thresh)
1096                net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1097out:
1098        net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1099        return entries > rt_max_size;
1100}
1101
1102/* Clean host part of a prefix. Not necessary in radix tree,
1103   but results in cleaner routing tables.
1104
1105   Remove it only when all the things will work!
1106 */
1107
1108int ip6_dst_hoplimit(struct dst_entry *dst)
1109{
1110        int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1111        if (hoplimit == 0) {
1112                struct net_device *dev = dst->dev;
1113                struct inet6_dev *idev;
1114
1115                rcu_read_lock();
1116                idev = __in6_dev_get(dev);
1117                if (idev)
1118                        hoplimit = idev->cnf.hop_limit;
1119                else
1120                        hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1121                rcu_read_unlock();
1122        }
1123        return hoplimit;
1124}
1125EXPORT_SYMBOL(ip6_dst_hoplimit);
1126
1127/*
1128 *
1129 */
1130
1131int ip6_route_add(struct fib6_config *cfg)
1132{
1133        int err;
1134        struct net *net = cfg->fc_nlinfo.nl_net;
1135        struct rt6_info *rt = NULL;
1136        struct net_device *dev = NULL;
1137        struct inet6_dev *idev = NULL;
1138        struct fib6_table *table;
1139        int addr_type;
1140
1141        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1142                return -EINVAL;
1143#ifndef CONFIG_IPV6_SUBTREES
1144        if (cfg->fc_src_len)
1145                return -EINVAL;
1146#endif
1147        if (cfg->fc_ifindex) {
1148                err = -ENODEV;
1149                dev = dev_get_by_index(net, cfg->fc_ifindex);
1150                if (!dev)
1151                        goto out;
1152                idev = in6_dev_get(dev);
1153                if (!idev)
1154                        goto out;
1155        }
1156
1157        if (cfg->fc_metric == 0)
1158                cfg->fc_metric = IP6_RT_PRIO_USER;
1159
1160        table = fib6_new_table(net, cfg->fc_table);
1161        if (table == NULL) {
1162                err = -ENOBUFS;
1163                goto out;
1164        }
1165
1166        rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1167
1168        if (rt == NULL) {
1169                err = -ENOMEM;
1170                goto out;
1171        }
1172
1173        rt->dst.obsolete = -1;
1174        rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1175                                jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1176                                0;
1177
1178        if (cfg->fc_protocol == RTPROT_UNSPEC)
1179                cfg->fc_protocol = RTPROT_BOOT;
1180        rt->rt6i_protocol = cfg->fc_protocol;
1181
1182        addr_type = ipv6_addr_type(&cfg->fc_dst);
1183
1184        if (addr_type & IPV6_ADDR_MULTICAST)
1185                rt->dst.input = ip6_mc_input;
1186        else if (cfg->fc_flags & RTF_LOCAL)
1187                rt->dst.input = ip6_input;
1188        else
1189                rt->dst.input = ip6_forward;
1190
1191        rt->dst.output = ip6_output;
1192
1193        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1194        rt->rt6i_dst.plen = cfg->fc_dst_len;
1195        if (rt->rt6i_dst.plen == 128)
1196               rt->dst.flags = DST_HOST;
1197
1198#ifdef CONFIG_IPV6_SUBTREES
1199        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1200        rt->rt6i_src.plen = cfg->fc_src_len;
1201#endif
1202
1203        rt->rt6i_metric = cfg->fc_metric;
1204
1205        /* We cannot add true routes via loopback here,
1206           they would result in kernel looping; promote them to reject routes
1207         */
1208        if ((cfg->fc_flags & RTF_REJECT) ||
1209            (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1210                                              && !(cfg->fc_flags&RTF_LOCAL))) {
1211                /* hold loopback dev/idev if we haven't done so. */
1212                if (dev != net->loopback_dev) {
1213                        if (dev) {
1214                                dev_put(dev);
1215                                in6_dev_put(idev);
1216                        }
1217                        dev = net->loopback_dev;
1218                        dev_hold(dev);
1219                        idev = in6_dev_get(dev);
1220                        if (!idev) {
1221                                err = -ENODEV;
1222                                goto out;
1223                        }
1224                }
1225                rt->dst.output = ip6_pkt_discard_out;
1226                rt->dst.input = ip6_pkt_discard;
1227                rt->dst.error = -ENETUNREACH;
1228                rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1229                goto install_route;
1230        }
1231
1232        if (cfg->fc_flags & RTF_GATEWAY) {
1233                struct in6_addr *gw_addr;
1234                int gwa_type;
1235
1236                gw_addr = &cfg->fc_gateway;
1237                ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1238                gwa_type = ipv6_addr_type(gw_addr);
1239
1240                if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1241                        struct rt6_info *grt;
1242
1243                        /* IPv6 strictly inhibits using not link-local
1244                           addresses as nexthop address.
1245                           Otherwise, router will not able to send redirects.
1246                           It is very good, but in some (rare!) circumstances
1247                           (SIT, PtP, NBMA NOARP links) it is handy to allow
1248                           some exceptions. --ANK
1249                         */
1250                        err = -EINVAL;
1251                        if (!(gwa_type&IPV6_ADDR_UNICAST))
1252                                goto out;
1253
1254                        grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1255
1256                        err = -EHOSTUNREACH;
1257                        if (grt == NULL)
1258                                goto out;
1259                        if (dev) {
1260                                if (dev != grt->rt6i_dev) {
1261                                        dst_release(&grt->dst);
1262                                        goto out;
1263                                }
1264                        } else {
1265                                dev = grt->rt6i_dev;
1266                                idev = grt->rt6i_idev;
1267                                dev_hold(dev);
1268                                in6_dev_hold(grt->rt6i_idev);
1269                        }
1270                        if (!(grt->rt6i_flags&RTF_GATEWAY))
1271                                err = 0;
1272                        dst_release(&grt->dst);
1273
1274                        if (err)
1275                                goto out;
1276                }
1277                err = -EINVAL;
1278                if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1279                        goto out;
1280        }
1281
1282        err = -ENODEV;
1283        if (dev == NULL)
1284                goto out;
1285
1286        if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1287                rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1288                if (IS_ERR(rt->rt6i_nexthop)) {
1289                        err = PTR_ERR(rt->rt6i_nexthop);
1290                        rt->rt6i_nexthop = NULL;
1291                        goto out;
1292                }
1293        }
1294
1295        rt->rt6i_flags = cfg->fc_flags;
1296
1297install_route:
1298        if (cfg->fc_mx) {
1299                struct nlattr *nla;
1300                int remaining;
1301
1302                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1303                        int type = nla_type(nla);
1304
1305                        if (type) {
1306                                if (type > RTAX_MAX) {
1307                                        err = -EINVAL;
1308                                        goto out;
1309                                }
1310
1311                                dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1312                        }
1313                }
1314        }
1315
1316        rt->dst.dev = dev;
1317        rt->rt6i_idev = idev;
1318        rt->rt6i_table = table;
1319
1320        cfg->fc_nlinfo.nl_net = dev_net(dev);
1321
1322        return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1323
1324out:
1325        if (dev)
1326                dev_put(dev);
1327        if (idev)
1328                in6_dev_put(idev);
1329        if (rt)
1330                dst_free(&rt->dst);
1331        return err;
1332}
1333
1334static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1335{
1336        int err;
1337        struct fib6_table *table;
1338        struct net *net = dev_net(rt->rt6i_dev);
1339
1340        if (rt == net->ipv6.ip6_null_entry)
1341                return -ENOENT;
1342
1343        table = rt->rt6i_table;
1344        write_lock_bh(&table->tb6_lock);
1345
1346        err = fib6_del(rt, info);
1347        dst_release(&rt->dst);
1348
1349        write_unlock_bh(&table->tb6_lock);
1350
1351        return err;
1352}
1353
1354int ip6_del_rt(struct rt6_info *rt)
1355{
1356        struct nl_info info = {
1357                .nl_net = dev_net(rt->rt6i_dev),
1358        };
1359        return __ip6_del_rt(rt, &info);
1360}
1361
1362static int ip6_route_del(struct fib6_config *cfg)
1363{
1364        struct fib6_table *table;
1365        struct fib6_node *fn;
1366        struct rt6_info *rt;
1367        int err = -ESRCH;
1368
1369        table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1370        if (table == NULL)
1371                return err;
1372
1373        read_lock_bh(&table->tb6_lock);
1374
1375        fn = fib6_locate(&table->tb6_root,
1376                         &cfg->fc_dst, cfg->fc_dst_len,
1377                         &cfg->fc_src, cfg->fc_src_len);
1378
1379        if (fn) {
1380                for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1381                        if (cfg->fc_ifindex &&
1382                            (rt->rt6i_dev == NULL ||
1383                             rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1384                                continue;
1385                        if (cfg->fc_flags & RTF_GATEWAY &&
1386                            !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1387                                continue;
1388                        if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1389                                continue;
1390                        dst_hold(&rt->dst);
1391                        read_unlock_bh(&table->tb6_lock);
1392
1393                        return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1394                }
1395        }
1396        read_unlock_bh(&table->tb6_lock);
1397
1398        return err;
1399}
1400
1401/*
1402 *      Handle redirects
1403 */
1404struct ip6rd_flowi {
1405        struct flowi fl;
1406        struct in6_addr gateway;
1407};
1408
1409static struct rt6_info *__ip6_route_redirect(struct net *net,
1410                                             struct fib6_table *table,
1411                                             struct flowi *fl,
1412                                             int flags)
1413{
1414        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1415        struct rt6_info *rt;
1416        struct fib6_node *fn;
1417
1418        /*
1419         * Get the "current" route for this destination and
1420         * check if the redirect has come from approriate router.
1421         *
1422         * RFC 2461 specifies that redirects should only be
1423         * accepted if they come from the nexthop to the target.
1424         * Due to the way the routes are chosen, this notion
1425         * is a bit fuzzy and one might need to check all possible
1426         * routes.
1427         */
1428
1429        read_lock_bh(&table->tb6_lock);
1430        fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1431restart:
1432        for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1433                /*
1434                 * Current route is on-link; redirect is always invalid.
1435                 *
1436                 * Seems, previous statement is not true. It could
1437                 * be node, which looks for us as on-link (f.e. proxy ndisc)
1438                 * But then router serving it might decide, that we should
1439                 * know truth 8)8) --ANK (980726).
1440                 */
1441                if (rt6_check_expired(rt))
1442                        continue;
1443                if (!(rt->rt6i_flags & RTF_GATEWAY))
1444                        continue;
1445                if (fl->oif != rt->rt6i_dev->ifindex)
1446                        continue;
1447                if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1448                        continue;
1449                break;
1450        }
1451
1452        if (!rt)
1453                rt = net->ipv6.ip6_null_entry;
1454        BACKTRACK(net, &fl->fl6_src);
1455out:
1456        dst_hold(&rt->dst);
1457
1458        read_unlock_bh(&table->tb6_lock);
1459
1460        return rt;
1461};
1462
1463static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1464                                           struct in6_addr *src,
1465                                           struct in6_addr *gateway,
1466                                           struct net_device *dev)
1467{
1468        int flags = RT6_LOOKUP_F_HAS_SADDR;
1469        struct net *net = dev_net(dev);
1470        struct ip6rd_flowi rdfl = {
1471                .fl = {
1472                        .oif = dev->ifindex,
1473                        .fl6_dst = *dest,
1474                        .fl6_src = *src,
1475                },
1476        };
1477
1478        ipv6_addr_copy(&rdfl.gateway, gateway);
1479
1480        if (rt6_need_strict(dest))
1481                flags |= RT6_LOOKUP_F_IFACE;
1482
1483        return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1484                                                   flags, __ip6_route_redirect);
1485}
1486
1487void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1488                  struct in6_addr *saddr,
1489                  struct neighbour *neigh, u8 *lladdr, int on_link)
1490{
1491        struct rt6_info *rt, *nrt = NULL;
1492        struct netevent_redirect netevent;
1493        struct net *net = dev_net(neigh->dev);
1494
1495        rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1496
1497        if (rt == net->ipv6.ip6_null_entry) {
1498                if (net_ratelimit())
1499                        printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1500                               "for redirect target\n");
1501                goto out;
1502        }
1503
1504        /*
1505         *      We have finally decided to accept it.
1506         */
1507
1508        neigh_update(neigh, lladdr, NUD_STALE,
1509                     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1510                     NEIGH_UPDATE_F_OVERRIDE|
1511                     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1512                                     NEIGH_UPDATE_F_ISROUTER))
1513                     );
1514
1515        /*
1516         * Redirect received -> path was valid.
1517         * Look, redirects are sent only in response to data packets,
1518         * so that this nexthop apparently is reachable. --ANK
1519         */
1520        dst_confirm(&rt->dst);
1521
1522        /* Duplicate redirect: silently ignore. */
1523        if (neigh == rt->dst.neighbour)
1524                goto out;
1525
1526        nrt = ip6_rt_copy(rt);
1527        if (nrt == NULL)
1528                goto out;
1529
1530        nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1531        if (on_link)
1532                nrt->rt6i_flags &= ~RTF_GATEWAY;
1533
1534        ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1535        nrt->rt6i_dst.plen = 128;
1536        nrt->dst.flags |= DST_HOST;
1537
1538        ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1539        nrt->rt6i_nexthop = neigh_clone(neigh);
1540
1541        if (ip6_ins_rt(nrt))
1542                goto out;
1543
1544        netevent.old = &rt->dst;
1545        netevent.new = &nrt->dst;
1546        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1547
1548        if (rt->rt6i_flags&RTF_CACHE) {
1549                ip6_del_rt(rt);
1550                return;
1551        }
1552
1553out:
1554        dst_release(&rt->dst);
1555}
1556
1557/*
1558 *      Handle ICMP "packet too big" messages
1559 *      i.e. Path MTU discovery
1560 */
1561
1562static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1563                             struct net *net, u32 pmtu, int ifindex)
1564{
1565        struct rt6_info *rt, *nrt;
1566        int allfrag = 0;
1567again:
1568        rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1569        if (rt == NULL)
1570                return;
1571
1572        if (rt6_check_expired(rt)) {
1573                ip6_del_rt(rt);
1574                goto again;
1575        }
1576
1577        if (pmtu >= dst_mtu(&rt->dst))
1578                goto out;
1579
1580        if (pmtu < IPV6_MIN_MTU) {
1581                /*
1582                 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1583                 * MTU (1280) and a fragment header should always be included
1584                 * after a node receiving Too Big message reporting PMTU is
1585                 * less than the IPv6 Minimum Link MTU.
1586                 */
1587                pmtu = IPV6_MIN_MTU;
1588                allfrag = 1;
1589        }
1590
1591        /* New mtu received -> path was valid.
1592           They are sent only in response to data packets,
1593           so that this nexthop apparently is reachable. --ANK
1594         */
1595        dst_confirm(&rt->dst);
1596
1597        /* Host route. If it is static, it would be better
1598           not to override it, but add new one, so that
1599           when cache entry will expire old pmtu
1600           would return automatically.
1601         */
1602        if (rt->rt6i_flags & RTF_CACHE) {
1603                dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1604                if (allfrag) {
1605                        u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1606                        features |= RTAX_FEATURE_ALLFRAG;
1607                        dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1608                }
1609                dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1610                rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1611                goto out;
1612        }
1613
1614        /* Network route.
1615           Two cases are possible:
1616           1. It is connected route. Action: COW
1617           2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1618         */
1619        if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1620                nrt = rt6_alloc_cow(rt, daddr, saddr);
1621        else
1622                nrt = rt6_alloc_clone(rt, daddr);
1623
1624        if (nrt) {
1625                dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1626                if (allfrag) {
1627                        u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1628                        features |= RTAX_FEATURE_ALLFRAG;
1629                        dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1630                }
1631
1632                /* According to RFC 1981, detecting PMTU increase shouldn't be
1633                 * happened within 5 mins, the recommended timer is 10 mins.
1634                 * Here this route expiration time is set to ip6_rt_mtu_expires
1635                 * which is 10 mins. After 10 mins the decreased pmtu is expired
1636                 * and detecting PMTU increase will be automatically happened.
1637                 */
1638                dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1639                nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1640
1641                ip6_ins_rt(nrt);
1642        }
1643out:
1644        dst_release(&rt->dst);
1645}
1646
1647void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1648                        struct net_device *dev, u32 pmtu)
1649{
1650        struct net *net = dev_net(dev);
1651
1652        /*
1653         * RFC 1981 states that a node "MUST reduce the size of the packets it
1654         * is sending along the path" that caused the Packet Too Big message.
1655         * Since it's not possible in the general case to determine which
1656         * interface was used to send the original packet, we update the MTU
1657         * on the interface that will be used to send future packets. We also
1658         * update the MTU on the interface that received the Packet Too Big in
1659         * case the original packet was forced out that interface with
1660         * SO_BINDTODEVICE or similar. This is the next best thing to the
1661         * correct behaviour, which would be to update the MTU on all
1662         * interfaces.
1663         */
1664        rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1665        rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1666}
1667
1668/*
1669 *      Misc support functions
1670 */
1671
1672static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1673{
1674        struct net *net = dev_net(ort->rt6i_dev);
1675        struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1676
1677        if (rt) {
1678                rt->dst.input = ort->dst.input;
1679                rt->dst.output = ort->dst.output;
1680
1681                dst_copy_metrics(&rt->dst, &ort->dst);
1682                rt->dst.error = ort->dst.error;
1683                rt->dst.dev = ort->dst.dev;
1684                if (rt->dst.dev)
1685                        dev_hold(rt->dst.dev);
1686                rt->rt6i_idev = ort->rt6i_idev;
1687                if (rt->rt6i_idev)
1688                        in6_dev_hold(rt->rt6i_idev);
1689                rt->dst.lastuse = jiffies;
1690                rt->rt6i_expires = 0;
1691
1692                ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1693                rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1694                rt->rt6i_metric = 0;
1695
1696                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1697#ifdef CONFIG_IPV6_SUBTREES
1698                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1699#endif
1700                rt->rt6i_table = ort->rt6i_table;
1701        }
1702        return rt;
1703}
1704
1705#ifdef CONFIG_IPV6_ROUTE_INFO
1706static struct rt6_info *rt6_get_route_info(struct net *net,
1707                                           struct in6_addr *prefix, int prefixlen,
1708                                           struct in6_addr *gwaddr, int ifindex)
1709{
1710        struct fib6_node *fn;
1711        struct rt6_info *rt = NULL;
1712        struct fib6_table *table;
1713
1714        table = fib6_get_table(net, RT6_TABLE_INFO);
1715        if (table == NULL)
1716                return NULL;
1717
1718        write_lock_bh(&table->tb6_lock);
1719        fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1720        if (!fn)
1721                goto out;
1722
1723        for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1724                if (rt->rt6i_dev->ifindex != ifindex)
1725                        continue;
1726                if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1727                        continue;
1728                if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1729                        continue;
1730                dst_hold(&rt->dst);
1731                break;
1732        }
1733out:
1734        write_unlock_bh(&table->tb6_lock);
1735        return rt;
1736}
1737
1738static struct rt6_info *rt6_add_route_info(struct net *net,
1739                                           struct in6_addr *prefix, int prefixlen,
1740                                           struct in6_addr *gwaddr, int ifindex,
1741                                           unsigned pref)
1742{
1743        struct fib6_config cfg = {
1744                .fc_table       = RT6_TABLE_INFO,
1745                .fc_metric      = IP6_RT_PRIO_USER,
1746                .fc_ifindex     = ifindex,
1747                .fc_dst_len     = prefixlen,
1748                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1749                                  RTF_UP | RTF_PREF(pref),
1750                .fc_nlinfo.pid = 0,
1751                .fc_nlinfo.nlh = NULL,
1752                .fc_nlinfo.nl_net = net,
1753        };
1754
1755        ipv6_addr_copy(&cfg.fc_dst, prefix);
1756        ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1757
1758        /* We should treat it as a default route if prefix length is 0. */
1759        if (!prefixlen)
1760                cfg.fc_flags |= RTF_DEFAULT;
1761
1762        ip6_route_add(&cfg);
1763
1764        return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1765}
1766#endif
1767
1768struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1769{
1770        struct rt6_info *rt;
1771        struct fib6_table *table;
1772
1773        table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1774        if (table == NULL)
1775                return NULL;
1776
1777        write_lock_bh(&table->tb6_lock);
1778        for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1779                if (dev == rt->rt6i_dev &&
1780                    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1781                    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1782                        break;
1783        }
1784        if (rt)
1785                dst_hold(&rt->dst);
1786        write_unlock_bh(&table->tb6_lock);
1787        return rt;
1788}
1789
1790struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1791                                     struct net_device *dev,
1792                                     unsigned int pref)
1793{
1794        struct fib6_config cfg = {
1795                .fc_table       = RT6_TABLE_DFLT,
1796                .fc_metric      = IP6_RT_PRIO_USER,
1797                .fc_ifindex     = dev->ifindex,
1798                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1799                                  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1800                .fc_nlinfo.pid = 0,
1801                .fc_nlinfo.nlh = NULL,
1802                .fc_nlinfo.nl_net = dev_net(dev),
1803        };
1804
1805        ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1806
1807        ip6_route_add(&cfg);
1808
1809        return rt6_get_dflt_router(gwaddr, dev);
1810}
1811
1812void rt6_purge_dflt_routers(struct net *net)
1813{
1814        struct rt6_info *rt;
1815        struct fib6_table *table;
1816
1817        /* NOTE: Keep consistent with rt6_get_dflt_router */
1818        table = fib6_get_table(net, RT6_TABLE_DFLT);
1819        if (table == NULL)
1820                return;
1821
1822restart:
1823        read_lock_bh(&table->tb6_lock);
1824        for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1825                if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1826                        dst_hold(&rt->dst);
1827                        read_unlock_bh(&table->tb6_lock);
1828                        ip6_del_rt(rt);
1829                        goto restart;
1830                }
1831        }
1832        read_unlock_bh(&table->tb6_lock);
1833}
1834
1835static void rtmsg_to_fib6_config(struct net *net,
1836                                 struct in6_rtmsg *rtmsg,
1837                                 struct fib6_config *cfg)
1838{
1839        memset(cfg, 0, sizeof(*cfg));
1840
1841        cfg->fc_table = RT6_TABLE_MAIN;
1842        cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1843        cfg->fc_metric = rtmsg->rtmsg_metric;
1844        cfg->fc_expires = rtmsg->rtmsg_info;
1845        cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1846        cfg->fc_src_len = rtmsg->rtmsg_src_len;
1847        cfg->fc_flags = rtmsg->rtmsg_flags;
1848
1849        cfg->fc_nlinfo.nl_net = net;
1850
1851        ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1852        ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1853        ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1854}
1855
1856int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1857{
1858        struct fib6_config cfg;
1859        struct in6_rtmsg rtmsg;
1860        int err;
1861
1862        switch(cmd) {
1863        case SIOCADDRT:         /* Add a route */
1864        case SIOCDELRT:         /* Delete a route */
1865                if (!capable(CAP_NET_ADMIN))
1866                        return -EPERM;
1867                err = copy_from_user(&rtmsg, arg,
1868                                     sizeof(struct in6_rtmsg));
1869                if (err)
1870                        return -EFAULT;
1871
1872                rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1873
1874                rtnl_lock();
1875                switch (cmd) {
1876                case SIOCADDRT:
1877                        err = ip6_route_add(&cfg);
1878                        break;
1879                case SIOCDELRT:
1880                        err = ip6_route_del(&cfg);
1881                        break;
1882                default:
1883                        err = -EINVAL;
1884                }
1885                rtnl_unlock();
1886
1887                return err;
1888        }
1889
1890        return -EINVAL;
1891}
1892
1893/*
1894 *      Drop the packet on the floor
1895 */
1896
1897static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1898{
1899        int type;
1900        struct dst_entry *dst = skb_dst(skb);
1901        switch (ipstats_mib_noroutes) {
1902        case IPSTATS_MIB_INNOROUTES:
1903                type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1904                if (type == IPV6_ADDR_ANY) {
1905                        IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1906                                      IPSTATS_MIB_INADDRERRORS);
1907                        break;
1908                }
1909                /* FALLTHROUGH */
1910        case IPSTATS_MIB_OUTNOROUTES:
1911                IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1912                              ipstats_mib_noroutes);
1913                break;
1914        }
1915        icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1916        kfree_skb(skb);
1917        return 0;
1918}
1919
1920static int ip6_pkt_discard(struct sk_buff *skb)
1921{
1922        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1923}
1924
1925static int ip6_pkt_discard_out(struct sk_buff *skb)
1926{
1927        skb->dev = skb_dst(skb)->dev;
1928        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1929}
1930
1931#ifdef CONFIG_IPV6_MULTIPLE_TABLES
1932
1933static int ip6_pkt_prohibit(struct sk_buff *skb)
1934{
1935        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1936}
1937
1938static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1939{
1940        skb->dev = skb_dst(skb)->dev;
1941        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1942}
1943
1944#endif
1945
1946/*
1947 *      Allocate a dst for local (unicast / anycast) address.
1948 */
1949
1950struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1951                                    const struct in6_addr *addr,
1952                                    int anycast)
1953{
1954        struct net *net = dev_net(idev->dev);
1955        struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1956        struct neighbour *neigh;
1957
1958        if (rt == NULL) {
1959                if (net_ratelimit())
1960                        pr_warning("IPv6:  Maximum number of routes reached,"
1961                                   " consider increasing route/max_size.\n");
1962                return ERR_PTR(-ENOMEM);
1963        }
1964
1965        dev_hold(net->loopback_dev);
1966        in6_dev_hold(idev);
1967
1968        rt->dst.flags = DST_HOST;
1969        rt->dst.input = ip6_input;
1970        rt->dst.output = ip6_output;
1971        rt->rt6i_dev = net->loopback_dev;
1972        rt->rt6i_idev = idev;
1973        dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1);
1974        rt->dst.obsolete = -1;
1975
1976        rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1977        if (anycast)
1978                rt->rt6i_flags |= RTF_ANYCAST;
1979        else
1980                rt->rt6i_flags |= RTF_LOCAL;
1981        neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1982        if (IS_ERR(neigh)) {
1983                dst_free(&rt->dst);
1984
1985                /* We are casting this because that is the return
1986                 * value type.  But an errno encoded pointer is the
1987                 * same regardless of the underlying pointer type,
1988                 * and that's what we are returning.  So this is OK.
1989                 */
1990                return (struct rt6_info *) neigh;
1991        }
1992        rt->rt6i_nexthop = neigh;
1993
1994        ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1995        rt->rt6i_dst.plen = 128;
1996        rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1997
1998        atomic_set(&rt->dst.__refcnt, 1);
1999
2000        return rt;
2001}
2002
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches any device */
	struct net *net;	/* namespace whose ip6_null_entry must be kept */
};
2007
2008static int fib6_ifdown(struct rt6_info *rt, void *arg)
2009{
2010        const struct arg_dev_net *adn = arg;
2011        const struct net_device *dev = adn->dev;
2012
2013        if ((rt->rt6i_dev == dev || dev == NULL) &&
2014            rt != adn->net->ipv6.ip6_null_entry) {
2015                RT6_TRACE("deleted by ifdown %p\n", rt);
2016                return -1;
2017        }
2018        return 0;
2019}
2020
2021void rt6_ifdown(struct net *net, struct net_device *dev)
2022{
2023        struct arg_dev_net adn = {
2024                .dev = dev,
2025                .net = net,
2026        };
2027
2028        fib6_clean_all(net, fib6_ifdown, 0, &adn);
2029        icmp6_clean_all(fib6_ifdown, &adn);
2030}
2031
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU */
};
2037
2038static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2039{
2040        struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2041        struct inet6_dev *idev;
2042
2043        /* In IPv6 pmtu discovery is not optional,
2044           so that RTAX_MTU lock cannot disable it.
2045           We still use this lock to block changes
2046           caused by addrconf/ndisc.
2047        */
2048
2049        idev = __in6_dev_get(arg->dev);
2050        if (idev == NULL)
2051                return 0;
2052
2053        /* For administrative MTU increase, there is no way to discover
2054           IPv6 PMTU increase, so PMTU increase should be updated here.
2055           Since RFC 1981 doesn't include administrative MTU increase
2056           update PMTU increase is a MUST. (i.e. jumbo frame)
2057         */
2058        /*
2059           If new MTU is less than route PMTU, this new MTU will be the
2060           lowest MTU in the path, update the route PMTU to reflect PMTU
2061           decreases; if new MTU is greater than route PMTU, and the
2062           old MTU is the lowest MTU in the path, update the route PMTU
2063           to reflect the increase. In this case if the other nodes' MTU
2064           also have the lowest MTU, TOO BIG MESSAGE will be lead to
2065           PMTU discouvery.
2066         */
2067        if (rt->rt6i_dev == arg->dev &&
2068            !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2069            (dst_mtu(&rt->dst) >= arg->mtu ||
2070             (dst_mtu(&rt->dst) < arg->mtu &&
2071              dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2072                dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2073        }
2074        return 0;
2075}
2076
2077void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2078{
2079        struct rt6_mtu_change_arg arg = {
2080                .dev = dev,
2081                .mtu = mtu,
2082        };
2083
2084        fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2085}
2086
2087static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2088        [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2089        [RTA_OIF]               = { .type = NLA_U32 },
2090        [RTA_IIF]               = { .type = NLA_U32 },
2091        [RTA_PRIORITY]          = { .type = NLA_U32 },
2092        [RTA_METRICS]           = { .type = NLA_NESTED },
2093};
2094
2095static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2096                              struct fib6_config *cfg)
2097{
2098        struct rtmsg *rtm;
2099        struct nlattr *tb[RTA_MAX+1];
2100        int err;
2101
2102        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2103        if (err < 0)
2104                goto errout;
2105
2106        err = -EINVAL;
2107        rtm = nlmsg_data(nlh);
2108        memset(cfg, 0, sizeof(*cfg));
2109
2110        cfg->fc_table = rtm->rtm_table;
2111        cfg->fc_dst_len = rtm->rtm_dst_len;
2112        cfg->fc_src_len = rtm->rtm_src_len;
2113        cfg->fc_flags = RTF_UP;
2114        cfg->fc_protocol = rtm->rtm_protocol;
2115
2116        if (rtm->rtm_type == RTN_UNREACHABLE)
2117                cfg->fc_flags |= RTF_REJECT;
2118
2119        if (rtm->rtm_type == RTN_LOCAL)
2120                cfg->fc_flags |= RTF_LOCAL;
2121
2122        cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2123        cfg->fc_nlinfo.nlh = nlh;
2124        cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2125
2126        if (tb[RTA_GATEWAY]) {
2127                nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2128                cfg->fc_flags |= RTF_GATEWAY;
2129        }
2130
2131        if (tb[RTA_DST]) {
2132                int plen = (rtm->rtm_dst_len + 7) >> 3;
2133
2134                if (nla_len(tb[RTA_DST]) < plen)
2135                        goto errout;
2136
2137                nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2138        }
2139
2140        if (tb[RTA_SRC]) {
2141                int plen = (rtm->rtm_src_len + 7) >> 3;
2142
2143                if (nla_len(tb[RTA_SRC]) < plen)
2144                        goto errout;
2145
2146                nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2147        }
2148
2149        if (tb[RTA_OIF])
2150                cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2151
2152        if (tb[RTA_PRIORITY])
2153                cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2154
2155        if (tb[RTA_METRICS]) {
2156                cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2157                cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2158        }
2159
2160        if (tb[RTA_TABLE])
2161                cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2162
2163        err = 0;
2164errout:
2165        return err;
2166}
2167
2168static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2169{
2170        struct fib6_config cfg;
2171        int err;
2172
2173        err = rtm_to_fib6_config(skb, nlh, &cfg);
2174        if (err < 0)
2175                return err;
2176
2177        return ip6_route_del(&cfg);
2178}
2179
2180static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2181{
2182        struct fib6_config cfg;
2183        int err;
2184
2185        err = rtm_to_fib6_config(skb, nlh, &cfg);
2186        if (err < 0)
2187                return err;
2188
2189        return ip6_route_add(&cfg);
2190}
2191
2192static inline size_t rt6_nlmsg_size(void)
2193{
2194        return NLMSG_ALIGN(sizeof(struct rtmsg))
2195               + nla_total_size(16) /* RTA_SRC */
2196               + nla_total_size(16) /* RTA_DST */
2197               + nla_total_size(16) /* RTA_GATEWAY */
2198               + nla_total_size(16) /* RTA_PREFSRC */
2199               + nla_total_size(4) /* RTA_TABLE */
2200               + nla_total_size(4) /* RTA_IIF */
2201               + nla_total_size(4) /* RTA_OIF */
2202               + nla_total_size(4) /* RTA_PRIORITY */
2203               + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2204               + nla_total_size(sizeof(struct rta_cacheinfo));
2205}
2206
2207static int rt6_fill_node(struct net *net,
2208                         struct sk_buff *skb, struct rt6_info *rt,
2209                         struct in6_addr *dst, struct in6_addr *src,
2210                         int iif, int type, u32 pid, u32 seq,
2211                         int prefix, int nowait, unsigned int flags)
2212{
2213        struct rtmsg *rtm;
2214        struct nlmsghdr *nlh;
2215        long expires;
2216        u32 table;
2217
2218        if (prefix) {   /* user wants prefix routes only */
2219                if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2220                        /* success since this is not a prefix route */
2221                        return 1;
2222                }
2223        }
2224
2225        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2226        if (nlh == NULL)
2227                return -EMSGSIZE;
2228
2229        rtm = nlmsg_data(nlh);
2230        rtm->rtm_family = AF_INET6;
2231        rtm->rtm_dst_len = rt->rt6i_dst.plen;
2232        rtm->rtm_src_len = rt->rt6i_src.plen;
2233        rtm->rtm_tos = 0;
2234        if (rt->rt6i_table)
2235                table = rt->rt6i_table->tb6_id;
2236        else
2237                table = RT6_TABLE_UNSPEC;
2238        rtm->rtm_table = table;
2239        NLA_PUT_U32(skb, RTA_TABLE, table);
2240        if (rt->rt6i_flags&RTF_REJECT)
2241                rtm->rtm_type = RTN_UNREACHABLE;
2242        else if (rt->rt6i_flags&RTF_LOCAL)
2243                rtm->rtm_type = RTN_LOCAL;
2244        else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2245                rtm->rtm_type = RTN_LOCAL;
2246        else
2247                rtm->rtm_type = RTN_UNICAST;
2248        rtm->rtm_flags = 0;
2249        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2250        rtm->rtm_protocol = rt->rt6i_protocol;
2251        if (rt->rt6i_flags&RTF_DYNAMIC)
2252                rtm->rtm_protocol = RTPROT_REDIRECT;
2253        else if (rt->rt6i_flags & RTF_ADDRCONF)
2254                rtm->rtm_protocol = RTPROT_KERNEL;
2255        else if (rt->rt6i_flags&RTF_DEFAULT)
2256                rtm->rtm_protocol = RTPROT_RA;
2257
2258        if (rt->rt6i_flags&RTF_CACHE)
2259                rtm->rtm_flags |= RTM_F_CLONED;
2260
2261        if (dst) {
2262                NLA_PUT(skb, RTA_DST, 16, dst);
2263                rtm->rtm_dst_len = 128;
2264        } else if (rtm->rtm_dst_len)
2265                NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2266#ifdef CONFIG_IPV6_SUBTREES
2267        if (src) {
2268                NLA_PUT(skb, RTA_SRC, 16, src);
2269                rtm->rtm_src_len = 128;
2270        } else if (rtm->rtm_src_len)
2271                NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2272#endif
2273        if (iif) {
2274#ifdef CONFIG_IPV6_MROUTE
2275                if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2276                        int err = ip6mr_get_route(net, skb, rtm, nowait);
2277                        if (err <= 0) {
2278                                if (!nowait) {
2279                                        if (err == 0)
2280                                                return 0;
2281                                        goto nla_put_failure;
2282                                } else {
2283                                        if (err == -EMSGSIZE)
2284                                                goto nla_put_failure;
2285                                }
2286                        }
2287                } else
2288#endif
2289                        NLA_PUT_U32(skb, RTA_IIF, iif);
2290        } else if (dst) {
2291                struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2292                struct in6_addr saddr_buf;
2293                if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2294                                       dst, 0, &saddr_buf) == 0)
2295                        NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2296        }
2297
2298        if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2299                goto nla_put_failure;
2300
2301        if (rt->dst.neighbour)
2302                NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2303
2304        if (rt->dst.dev)
2305                NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2306
2307        NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2308
2309        if (!(rt->rt6i_flags & RTF_EXPIRES))
2310                expires = 0;
2311        else if (rt->rt6i_expires - jiffies < INT_MAX)
2312                expires = rt->rt6i_expires - jiffies;
2313        else
2314                expires = INT_MAX;
2315
2316        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2317                               expires, rt->dst.error) < 0)
2318                goto nla_put_failure;
2319
2320        return nlmsg_end(skb, nlh);
2321
2322nla_put_failure:
2323        nlmsg_cancel(skb, nlh);
2324        return -EMSGSIZE;
2325}
2326
2327int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2328{
2329        struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2330        int prefix;
2331
2332        if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2333                struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2334                prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2335        } else
2336                prefix = 0;
2337
2338        return rt6_fill_node(arg->net,
2339                     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2340                     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2341                     prefix, 0, NLM_F_MULTI);
2342}
2343
2344static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2345{
2346        struct net *net = sock_net(in_skb->sk);
2347        struct nlattr *tb[RTA_MAX+1];
2348        struct rt6_info *rt;
2349        struct sk_buff *skb;
2350        struct rtmsg *rtm;
2351        struct flowi fl;
2352        int err, iif = 0;
2353
2354        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2355        if (err < 0)
2356                goto errout;
2357
2358        err = -EINVAL;
2359        memset(&fl, 0, sizeof(fl));
2360
2361        if (tb[RTA_SRC]) {
2362                if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2363                        goto errout;
2364
2365                ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2366        }
2367
2368        if (tb[RTA_DST]) {
2369                if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2370                        goto errout;
2371
2372                ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2373        }
2374
2375        if (tb[RTA_IIF])
2376                iif = nla_get_u32(tb[RTA_IIF]);
2377
2378        if (tb[RTA_OIF])
2379                fl.oif = nla_get_u32(tb[RTA_OIF]);
2380
2381        if (iif) {
2382                struct net_device *dev;
2383                dev = __dev_get_by_index(net, iif);
2384                if (!dev) {
2385                        err = -ENODEV;
2386                        goto errout;
2387                }
2388        }
2389
2390        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2391        if (skb == NULL) {
2392                err = -ENOBUFS;
2393                goto errout;
2394        }
2395
2396        /* Reserve room for dummy headers, this skb can pass
2397           through good chunk of routing engine.
2398         */
2399        skb_reset_mac_header(skb);
2400        skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2401
2402        rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2403        skb_dst_set(skb, &rt->dst);
2404
2405        err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2406                            RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2407                            nlh->nlmsg_seq, 0, 0, 0);
2408        if (err < 0) {
2409                kfree_skb(skb);
2410                goto errout;
2411        }
2412
2413        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2414errout:
2415        return err;
2416}
2417
2418void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2419{
2420        struct sk_buff *skb;
2421        struct net *net = info->nl_net;
2422        u32 seq;
2423        int err;
2424
2425        err = -ENOBUFS;
2426        seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2427
2428        skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2429        if (skb == NULL)
2430                goto errout;
2431
2432        err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2433                                event, info->pid, seq, 0, 0, 0);
2434        if (err < 0) {
2435                /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2436                WARN_ON(err == -EMSGSIZE);
2437                kfree_skb(skb);
2438                goto errout;
2439        }
2440        rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2441                    info->nlh, gfp_any());
2442        return;
2443errout:
2444        if (err < 0)
2445                rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2446}
2447
2448static int ip6_route_dev_notify(struct notifier_block *this,
2449                                unsigned long event, void *data)
2450{
2451        struct net_device *dev = (struct net_device *)data;
2452        struct net *net = dev_net(dev);
2453
2454        if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2455                net->ipv6.ip6_null_entry->dst.dev = dev;
2456                net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2457#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2458                net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2459                net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2460                net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2461                net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2462#endif
2463        }
2464
2465        return NOTIFY_OK;
2466}
2467
2468/*
2469 *      /proc
2470 */
2471
2472#ifdef CONFIG_PROC_FS
2473
/*
 * Buffer bookkeeping for /proc output.
 * NOTE(review): not referenced by the seq_file based code visible in
 * this part of the file; possibly a leftover. Confirm before removing.
 */
struct rt6_proc_arg
{
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2482
2483static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2484{
2485        struct seq_file *m = p_arg;
2486
2487        seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2488
2489#ifdef CONFIG_IPV6_SUBTREES
2490        seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2491#else
2492        seq_puts(m, "00000000000000000000000000000000 00 ");
2493#endif
2494
2495        if (rt->rt6i_nexthop) {
2496                seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2497        } else {
2498                seq_puts(m, "00000000000000000000000000000000");
2499        }
2500        seq_printf(m, " %08x %08x %08x %08x %8s\n",
2501                   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2502                   rt->dst.__use, rt->rt6i_flags,
2503                   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2504        return 0;
2505}
2506
2507static int ipv6_route_show(struct seq_file *m, void *v)
2508{
2509        struct net *net = (struct net *)m->private;
2510        fib6_clean_all(net, rt6_info_route, 0, m);
2511        return 0;
2512}
2513
2514static int ipv6_route_open(struct inode *inode, struct file *file)
2515{
2516        return single_open_net(inode, file, ipv6_route_show);
2517}
2518
2519static const struct file_operations ipv6_route_proc_fops = {
2520        .owner          = THIS_MODULE,
2521        .open           = ipv6_route_open,
2522        .read           = seq_read,
2523        .llseek         = seq_lseek,
2524        .release        = single_release_net,
2525};
2526
2527static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2528{
2529        struct net *net = (struct net *)seq->private;
2530        seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2531                   net->ipv6.rt6_stats->fib_nodes,
2532                   net->ipv6.rt6_stats->fib_route_nodes,
2533                   net->ipv6.rt6_stats->fib_rt_alloc,
2534                   net->ipv6.rt6_stats->fib_rt_entries,
2535                   net->ipv6.rt6_stats->fib_rt_cache,
2536                   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2537                   net->ipv6.rt6_stats->fib_discarded_routes);
2538
2539        return 0;
2540}
2541
2542static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2543{
2544        return single_open_net(inode, file, rt6_stats_seq_show);
2545}
2546
2547static const struct file_operations rt6_stats_seq_fops = {
2548        .owner   = THIS_MODULE,
2549        .open    = rt6_stats_seq_open,
2550        .read    = seq_read,
2551        .llseek  = seq_lseek,
2552        .release = single_release_net,
2553};
2554#endif  /* CONFIG_PROC_FS */
2555
2556#ifdef CONFIG_SYSCTL
2557
2558static
2559int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2560                              void __user *buffer, size_t *lenp, loff_t *ppos)
2561{
2562        struct net *net;
2563        int delay;
2564        if (!write)
2565                return -EINVAL;
2566
2567        net = (struct net *)ctl->extra1;
2568        delay = net->ipv6.sysctl.flush_delay;
2569        proc_dointvec(ctl, write, buffer, lenp, ppos);
2570        fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2571        return 0;
2572}
2573
2574ctl_table ipv6_route_table_template[] = {
2575        {
2576                .procname       =       "flush",
2577                .data           =       &init_net.ipv6.sysctl.flush_delay,
2578                .maxlen         =       sizeof(int),
2579                .mode           =       0200,
2580                .proc_handler   =       ipv6_sysctl_rtcache_flush
2581        },
2582        {
2583                .procname       =       "gc_thresh",
2584                .data           =       &ip6_dst_ops_template.gc_thresh,
2585                .maxlen         =       sizeof(int),
2586                .mode           =       0644,
2587                .proc_handler   =       proc_dointvec,
2588        },
2589        {
2590                .procname       =       "max_size",
2591                .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2592                .maxlen         =       sizeof(int),
2593                .mode           =       0644,
2594                .proc_handler   =       proc_dointvec,
2595        },
2596        {
2597                .procname       =       "gc_min_interval",
2598                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2599                .maxlen         =       sizeof(int),
2600                .mode           =       0644,
2601                .proc_handler   =       proc_dointvec_jiffies,
2602        },
2603        {
2604                .procname       =       "gc_timeout",
2605                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2606                .maxlen         =       sizeof(int),
2607                .mode           =       0644,
2608                .proc_handler   =       proc_dointvec_jiffies,
2609        },
2610        {
2611                .procname       =       "gc_interval",
2612                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2613                .maxlen         =       sizeof(int),
2614                .mode           =       0644,
2615                .proc_handler   =       proc_dointvec_jiffies,
2616        },
2617        {
2618                .procname       =       "gc_elasticity",
2619                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2620                .maxlen         =       sizeof(int),
2621                .mode           =       0644,
2622                .proc_handler   =       proc_dointvec,
2623        },
2624        {
2625                .procname       =       "mtu_expires",
2626                .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2627                .maxlen         =       sizeof(int),
2628                .mode           =       0644,
2629                .proc_handler   =       proc_dointvec_jiffies,
2630        },
2631        {
2632                .procname       =       "min_adv_mss",
2633                .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2634                .maxlen         =       sizeof(int),
2635                .mode           =       0644,
2636                .proc_handler   =       proc_dointvec,
2637        },
2638        {
2639                .procname       =       "gc_min_interval_ms",
2640                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2641                .maxlen         =       sizeof(int),
2642                .mode           =       0644,
2643                .proc_handler   =       proc_dointvec_ms_jiffies,
2644        },
2645        { }
2646};
2647
2648struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2649{
2650        struct ctl_table *table;
2651
2652        table = kmemdup(ipv6_route_table_template,
2653                        sizeof(ipv6_route_table_template),
2654                        GFP_KERNEL);
2655
2656        if (table) {
2657                table[0].data = &net->ipv6.sysctl.flush_delay;
2658                table[0].extra1 = net;
2659                table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2660                table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2661                table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2662                table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2663                table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2664                table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2665                table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2666                table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2667                table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2668        }
2669
2670        return table;
2671}
2672#endif
2673
2674static int __net_init ip6_route_net_init(struct net *net)
2675{
2676        int ret = -ENOMEM;
2677
2678        memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2679               sizeof(net->ipv6.ip6_dst_ops));
2680
2681        if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2682                goto out_ip6_dst_ops;
2683
2684        net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2685                                           sizeof(*net->ipv6.ip6_null_entry),
2686                                           GFP_KERNEL);
2687        if (!net->ipv6.ip6_null_entry)
2688                goto out_ip6_dst_entries;
2689        net->ipv6.ip6_null_entry->dst.path =
2690                (struct dst_entry *)net->ipv6.ip6_null_entry;
2691        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2692        dst_metric_set(&net->ipv6.ip6_null_entry->dst, RTAX_HOPLIMIT, 255);
2693
2694#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2695        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2696                                               sizeof(*net->ipv6.ip6_prohibit_entry),
2697                                               GFP_KERNEL);
2698        if (!net->ipv6.ip6_prohibit_entry)
2699                goto out_ip6_null_entry;
2700        net->ipv6.ip6_prohibit_entry->dst.path =
2701                (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2702        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2703        dst_metric_set(&net->ipv6.ip6_prohibit_entry->dst, RTAX_HOPLIMIT, 255);
2704
2705        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2706                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
2707                                               GFP_KERNEL);
2708        if (!net->ipv6.ip6_blk_hole_entry)
2709                goto out_ip6_prohibit_entry;
2710        net->ipv6.ip6_blk_hole_entry->dst.path =
2711                (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2712        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2713        dst_metric_set(&net->ipv6.ip6_blk_hole_entry->dst, RTAX_HOPLIMIT, 255);
2714#endif
2715
2716        net->ipv6.sysctl.flush_delay = 0;
2717        net->ipv6.sysctl.ip6_rt_max_size = 4096;
2718        net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2719        net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2720        net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2721        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2722        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2723        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2724
2725#ifdef CONFIG_PROC_FS
2726        proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2727        proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2728#endif
2729        net->ipv6.ip6_rt_gc_expire = 30*HZ;
2730
2731        ret = 0;
2732out:
2733        return ret;
2734
2735#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2736out_ip6_prohibit_entry:
2737        kfree(net->ipv6.ip6_prohibit_entry);
2738out_ip6_null_entry:
2739        kfree(net->ipv6.ip6_null_entry);
2740#endif
2741out_ip6_dst_entries:
2742        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2743out_ip6_dst_ops:
2744        goto out;
2745}
2746
2747static void __net_exit ip6_route_net_exit(struct net *net)
2748{
2749#ifdef CONFIG_PROC_FS
2750        proc_net_remove(net, "ipv6_route");
2751        proc_net_remove(net, "rt6_stats");
2752#endif
2753        kfree(net->ipv6.ip6_null_entry);
2754#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2755        kfree(net->ipv6.ip6_prohibit_entry);
2756        kfree(net->ipv6.ip6_blk_hole_entry);
2757#endif
2758        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2759}
2760
2761static struct pernet_operations ip6_route_net_ops = {
2762        .init = ip6_route_net_init,
2763        .exit = ip6_route_net_exit,
2764};
2765
2766static struct notifier_block ip6_route_dev_notifier = {
2767        .notifier_call = ip6_route_dev_notify,
2768        .priority = 0,
2769};
2770
2771int __init ip6_route_init(void)
2772{
2773        int ret;
2774
2775        ret = -ENOMEM;
2776        ip6_dst_ops_template.kmem_cachep =
2777                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2778                                  SLAB_HWCACHE_ALIGN, NULL);
2779        if (!ip6_dst_ops_template.kmem_cachep)
2780                goto out;
2781
2782        ret = dst_entries_init(&ip6_dst_blackhole_ops);
2783        if (ret)
2784                goto out_kmem_cache;
2785
2786        ret = register_pernet_subsys(&ip6_route_net_ops);
2787        if (ret)
2788                goto out_dst_entries;
2789
2790        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2791
2792        /* Registering of the loopback is done before this portion of code,
2793         * the loopback reference in rt6_info will not be taken, do it
2794         * manually for init_net */
2795        init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2796        init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2797  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2798        init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2799        init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2800        init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2801        init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2802  #endif
2803        ret = fib6_init();
2804        if (ret)
2805                goto out_register_subsys;
2806
2807        ret = xfrm6_init();
2808        if (ret)
2809                goto out_fib6_init;
2810
2811        ret = fib6_rules_init();
2812        if (ret)
2813                goto xfrm6_init;
2814
2815        ret = -ENOBUFS;
2816        if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2817            __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2818            __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2819                goto fib6_rules_init;
2820
2821        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2822        if (ret)
2823                goto fib6_rules_init;
2824
2825out:
2826        return ret;
2827
2828fib6_rules_init:
2829        fib6_rules_cleanup();
2830xfrm6_init:
2831        xfrm6_fini();
2832out_fib6_init:
2833        fib6_gc_cleanup();
2834out_register_subsys:
2835        unregister_pernet_subsys(&ip6_route_net_ops);
2836out_dst_entries:
2837        dst_entries_destroy(&ip6_dst_blackhole_ops);
2838out_kmem_cache:
2839        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2840        goto out;
2841}
2842
2843void ip6_route_cleanup(void)
2844{
2845        unregister_netdevice_notifier(&ip6_route_dev_notifier);
2846        fib6_rules_cleanup();
2847        xfrm6_fini();
2848        fib6_gc_cleanup();
2849        unregister_pernet_subsys(&ip6_route_net_ops);
2850        dst_entries_destroy(&ip6_dst_blackhole_ops);
2851        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2852}
2853