linux/net/ipv6/route.c
<<
>>
Prefs
   1/*
   2 *      Linux INET6 implementation
   3 *      FIB front-end.
   4 *
   5 *      Authors:
   6 *      Pedro Roque             <roque@di.fc.ul.pt>
   7 *
   8 *      This program is free software; you can redistribute it and/or
   9 *      modify it under the terms of the GNU General Public License
  10 *      as published by the Free Software Foundation; either version
  11 *      2 of the License, or (at your option) any later version.
  12 */
  13
  14/*      Changes:
  15 *
  16 *      YOSHIFUJI Hideaki @USAGI
  17 *              reworked default router selection.
  18 *              - respect outgoing interface
  19 *              - select from (probably) reachable routers (i.e.
  20 *              routers in REACHABLE, STALE, DELAY or PROBE states).
  21 *              - always select the same router if it is (probably)
  22 *              reachable.  otherwise, round-robin the list.
  23 *      Ville Nuorvala
  24 *              Fixed routing subtrees.
  25 */
  26
  27#include <linux/capability.h>
  28#include <linux/errno.h>
  29#include <linux/types.h>
  30#include <linux/times.h>
  31#include <linux/socket.h>
  32#include <linux/sockios.h>
  33#include <linux/net.h>
  34#include <linux/route.h>
  35#include <linux/netdevice.h>
  36#include <linux/in6.h>
  37#include <linux/mroute6.h>
  38#include <linux/init.h>
  39#include <linux/if_arp.h>
  40#include <linux/proc_fs.h>
  41#include <linux/seq_file.h>
  42#include <linux/nsproxy.h>
  43#include <linux/slab.h>
  44#include <net/net_namespace.h>
  45#include <net/snmp.h>
  46#include <net/ipv6.h>
  47#include <net/ip6_fib.h>
  48#include <net/ip6_route.h>
  49#include <net/ndisc.h>
  50#include <net/addrconf.h>
  51#include <net/tcp.h>
  52#include <linux/rtnetlink.h>
  53#include <net/dst.h>
  54#include <net/xfrm.h>
  55#include <net/netevent.h>
  56#include <net/netlink.h>
  57
  58#include <asm/uaccess.h>
  59
  60#ifdef CONFIG_SYSCTL
  61#include <linux/sysctl.h>
  62#endif
  63
  64/* Set to 3 to get tracing. */
  65#define RT6_DEBUG 2
  66
  67#if RT6_DEBUG >= 3
  68#define RDBG(x) printk x
  69#define RT6_TRACE(x...) printk(KERN_DEBUG x)
  70#else
  71#define RDBG(x)
  72#define RT6_TRACE(x...) do { ; } while (0)
  73#endif
  74
  75static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
  76static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
  77static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
  78static unsigned int      ip6_default_mtu(const struct dst_entry *dst);
  79static struct dst_entry *ip6_negative_advice(struct dst_entry *);
  80static void             ip6_dst_destroy(struct dst_entry *);
  81static void             ip6_dst_ifdown(struct dst_entry *,
  82                                       struct net_device *dev, int how);
  83static int               ip6_dst_gc(struct dst_ops *ops);
  84
  85static int              ip6_pkt_discard(struct sk_buff *skb);
  86static int              ip6_pkt_discard_out(struct sk_buff *skb);
  87static void             ip6_link_failure(struct sk_buff *skb);
  88static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
  89
  90#ifdef CONFIG_IPV6_ROUTE_INFO
  91static struct rt6_info *rt6_add_route_info(struct net *net,
  92                                           const struct in6_addr *prefix, int prefixlen,
  93                                           const struct in6_addr *gwaddr, int ifindex,
  94                                           unsigned pref);
  95static struct rt6_info *rt6_get_route_info(struct net *net,
  96                                           const struct in6_addr *prefix, int prefixlen,
  97                                           const struct in6_addr *gwaddr, int ifindex);
  98#endif
  99
 100static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
 101{
 102        struct rt6_info *rt = (struct rt6_info *) dst;
 103        struct inet_peer *peer;
 104        u32 *p = NULL;
 105
 106        if (!rt->rt6i_peer)
 107                rt6_bind_peer(rt, 1);
 108
 109        peer = rt->rt6i_peer;
 110        if (peer) {
 111                u32 *old_p = __DST_METRICS_PTR(old);
 112                unsigned long prev, new;
 113
 114                p = peer->metrics;
 115                if (inet_metrics_new(peer))
 116                        memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
 117
 118                new = (unsigned long) p;
 119                prev = cmpxchg(&dst->_metrics, old, new);
 120
 121                if (prev != old) {
 122                        p = __DST_METRICS_PTR(prev);
 123                        if (prev & DST_METRICS_READ_ONLY)
 124                                p = NULL;
 125                }
 126        }
 127        return p;
 128}
 129
 130static struct dst_ops ip6_dst_ops_template = {
 131        .family                 =       AF_INET6,
 132        .protocol               =       cpu_to_be16(ETH_P_IPV6),
 133        .gc                     =       ip6_dst_gc,
 134        .gc_thresh              =       1024,
 135        .check                  =       ip6_dst_check,
 136        .default_advmss         =       ip6_default_advmss,
 137        .default_mtu            =       ip6_default_mtu,
 138        .cow_metrics            =       ipv6_cow_metrics,
 139        .destroy                =       ip6_dst_destroy,
 140        .ifdown                 =       ip6_dst_ifdown,
 141        .negative_advice        =       ip6_negative_advice,
 142        .link_failure           =       ip6_link_failure,
 143        .update_pmtu            =       ip6_rt_update_pmtu,
 144        .local_out              =       __ip6_local_out,
 145};
 146
 147static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
 148{
 149        return 0;
 150}
 151
 152static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
 153{
 154}
 155
 156static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
 157                                         unsigned long old)
 158{
 159        return NULL;
 160}
 161
 162static struct dst_ops ip6_dst_blackhole_ops = {
 163        .family                 =       AF_INET6,
 164        .protocol               =       cpu_to_be16(ETH_P_IPV6),
 165        .destroy                =       ip6_dst_destroy,
 166        .check                  =       ip6_dst_check,
 167        .default_mtu            =       ip6_blackhole_default_mtu,
 168        .default_advmss         =       ip6_default_advmss,
 169        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
 170        .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
 171};
 172
 173static const u32 ip6_template_metrics[RTAX_MAX] = {
 174        [RTAX_HOPLIMIT - 1] = 255,
 175};
 176
 177static struct rt6_info ip6_null_entry_template = {
 178        .dst = {
 179                .__refcnt       = ATOMIC_INIT(1),
 180                .__use          = 1,
 181                .obsolete       = -1,
 182                .error          = -ENETUNREACH,
 183                .input          = ip6_pkt_discard,
 184                .output         = ip6_pkt_discard_out,
 185        },
 186        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 187        .rt6i_protocol  = RTPROT_KERNEL,
 188        .rt6i_metric    = ~(u32) 0,
 189        .rt6i_ref       = ATOMIC_INIT(1),
 190};
 191
 192#ifdef CONFIG_IPV6_MULTIPLE_TABLES
 193
 194static int ip6_pkt_prohibit(struct sk_buff *skb);
 195static int ip6_pkt_prohibit_out(struct sk_buff *skb);
 196
 197static struct rt6_info ip6_prohibit_entry_template = {
 198        .dst = {
 199                .__refcnt       = ATOMIC_INIT(1),
 200                .__use          = 1,
 201                .obsolete       = -1,
 202                .error          = -EACCES,
 203                .input          = ip6_pkt_prohibit,
 204                .output         = ip6_pkt_prohibit_out,
 205        },
 206        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 207        .rt6i_protocol  = RTPROT_KERNEL,
 208        .rt6i_metric    = ~(u32) 0,
 209        .rt6i_ref       = ATOMIC_INIT(1),
 210};
 211
 212static struct rt6_info ip6_blk_hole_entry_template = {
 213        .dst = {
 214                .__refcnt       = ATOMIC_INIT(1),
 215                .__use          = 1,
 216                .obsolete       = -1,
 217                .error          = -EINVAL,
 218                .input          = dst_discard,
 219                .output         = dst_discard,
 220        },
 221        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 222        .rt6i_protocol  = RTPROT_KERNEL,
 223        .rt6i_metric    = ~(u32) 0,
 224        .rt6i_ref       = ATOMIC_INIT(1),
 225};
 226
 227#endif
 228
 229/* allocate dst with ip6_dst_ops */
 230static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
 231                                             struct net_device *dev,
 232                                             int flags)
 233{
 234        struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
 235
 236        memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
 237
 238        return rt;
 239}
 240
 241static void ip6_dst_destroy(struct dst_entry *dst)
 242{
 243        struct rt6_info *rt = (struct rt6_info *)dst;
 244        struct inet6_dev *idev = rt->rt6i_idev;
 245        struct inet_peer *peer = rt->rt6i_peer;
 246
 247        if (idev != NULL) {
 248                rt->rt6i_idev = NULL;
 249                in6_dev_put(idev);
 250        }
 251        if (peer) {
 252                rt->rt6i_peer = NULL;
 253                inet_putpeer(peer);
 254        }
 255}
 256
 257static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
 258
 259static u32 rt6_peer_genid(void)
 260{
 261        return atomic_read(&__rt6_peer_genid);
 262}
 263
 264void rt6_bind_peer(struct rt6_info *rt, int create)
 265{
 266        struct inet_peer *peer;
 267
 268        peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
 269        if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
 270                inet_putpeer(peer);
 271        else
 272                rt->rt6i_peer_genid = rt6_peer_genid();
 273}
 274
 275static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 276                           int how)
 277{
 278        struct rt6_info *rt = (struct rt6_info *)dst;
 279        struct inet6_dev *idev = rt->rt6i_idev;
 280        struct net_device *loopback_dev =
 281                dev_net(dev)->loopback_dev;
 282
 283        if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
 284                struct inet6_dev *loopback_idev =
 285                        in6_dev_get(loopback_dev);
 286                if (loopback_idev != NULL) {
 287                        rt->rt6i_idev = loopback_idev;
 288                        in6_dev_put(idev);
 289                }
 290        }
 291}
 292
 293static __inline__ int rt6_check_expired(const struct rt6_info *rt)
 294{
 295        return (rt->rt6i_flags & RTF_EXPIRES) &&
 296                time_after(jiffies, rt->rt6i_expires);
 297}
 298
 299static inline int rt6_need_strict(const struct in6_addr *daddr)
 300{
 301        return ipv6_addr_type(daddr) &
 302                (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 303}
 304
 305/*
 306 *      Route lookup. Any table->tb6_lock is implied.
 307 */
 308
 309static inline struct rt6_info *rt6_device_match(struct net *net,
 310                                                    struct rt6_info *rt,
 311                                                    const struct in6_addr *saddr,
 312                                                    int oif,
 313                                                    int flags)
 314{
 315        struct rt6_info *local = NULL;
 316        struct rt6_info *sprt;
 317
 318        if (!oif && ipv6_addr_any(saddr))
 319                goto out;
 320
 321        for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
 322                struct net_device *dev = sprt->rt6i_dev;
 323
 324                if (oif) {
 325                        if (dev->ifindex == oif)
 326                                return sprt;
 327                        if (dev->flags & IFF_LOOPBACK) {
 328                                if (sprt->rt6i_idev == NULL ||
 329                                    sprt->rt6i_idev->dev->ifindex != oif) {
 330                                        if (flags & RT6_LOOKUP_F_IFACE && oif)
 331                                                continue;
 332                                        if (local && (!oif ||
 333                                                      local->rt6i_idev->dev->ifindex == oif))
 334                                                continue;
 335                                }
 336                                local = sprt;
 337                        }
 338                } else {
 339                        if (ipv6_chk_addr(net, saddr, dev,
 340                                          flags & RT6_LOOKUP_F_IFACE))
 341                                return sprt;
 342                }
 343        }
 344
 345        if (oif) {
 346                if (local)
 347                        return local;
 348
 349                if (flags & RT6_LOOKUP_F_IFACE)
 350                        return net->ipv6.ip6_null_entry;
 351        }
 352out:
 353        return rt;
 354}
 355
 356#ifdef CONFIG_IPV6_ROUTER_PREF
 357static void rt6_probe(struct rt6_info *rt)
 358{
 359        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
 360        /*
 361         * Okay, this does not seem to be appropriate
 362         * for now, however, we need to check if it
 363         * is really so; aka Router Reachability Probing.
 364         *
 365         * Router Reachability Probe MUST be rate-limited
 366         * to no more than one per minute.
 367         */
 368        if (!neigh || (neigh->nud_state & NUD_VALID))
 369                return;
 370        read_lock_bh(&neigh->lock);
 371        if (!(neigh->nud_state & NUD_VALID) &&
 372            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
 373                struct in6_addr mcaddr;
 374                struct in6_addr *target;
 375
 376                neigh->updated = jiffies;
 377                read_unlock_bh(&neigh->lock);
 378
 379                target = (struct in6_addr *)&neigh->primary_key;
 380                addrconf_addr_solict_mult(target, &mcaddr);
 381                ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
 382        } else
 383                read_unlock_bh(&neigh->lock);
 384}
 385#else
 386static inline void rt6_probe(struct rt6_info *rt)
 387{
 388}
 389#endif
 390
 391/*
 392 * Default Router Selection (RFC 2461 6.3.6)
 393 */
 394static inline int rt6_check_dev(struct rt6_info *rt, int oif)
 395{
 396        struct net_device *dev = rt->rt6i_dev;
 397        if (!oif || dev->ifindex == oif)
 398                return 2;
 399        if ((dev->flags & IFF_LOOPBACK) &&
 400            rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
 401                return 1;
 402        return 0;
 403}
 404
 405static inline int rt6_check_neigh(struct rt6_info *rt)
 406{
 407        struct neighbour *neigh = rt->rt6i_nexthop;
 408        int m;
 409        if (rt->rt6i_flags & RTF_NONEXTHOP ||
 410            !(rt->rt6i_flags & RTF_GATEWAY))
 411                m = 1;
 412        else if (neigh) {
 413                read_lock_bh(&neigh->lock);
 414                if (neigh->nud_state & NUD_VALID)
 415                        m = 2;
 416#ifdef CONFIG_IPV6_ROUTER_PREF
 417                else if (neigh->nud_state & NUD_FAILED)
 418                        m = 0;
 419#endif
 420                else
 421                        m = 1;
 422                read_unlock_bh(&neigh->lock);
 423        } else
 424                m = 0;
 425        return m;
 426}
 427
 428static int rt6_score_route(struct rt6_info *rt, int oif,
 429                           int strict)
 430{
 431        int m, n;
 432
 433        m = rt6_check_dev(rt, oif);
 434        if (!m && (strict & RT6_LOOKUP_F_IFACE))
 435                return -1;
 436#ifdef CONFIG_IPV6_ROUTER_PREF
 437        m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
 438#endif
 439        n = rt6_check_neigh(rt);
 440        if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
 441                return -1;
 442        return m;
 443}
 444
 445static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 446                                   int *mpri, struct rt6_info *match)
 447{
 448        int m;
 449
 450        if (rt6_check_expired(rt))
 451                goto out;
 452
 453        m = rt6_score_route(rt, oif, strict);
 454        if (m < 0)
 455                goto out;
 456
 457        if (m > *mpri) {
 458                if (strict & RT6_LOOKUP_F_REACHABLE)
 459                        rt6_probe(match);
 460                *mpri = m;
 461                match = rt;
 462        } else if (strict & RT6_LOOKUP_F_REACHABLE) {
 463                rt6_probe(rt);
 464        }
 465
 466out:
 467        return match;
 468}
 469
 470static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 471                                     struct rt6_info *rr_head,
 472                                     u32 metric, int oif, int strict)
 473{
 474        struct rt6_info *rt, *match;
 475        int mpri = -1;
 476
 477        match = NULL;
 478        for (rt = rr_head; rt && rt->rt6i_metric == metric;
 479             rt = rt->dst.rt6_next)
 480                match = find_match(rt, oif, strict, &mpri, match);
 481        for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
 482             rt = rt->dst.rt6_next)
 483                match = find_match(rt, oif, strict, &mpri, match);
 484
 485        return match;
 486}
 487
 488static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
 489{
 490        struct rt6_info *match, *rt0;
 491        struct net *net;
 492
 493        RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
 494                  __func__, fn->leaf, oif);
 495
 496        rt0 = fn->rr_ptr;
 497        if (!rt0)
 498                fn->rr_ptr = rt0 = fn->leaf;
 499
 500        match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
 501
 502        if (!match &&
 503            (strict & RT6_LOOKUP_F_REACHABLE)) {
 504                struct rt6_info *next = rt0->dst.rt6_next;
 505
 506                /* no entries matched; do round-robin */
 507                if (!next || next->rt6i_metric != rt0->rt6i_metric)
 508                        next = fn->leaf;
 509
 510                if (next != rt0)
 511                        fn->rr_ptr = next;
 512        }
 513
 514        RT6_TRACE("%s() => %p\n",
 515                  __func__, match);
 516
 517        net = dev_net(rt0->rt6i_dev);
 518        return match ? match : net->ipv6.ip6_null_entry;
 519}
 520
 521#ifdef CONFIG_IPV6_ROUTE_INFO
 522int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 523                  const struct in6_addr *gwaddr)
 524{
 525        struct net *net = dev_net(dev);
 526        struct route_info *rinfo = (struct route_info *) opt;
 527        struct in6_addr prefix_buf, *prefix;
 528        unsigned int pref;
 529        unsigned long lifetime;
 530        struct rt6_info *rt;
 531
 532        if (len < sizeof(struct route_info)) {
 533                return -EINVAL;
 534        }
 535
 536        /* Sanity check for prefix_len and length */
 537        if (rinfo->length > 3) {
 538                return -EINVAL;
 539        } else if (rinfo->prefix_len > 128) {
 540                return -EINVAL;
 541        } else if (rinfo->prefix_len > 64) {
 542                if (rinfo->length < 2) {
 543                        return -EINVAL;
 544                }
 545        } else if (rinfo->prefix_len > 0) {
 546                if (rinfo->length < 1) {
 547                        return -EINVAL;
 548                }
 549        }
 550
 551        pref = rinfo->route_pref;
 552        if (pref == ICMPV6_ROUTER_PREF_INVALID)
 553                return -EINVAL;
 554
 555        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
 556
 557        if (rinfo->length == 3)
 558                prefix = (struct in6_addr *)rinfo->prefix;
 559        else {
 560                /* this function is safe */
 561                ipv6_addr_prefix(&prefix_buf,
 562                                 (struct in6_addr *)rinfo->prefix,
 563                                 rinfo->prefix_len);
 564                prefix = &prefix_buf;
 565        }
 566
 567        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
 568                                dev->ifindex);
 569
 570        if (rt && !lifetime) {
 571                ip6_del_rt(rt);
 572                rt = NULL;
 573        }
 574
 575        if (!rt && lifetime)
 576                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
 577                                        pref);
 578        else if (rt)
 579                rt->rt6i_flags = RTF_ROUTEINFO |
 580                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 581
 582        if (rt) {
 583                if (!addrconf_finite_timeout(lifetime)) {
 584                        rt->rt6i_flags &= ~RTF_EXPIRES;
 585                } else {
 586                        rt->rt6i_expires = jiffies + HZ * lifetime;
 587                        rt->rt6i_flags |= RTF_EXPIRES;
 588                }
 589                dst_release(&rt->dst);
 590        }
 591        return 0;
 592}
 593#endif
 594
 595#define BACKTRACK(__net, saddr)                 \
 596do { \
 597        if (rt == __net->ipv6.ip6_null_entry) { \
 598                struct fib6_node *pn; \
 599                while (1) { \
 600                        if (fn->fn_flags & RTN_TL_ROOT) \
 601                                goto out; \
 602                        pn = fn->parent; \
 603                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
 604                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
 605                        else \
 606                                fn = pn; \
 607                        if (fn->fn_flags & RTN_RTINFO) \
 608                                goto restart; \
 609                } \
 610        } \
 611} while(0)
 612
 613static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 614                                             struct fib6_table *table,
 615                                             struct flowi6 *fl6, int flags)
 616{
 617        struct fib6_node *fn;
 618        struct rt6_info *rt;
 619
 620        read_lock_bh(&table->tb6_lock);
 621        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 622restart:
 623        rt = fn->leaf;
 624        rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
 625        BACKTRACK(net, &fl6->saddr);
 626out:
 627        dst_use(&rt->dst, jiffies);
 628        read_unlock_bh(&table->tb6_lock);
 629        return rt;
 630
 631}
 632
 633struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
 634                            const struct in6_addr *saddr, int oif, int strict)
 635{
 636        struct flowi6 fl6 = {
 637                .flowi6_oif = oif,
 638                .daddr = *daddr,
 639        };
 640        struct dst_entry *dst;
 641        int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
 642
 643        if (saddr) {
 644                memcpy(&fl6.saddr, saddr, sizeof(*saddr));
 645                flags |= RT6_LOOKUP_F_HAS_SADDR;
 646        }
 647
 648        dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
 649        if (dst->error == 0)
 650                return (struct rt6_info *) dst;
 651
 652        dst_release(dst);
 653
 654        return NULL;
 655}
 656
 657EXPORT_SYMBOL(rt6_lookup);
 658
 659/* ip6_ins_rt is called with FREE table->tb6_lock.
 660   It takes new route entry, the addition fails by any reason the
 661   route is freed. In any case, if caller does not hold it, it may
 662   be destroyed.
 663 */
 664
 665static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
 666{
 667        int err;
 668        struct fib6_table *table;
 669
 670        table = rt->rt6i_table;
 671        write_lock_bh(&table->tb6_lock);
 672        err = fib6_add(&table->tb6_root, rt, info);
 673        write_unlock_bh(&table->tb6_lock);
 674
 675        return err;
 676}
 677
 678int ip6_ins_rt(struct rt6_info *rt)
 679{
 680        struct nl_info info = {
 681                .nl_net = dev_net(rt->rt6i_dev),
 682        };
 683        return __ip6_ins_rt(rt, &info);
 684}
 685
 686static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, const struct in6_addr *daddr,
 687                                      const struct in6_addr *saddr)
 688{
 689        struct rt6_info *rt;
 690
 691        /*
 692         *      Clone the route.
 693         */
 694
 695        rt = ip6_rt_copy(ort);
 696
 697        if (rt) {
 698                struct neighbour *neigh;
 699                int attempts = !in_softirq();
 700
 701                if (!(rt->rt6i_flags&RTF_GATEWAY)) {
 702                        if (rt->rt6i_dst.plen != 128 &&
 703                            ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
 704                                rt->rt6i_flags |= RTF_ANYCAST;
 705                        ipv6_addr_copy(&rt->rt6i_gateway, daddr);
 706                }
 707
 708                ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
 709                rt->rt6i_dst.plen = 128;
 710                rt->rt6i_flags |= RTF_CACHE;
 711                rt->dst.flags |= DST_HOST;
 712
 713#ifdef CONFIG_IPV6_SUBTREES
 714                if (rt->rt6i_src.plen && saddr) {
 715                        ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
 716                        rt->rt6i_src.plen = 128;
 717                }
 718#endif
 719
 720        retry:
 721                neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
 722                if (IS_ERR(neigh)) {
 723                        struct net *net = dev_net(rt->rt6i_dev);
 724                        int saved_rt_min_interval =
 725                                net->ipv6.sysctl.ip6_rt_gc_min_interval;
 726                        int saved_rt_elasticity =
 727                                net->ipv6.sysctl.ip6_rt_gc_elasticity;
 728
 729                        if (attempts-- > 0) {
 730                                net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
 731                                net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
 732
 733                                ip6_dst_gc(&net->ipv6.ip6_dst_ops);
 734
 735                                net->ipv6.sysctl.ip6_rt_gc_elasticity =
 736                                        saved_rt_elasticity;
 737                                net->ipv6.sysctl.ip6_rt_gc_min_interval =
 738                                        saved_rt_min_interval;
 739                                goto retry;
 740                        }
 741
 742                        if (net_ratelimit())
 743                                printk(KERN_WARNING
 744                                       "ipv6: Neighbour table overflow.\n");
 745                        dst_free(&rt->dst);
 746                        return NULL;
 747                }
 748                rt->rt6i_nexthop = neigh;
 749
 750        }
 751
 752        return rt;
 753}
 754
 755static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, const struct in6_addr *daddr)
 756{
 757        struct rt6_info *rt = ip6_rt_copy(ort);
 758        if (rt) {
 759                ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
 760                rt->rt6i_dst.plen = 128;
 761                rt->rt6i_flags |= RTF_CACHE;
 762                rt->dst.flags |= DST_HOST;
 763                rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
 764        }
 765        return rt;
 766}
 767
 768static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
 769                                      struct flowi6 *fl6, int flags)
 770{
 771        struct fib6_node *fn;
 772        struct rt6_info *rt, *nrt;
 773        int strict = 0;
 774        int attempts = 3;
 775        int err;
 776        int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
 777
 778        strict |= flags & RT6_LOOKUP_F_IFACE;
 779
 780relookup:
 781        read_lock_bh(&table->tb6_lock);
 782
 783restart_2:
 784        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 785
 786restart:
 787        rt = rt6_select(fn, oif, strict | reachable);
 788
 789        BACKTRACK(net, &fl6->saddr);
 790        if (rt == net->ipv6.ip6_null_entry ||
 791            rt->rt6i_flags & RTF_CACHE)
 792                goto out;
 793
 794        dst_hold(&rt->dst);
 795        read_unlock_bh(&table->tb6_lock);
 796
 797        if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
 798                nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
 799        else if (!(rt->dst.flags & DST_HOST))
 800                nrt = rt6_alloc_clone(rt, &fl6->daddr);
 801        else
 802                goto out2;
 803
 804        dst_release(&rt->dst);
 805        rt = nrt ? : net->ipv6.ip6_null_entry;
 806
 807        dst_hold(&rt->dst);
 808        if (nrt) {
 809                err = ip6_ins_rt(nrt);
 810                if (!err)
 811                        goto out2;
 812        }
 813
 814        if (--attempts <= 0)
 815                goto out2;
 816
 817        /*
 818         * Race condition! In the gap, when table->tb6_lock was
 819         * released someone could insert this route.  Relookup.
 820         */
 821        dst_release(&rt->dst);
 822        goto relookup;
 823
 824out:
 825        if (reachable) {
 826                reachable = 0;
 827                goto restart_2;
 828        }
 829        dst_hold(&rt->dst);
 830        read_unlock_bh(&table->tb6_lock);
 831out2:
 832        rt->dst.lastuse = jiffies;
 833        rt->dst.__use++;
 834
 835        return rt;
 836}
 837
 838static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
 839                                            struct flowi6 *fl6, int flags)
 840{
 841        return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
 842}
 843
 844void ip6_route_input(struct sk_buff *skb)
 845{
 846        const struct ipv6hdr *iph = ipv6_hdr(skb);
 847        struct net *net = dev_net(skb->dev);
 848        int flags = RT6_LOOKUP_F_HAS_SADDR;
 849        struct flowi6 fl6 = {
 850                .flowi6_iif = skb->dev->ifindex,
 851                .daddr = iph->daddr,
 852                .saddr = iph->saddr,
 853                .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
 854                .flowi6_mark = skb->mark,
 855                .flowi6_proto = iph->nexthdr,
 856        };
 857
 858        if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
 859                flags |= RT6_LOOKUP_F_IFACE;
 860
 861        skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
 862}
 863
 864static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
 865                                             struct flowi6 *fl6, int flags)
 866{
 867        return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
 868}
 869
 870struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
 871                                    struct flowi6 *fl6)
 872{
 873        int flags = 0;
 874
 875        if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
 876                flags |= RT6_LOOKUP_F_IFACE;
 877
 878        if (!ipv6_addr_any(&fl6->saddr))
 879                flags |= RT6_LOOKUP_F_HAS_SADDR;
 880        else if (sk)
 881                flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
 882
 883        return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
 884}
 885
 886EXPORT_SYMBOL(ip6_route_output);
 887
 888struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
 889{
 890        struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
 891        struct dst_entry *new = NULL;
 892
 893        rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
 894        if (rt) {
 895                memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
 896
 897                new = &rt->dst;
 898
 899                new->__use = 1;
 900                new->input = dst_discard;
 901                new->output = dst_discard;
 902
 903                dst_copy_metrics(new, &ort->dst);
 904                rt->rt6i_idev = ort->rt6i_idev;
 905                if (rt->rt6i_idev)
 906                        in6_dev_hold(rt->rt6i_idev);
 907                rt->rt6i_expires = 0;
 908
 909                ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
 910                rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
 911                rt->rt6i_metric = 0;
 912
 913                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
 914#ifdef CONFIG_IPV6_SUBTREES
 915                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
 916#endif
 917
 918                dst_free(new);
 919        }
 920
 921        dst_release(dst_orig);
 922        return new ? new : ERR_PTR(-ENOMEM);
 923}
 924
 925/*
 926 *      Destination cache support functions
 927 */
 928
 929static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 930{
 931        struct rt6_info *rt;
 932
 933        rt = (struct rt6_info *) dst;
 934
 935        if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
 936                if (rt->rt6i_peer_genid != rt6_peer_genid()) {
 937                        if (!rt->rt6i_peer)
 938                                rt6_bind_peer(rt, 0);
 939                        rt->rt6i_peer_genid = rt6_peer_genid();
 940                }
 941                return dst;
 942        }
 943        return NULL;
 944}
 945
 946static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
 947{
 948        struct rt6_info *rt = (struct rt6_info *) dst;
 949
 950        if (rt) {
 951                if (rt->rt6i_flags & RTF_CACHE) {
 952                        if (rt6_check_expired(rt)) {
 953                                ip6_del_rt(rt);
 954                                dst = NULL;
 955                        }
 956                } else {
 957                        dst_release(dst);
 958                        dst = NULL;
 959                }
 960        }
 961        return dst;
 962}
 963
 964static void ip6_link_failure(struct sk_buff *skb)
 965{
 966        struct rt6_info *rt;
 967
 968        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
 969
 970        rt = (struct rt6_info *) skb_dst(skb);
 971        if (rt) {
 972                if (rt->rt6i_flags&RTF_CACHE) {
 973                        dst_set_expires(&rt->dst, 0);
 974                        rt->rt6i_flags |= RTF_EXPIRES;
 975                } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
 976                        rt->rt6i_node->fn_sernum = -1;
 977        }
 978}
 979
 980static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 981{
 982        struct rt6_info *rt6 = (struct rt6_info*)dst;
 983
 984        if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
 985                rt6->rt6i_flags |= RTF_MODIFIED;
 986                if (mtu < IPV6_MIN_MTU) {
 987                        u32 features = dst_metric(dst, RTAX_FEATURES);
 988                        mtu = IPV6_MIN_MTU;
 989                        features |= RTAX_FEATURE_ALLFRAG;
 990                        dst_metric_set(dst, RTAX_FEATURES, features);
 991                }
 992                dst_metric_set(dst, RTAX_MTU, mtu);
 993        }
 994}
 995
 996static unsigned int ip6_default_advmss(const struct dst_entry *dst)
 997{
 998        struct net_device *dev = dst->dev;
 999        unsigned int mtu = dst_mtu(dst);
1000        struct net *net = dev_net(dev);
1001
1002        mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1003
1004        if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1005                mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1006
1007        /*
1008         * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1009         * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1010         * IPV6_MAXPLEN is also valid and means: "any MSS,
1011         * rely only on pmtu discovery"
1012         */
1013        if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1014                mtu = IPV6_MAXPLEN;
1015        return mtu;
1016}
1017
1018static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1019{
1020        unsigned int mtu = IPV6_MIN_MTU;
1021        struct inet6_dev *idev;
1022
1023        rcu_read_lock();
1024        idev = __in6_dev_get(dst->dev);
1025        if (idev)
1026                mtu = idev->cnf.mtu6;
1027        rcu_read_unlock();
1028
1029        return mtu;
1030}
1031
1032static struct dst_entry *icmp6_dst_gc_list;
1033static DEFINE_SPINLOCK(icmp6_dst_lock);
1034
1035struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1036                                  struct neighbour *neigh,
1037                                  const struct in6_addr *addr)
1038{
1039        struct rt6_info *rt;
1040        struct inet6_dev *idev = in6_dev_get(dev);
1041        struct net *net = dev_net(dev);
1042
1043        if (unlikely(idev == NULL))
1044                return NULL;
1045
1046        rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1047        if (unlikely(rt == NULL)) {
1048                in6_dev_put(idev);
1049                goto out;
1050        }
1051
1052        if (neigh)
1053                neigh_hold(neigh);
1054        else {
1055                neigh = ndisc_get_neigh(dev, addr);
1056                if (IS_ERR(neigh))
1057                        neigh = NULL;
1058        }
1059
1060        rt->rt6i_idev     = idev;
1061        rt->rt6i_nexthop  = neigh;
1062        atomic_set(&rt->dst.__refcnt, 1);
1063        dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1064        rt->dst.output  = ip6_output;
1065
1066        spin_lock_bh(&icmp6_dst_lock);
1067        rt->dst.next = icmp6_dst_gc_list;
1068        icmp6_dst_gc_list = &rt->dst;
1069        spin_unlock_bh(&icmp6_dst_lock);
1070
1071        fib6_force_start_gc(net);
1072
1073out:
1074        return &rt->dst;
1075}
1076
1077int icmp6_dst_gc(void)
1078{
1079        struct dst_entry *dst, **pprev;
1080        int more = 0;
1081
1082        spin_lock_bh(&icmp6_dst_lock);
1083        pprev = &icmp6_dst_gc_list;
1084
1085        while ((dst = *pprev) != NULL) {
1086                if (!atomic_read(&dst->__refcnt)) {
1087                        *pprev = dst->next;
1088                        dst_free(dst);
1089                } else {
1090                        pprev = &dst->next;
1091                        ++more;
1092                }
1093        }
1094
1095        spin_unlock_bh(&icmp6_dst_lock);
1096
1097        return more;
1098}
1099
1100static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1101                            void *arg)
1102{
1103        struct dst_entry *dst, **pprev;
1104
1105        spin_lock_bh(&icmp6_dst_lock);
1106        pprev = &icmp6_dst_gc_list;
1107        while ((dst = *pprev) != NULL) {
1108                struct rt6_info *rt = (struct rt6_info *) dst;
1109                if (func(rt, arg)) {
1110                        *pprev = dst->next;
1111                        dst_free(dst);
1112                } else {
1113                        pprev = &dst->next;
1114                }
1115        }
1116        spin_unlock_bh(&icmp6_dst_lock);
1117}
1118
1119static int ip6_dst_gc(struct dst_ops *ops)
1120{
1121        unsigned long now = jiffies;
1122        struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1123        int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1124        int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1125        int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1126        int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1127        unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1128        int entries;
1129
1130        entries = dst_entries_get_fast(ops);
1131        if (time_after(rt_last_gc + rt_min_interval, now) &&
1132            entries <= rt_max_size)
1133                goto out;
1134
1135        net->ipv6.ip6_rt_gc_expire++;
1136        fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1137        net->ipv6.ip6_rt_last_gc = now;
1138        entries = dst_entries_get_slow(ops);
1139        if (entries < ops->gc_thresh)
1140                net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1141out:
1142        net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1143        return entries > rt_max_size;
1144}
1145
1146/* Clean host part of a prefix. Not necessary in radix tree,
1147   but results in cleaner routing tables.
1148
1149   Remove it only when all the things will work!
1150 */
1151
1152int ip6_dst_hoplimit(struct dst_entry *dst)
1153{
1154        int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1155        if (hoplimit == 0) {
1156                struct net_device *dev = dst->dev;
1157                struct inet6_dev *idev;
1158
1159                rcu_read_lock();
1160                idev = __in6_dev_get(dev);
1161                if (idev)
1162                        hoplimit = idev->cnf.hop_limit;
1163                else
1164                        hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1165                rcu_read_unlock();
1166        }
1167        return hoplimit;
1168}
1169EXPORT_SYMBOL(ip6_dst_hoplimit);
1170
1171/*
1172 *
1173 */
1174
1175int ip6_route_add(struct fib6_config *cfg)
1176{
1177        int err;
1178        struct net *net = cfg->fc_nlinfo.nl_net;
1179        struct rt6_info *rt = NULL;
1180        struct net_device *dev = NULL;
1181        struct inet6_dev *idev = NULL;
1182        struct fib6_table *table;
1183        int addr_type;
1184
1185        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1186                return -EINVAL;
1187#ifndef CONFIG_IPV6_SUBTREES
1188        if (cfg->fc_src_len)
1189                return -EINVAL;
1190#endif
1191        if (cfg->fc_ifindex) {
1192                err = -ENODEV;
1193                dev = dev_get_by_index(net, cfg->fc_ifindex);
1194                if (!dev)
1195                        goto out;
1196                idev = in6_dev_get(dev);
1197                if (!idev)
1198                        goto out;
1199        }
1200
1201        if (cfg->fc_metric == 0)
1202                cfg->fc_metric = IP6_RT_PRIO_USER;
1203
1204        table = fib6_new_table(net, cfg->fc_table);
1205        if (table == NULL) {
1206                err = -ENOBUFS;
1207                goto out;
1208        }
1209
1210        rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1211
1212        if (rt == NULL) {
1213                err = -ENOMEM;
1214                goto out;
1215        }
1216
1217        rt->dst.obsolete = -1;
1218        rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1219                                jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1220                                0;
1221
1222        if (cfg->fc_protocol == RTPROT_UNSPEC)
1223                cfg->fc_protocol = RTPROT_BOOT;
1224        rt->rt6i_protocol = cfg->fc_protocol;
1225
1226        addr_type = ipv6_addr_type(&cfg->fc_dst);
1227
1228        if (addr_type & IPV6_ADDR_MULTICAST)
1229                rt->dst.input = ip6_mc_input;
1230        else if (cfg->fc_flags & RTF_LOCAL)
1231                rt->dst.input = ip6_input;
1232        else
1233                rt->dst.input = ip6_forward;
1234
1235        rt->dst.output = ip6_output;
1236
1237        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1238        rt->rt6i_dst.plen = cfg->fc_dst_len;
1239        if (rt->rt6i_dst.plen == 128)
1240               rt->dst.flags |= DST_HOST;
1241
1242#ifdef CONFIG_IPV6_SUBTREES
1243        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1244        rt->rt6i_src.plen = cfg->fc_src_len;
1245#endif
1246
1247        rt->rt6i_metric = cfg->fc_metric;
1248
1249        /* We cannot add true routes via loopback here,
1250           they would result in kernel looping; promote them to reject routes
1251         */
1252        if ((cfg->fc_flags & RTF_REJECT) ||
1253            (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1254                                              && !(cfg->fc_flags&RTF_LOCAL))) {
1255                /* hold loopback dev/idev if we haven't done so. */
1256                if (dev != net->loopback_dev) {
1257                        if (dev) {
1258                                dev_put(dev);
1259                                in6_dev_put(idev);
1260                        }
1261                        dev = net->loopback_dev;
1262                        dev_hold(dev);
1263                        idev = in6_dev_get(dev);
1264                        if (!idev) {
1265                                err = -ENODEV;
1266                                goto out;
1267                        }
1268                }
1269                rt->dst.output = ip6_pkt_discard_out;
1270                rt->dst.input = ip6_pkt_discard;
1271                rt->dst.error = -ENETUNREACH;
1272                rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1273                goto install_route;
1274        }
1275
1276        if (cfg->fc_flags & RTF_GATEWAY) {
1277                const struct in6_addr *gw_addr;
1278                int gwa_type;
1279
1280                gw_addr = &cfg->fc_gateway;
1281                ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1282                gwa_type = ipv6_addr_type(gw_addr);
1283
1284                if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1285                        struct rt6_info *grt;
1286
1287                        /* IPv6 strictly inhibits using not link-local
1288                           addresses as nexthop address.
1289                           Otherwise, router will not able to send redirects.
1290                           It is very good, but in some (rare!) circumstances
1291                           (SIT, PtP, NBMA NOARP links) it is handy to allow
1292                           some exceptions. --ANK
1293                         */
1294                        err = -EINVAL;
1295                        if (!(gwa_type&IPV6_ADDR_UNICAST))
1296                                goto out;
1297
1298                        grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1299
1300                        err = -EHOSTUNREACH;
1301                        if (grt == NULL)
1302                                goto out;
1303                        if (dev) {
1304                                if (dev != grt->rt6i_dev) {
1305                                        dst_release(&grt->dst);
1306                                        goto out;
1307                                }
1308                        } else {
1309                                dev = grt->rt6i_dev;
1310                                idev = grt->rt6i_idev;
1311                                dev_hold(dev);
1312                                in6_dev_hold(grt->rt6i_idev);
1313                        }
1314                        if (!(grt->rt6i_flags&RTF_GATEWAY))
1315                                err = 0;
1316                        dst_release(&grt->dst);
1317
1318                        if (err)
1319                                goto out;
1320                }
1321                err = -EINVAL;
1322                if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1323                        goto out;
1324        }
1325
1326        err = -ENODEV;
1327        if (dev == NULL)
1328                goto out;
1329
1330        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1331                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1332                        err = -EINVAL;
1333                        goto out;
1334                }
1335                ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1336                rt->rt6i_prefsrc.plen = 128;
1337        } else
1338                rt->rt6i_prefsrc.plen = 0;
1339
1340        if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1341                rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1342                if (IS_ERR(rt->rt6i_nexthop)) {
1343                        err = PTR_ERR(rt->rt6i_nexthop);
1344                        rt->rt6i_nexthop = NULL;
1345                        goto out;
1346                }
1347        }
1348
1349        rt->rt6i_flags = cfg->fc_flags;
1350
1351install_route:
1352        if (cfg->fc_mx) {
1353                struct nlattr *nla;
1354                int remaining;
1355
1356                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1357                        int type = nla_type(nla);
1358
1359                        if (type) {
1360                                if (type > RTAX_MAX) {
1361                                        err = -EINVAL;
1362                                        goto out;
1363                                }
1364
1365                                dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1366                        }
1367                }
1368        }
1369
1370        rt->dst.dev = dev;
1371        rt->rt6i_idev = idev;
1372        rt->rt6i_table = table;
1373
1374        cfg->fc_nlinfo.nl_net = dev_net(dev);
1375
1376        return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1377
1378out:
1379        if (dev)
1380                dev_put(dev);
1381        if (idev)
1382                in6_dev_put(idev);
1383        if (rt)
1384                dst_free(&rt->dst);
1385        return err;
1386}
1387
1388static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1389{
1390        int err;
1391        struct fib6_table *table;
1392        struct net *net = dev_net(rt->rt6i_dev);
1393
1394        if (rt == net->ipv6.ip6_null_entry)
1395                return -ENOENT;
1396
1397        table = rt->rt6i_table;
1398        write_lock_bh(&table->tb6_lock);
1399
1400        err = fib6_del(rt, info);
1401        dst_release(&rt->dst);
1402
1403        write_unlock_bh(&table->tb6_lock);
1404
1405        return err;
1406}
1407
1408int ip6_del_rt(struct rt6_info *rt)
1409{
1410        struct nl_info info = {
1411                .nl_net = dev_net(rt->rt6i_dev),
1412        };
1413        return __ip6_del_rt(rt, &info);
1414}
1415
1416static int ip6_route_del(struct fib6_config *cfg)
1417{
1418        struct fib6_table *table;
1419        struct fib6_node *fn;
1420        struct rt6_info *rt;
1421        int err = -ESRCH;
1422
1423        table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1424        if (table == NULL)
1425                return err;
1426
1427        read_lock_bh(&table->tb6_lock);
1428
1429        fn = fib6_locate(&table->tb6_root,
1430                         &cfg->fc_dst, cfg->fc_dst_len,
1431                         &cfg->fc_src, cfg->fc_src_len);
1432
1433        if (fn) {
1434                for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1435                        if (cfg->fc_ifindex &&
1436                            (rt->rt6i_dev == NULL ||
1437                             rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1438                                continue;
1439                        if (cfg->fc_flags & RTF_GATEWAY &&
1440                            !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1441                                continue;
1442                        if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1443                                continue;
1444                        dst_hold(&rt->dst);
1445                        read_unlock_bh(&table->tb6_lock);
1446
1447                        return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1448                }
1449        }
1450        read_unlock_bh(&table->tb6_lock);
1451
1452        return err;
1453}
1454
1455/*
1456 *      Handle redirects
1457 */
1458struct ip6rd_flowi {
1459        struct flowi6 fl6;
1460        struct in6_addr gateway;
1461};
1462
1463static struct rt6_info *__ip6_route_redirect(struct net *net,
1464                                             struct fib6_table *table,
1465                                             struct flowi6 *fl6,
1466                                             int flags)
1467{
1468        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1469        struct rt6_info *rt;
1470        struct fib6_node *fn;
1471
1472        /*
1473         * Get the "current" route for this destination and
1474         * check if the redirect has come from approriate router.
1475         *
1476         * RFC 2461 specifies that redirects should only be
1477         * accepted if they come from the nexthop to the target.
1478         * Due to the way the routes are chosen, this notion
1479         * is a bit fuzzy and one might need to check all possible
1480         * routes.
1481         */
1482
1483        read_lock_bh(&table->tb6_lock);
1484        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1485restart:
1486        for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1487                /*
1488                 * Current route is on-link; redirect is always invalid.
1489                 *
1490                 * Seems, previous statement is not true. It could
1491                 * be node, which looks for us as on-link (f.e. proxy ndisc)
1492                 * But then router serving it might decide, that we should
1493                 * know truth 8)8) --ANK (980726).
1494                 */
1495                if (rt6_check_expired(rt))
1496                        continue;
1497                if (!(rt->rt6i_flags & RTF_GATEWAY))
1498                        continue;
1499                if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1500                        continue;
1501                if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1502                        continue;
1503                break;
1504        }
1505
1506        if (!rt)
1507                rt = net->ipv6.ip6_null_entry;
1508        BACKTRACK(net, &fl6->saddr);
1509out:
1510        dst_hold(&rt->dst);
1511
1512        read_unlock_bh(&table->tb6_lock);
1513
1514        return rt;
1515};
1516
1517static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1518                                           const struct in6_addr *src,
1519                                           const struct in6_addr *gateway,
1520                                           struct net_device *dev)
1521{
1522        int flags = RT6_LOOKUP_F_HAS_SADDR;
1523        struct net *net = dev_net(dev);
1524        struct ip6rd_flowi rdfl = {
1525                .fl6 = {
1526                        .flowi6_oif = dev->ifindex,
1527                        .daddr = *dest,
1528                        .saddr = *src,
1529                },
1530        };
1531
1532        ipv6_addr_copy(&rdfl.gateway, gateway);
1533
1534        if (rt6_need_strict(dest))
1535                flags |= RT6_LOOKUP_F_IFACE;
1536
1537        return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1538                                                   flags, __ip6_route_redirect);
1539}
1540
1541void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1542                  const struct in6_addr *saddr,
1543                  struct neighbour *neigh, u8 *lladdr, int on_link)
1544{
1545        struct rt6_info *rt, *nrt = NULL;
1546        struct netevent_redirect netevent;
1547        struct net *net = dev_net(neigh->dev);
1548
1549        rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1550
1551        if (rt == net->ipv6.ip6_null_entry) {
1552                if (net_ratelimit())
1553                        printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1554                               "for redirect target\n");
1555                goto out;
1556        }
1557
1558        /*
1559         *      We have finally decided to accept it.
1560         */
1561
1562        neigh_update(neigh, lladdr, NUD_STALE,
1563                     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1564                     NEIGH_UPDATE_F_OVERRIDE|
1565                     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1566                                     NEIGH_UPDATE_F_ISROUTER))
1567                     );
1568
1569        /*
1570         * Redirect received -> path was valid.
1571         * Look, redirects are sent only in response to data packets,
1572         * so that this nexthop apparently is reachable. --ANK
1573         */
1574        dst_confirm(&rt->dst);
1575
1576        /* Duplicate redirect: silently ignore. */
1577        if (neigh == rt->dst.neighbour)
1578                goto out;
1579
1580        nrt = ip6_rt_copy(rt);
1581        if (nrt == NULL)
1582                goto out;
1583
1584        nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1585        if (on_link)
1586                nrt->rt6i_flags &= ~RTF_GATEWAY;
1587
1588        ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1589        nrt->rt6i_dst.plen = 128;
1590        nrt->dst.flags |= DST_HOST;
1591
1592        ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1593        nrt->rt6i_nexthop = neigh_clone(neigh);
1594
1595        if (ip6_ins_rt(nrt))
1596                goto out;
1597
1598        netevent.old = &rt->dst;
1599        netevent.new = &nrt->dst;
1600        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1601
1602        if (rt->rt6i_flags&RTF_CACHE) {
1603                ip6_del_rt(rt);
1604                return;
1605        }
1606
1607out:
1608        dst_release(&rt->dst);
1609}
1610
1611/*
1612 *      Handle ICMP "packet too big" messages
1613 *      i.e. Path MTU discovery
1614 */
1615
1616static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1617                             struct net *net, u32 pmtu, int ifindex)
1618{
1619        struct rt6_info *rt, *nrt;
1620        int allfrag = 0;
1621again:
1622        rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1623        if (rt == NULL)
1624                return;
1625
1626        if (rt6_check_expired(rt)) {
1627                ip6_del_rt(rt);
1628                goto again;
1629        }
1630
1631        if (pmtu >= dst_mtu(&rt->dst))
1632                goto out;
1633
1634        if (pmtu < IPV6_MIN_MTU) {
1635                /*
1636                 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1637                 * MTU (1280) and a fragment header should always be included
1638                 * after a node receiving Too Big message reporting PMTU is
1639                 * less than the IPv6 Minimum Link MTU.
1640                 */
1641                pmtu = IPV6_MIN_MTU;
1642                allfrag = 1;
1643        }
1644
1645        /* New mtu received -> path was valid.
1646           They are sent only in response to data packets,
1647           so that this nexthop apparently is reachable. --ANK
1648         */
1649        dst_confirm(&rt->dst);
1650
1651        /* Host route. If it is static, it would be better
1652           not to override it, but add new one, so that
1653           when cache entry will expire old pmtu
1654           would return automatically.
1655         */
1656        if (rt->rt6i_flags & RTF_CACHE) {
1657                dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1658                if (allfrag) {
1659                        u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1660                        features |= RTAX_FEATURE_ALLFRAG;
1661                        dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1662                }
1663                dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1664                rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1665                goto out;
1666        }
1667
1668        /* Network route.
1669           Two cases are possible:
1670           1. It is connected route. Action: COW
1671           2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1672         */
1673        if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1674                nrt = rt6_alloc_cow(rt, daddr, saddr);
1675        else
1676                nrt = rt6_alloc_clone(rt, daddr);
1677
1678        if (nrt) {
1679                dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1680                if (allfrag) {
1681                        u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1682                        features |= RTAX_FEATURE_ALLFRAG;
1683                        dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1684                }
1685
1686                /* According to RFC 1981, detecting PMTU increase shouldn't be
1687                 * happened within 5 mins, the recommended timer is 10 mins.
1688                 * Here this route expiration time is set to ip6_rt_mtu_expires
1689                 * which is 10 mins. After 10 mins the decreased pmtu is expired
1690                 * and detecting PMTU increase will be automatically happened.
1691                 */
1692                dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1693                nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1694
1695                ip6_ins_rt(nrt);
1696        }
1697out:
1698        dst_release(&rt->dst);
1699}
1700
1701void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1702                        struct net_device *dev, u32 pmtu)
1703{
1704        struct net *net = dev_net(dev);
1705
1706        /*
1707         * RFC 1981 states that a node "MUST reduce the size of the packets it
1708         * is sending along the path" that caused the Packet Too Big message.
1709         * Since it's not possible in the general case to determine which
1710         * interface was used to send the original packet, we update the MTU
1711         * on the interface that will be used to send future packets. We also
1712         * update the MTU on the interface that received the Packet Too Big in
1713         * case the original packet was forced out that interface with
1714         * SO_BINDTODEVICE or similar. This is the next best thing to the
1715         * correct behaviour, which would be to update the MTU on all
1716         * interfaces.
1717         */
1718        rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1719        rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1720}
1721
1722/*
1723 *      Misc support functions
1724 */
1725
1726static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1727{
1728        struct net *net = dev_net(ort->rt6i_dev);
1729        struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1730                                            ort->dst.dev, 0);
1731
1732        if (rt) {
1733                rt->dst.input = ort->dst.input;
1734                rt->dst.output = ort->dst.output;
1735
1736                dst_copy_metrics(&rt->dst, &ort->dst);
1737                rt->dst.error = ort->dst.error;
1738                rt->rt6i_idev = ort->rt6i_idev;
1739                if (rt->rt6i_idev)
1740                        in6_dev_hold(rt->rt6i_idev);
1741                rt->dst.lastuse = jiffies;
1742                rt->rt6i_expires = 0;
1743
1744                ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1745                rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1746                rt->rt6i_metric = 0;
1747
1748                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1749#ifdef CONFIG_IPV6_SUBTREES
1750                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1751#endif
1752                memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1753                rt->rt6i_table = ort->rt6i_table;
1754        }
1755        return rt;
1756}
1757
1758#ifdef CONFIG_IPV6_ROUTE_INFO
1759static struct rt6_info *rt6_get_route_info(struct net *net,
1760                                           const struct in6_addr *prefix, int prefixlen,
1761                                           const struct in6_addr *gwaddr, int ifindex)
1762{
1763        struct fib6_node *fn;
1764        struct rt6_info *rt = NULL;
1765        struct fib6_table *table;
1766
1767        table = fib6_get_table(net, RT6_TABLE_INFO);
1768        if (table == NULL)
1769                return NULL;
1770
1771        write_lock_bh(&table->tb6_lock);
1772        fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1773        if (!fn)
1774                goto out;
1775
1776        for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1777                if (rt->rt6i_dev->ifindex != ifindex)
1778                        continue;
1779                if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1780                        continue;
1781                if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1782                        continue;
1783                dst_hold(&rt->dst);
1784                break;
1785        }
1786out:
1787        write_unlock_bh(&table->tb6_lock);
1788        return rt;
1789}
1790
1791static struct rt6_info *rt6_add_route_info(struct net *net,
1792                                           const struct in6_addr *prefix, int prefixlen,
1793                                           const struct in6_addr *gwaddr, int ifindex,
1794                                           unsigned pref)
1795{
1796        struct fib6_config cfg = {
1797                .fc_table       = RT6_TABLE_INFO,
1798                .fc_metric      = IP6_RT_PRIO_USER,
1799                .fc_ifindex     = ifindex,
1800                .fc_dst_len     = prefixlen,
1801                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1802                                  RTF_UP | RTF_PREF(pref),
1803                .fc_nlinfo.pid = 0,
1804                .fc_nlinfo.nlh = NULL,
1805                .fc_nlinfo.nl_net = net,
1806        };
1807
1808        ipv6_addr_copy(&cfg.fc_dst, prefix);
1809        ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1810
1811        /* We should treat it as a default route if prefix length is 0. */
1812        if (!prefixlen)
1813                cfg.fc_flags |= RTF_DEFAULT;
1814
1815        ip6_route_add(&cfg);
1816
1817        return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1818}
1819#endif
1820
1821struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1822{
1823        struct rt6_info *rt;
1824        struct fib6_table *table;
1825
1826        table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1827        if (table == NULL)
1828                return NULL;
1829
1830        write_lock_bh(&table->tb6_lock);
1831        for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1832                if (dev == rt->rt6i_dev &&
1833                    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1834                    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1835                        break;
1836        }
1837        if (rt)
1838                dst_hold(&rt->dst);
1839        write_unlock_bh(&table->tb6_lock);
1840        return rt;
1841}
1842
1843struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1844                                     struct net_device *dev,
1845                                     unsigned int pref)
1846{
1847        struct fib6_config cfg = {
1848                .fc_table       = RT6_TABLE_DFLT,
1849                .fc_metric      = IP6_RT_PRIO_USER,
1850                .fc_ifindex     = dev->ifindex,
1851                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1852                                  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1853                .fc_nlinfo.pid = 0,
1854                .fc_nlinfo.nlh = NULL,
1855                .fc_nlinfo.nl_net = dev_net(dev),
1856        };
1857
1858        ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1859
1860        ip6_route_add(&cfg);
1861
1862        return rt6_get_dflt_router(gwaddr, dev);
1863}
1864
1865void rt6_purge_dflt_routers(struct net *net)
1866{
1867        struct rt6_info *rt;
1868        struct fib6_table *table;
1869
1870        /* NOTE: Keep consistent with rt6_get_dflt_router */
1871        table = fib6_get_table(net, RT6_TABLE_DFLT);
1872        if (table == NULL)
1873                return;
1874
1875restart:
1876        read_lock_bh(&table->tb6_lock);
1877        for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1878                if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1879                        dst_hold(&rt->dst);
1880                        read_unlock_bh(&table->tb6_lock);
1881                        ip6_del_rt(rt);
1882                        goto restart;
1883                }
1884        }
1885        read_unlock_bh(&table->tb6_lock);
1886}
1887
1888static void rtmsg_to_fib6_config(struct net *net,
1889                                 struct in6_rtmsg *rtmsg,
1890                                 struct fib6_config *cfg)
1891{
1892        memset(cfg, 0, sizeof(*cfg));
1893
1894        cfg->fc_table = RT6_TABLE_MAIN;
1895        cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1896        cfg->fc_metric = rtmsg->rtmsg_metric;
1897        cfg->fc_expires = rtmsg->rtmsg_info;
1898        cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1899        cfg->fc_src_len = rtmsg->rtmsg_src_len;
1900        cfg->fc_flags = rtmsg->rtmsg_flags;
1901
1902        cfg->fc_nlinfo.nl_net = net;
1903
1904        ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1905        ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1906        ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1907}
1908
1909int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1910{
1911        struct fib6_config cfg;
1912        struct in6_rtmsg rtmsg;
1913        int err;
1914
1915        switch(cmd) {
1916        case SIOCADDRT:         /* Add a route */
1917        case SIOCDELRT:         /* Delete a route */
1918                if (!capable(CAP_NET_ADMIN))
1919                        return -EPERM;
1920                err = copy_from_user(&rtmsg, arg,
1921                                     sizeof(struct in6_rtmsg));
1922                if (err)
1923                        return -EFAULT;
1924
1925                rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1926
1927                rtnl_lock();
1928                switch (cmd) {
1929                case SIOCADDRT:
1930                        err = ip6_route_add(&cfg);
1931                        break;
1932                case SIOCDELRT:
1933                        err = ip6_route_del(&cfg);
1934                        break;
1935                default:
1936                        err = -EINVAL;
1937                }
1938                rtnl_unlock();
1939
1940                return err;
1941        }
1942
1943        return -EINVAL;
1944}
1945
1946/*
1947 *      Drop the packet on the floor
1948 */
1949
1950static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1951{
1952        int type;
1953        struct dst_entry *dst = skb_dst(skb);
1954        switch (ipstats_mib_noroutes) {
1955        case IPSTATS_MIB_INNOROUTES:
1956                type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1957                if (type == IPV6_ADDR_ANY) {
1958                        IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1959                                      IPSTATS_MIB_INADDRERRORS);
1960                        break;
1961                }
1962                /* FALLTHROUGH */
1963        case IPSTATS_MIB_OUTNOROUTES:
1964                IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1965                              ipstats_mib_noroutes);
1966                break;
1967        }
1968        icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1969        kfree_skb(skb);
1970        return 0;
1971}
1972
1973static int ip6_pkt_discard(struct sk_buff *skb)
1974{
1975        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1976}
1977
1978static int ip6_pkt_discard_out(struct sk_buff *skb)
1979{
1980        skb->dev = skb_dst(skb)->dev;
1981        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1982}
1983
1984#ifdef CONFIG_IPV6_MULTIPLE_TABLES
1985
1986static int ip6_pkt_prohibit(struct sk_buff *skb)
1987{
1988        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1989}
1990
1991static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1992{
1993        skb->dev = skb_dst(skb)->dev;
1994        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1995}
1996
1997#endif
1998
1999/*
2000 *      Allocate a dst for local (unicast / anycast) address.
2001 */
2002
2003struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2004                                    const struct in6_addr *addr,
2005                                    int anycast)
2006{
2007        struct net *net = dev_net(idev->dev);
2008        struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2009                                            net->loopback_dev, 0);
2010        struct neighbour *neigh;
2011
2012        if (rt == NULL) {
2013                if (net_ratelimit())
2014                        pr_warning("IPv6:  Maximum number of routes reached,"
2015                                   " consider increasing route/max_size.\n");
2016                return ERR_PTR(-ENOMEM);
2017        }
2018
2019        in6_dev_hold(idev);
2020
2021        rt->dst.flags |= DST_HOST;
2022        rt->dst.input = ip6_input;
2023        rt->dst.output = ip6_output;
2024        rt->rt6i_idev = idev;
2025        rt->dst.obsolete = -1;
2026
2027        rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2028        if (anycast)
2029                rt->rt6i_flags |= RTF_ANYCAST;
2030        else
2031                rt->rt6i_flags |= RTF_LOCAL;
2032        neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2033        if (IS_ERR(neigh)) {
2034                dst_free(&rt->dst);
2035
2036                return ERR_CAST(neigh);
2037        }
2038        rt->rt6i_nexthop = neigh;
2039
2040        ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2041        rt->rt6i_dst.plen = 128;
2042        rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2043
2044        atomic_set(&rt->dst.__refcnt, 1);
2045
2046        return rt;
2047}
2048
2049int ip6_route_get_saddr(struct net *net,
2050                        struct rt6_info *rt,
2051                        const struct in6_addr *daddr,
2052                        unsigned int prefs,
2053                        struct in6_addr *saddr)
2054{
2055        struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2056        int err = 0;
2057        if (rt->rt6i_prefsrc.plen)
2058                ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2059        else
2060                err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2061                                         daddr, prefs, saddr);
2062        return err;
2063}
2064
2065/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;		/* device filter; NULL matches any device */
	struct net *net;		/* namespace whose null entry is skipped */
	struct in6_addr *addr;		/* address being removed */
};
2071
2072static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2073{
2074        struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2075        struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2076        struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2077
2078        if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2079            rt != net->ipv6.ip6_null_entry &&
2080            ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2081                /* remove prefsrc entry */
2082                rt->rt6i_prefsrc.plen = 0;
2083        }
2084        return 0;
2085}
2086
2087void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2088{
2089        struct net *net = dev_net(ifp->idev->dev);
2090        struct arg_dev_net_ip adni = {
2091                .dev = ifp->idev->dev,
2092                .net = net,
2093                .addr = &ifp->addr,
2094        };
2095        fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2096}
2097
/* Argument bundle handed to fib6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;		/* device going down; NULL matches any device */
	struct net *net;		/* namespace whose null entry is spared */
};
2102
2103static int fib6_ifdown(struct rt6_info *rt, void *arg)
2104{
2105        const struct arg_dev_net *adn = arg;
2106        const struct net_device *dev = adn->dev;
2107
2108        if ((rt->rt6i_dev == dev || dev == NULL) &&
2109            rt != adn->net->ipv6.ip6_null_entry) {
2110                RT6_TRACE("deleted by ifdown %p\n", rt);
2111                return -1;
2112        }
2113        return 0;
2114}
2115
2116void rt6_ifdown(struct net *net, struct net_device *dev)
2117{
2118        struct arg_dev_net adn = {
2119                .dev = dev,
2120                .net = net,
2121        };
2122
2123        fib6_clean_all(net, fib6_ifdown, 0, &adn);
2124        icmp6_clean_all(fib6_ifdown, &adn);
2125}
2126
/* Argument bundle handed to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned mtu;			/* the new MTU */
};
2132
2133static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2134{
2135        struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2136        struct inet6_dev *idev;
2137
2138        /* In IPv6 pmtu discovery is not optional,
2139           so that RTAX_MTU lock cannot disable it.
2140           We still use this lock to block changes
2141           caused by addrconf/ndisc.
2142        */
2143
2144        idev = __in6_dev_get(arg->dev);
2145        if (idev == NULL)
2146                return 0;
2147
2148        /* For administrative MTU increase, there is no way to discover
2149           IPv6 PMTU increase, so PMTU increase should be updated here.
2150           Since RFC 1981 doesn't include administrative MTU increase
2151           update PMTU increase is a MUST. (i.e. jumbo frame)
2152         */
2153        /*
2154           If new MTU is less than route PMTU, this new MTU will be the
2155           lowest MTU in the path, update the route PMTU to reflect PMTU
2156           decreases; if new MTU is greater than route PMTU, and the
2157           old MTU is the lowest MTU in the path, update the route PMTU
2158           to reflect the increase. In this case if the other nodes' MTU
2159           also have the lowest MTU, TOO BIG MESSAGE will be lead to
2160           PMTU discouvery.
2161         */
2162        if (rt->rt6i_dev == arg->dev &&
2163            !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2164            (dst_mtu(&rt->dst) >= arg->mtu ||
2165             (dst_mtu(&rt->dst) < arg->mtu &&
2166              dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2167                dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2168        }
2169        return 0;
2170}
2171
2172void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2173{
2174        struct rt6_mtu_change_arg arg = {
2175                .dev = dev,
2176                .mtu = mtu,
2177        };
2178
2179        fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2180}
2181
2182static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2183        [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2184        [RTA_OIF]               = { .type = NLA_U32 },
2185        [RTA_IIF]               = { .type = NLA_U32 },
2186        [RTA_PRIORITY]          = { .type = NLA_U32 },
2187        [RTA_METRICS]           = { .type = NLA_NESTED },
2188};
2189
2190static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2191                              struct fib6_config *cfg)
2192{
2193        struct rtmsg *rtm;
2194        struct nlattr *tb[RTA_MAX+1];
2195        int err;
2196
2197        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2198        if (err < 0)
2199                goto errout;
2200
2201        err = -EINVAL;
2202        rtm = nlmsg_data(nlh);
2203        memset(cfg, 0, sizeof(*cfg));
2204
2205        cfg->fc_table = rtm->rtm_table;
2206        cfg->fc_dst_len = rtm->rtm_dst_len;
2207        cfg->fc_src_len = rtm->rtm_src_len;
2208        cfg->fc_flags = RTF_UP;
2209        cfg->fc_protocol = rtm->rtm_protocol;
2210
2211        if (rtm->rtm_type == RTN_UNREACHABLE)
2212                cfg->fc_flags |= RTF_REJECT;
2213
2214        if (rtm->rtm_type == RTN_LOCAL)
2215                cfg->fc_flags |= RTF_LOCAL;
2216
2217        cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2218        cfg->fc_nlinfo.nlh = nlh;
2219        cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2220
2221        if (tb[RTA_GATEWAY]) {
2222                nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2223                cfg->fc_flags |= RTF_GATEWAY;
2224        }
2225
2226        if (tb[RTA_DST]) {
2227                int plen = (rtm->rtm_dst_len + 7) >> 3;
2228
2229                if (nla_len(tb[RTA_DST]) < plen)
2230                        goto errout;
2231
2232                nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2233        }
2234
2235        if (tb[RTA_SRC]) {
2236                int plen = (rtm->rtm_src_len + 7) >> 3;
2237
2238                if (nla_len(tb[RTA_SRC]) < plen)
2239                        goto errout;
2240
2241                nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2242        }
2243
2244        if (tb[RTA_PREFSRC])
2245                nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2246
2247        if (tb[RTA_OIF])
2248                cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2249
2250        if (tb[RTA_PRIORITY])
2251                cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2252
2253        if (tb[RTA_METRICS]) {
2254                cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2255                cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2256        }
2257
2258        if (tb[RTA_TABLE])
2259                cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2260
2261        err = 0;
2262errout:
2263        return err;
2264}
2265
2266static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2267{
2268        struct fib6_config cfg;
2269        int err;
2270
2271        err = rtm_to_fib6_config(skb, nlh, &cfg);
2272        if (err < 0)
2273                return err;
2274
2275        return ip6_route_del(&cfg);
2276}
2277
2278static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2279{
2280        struct fib6_config cfg;
2281        int err;
2282
2283        err = rtm_to_fib6_config(skb, nlh, &cfg);
2284        if (err < 0)
2285                return err;
2286
2287        return ip6_route_add(&cfg);
2288}
2289
2290static inline size_t rt6_nlmsg_size(void)
2291{
2292        return NLMSG_ALIGN(sizeof(struct rtmsg))
2293               + nla_total_size(16) /* RTA_SRC */
2294               + nla_total_size(16) /* RTA_DST */
2295               + nla_total_size(16) /* RTA_GATEWAY */
2296               + nla_total_size(16) /* RTA_PREFSRC */
2297               + nla_total_size(4) /* RTA_TABLE */
2298               + nla_total_size(4) /* RTA_IIF */
2299               + nla_total_size(4) /* RTA_OIF */
2300               + nla_total_size(4) /* RTA_PRIORITY */
2301               + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2302               + nla_total_size(sizeof(struct rta_cacheinfo));
2303}
2304
2305static int rt6_fill_node(struct net *net,
2306                         struct sk_buff *skb, struct rt6_info *rt,
2307                         struct in6_addr *dst, struct in6_addr *src,
2308                         int iif, int type, u32 pid, u32 seq,
2309                         int prefix, int nowait, unsigned int flags)
2310{
2311        struct rtmsg *rtm;
2312        struct nlmsghdr *nlh;
2313        long expires;
2314        u32 table;
2315
2316        if (prefix) {   /* user wants prefix routes only */
2317                if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2318                        /* success since this is not a prefix route */
2319                        return 1;
2320                }
2321        }
2322
2323        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2324        if (nlh == NULL)
2325                return -EMSGSIZE;
2326
2327        rtm = nlmsg_data(nlh);
2328        rtm->rtm_family = AF_INET6;
2329        rtm->rtm_dst_len = rt->rt6i_dst.plen;
2330        rtm->rtm_src_len = rt->rt6i_src.plen;
2331        rtm->rtm_tos = 0;
2332        if (rt->rt6i_table)
2333                table = rt->rt6i_table->tb6_id;
2334        else
2335                table = RT6_TABLE_UNSPEC;
2336        rtm->rtm_table = table;
2337        NLA_PUT_U32(skb, RTA_TABLE, table);
2338        if (rt->rt6i_flags&RTF_REJECT)
2339                rtm->rtm_type = RTN_UNREACHABLE;
2340        else if (rt->rt6i_flags&RTF_LOCAL)
2341                rtm->rtm_type = RTN_LOCAL;
2342        else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2343                rtm->rtm_type = RTN_LOCAL;
2344        else
2345                rtm->rtm_type = RTN_UNICAST;
2346        rtm->rtm_flags = 0;
2347        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2348        rtm->rtm_protocol = rt->rt6i_protocol;
2349        if (rt->rt6i_flags&RTF_DYNAMIC)
2350                rtm->rtm_protocol = RTPROT_REDIRECT;
2351        else if (rt->rt6i_flags & RTF_ADDRCONF)
2352                rtm->rtm_protocol = RTPROT_KERNEL;
2353        else if (rt->rt6i_flags&RTF_DEFAULT)
2354                rtm->rtm_protocol = RTPROT_RA;
2355
2356        if (rt->rt6i_flags&RTF_CACHE)
2357                rtm->rtm_flags |= RTM_F_CLONED;
2358
2359        if (dst) {
2360                NLA_PUT(skb, RTA_DST, 16, dst);
2361                rtm->rtm_dst_len = 128;
2362        } else if (rtm->rtm_dst_len)
2363                NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2364#ifdef CONFIG_IPV6_SUBTREES
2365        if (src) {
2366                NLA_PUT(skb, RTA_SRC, 16, src);
2367                rtm->rtm_src_len = 128;
2368        } else if (rtm->rtm_src_len)
2369                NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2370#endif
2371        if (iif) {
2372#ifdef CONFIG_IPV6_MROUTE
2373                if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2374                        int err = ip6mr_get_route(net, skb, rtm, nowait);
2375                        if (err <= 0) {
2376                                if (!nowait) {
2377                                        if (err == 0)
2378                                                return 0;
2379                                        goto nla_put_failure;
2380                                } else {
2381                                        if (err == -EMSGSIZE)
2382                                                goto nla_put_failure;
2383                                }
2384                        }
2385                } else
2386#endif
2387                        NLA_PUT_U32(skb, RTA_IIF, iif);
2388        } else if (dst) {
2389                struct in6_addr saddr_buf;
2390                if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2391                        NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2392        }
2393
2394        if (rt->rt6i_prefsrc.plen) {
2395                struct in6_addr saddr_buf;
2396                ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2397                NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2398        }
2399
2400        if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2401                goto nla_put_failure;
2402
2403        if (rt->dst.neighbour)
2404                NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2405
2406        if (rt->dst.dev)
2407                NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2408
2409        NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2410
2411        if (!(rt->rt6i_flags & RTF_EXPIRES))
2412                expires = 0;
2413        else if (rt->rt6i_expires - jiffies < INT_MAX)
2414                expires = rt->rt6i_expires - jiffies;
2415        else
2416                expires = INT_MAX;
2417
2418        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2419                               expires, rt->dst.error) < 0)
2420                goto nla_put_failure;
2421
2422        return nlmsg_end(skb, nlh);
2423
2424nla_put_failure:
2425        nlmsg_cancel(skb, nlh);
2426        return -EMSGSIZE;
2427}
2428
2429int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2430{
2431        struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2432        int prefix;
2433
2434        if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2435                struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2436                prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2437        } else
2438                prefix = 0;
2439
2440        return rt6_fill_node(arg->net,
2441                     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2442                     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2443                     prefix, 0, NLM_F_MULTI);
2444}
2445
2446static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2447{
2448        struct net *net = sock_net(in_skb->sk);
2449        struct nlattr *tb[RTA_MAX+1];
2450        struct rt6_info *rt;
2451        struct sk_buff *skb;
2452        struct rtmsg *rtm;
2453        struct flowi6 fl6;
2454        int err, iif = 0;
2455
2456        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2457        if (err < 0)
2458                goto errout;
2459
2460        err = -EINVAL;
2461        memset(&fl6, 0, sizeof(fl6));
2462
2463        if (tb[RTA_SRC]) {
2464                if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2465                        goto errout;
2466
2467                ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2468        }
2469
2470        if (tb[RTA_DST]) {
2471                if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2472                        goto errout;
2473
2474                ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2475        }
2476
2477        if (tb[RTA_IIF])
2478                iif = nla_get_u32(tb[RTA_IIF]);
2479
2480        if (tb[RTA_OIF])
2481                fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2482
2483        if (iif) {
2484                struct net_device *dev;
2485                dev = __dev_get_by_index(net, iif);
2486                if (!dev) {
2487                        err = -ENODEV;
2488                        goto errout;
2489                }
2490        }
2491
2492        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2493        if (skb == NULL) {
2494                err = -ENOBUFS;
2495                goto errout;
2496        }
2497
2498        /* Reserve room for dummy headers, this skb can pass
2499           through good chunk of routing engine.
2500         */
2501        skb_reset_mac_header(skb);
2502        skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2503
2504        rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2505        skb_dst_set(skb, &rt->dst);
2506
2507        err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2508                            RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2509                            nlh->nlmsg_seq, 0, 0, 0);
2510        if (err < 0) {
2511                kfree_skb(skb);
2512                goto errout;
2513        }
2514
2515        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2516errout:
2517        return err;
2518}
2519
2520void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2521{
2522        struct sk_buff *skb;
2523        struct net *net = info->nl_net;
2524        u32 seq;
2525        int err;
2526
2527        err = -ENOBUFS;
2528        seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2529
2530        skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2531        if (skb == NULL)
2532                goto errout;
2533
2534        err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2535                                event, info->pid, seq, 0, 0, 0);
2536        if (err < 0) {
2537                /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2538                WARN_ON(err == -EMSGSIZE);
2539                kfree_skb(skb);
2540                goto errout;
2541        }
2542        rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2543                    info->nlh, gfp_any());
2544        return;
2545errout:
2546        if (err < 0)
2547                rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2548}
2549
2550static int ip6_route_dev_notify(struct notifier_block *this,
2551                                unsigned long event, void *data)
2552{
2553        struct net_device *dev = (struct net_device *)data;
2554        struct net *net = dev_net(dev);
2555
2556        if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2557                net->ipv6.ip6_null_entry->dst.dev = dev;
2558                net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2559#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2560                net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2561                net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2562                net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2563                net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2564#endif
2565        }
2566
2567        return NOTIFY_OK;
2568}
2569
2570/*
2571 *      /proc
2572 */
2573
2574#ifdef CONFIG_PROC_FS
2575
/*
 * Buffer bookkeeping for /proc route output.
 * NOTE(review): not referenced by the seq_file based code below in this
 * part of the file -- possibly a leftover; confirm before removing.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2584
2585static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2586{
2587        struct seq_file *m = p_arg;
2588
2589        seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2590
2591#ifdef CONFIG_IPV6_SUBTREES
2592        seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2593#else
2594        seq_puts(m, "00000000000000000000000000000000 00 ");
2595#endif
2596
2597        if (rt->rt6i_nexthop) {
2598                seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2599        } else {
2600                seq_puts(m, "00000000000000000000000000000000");
2601        }
2602        seq_printf(m, " %08x %08x %08x %08x %8s\n",
2603                   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2604                   rt->dst.__use, rt->rt6i_flags,
2605                   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2606        return 0;
2607}
2608
2609static int ipv6_route_show(struct seq_file *m, void *v)
2610{
2611        struct net *net = (struct net *)m->private;
2612        fib6_clean_all(net, rt6_info_route, 0, m);
2613        return 0;
2614}
2615
2616static int ipv6_route_open(struct inode *inode, struct file *file)
2617{
2618        return single_open_net(inode, file, ipv6_route_show);
2619}
2620
2621static const struct file_operations ipv6_route_proc_fops = {
2622        .owner          = THIS_MODULE,
2623        .open           = ipv6_route_open,
2624        .read           = seq_read,
2625        .llseek         = seq_lseek,
2626        .release        = single_release_net,
2627};
2628
2629static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2630{
2631        struct net *net = (struct net *)seq->private;
2632        seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2633                   net->ipv6.rt6_stats->fib_nodes,
2634                   net->ipv6.rt6_stats->fib_route_nodes,
2635                   net->ipv6.rt6_stats->fib_rt_alloc,
2636                   net->ipv6.rt6_stats->fib_rt_entries,
2637                   net->ipv6.rt6_stats->fib_rt_cache,
2638                   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2639                   net->ipv6.rt6_stats->fib_discarded_routes);
2640
2641        return 0;
2642}
2643
2644static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2645{
2646        return single_open_net(inode, file, rt6_stats_seq_show);
2647}
2648
2649static const struct file_operations rt6_stats_seq_fops = {
2650        .owner   = THIS_MODULE,
2651        .open    = rt6_stats_seq_open,
2652        .read    = seq_read,
2653        .llseek  = seq_lseek,
2654        .release = single_release_net,
2655};
2656#endif  /* CONFIG_PROC_FS */
2657
2658#ifdef CONFIG_SYSCTL
2659
2660static
2661int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2662                              void __user *buffer, size_t *lenp, loff_t *ppos)
2663{
2664        struct net *net;
2665        int delay;
2666        if (!write)
2667                return -EINVAL;
2668
2669        net = (struct net *)ctl->extra1;
2670        delay = net->ipv6.sysctl.flush_delay;
2671        proc_dointvec(ctl, write, buffer, lenp, ppos);
2672        fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2673        return 0;
2674}
2675
2676ctl_table ipv6_route_table_template[] = {
2677        {
2678                .procname       =       "flush",
2679                .data           =       &init_net.ipv6.sysctl.flush_delay,
2680                .maxlen         =       sizeof(int),
2681                .mode           =       0200,
2682                .proc_handler   =       ipv6_sysctl_rtcache_flush
2683        },
2684        {
2685                .procname       =       "gc_thresh",
2686                .data           =       &ip6_dst_ops_template.gc_thresh,
2687                .maxlen         =       sizeof(int),
2688                .mode           =       0644,
2689                .proc_handler   =       proc_dointvec,
2690        },
2691        {
2692                .procname       =       "max_size",
2693                .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2694                .maxlen         =       sizeof(int),
2695                .mode           =       0644,
2696                .proc_handler   =       proc_dointvec,
2697        },
2698        {
2699                .procname       =       "gc_min_interval",
2700                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2701                .maxlen         =       sizeof(int),
2702                .mode           =       0644,
2703                .proc_handler   =       proc_dointvec_jiffies,
2704        },
2705        {
2706                .procname       =       "gc_timeout",
2707                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2708                .maxlen         =       sizeof(int),
2709                .mode           =       0644,
2710                .proc_handler   =       proc_dointvec_jiffies,
2711        },
2712        {
2713                .procname       =       "gc_interval",
2714                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2715                .maxlen         =       sizeof(int),
2716                .mode           =       0644,
2717                .proc_handler   =       proc_dointvec_jiffies,
2718        },
2719        {
2720                .procname       =       "gc_elasticity",
2721                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2722                .maxlen         =       sizeof(int),
2723                .mode           =       0644,
2724                .proc_handler   =       proc_dointvec,
2725        },
2726        {
2727                .procname       =       "mtu_expires",
2728                .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2729                .maxlen         =       sizeof(int),
2730                .mode           =       0644,
2731                .proc_handler   =       proc_dointvec_jiffies,
2732        },
2733        {
2734                .procname       =       "min_adv_mss",
2735                .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2736                .maxlen         =       sizeof(int),
2737                .mode           =       0644,
2738                .proc_handler   =       proc_dointvec,
2739        },
2740        {
2741                .procname       =       "gc_min_interval_ms",
2742                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2743                .maxlen         =       sizeof(int),
2744                .mode           =       0644,
2745                .proc_handler   =       proc_dointvec_ms_jiffies,
2746        },
2747        { }
2748};
2749
2750struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2751{
2752        struct ctl_table *table;
2753
2754        table = kmemdup(ipv6_route_table_template,
2755                        sizeof(ipv6_route_table_template),
2756                        GFP_KERNEL);
2757
2758        if (table) {
2759                table[0].data = &net->ipv6.sysctl.flush_delay;
2760                table[0].extra1 = net;
2761                table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2762                table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2763                table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2764                table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2765                table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2766                table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2767                table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2768                table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2769                table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2770        }
2771
2772        return table;
2773}
2774#endif
2775
2776static int __net_init ip6_route_net_init(struct net *net)
2777{
2778        int ret = -ENOMEM;
2779
2780        memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2781               sizeof(net->ipv6.ip6_dst_ops));
2782
2783        if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2784                goto out_ip6_dst_ops;
2785
2786        net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2787                                           sizeof(*net->ipv6.ip6_null_entry),
2788                                           GFP_KERNEL);
2789        if (!net->ipv6.ip6_null_entry)
2790                goto out_ip6_dst_entries;
2791        net->ipv6.ip6_null_entry->dst.path =
2792                (struct dst_entry *)net->ipv6.ip6_null_entry;
2793        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2794        dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2795                         ip6_template_metrics, true);
2796
2797#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2798        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2799                                               sizeof(*net->ipv6.ip6_prohibit_entry),
2800                                               GFP_KERNEL);
2801        if (!net->ipv6.ip6_prohibit_entry)
2802                goto out_ip6_null_entry;
2803        net->ipv6.ip6_prohibit_entry->dst.path =
2804                (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2805        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2806        dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2807                         ip6_template_metrics, true);
2808
2809        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2810                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
2811                                               GFP_KERNEL);
2812        if (!net->ipv6.ip6_blk_hole_entry)
2813                goto out_ip6_prohibit_entry;
2814        net->ipv6.ip6_blk_hole_entry->dst.path =
2815                (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2816        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2817        dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2818                         ip6_template_metrics, true);
2819#endif
2820
2821        net->ipv6.sysctl.flush_delay = 0;
2822        net->ipv6.sysctl.ip6_rt_max_size = 4096;
2823        net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2824        net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2825        net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2826        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2827        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2828        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2829
2830#ifdef CONFIG_PROC_FS
2831        proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2832        proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2833#endif
2834        net->ipv6.ip6_rt_gc_expire = 30*HZ;
2835
2836        ret = 0;
2837out:
2838        return ret;
2839
2840#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2841out_ip6_prohibit_entry:
2842        kfree(net->ipv6.ip6_prohibit_entry);
2843out_ip6_null_entry:
2844        kfree(net->ipv6.ip6_null_entry);
2845#endif
2846out_ip6_dst_entries:
2847        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2848out_ip6_dst_ops:
2849        goto out;
2850}
2851
2852static void __net_exit ip6_route_net_exit(struct net *net)
2853{
2854#ifdef CONFIG_PROC_FS
2855        proc_net_remove(net, "ipv6_route");
2856        proc_net_remove(net, "rt6_stats");
2857#endif
2858        kfree(net->ipv6.ip6_null_entry);
2859#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2860        kfree(net->ipv6.ip6_prohibit_entry);
2861        kfree(net->ipv6.ip6_blk_hole_entry);
2862#endif
2863        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2864}
2865
2866static struct pernet_operations ip6_route_net_ops = {
2867        .init = ip6_route_net_init,
2868        .exit = ip6_route_net_exit,
2869};
2870
2871static struct notifier_block ip6_route_dev_notifier = {
2872        .notifier_call = ip6_route_dev_notify,
2873        .priority = 0,
2874};
2875
2876int __init ip6_route_init(void)
2877{
2878        int ret;
2879
2880        ret = -ENOMEM;
2881        ip6_dst_ops_template.kmem_cachep =
2882                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2883                                  SLAB_HWCACHE_ALIGN, NULL);
2884        if (!ip6_dst_ops_template.kmem_cachep)
2885                goto out;
2886
2887        ret = dst_entries_init(&ip6_dst_blackhole_ops);
2888        if (ret)
2889                goto out_kmem_cache;
2890
2891        ret = register_pernet_subsys(&ip6_route_net_ops);
2892        if (ret)
2893                goto out_dst_entries;
2894
2895        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2896
2897        /* Registering of the loopback is done before this portion of code,
2898         * the loopback reference in rt6_info will not be taken, do it
2899         * manually for init_net */
2900        init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2901        init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2902  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2903        init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2904        init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2905        init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2906        init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2907  #endif
2908        ret = fib6_init();
2909        if (ret)
2910                goto out_register_subsys;
2911
2912        ret = xfrm6_init();
2913        if (ret)
2914                goto out_fib6_init;
2915
2916        ret = fib6_rules_init();
2917        if (ret)
2918                goto xfrm6_init;
2919
2920        ret = -ENOBUFS;
2921        if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2922            __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2923            __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2924                goto fib6_rules_init;
2925
2926        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2927        if (ret)
2928                goto fib6_rules_init;
2929
2930out:
2931        return ret;
2932
2933fib6_rules_init:
2934        fib6_rules_cleanup();
2935xfrm6_init:
2936        xfrm6_fini();
2937out_fib6_init:
2938        fib6_gc_cleanup();
2939out_register_subsys:
2940        unregister_pernet_subsys(&ip6_route_net_ops);
2941out_dst_entries:
2942        dst_entries_destroy(&ip6_dst_blackhole_ops);
2943out_kmem_cache:
2944        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2945        goto out;
2946}
2947
2948void ip6_route_cleanup(void)
2949{
2950        unregister_netdevice_notifier(&ip6_route_dev_notifier);
2951        fib6_rules_cleanup();
2952        xfrm6_fini();
2953        fib6_gc_cleanup();
2954        unregister_pernet_subsys(&ip6_route_net_ops);
2955        dst_entries_destroy(&ip6_dst_blackhole_ops);
2956        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2957}
2958