linux/net/ipv6/route.c
<<
>>
Prefs
   1/*
   2 *      Linux INET6 implementation
   3 *      FIB front-end.
   4 *
   5 *      Authors:
   6 *      Pedro Roque             <roque@di.fc.ul.pt>
   7 *
   8 *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
   9 *
  10 *      This program is free software; you can redistribute it and/or
  11 *      modify it under the terms of the GNU General Public License
  12 *      as published by the Free Software Foundation; either version
  13 *      2 of the License, or (at your option) any later version.
  14 */
  15
  16/*      Changes:
  17 *
  18 *      YOSHIFUJI Hideaki @USAGI
  19 *              reworked default router selection.
  20 *              - respect outgoing interface
  21 *              - select from (probably) reachable routers (i.e.
  22 *              routers in REACHABLE, STALE, DELAY or PROBE states).
  23 *              - always select the same router if it is (probably)
  24 *              reachable.  otherwise, round-robin the list.
  25 *      Ville Nuorvala
  26 *              Fixed routing subtrees.
  27 */
  28
  29#include <linux/capability.h>
  30#include <linux/errno.h>
  31#include <linux/types.h>
  32#include <linux/times.h>
  33#include <linux/socket.h>
  34#include <linux/sockios.h>
  35#include <linux/net.h>
  36#include <linux/route.h>
  37#include <linux/netdevice.h>
  38#include <linux/in6.h>
  39#include <linux/init.h>
  40#include <linux/if_arp.h>
  41#include <linux/proc_fs.h>
  42#include <linux/seq_file.h>
  43#include <net/net_namespace.h>
  44#include <net/snmp.h>
  45#include <net/ipv6.h>
  46#include <net/ip6_fib.h>
  47#include <net/ip6_route.h>
  48#include <net/ndisc.h>
  49#include <net/addrconf.h>
  50#include <net/tcp.h>
  51#include <linux/rtnetlink.h>
  52#include <net/dst.h>
  53#include <net/xfrm.h>
  54#include <net/netevent.h>
  55#include <net/netlink.h>
  56
  57#include <asm/uaccess.h>
  58
  59#ifdef CONFIG_SYSCTL
  60#include <linux/sysctl.h>
  61#endif
  62
  63/* Set to 3 to get tracing. */
  64#define RT6_DEBUG 2
  65
  66#if RT6_DEBUG >= 3
  67#define RDBG(x) printk x
  68#define RT6_TRACE(x...) printk(KERN_DEBUG x)
  69#else
  70#define RDBG(x)
  71#define RT6_TRACE(x...) do { ; } while (0)
  72#endif
  73
  74#define CLONE_OFFLINK_ROUTE 0
  75
  76static int ip6_rt_max_size = 4096;
  77static int ip6_rt_gc_min_interval = HZ / 2;
  78static int ip6_rt_gc_timeout = 60*HZ;
  79int ip6_rt_gc_interval = 30*HZ;
  80static int ip6_rt_gc_elasticity = 9;
  81static int ip6_rt_mtu_expires = 10*60*HZ;
  82static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
  83
  84static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
  85static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
  86static struct dst_entry *ip6_negative_advice(struct dst_entry *);
  87static void             ip6_dst_destroy(struct dst_entry *);
  88static void             ip6_dst_ifdown(struct dst_entry *,
  89                                       struct net_device *dev, int how);
  90static int               ip6_dst_gc(void);
  91
  92static int              ip6_pkt_discard(struct sk_buff *skb);
  93static int              ip6_pkt_discard_out(struct sk_buff *skb);
  94static void             ip6_link_failure(struct sk_buff *skb);
  95static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
  96
  97#ifdef CONFIG_IPV6_ROUTE_INFO
  98static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
  99                                           struct in6_addr *gwaddr, int ifindex,
 100                                           unsigned pref);
 101static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
 102                                           struct in6_addr *gwaddr, int ifindex);
 103#endif
 104
 105static struct dst_ops ip6_dst_ops = {
 106        .family                 =       AF_INET6,
 107        .protocol               =       __constant_htons(ETH_P_IPV6),
 108        .gc                     =       ip6_dst_gc,
 109        .gc_thresh              =       1024,
 110        .check                  =       ip6_dst_check,
 111        .destroy                =       ip6_dst_destroy,
 112        .ifdown                 =       ip6_dst_ifdown,
 113        .negative_advice        =       ip6_negative_advice,
 114        .link_failure           =       ip6_link_failure,
 115        .update_pmtu            =       ip6_rt_update_pmtu,
 116        .entry_size             =       sizeof(struct rt6_info),
 117};
 118
 119static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
 120{
 121}
 122
 123static struct dst_ops ip6_dst_blackhole_ops = {
 124        .family                 =       AF_INET6,
 125        .protocol               =       __constant_htons(ETH_P_IPV6),
 126        .destroy                =       ip6_dst_destroy,
 127        .check                  =       ip6_dst_check,
 128        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
 129        .entry_size             =       sizeof(struct rt6_info),
 130};
 131
 132struct rt6_info ip6_null_entry = {
 133        .u = {
 134                .dst = {
 135                        .__refcnt       = ATOMIC_INIT(1),
 136                        .__use          = 1,
 137                        .obsolete       = -1,
 138                        .error          = -ENETUNREACH,
 139                        .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
 140                        .input          = ip6_pkt_discard,
 141                        .output         = ip6_pkt_discard_out,
 142                        .ops            = &ip6_dst_ops,
 143                        .path           = (struct dst_entry*)&ip6_null_entry,
 144                }
 145        },
 146        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 147        .rt6i_metric    = ~(u32) 0,
 148        .rt6i_ref       = ATOMIC_INIT(1),
 149};
 150
 151#ifdef CONFIG_IPV6_MULTIPLE_TABLES
 152
 153static int ip6_pkt_prohibit(struct sk_buff *skb);
 154static int ip6_pkt_prohibit_out(struct sk_buff *skb);
 155static int ip6_pkt_blk_hole(struct sk_buff *skb);
 156
 157struct rt6_info ip6_prohibit_entry = {
 158        .u = {
 159                .dst = {
 160                        .__refcnt       = ATOMIC_INIT(1),
 161                        .__use          = 1,
 162                        .obsolete       = -1,
 163                        .error          = -EACCES,
 164                        .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
 165                        .input          = ip6_pkt_prohibit,
 166                        .output         = ip6_pkt_prohibit_out,
 167                        .ops            = &ip6_dst_ops,
 168                        .path           = (struct dst_entry*)&ip6_prohibit_entry,
 169                }
 170        },
 171        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 172        .rt6i_metric    = ~(u32) 0,
 173        .rt6i_ref       = ATOMIC_INIT(1),
 174};
 175
 176struct rt6_info ip6_blk_hole_entry = {
 177        .u = {
 178                .dst = {
 179                        .__refcnt       = ATOMIC_INIT(1),
 180                        .__use          = 1,
 181                        .obsolete       = -1,
 182                        .error          = -EINVAL,
 183                        .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
 184                        .input          = ip6_pkt_blk_hole,
 185                        .output         = ip6_pkt_blk_hole,
 186                        .ops            = &ip6_dst_ops,
 187                        .path           = (struct dst_entry*)&ip6_blk_hole_entry,
 188                }
 189        },
 190        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
 191        .rt6i_metric    = ~(u32) 0,
 192        .rt6i_ref       = ATOMIC_INIT(1),
 193};
 194
 195#endif
 196
 197/* allocate dst with ip6_dst_ops */
 198static __inline__ struct rt6_info *ip6_dst_alloc(void)
 199{
 200        return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
 201}
 202
 203static void ip6_dst_destroy(struct dst_entry *dst)
 204{
 205        struct rt6_info *rt = (struct rt6_info *)dst;
 206        struct inet6_dev *idev = rt->rt6i_idev;
 207
 208        if (idev != NULL) {
 209                rt->rt6i_idev = NULL;
 210                in6_dev_put(idev);
 211        }
 212}
 213
 214static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 215                           int how)
 216{
 217        struct rt6_info *rt = (struct rt6_info *)dst;
 218        struct inet6_dev *idev = rt->rt6i_idev;
 219
 220        if (dev != init_net.loopback_dev && idev != NULL && idev->dev == dev) {
 221                struct inet6_dev *loopback_idev = in6_dev_get(init_net.loopback_dev);
 222                if (loopback_idev != NULL) {
 223                        rt->rt6i_idev = loopback_idev;
 224                        in6_dev_put(idev);
 225                }
 226        }
 227}
 228
 229static __inline__ int rt6_check_expired(const struct rt6_info *rt)
 230{
 231        return (rt->rt6i_flags & RTF_EXPIRES &&
 232                time_after(jiffies, rt->rt6i_expires));
 233}
 234
 235static inline int rt6_need_strict(struct in6_addr *daddr)
 236{
 237        return (ipv6_addr_type(daddr) &
 238                (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
 239}
 240
 241/*
 242 *      Route lookup. Any table->tb6_lock is implied.
 243 */
 244
 245static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
 246                                                    int oif,
 247                                                    int strict)
 248{
 249        struct rt6_info *local = NULL;
 250        struct rt6_info *sprt;
 251
 252        if (oif) {
 253                for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
 254                        struct net_device *dev = sprt->rt6i_dev;
 255                        if (dev->ifindex == oif)
 256                                return sprt;
 257                        if (dev->flags & IFF_LOOPBACK) {
 258                                if (sprt->rt6i_idev == NULL ||
 259                                    sprt->rt6i_idev->dev->ifindex != oif) {
 260                                        if (strict && oif)
 261                                                continue;
 262                                        if (local && (!oif ||
 263                                                      local->rt6i_idev->dev->ifindex == oif))
 264                                                continue;
 265                                }
 266                                local = sprt;
 267                        }
 268                }
 269
 270                if (local)
 271                        return local;
 272
 273                if (strict)
 274                        return &ip6_null_entry;
 275        }
 276        return rt;
 277}
 278
 279#ifdef CONFIG_IPV6_ROUTER_PREF
 280static void rt6_probe(struct rt6_info *rt)
 281{
 282        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
 283        /*
 284         * Okay, this does not seem to be appropriate
 285         * for now, however, we need to check if it
 286         * is really so; aka Router Reachability Probing.
 287         *
 288         * Router Reachability Probe MUST be rate-limited
 289         * to no more than one per minute.
 290         */
 291        if (!neigh || (neigh->nud_state & NUD_VALID))
 292                return;
 293        read_lock_bh(&neigh->lock);
 294        if (!(neigh->nud_state & NUD_VALID) &&
 295            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
 296                struct in6_addr mcaddr;
 297                struct in6_addr *target;
 298
 299                neigh->updated = jiffies;
 300                read_unlock_bh(&neigh->lock);
 301
 302                target = (struct in6_addr *)&neigh->primary_key;
 303                addrconf_addr_solict_mult(target, &mcaddr);
 304                ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
 305        } else
 306                read_unlock_bh(&neigh->lock);
 307}
 308#else
 309static inline void rt6_probe(struct rt6_info *rt)
 310{
 311        return;
 312}
 313#endif
 314
 315/*
 316 * Default Router Selection (RFC 2461 6.3.6)
 317 */
 318static inline int rt6_check_dev(struct rt6_info *rt, int oif)
 319{
 320        struct net_device *dev = rt->rt6i_dev;
 321        if (!oif || dev->ifindex == oif)
 322                return 2;
 323        if ((dev->flags & IFF_LOOPBACK) &&
 324            rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
 325                return 1;
 326        return 0;
 327}
 328
 329static inline int rt6_check_neigh(struct rt6_info *rt)
 330{
 331        struct neighbour *neigh = rt->rt6i_nexthop;
 332        int m;
 333        if (rt->rt6i_flags & RTF_NONEXTHOP ||
 334            !(rt->rt6i_flags & RTF_GATEWAY))
 335                m = 1;
 336        else if (neigh) {
 337                read_lock_bh(&neigh->lock);
 338                if (neigh->nud_state & NUD_VALID)
 339                        m = 2;
 340#ifdef CONFIG_IPV6_ROUTER_PREF
 341                else if (neigh->nud_state & NUD_FAILED)
 342                        m = 0;
 343#endif
 344                else
 345                        m = 1;
 346                read_unlock_bh(&neigh->lock);
 347        } else
 348                m = 0;
 349        return m;
 350}
 351
 352static int rt6_score_route(struct rt6_info *rt, int oif,
 353                           int strict)
 354{
 355        int m, n;
 356
 357        m = rt6_check_dev(rt, oif);
 358        if (!m && (strict & RT6_LOOKUP_F_IFACE))
 359                return -1;
 360#ifdef CONFIG_IPV6_ROUTER_PREF
 361        m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
 362#endif
 363        n = rt6_check_neigh(rt);
 364        if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
 365                return -1;
 366        return m;
 367}
 368
 369static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 370                                   int *mpri, struct rt6_info *match)
 371{
 372        int m;
 373
 374        if (rt6_check_expired(rt))
 375                goto out;
 376
 377        m = rt6_score_route(rt, oif, strict);
 378        if (m < 0)
 379                goto out;
 380
 381        if (m > *mpri) {
 382                if (strict & RT6_LOOKUP_F_REACHABLE)
 383                        rt6_probe(match);
 384                *mpri = m;
 385                match = rt;
 386        } else if (strict & RT6_LOOKUP_F_REACHABLE) {
 387                rt6_probe(rt);
 388        }
 389
 390out:
 391        return match;
 392}
 393
 394static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 395                                     struct rt6_info *rr_head,
 396                                     u32 metric, int oif, int strict)
 397{
 398        struct rt6_info *rt, *match;
 399        int mpri = -1;
 400
 401        match = NULL;
 402        for (rt = rr_head; rt && rt->rt6i_metric == metric;
 403             rt = rt->u.dst.rt6_next)
 404                match = find_match(rt, oif, strict, &mpri, match);
 405        for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
 406             rt = rt->u.dst.rt6_next)
 407                match = find_match(rt, oif, strict, &mpri, match);
 408
 409        return match;
 410}
 411
 412static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
 413{
 414        struct rt6_info *match, *rt0;
 415
 416        RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
 417                  __FUNCTION__, fn->leaf, oif);
 418
 419        rt0 = fn->rr_ptr;
 420        if (!rt0)
 421                fn->rr_ptr = rt0 = fn->leaf;
 422
 423        match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
 424
 425        if (!match &&
 426            (strict & RT6_LOOKUP_F_REACHABLE)) {
 427                struct rt6_info *next = rt0->u.dst.rt6_next;
 428
 429                /* no entries matched; do round-robin */
 430                if (!next || next->rt6i_metric != rt0->rt6i_metric)
 431                        next = fn->leaf;
 432
 433                if (next != rt0)
 434                        fn->rr_ptr = next;
 435        }
 436
 437        RT6_TRACE("%s() => %p\n",
 438                  __FUNCTION__, match);
 439
 440        return (match ? match : &ip6_null_entry);
 441}
 442
 443#ifdef CONFIG_IPV6_ROUTE_INFO
 444int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 445                  struct in6_addr *gwaddr)
 446{
 447        struct route_info *rinfo = (struct route_info *) opt;
 448        struct in6_addr prefix_buf, *prefix;
 449        unsigned int pref;
 450        u32 lifetime;
 451        struct rt6_info *rt;
 452
 453        if (len < sizeof(struct route_info)) {
 454                return -EINVAL;
 455        }
 456
 457        /* Sanity check for prefix_len and length */
 458        if (rinfo->length > 3) {
 459                return -EINVAL;
 460        } else if (rinfo->prefix_len > 128) {
 461                return -EINVAL;
 462        } else if (rinfo->prefix_len > 64) {
 463                if (rinfo->length < 2) {
 464                        return -EINVAL;
 465                }
 466        } else if (rinfo->prefix_len > 0) {
 467                if (rinfo->length < 1) {
 468                        return -EINVAL;
 469                }
 470        }
 471
 472        pref = rinfo->route_pref;
 473        if (pref == ICMPV6_ROUTER_PREF_INVALID)
 474                pref = ICMPV6_ROUTER_PREF_MEDIUM;
 475
 476        lifetime = ntohl(rinfo->lifetime);
 477        if (lifetime == 0xffffffff) {
 478                /* infinity */
 479        } else if (lifetime > 0x7fffffff/HZ) {
 480                /* Avoid arithmetic overflow */
 481                lifetime = 0x7fffffff/HZ - 1;
 482        }
 483
 484        if (rinfo->length == 3)
 485                prefix = (struct in6_addr *)rinfo->prefix;
 486        else {
 487                /* this function is safe */
 488                ipv6_addr_prefix(&prefix_buf,
 489                                 (struct in6_addr *)rinfo->prefix,
 490                                 rinfo->prefix_len);
 491                prefix = &prefix_buf;
 492        }
 493
 494        rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
 495
 496        if (rt && !lifetime) {
 497                ip6_del_rt(rt);
 498                rt = NULL;
 499        }
 500
 501        if (!rt && lifetime)
 502                rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
 503                                        pref);
 504        else if (rt)
 505                rt->rt6i_flags = RTF_ROUTEINFO |
 506                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 507
 508        if (rt) {
 509                if (lifetime == 0xffffffff) {
 510                        rt->rt6i_flags &= ~RTF_EXPIRES;
 511                } else {
 512                        rt->rt6i_expires = jiffies + HZ * lifetime;
 513                        rt->rt6i_flags |= RTF_EXPIRES;
 514                }
 515                dst_release(&rt->u.dst);
 516        }
 517        return 0;
 518}
 519#endif
 520
 521#define BACKTRACK(saddr) \
 522do { \
 523        if (rt == &ip6_null_entry) { \
 524                struct fib6_node *pn; \
 525                while (1) { \
 526                        if (fn->fn_flags & RTN_TL_ROOT) \
 527                                goto out; \
 528                        pn = fn->parent; \
 529                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
 530                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
 531                        else \
 532                                fn = pn; \
 533                        if (fn->fn_flags & RTN_RTINFO) \
 534                                goto restart; \
 535                } \
 536        } \
 537} while(0)
 538
 539static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
 540                                             struct flowi *fl, int flags)
 541{
 542        struct fib6_node *fn;
 543        struct rt6_info *rt;
 544
 545        read_lock_bh(&table->tb6_lock);
 546        fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
 547restart:
 548        rt = fn->leaf;
 549        rt = rt6_device_match(rt, fl->oif, flags);
 550        BACKTRACK(&fl->fl6_src);
 551out:
 552        dst_use(&rt->u.dst, jiffies);
 553        read_unlock_bh(&table->tb6_lock);
 554        return rt;
 555
 556}
 557
 558struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
 559                            int oif, int strict)
 560{
 561        struct flowi fl = {
 562                .oif = oif,
 563                .nl_u = {
 564                        .ip6_u = {
 565                                .daddr = *daddr,
 566                        },
 567                },
 568        };
 569        struct dst_entry *dst;
 570        int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
 571
 572        if (saddr) {
 573                memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
 574                flags |= RT6_LOOKUP_F_HAS_SADDR;
 575        }
 576
 577        dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
 578        if (dst->error == 0)
 579                return (struct rt6_info *) dst;
 580
 581        dst_release(dst);
 582
 583        return NULL;
 584}
 585
 586EXPORT_SYMBOL(rt6_lookup);
 587
  588/* ip6_ins_rt is called with table->tb6_lock NOT held.
  589   It takes a new route entry; if the addition fails for any reason,
  590   the route is freed. In any case, if the caller does not hold it,
  591   it may be destroyed.
  592 */
 593
 594static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
 595{
 596        int err;
 597        struct fib6_table *table;
 598
 599        table = rt->rt6i_table;
 600        write_lock_bh(&table->tb6_lock);
 601        err = fib6_add(&table->tb6_root, rt, info);
 602        write_unlock_bh(&table->tb6_lock);
 603
 604        return err;
 605}
 606
 607int ip6_ins_rt(struct rt6_info *rt)
 608{
 609        return __ip6_ins_rt(rt, NULL);
 610}
 611
 612static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
 613                                      struct in6_addr *saddr)
 614{
 615        struct rt6_info *rt;
 616
 617        /*
 618         *      Clone the route.
 619         */
 620
 621        rt = ip6_rt_copy(ort);
 622
 623        if (rt) {
 624                if (!(rt->rt6i_flags&RTF_GATEWAY)) {
 625                        if (rt->rt6i_dst.plen != 128 &&
 626                            ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
 627                                rt->rt6i_flags |= RTF_ANYCAST;
 628                        ipv6_addr_copy(&rt->rt6i_gateway, daddr);
 629                }
 630
 631                ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
 632                rt->rt6i_dst.plen = 128;
 633                rt->rt6i_flags |= RTF_CACHE;
 634                rt->u.dst.flags |= DST_HOST;
 635
 636#ifdef CONFIG_IPV6_SUBTREES
 637                if (rt->rt6i_src.plen && saddr) {
 638                        ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
 639                        rt->rt6i_src.plen = 128;
 640                }
 641#endif
 642
 643                rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
 644
 645        }
 646
 647        return rt;
 648}
 649
 650static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
 651{
 652        struct rt6_info *rt = ip6_rt_copy(ort);
 653        if (rt) {
 654                ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
 655                rt->rt6i_dst.plen = 128;
 656                rt->rt6i_flags |= RTF_CACHE;
 657                rt->u.dst.flags |= DST_HOST;
 658                rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
 659        }
 660        return rt;
 661}
 662
 663static struct rt6_info *ip6_pol_route(struct fib6_table *table, int oif,
 664                                            struct flowi *fl, int flags)
 665{
 666        struct fib6_node *fn;
 667        struct rt6_info *rt, *nrt;
 668        int strict = 0;
 669        int attempts = 3;
 670        int err;
 671        int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
 672
 673        strict |= flags & RT6_LOOKUP_F_IFACE;
 674
 675relookup:
 676        read_lock_bh(&table->tb6_lock);
 677
 678restart_2:
 679        fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
 680
 681restart:
 682        rt = rt6_select(fn, oif, strict | reachable);
 683        BACKTRACK(&fl->fl6_src);
 684        if (rt == &ip6_null_entry ||
 685            rt->rt6i_flags & RTF_CACHE)
 686                goto out;
 687
 688        dst_hold(&rt->u.dst);
 689        read_unlock_bh(&table->tb6_lock);
 690
 691        if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
 692                nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
 693        else {
 694#if CLONE_OFFLINK_ROUTE
 695                nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
 696#else
 697                goto out2;
 698#endif
 699        }
 700
 701        dst_release(&rt->u.dst);
 702        rt = nrt ? : &ip6_null_entry;
 703
 704        dst_hold(&rt->u.dst);
 705        if (nrt) {
 706                err = ip6_ins_rt(nrt);
 707                if (!err)
 708                        goto out2;
 709        }
 710
 711        if (--attempts <= 0)
 712                goto out2;
 713
 714        /*
 715         * Race condition! In the gap, when table->tb6_lock was
 716         * released someone could insert this route.  Relookup.
 717         */
 718        dst_release(&rt->u.dst);
 719        goto relookup;
 720
 721out:
 722        if (reachable) {
 723                reachable = 0;
 724                goto restart_2;
 725        }
 726        dst_hold(&rt->u.dst);
 727        read_unlock_bh(&table->tb6_lock);
 728out2:
 729        rt->u.dst.lastuse = jiffies;
 730        rt->u.dst.__use++;
 731
 732        return rt;
 733}
 734
 735static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
 736                                            struct flowi *fl, int flags)
 737{
 738        return ip6_pol_route(table, fl->iif, fl, flags);
 739}
 740
 741void ip6_route_input(struct sk_buff *skb)
 742{
 743        struct ipv6hdr *iph = ipv6_hdr(skb);
 744        int flags = RT6_LOOKUP_F_HAS_SADDR;
 745        struct flowi fl = {
 746                .iif = skb->dev->ifindex,
 747                .nl_u = {
 748                        .ip6_u = {
 749                                .daddr = iph->daddr,
 750                                .saddr = iph->saddr,
 751                                .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
 752                        },
 753                },
 754                .mark = skb->mark,
 755                .proto = iph->nexthdr,
 756        };
 757
 758        if (rt6_need_strict(&iph->daddr))
 759                flags |= RT6_LOOKUP_F_IFACE;
 760
 761        skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
 762}
 763
 764static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
 765                                             struct flowi *fl, int flags)
 766{
 767        return ip6_pol_route(table, fl->oif, fl, flags);
 768}
 769
 770struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
 771{
 772        int flags = 0;
 773
 774        if (rt6_need_strict(&fl->fl6_dst))
 775                flags |= RT6_LOOKUP_F_IFACE;
 776
 777        if (!ipv6_addr_any(&fl->fl6_src))
 778                flags |= RT6_LOOKUP_F_HAS_SADDR;
 779
 780        return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
 781}
 782
 783EXPORT_SYMBOL(ip6_route_output);
 784
 785static int ip6_blackhole_output(struct sk_buff *skb)
 786{
 787        kfree_skb(skb);
 788        return 0;
 789}
 790
 791int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
 792{
 793        struct rt6_info *ort = (struct rt6_info *) *dstp;
 794        struct rt6_info *rt = (struct rt6_info *)
 795                dst_alloc(&ip6_dst_blackhole_ops);
 796        struct dst_entry *new = NULL;
 797
 798        if (rt) {
 799                new = &rt->u.dst;
 800
 801                atomic_set(&new->__refcnt, 1);
 802                new->__use = 1;
 803                new->input = ip6_blackhole_output;
 804                new->output = ip6_blackhole_output;
 805
 806                memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
 807                new->dev = ort->u.dst.dev;
 808                if (new->dev)
 809                        dev_hold(new->dev);
 810                rt->rt6i_idev = ort->rt6i_idev;
 811                if (rt->rt6i_idev)
 812                        in6_dev_hold(rt->rt6i_idev);
 813                rt->rt6i_expires = 0;
 814
 815                ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
 816                rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
 817                rt->rt6i_metric = 0;
 818
 819                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
 820#ifdef CONFIG_IPV6_SUBTREES
 821                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
 822#endif
 823
 824                dst_free(new);
 825        }
 826
 827        dst_release(*dstp);
 828        *dstp = new;
 829        return (new ? 0 : -ENOMEM);
 830}
 831EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
 832
 833/*
 834 *      Destination cache support functions
 835 */
 836
 837static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 838{
 839        struct rt6_info *rt;
 840
 841        rt = (struct rt6_info *) dst;
 842
 843        if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
 844                return dst;
 845
 846        return NULL;
 847}
 848
 849static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
 850{
 851        struct rt6_info *rt = (struct rt6_info *) dst;
 852
 853        if (rt) {
 854                if (rt->rt6i_flags & RTF_CACHE)
 855                        ip6_del_rt(rt);
 856                else
 857                        dst_release(dst);
 858        }
 859        return NULL;
 860}
 861
 862static void ip6_link_failure(struct sk_buff *skb)
 863{
 864        struct rt6_info *rt;
 865
 866        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
 867
 868        rt = (struct rt6_info *) skb->dst;
 869        if (rt) {
 870                if (rt->rt6i_flags&RTF_CACHE) {
 871                        dst_set_expires(&rt->u.dst, 0);
 872                        rt->rt6i_flags |= RTF_EXPIRES;
 873                } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
 874                        rt->rt6i_node->fn_sernum = -1;
 875        }
 876}
 877
 878static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 879{
 880        struct rt6_info *rt6 = (struct rt6_info*)dst;
 881
 882        if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
 883                rt6->rt6i_flags |= RTF_MODIFIED;
 884                if (mtu < IPV6_MIN_MTU) {
 885                        mtu = IPV6_MIN_MTU;
 886                        dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
 887                }
 888                dst->metrics[RTAX_MTU-1] = mtu;
 889                call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
 890        }
 891}
 892
 893static int ipv6_get_mtu(struct net_device *dev);
 894
 895static inline unsigned int ipv6_advmss(unsigned int mtu)
 896{
 897        mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
 898
 899        if (mtu < ip6_rt_min_advmss)
 900                mtu = ip6_rt_min_advmss;
 901
 902        /*
 903         * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
 904         * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
 905         * IPV6_MAXPLEN is also valid and means: "any MSS,
 906         * rely only on pmtu discovery"
 907         */
 908        if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
 909                mtu = IPV6_MAXPLEN;
 910        return mtu;
 911}
 912
/*
 * Singly linked list (chained through dst->next) of the dst entries
 * created by ndisc_dst_alloc(); ndisc_dst_gc() walks it and frees the
 * entries whose reference count has dropped to zero.
 */
static struct dst_entry *ndisc_dst_gc_list;
/* Protects ndisc_dst_gc_list; taken with the _bh spinlock variants. */
static DEFINE_SPINLOCK(ndisc_lock);
 915
 916struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
 917                                  struct neighbour *neigh,
 918                                  struct in6_addr *addr,
 919                                  int (*output)(struct sk_buff *))
 920{
 921        struct rt6_info *rt;
 922        struct inet6_dev *idev = in6_dev_get(dev);
 923
 924        if (unlikely(idev == NULL))
 925                return NULL;
 926
 927        rt = ip6_dst_alloc();
 928        if (unlikely(rt == NULL)) {
 929                in6_dev_put(idev);
 930                goto out;
 931        }
 932
 933        dev_hold(dev);
 934        if (neigh)
 935                neigh_hold(neigh);
 936        else
 937                neigh = ndisc_get_neigh(dev, addr);
 938
 939        rt->rt6i_dev      = dev;
 940        rt->rt6i_idev     = idev;
 941        rt->rt6i_nexthop  = neigh;
 942        atomic_set(&rt->u.dst.__refcnt, 1);
 943        rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
 944        rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
 945        rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
 946        rt->u.dst.output  = output;
 947
 948#if 0   /* there's no chance to use these for ndisc */
 949        rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
 950                                ? DST_HOST
 951                                : 0;
 952        ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
 953        rt->rt6i_dst.plen = 128;
 954#endif
 955
 956        spin_lock_bh(&ndisc_lock);
 957        rt->u.dst.next = ndisc_dst_gc_list;
 958        ndisc_dst_gc_list = &rt->u.dst;
 959        spin_unlock_bh(&ndisc_lock);
 960
 961        fib6_force_start_gc();
 962
 963out:
 964        return &rt->u.dst;
 965}
 966
 967int ndisc_dst_gc(int *more)
 968{
 969        struct dst_entry *dst, *next, **pprev;
 970        int freed;
 971
 972        next = NULL;
 973        freed = 0;
 974
 975        spin_lock_bh(&ndisc_lock);
 976        pprev = &ndisc_dst_gc_list;
 977
 978        while ((dst = *pprev) != NULL) {
 979                if (!atomic_read(&dst->__refcnt)) {
 980                        *pprev = dst->next;
 981                        dst_free(dst);
 982                        freed++;
 983                } else {
 984                        pprev = &dst->next;
 985                        (*more)++;
 986                }
 987        }
 988
 989        spin_unlock_bh(&ndisc_lock);
 990
 991        return freed;
 992}
 993
 994static int ip6_dst_gc(void)
 995{
 996        static unsigned expire = 30*HZ;
 997        static unsigned long last_gc;
 998        unsigned long now = jiffies;
 999
1000        if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
1001            atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
1002                goto out;
1003
1004        expire++;
1005        fib6_run_gc(expire);
1006        last_gc = now;
1007        if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
1008                expire = ip6_rt_gc_timeout>>1;
1009
1010out:
1011        expire -= expire>>ip6_rt_gc_elasticity;
1012        return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
1013}
1014
1015/* Clean host part of a prefix. Not necessary in radix tree,
1016   but results in cleaner routing tables.
1017
1018   Remove it only when all the things will work!
1019 */
1020
1021static int ipv6_get_mtu(struct net_device *dev)
1022{
1023        int mtu = IPV6_MIN_MTU;
1024        struct inet6_dev *idev;
1025
1026        idev = in6_dev_get(dev);
1027        if (idev) {
1028                mtu = idev->cnf.mtu6;
1029                in6_dev_put(idev);
1030        }
1031        return mtu;
1032}
1033
1034int ipv6_get_hoplimit(struct net_device *dev)
1035{
1036        int hoplimit = ipv6_devconf.hop_limit;
1037        struct inet6_dev *idev;
1038
1039        idev = in6_dev_get(dev);
1040        if (idev) {
1041                hoplimit = idev->cnf.hop_limit;
1042                in6_dev_put(idev);
1043        }
1044        return hoplimit;
1045}
1046
1047/*
1048 *
1049 */
1050
1051int ip6_route_add(struct fib6_config *cfg)
1052{
1053        int err;
1054        struct rt6_info *rt = NULL;
1055        struct net_device *dev = NULL;
1056        struct inet6_dev *idev = NULL;
1057        struct fib6_table *table;
1058        int addr_type;
1059
1060        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1061                return -EINVAL;
1062#ifndef CONFIG_IPV6_SUBTREES
1063        if (cfg->fc_src_len)
1064                return -EINVAL;
1065#endif
1066        if (cfg->fc_ifindex) {
1067                err = -ENODEV;
1068                dev = dev_get_by_index(&init_net, cfg->fc_ifindex);
1069                if (!dev)
1070                        goto out;
1071                idev = in6_dev_get(dev);
1072                if (!idev)
1073                        goto out;
1074        }
1075
1076        if (cfg->fc_metric == 0)
1077                cfg->fc_metric = IP6_RT_PRIO_USER;
1078
1079        table = fib6_new_table(cfg->fc_table);
1080        if (table == NULL) {
1081                err = -ENOBUFS;
1082                goto out;
1083        }
1084
1085        rt = ip6_dst_alloc();
1086
1087        if (rt == NULL) {
1088                err = -ENOMEM;
1089                goto out;
1090        }
1091
1092        rt->u.dst.obsolete = -1;
1093        rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1094
1095        if (cfg->fc_protocol == RTPROT_UNSPEC)
1096                cfg->fc_protocol = RTPROT_BOOT;
1097        rt->rt6i_protocol = cfg->fc_protocol;
1098
1099        addr_type = ipv6_addr_type(&cfg->fc_dst);
1100
1101        if (addr_type & IPV6_ADDR_MULTICAST)
1102                rt->u.dst.input = ip6_mc_input;
1103        else
1104                rt->u.dst.input = ip6_forward;
1105
1106        rt->u.dst.output = ip6_output;
1107
1108        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1109        rt->rt6i_dst.plen = cfg->fc_dst_len;
1110        if (rt->rt6i_dst.plen == 128)
1111               rt->u.dst.flags = DST_HOST;
1112
1113#ifdef CONFIG_IPV6_SUBTREES
1114        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1115        rt->rt6i_src.plen = cfg->fc_src_len;
1116#endif
1117
1118        rt->rt6i_metric = cfg->fc_metric;
1119
1120        /* We cannot add true routes via loopback here,
1121           they would result in kernel looping; promote them to reject routes
1122         */
1123        if ((cfg->fc_flags & RTF_REJECT) ||
1124            (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1125                /* hold loopback dev/idev if we haven't done so. */
1126                if (dev != init_net.loopback_dev) {
1127                        if (dev) {
1128                                dev_put(dev);
1129                                in6_dev_put(idev);
1130                        }
1131                        dev = init_net.loopback_dev;
1132                        dev_hold(dev);
1133                        idev = in6_dev_get(dev);
1134                        if (!idev) {
1135                                err = -ENODEV;
1136                                goto out;
1137                        }
1138                }
1139                rt->u.dst.output = ip6_pkt_discard_out;
1140                rt->u.dst.input = ip6_pkt_discard;
1141                rt->u.dst.error = -ENETUNREACH;
1142                rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1143                goto install_route;
1144        }
1145
1146        if (cfg->fc_flags & RTF_GATEWAY) {
1147                struct in6_addr *gw_addr;
1148                int gwa_type;
1149
1150                gw_addr = &cfg->fc_gateway;
1151                ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1152                gwa_type = ipv6_addr_type(gw_addr);
1153
1154                if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1155                        struct rt6_info *grt;
1156
1157                        /* IPv6 strictly inhibits using not link-local
1158                           addresses as nexthop address.
1159                           Otherwise, router will not able to send redirects.
1160                           It is very good, but in some (rare!) circumstances
1161                           (SIT, PtP, NBMA NOARP links) it is handy to allow
1162                           some exceptions. --ANK
1163                         */
1164                        err = -EINVAL;
1165                        if (!(gwa_type&IPV6_ADDR_UNICAST))
1166                                goto out;
1167
1168                        grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1169
1170                        err = -EHOSTUNREACH;
1171                        if (grt == NULL)
1172                                goto out;
1173                        if (dev) {
1174                                if (dev != grt->rt6i_dev) {
1175                                        dst_release(&grt->u.dst);
1176                                        goto out;
1177                                }
1178                        } else {
1179                                dev = grt->rt6i_dev;
1180                                idev = grt->rt6i_idev;
1181                                dev_hold(dev);
1182                                in6_dev_hold(grt->rt6i_idev);
1183                        }
1184                        if (!(grt->rt6i_flags&RTF_GATEWAY))
1185                                err = 0;
1186                        dst_release(&grt->u.dst);
1187
1188                        if (err)
1189                                goto out;
1190                }
1191                err = -EINVAL;
1192                if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1193                        goto out;
1194        }
1195
1196        err = -ENODEV;
1197        if (dev == NULL)
1198                goto out;
1199
1200        if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1201                rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1202                if (IS_ERR(rt->rt6i_nexthop)) {
1203                        err = PTR_ERR(rt->rt6i_nexthop);
1204                        rt->rt6i_nexthop = NULL;
1205                        goto out;
1206                }
1207        }
1208
1209        rt->rt6i_flags = cfg->fc_flags;
1210
1211install_route:
1212        if (cfg->fc_mx) {
1213                struct nlattr *nla;
1214                int remaining;
1215
1216                nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1217                        int type = nla_type(nla);
1218
1219                        if (type) {
1220                                if (type > RTAX_MAX) {
1221                                        err = -EINVAL;
1222                                        goto out;
1223                                }
1224
1225                                rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1226                        }
1227                }
1228        }
1229
1230        if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1231                rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1232        if (!rt->u.dst.metrics[RTAX_MTU-1])
1233                rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1234        if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1235                rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1236        rt->u.dst.dev = dev;
1237        rt->rt6i_idev = idev;
1238        rt->rt6i_table = table;
1239        return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1240
1241out:
1242        if (dev)
1243                dev_put(dev);
1244        if (idev)
1245                in6_dev_put(idev);
1246        if (rt)
1247                dst_free(&rt->u.dst);
1248        return err;
1249}
1250
1251static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1252{
1253        int err;
1254        struct fib6_table *table;
1255
1256        if (rt == &ip6_null_entry)
1257                return -ENOENT;
1258
1259        table = rt->rt6i_table;
1260        write_lock_bh(&table->tb6_lock);
1261
1262        err = fib6_del(rt, info);
1263        dst_release(&rt->u.dst);
1264
1265        write_unlock_bh(&table->tb6_lock);
1266
1267        return err;
1268}
1269
/*
 *      Delete @rt from its table without any netlink info.
 *      Thin wrapper around __ip6_del_rt(), which also drops one
 *      reference on @rt; returns its result.
 */
int ip6_del_rt(struct rt6_info *rt)
{
        return __ip6_del_rt(rt, NULL);
}
1274
1275static int ip6_route_del(struct fib6_config *cfg)
1276{
1277        struct fib6_table *table;
1278        struct fib6_node *fn;
1279        struct rt6_info *rt;
1280        int err = -ESRCH;
1281
1282        table = fib6_get_table(cfg->fc_table);
1283        if (table == NULL)
1284                return err;
1285
1286        read_lock_bh(&table->tb6_lock);
1287
1288        fn = fib6_locate(&table->tb6_root,
1289                         &cfg->fc_dst, cfg->fc_dst_len,
1290                         &cfg->fc_src, cfg->fc_src_len);
1291
1292        if (fn) {
1293                for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1294                        if (cfg->fc_ifindex &&
1295                            (rt->rt6i_dev == NULL ||
1296                             rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1297                                continue;
1298                        if (cfg->fc_flags & RTF_GATEWAY &&
1299                            !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1300                                continue;
1301                        if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1302                                continue;
1303                        dst_hold(&rt->u.dst);
1304                        read_unlock_bh(&table->tb6_lock);
1305
1306                        return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1307                }
1308        }
1309        read_unlock_bh(&table->tb6_lock);
1310
1311        return err;
1312}
1313
1314/*
1315 *      Handle redirects
1316 */
/*
 * Flow key extended with the redirecting gateway.  The flowi must stay
 * the first member: ip6_route_redirect() passes a pointer to this
 * structure as a struct flowi * and __ip6_route_redirect() casts it back.
 */
struct ip6rd_flowi {
        struct flowi fl;
        struct in6_addr gateway;        /* address the redirect came from */
};
1321
1322static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1323                                             struct flowi *fl,
1324                                             int flags)
1325{
1326        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1327        struct rt6_info *rt;
1328        struct fib6_node *fn;
1329
1330        /*
1331         * Get the "current" route for this destination and
1332         * check if the redirect has come from approriate router.
1333         *
1334         * RFC 2461 specifies that redirects should only be
1335         * accepted if they come from the nexthop to the target.
1336         * Due to the way the routes are chosen, this notion
1337         * is a bit fuzzy and one might need to check all possible
1338         * routes.
1339         */
1340
1341        read_lock_bh(&table->tb6_lock);
1342        fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1343restart:
1344        for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1345                /*
1346                 * Current route is on-link; redirect is always invalid.
1347                 *
1348                 * Seems, previous statement is not true. It could
1349                 * be node, which looks for us as on-link (f.e. proxy ndisc)
1350                 * But then router serving it might decide, that we should
1351                 * know truth 8)8) --ANK (980726).
1352                 */
1353                if (rt6_check_expired(rt))
1354                        continue;
1355                if (!(rt->rt6i_flags & RTF_GATEWAY))
1356                        continue;
1357                if (fl->oif != rt->rt6i_dev->ifindex)
1358                        continue;
1359                if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1360                        continue;
1361                break;
1362        }
1363
1364        if (!rt)
1365                rt = &ip6_null_entry;
1366        BACKTRACK(&fl->fl6_src);
1367out:
1368        dst_hold(&rt->u.dst);
1369
1370        read_unlock_bh(&table->tb6_lock);
1371
1372        return rt;
1373};
1374
1375static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1376                                           struct in6_addr *src,
1377                                           struct in6_addr *gateway,
1378                                           struct net_device *dev)
1379{
1380        int flags = RT6_LOOKUP_F_HAS_SADDR;
1381        struct ip6rd_flowi rdfl = {
1382                .fl = {
1383                        .oif = dev->ifindex,
1384                        .nl_u = {
1385                                .ip6_u = {
1386                                        .daddr = *dest,
1387                                        .saddr = *src,
1388                                },
1389                        },
1390                },
1391                .gateway = *gateway,
1392        };
1393
1394        if (rt6_need_strict(dest))
1395                flags |= RT6_LOOKUP_F_IFACE;
1396
1397        return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
1398}
1399
1400void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1401                  struct in6_addr *saddr,
1402                  struct neighbour *neigh, u8 *lladdr, int on_link)
1403{
1404        struct rt6_info *rt, *nrt = NULL;
1405        struct netevent_redirect netevent;
1406
1407        rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1408
1409        if (rt == &ip6_null_entry) {
1410                if (net_ratelimit())
1411                        printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1412                               "for redirect target\n");
1413                goto out;
1414        }
1415
1416        /*
1417         *      We have finally decided to accept it.
1418         */
1419
1420        neigh_update(neigh, lladdr, NUD_STALE,
1421                     NEIGH_UPDATE_F_WEAK_OVERRIDE|
1422                     NEIGH_UPDATE_F_OVERRIDE|
1423                     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1424                                     NEIGH_UPDATE_F_ISROUTER))
1425                     );
1426
1427        /*
1428         * Redirect received -> path was valid.
1429         * Look, redirects are sent only in response to data packets,
1430         * so that this nexthop apparently is reachable. --ANK
1431         */
1432        dst_confirm(&rt->u.dst);
1433
1434        /* Duplicate redirect: silently ignore. */
1435        if (neigh == rt->u.dst.neighbour)
1436                goto out;
1437
1438        nrt = ip6_rt_copy(rt);
1439        if (nrt == NULL)
1440                goto out;
1441
1442        nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1443        if (on_link)
1444                nrt->rt6i_flags &= ~RTF_GATEWAY;
1445
1446        ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1447        nrt->rt6i_dst.plen = 128;
1448        nrt->u.dst.flags |= DST_HOST;
1449
1450        ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1451        nrt->rt6i_nexthop = neigh_clone(neigh);
1452        /* Reset pmtu, it may be better */
1453        nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1454        nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1455
1456        if (ip6_ins_rt(nrt))
1457                goto out;
1458
1459        netevent.old = &rt->u.dst;
1460        netevent.new = &nrt->u.dst;
1461        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1462
1463        if (rt->rt6i_flags&RTF_CACHE) {
1464                ip6_del_rt(rt);
1465                return;
1466        }
1467
1468out:
1469        dst_release(&rt->u.dst);
1470        return;
1471}
1472
1473/*
1474 *      Handle ICMP "packet too big" messages
1475 *      i.e. Path MTU discovery
1476 */
1477
1478void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1479                        struct net_device *dev, u32 pmtu)
1480{
1481        struct rt6_info *rt, *nrt;
1482        int allfrag = 0;
1483
1484        rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1485        if (rt == NULL)
1486                return;
1487
1488        if (pmtu >= dst_mtu(&rt->u.dst))
1489                goto out;
1490
1491        if (pmtu < IPV6_MIN_MTU) {
1492                /*
1493                 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1494                 * MTU (1280) and a fragment header should always be included
1495                 * after a node receiving Too Big message reporting PMTU is
1496                 * less than the IPv6 Minimum Link MTU.
1497                 */
1498                pmtu = IPV6_MIN_MTU;
1499                allfrag = 1;
1500        }
1501
1502        /* New mtu received -> path was valid.
1503           They are sent only in response to data packets,
1504           so that this nexthop apparently is reachable. --ANK
1505         */
1506        dst_confirm(&rt->u.dst);
1507
1508        /* Host route. If it is static, it would be better
1509           not to override it, but add new one, so that
1510           when cache entry will expire old pmtu
1511           would return automatically.
1512         */
1513        if (rt->rt6i_flags & RTF_CACHE) {
1514                rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1515                if (allfrag)
1516                        rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1517                dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1518                rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1519                goto out;
1520        }
1521
1522        /* Network route.
1523           Two cases are possible:
1524           1. It is connected route. Action: COW
1525           2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1526         */
1527        if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1528                nrt = rt6_alloc_cow(rt, daddr, saddr);
1529        else
1530                nrt = rt6_alloc_clone(rt, daddr);
1531
1532        if (nrt) {
1533                nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1534                if (allfrag)
1535                        nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1536
1537                /* According to RFC 1981, detecting PMTU increase shouldn't be
1538                 * happened within 5 mins, the recommended timer is 10 mins.
1539                 * Here this route expiration time is set to ip6_rt_mtu_expires
1540                 * which is 10 mins. After 10 mins the decreased pmtu is expired
1541                 * and detecting PMTU increase will be automatically happened.
1542                 */
1543                dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1544                nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1545
1546                ip6_ins_rt(nrt);
1547        }
1548out:
1549        dst_release(&rt->u.dst);
1550}
1551
1552/*
1553 *      Misc support functions
1554 */
1555
1556static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1557{
1558        struct rt6_info *rt = ip6_dst_alloc();
1559
1560        if (rt) {
1561                rt->u.dst.input = ort->u.dst.input;
1562                rt->u.dst.output = ort->u.dst.output;
1563
1564                memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1565                rt->u.dst.error = ort->u.dst.error;
1566                rt->u.dst.dev = ort->u.dst.dev;
1567                if (rt->u.dst.dev)
1568                        dev_hold(rt->u.dst.dev);
1569                rt->rt6i_idev = ort->rt6i_idev;
1570                if (rt->rt6i_idev)
1571                        in6_dev_hold(rt->rt6i_idev);
1572                rt->u.dst.lastuse = jiffies;
1573                rt->rt6i_expires = 0;
1574
1575                ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1576                rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1577                rt->rt6i_metric = 0;
1578
1579                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1580#ifdef CONFIG_IPV6_SUBTREES
1581                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1582#endif
1583                rt->rt6i_table = ort->rt6i_table;
1584        }
1585        return rt;
1586}
1587
1588#ifdef CONFIG_IPV6_ROUTE_INFO
1589static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1590                                           struct in6_addr *gwaddr, int ifindex)
1591{
1592        struct fib6_node *fn;
1593        struct rt6_info *rt = NULL;
1594        struct fib6_table *table;
1595
1596        table = fib6_get_table(RT6_TABLE_INFO);
1597        if (table == NULL)
1598                return NULL;
1599
1600        write_lock_bh(&table->tb6_lock);
1601        fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1602        if (!fn)
1603                goto out;
1604
1605        for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1606                if (rt->rt6i_dev->ifindex != ifindex)
1607                        continue;
1608                if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1609                        continue;
1610                if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1611                        continue;
1612                dst_hold(&rt->u.dst);
1613                break;
1614        }
1615out:
1616        write_unlock_bh(&table->tb6_lock);
1617        return rt;
1618}
1619
1620static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1621                                           struct in6_addr *gwaddr, int ifindex,
1622                                           unsigned pref)
1623{
1624        struct fib6_config cfg = {
1625                .fc_table       = RT6_TABLE_INFO,
1626                .fc_metric      = 1024,
1627                .fc_ifindex     = ifindex,
1628                .fc_dst_len     = prefixlen,
1629                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1630                                  RTF_UP | RTF_PREF(pref),
1631        };
1632
1633        ipv6_addr_copy(&cfg.fc_dst, prefix);
1634        ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1635
1636        /* We should treat it as a default route if prefix length is 0. */
1637        if (!prefixlen)
1638                cfg.fc_flags |= RTF_DEFAULT;
1639
1640        ip6_route_add(&cfg);
1641
1642        return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1643}
1644#endif
1645
1646struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1647{
1648        struct rt6_info *rt;
1649        struct fib6_table *table;
1650
1651        table = fib6_get_table(RT6_TABLE_DFLT);
1652        if (table == NULL)
1653                return NULL;
1654
1655        write_lock_bh(&table->tb6_lock);
1656        for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1657                if (dev == rt->rt6i_dev &&
1658                    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1659                    ipv6_addr_equal(&rt->rt6i_gateway, addr))
1660                        break;
1661        }
1662        if (rt)
1663                dst_hold(&rt->u.dst);
1664        write_unlock_bh(&table->tb6_lock);
1665        return rt;
1666}
1667
1668struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1669                                     struct net_device *dev,
1670                                     unsigned int pref)
1671{
1672        struct fib6_config cfg = {
1673                .fc_table       = RT6_TABLE_DFLT,
1674                .fc_metric      = 1024,
1675                .fc_ifindex     = dev->ifindex,
1676                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1677                                  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1678        };
1679
1680        ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1681
1682        ip6_route_add(&cfg);
1683
1684        return rt6_get_dflt_router(gwaddr, dev);
1685}
1686
1687void rt6_purge_dflt_routers(void)
1688{
1689        struct rt6_info *rt;
1690        struct fib6_table *table;
1691
1692        /* NOTE: Keep consistent with rt6_get_dflt_router */
1693        table = fib6_get_table(RT6_TABLE_DFLT);
1694        if (table == NULL)
1695                return;
1696
1697restart:
1698        read_lock_bh(&table->tb6_lock);
1699        for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1700                if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1701                        dst_hold(&rt->u.dst);
1702                        read_unlock_bh(&table->tb6_lock);
1703                        ip6_del_rt(rt);
1704                        goto restart;
1705                }
1706        }
1707        read_unlock_bh(&table->tb6_lock);
1708}
1709
1710static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1711                                 struct fib6_config *cfg)
1712{
1713        memset(cfg, 0, sizeof(*cfg));
1714
1715        cfg->fc_table = RT6_TABLE_MAIN;
1716        cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1717        cfg->fc_metric = rtmsg->rtmsg_metric;
1718        cfg->fc_expires = rtmsg->rtmsg_info;
1719        cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1720        cfg->fc_src_len = rtmsg->rtmsg_src_len;
1721        cfg->fc_flags = rtmsg->rtmsg_flags;
1722
1723        ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1724        ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1725        ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1726}
1727
1728int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1729{
1730        struct fib6_config cfg;
1731        struct in6_rtmsg rtmsg;
1732        int err;
1733
1734        switch(cmd) {
1735        case SIOCADDRT:         /* Add a route */
1736        case SIOCDELRT:         /* Delete a route */
1737                if (!capable(CAP_NET_ADMIN))
1738                        return -EPERM;
1739                err = copy_from_user(&rtmsg, arg,
1740                                     sizeof(struct in6_rtmsg));
1741                if (err)
1742                        return -EFAULT;
1743
1744                rtmsg_to_fib6_config(&rtmsg, &cfg);
1745
1746                rtnl_lock();
1747                switch (cmd) {
1748                case SIOCADDRT:
1749                        err = ip6_route_add(&cfg);
1750                        break;
1751                case SIOCDELRT:
1752                        err = ip6_route_del(&cfg);
1753                        break;
1754                default:
1755                        err = -EINVAL;
1756                }
1757                rtnl_unlock();
1758
1759                return err;
1760        }
1761
1762        return -EINVAL;
1763}
1764
1765/*
1766 *      Drop the packet on the floor
1767 */
1768
1769static inline int ip6_pkt_drop(struct sk_buff *skb, int code,
1770                               int ipstats_mib_noroutes)
1771{
1772        int type;
1773        switch (ipstats_mib_noroutes) {
1774        case IPSTATS_MIB_INNOROUTES:
1775                type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1776                if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1777                        IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1778                        break;
1779                }
1780                /* FALLTHROUGH */
1781        case IPSTATS_MIB_OUTNOROUTES:
1782                IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
1783                break;
1784        }
1785        icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1786        kfree_skb(skb);
1787        return 0;
1788}
1789
1790static int ip6_pkt_discard(struct sk_buff *skb)
1791{
1792        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1793}
1794
1795static int ip6_pkt_discard_out(struct sk_buff *skb)
1796{
1797        skb->dev = skb->dst->dev;
1798        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1799}
1800
1801#ifdef CONFIG_IPV6_MULTIPLE_TABLES
1802
1803static int ip6_pkt_prohibit(struct sk_buff *skb)
1804{
1805        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1806}
1807
1808static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1809{
1810        skb->dev = skb->dst->dev;
1811        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1812}
1813
/* Silently free @skb: no ICMPv6 error is sent and no counter is bumped. */
static int ip6_pkt_blk_hole(struct sk_buff *skb)
{
        kfree_skb(skb);

        return 0;
}
1819
1820#endif
1821
1822/*
1823 *      Allocate a dst for local (unicast / anycast) address.
1824 */
1825
1826struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1827                                    const struct in6_addr *addr,
1828                                    int anycast)
1829{
1830        struct rt6_info *rt = ip6_dst_alloc();
1831
1832        if (rt == NULL)
1833                return ERR_PTR(-ENOMEM);
1834
1835        dev_hold(init_net.loopback_dev);
1836        in6_dev_hold(idev);
1837
1838        rt->u.dst.flags = DST_HOST;
1839        rt->u.dst.input = ip6_input;
1840        rt->u.dst.output = ip6_output;
1841        rt->rt6i_dev = init_net.loopback_dev;
1842        rt->rt6i_idev = idev;
1843        rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1844        rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1845        rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1846        rt->u.dst.obsolete = -1;
1847
1848        rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1849        if (anycast)
1850                rt->rt6i_flags |= RTF_ANYCAST;
1851        else
1852                rt->rt6i_flags |= RTF_LOCAL;
1853        rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1854        if (rt->rt6i_nexthop == NULL) {
1855                dst_free(&rt->u.dst);
1856                return ERR_PTR(-ENOMEM);
1857        }
1858
1859        ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1860        rt->rt6i_dst.plen = 128;
1861        rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1862
1863        atomic_set(&rt->u.dst.__refcnt, 1);
1864
1865        return rt;
1866}
1867
1868static int fib6_ifdown(struct rt6_info *rt, void *arg)
1869{
1870        if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1871            rt != &ip6_null_entry) {
1872                RT6_TRACE("deleted by ifdown %p\n", rt);
1873                return -1;
1874        }
1875        return 0;
1876}
1877
1878void rt6_ifdown(struct net_device *dev)
1879{
1880        fib6_clean_all(fib6_ifdown, 0, dev);
1881}
1882
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
        struct net_device *dev;         /* device whose MTU changed */
        unsigned int mtu;               /* the new MTU */
};
1888
1889static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1890{
1891        struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1892        struct inet6_dev *idev;
1893
1894        /* In IPv6 pmtu discovery is not optional,
1895           so that RTAX_MTU lock cannot disable it.
1896           We still use this lock to block changes
1897           caused by addrconf/ndisc.
1898        */
1899
1900        idev = __in6_dev_get(arg->dev);
1901        if (idev == NULL)
1902                return 0;
1903
1904        /* For administrative MTU increase, there is no way to discover
1905           IPv6 PMTU increase, so PMTU increase should be updated here.
1906           Since RFC 1981 doesn't include administrative MTU increase
1907           update PMTU increase is a MUST. (i.e. jumbo frame)
1908         */
1909        /*
1910           If new MTU is less than route PMTU, this new MTU will be the
1911           lowest MTU in the path, update the route PMTU to reflect PMTU
1912           decreases; if new MTU is greater than route PMTU, and the
1913           old MTU is the lowest MTU in the path, update the route PMTU
1914           to reflect the increase. In this case if the other nodes' MTU
1915           also have the lowest MTU, TOO BIG MESSAGE will be lead to
1916           PMTU discouvery.
1917         */
1918        if (rt->rt6i_dev == arg->dev &&
1919            !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1920            (dst_mtu(&rt->u.dst) > arg->mtu ||
1921             (dst_mtu(&rt->u.dst) < arg->mtu &&
1922              dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1923                rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1924                rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1925        }
1926        return 0;
1927}
1928
1929void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1930{
1931        struct rt6_mtu_change_arg arg = {
1932                .dev = dev,
1933                .mtu = mtu,
1934        };
1935
1936        fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1937}
1938
1939static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
1940        [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1941        [RTA_OIF]               = { .type = NLA_U32 },
1942        [RTA_IIF]               = { .type = NLA_U32 },
1943        [RTA_PRIORITY]          = { .type = NLA_U32 },
1944        [RTA_METRICS]           = { .type = NLA_NESTED },
1945};
1946
1947static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1948                              struct fib6_config *cfg)
1949{
1950        struct rtmsg *rtm;
1951        struct nlattr *tb[RTA_MAX+1];
1952        int err;
1953
1954        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1955        if (err < 0)
1956                goto errout;
1957
1958        err = -EINVAL;
1959        rtm = nlmsg_data(nlh);
1960        memset(cfg, 0, sizeof(*cfg));
1961
1962        cfg->fc_table = rtm->rtm_table;
1963        cfg->fc_dst_len = rtm->rtm_dst_len;
1964        cfg->fc_src_len = rtm->rtm_src_len;
1965        cfg->fc_flags = RTF_UP;
1966        cfg->fc_protocol = rtm->rtm_protocol;
1967
1968        if (rtm->rtm_type == RTN_UNREACHABLE)
1969                cfg->fc_flags |= RTF_REJECT;
1970
1971        cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1972        cfg->fc_nlinfo.nlh = nlh;
1973
1974        if (tb[RTA_GATEWAY]) {
1975                nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1976                cfg->fc_flags |= RTF_GATEWAY;
1977        }
1978
1979        if (tb[RTA_DST]) {
1980                int plen = (rtm->rtm_dst_len + 7) >> 3;
1981
1982                if (nla_len(tb[RTA_DST]) < plen)
1983                        goto errout;
1984
1985                nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1986        }
1987
1988        if (tb[RTA_SRC]) {
1989                int plen = (rtm->rtm_src_len + 7) >> 3;
1990
1991                if (nla_len(tb[RTA_SRC]) < plen)
1992                        goto errout;
1993
1994                nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1995        }
1996
1997        if (tb[RTA_OIF])
1998                cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1999
2000        if (tb[RTA_PRIORITY])
2001                cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2002
2003        if (tb[RTA_METRICS]) {
2004                cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2005                cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2006        }
2007
2008        if (tb[RTA_TABLE])
2009                cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2010
2011        err = 0;
2012errout:
2013        return err;
2014}
2015
2016static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2017{
2018        struct fib6_config cfg;
2019        int err;
2020
2021        err = rtm_to_fib6_config(skb, nlh, &cfg);
2022        if (err < 0)
2023                return err;
2024
2025        return ip6_route_del(&cfg);
2026}
2027
2028static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2029{
2030        struct fib6_config cfg;
2031        int err;
2032
2033        err = rtm_to_fib6_config(skb, nlh, &cfg);
2034        if (err < 0)
2035                return err;
2036
2037        return ip6_route_add(&cfg);
2038}
2039
2040static inline size_t rt6_nlmsg_size(void)
2041{
2042        return NLMSG_ALIGN(sizeof(struct rtmsg))
2043               + nla_total_size(16) /* RTA_SRC */
2044               + nla_total_size(16) /* RTA_DST */
2045               + nla_total_size(16) /* RTA_GATEWAY */
2046               + nla_total_size(16) /* RTA_PREFSRC */
2047               + nla_total_size(4) /* RTA_TABLE */
2048               + nla_total_size(4) /* RTA_IIF */
2049               + nla_total_size(4) /* RTA_OIF */
2050               + nla_total_size(4) /* RTA_PRIORITY */
2051               + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2052               + nla_total_size(sizeof(struct rta_cacheinfo));
2053}
2054
2055static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2056                         struct in6_addr *dst, struct in6_addr *src,
2057                         int iif, int type, u32 pid, u32 seq,
2058                         int prefix, unsigned int flags)
2059{
2060        struct rtmsg *rtm;
2061        struct nlmsghdr *nlh;
2062        long expires;
2063        u32 table;
2064
2065        if (prefix) {   /* user wants prefix routes only */
2066                if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2067                        /* success since this is not a prefix route */
2068                        return 1;
2069                }
2070        }
2071
2072        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2073        if (nlh == NULL)
2074                return -EMSGSIZE;
2075
2076        rtm = nlmsg_data(nlh);
2077        rtm->rtm_family = AF_INET6;
2078        rtm->rtm_dst_len = rt->rt6i_dst.plen;
2079        rtm->rtm_src_len = rt->rt6i_src.plen;
2080        rtm->rtm_tos = 0;
2081        if (rt->rt6i_table)
2082                table = rt->rt6i_table->tb6_id;
2083        else
2084                table = RT6_TABLE_UNSPEC;
2085        rtm->rtm_table = table;
2086        NLA_PUT_U32(skb, RTA_TABLE, table);
2087        if (rt->rt6i_flags&RTF_REJECT)
2088                rtm->rtm_type = RTN_UNREACHABLE;
2089        else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2090                rtm->rtm_type = RTN_LOCAL;
2091        else
2092                rtm->rtm_type = RTN_UNICAST;
2093        rtm->rtm_flags = 0;
2094        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2095        rtm->rtm_protocol = rt->rt6i_protocol;
2096        if (rt->rt6i_flags&RTF_DYNAMIC)
2097                rtm->rtm_protocol = RTPROT_REDIRECT;
2098        else if (rt->rt6i_flags & RTF_ADDRCONF)
2099                rtm->rtm_protocol = RTPROT_KERNEL;
2100        else if (rt->rt6i_flags&RTF_DEFAULT)
2101                rtm->rtm_protocol = RTPROT_RA;
2102
2103        if (rt->rt6i_flags&RTF_CACHE)
2104                rtm->rtm_flags |= RTM_F_CLONED;
2105
2106        if (dst) {
2107                NLA_PUT(skb, RTA_DST, 16, dst);
2108                rtm->rtm_dst_len = 128;
2109        } else if (rtm->rtm_dst_len)
2110                NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2111#ifdef CONFIG_IPV6_SUBTREES
2112        if (src) {
2113                NLA_PUT(skb, RTA_SRC, 16, src);
2114                rtm->rtm_src_len = 128;
2115        } else if (rtm->rtm_src_len)
2116                NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2117#endif
2118        if (iif)
2119                NLA_PUT_U32(skb, RTA_IIF, iif);
2120        else if (dst) {
2121                struct in6_addr saddr_buf;
2122                if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2123                        NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2124        }
2125
2126        if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2127                goto nla_put_failure;
2128
2129        if (rt->u.dst.neighbour)
2130                NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2131
2132        if (rt->u.dst.dev)
2133                NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2134
2135        NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2136
2137        expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
2138        if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2139                               expires, rt->u.dst.error) < 0)
2140                goto nla_put_failure;
2141
2142        return nlmsg_end(skb, nlh);
2143
2144nla_put_failure:
2145        nlmsg_cancel(skb, nlh);
2146        return -EMSGSIZE;
2147}
2148
2149int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2150{
2151        struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2152        int prefix;
2153
2154        if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2155                struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2156                prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2157        } else
2158                prefix = 0;
2159
2160        return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2161                     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2162                     prefix, NLM_F_MULTI);
2163}
2164
2165static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2166{
2167        struct nlattr *tb[RTA_MAX+1];
2168        struct rt6_info *rt;
2169        struct sk_buff *skb;
2170        struct rtmsg *rtm;
2171        struct flowi fl;
2172        int err, iif = 0;
2173
2174        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2175        if (err < 0)
2176                goto errout;
2177
2178        err = -EINVAL;
2179        memset(&fl, 0, sizeof(fl));
2180
2181        if (tb[RTA_SRC]) {
2182                if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2183                        goto errout;
2184
2185                ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2186        }
2187
2188        if (tb[RTA_DST]) {
2189                if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2190                        goto errout;
2191
2192                ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2193        }
2194
2195        if (tb[RTA_IIF])
2196                iif = nla_get_u32(tb[RTA_IIF]);
2197
2198        if (tb[RTA_OIF])
2199                fl.oif = nla_get_u32(tb[RTA_OIF]);
2200
2201        if (iif) {
2202                struct net_device *dev;
2203                dev = __dev_get_by_index(&init_net, iif);
2204                if (!dev) {
2205                        err = -ENODEV;
2206                        goto errout;
2207                }
2208        }
2209
2210        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2211        if (skb == NULL) {
2212                err = -ENOBUFS;
2213                goto errout;
2214        }
2215
2216        /* Reserve room for dummy headers, this skb can pass
2217           through good chunk of routing engine.
2218         */
2219        skb_reset_mac_header(skb);
2220        skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2221
2222        rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2223        skb->dst = &rt->u.dst;
2224
2225        err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2226                            RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2227                            nlh->nlmsg_seq, 0, 0);
2228        if (err < 0) {
2229                kfree_skb(skb);
2230                goto errout;
2231        }
2232
2233        err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2234errout:
2235        return err;
2236}
2237
2238void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2239{
2240        struct sk_buff *skb;
2241        u32 pid = 0, seq = 0;
2242        struct nlmsghdr *nlh = NULL;
2243        int err = -ENOBUFS;
2244
2245        if (info) {
2246                pid = info->pid;
2247                nlh = info->nlh;
2248                if (nlh)
2249                        seq = nlh->nlmsg_seq;
2250        }
2251
2252        skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2253        if (skb == NULL)
2254                goto errout;
2255
2256        err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2257        if (err < 0) {
2258                /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2259                WARN_ON(err == -EMSGSIZE);
2260                kfree_skb(skb);
2261                goto errout;
2262        }
2263        err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2264errout:
2265        if (err < 0)
2266                rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2267}
2268
2269/*
2270 *      /proc
2271 */
2272
2273#ifdef CONFIG_PROC_FS
2274
/*
 * NOTE(review): neither RT6_INFO_LEN nor struct rt6_proc_arg is
 * referenced in the visible part of this file; confirm whether they are
 * still needed.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

struct rt6_proc_arg {
        char *buffer;
        int offset;
        int length;
        int skip;
        int len;
};
2285
2286static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2287{
2288        struct seq_file *m = p_arg;
2289
2290        seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_dst.addr),
2291                   rt->rt6i_dst.plen);
2292
2293#ifdef CONFIG_IPV6_SUBTREES
2294        seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_src.addr),
2295                   rt->rt6i_src.plen);
2296#else
2297        seq_puts(m, "00000000000000000000000000000000 00 ");
2298#endif
2299
2300        if (rt->rt6i_nexthop) {
2301                seq_printf(m, NIP6_SEQFMT,
2302                           NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2303        } else {
2304                seq_puts(m, "00000000000000000000000000000000");
2305        }
2306        seq_printf(m, " %08x %08x %08x %08x %8s\n",
2307                   rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2308                   rt->u.dst.__use, rt->rt6i_flags,
2309                   rt->rt6i_dev ? rt->rt6i_dev->name : "");
2310        return 0;
2311}
2312
2313static int ipv6_route_show(struct seq_file *m, void *v)
2314{
2315        fib6_clean_all(rt6_info_route, 0, m);
2316        return 0;
2317}
2318
2319static int ipv6_route_open(struct inode *inode, struct file *file)
2320{
2321        return single_open(file, ipv6_route_show, NULL);
2322}
2323
2324static const struct file_operations ipv6_route_proc_fops = {
2325        .owner          = THIS_MODULE,
2326        .open           = ipv6_route_open,
2327        .read           = seq_read,
2328        .llseek         = seq_lseek,
2329        .release        = single_release,
2330};
2331
2332static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2333{
2334        seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2335                      rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2336                      rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2337                      rt6_stats.fib_rt_cache,
2338                      atomic_read(&ip6_dst_ops.entries),
2339                      rt6_stats.fib_discarded_routes);
2340
2341        return 0;
2342}
2343
2344static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2345{
2346        return single_open(file, rt6_stats_seq_show, NULL);
2347}
2348
2349static const struct file_operations rt6_stats_seq_fops = {
2350        .owner   = THIS_MODULE,
2351        .open    = rt6_stats_seq_open,
2352        .read    = seq_read,
2353        .llseek  = seq_lseek,
2354        .release = single_release,
2355};
2356#endif  /* CONFIG_PROC_FS */
2357
2358#ifdef CONFIG_SYSCTL
2359
2360static int flush_delay;
2361
2362static
2363int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2364                              void __user *buffer, size_t *lenp, loff_t *ppos)
2365{
2366        if (write) {
2367                proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2368                fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2369                return 0;
2370        } else
2371                return -EINVAL;
2372}
2373
2374ctl_table ipv6_route_table[] = {
2375        {
2376                .procname       =       "flush",
2377                .data           =       &flush_delay,
2378                .maxlen         =       sizeof(int),
2379                .mode           =       0200,
2380                .proc_handler   =       &ipv6_sysctl_rtcache_flush
2381        },
2382        {
2383                .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2384                .procname       =       "gc_thresh",
2385                .data           =       &ip6_dst_ops.gc_thresh,
2386                .maxlen         =       sizeof(int),
2387                .mode           =       0644,
2388                .proc_handler   =       &proc_dointvec,
2389        },
2390        {
2391                .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2392                .procname       =       "max_size",
2393                .data           =       &ip6_rt_max_size,
2394                .maxlen         =       sizeof(int),
2395                .mode           =       0644,
2396                .proc_handler   =       &proc_dointvec,
2397        },
2398        {
2399                .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2400                .procname       =       "gc_min_interval",
2401                .data           =       &ip6_rt_gc_min_interval,
2402                .maxlen         =       sizeof(int),
2403                .mode           =       0644,
2404                .proc_handler   =       &proc_dointvec_jiffies,
2405                .strategy       =       &sysctl_jiffies,
2406        },
2407        {
2408                .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2409                .procname       =       "gc_timeout",
2410                .data           =       &ip6_rt_gc_timeout,
2411                .maxlen         =       sizeof(int),
2412                .mode           =       0644,
2413                .proc_handler   =       &proc_dointvec_jiffies,
2414                .strategy       =       &sysctl_jiffies,
2415        },
2416        {
2417                .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2418                .procname       =       "gc_interval",
2419                .data           =       &ip6_rt_gc_interval,
2420                .maxlen         =       sizeof(int),
2421                .mode           =       0644,
2422                .proc_handler   =       &proc_dointvec_jiffies,
2423                .strategy       =       &sysctl_jiffies,
2424        },
2425        {
2426                .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2427                .procname       =       "gc_elasticity",
2428                .data           =       &ip6_rt_gc_elasticity,
2429                .maxlen         =       sizeof(int),
2430                .mode           =       0644,
2431                .proc_handler   =       &proc_dointvec_jiffies,
2432                .strategy       =       &sysctl_jiffies,
2433        },
2434        {
2435                .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2436                .procname       =       "mtu_expires",
2437                .data           =       &ip6_rt_mtu_expires,
2438                .maxlen         =       sizeof(int),
2439                .mode           =       0644,
2440                .proc_handler   =       &proc_dointvec_jiffies,
2441                .strategy       =       &sysctl_jiffies,
2442        },
2443        {
2444                .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2445                .procname       =       "min_adv_mss",
2446                .data           =       &ip6_rt_min_advmss,
2447                .maxlen         =       sizeof(int),
2448                .mode           =       0644,
2449                .proc_handler   =       &proc_dointvec_jiffies,
2450                .strategy       =       &sysctl_jiffies,
2451        },
2452        {
2453                .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2454                .procname       =       "gc_min_interval_ms",
2455                .data           =       &ip6_rt_gc_min_interval,
2456                .maxlen         =       sizeof(int),
2457                .mode           =       0644,
2458                .proc_handler   =       &proc_dointvec_ms_jiffies,
2459                .strategy       =       &sysctl_ms_jiffies,
2460        },
2461        { .ctl_name = 0 }
2462};
2463
2464#endif
2465
2466void __init ip6_route_init(void)
2467{
2468        ip6_dst_ops.kmem_cachep =
2469                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2470                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2471        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops.kmem_cachep;
2472
2473        fib6_init();
2474        proc_net_fops_create(&init_net, "ipv6_route", 0, &ipv6_route_proc_fops);
2475        proc_net_fops_create(&init_net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2476#ifdef CONFIG_XFRM
2477        xfrm6_init();
2478#endif
2479#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2480        fib6_rules_init();
2481#endif
2482
2483        __rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL);
2484        __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL);
2485        __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL);
2486}
2487
2488void ip6_route_cleanup(void)
2489{
2490#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2491        fib6_rules_cleanup();
2492#endif
2493#ifdef CONFIG_PROC_FS
2494        proc_net_remove(&init_net, "ipv6_route");
2495        proc_net_remove(&init_net, "rt6_stats");
2496#endif
2497#ifdef CONFIG_XFRM
2498        xfrm6_fini();
2499#endif
2500        rt6_ifdown(NULL);
2501        fib6_gc_cleanup();
2502        kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2503}
2504