linux/net/ipv6/route.c
<<
>>
Prefs
   1/*
   2 *      Linux INET6 implementation
   3 *      FIB front-end.
   4 *
   5 *      Authors:
   6 *      Pedro Roque             <roque@di.fc.ul.pt>
   7 *
   8 *      This program is free software; you can redistribute it and/or
   9 *      modify it under the terms of the GNU General Public License
  10 *      as published by the Free Software Foundation; either version
  11 *      2 of the License, or (at your option) any later version.
  12 */
  13
  14/*      Changes:
  15 *
  16 *      YOSHIFUJI Hideaki @USAGI
  17 *              reworked default router selection.
  18 *              - respect outgoing interface
  19 *              - select from (probably) reachable routers (i.e.
  20 *              routers in REACHABLE, STALE, DELAY or PROBE states).
  21 *              - always select the same router if it is (probably)
  22 *              reachable.  otherwise, round-robin the list.
  23 *      Ville Nuorvala
  24 *              Fixed routing subtrees.
  25 */
  26
  27#define pr_fmt(fmt) "IPv6: " fmt
  28
  29#include <linux/capability.h>
  30#include <linux/errno.h>
  31#include <linux/export.h>
  32#include <linux/types.h>
  33#include <linux/times.h>
  34#include <linux/socket.h>
  35#include <linux/sockios.h>
  36#include <linux/net.h>
  37#include <linux/route.h>
  38#include <linux/netdevice.h>
  39#include <linux/in6.h>
  40#include <linux/mroute6.h>
  41#include <linux/init.h>
  42#include <linux/if_arp.h>
  43#include <linux/proc_fs.h>
  44#include <linux/seq_file.h>
  45#include <linux/nsproxy.h>
  46#include <linux/slab.h>
  47#include <net/net_namespace.h>
  48#include <net/snmp.h>
  49#include <net/ipv6.h>
  50#include <net/ip6_fib.h>
  51#include <net/ip6_route.h>
  52#include <net/ndisc.h>
  53#include <net/addrconf.h>
  54#include <net/tcp.h>
  55#include <linux/rtnetlink.h>
  56#include <net/dst.h>
  57#include <net/dst_metadata.h>
  58#include <net/xfrm.h>
  59#include <net/netevent.h>
  60#include <net/netlink.h>
  61#include <net/nexthop.h>
  62#include <net/lwtunnel.h>
  63#include <net/ip_tunnels.h>
  64#include <net/l3mdev.h>
  65#include <trace/events/fib6.h>
  66
  67#include <linux/uaccess.h>
  68
  69#ifdef CONFIG_SYSCTL
  70#include <linux/sysctl.h>
  71#endif
  72
/*
 * Result of the next-hop neighbour check done by rt6_check_neigh(); the
 * negative values are also propagated by rt6_score_route() in place of a
 * score.
 *
 * RT6_NUD_FAIL_HARD:  initial value in rt6_check_neigh(), and what
 *                     rt6_score_route() returns on a device mismatch under
 *                     RT6_LOOKUP_F_IFACE; find_match() skips such a route.
 * RT6_NUD_FAIL_PROBE: neighbour is in NUD_FAILED (only produced when
 *                     CONFIG_IPV6_ROUTER_PREF is set).
 * RT6_NUD_FAIL_DO_RR: no neighbour entry and CONFIG_IPV6_ROUTER_PREF is off;
 *                     find_match() scores the route 0 and asks the caller to
 *                     round-robin.
 * RT6_NUD_SUCCEED:    the next hop is considered usable.
 */
   73enum rt6_nud_state {
   74        RT6_NUD_FAIL_HARD = -3,
   75        RT6_NUD_FAIL_PROBE = -2,
   76        RT6_NUD_FAIL_DO_RR = -1,
   77        RT6_NUD_SUCCEED = 1
   78};
  79
/*
 * Forward declarations. Several of these are referenced further down (for
 * example by the dst_ops tables and the route templates) before a definition
 * is visible at that point in the file.
 */
   80static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
   81static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
   82static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
   83static unsigned int      ip6_mtu(const struct dst_entry *dst);
   84static struct dst_entry *ip6_negative_advice(struct dst_entry *);
   85static void             ip6_dst_destroy(struct dst_entry *);
   86static void             ip6_dst_ifdown(struct dst_entry *,
   87                                       struct net_device *dev, int how);
   88static int               ip6_dst_gc(struct dst_ops *ops);
   89
   90static int              ip6_pkt_discard(struct sk_buff *skb);
   91static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
   92static int              ip6_pkt_prohibit(struct sk_buff *skb);
   93static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
   94static void             ip6_link_failure(struct sk_buff *skb);
   95static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
   96                                           struct sk_buff *skb, u32 mtu);
   97static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
   98                                        struct sk_buff *skb);
   99static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
  100static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
  101static size_t rt6_nlmsg_size(struct rt6_info *rt);
  102static int rt6_fill_node(struct net *net,
  103                         struct sk_buff *skb, struct rt6_info *rt,
  104                         struct in6_addr *dst, struct in6_addr *src,
  105                         int iif, int type, u32 portid, u32 seq,
  106                         unsigned int flags);
  107
  108#ifdef CONFIG_IPV6_ROUTE_INFO
  109static struct rt6_info *rt6_add_route_info(struct net *net,
  110                                           const struct in6_addr *prefix, int prefixlen,
  111                                           const struct in6_addr *gwaddr,
  112                                           struct net_device *dev,
  113                                           unsigned int pref);
  114static struct rt6_info *rt6_get_route_info(struct net *net,
  115                                           const struct in6_addr *prefix, int prefixlen,
  116                                           const struct in6_addr *gwaddr,
  117                                           struct net_device *dev);
  118#endif
 119
/*
 * A lock-protected list of rt6_info entries, linked through their
 * rt6i_uncached member. One instance exists per CPU (rt6_uncached_list);
 * entries are added by rt6_uncached_list_add(), which also flags them
 * DST_NOCACHE and records the owning list in rt6i_uncached_list.
 */
  120struct uncached_list {
  121        spinlock_t              lock;
  122        struct list_head        head;
  123};
  124
  125static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
 126
 127static void rt6_uncached_list_add(struct rt6_info *rt)
 128{
 129        struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
 130
 131        rt->dst.flags |= DST_NOCACHE;
 132        rt->rt6i_uncached_list = ul;
 133
 134        spin_lock_bh(&ul->lock);
 135        list_add_tail(&rt->rt6i_uncached, &ul->head);
 136        spin_unlock_bh(&ul->lock);
 137}
 138
 139static void rt6_uncached_list_del(struct rt6_info *rt)
 140{
 141        if (!list_empty(&rt->rt6i_uncached)) {
 142                struct uncached_list *ul = rt->rt6i_uncached_list;
 143
 144                spin_lock_bh(&ul->lock);
 145                list_del(&rt->rt6i_uncached);
 146                spin_unlock_bh(&ul->lock);
 147        }
 148}
 149
 150static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
 151{
 152        struct net_device *loopback_dev = net->loopback_dev;
 153        int cpu;
 154
 155        if (dev == loopback_dev)
 156                return;
 157
 158        for_each_possible_cpu(cpu) {
 159                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
 160                struct rt6_info *rt;
 161
 162                spin_lock_bh(&ul->lock);
 163                list_for_each_entry(rt, &ul->head, rt6i_uncached) {
 164                        struct inet6_dev *rt_idev = rt->rt6i_idev;
 165                        struct net_device *rt_dev = rt->dst.dev;
 166
 167                        if (rt_idev->dev == dev) {
 168                                rt->rt6i_idev = in6_dev_get(loopback_dev);
 169                                in6_dev_put(rt_idev);
 170                        }
 171
 172                        if (rt_dev == dev) {
 173                                rt->dst.dev = loopback_dev;
 174                                dev_hold(rt->dst.dev);
 175                                dev_put(rt_dev);
 176                        }
 177                }
 178                spin_unlock_bh(&ul->lock);
 179        }
 180}
 181
 182static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
 183{
 184        return dst_metrics_write_ptr(rt->dst.from);
 185}
 186
 187static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
 188{
 189        struct rt6_info *rt = (struct rt6_info *)dst;
 190
 191        if (rt->rt6i_flags & RTF_PCPU)
 192                return rt6_pcpu_cow_metrics(rt);
 193        else if (rt->rt6i_flags & RTF_CACHE)
 194                return NULL;
 195        else
 196                return dst_cow_metrics_generic(dst, old);
 197}
 198
 199static inline const void *choose_neigh_daddr(struct rt6_info *rt,
 200                                             struct sk_buff *skb,
 201                                             const void *daddr)
 202{
 203        struct in6_addr *p = &rt->rt6i_gateway;
 204
 205        if (!ipv6_addr_any(p))
 206                return (const void *) p;
 207        else if (skb)
 208                return &ipv6_hdr(skb)->daddr;
 209        return daddr;
 210}
 211
 212static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
 213                                          struct sk_buff *skb,
 214                                          const void *daddr)
 215{
 216        struct rt6_info *rt = (struct rt6_info *) dst;
 217        struct neighbour *n;
 218
 219        daddr = choose_neigh_daddr(rt, skb, daddr);
 220        n = __ipv6_neigh_lookup(dst->dev, daddr);
 221        if (n)
 222                return n;
 223        return neigh_create(&nd_tbl, daddr, dst->dev);
 224}
 225
 226static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 227{
 228        struct net_device *dev = dst->dev;
 229        struct rt6_info *rt = (struct rt6_info *)dst;
 230
 231        daddr = choose_neigh_daddr(rt, NULL, daddr);
 232        if (!daddr)
 233                return;
 234        if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
 235                return;
 236        if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
 237                return;
 238        __ipv6_confirm_neigh(dev, daddr);
 239}
 240
/*
 * dst_ops callback table for AF_INET6 routes, wiring in the handlers
 * declared or defined in this file, with a gc threshold of 1024.
 * NOTE(review): the name and the use of net->ipv6.ip6_dst_ops in
 * __ip6_dst_alloc() suggest this is copied into each network namespace;
 * the copy is not visible in this part of the file - confirm.
 */
  241static struct dst_ops ip6_dst_ops_template = {
  242        .family                 =       AF_INET6,
  243        .gc                     =       ip6_dst_gc,
  244        .gc_thresh              =       1024,
  245        .check                  =       ip6_dst_check,
  246        .default_advmss         =       ip6_default_advmss,
  247        .mtu                    =       ip6_mtu,
  248        .cow_metrics            =       ipv6_cow_metrics,
  249        .destroy                =       ip6_dst_destroy,
  250        .ifdown                 =       ip6_dst_ifdown,
  251        .negative_advice        =       ip6_negative_advice,
  252        .link_failure           =       ip6_link_failure,
  253        .update_pmtu            =       ip6_rt_update_pmtu,
  254        .redirect               =       rt6_do_redirect,
  255        .local_out              =       __ip6_local_out,
  256        .neigh_lookup           =       ip6_neigh_lookup,
  257        .confirm_neigh          =       ip6_confirm_neigh,
  258};
 259
 260static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
 261{
 262        unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
 263
 264        return mtu ? : dst->dev->mtu;
 265}
 266
/* update_pmtu handler for ip6_dst_blackhole_ops: intentionally does nothing. */
  267static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
  268                                         struct sk_buff *skb, u32 mtu)
  269{
  270}
 271
/* redirect handler for ip6_dst_blackhole_ops: intentionally does nothing. */
  272static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
  273                                      struct sk_buff *skb)
  274{
  275}
 276
/*
 * dst_ops for blackhole dsts. PMTU updates and redirects are routed to the
 * empty handlers above, the MTU comes from ip6_blackhole_mtu(), and metrics
 * use the generic copy-on-write helper. Unlike ip6_dst_ops_template there is
 * no gc, ifdown, negative_advice, link_failure, local_out or confirm_neigh
 * callback.
 */
  277static struct dst_ops ip6_dst_blackhole_ops = {
  278        .family                 =       AF_INET6,
  279        .destroy                =       ip6_dst_destroy,
  280        .check                  =       ip6_dst_check,
  281        .mtu                    =       ip6_blackhole_mtu,
  282        .default_advmss         =       ip6_default_advmss,
  283        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
  284        .redirect               =       ip6_rt_blackhole_redirect,
  285        .cow_metrics            =       dst_cow_metrics_generic,
  286        .neigh_lookup           =       ip6_neigh_lookup,
  287};
 288
/*
 * Read-only metrics array of RTAX_MAX entries. Only the hop-limit slot is
 * spelled out (as 0); the remaining slots are implicitly zero as well.
 * NOTE(review): its users are not visible in this part of the file.
 */
  289static const u32 ip6_template_metrics[RTAX_MAX] = {
  290        [RTAX_HOPLIMIT - 1] = 0,
  291};
 292
/*
 * Template for the "no route" entry: a reject route (RTF_REJECT |
 * RTF_NONEXTHOP) carrying -ENETUNREACH whose input/output handlers are
 * ip6_pkt_discard()/ip6_pkt_discard_out(). It uses the largest possible
 * metric (~0) and starts with both reference counts at 1.
 * NOTE(review): presumably the source of the per-namespace
 * net->ipv6.ip6_null_entry returned by the lookup code below - confirm.
 */
  293static const struct rt6_info ip6_null_entry_template = {
  294        .dst = {
  295                .__refcnt       = ATOMIC_INIT(1),
  296                .__use          = 1,
  297                .obsolete       = DST_OBSOLETE_FORCE_CHK,
  298                .error          = -ENETUNREACH,
  299                .input          = ip6_pkt_discard,
  300                .output         = ip6_pkt_discard_out,
  301        },
  302        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
  303        .rt6i_protocol  = RTPROT_KERNEL,
  304        .rt6i_metric    = ~(u32) 0,
  305        .rt6i_ref       = ATOMIC_INIT(1),
  306};
 307
 308#ifdef CONFIG_IPV6_MULTIPLE_TABLES
 309
/*
 * Template for the "prohibit" entry (only built with
 * CONFIG_IPV6_MULTIPLE_TABLES): same shape as the null entry, but the error
 * is -EACCES and packets go to ip6_pkt_prohibit()/ip6_pkt_prohibit_out().
 */
  310static const struct rt6_info ip6_prohibit_entry_template = {
  311        .dst = {
  312                .__refcnt       = ATOMIC_INIT(1),
  313                .__use          = 1,
  314                .obsolete       = DST_OBSOLETE_FORCE_CHK,
  315                .error          = -EACCES,
  316                .input          = ip6_pkt_prohibit,
  317                .output         = ip6_pkt_prohibit_out,
  318        },
  319        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
  320        .rt6i_protocol  = RTPROT_KERNEL,
  321        .rt6i_metric    = ~(u32) 0,
  322        .rt6i_ref       = ATOMIC_INIT(1),
  323};
 324
/*
 * Template for the "blackhole" entry (only built with
 * CONFIG_IPV6_MULTIPLE_TABLES): same shape as the null entry, but the error
 * is -EINVAL and packets are handed to dst_discard()/dst_discard_out().
 */
  325static const struct rt6_info ip6_blk_hole_entry_template = {
  326        .dst = {
  327                .__refcnt       = ATOMIC_INIT(1),
  328                .__use          = 1,
  329                .obsolete       = DST_OBSOLETE_FORCE_CHK,
  330                .error          = -EINVAL,
  331                .input          = dst_discard,
  332                .output         = dst_discard_out,
  333        },
  334        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
  335        .rt6i_protocol  = RTPROT_KERNEL,
  336        .rt6i_metric    = ~(u32) 0,
  337        .rt6i_ref       = ATOMIC_INIT(1),
  338};
 339
 340#endif
 341
 342static void rt6_info_init(struct rt6_info *rt)
 343{
 344        struct dst_entry *dst = &rt->dst;
 345
 346        memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
 347        INIT_LIST_HEAD(&rt->rt6i_siblings);
 348        INIT_LIST_HEAD(&rt->rt6i_uncached);
 349}
 350
 351/* allocate dst with ip6_dst_ops */
 352static struct rt6_info *__ip6_dst_alloc(struct net *net,
 353                                        struct net_device *dev,
 354                                        int flags)
 355{
 356        struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
 357                                        0, DST_OBSOLETE_FORCE_CHK, flags);
 358
 359        if (rt)
 360                rt6_info_init(rt);
 361
 362        return rt;
 363}
 364
 365struct rt6_info *ip6_dst_alloc(struct net *net,
 366                               struct net_device *dev,
 367                               int flags)
 368{
 369        struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
 370
 371        if (rt) {
 372                rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
 373                if (rt->rt6i_pcpu) {
 374                        int cpu;
 375
 376                        for_each_possible_cpu(cpu) {
 377                                struct rt6_info **p;
 378
 379                                p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
 380                                /* no one shares rt */
 381                                *p =  NULL;
 382                        }
 383                } else {
 384                        dst_destroy((struct dst_entry *)rt);
 385                        return NULL;
 386                }
 387        }
 388
 389        return rt;
 390}
 391EXPORT_SYMBOL(ip6_dst_alloc);
 392
 393static void ip6_dst_destroy(struct dst_entry *dst)
 394{
 395        struct rt6_info *rt = (struct rt6_info *)dst;
 396        struct dst_entry *from = dst->from;
 397        struct inet6_dev *idev;
 398
 399        dst_destroy_metrics_generic(dst);
 400        free_percpu(rt->rt6i_pcpu);
 401        rt6_uncached_list_del(rt);
 402
 403        idev = rt->rt6i_idev;
 404        if (idev) {
 405                rt->rt6i_idev = NULL;
 406                in6_dev_put(idev);
 407        }
 408
 409        dst->from = NULL;
 410        dst_release(from);
 411}
 412
 413static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 414                           int how)
 415{
 416        struct rt6_info *rt = (struct rt6_info *)dst;
 417        struct inet6_dev *idev = rt->rt6i_idev;
 418        struct net_device *loopback_dev =
 419                dev_net(dev)->loopback_dev;
 420
 421        if (dev != loopback_dev) {
 422                if (idev && idev->dev == dev) {
 423                        struct inet6_dev *loopback_idev =
 424                                in6_dev_get(loopback_dev);
 425                        if (loopback_idev) {
 426                                rt->rt6i_idev = loopback_idev;
 427                                in6_dev_put(idev);
 428                        }
 429                }
 430        }
 431}
 432
 433static bool __rt6_check_expired(const struct rt6_info *rt)
 434{
 435        if (rt->rt6i_flags & RTF_EXPIRES)
 436                return time_after(jiffies, rt->dst.expires);
 437        else
 438                return false;
 439}
 440
 441static bool rt6_check_expired(const struct rt6_info *rt)
 442{
 443        if (rt->rt6i_flags & RTF_EXPIRES) {
 444                if (time_after(jiffies, rt->dst.expires))
 445                        return true;
 446        } else if (rt->dst.from) {
 447                return rt6_check_expired((struct rt6_info *) rt->dst.from);
 448        }
 449        return false;
 450}
 451
 452/* Multipath route selection:
 453 *   Hash based function using packet header and flowlabel.
 454 * Adapted from fib_info_hashfn()
 455 */
 456static int rt6_info_hash_nhsfn(unsigned int candidate_count,
 457                               const struct flowi6 *fl6)
 458{
 459        return get_hash_from_flowi6(fl6) % candidate_count;
 460}
 461
 462static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
 463                                             struct flowi6 *fl6, int oif,
 464                                             int strict)
 465{
 466        struct rt6_info *sibling, *next_sibling;
 467        int route_choosen;
 468
 469        route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
 470        /* Don't change the route, if route_choosen == 0
 471         * (siblings does not include ourself)
 472         */
 473        if (route_choosen)
 474                list_for_each_entry_safe(sibling, next_sibling,
 475                                &match->rt6i_siblings, rt6i_siblings) {
 476                        route_choosen--;
 477                        if (route_choosen == 0) {
 478                                if (rt6_score_route(sibling, oif, strict) < 0)
 479                                        break;
 480                                match = sibling;
 481                                break;
 482                        }
 483                }
 484        return match;
 485}
 486
 487/*
 488 *      Route lookup. Any table->tb6_lock is implied.
 489 */
 490
 491static inline struct rt6_info *rt6_device_match(struct net *net,
 492                                                    struct rt6_info *rt,
 493                                                    const struct in6_addr *saddr,
 494                                                    int oif,
 495                                                    int flags)
 496{
 497        struct rt6_info *local = NULL;
 498        struct rt6_info *sprt;
 499
 500        if (!oif && ipv6_addr_any(saddr))
 501                goto out;
 502
 503        for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
 504                struct net_device *dev = sprt->dst.dev;
 505
 506                if (oif) {
 507                        if (dev->ifindex == oif)
 508                                return sprt;
 509                        if (dev->flags & IFF_LOOPBACK) {
 510                                if (!sprt->rt6i_idev ||
 511                                    sprt->rt6i_idev->dev->ifindex != oif) {
 512                                        if (flags & RT6_LOOKUP_F_IFACE)
 513                                                continue;
 514                                        if (local &&
 515                                            local->rt6i_idev->dev->ifindex == oif)
 516                                                continue;
 517                                }
 518                                local = sprt;
 519                        }
 520                } else {
 521                        if (ipv6_chk_addr(net, saddr, dev,
 522                                          flags & RT6_LOOKUP_F_IFACE))
 523                                return sprt;
 524                }
 525        }
 526
 527        if (oif) {
 528                if (local)
 529                        return local;
 530
 531                if (flags & RT6_LOOKUP_F_IFACE)
 532                        return net->ipv6.ip6_null_entry;
 533        }
 534out:
 535        return rt;
 536}
 537
 538#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Deferred router-probe request handed from rt6_probe() to
 * rt6_probe_deferred() through the workqueue. rt6_probe() takes a reference
 * on @dev with dev_hold(); the handler drops it and frees the structure.
 */
  539struct __rt6_probe_work {
  540        struct work_struct work;
  541        struct in6_addr target;
  542        struct net_device *dev;
  543};
 544
 545static void rt6_probe_deferred(struct work_struct *w)
 546{
 547        struct in6_addr mcaddr;
 548        struct __rt6_probe_work *work =
 549                container_of(w, struct __rt6_probe_work, work);
 550
 551        addrconf_addr_solict_mult(&work->target, &mcaddr);
 552        ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
 553        dev_put(work->dev);
 554        kfree(work);
 555}
 556
 557static void rt6_probe(struct rt6_info *rt)
 558{
 559        struct __rt6_probe_work *work;
 560        struct neighbour *neigh;
 561        /*
 562         * Okay, this does not seem to be appropriate
 563         * for now, however, we need to check if it
 564         * is really so; aka Router Reachability Probing.
 565         *
 566         * Router Reachability Probe MUST be rate-limited
 567         * to no more than one per minute.
 568         */
 569        if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
 570                return;
 571        rcu_read_lock_bh();
 572        neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
 573        if (neigh) {
 574                if (neigh->nud_state & NUD_VALID)
 575                        goto out;
 576
 577                work = NULL;
 578                write_lock(&neigh->lock);
 579                if (!(neigh->nud_state & NUD_VALID) &&
 580                    time_after(jiffies,
 581                               neigh->updated +
 582                               rt->rt6i_idev->cnf.rtr_probe_interval)) {
 583                        work = kmalloc(sizeof(*work), GFP_ATOMIC);
 584                        if (work)
 585                                __neigh_set_probe_once(neigh);
 586                }
 587                write_unlock(&neigh->lock);
 588        } else {
 589                work = kmalloc(sizeof(*work), GFP_ATOMIC);
 590        }
 591
 592        if (work) {
 593                INIT_WORK(&work->work, rt6_probe_deferred);
 594                work->target = rt->rt6i_gateway;
 595                dev_hold(rt->dst.dev);
 596                work->dev = rt->dst.dev;
 597                schedule_work(&work->work);
 598        }
 599
 600out:
 601        rcu_read_unlock_bh();
 602}
 603#else
/* Without CONFIG_IPV6_ROUTER_PREF router probing is compiled out: no-op stub. */
  604static inline void rt6_probe(struct rt6_info *rt)
  605{
  606}
 607#endif
 608
 609/*
 610 * Default Router Selection (RFC 2461 6.3.6)
 611 */
 612static inline int rt6_check_dev(struct rt6_info *rt, int oif)
 613{
 614        struct net_device *dev = rt->dst.dev;
 615        if (!oif || dev->ifindex == oif)
 616                return 2;
 617        if ((dev->flags & IFF_LOOPBACK) &&
 618            rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
 619                return 1;
 620        return 0;
 621}
 622
 623static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
 624{
 625        struct neighbour *neigh;
 626        enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
 627
 628        if (rt->rt6i_flags & RTF_NONEXTHOP ||
 629            !(rt->rt6i_flags & RTF_GATEWAY))
 630                return RT6_NUD_SUCCEED;
 631
 632        rcu_read_lock_bh();
 633        neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
 634        if (neigh) {
 635                read_lock(&neigh->lock);
 636                if (neigh->nud_state & NUD_VALID)
 637                        ret = RT6_NUD_SUCCEED;
 638#ifdef CONFIG_IPV6_ROUTER_PREF
 639                else if (!(neigh->nud_state & NUD_FAILED))
 640                        ret = RT6_NUD_SUCCEED;
 641                else
 642                        ret = RT6_NUD_FAIL_PROBE;
 643#endif
 644                read_unlock(&neigh->lock);
 645        } else {
 646                ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
 647                      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
 648        }
 649        rcu_read_unlock_bh();
 650
 651        return ret;
 652}
 653
 654static int rt6_score_route(struct rt6_info *rt, int oif,
 655                           int strict)
 656{
 657        int m;
 658
 659        m = rt6_check_dev(rt, oif);
 660        if (!m && (strict & RT6_LOOKUP_F_IFACE))
 661                return RT6_NUD_FAIL_HARD;
 662#ifdef CONFIG_IPV6_ROUTER_PREF
 663        m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
 664#endif
 665        if (strict & RT6_LOOKUP_F_REACHABLE) {
 666                int n = rt6_check_neigh(rt);
 667                if (n < 0)
 668                        return n;
 669        }
 670        return m;
 671}
 672
 673static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
 674                                   int *mpri, struct rt6_info *match,
 675                                   bool *do_rr)
 676{
 677        int m;
 678        bool match_do_rr = false;
 679        struct inet6_dev *idev = rt->rt6i_idev;
 680        struct net_device *dev = rt->dst.dev;
 681
 682        if (dev && !netif_carrier_ok(dev) &&
 683            idev->cnf.ignore_routes_with_linkdown &&
 684            !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
 685                goto out;
 686
 687        if (rt6_check_expired(rt))
 688                goto out;
 689
 690        m = rt6_score_route(rt, oif, strict);
 691        if (m == RT6_NUD_FAIL_DO_RR) {
 692                match_do_rr = true;
 693                m = 0; /* lowest valid score */
 694        } else if (m == RT6_NUD_FAIL_HARD) {
 695                goto out;
 696        }
 697
 698        if (strict & RT6_LOOKUP_F_REACHABLE)
 699                rt6_probe(rt);
 700
 701        /* note that m can be RT6_NUD_FAIL_PROBE at this point */
 702        if (m > *mpri) {
 703                *do_rr = match_do_rr;
 704                *mpri = m;
 705                match = rt;
 706        }
 707out:
 708        return match;
 709}
 710
 711static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
 712                                     struct rt6_info *rr_head,
 713                                     u32 metric, int oif, int strict,
 714                                     bool *do_rr)
 715{
 716        struct rt6_info *rt, *match, *cont;
 717        int mpri = -1;
 718
 719        match = NULL;
 720        cont = NULL;
 721        for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
 722                if (rt->rt6i_metric != metric) {
 723                        cont = rt;
 724                        break;
 725                }
 726
 727                match = find_match(rt, oif, strict, &mpri, match, do_rr);
 728        }
 729
 730        for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
 731                if (rt->rt6i_metric != metric) {
 732                        cont = rt;
 733                        break;
 734                }
 735
 736                match = find_match(rt, oif, strict, &mpri, match, do_rr);
 737        }
 738
 739        if (match || !cont)
 740                return match;
 741
 742        for (rt = cont; rt; rt = rt->dst.rt6_next)
 743                match = find_match(rt, oif, strict, &mpri, match, do_rr);
 744
 745        return match;
 746}
 747
 748static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
 749{
 750        struct rt6_info *match, *rt0;
 751        struct net *net;
 752        bool do_rr = false;
 753
 754        rt0 = fn->rr_ptr;
 755        if (!rt0)
 756                fn->rr_ptr = rt0 = fn->leaf;
 757
 758        match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
 759                             &do_rr);
 760
 761        if (do_rr) {
 762                struct rt6_info *next = rt0->dst.rt6_next;
 763
 764                /* no entries matched; do round-robin */
 765                if (!next || next->rt6i_metric != rt0->rt6i_metric)
 766                        next = fn->leaf;
 767
 768                if (next != rt0)
 769                        fn->rr_ptr = next;
 770        }
 771
 772        net = dev_net(rt0->dst.dev);
 773        return match ? match : net->ipv6.ip6_null_entry;
 774}
 775
 776static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
 777{
 778        return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
 779}
 780
 781#ifdef CONFIG_IPV6_ROUTE_INFO
 782int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 783                  const struct in6_addr *gwaddr)
 784{
 785        struct net *net = dev_net(dev);
 786        struct route_info *rinfo = (struct route_info *) opt;
 787        struct in6_addr prefix_buf, *prefix;
 788        unsigned int pref;
 789        unsigned long lifetime;
 790        struct rt6_info *rt;
 791
 792        if (len < sizeof(struct route_info)) {
 793                return -EINVAL;
 794        }
 795
 796        /* Sanity check for prefix_len and length */
 797        if (rinfo->length > 3) {
 798                return -EINVAL;
 799        } else if (rinfo->prefix_len > 128) {
 800                return -EINVAL;
 801        } else if (rinfo->prefix_len > 64) {
 802                if (rinfo->length < 2) {
 803                        return -EINVAL;
 804                }
 805        } else if (rinfo->prefix_len > 0) {
 806                if (rinfo->length < 1) {
 807                        return -EINVAL;
 808                }
 809        }
 810
 811        pref = rinfo->route_pref;
 812        if (pref == ICMPV6_ROUTER_PREF_INVALID)
 813                return -EINVAL;
 814
 815        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
 816
 817        if (rinfo->length == 3)
 818                prefix = (struct in6_addr *)rinfo->prefix;
 819        else {
 820                /* this function is safe */
 821                ipv6_addr_prefix(&prefix_buf,
 822                                 (struct in6_addr *)rinfo->prefix,
 823                                 rinfo->prefix_len);
 824                prefix = &prefix_buf;
 825        }
 826
 827        if (rinfo->prefix_len == 0)
 828                rt = rt6_get_dflt_router(gwaddr, dev);
 829        else
 830                rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
 831                                        gwaddr, dev);
 832
 833        if (rt && !lifetime) {
 834                ip6_del_rt(rt);
 835                rt = NULL;
 836        }
 837
 838        if (!rt && lifetime)
 839                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
 840                                        dev, pref);
 841        else if (rt)
 842                rt->rt6i_flags = RTF_ROUTEINFO |
 843                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
 844
 845        if (rt) {
 846                if (!addrconf_finite_timeout(lifetime))
 847                        rt6_clean_expires(rt);
 848                else
 849                        rt6_set_expires(rt, jiffies + HZ * lifetime);
 850
 851                ip6_rt_put(rt);
 852        }
 853        return 0;
 854}
 855#endif
 856
 857static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
 858                                        struct in6_addr *saddr)
 859{
 860        struct fib6_node *pn;
 861        while (1) {
 862                if (fn->fn_flags & RTN_TL_ROOT)
 863                        return NULL;
 864                pn = fn->parent;
 865                if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
 866                        fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
 867                else
 868                        fn = pn;
 869                if (fn->fn_flags & RTN_RTINFO)
 870                        return fn;
 871        }
 872}
 873
 874static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 875                                             struct fib6_table *table,
 876                                             struct flowi6 *fl6, int flags)
 877{
 878        struct fib6_node *fn;
 879        struct rt6_info *rt;
 880
 881        read_lock_bh(&table->tb6_lock);
 882        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 883restart:
 884        rt = fn->leaf;
 885        rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
 886        if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
 887                rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
 888        if (rt == net->ipv6.ip6_null_entry) {
 889                fn = fib6_backtrack(fn, &fl6->saddr);
 890                if (fn)
 891                        goto restart;
 892        }
 893        dst_use(&rt->dst, jiffies);
 894        read_unlock_bh(&table->tb6_lock);
 895
 896        trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
 897
 898        return rt;
 899
 900}
 901
 902struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
 903                                    int flags)
 904{
 905        return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
 906}
 907EXPORT_SYMBOL_GPL(ip6_route_lookup);
 908
 909struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
 910                            const struct in6_addr *saddr, int oif, int strict)
 911{
 912        struct flowi6 fl6 = {
 913                .flowi6_oif = oif,
 914                .daddr = *daddr,
 915        };
 916        struct dst_entry *dst;
 917        int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
 918
 919        if (saddr) {
 920                memcpy(&fl6.saddr, saddr, sizeof(*saddr));
 921                flags |= RT6_LOOKUP_F_HAS_SADDR;
 922        }
 923
 924        dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
 925        if (dst->error == 0)
 926                return (struct rt6_info *) dst;
 927
 928        dst_release(dst);
 929
 930        return NULL;
 931}
 932EXPORT_SYMBOL(rt6_lookup);
 933
 934/* ip6_ins_rt is called with FREE table->tb6_lock.
 935   It takes new route entry, the addition fails by any reason the
 936   route is freed. In any case, if caller does not hold it, it may
 937   be destroyed.
 938 */
 939
 940static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
 941                        struct mx6_config *mxc)
 942{
 943        int err;
 944        struct fib6_table *table;
 945
 946        table = rt->rt6i_table;
 947        write_lock_bh(&table->tb6_lock);
 948        err = fib6_add(&table->tb6_root, rt, info, mxc);
 949        write_unlock_bh(&table->tb6_lock);
 950
 951        return err;
 952}
 953
 954int ip6_ins_rt(struct rt6_info *rt)
 955{
 956        struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
 957        struct mx6_config mxc = { .mx = NULL, };
 958
 959        return __ip6_ins_rt(rt, &info, &mxc);
 960}
 961
 962static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
 963                                           const struct in6_addr *daddr,
 964                                           const struct in6_addr *saddr)
 965{
 966        struct rt6_info *rt;
 967
 968        /*
 969         *      Clone the route.
 970         */
 971
 972        if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
 973                ort = (struct rt6_info *)ort->dst.from;
 974
 975        rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
 976
 977        if (!rt)
 978                return NULL;
 979
 980        ip6_rt_copy_init(rt, ort);
 981        rt->rt6i_flags |= RTF_CACHE;
 982        rt->rt6i_metric = 0;
 983        rt->dst.flags |= DST_HOST;
 984        rt->rt6i_dst.addr = *daddr;
 985        rt->rt6i_dst.plen = 128;
 986
 987        if (!rt6_is_gw_or_nonexthop(ort)) {
 988                if (ort->rt6i_dst.plen != 128 &&
 989                    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
 990                        rt->rt6i_flags |= RTF_ANYCAST;
 991#ifdef CONFIG_IPV6_SUBTREES
 992                if (rt->rt6i_src.plen && saddr) {
 993                        rt->rt6i_src.addr = *saddr;
 994                        rt->rt6i_src.plen = 128;
 995                }
 996#endif
 997        }
 998
 999        return rt;
1000}
1001
1002static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1003{
1004        struct rt6_info *pcpu_rt;
1005
1006        pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
1007                                  rt->dst.dev, rt->dst.flags);
1008
1009        if (!pcpu_rt)
1010                return NULL;
1011        ip6_rt_copy_init(pcpu_rt, rt);
1012        pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1013        pcpu_rt->rt6i_flags |= RTF_PCPU;
1014        return pcpu_rt;
1015}
1016
1017/* It should be called with read_lock_bh(&tb6_lock) acquired */
1018static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1019{
1020        struct rt6_info *pcpu_rt, **p;
1021
1022        p = this_cpu_ptr(rt->rt6i_pcpu);
1023        pcpu_rt = *p;
1024
1025        if (pcpu_rt) {
1026                dst_hold(&pcpu_rt->dst);
1027                rt6_dst_from_metrics_check(pcpu_rt);
1028        }
1029        return pcpu_rt;
1030}
1031
1032static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1033{
1034        struct fib6_table *table = rt->rt6i_table;
1035        struct rt6_info *pcpu_rt, *prev, **p;
1036
1037        pcpu_rt = ip6_rt_pcpu_alloc(rt);
1038        if (!pcpu_rt) {
1039                struct net *net = dev_net(rt->dst.dev);
1040
1041                dst_hold(&net->ipv6.ip6_null_entry->dst);
1042                return net->ipv6.ip6_null_entry;
1043        }
1044
1045        read_lock_bh(&table->tb6_lock);
1046        if (rt->rt6i_pcpu) {
1047                p = this_cpu_ptr(rt->rt6i_pcpu);
1048                prev = cmpxchg(p, NULL, pcpu_rt);
1049                if (prev) {
1050                        /* If someone did it before us, return prev instead */
1051                        dst_destroy(&pcpu_rt->dst);
1052                        pcpu_rt = prev;
1053                }
1054        } else {
1055                /* rt has been removed from the fib6 tree
1056                 * before we have a chance to acquire the read_lock.
1057                 * In this case, don't brother to create a pcpu rt
1058                 * since rt is going away anyway.  The next
1059                 * dst_check() will trigger a re-lookup.
1060                 */
1061                dst_destroy(&pcpu_rt->dst);
1062                pcpu_rt = rt;
1063        }
1064        dst_hold(&pcpu_rt->dst);
1065        rt6_dst_from_metrics_check(pcpu_rt);
1066        read_unlock_bh(&table->tb6_lock);
1067        return pcpu_rt;
1068}
1069
1070struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1071                               int oif, struct flowi6 *fl6, int flags)
1072{
1073        struct fib6_node *fn, *saved_fn;
1074        struct rt6_info *rt;
1075        int strict = 0;
1076
1077        strict |= flags & RT6_LOOKUP_F_IFACE;
1078        strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1079        if (net->ipv6.devconf_all->forwarding == 0)
1080                strict |= RT6_LOOKUP_F_REACHABLE;
1081
1082        read_lock_bh(&table->tb6_lock);
1083
1084        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1085        saved_fn = fn;
1086
1087        if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1088                oif = 0;
1089
1090redo_rt6_select:
1091        rt = rt6_select(fn, oif, strict);
1092        if (rt->rt6i_nsiblings)
1093                rt = rt6_multipath_select(rt, fl6, oif, strict);
1094        if (rt == net->ipv6.ip6_null_entry) {
1095                fn = fib6_backtrack(fn, &fl6->saddr);
1096                if (fn)
1097                        goto redo_rt6_select;
1098                else if (strict & RT6_LOOKUP_F_REACHABLE) {
1099                        /* also consider unreachable route */
1100                        strict &= ~RT6_LOOKUP_F_REACHABLE;
1101                        fn = saved_fn;
1102                        goto redo_rt6_select;
1103                }
1104        }
1105
1106
1107        if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1108                dst_use(&rt->dst, jiffies);
1109                read_unlock_bh(&table->tb6_lock);
1110
1111                rt6_dst_from_metrics_check(rt);
1112
1113                trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1114                return rt;
1115        } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1116                            !(rt->rt6i_flags & RTF_GATEWAY))) {
1117                /* Create a RTF_CACHE clone which will not be
1118                 * owned by the fib6 tree.  It is for the special case where
1119                 * the daddr in the skb during the neighbor look-up is different
1120                 * from the fl6->daddr used to look-up route here.
1121                 */
1122
1123                struct rt6_info *uncached_rt;
1124
1125                dst_use(&rt->dst, jiffies);
1126                read_unlock_bh(&table->tb6_lock);
1127
1128                uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1129                dst_release(&rt->dst);
1130
1131                if (uncached_rt)
1132                        rt6_uncached_list_add(uncached_rt);
1133                else
1134                        uncached_rt = net->ipv6.ip6_null_entry;
1135
1136                dst_hold(&uncached_rt->dst);
1137
1138                trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1139                return uncached_rt;
1140
1141        } else {
1142                /* Get a percpu copy */
1143
1144                struct rt6_info *pcpu_rt;
1145
1146                rt->dst.lastuse = jiffies;
1147                rt->dst.__use++;
1148                pcpu_rt = rt6_get_pcpu_route(rt);
1149
1150                if (pcpu_rt) {
1151                        read_unlock_bh(&table->tb6_lock);
1152                } else {
1153                        /* We have to do the read_unlock first
1154                         * because rt6_make_pcpu_route() may trigger
1155                         * ip6_dst_gc() which will take the write_lock.
1156                         */
1157                        dst_hold(&rt->dst);
1158                        read_unlock_bh(&table->tb6_lock);
1159                        pcpu_rt = rt6_make_pcpu_route(rt);
1160                        dst_release(&rt->dst);
1161                }
1162
1163                trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1164                return pcpu_rt;
1165
1166        }
1167}
1168EXPORT_SYMBOL_GPL(ip6_pol_route);
1169
1170static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1171                                            struct flowi6 *fl6, int flags)
1172{
1173        return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1174}
1175
1176struct dst_entry *ip6_route_input_lookup(struct net *net,
1177                                         struct net_device *dev,
1178                                         struct flowi6 *fl6, int flags)
1179{
1180        if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1181                flags |= RT6_LOOKUP_F_IFACE;
1182
1183        return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1184}
1185EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1186
1187void ip6_route_input(struct sk_buff *skb)
1188{
1189        const struct ipv6hdr *iph = ipv6_hdr(skb);
1190        struct net *net = dev_net(skb->dev);
1191        int flags = RT6_LOOKUP_F_HAS_SADDR;
1192        struct ip_tunnel_info *tun_info;
1193        struct flowi6 fl6 = {
1194                .flowi6_iif = skb->dev->ifindex,
1195                .daddr = iph->daddr,
1196                .saddr = iph->saddr,
1197                .flowlabel = ip6_flowinfo(iph),
1198                .flowi6_mark = skb->mark,
1199                .flowi6_proto = iph->nexthdr,
1200        };
1201
1202        tun_info = skb_tunnel_info(skb);
1203        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1204                fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1205        skb_dst_drop(skb);
1206        skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1207}
1208
1209static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1210                                             struct flowi6 *fl6, int flags)
1211{
1212        return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1213}
1214
1215struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1216                                         struct flowi6 *fl6, int flags)
1217{
1218        bool any_src;
1219
1220        if (rt6_need_strict(&fl6->daddr)) {
1221                struct dst_entry *dst;
1222
1223                dst = l3mdev_link_scope_lookup(net, fl6);
1224                if (dst)
1225                        return dst;
1226        }
1227
1228        fl6->flowi6_iif = LOOPBACK_IFINDEX;
1229
1230        any_src = ipv6_addr_any(&fl6->saddr);
1231        if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1232            (fl6->flowi6_oif && any_src))
1233                flags |= RT6_LOOKUP_F_IFACE;
1234
1235        if (!any_src)
1236                flags |= RT6_LOOKUP_F_HAS_SADDR;
1237        else if (sk)
1238                flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1239
1240        return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1241}
1242EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1243
1244struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1245{
1246        struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1247        struct dst_entry *new = NULL;
1248
1249        rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1250        if (rt) {
1251                rt6_info_init(rt);
1252
1253                new = &rt->dst;
1254                new->__use = 1;
1255                new->input = dst_discard;
1256                new->output = dst_discard_out;
1257
1258                dst_copy_metrics(new, &ort->dst);
1259                rt->rt6i_idev = ort->rt6i_idev;
1260                if (rt->rt6i_idev)
1261                        in6_dev_hold(rt->rt6i_idev);
1262
1263                rt->rt6i_gateway = ort->rt6i_gateway;
1264                rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1265                rt->rt6i_metric = 0;
1266
1267                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1268#ifdef CONFIG_IPV6_SUBTREES
1269                memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1270#endif
1271
1272                dst_free(new);
1273        }
1274
1275        dst_release(dst_orig);
1276        return new ? new : ERR_PTR(-ENOMEM);
1277}
1278
1279/*
1280 *      Destination cache support functions
1281 */
1282
1283static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1284{
1285        if (rt->dst.from &&
1286            dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1287                dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1288}
1289
1290static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1291{
1292        if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1293                return NULL;
1294
1295        if (rt6_check_expired(rt))
1296                return NULL;
1297
1298        return &rt->dst;
1299}
1300
1301static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1302{
1303        if (!__rt6_check_expired(rt) &&
1304            rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1305            rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1306                return &rt->dst;
1307        else
1308                return NULL;
1309}
1310
1311static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1312{
1313        struct rt6_info *rt;
1314
1315        rt = (struct rt6_info *) dst;
1316
1317        /* All IPV6 dsts are created with ->obsolete set to the value
1318         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1319         * into this function always.
1320         */
1321
1322        rt6_dst_from_metrics_check(rt);
1323
1324        if (rt->rt6i_flags & RTF_PCPU ||
1325            (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1326                return rt6_dst_from_check(rt, cookie);
1327        else
1328                return rt6_check(rt, cookie);
1329}
1330
1331static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1332{
1333        struct rt6_info *rt = (struct rt6_info *) dst;
1334
1335        if (rt) {
1336                if (rt->rt6i_flags & RTF_CACHE) {
1337                        if (rt6_check_expired(rt)) {
1338                                ip6_del_rt(rt);
1339                                dst = NULL;
1340                        }
1341                } else {
1342                        dst_release(dst);
1343                        dst = NULL;
1344                }
1345        }
1346        return dst;
1347}
1348
1349static void ip6_link_failure(struct sk_buff *skb)
1350{
1351        struct rt6_info *rt;
1352
1353        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1354
1355        rt = (struct rt6_info *) skb_dst(skb);
1356        if (rt) {
1357                if (rt->rt6i_flags & RTF_CACHE) {
1358                        dst_hold(&rt->dst);
1359                        ip6_del_rt(rt);
1360                } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1361                        rt->rt6i_node->fn_sernum = -1;
1362                }
1363        }
1364}
1365
1366static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1367{
1368        struct net *net = dev_net(rt->dst.dev);
1369
1370        rt->rt6i_flags |= RTF_MODIFIED;
1371        rt->rt6i_pmtu = mtu;
1372        rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1373}
1374
1375static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1376{
1377        return !(rt->rt6i_flags & RTF_CACHE) &&
1378                (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
1379}
1380
1381static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1382                                 const struct ipv6hdr *iph, u32 mtu)
1383{
1384        const struct in6_addr *daddr, *saddr;
1385        struct rt6_info *rt6 = (struct rt6_info *)dst;
1386
1387        if (rt6->rt6i_flags & RTF_LOCAL)
1388                return;
1389
1390        if (dst_metric_locked(dst, RTAX_MTU))
1391                return;
1392
1393        if (iph) {
1394                daddr = &iph->daddr;
1395                saddr = &iph->saddr;
1396        } else if (sk) {
1397                daddr = &sk->sk_v6_daddr;
1398                saddr = &inet6_sk(sk)->saddr;
1399        } else {
1400                daddr = NULL;
1401                saddr = NULL;
1402        }
1403        dst_confirm_neigh(dst, daddr);
1404        mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1405        if (mtu >= dst_mtu(dst))
1406                return;
1407
1408        if (!rt6_cache_allowed_for_pmtu(rt6)) {
1409                rt6_do_update_pmtu(rt6, mtu);
1410        } else if (daddr) {
1411                struct rt6_info *nrt6;
1412
1413                nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1414                if (nrt6) {
1415                        rt6_do_update_pmtu(nrt6, mtu);
1416
1417                        /* ip6_ins_rt(nrt6) will bump the
1418                         * rt6->rt6i_node->fn_sernum
1419                         * which will fail the next rt6_check() and
1420                         * invalidate the sk->sk_dst_cache.
1421                         */
1422                        ip6_ins_rt(nrt6);
1423                }
1424        }
1425}
1426
1427static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1428                               struct sk_buff *skb, u32 mtu)
1429{
1430        __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1431}
1432
1433void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1434                     int oif, u32 mark, kuid_t uid)
1435{
1436        const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1437        struct dst_entry *dst;
1438        struct flowi6 fl6;
1439
1440        memset(&fl6, 0, sizeof(fl6));
1441        fl6.flowi6_oif = oif;
1442        fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1443        fl6.daddr = iph->daddr;
1444        fl6.saddr = iph->saddr;
1445        fl6.flowlabel = ip6_flowinfo(iph);
1446        fl6.flowi6_uid = uid;
1447
1448        dst = ip6_route_output(net, NULL, &fl6);
1449        if (!dst->error)
1450                __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1451        dst_release(dst);
1452}
1453EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1454
1455void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1456{
1457        struct dst_entry *dst;
1458
1459        ip6_update_pmtu(skb, sock_net(sk), mtu,
1460                        sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
1461
1462        dst = __sk_dst_get(sk);
1463        if (!dst || !dst->obsolete ||
1464            dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1465                return;
1466
1467        bh_lock_sock(sk);
1468        if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1469                ip6_datagram_dst_update(sk, false);
1470        bh_unlock_sock(sk);
1471}
1472EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1473
1474/* Handle redirects */
1475struct ip6rd_flowi {        /* flow key plus the address of the router that sent the redirect */
1476        struct flowi6 fl6;        /* kept first: __ip6_route_redirect() casts its flowi6 pointer back to this struct */
1477        struct in6_addr gateway;        /* compared with rt6i_gateway of each candidate route */
1478};
1479
1480static struct rt6_info *__ip6_route_redirect(struct net *net,
1481                                             struct fib6_table *table,
1482                                             struct flowi6 *fl6,
1483                                             int flags)
1484{
1485        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1486        struct rt6_info *rt;
1487        struct fib6_node *fn;
1488
1489        /* Get the "current" route for this destination and
1490         * check if the redirect has come from appropriate router.
1491         *
1492         * RFC 4861 specifies that redirects should only be
1493         * accepted if they come from the nexthop to the target.
1494         * Due to the way the routes are chosen, this notion
1495         * is a bit fuzzy and one might need to check all possible
1496         * routes.
1497         */
1498
1499        read_lock_bh(&table->tb6_lock);
1500        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1501restart:
1502        for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1503                if (rt6_check_expired(rt))
1504                        continue;
1505                if (rt->dst.error)
1506                        break;
1507                if (!(rt->rt6i_flags & RTF_GATEWAY))
1508                        continue;
1509                if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1510                        continue;
1511                if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1512                        continue;
1513                break;
1514        }
1515
1516        if (!rt)
1517                rt = net->ipv6.ip6_null_entry;
1518        else if (rt->dst.error) {
1519                rt = net->ipv6.ip6_null_entry;
1520                goto out;
1521        }
1522
1523        if (rt == net->ipv6.ip6_null_entry) {
1524                fn = fib6_backtrack(fn, &fl6->saddr);
1525                if (fn)
1526                        goto restart;
1527        }
1528
1529out:
1530        dst_hold(&rt->dst);
1531
1532        read_unlock_bh(&table->tb6_lock);
1533
1534        trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1535        return rt;
1536};
1537
1538static struct dst_entry *ip6_route_redirect(struct net *net,
1539                                        const struct flowi6 *fl6,
1540                                        const struct in6_addr *gateway)
1541{
1542        int flags = RT6_LOOKUP_F_HAS_SADDR;
1543        struct ip6rd_flowi rdfl;
1544
1545        rdfl.fl6 = *fl6;
1546        rdfl.gateway = *gateway;
1547
1548        return fib6_rule_lookup(net, &rdfl.fl6,
1549                                flags, __ip6_route_redirect);
1550}
1551
1552void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
1553                  kuid_t uid)
1554{
1555        const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1556        struct dst_entry *dst;
1557        struct flowi6 fl6;
1558
1559        memset(&fl6, 0, sizeof(fl6));
1560        fl6.flowi6_iif = LOOPBACK_IFINDEX;
1561        fl6.flowi6_oif = oif;
1562        fl6.flowi6_mark = mark;
1563        fl6.daddr = iph->daddr;
1564        fl6.saddr = iph->saddr;
1565        fl6.flowlabel = ip6_flowinfo(iph);
1566        fl6.flowi6_uid = uid;
1567
1568        dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1569        rt6_do_redirect(dst, NULL, skb);
1570        dst_release(dst);
1571}
1572EXPORT_SYMBOL_GPL(ip6_redirect);
1573
1574void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1575                            u32 mark)
1576{
1577        const struct ipv6hdr *iph = ipv6_hdr(skb);
1578        const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1579        struct dst_entry *dst;
1580        struct flowi6 fl6;
1581
1582        memset(&fl6, 0, sizeof(fl6));
1583        fl6.flowi6_iif = LOOPBACK_IFINDEX;
1584        fl6.flowi6_oif = oif;
1585        fl6.flowi6_mark = mark;
1586        fl6.daddr = msg->dest;
1587        fl6.saddr = iph->daddr;
1588        fl6.flowi6_uid = sock_net_uid(net, NULL);
1589
1590        dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1591        rt6_do_redirect(dst, NULL, skb);
1592        dst_release(dst);
1593}
1594
1595void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1596{
1597        ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
1598                     sk->sk_uid);
1599}
1600EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1601
1602static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1603{
1604        struct net_device *dev = dst->dev;
1605        unsigned int mtu = dst_mtu(dst);
1606        struct net *net = dev_net(dev);
1607
1608        mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1609
1610        if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1611                mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1612
1613        /*
1614         * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1615         * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1616         * IPV6_MAXPLEN is also valid and means: "any MSS,
1617         * rely only on pmtu discovery"
1618         */
1619        if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1620                mtu = IPV6_MAXPLEN;
1621        return mtu;
1622}
1623
1624static unsigned int ip6_mtu(const struct dst_entry *dst)
1625{
1626        const struct rt6_info *rt = (const struct rt6_info *)dst;
1627        unsigned int mtu = rt->rt6i_pmtu;
1628        struct inet6_dev *idev;
1629
1630        if (mtu)
1631                goto out;
1632
1633        mtu = dst_metric_raw(dst, RTAX_MTU);
1634        if (mtu)
1635                goto out;
1636
1637        mtu = IPV6_MIN_MTU;
1638
1639        rcu_read_lock();
1640        idev = __in6_dev_get(dst->dev);
1641        if (idev)
1642                mtu = idev->cnf.mtu6;
1643        rcu_read_unlock();
1644
1645out:
1646        mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1647
1648        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1649}
1650
1651static struct dst_entry *icmp6_dst_gc_list;        /* head of the list of dsts created by icmp6_dst_alloc(), chained through dst->next */
1652static DEFINE_SPINLOCK(icmp6_dst_lock);        /* held (via spin_lock_bh) around every access to icmp6_dst_gc_list visible here */
1653
1654struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1655                                  struct flowi6 *fl6)
1656{
1657        struct dst_entry *dst;
1658        struct rt6_info *rt;
1659        struct inet6_dev *idev = in6_dev_get(dev);
1660        struct net *net = dev_net(dev);
1661
1662        if (unlikely(!idev))
1663                return ERR_PTR(-ENODEV);
1664
1665        rt = ip6_dst_alloc(net, dev, 0);
1666        if (unlikely(!rt)) {
1667                in6_dev_put(idev);
1668                dst = ERR_PTR(-ENOMEM);
1669                goto out;
1670        }
1671
1672        rt->dst.flags |= DST_HOST;
1673        rt->dst.output  = ip6_output;
1674        atomic_set(&rt->dst.__refcnt, 1);
1675        rt->rt6i_gateway  = fl6->daddr;
1676        rt->rt6i_dst.addr = fl6->daddr;
1677        rt->rt6i_dst.plen = 128;
1678        rt->rt6i_idev     = idev;
1679        dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1680
1681        spin_lock_bh(&icmp6_dst_lock);
1682        rt->dst.next = icmp6_dst_gc_list;
1683        icmp6_dst_gc_list = &rt->dst;
1684        spin_unlock_bh(&icmp6_dst_lock);
1685
1686        fib6_force_start_gc(net);
1687
1688        dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1689
1690out:
1691        return dst;
1692}
1693
1694int icmp6_dst_gc(void)
1695{
1696        struct dst_entry *dst, **pprev;
1697        int more = 0;
1698
1699        spin_lock_bh(&icmp6_dst_lock);
1700        pprev = &icmp6_dst_gc_list;
1701
1702        while ((dst = *pprev) != NULL) {
1703                if (!atomic_read(&dst->__refcnt)) {
1704                        *pprev = dst->next;
1705                        dst_free(dst);
1706                } else {
1707                        pprev = &dst->next;
1708                        ++more;
1709                }
1710        }
1711
1712        spin_unlock_bh(&icmp6_dst_lock);
1713
1714        return more;
1715}
1716
1717static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1718                            void *arg)
1719{
1720        struct dst_entry *dst, **pprev;
1721
1722        spin_lock_bh(&icmp6_dst_lock);
1723        pprev = &icmp6_dst_gc_list;
1724        while ((dst = *pprev) != NULL) {
1725                struct rt6_info *rt = (struct rt6_info *) dst;
1726                if (func(rt, arg)) {
1727                        *pprev = dst->next;
1728                        dst_free(dst);
1729                } else {
1730                        pprev = &dst->next;
1731                }
1732        }
1733        spin_unlock_bh(&icmp6_dst_lock);
1734}
1735
1736static int ip6_dst_gc(struct dst_ops *ops)
1737{
1738        struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1739        int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1740        int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1741        int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1742        int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1743        unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1744        int entries;
1745
1746        entries = dst_entries_get_fast(ops);
1747        if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1748            entries <= rt_max_size)
1749                goto out;
1750
1751        net->ipv6.ip6_rt_gc_expire++;
1752        fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1753        entries = dst_entries_get_slow(ops);
1754        if (entries < ops->gc_thresh)
1755                net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1756out:
1757        net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1758        return entries > rt_max_size;
1759}
1760
1761static int ip6_convert_metrics(struct mx6_config *mxc,
1762                               const struct fib6_config *cfg)
1763{
1764        bool ecn_ca = false;
1765        struct nlattr *nla;
1766        int remaining;
1767        u32 *mp;
1768
1769        if (!cfg->fc_mx)
1770                return 0;
1771
1772        mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1773        if (unlikely(!mp))
1774                return -ENOMEM;
1775
1776        nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1777                int type = nla_type(nla);
1778                u32 val;
1779
1780                if (!type)
1781                        continue;
1782                if (unlikely(type > RTAX_MAX))
1783                        goto err;
1784
1785                if (type == RTAX_CC_ALGO) {
1786                        char tmp[TCP_CA_NAME_MAX];
1787
1788                        nla_strlcpy(tmp, nla, sizeof(tmp));
1789                        val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1790                        if (val == TCP_CA_UNSPEC)
1791                                goto err;
1792                } else {
1793                        val = nla_get_u32(nla);
1794                }
1795                if (type == RTAX_HOPLIMIT && val > 255)
1796                        val = 255;
1797                if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1798                        goto err;
1799
1800                mp[type - 1] = val;
1801                __set_bit(type - 1, mxc->mx_valid);
1802        }
1803
1804        if (ecn_ca) {
1805                __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1806                mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1807        }
1808
1809        mxc->mx = mp;
1810        return 0;
1811 err:
1812        kfree(mp);
1813        return -EINVAL;
1814}
1815
1816static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1817                                            struct fib6_config *cfg,
1818                                            const struct in6_addr *gw_addr)
1819{
1820        struct flowi6 fl6 = {
1821                .flowi6_oif = cfg->fc_ifindex,
1822                .daddr = *gw_addr,
1823                .saddr = cfg->fc_prefsrc,
1824        };
1825        struct fib6_table *table;
1826        struct rt6_info *rt;
1827        int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1828
1829        table = fib6_get_table(net, cfg->fc_table);
1830        if (!table)
1831                return NULL;
1832
1833        if (!ipv6_addr_any(&cfg->fc_prefsrc))
1834                flags |= RT6_LOOKUP_F_HAS_SADDR;
1835
1836        rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1837
1838        /* if table lookup failed, fall back to full lookup */
1839        if (rt == net->ipv6.ip6_null_entry) {
1840                ip6_rt_put(rt);
1841                rt = NULL;
1842        }
1843
1844        return rt;
1845}
1846
/*
 * Build (but do not insert) a route entry from a fib6_config.
 *
 * Validates @cfg, resolves the output device and its inet6_dev, selects or
 * creates the FIB table, allocates the rt6_info and fills in handlers,
 * destination/source prefixes, metric, gateway and preferred source.
 *
 * Side effects on @cfg: a zero fc_metric is replaced by IP6_RT_PRIO_USER,
 * an RTPROT_UNSPEC fc_protocol is replaced by RTPROT_BOOT, and on success
 * fc_nlinfo.nl_net is set to the namespace of the chosen device.
 *
 * Returns the new route on success; the dev and idev references taken here
 * are stored in rt->dst.dev and rt->rt6i_idev.  On failure every reference
 * taken so far is dropped, a partially built route is released with
 * dst_free(), and ERR_PTR(-errno) is returned.
 */
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
{
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct rt6_info *rt = NULL;
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        struct fib6_table *table;
        int addr_type;
        int err = -EINVAL;

        /* RTF_PCPU is an internal flag; can not be set by userspace */
        if (cfg->fc_flags & RTF_PCPU)
                goto out;

        if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
                goto out;
#ifndef CONFIG_IPV6_SUBTREES
        /* source-prefix routes are only accepted when subtrees are built in */
        if (cfg->fc_src_len)
                goto out;
#endif
        if (cfg->fc_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(net, cfg->fc_ifindex);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_metric == 0)
                cfg->fc_metric = IP6_RT_PRIO_USER;

        err = -ENOBUFS;
        if (cfg->fc_nlinfo.nlh &&
            !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
                /* netlink request without NLM_F_CREATE: warn, then create the
                 * table anyway if it does not exist yet
                 */
                table = fib6_get_table(net, cfg->fc_table);
                if (!table) {
                        pr_warn("NLM_F_CREATE should be specified when creating new route\n");
                        table = fib6_new_table(net, cfg->fc_table);
                }
        } else {
                table = fib6_new_table(net, cfg->fc_table);
        }

        if (!table)
                goto out;

        /* every route except RTF_ADDRCONF ones is allocated with DST_NOCOUNT */
        rt = ip6_dst_alloc(net, NULL,
                           (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

        if (!rt) {
                err = -ENOMEM;
                goto out;
        }

        if (cfg->fc_flags & RTF_EXPIRES)
                rt6_set_expires(rt, jiffies +
                                clock_t_to_jiffies(cfg->fc_expires));
        else
                rt6_clean_expires(rt);

        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->rt6i_protocol = cfg->fc_protocol;

        addr_type = ipv6_addr_type(&cfg->fc_dst);

        /* pick the input handler from the destination type / RTF_LOCAL */
        if (addr_type & IPV6_ADDR_MULTICAST)
                rt->dst.input = ip6_mc_input;
        else if (cfg->fc_flags & RTF_LOCAL)
                rt->dst.input = ip6_input;
        else
                rt->dst.input = ip6_forward;

        rt->dst.output = ip6_output;

        if (cfg->fc_encap) {
                struct lwtunnel_state *lwtstate;

                err = lwtunnel_build_state(cfg->fc_encap_type,
                                           cfg->fc_encap, AF_INET6, cfg,
                                           &lwtstate);
                if (err)
                        goto out;
                rt->dst.lwtstate = lwtstate_get(lwtstate);
                /* let the tunnel interpose on output/input, remembering the
                 * handlers chosen above in orig_output/orig_input
                 */
                if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
                        rt->dst.lwtstate->orig_output = rt->dst.output;
                        rt->dst.output = lwtunnel_output;
                }
                if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
                        rt->dst.lwtstate->orig_input = rt->dst.input;
                        rt->dst.input = lwtunnel_input;
                }
        }

        ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->rt6i_dst.plen = cfg->fc_dst_len;
        if (rt->rt6i_dst.plen == 128)
                rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->rt6i_src.plen = cfg->fc_src_len;
#endif

        rt->rt6i_metric = cfg->fc_metric;

        /* We cannot add true routes via loopback here,
           they would result in kernel looping; promote them to reject routes
         */
        if ((cfg->fc_flags & RTF_REJECT) ||
            (dev && (dev->flags & IFF_LOOPBACK) &&
             !(addr_type & IPV6_ADDR_LOOPBACK) &&
             !(cfg->fc_flags & RTF_LOCAL))) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        if (dev) {
                                dev_put(dev);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        dev_hold(dev);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
                /* the route type selects the error code and drop handlers */
                switch (cfg->fc_type) {
                case RTN_BLACKHOLE:
                        rt->dst.error = -EINVAL;
                        rt->dst.output = dst_discard_out;
                        rt->dst.input = dst_discard;
                        break;
                case RTN_PROHIBIT:
                        rt->dst.error = -EACCES;
                        rt->dst.output = ip6_pkt_prohibit_out;
                        rt->dst.input = ip6_pkt_prohibit;
                        break;
                case RTN_THROW:
                case RTN_UNREACHABLE:
                default:
                        rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
                                        : (cfg->fc_type == RTN_UNREACHABLE)
                                        ? -EHOSTUNREACH : -ENETUNREACH;
                        rt->dst.output = ip6_pkt_discard_out;
                        rt->dst.input = ip6_pkt_discard;
                        break;
                }
                /* reject routes skip the gateway and prefsrc handling below */
                goto install_route;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                const struct in6_addr *gw_addr;
                int gwa_type;

                gw_addr = &cfg->fc_gateway;
                gwa_type = ipv6_addr_type(gw_addr);

                /* if gw_addr is local we will fail to detect this in case
                 * address is still TENTATIVE (DAD in progress). rt6_lookup()
                 * will return already-added prefix route via interface that
                 * prefix route was assigned to, which might be non-loopback.
                 */
                err = -EINVAL;
                if (ipv6_chk_addr_and_flags(net, gw_addr,
                                            gwa_type & IPV6_ADDR_LINKLOCAL ?
                                            dev : NULL, 0, 0))
                        goto out;

                rt->rt6i_gateway = *gw_addr;

                if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
                        struct rt6_info *grt = NULL;

                        /* IPv6 strictly inhibits using not link-local
                           addresses as nexthop address.
                           Otherwise, router will not able to send redirects.
                           It is very good, but in some (rare!) circumstances
                           (SIT, PtP, NBMA NOARP links) it is handy to allow
                           some exceptions. --ANK
                           We allow IPv4-mapped nexthops to support RFC4798-type
                           addressing
                         */
                        if (!(gwa_type & (IPV6_ADDR_UNICAST |
                                          IPV6_ADDR_MAPPED)))
                                goto out;

                        if (cfg->fc_table) {
                                /* try the route's own table first */
                                grt = ip6_nh_lookup_table(net, cfg, gw_addr);

                                if (grt) {
                                        /* discard a result that is itself a
                                         * gateway route or uses another device
                                         */
                                        if (grt->rt6i_flags & RTF_GATEWAY ||
                                            (dev && dev != grt->dst.dev)) {
                                                ip6_rt_put(grt);
                                                grt = NULL;
                                        }
                                }
                        }

                        if (!grt)
                                grt = rt6_lookup(net, gw_addr, NULL,
                                                 cfg->fc_ifindex, 1);

                        err = -EHOSTUNREACH;
                        if (!grt)
                                goto out;
                        if (dev) {
                                if (dev != grt->dst.dev) {
                                        ip6_rt_put(grt);
                                        goto out;
                                }
                        } else {
                                /* no device given: adopt the one the gateway
                                 * resolves through and take our own references
                                 */
                                dev = grt->dst.dev;
                                idev = grt->rt6i_idev;
                                dev_hold(dev);
                                in6_dev_hold(grt->rt6i_idev);
                        }
                        /* only accept a gateway reached without another gateway */
                        if (!(grt->rt6i_flags & RTF_GATEWAY))
                                err = 0;
                        ip6_rt_put(grt);

                        if (err)
                                goto out;
                }
                err = -EINVAL;
                if (!dev || (dev->flags & IFF_LOOPBACK))
                        goto out;
        }

        err = -ENODEV;
        if (!dev)
                goto out;

        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
                /* a preferred source is only accepted if ipv6_chk_addr()
                 * succeeds for it on this device
                 */
                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
                        err = -EINVAL;
                        goto out;
                }
                rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
                rt->rt6i_prefsrc.plen = 128;
        } else
                rt->rt6i_prefsrc.plen = 0;

        rt->rt6i_flags = cfg->fc_flags;

install_route:
        /* hand the dev/idev references over to the route */
        rt->dst.dev = dev;
        rt->rt6i_idev = idev;
        rt->rt6i_table = table;

        cfg->fc_nlinfo.nl_net = dev_net(dev);

        return rt;
out:
        /* undo whatever was acquired before the failure */
        if (dev)
                dev_put(dev);
        if (idev)
                in6_dev_put(idev);
        if (rt)
                dst_free(&rt->dst);

        return ERR_PTR(err);
}
2113
2114int ip6_route_add(struct fib6_config *cfg)
2115{
2116        struct mx6_config mxc = { .mx = NULL, };
2117        struct rt6_info *rt;
2118        int err;
2119
2120        rt = ip6_route_info_create(cfg);
2121        if (IS_ERR(rt)) {
2122                err = PTR_ERR(rt);
2123                rt = NULL;
2124                goto out;
2125        }
2126
2127        err = ip6_convert_metrics(&mxc, cfg);
2128        if (err)
2129                goto out;
2130
2131        err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
2132
2133        kfree(mxc.mx);
2134
2135        return err;
2136out:
2137        if (rt)
2138                dst_free(&rt->dst);
2139
2140        return err;
2141}
2142
2143static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2144{
2145        int err;
2146        struct fib6_table *table;
2147        struct net *net = dev_net(rt->dst.dev);
2148
2149        if (rt == net->ipv6.ip6_null_entry ||
2150            rt->dst.flags & DST_NOCACHE) {
2151                err = -ENOENT;
2152                goto out;
2153        }
2154
2155        table = rt->rt6i_table;
2156        write_lock_bh(&table->tb6_lock);
2157        err = fib6_del(rt, info);
2158        write_unlock_bh(&table->tb6_lock);
2159
2160out:
2161        ip6_rt_put(rt);
2162        return err;
2163}
2164
2165int ip6_del_rt(struct rt6_info *rt)
2166{
2167        struct nl_info info = {
2168                .nl_net = dev_net(rt->dst.dev),
2169        };
2170        return __ip6_del_rt(rt, &info);
2171}
2172
2173static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2174{
2175        struct nl_info *info = &cfg->fc_nlinfo;
2176        struct net *net = info->nl_net;
2177        struct sk_buff *skb = NULL;
2178        struct fib6_table *table;
2179        int err = -ENOENT;
2180
2181        if (rt == net->ipv6.ip6_null_entry)
2182                goto out_put;
2183        table = rt->rt6i_table;
2184        write_lock_bh(&table->tb6_lock);
2185
2186        if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2187                struct rt6_info *sibling, *next_sibling;
2188
2189                /* prefer to send a single notification with all hops */
2190                skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2191                if (skb) {
2192                        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2193
2194                        if (rt6_fill_node(net, skb, rt,
2195                                          NULL, NULL, 0, RTM_DELROUTE,
2196                                          info->portid, seq, 0) < 0) {
2197                                kfree_skb(skb);
2198                                skb = NULL;
2199                        } else
2200                                info->skip_notify = 1;
2201                }
2202
2203                list_for_each_entry_safe(sibling, next_sibling,
2204                                         &rt->rt6i_siblings,
2205                                         rt6i_siblings) {
2206                        err = fib6_del(sibling, info);
2207                        if (err)
2208                                goto out_unlock;
2209                }
2210        }
2211
2212        err = fib6_del(rt, info);
2213out_unlock:
2214        write_unlock_bh(&table->tb6_lock);
2215out_put:
2216        ip6_rt_put(rt);
2217
2218        if (skb) {
2219                rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2220                            info->nlh, gfp_any());
2221        }
2222        return err;
2223}
2224
2225static int ip6_route_del(struct fib6_config *cfg)
2226{
2227        struct fib6_table *table;
2228        struct fib6_node *fn;
2229        struct rt6_info *rt;
2230        int err = -ESRCH;
2231
2232        table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2233        if (!table)
2234                return err;
2235
2236        read_lock_bh(&table->tb6_lock);
2237
2238        fn = fib6_locate(&table->tb6_root,
2239                         &cfg->fc_dst, cfg->fc_dst_len,
2240                         &cfg->fc_src, cfg->fc_src_len);
2241
2242        if (fn) {
2243                for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2244                        if ((rt->rt6i_flags & RTF_CACHE) &&
2245                            !(cfg->fc_flags & RTF_CACHE))
2246                                continue;
2247                        if (cfg->fc_ifindex &&
2248                            (!rt->dst.dev ||
2249                             rt->dst.dev->ifindex != cfg->fc_ifindex))
2250                                continue;
2251                        if (cfg->fc_flags & RTF_GATEWAY &&
2252                            !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2253                                continue;
2254                        if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2255                                continue;
2256                        if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2257                                continue;
2258                        dst_hold(&rt->dst);
2259                        read_unlock_bh(&table->tb6_lock);
2260
2261                        /* if gateway was specified only delete the one hop */
2262                        if (cfg->fc_flags & RTF_GATEWAY)
2263                                return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2264
2265                        return __ip6_del_rt_siblings(rt, cfg);
2266                }
2267        }
2268        read_unlock_bh(&table->tb6_lock);
2269
2270        return err;
2271}
2272
/*
 * Handle a received redirect message carried in @skb for the route @dst.
 *
 * The message is dropped silently (with at most a rate-limited debug line)
 * when it is too short, names a multicast destination, has a target that
 * is neither the destination itself nor a link-local unicast address,
 * arrives on a device that forwards or has accept_redirects disabled,
 * carries invalid ND options, or @dst is a reject route.
 *
 * When accepted, the neighbour entry for the target is updated and a new
 * RTF_CACHE route to the destination via the target is inserted; a
 * NETEVENT_REDIRECT notification is raised and, if @dst was itself a cache
 * route, it is deleted.
 *
 * @sk is not used in this function.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct netevent_redirect netevent;
        struct rt6_info *rt, *nrt = NULL;
        struct ndisc_options ndopts;
        struct inet6_dev *in6_dev;
        struct neighbour *neigh;
        struct rd_msg *msg;
        int optlen, on_link;
        u8 *lladdr;

        /* bytes following the fixed rd_msg header are ND options */
        optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
        optlen -= sizeof(*msg);

        if (optlen < 0) {
                net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
                return;
        }

        msg = (struct rd_msg *)icmp6_hdr(skb);

        if (ipv6_addr_is_multicast(&msg->dest)) {
                net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
                return;
        }

        /* target == destination means the destination itself is on-link;
         * otherwise the target must be a link-local unicast address
         */
        on_link = 0;
        if (ipv6_addr_equal(&msg->dest, &msg->target)) {
                on_link = 1;
        } else if (ipv6_addr_type(&msg->target) !=
                   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
                net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
                return;
        }

        in6_dev = __in6_dev_get(skb->dev);
        if (!in6_dev)
                return;
        if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
                return;

        /* RFC2461 8.1:
         *      The IP source address of the Redirect MUST be the same as the current
         *      first-hop router for the specified ICMP Destination Address.
         */

        if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
                net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
                return;
        }

        /* the target link-layer address option is optional */
        lladdr = NULL;
        if (ndopts.nd_opts_tgt_lladdr) {
                lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
                                             skb->dev);
                if (!lladdr) {
                        net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
                        return;
                }
        }

        rt = (struct rt6_info *) dst;
        if (rt->rt6i_flags & RTF_REJECT) {
                net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
                return;
        }

        /* Redirect received -> path was valid.
         * Look, redirects are sent only in response to data packets,
         * so that this nexthop apparently is reachable. --ANK
         */
        dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

        /* reference released at the out: label below */
        neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
        if (!neigh)
                return;

        /*
         *      We have finally decided to accept it.
         */

        /* the router flags are only forced when the target is a router,
         * i.e. the destination is not on-link
         */
        ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
                     NEIGH_UPDATE_F_WEAK_OVERRIDE|
                     NEIGH_UPDATE_F_OVERRIDE|
                     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
                                     NEIGH_UPDATE_F_ISROUTER)),
                     NDISC_REDIRECT, &ndopts);

        nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
        if (!nrt)
                goto out;

        nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
        if (on_link)
                nrt->rt6i_flags &= ~RTF_GATEWAY;

        nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

        if (ip6_ins_rt(nrt))
                goto out;

        netevent.old = &rt->dst;
        netevent.new = &nrt->dst;
        netevent.daddr = &msg->dest;
        netevent.neigh = neigh;
        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

        if (rt->rt6i_flags & RTF_CACHE) {
                /* take an extra reference first: ip6_del_rt() is handed
                 * the clone and the caller keeps its own reference on dst
                 */
                rt = (struct rt6_info *) dst_clone(&rt->dst);
                ip6_del_rt(rt);
        }

out:
        neigh_release(neigh);
}
2388
2389/*
2390 *      Misc support functions
2391 */
2392
2393static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2394{
2395        BUG_ON(from->dst.from);
2396
2397        rt->rt6i_flags &= ~RTF_EXPIRES;
2398        dst_hold(&from->dst);
2399        rt->dst.from = &from->dst;
2400        dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2401}
2402
2403static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2404{
2405        rt->dst.input = ort->dst.input;
2406        rt->dst.output = ort->dst.output;
2407        rt->rt6i_dst = ort->rt6i_dst;
2408        rt->dst.error = ort->dst.error;
2409        rt->rt6i_idev = ort->rt6i_idev;
2410        if (rt->rt6i_idev)
2411                in6_dev_hold(rt->rt6i_idev);
2412        rt->dst.lastuse = jiffies;
2413        rt->rt6i_gateway = ort->rt6i_gateway;
2414        rt->rt6i_flags = ort->rt6i_flags;
2415        rt6_set_from(rt, ort);
2416        rt->rt6i_metric = ort->rt6i_metric;
2417#ifdef CONFIG_IPV6_SUBTREES
2418        rt->rt6i_src = ort->rt6i_src;
2419#endif
2420        rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2421        rt->rt6i_table = ort->rt6i_table;
2422        rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2423}
2424
2425#ifdef CONFIG_IPV6_ROUTE_INFO
2426static struct rt6_info *rt6_get_route_info(struct net *net,
2427                                           const struct in6_addr *prefix, int prefixlen,
2428                                           const struct in6_addr *gwaddr,
2429                                           struct net_device *dev)
2430{
2431        u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2432        int ifindex = dev->ifindex;
2433        struct fib6_node *fn;
2434        struct rt6_info *rt = NULL;
2435        struct fib6_table *table;
2436
2437        table = fib6_get_table(net, tb_id);
2438        if (!table)
2439                return NULL;
2440
2441        read_lock_bh(&table->tb6_lock);
2442        fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2443        if (!fn)
2444                goto out;
2445
2446        for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2447                if (rt->dst.dev->ifindex != ifindex)
2448                        continue;
2449                if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2450                        continue;
2451                if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2452                        continue;
2453                dst_hold(&rt->dst);
2454                break;
2455        }
2456out:
2457        read_unlock_bh(&table->tb6_lock);
2458        return rt;
2459}
2460
2461static struct rt6_info *rt6_add_route_info(struct net *net,
2462                                           const struct in6_addr *prefix, int prefixlen,
2463                                           const struct in6_addr *gwaddr,
2464                                           struct net_device *dev,
2465                                           unsigned int pref)
2466{
2467        struct fib6_config cfg = {
2468                .fc_metric      = IP6_RT_PRIO_USER,
2469                .fc_ifindex     = dev->ifindex,
2470                .fc_dst_len     = prefixlen,
2471                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2472                                  RTF_UP | RTF_PREF(pref),
2473                .fc_nlinfo.portid = 0,
2474                .fc_nlinfo.nlh = NULL,
2475                .fc_nlinfo.nl_net = net,
2476        };
2477
2478        cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2479        cfg.fc_dst = *prefix;
2480        cfg.fc_gateway = *gwaddr;
2481
2482        /* We should treat it as a default route if prefix length is 0. */
2483        if (!prefixlen)
2484                cfg.fc_flags |= RTF_DEFAULT;
2485
2486        ip6_route_add(&cfg);
2487
2488        return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2489}
2490#endif
2491
2492struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2493{
2494        u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2495        struct rt6_info *rt;
2496        struct fib6_table *table;
2497
2498        table = fib6_get_table(dev_net(dev), tb_id);
2499        if (!table)
2500                return NULL;
2501
2502        read_lock_bh(&table->tb6_lock);
2503        for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2504                if (dev == rt->dst.dev &&
2505                    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2506                    ipv6_addr_equal(&rt->rt6i_gateway, addr))
2507                        break;
2508        }
2509        if (rt)
2510                dst_hold(&rt->dst);
2511        read_unlock_bh(&table->tb6_lock);
2512        return rt;
2513}
2514
2515struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2516                                     struct net_device *dev,
2517                                     unsigned int pref)
2518{
2519        struct fib6_config cfg = {
2520                .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2521                .fc_metric      = IP6_RT_PRIO_USER,
2522                .fc_ifindex     = dev->ifindex,
2523                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2524                                  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2525                .fc_nlinfo.portid = 0,
2526                .fc_nlinfo.nlh = NULL,
2527                .fc_nlinfo.nl_net = dev_net(dev),
2528        };
2529
2530        cfg.fc_gateway = *gwaddr;
2531
2532        if (!ip6_route_add(&cfg)) {
2533                struct fib6_table *table;
2534
2535                table = fib6_get_table(dev_net(dev), cfg.fc_table);
2536                if (table)
2537                        table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2538        }
2539
2540        return rt6_get_dflt_router(gwaddr, dev);
2541}
2542
2543static void __rt6_purge_dflt_routers(struct fib6_table *table)
2544{
2545        struct rt6_info *rt;
2546
2547restart:
2548        read_lock_bh(&table->tb6_lock);
2549        for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2550                if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2551                    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2552                        dst_hold(&rt->dst);
2553                        read_unlock_bh(&table->tb6_lock);
2554                        ip6_del_rt(rt);
2555                        goto restart;
2556                }
2557        }
2558        read_unlock_bh(&table->tb6_lock);
2559
2560        table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
2561}
2562
2563void rt6_purge_dflt_routers(struct net *net)
2564{
2565        struct fib6_table *table;
2566        struct hlist_head *head;
2567        unsigned int h;
2568
2569        rcu_read_lock();
2570
2571        for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2572                head = &net->ipv6.fib_table_hash[h];
2573                hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2574                        if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2575                                __rt6_purge_dflt_routers(table);
2576                }
2577        }
2578
2579        rcu_read_unlock();
2580}
2581
2582static void rtmsg_to_fib6_config(struct net *net,
2583                                 struct in6_rtmsg *rtmsg,
2584                                 struct fib6_config *cfg)
2585{
2586        memset(cfg, 0, sizeof(*cfg));
2587
2588        cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2589                         : RT6_TABLE_MAIN;
2590        cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2591        cfg->fc_metric = rtmsg->rtmsg_metric;
2592        cfg->fc_expires = rtmsg->rtmsg_info;
2593        cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2594        cfg->fc_src_len = rtmsg->rtmsg_src_len;
2595        cfg->fc_flags = rtmsg->rtmsg_flags;
2596
2597        cfg->fc_nlinfo.nl_net = net;
2598
2599        cfg->fc_dst = rtmsg->rtmsg_dst;
2600        cfg->fc_src = rtmsg->rtmsg_src;
2601        cfg->fc_gateway = rtmsg->rtmsg_gateway;
2602}
2603
2604int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2605{
2606        struct fib6_config cfg;
2607        struct in6_rtmsg rtmsg;
2608        int err;
2609
2610        switch (cmd) {
2611        case SIOCADDRT:         /* Add a route */
2612        case SIOCDELRT:         /* Delete a route */
2613                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2614                        return -EPERM;
2615                err = copy_from_user(&rtmsg, arg,
2616                                     sizeof(struct in6_rtmsg));
2617                if (err)
2618                        return -EFAULT;
2619
2620                rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2621
2622                rtnl_lock();
2623                switch (cmd) {
2624                case SIOCADDRT:
2625                        err = ip6_route_add(&cfg);
2626                        break;
2627                case SIOCDELRT:
2628                        err = ip6_route_del(&cfg);
2629                        break;
2630                default:
2631                        err = -EINVAL;
2632                }
2633                rtnl_unlock();
2634
2635                return err;
2636        }
2637
2638        return -EINVAL;
2639}
2640
2641/*
2642 *      Drop the packet on the floor
2643 */
2644
2645static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2646{
2647        int type;
2648        struct dst_entry *dst = skb_dst(skb);
2649        switch (ipstats_mib_noroutes) {
2650        case IPSTATS_MIB_INNOROUTES:
2651                type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2652                if (type == IPV6_ADDR_ANY) {
2653                        IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2654                                      IPSTATS_MIB_INADDRERRORS);
2655                        break;
2656                }
2657                /* FALLTHROUGH */
2658        case IPSTATS_MIB_OUTNOROUTES:
2659                IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2660                              ipstats_mib_noroutes);
2661                break;
2662        }
2663        icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2664        kfree_skb(skb);
2665        return 0;
2666}
2667
/* Drop @skb through ip6_pkt_drop() with ICMPv6 code ICMPV6_NOROUTE,
 * passing IPSTATS_MIB_INNOROUTES as the counter selector.
 */
static int ip6_pkt_discard(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}
2672
/* Drop @skb through ip6_pkt_drop() with ICMPv6 code ICMPV6_NOROUTE,
 * passing IPSTATS_MIB_OUTNOROUTES as the counter selector.
 * @net and @sk are not used by this function.
 */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        /* Point the skb at the device of its attached dst before dropping. */
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
2678
/* Drop @skb through ip6_pkt_drop() with ICMPv6 code ICMPV6_ADM_PROHIBITED,
 * passing IPSTATS_MIB_INNOROUTES as the counter selector.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}
2683
/* Drop @skb through ip6_pkt_drop() with ICMPv6 code ICMPV6_ADM_PROHIBITED,
 * passing IPSTATS_MIB_OUTNOROUTES as the counter selector.
 * @net and @sk are not used by this function.
 */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        /* Point the skb at the device of its attached dst before dropping. */
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
2689
2690/*
2691 *      Allocate a dst for local (unicast / anycast) address.
2692 */
2693
2694struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2695                                    const struct in6_addr *addr,
2696                                    bool anycast)
2697{
2698        u32 tb_id;
2699        struct net *net = dev_net(idev->dev);
2700        struct net_device *dev = net->loopback_dev;
2701        struct rt6_info *rt;
2702
2703        /* use L3 Master device as loopback for host routes if device
2704         * is enslaved and address is not link local or multicast
2705         */
2706        if (!rt6_need_strict(addr))
2707                dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2708
2709        rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2710        if (!rt)
2711                return ERR_PTR(-ENOMEM);
2712
2713        in6_dev_hold(idev);
2714
2715        rt->dst.flags |= DST_HOST;
2716        rt->dst.input = ip6_input;
2717        rt->dst.output = ip6_output;
2718        rt->rt6i_idev = idev;
2719
2720        rt->rt6i_protocol = RTPROT_KERNEL;
2721        rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2722        if (anycast)
2723                rt->rt6i_flags |= RTF_ANYCAST;
2724        else
2725                rt->rt6i_flags |= RTF_LOCAL;
2726
2727        rt->rt6i_gateway  = *addr;
2728        rt->rt6i_dst.addr = *addr;
2729        rt->rt6i_dst.plen = 128;
2730        tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2731        rt->rt6i_table = fib6_get_table(net, tb_id);
2732        rt->dst.flags |= DST_NOCACHE;
2733
2734        atomic_set(&rt->dst.__refcnt, 1);
2735
2736        return rt;
2737}
2738
/* remove deleted ip from prefsrc entries */
/* Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
        struct net_device *dev;         /* device filter; NULL matches any device */
        struct net *net;                /* namespace whose ip6_null_entry is skipped */
        struct in6_addr *addr;          /* address compared against rt6i_prefsrc.addr */
};
2745
2746static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2747{
2748        struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2749        struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2750        struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2751
2752        if (((void *)rt->dst.dev == dev || !dev) &&
2753            rt != net->ipv6.ip6_null_entry &&
2754            ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2755                /* remove prefsrc entry */
2756                rt->rt6i_prefsrc.plen = 0;
2757        }
2758        return 0;
2759}
2760
2761void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2762{
2763        struct net *net = dev_net(ifp->idev->dev);
2764        struct arg_dev_net_ip adni = {
2765                .dev = ifp->idev->dev,
2766                .net = net,
2767                .addr = &ifp->addr,
2768        };
2769        fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2770}
2771
2772#define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2773#define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2774
2775/* Remove routers and update dst entries when gateway turn into host. */
2776static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2777{
2778        struct in6_addr *gateway = (struct in6_addr *)arg;
2779
2780        if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2781             ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2782             ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2783                return -1;
2784        }
2785        return 0;
2786}
2787
/* Apply fib6_clean_tohost() to every route in @net, with @gateway as the
 * address to match.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
        fib6_clean_all(net, fib6_clean_tohost, gateway);
}
2792
/* Argument bundle handed to fib6_ifdown(). */
struct arg_dev_net {
        struct net_device *dev;         /* device filter; NULL matches any device */
        struct net *net;                /* namespace whose ip6_null_entry is skipped */
};
2797
2798/* called with write lock held for table with rt */
2799static int fib6_ifdown(struct rt6_info *rt, void *arg)
2800{
2801        const struct arg_dev_net *adn = arg;
2802        const struct net_device *dev = adn->dev;
2803
2804        if ((rt->dst.dev == dev || !dev) &&
2805            rt != adn->net->ipv6.ip6_null_entry &&
2806            (rt->rt6i_nsiblings == 0 ||
2807             (dev && netdev_unregistering(dev)) ||
2808             !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
2809                return -1;
2810
2811        return 0;
2812}
2813
/*
 * Run the fib6_ifdown() predicate for @dev over the routes of @net.
 * A NULL @dev is treated as a wildcard by fib6_ifdown().
 */
void rt6_ifdown(struct net *net, struct net_device *dev)
{
        struct arg_dev_net adn = {
                .dev = dev,
                .net = net,
        };

        /* The same predicate is applied to the FIB tables and to whatever
         * icmp6_clean_all() walks (defined elsewhere).
         */
        fib6_clean_all(net, fib6_ifdown, &adn);
        icmp6_clean_all(fib6_ifdown, &adn);
        /* The uncached-list flush is only done for a concrete device. */
        if (dev)
                rt6_uncached_list_flush_dev(net, dev);
}
2826
/* Argument bundle handed to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
        struct net_device *dev;         /* device whose MTU changed */
        unsigned int mtu;               /* the new MTU value */
};
2831
/*
 * fib6_clean_all() callback: propagate a device MTU change to @rt.
 * @p_arg points to a struct rt6_mtu_change_arg. Always returns 0.
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
        struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
        struct inet6_dev *idev;

        /* In IPv6, PMTU discovery is not optional, so locking RTAX_MTU
           cannot disable it. The lock is still honoured here to block
           changes caused by addrconf/ndisc.
        */

        idev = __in6_dev_get(arg->dev);
        if (!idev)
                return 0;

        /* PMTU discovery cannot detect an administrative MTU increase
           (e.g. enabling jumbo frames), and RFC 1981 does not cover that
           case, so the increase must be applied here.
         */
        /*
           If the new MTU is lower than the route PMTU, the new MTU becomes
           the lowest MTU on the path, so the route PMTU is lowered to it.
           If the new MTU is higher than the route PMTU and the old device
           MTU was the lowest MTU on the path, the route PMTU is raised.
           Should another node on the path have the same low MTU, a
           Packet Too Big message will trigger PMTU discovery again.
         */
        if (rt->dst.dev == arg->dev &&
            dst_metric_raw(&rt->dst, RTAX_MTU) &&
            !dst_metric_locked(&rt->dst, RTAX_MTU)) {
                if (rt->rt6i_flags & RTF_CACHE) {
                        /* For RTF_CACHE with rt6i_pmtu == 0
                         * (i.e. a redirected route),
                         * the metrics of its rt->dst.from has already
                         * been updated.
                         */
                        /* Cached routes are only ever lowered here. */
                        if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
                                rt->rt6i_pmtu = arg->mtu;
                } else if (dst_mtu(&rt->dst) >= arg->mtu ||
                           (dst_mtu(&rt->dst) < arg->mtu &&
                            dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
                        /* Lower unconditionally; raise only when the route
                         * MTU equals the interface's recorded cnf.mtu6.
                         */
                        dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
                }
        }
        return 0;
}
2880
/* Apply rt6_mtu_change_route() to every route in @dev's namespace so that
 * routes through @dev pick up the new @mtu.
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
        struct rt6_mtu_change_arg arg = {
                .dev = dev,
                .mtu = mtu,
        };

        fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}
2890
2891static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2892        [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2893        [RTA_OIF]               = { .type = NLA_U32 },
2894        [RTA_IIF]               = { .type = NLA_U32 },
2895        [RTA_PRIORITY]          = { .type = NLA_U32 },
2896        [RTA_METRICS]           = { .type = NLA_NESTED },
2897        [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2898        [RTA_PREF]              = { .type = NLA_U8 },
2899        [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2900        [RTA_ENCAP]             = { .type = NLA_NESTED },
2901        [RTA_EXPIRES]           = { .type = NLA_U32 },
2902        [RTA_UID]               = { .type = NLA_U32 },
2903        [RTA_MARK]              = { .type = NLA_U32 },
2904};
2905
/*
 * Parse the rtnetlink route message @nlh (received in @skb) into @cfg.
 *
 * @cfg is zeroed after the attributes have been parsed against
 * rtm_ipv6_policy and is then filled from the rtmsg header and from each
 * attribute that is present.
 *
 * Returns 0 on success, the nlmsg_parse() or lwtunnel validation error,
 * or -EINVAL when RTA_DST / RTA_SRC is shorter than the prefix length
 * given in the header requires.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct fib6_config *cfg)
{
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        unsigned int pref;
        int err;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
                          NULL);
        if (err < 0)
                goto errout;

        /* Error reported by the prefix-length checks below. */
        err = -EINVAL;
        rtm = nlmsg_data(nlh);
        memset(cfg, 0, sizeof(*cfg));

        cfg->fc_table = rtm->rtm_table;
        cfg->fc_dst_len = rtm->rtm_dst_len;
        cfg->fc_src_len = rtm->rtm_src_len;
        cfg->fc_flags = RTF_UP;
        cfg->fc_protocol = rtm->rtm_protocol;
        cfg->fc_type = rtm->rtm_type;

        /* These four route types are all flagged as reject routes. */
        if (rtm->rtm_type == RTN_UNREACHABLE ||
            rtm->rtm_type == RTN_BLACKHOLE ||
            rtm->rtm_type == RTN_PROHIBIT ||
            rtm->rtm_type == RTN_THROW)
                cfg->fc_flags |= RTF_REJECT;

        if (rtm->rtm_type == RTN_LOCAL)
                cfg->fc_flags |= RTF_LOCAL;

        if (rtm->rtm_flags & RTM_F_CLONED)
                cfg->fc_flags |= RTF_CACHE;

        cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
        cfg->fc_nlinfo.nlh = nlh;
        cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

        if (tb[RTA_GATEWAY]) {
                cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
                cfg->fc_flags |= RTF_GATEWAY;
        }

        if (tb[RTA_DST]) {
                /* Bytes needed to hold rtm_dst_len bits, rounded up. */
                int plen = (rtm->rtm_dst_len + 7) >> 3;

                if (nla_len(tb[RTA_DST]) < plen)
                        goto errout;

                nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
        }

        if (tb[RTA_SRC]) {
                /* Bytes needed to hold rtm_src_len bits, rounded up. */
                int plen = (rtm->rtm_src_len + 7) >> 3;

                if (nla_len(tb[RTA_SRC]) < plen)
                        goto errout;

                nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
        }

        if (tb[RTA_PREFSRC])
                cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

        if (tb[RTA_OIF])
                cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

        if (tb[RTA_PRIORITY])
                cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

        /* The metrics are not copied: @cfg points into the message. */
        if (tb[RTA_METRICS]) {
                cfg->fc_mx = nla_data(tb[RTA_METRICS]);
                cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
        }

        /* RTA_TABLE, when present, overrides the header's rtm_table. */
        if (tb[RTA_TABLE])
                cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

        /* The nexthop list is not copied either: @cfg points into the
         * message.
         */
        if (tb[RTA_MULTIPATH]) {
                cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
                cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

                err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
                                                     cfg->fc_mp_len);
                if (err < 0)
                        goto errout;
        }

        if (tb[RTA_PREF]) {
                /* Any value other than LOW or HIGH is treated as MEDIUM. */
                pref = nla_get_u8(tb[RTA_PREF]);
                if (pref != ICMPV6_ROUTER_PREF_LOW &&
                    pref != ICMPV6_ROUTER_PREF_HIGH)
                        pref = ICMPV6_ROUTER_PREF_MEDIUM;
                cfg->fc_flags |= RTF_PREF(pref);
        }

        if (tb[RTA_ENCAP])
                cfg->fc_encap = tb[RTA_ENCAP];

        if (tb[RTA_ENCAP_TYPE]) {
                cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

                err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
                if (err < 0)
                        goto errout;
        }

        if (tb[RTA_EXPIRES]) {
                unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

                /* RTF_EXPIRES is set only for a finite timeout.
                 * NOTE(review): presumably timeout is in seconds here,
                 * hence the multiplication by HZ - confirm.
                 */
                if (addrconf_finite_timeout(timeout)) {
                        cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
                        cfg->fc_flags |= RTF_EXPIRES;
                }
        }

        err = 0;
errout:
        return err;
}
3028
/* One parsed nexthop of a multipath request, queued on a local list while
 * ip6_route_multipath_add() builds and inserts the routes.
 */
struct rt6_nh {
        struct rt6_info *rt6_info;      /* route built for this nexthop; set to NULL once insertion was attempted */
        struct fib6_config r_cfg;       /* per-nexthop copy of the config, used to delete the route on rollback */
        struct mx6_config mxc;          /* metrics converted from r_cfg */
        struct list_head next;          /* list linkage */
};
3035
/* Log one warning per nexthop queued on @rt6_nh_list after a multipath
 * route replace failed part-way through.
 */
static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
        struct rt6_nh *nh;

        list_for_each_entry(nh, rt6_nh_list, next) {
                /* NOTE(review): pr_fmt at the top of this file already
                 * prepends "IPv6: ", so the logged line presumably carries
                 * both prefixes - confirm before changing the text.
                 */
                pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
                        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
                        nh->r_cfg.fc_ifindex);
        }
}
3046
3047static int ip6_route_info_append(struct list_head *rt6_nh_list,
3048                                 struct rt6_info *rt, struct fib6_config *r_cfg)
3049{
3050        struct rt6_nh *nh;
3051        struct rt6_info *rtnh;
3052        int err = -EEXIST;
3053
3054        list_for_each_entry(nh, rt6_nh_list, next) {
3055                /* check if rt6_info already exists */
3056                rtnh = nh->rt6_info;
3057
3058                if (rtnh->dst.dev == rt->dst.dev &&
3059                    rtnh->rt6i_idev == rt->rt6i_idev &&
3060                    ipv6_addr_equal(&rtnh->rt6i_gateway,
3061                                    &rt->rt6i_gateway))
3062                        return err;
3063        }
3064
3065        nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3066        if (!nh)
3067                return -ENOMEM;
3068        nh->rt6_info = rt;
3069        err = ip6_convert_metrics(&nh->mxc, r_cfg);
3070        if (err) {
3071                kfree(nh);
3072                return err;
3073        }
3074        memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3075        list_add_tail(&nh->next, rt6_nh_list);
3076
3077        return 0;
3078}
3079
3080static void ip6_route_mpath_notify(struct rt6_info *rt,
3081                                   struct rt6_info *rt_last,
3082                                   struct nl_info *info,
3083                                   __u16 nlflags)
3084{
3085        /* if this is an APPEND route, then rt points to the first route
3086         * inserted and rt_last points to last route inserted. Userspace
3087         * wants a consistent dump of the route which starts at the first
3088         * nexthop. Since sibling routes are always added at the end of
3089         * the list, find the first sibling of the last route appended
3090         */
3091        if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3092                rt = list_first_entry(&rt_last->rt6i_siblings,
3093                                      struct rt6_info,
3094                                      rt6i_siblings);
3095        }
3096
3097        if (rt)
3098                inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
3099}
3100
3101static int ip6_route_multipath_add(struct fib6_config *cfg)
3102{
3103        struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3104        struct nl_info *info = &cfg->fc_nlinfo;
3105        struct fib6_config r_cfg;
3106        struct rtnexthop *rtnh;
3107        struct rt6_info *rt;
3108        struct rt6_nh *err_nh;
3109        struct rt6_nh *nh, *nh_safe;
3110        __u16 nlflags;
3111        int remaining;
3112        int attrlen;
3113        int err = 1;
3114        int nhn = 0;
3115        int replace = (cfg->fc_nlinfo.nlh &&
3116                       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3117        LIST_HEAD(rt6_nh_list);
3118
3119        nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3120        if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3121                nlflags |= NLM_F_APPEND;
3122
3123        remaining = cfg->fc_mp_len;
3124        rtnh = (struct rtnexthop *)cfg->fc_mp;
3125
3126        /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3127         * rt6_info structs per nexthop
3128         */
3129        while (rtnh_ok(rtnh, remaining)) {
3130                memcpy(&r_cfg, cfg, sizeof(*cfg));
3131                if (rtnh->rtnh_ifindex)
3132                        r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3133
3134                attrlen = rtnh_attrlen(rtnh);
3135                if (attrlen > 0) {
3136                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3137
3138                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3139                        if (nla) {
3140                                r_cfg.fc_gateway = nla_get_in6_addr(nla);
3141                                r_cfg.fc_flags |= RTF_GATEWAY;
3142                        }
3143                        r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3144                        nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3145                        if (nla)
3146                                r_cfg.fc_encap_type = nla_get_u16(nla);
3147                }
3148
3149                rt = ip6_route_info_create(&r_cfg);
3150                if (IS_ERR(rt)) {
3151                        err = PTR_ERR(rt);
3152                        rt = NULL;
3153                        goto cleanup;
3154                }
3155
3156                err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3157                if (err) {
3158                        dst_free(&rt->dst);
3159                        goto cleanup;
3160                }
3161
3162                rtnh = rtnh_next(rtnh, &remaining);
3163        }
3164
3165        /* for add and replace send one notification with all nexthops.
3166         * Skip the notification in fib6_add_rt2node and send one with
3167         * the full route when done
3168         */
3169        info->skip_notify = 1;
3170
3171        err_nh = NULL;
3172        list_for_each_entry(nh, &rt6_nh_list, next) {
3173                rt_last = nh->rt6_info;
3174                err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc);
3175                /* save reference to first route for notification */
3176                if (!rt_notif && !err)
3177                        rt_notif = nh->rt6_info;
3178
3179                /* nh->rt6_info is used or freed at this point, reset to NULL*/
3180                nh->rt6_info = NULL;
3181                if (err) {
3182                        if (replace && nhn)
3183                                ip6_print_replace_route_err(&rt6_nh_list);
3184                        err_nh = nh;
3185                        goto add_errout;
3186                }
3187
3188                /* Because each route is added like a single route we remove
3189                 * these flags after the first nexthop: if there is a collision,
3190                 * we have already failed to add the first nexthop:
3191                 * fib6_add_rt2node() has rejected it; when replacing, old
3192                 * nexthops have been replaced by first new, the rest should
3193                 * be added to it.
3194                 */
3195                cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3196                                                     NLM_F_REPLACE);
3197                nhn++;
3198        }
3199
3200        /* success ... tell user about new route */
3201        ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3202        goto cleanup;
3203
3204add_errout:
3205        /* send notification for routes that were added so that
3206         * the delete notifications sent by ip6_route_del are
3207         * coherent
3208         */
3209        if (rt_notif)
3210                ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3211
3212        /* Delete routes that were already added */
3213        list_for_each_entry(nh, &rt6_nh_list, next) {
3214                if (err_nh == nh)
3215                        break;
3216                ip6_route_del(&nh->r_cfg);
3217        }
3218
3219cleanup:
3220        list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3221                if (nh->rt6_info)
3222                        dst_free(&nh->rt6_info->dst);
3223                kfree(nh->mxc.mx);
3224                list_del(&nh->next);
3225                kfree(nh);
3226        }
3227
3228        return err;
3229}
3230
3231static int ip6_route_multipath_del(struct fib6_config *cfg)
3232{
3233        struct fib6_config r_cfg;
3234        struct rtnexthop *rtnh;
3235        int remaining;
3236        int attrlen;
3237        int err = 1, last_err = 0;
3238
3239        remaining = cfg->fc_mp_len;
3240        rtnh = (struct rtnexthop *)cfg->fc_mp;
3241
3242        /* Parse a Multipath Entry */
3243        while (rtnh_ok(rtnh, remaining)) {
3244                memcpy(&r_cfg, cfg, sizeof(*cfg));
3245                if (rtnh->rtnh_ifindex)
3246                        r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3247
3248                attrlen = rtnh_attrlen(rtnh);
3249                if (attrlen > 0) {
3250                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3251
3252                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3253                        if (nla) {
3254                                nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3255                                r_cfg.fc_flags |= RTF_GATEWAY;
3256                        }
3257                }
3258                err = ip6_route_del(&r_cfg);
3259                if (err)
3260                        last_err = err;
3261
3262                rtnh = rtnh_next(rtnh, &remaining);
3263        }
3264
3265        return last_err;
3266}
3267
3268static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3269                              struct netlink_ext_ack *extack)
3270{
3271        struct fib6_config cfg;
3272        int err;
3273
3274        err = rtm_to_fib6_config(skb, nlh, &cfg);
3275        if (err < 0)
3276                return err;
3277
3278        if (cfg.fc_mp)
3279                return ip6_route_multipath_del(&cfg);
3280        else {
3281                cfg.fc_delete_all_nh = 1;
3282                return ip6_route_del(&cfg);
3283        }
3284}
3285
3286static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3287                              struct netlink_ext_ack *extack)
3288{
3289        struct fib6_config cfg;
3290        int err;
3291
3292        err = rtm_to_fib6_config(skb, nlh, &cfg);
3293        if (err < 0)
3294                return err;
3295
3296        if (cfg.fc_mp)
3297                return ip6_route_multipath_add(&cfg);
3298        else
3299                return ip6_route_add(&cfg);
3300}
3301
3302static size_t rt6_nlmsg_size(struct rt6_info *rt)
3303{
3304        int nexthop_len = 0;
3305
3306        if (rt->rt6i_nsiblings) {
3307                nexthop_len = nla_total_size(0)  /* RTA_MULTIPATH */
3308                            + NLA_ALIGN(sizeof(struct rtnexthop))
3309                            + nla_total_size(16) /* RTA_GATEWAY */
3310                            + lwtunnel_get_encap_size(rt->dst.lwtstate);
3311
3312                nexthop_len *= rt->rt6i_nsiblings;
3313        }
3314
3315        return NLMSG_ALIGN(sizeof(struct rtmsg))
3316               + nla_total_size(16) /* RTA_SRC */
3317               + nla_total_size(16) /* RTA_DST */
3318               + nla_total_size(16) /* RTA_GATEWAY */
3319               + nla_total_size(16) /* RTA_PREFSRC */
3320               + nla_total_size(4) /* RTA_TABLE */
3321               + nla_total_size(4) /* RTA_IIF */
3322               + nla_total_size(4) /* RTA_OIF */
3323               + nla_total_size(4) /* RTA_PRIORITY */
3324               + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3325               + nla_total_size(sizeof(struct rta_cacheinfo))
3326               + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3327               + nla_total_size(1) /* RTA_PREF */
3328               + lwtunnel_get_encap_size(rt->dst.lwtstate)
3329               + nexthop_len;
3330}
3331
3332static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3333                            unsigned int *flags, bool skip_oif)
3334{
3335        if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3336                *flags |= RTNH_F_LINKDOWN;
3337                if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3338                        *flags |= RTNH_F_DEAD;
3339        }
3340
3341        if (rt->rt6i_flags & RTF_GATEWAY) {
3342                if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3343                        goto nla_put_failure;
3344        }
3345
3346        /* not needed for multipath encoding b/c it has a rtnexthop struct */
3347        if (!skip_oif && rt->dst.dev &&
3348            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3349                goto nla_put_failure;
3350
3351        if (rt->dst.lwtstate &&
3352            lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3353                goto nla_put_failure;
3354
3355        return 0;
3356
3357nla_put_failure:
3358        return -EMSGSIZE;
3359}
3360
3361/* add multipath next hop */
3362static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3363{
3364        struct rtnexthop *rtnh;
3365        unsigned int flags = 0;
3366
3367        rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3368        if (!rtnh)
3369                goto nla_put_failure;
3370
3371        rtnh->rtnh_hops = 0;
3372        rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3373
3374        if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3375                goto nla_put_failure;
3376
3377        rtnh->rtnh_flags = flags;
3378
3379        /* length of rtnetlink header + attributes */
3380        rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
3381
3382        return 0;
3383
3384nla_put_failure:
3385        return -EMSGSIZE;
3386}
3387
3388static int rt6_fill_node(struct net *net,
3389                         struct sk_buff *skb, struct rt6_info *rt,
3390                         struct in6_addr *dst, struct in6_addr *src,
3391                         int iif, int type, u32 portid, u32 seq,
3392                         unsigned int flags)
3393{
3394        u32 metrics[RTAX_MAX];
3395        struct rtmsg *rtm;
3396        struct nlmsghdr *nlh;
3397        long expires;
3398        u32 table;
3399
3400        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3401        if (!nlh)
3402                return -EMSGSIZE;
3403
3404        rtm = nlmsg_data(nlh);
3405        rtm->rtm_family = AF_INET6;
3406        rtm->rtm_dst_len = rt->rt6i_dst.plen;
3407        rtm->rtm_src_len = rt->rt6i_src.plen;
3408        rtm->rtm_tos = 0;
3409        if (rt->rt6i_table)
3410                table = rt->rt6i_table->tb6_id;
3411        else
3412                table = RT6_TABLE_UNSPEC;
3413        rtm->rtm_table = table;
3414        if (nla_put_u32(skb, RTA_TABLE, table))
3415                goto nla_put_failure;
3416        if (rt->rt6i_flags & RTF_REJECT) {
3417                switch (rt->dst.error) {
3418                case -EINVAL:
3419                        rtm->rtm_type = RTN_BLACKHOLE;
3420                        break;
3421                case -EACCES:
3422                        rtm->rtm_type = RTN_PROHIBIT;
3423                        break;
3424                case -EAGAIN:
3425                        rtm->rtm_type = RTN_THROW;
3426                        break;
3427                default:
3428                        rtm->rtm_type = RTN_UNREACHABLE;
3429                        break;
3430                }
3431        }
3432        else if (rt->rt6i_flags & RTF_LOCAL)
3433                rtm->rtm_type = RTN_LOCAL;
3434        else if (rt->rt6i_flags & RTF_ANYCAST)
3435                rtm->rtm_type = RTN_ANYCAST;
3436        else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3437                rtm->rtm_type = RTN_LOCAL;
3438        else
3439                rtm->rtm_type = RTN_UNICAST;
3440        rtm->rtm_flags = 0;
3441        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3442        rtm->rtm_protocol = rt->rt6i_protocol;
3443        if (rt->rt6i_flags & RTF_DYNAMIC)
3444                rtm->rtm_protocol = RTPROT_REDIRECT;
3445        else if (rt->rt6i_flags & RTF_ADDRCONF) {
3446                if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3447                        rtm->rtm_protocol = RTPROT_RA;
3448                else
3449                        rtm->rtm_protocol = RTPROT_KERNEL;
3450        }
3451
3452        if (rt->rt6i_flags & RTF_CACHE)
3453                rtm->rtm_flags |= RTM_F_CLONED;
3454
3455        if (dst) {
3456                if (nla_put_in6_addr(skb, RTA_DST, dst))
3457                        goto nla_put_failure;
3458                rtm->rtm_dst_len = 128;
3459        } else if (rtm->rtm_dst_len)
3460                if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3461                        goto nla_put_failure;
3462#ifdef CONFIG_IPV6_SUBTREES
3463        if (src) {
3464                if (nla_put_in6_addr(skb, RTA_SRC, src))
3465                        goto nla_put_failure;
3466                rtm->rtm_src_len = 128;
3467        } else if (rtm->rtm_src_len &&
3468                   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3469                goto nla_put_failure;
3470#endif
3471        if (iif) {
3472#ifdef CONFIG_IPV6_MROUTE
3473                if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3474                        int err = ip6mr_get_route(net, skb, rtm, portid);
3475
3476                        if (err == 0)
3477                                return 0;
3478                        if (err < 0)
3479                                goto nla_put_failure;
3480                } else
3481#endif
3482                        if (nla_put_u32(skb, RTA_IIF, iif))
3483                                goto nla_put_failure;
3484        } else if (dst) {
3485                struct in6_addr saddr_buf;
3486                if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3487                    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3488                        goto nla_put_failure;
3489        }
3490
3491        if (rt->rt6i_prefsrc.plen) {
3492                struct in6_addr saddr_buf;
3493                saddr_buf = rt->rt6i_prefsrc.addr;
3494                if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3495                        goto nla_put_failure;
3496        }
3497
3498        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3499        if (rt->rt6i_pmtu)
3500                metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3501        if (rtnetlink_put_metrics(skb, metrics) < 0)
3502                goto nla_put_failure;
3503
3504        if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3505                goto nla_put_failure;
3506
3507        /* For multipath routes, walk the siblings list and add
3508         * each as a nexthop within RTA_MULTIPATH.
3509         */
3510        if (rt->rt6i_nsiblings) {
3511                struct rt6_info *sibling, *next_sibling;
3512                struct nlattr *mp;
3513
3514                mp = nla_nest_start(skb, RTA_MULTIPATH);
3515                if (!mp)
3516                        goto nla_put_failure;
3517
3518                if (rt6_add_nexthop(skb, rt) < 0)
3519                        goto nla_put_failure;
3520
3521                list_for_each_entry_safe(sibling, next_sibling,
3522                                         &rt->rt6i_siblings, rt6i_siblings) {
3523                        if (rt6_add_nexthop(skb, sibling) < 0)
3524                                goto nla_put_failure;
3525                }
3526
3527                nla_nest_end(skb, mp);
3528        } else {
3529                if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
3530                        goto nla_put_failure;
3531        }
3532
3533        expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3534
3535        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3536                goto nla_put_failure;
3537
3538        if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3539                goto nla_put_failure;
3540
3541
3542        nlmsg_end(skb, nlh);
3543        return 0;
3544
3545nla_put_failure:
3546        nlmsg_cancel(skb, nlh);
3547        return -EMSGSIZE;
3548}
3549
3550int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3551{
3552        struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3553        struct net *net = arg->net;
3554
3555        if (rt == net->ipv6.ip6_null_entry)
3556                return 0;
3557
3558        if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3559                struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3560
3561                /* user wants prefix routes only */
3562                if (rtm->rtm_flags & RTM_F_PREFIX &&
3563                    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3564                        /* success since this is not a prefix route */
3565                        return 1;
3566                }
3567        }
3568
3569        return rt6_fill_node(net,
3570                     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3571                     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3572                     NLM_F_MULTI);
3573}
3574
3575static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3576                              struct netlink_ext_ack *extack)
3577{
3578        struct net *net = sock_net(in_skb->sk);
3579        struct nlattr *tb[RTA_MAX+1];
3580        struct rt6_info *rt;
3581        struct sk_buff *skb;
3582        struct rtmsg *rtm;
3583        struct flowi6 fl6;
3584        int err, iif = 0, oif = 0;
3585
3586        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3587                          extack);
3588        if (err < 0)
3589                goto errout;
3590
3591        err = -EINVAL;
3592        memset(&fl6, 0, sizeof(fl6));
3593        rtm = nlmsg_data(nlh);
3594        fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3595
3596        if (tb[RTA_SRC]) {
3597                if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3598                        goto errout;
3599
3600                fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3601        }
3602
3603        if (tb[RTA_DST]) {
3604                if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3605                        goto errout;
3606
3607                fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3608        }
3609
3610        if (tb[RTA_IIF])
3611                iif = nla_get_u32(tb[RTA_IIF]);
3612
3613        if (tb[RTA_OIF])
3614                oif = nla_get_u32(tb[RTA_OIF]);
3615
3616        if (tb[RTA_MARK])
3617                fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3618
3619        if (tb[RTA_UID])
3620                fl6.flowi6_uid = make_kuid(current_user_ns(),
3621                                           nla_get_u32(tb[RTA_UID]));
3622        else
3623                fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
3624
3625        if (iif) {
3626                struct net_device *dev;
3627                int flags = 0;
3628
3629                dev = __dev_get_by_index(net, iif);
3630                if (!dev) {
3631                        err = -ENODEV;
3632                        goto errout;
3633                }
3634
3635                fl6.flowi6_iif = iif;
3636
3637                if (!ipv6_addr_any(&fl6.saddr))
3638                        flags |= RT6_LOOKUP_F_HAS_SADDR;
3639
3640                rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3641                                                               flags);
3642        } else {
3643                fl6.flowi6_oif = oif;
3644
3645                rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3646        }
3647
3648        if (rt == net->ipv6.ip6_null_entry) {
3649                err = rt->dst.error;
3650                ip6_rt_put(rt);
3651                goto errout;
3652        }
3653
3654        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3655        if (!skb) {
3656                ip6_rt_put(rt);
3657                err = -ENOBUFS;
3658                goto errout;
3659        }
3660
3661        skb_dst_set(skb, &rt->dst);
3662
3663        err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3664                            RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3665                            nlh->nlmsg_seq, 0);
3666        if (err < 0) {
3667                kfree_skb(skb);
3668                goto errout;
3669        }
3670
3671        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3672errout:
3673        return err;
3674}
3675
3676void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3677                     unsigned int nlm_flags)
3678{
3679        struct sk_buff *skb;
3680        struct net *net = info->nl_net;
3681        u32 seq;
3682        int err;
3683
3684        err = -ENOBUFS;
3685        seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3686
3687        skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3688        if (!skb)
3689                goto errout;
3690
3691        err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3692                                event, info->portid, seq, nlm_flags);
3693        if (err < 0) {
3694                /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3695                WARN_ON(err == -EMSGSIZE);
3696                kfree_skb(skb);
3697                goto errout;
3698        }
3699        rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3700                    info->nlh, gfp_any());
3701        return;
3702errout:
3703        if (err < 0)
3704                rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3705}
3706
3707static int ip6_route_dev_notify(struct notifier_block *this,
3708                                unsigned long event, void *ptr)
3709{
3710        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3711        struct net *net = dev_net(dev);
3712
3713        if (!(dev->flags & IFF_LOOPBACK))
3714                return NOTIFY_OK;
3715
3716        if (event == NETDEV_REGISTER) {
3717                net->ipv6.ip6_null_entry->dst.dev = dev;
3718                net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3719#ifdef CONFIG_IPV6_MULTIPLE_TABLES
3720                net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3721                net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3722                net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3723                net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3724#endif
3725         } else if (event == NETDEV_UNREGISTER &&
3726                    dev->reg_state != NETREG_UNREGISTERED) {
3727                /* NETDEV_UNREGISTER could be fired for multiple times by
3728                 * netdev_wait_allrefs(). Make sure we only call this once.
3729                 */
3730                in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev);
3731#ifdef CONFIG_IPV6_MULTIPLE_TABLES
3732                in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev);
3733                in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev);
3734#endif
3735        }
3736
3737        return NOTIFY_OK;
3738}
3739
3740/*
3741 *      /proc
3742 */
3743
3744#ifdef CONFIG_PROC_FS
3745
3746static const struct file_operations ipv6_route_proc_fops = {
3747        .owner          = THIS_MODULE,
3748        .open           = ipv6_route_open,
3749        .read           = seq_read,
3750        .llseek         = seq_lseek,
3751        .release        = seq_release_net,
3752};
3753
3754static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3755{
3756        struct net *net = (struct net *)seq->private;
3757        seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3758                   net->ipv6.rt6_stats->fib_nodes,
3759                   net->ipv6.rt6_stats->fib_route_nodes,
3760                   net->ipv6.rt6_stats->fib_rt_alloc,
3761                   net->ipv6.rt6_stats->fib_rt_entries,
3762                   net->ipv6.rt6_stats->fib_rt_cache,
3763                   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3764                   net->ipv6.rt6_stats->fib_discarded_routes);
3765
3766        return 0;
3767}
3768
3769static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3770{
3771        return single_open_net(inode, file, rt6_stats_seq_show);
3772}
3773
3774static const struct file_operations rt6_stats_seq_fops = {
3775        .owner   = THIS_MODULE,
3776        .open    = rt6_stats_seq_open,
3777        .read    = seq_read,
3778        .llseek  = seq_lseek,
3779        .release = single_release_net,
3780};
3781#endif  /* CONFIG_PROC_FS */
3782
3783#ifdef CONFIG_SYSCTL
3784
3785static
3786int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3787                              void __user *buffer, size_t *lenp, loff_t *ppos)
3788{
3789        struct net *net;
3790        int delay;
3791        if (!write)
3792                return -EINVAL;
3793
3794        net = (struct net *)ctl->extra1;
3795        delay = net->ipv6.sysctl.flush_delay;
3796        proc_dointvec(ctl, write, buffer, lenp, ppos);
3797        fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3798        return 0;
3799}
3800
3801struct ctl_table ipv6_route_table_template[] = {
3802        {
3803                .procname       =       "flush",
3804                .data           =       &init_net.ipv6.sysctl.flush_delay,
3805                .maxlen         =       sizeof(int),
3806                .mode           =       0200,
3807                .proc_handler   =       ipv6_sysctl_rtcache_flush
3808        },
3809        {
3810                .procname       =       "gc_thresh",
3811                .data           =       &ip6_dst_ops_template.gc_thresh,
3812                .maxlen         =       sizeof(int),
3813                .mode           =       0644,
3814                .proc_handler   =       proc_dointvec,
3815        },
3816        {
3817                .procname       =       "max_size",
3818                .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3819                .maxlen         =       sizeof(int),
3820                .mode           =       0644,
3821                .proc_handler   =       proc_dointvec,
3822        },
3823        {
3824                .procname       =       "gc_min_interval",
3825                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3826                .maxlen         =       sizeof(int),
3827                .mode           =       0644,
3828                .proc_handler   =       proc_dointvec_jiffies,
3829        },
3830        {
3831                .procname       =       "gc_timeout",
3832                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3833                .maxlen         =       sizeof(int),
3834                .mode           =       0644,
3835                .proc_handler   =       proc_dointvec_jiffies,
3836        },
3837        {
3838                .procname       =       "gc_interval",
3839                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3840                .maxlen         =       sizeof(int),
3841                .mode           =       0644,
3842                .proc_handler   =       proc_dointvec_jiffies,
3843        },
3844        {
3845                .procname       =       "gc_elasticity",
3846                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3847                .maxlen         =       sizeof(int),
3848                .mode           =       0644,
3849                .proc_handler   =       proc_dointvec,
3850        },
3851        {
3852                .procname       =       "mtu_expires",
3853                .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3854                .maxlen         =       sizeof(int),
3855                .mode           =       0644,
3856                .proc_handler   =       proc_dointvec_jiffies,
3857        },
3858        {
3859                .procname       =       "min_adv_mss",
3860                .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3861                .maxlen         =       sizeof(int),
3862                .mode           =       0644,
3863                .proc_handler   =       proc_dointvec,
3864        },
3865        {
3866                .procname       =       "gc_min_interval_ms",
3867                .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3868                .maxlen         =       sizeof(int),
3869                .mode           =       0644,
3870                .proc_handler   =       proc_dointvec_ms_jiffies,
3871        },
3872        { }
3873};
3874
3875struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3876{
3877        struct ctl_table *table;
3878
3879        table = kmemdup(ipv6_route_table_template,
3880                        sizeof(ipv6_route_table_template),
3881                        GFP_KERNEL);
3882
3883        if (table) {
3884                table[0].data = &net->ipv6.sysctl.flush_delay;
3885                table[0].extra1 = net;
3886                table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3887                table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3888                table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3889                table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3890                table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3891                table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3892                table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3893                table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3894                table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3895
3896                /* Don't export sysctls to unprivileged users */
3897                if (net->user_ns != &init_user_ns)
3898                        table[0].procname = NULL;
3899        }
3900
3901        return table;
3902}
3903#endif
3904
3905static int __net_init ip6_route_net_init(struct net *net)
3906{
3907        int ret = -ENOMEM;
3908
3909        memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3910               sizeof(net->ipv6.ip6_dst_ops));
3911
3912        if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3913                goto out_ip6_dst_ops;
3914
3915        net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3916                                           sizeof(*net->ipv6.ip6_null_entry),
3917                                           GFP_KERNEL);
3918        if (!net->ipv6.ip6_null_entry)
3919                goto out_ip6_dst_entries;
3920        net->ipv6.ip6_null_entry->dst.path =
3921                (struct dst_entry *)net->ipv6.ip6_null_entry;
3922        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3923        dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3924                         ip6_template_metrics, true);
3925
3926#ifdef CONFIG_IPV6_MULTIPLE_TABLES
3927        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3928                                               sizeof(*net->ipv6.ip6_prohibit_entry),
3929                                               GFP_KERNEL);
3930        if (!net->ipv6.ip6_prohibit_entry)
3931                goto out_ip6_null_entry;
3932        net->ipv6.ip6_prohibit_entry->dst.path =
3933                (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3934        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3935        dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3936                         ip6_template_metrics, true);
3937
3938        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3939                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
3940                                               GFP_KERNEL);
3941        if (!net->ipv6.ip6_blk_hole_entry)
3942                goto out_ip6_prohibit_entry;
3943        net->ipv6.ip6_blk_hole_entry->dst.path =
3944                (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3945        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3946        dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3947                         ip6_template_metrics, true);
3948#endif
3949
3950        net->ipv6.sysctl.flush_delay = 0;
3951        net->ipv6.sysctl.ip6_rt_max_size = 4096;
3952        net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3953        net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3954        net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3955        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3956        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3957        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3958
3959        net->ipv6.ip6_rt_gc_expire = 30*HZ;
3960
3961        ret = 0;
3962out:
3963        return ret;
3964
3965#ifdef CONFIG_IPV6_MULTIPLE_TABLES
3966out_ip6_prohibit_entry:
3967        kfree(net->ipv6.ip6_prohibit_entry);
3968out_ip6_null_entry:
3969        kfree(net->ipv6.ip6_null_entry);
3970#endif
3971out_ip6_dst_entries:
3972        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3973out_ip6_dst_ops:
3974        goto out;
3975}
3976
3977static void __net_exit ip6_route_net_exit(struct net *net)
3978{
3979        kfree(net->ipv6.ip6_null_entry);
3980#ifdef CONFIG_IPV6_MULTIPLE_TABLES
3981        kfree(net->ipv6.ip6_prohibit_entry);
3982        kfree(net->ipv6.ip6_blk_hole_entry);
3983#endif
3984        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3985}
3986
3987static int __net_init ip6_route_net_init_late(struct net *net)
3988{
3989#ifdef CONFIG_PROC_FS
3990        proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3991        proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3992#endif
3993        return 0;
3994}
3995
3996static void __net_exit ip6_route_net_exit_late(struct net *net)
3997{
3998#ifdef CONFIG_PROC_FS
3999        remove_proc_entry("ipv6_route", net->proc_net);
4000        remove_proc_entry("rt6_stats", net->proc_net);
4001#endif
4002}
4003
4004static struct pernet_operations ip6_route_net_ops = {
4005        .init = ip6_route_net_init,
4006        .exit = ip6_route_net_exit,
4007};
4008
4009static int __net_init ipv6_inetpeer_init(struct net *net)
4010{
4011        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4012
4013        if (!bp)
4014                return -ENOMEM;
4015        inet_peer_base_init(bp);
4016        net->ipv6.peers = bp;
4017        return 0;
4018}
4019
4020static void __net_exit ipv6_inetpeer_exit(struct net *net)
4021{
4022        struct inet_peer_base *bp = net->ipv6.peers;
4023
4024        net->ipv6.peers = NULL;
4025        inetpeer_invalidate_tree(bp);
4026        kfree(bp);
4027}
4028
4029static struct pernet_operations ipv6_inetpeer_ops = {
4030        .init   =       ipv6_inetpeer_init,
4031        .exit   =       ipv6_inetpeer_exit,
4032};
4033
4034static struct pernet_operations ip6_route_net_late_ops = {
4035        .init = ip6_route_net_init_late,
4036        .exit = ip6_route_net_exit_late,
4037};
4038
4039static struct notifier_block ip6_route_dev_notifier = {
4040        .notifier_call = ip6_route_dev_notify,
4041        .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
4042};
4043
4044void __init ip6_route_init_special_entries(void)
4045{
4046        /* Registering of the loopback is done before this portion of code,
4047         * the loopback reference in rt6_info will not be taken, do it
4048         * manually for init_net */
4049        init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4050        init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4051  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4052        init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4053        init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4054        init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4055        init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4056  #endif
4057}
4058
4059int __init ip6_route_init(void)
4060{
4061        int ret;
4062        int cpu;
4063
4064        ret = -ENOMEM;
4065        ip6_dst_ops_template.kmem_cachep =
4066                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4067                                  SLAB_HWCACHE_ALIGN, NULL);
4068        if (!ip6_dst_ops_template.kmem_cachep)
4069                goto out;
4070
4071        ret = dst_entries_init(&ip6_dst_blackhole_ops);
4072        if (ret)
4073                goto out_kmem_cache;
4074
4075        ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4076        if (ret)
4077                goto out_dst_entries;
4078
4079        ret = register_pernet_subsys(&ip6_route_net_ops);
4080        if (ret)
4081                goto out_register_inetpeer;
4082
4083        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4084
4085        ret = fib6_init();
4086        if (ret)
4087                goto out_register_subsys;
4088
4089        ret = xfrm6_init();
4090        if (ret)
4091                goto out_fib6_init;
4092
4093        ret = fib6_rules_init();
4094        if (ret)
4095                goto xfrm6_init;
4096
4097        ret = register_pernet_subsys(&ip6_route_net_late_ops);
4098        if (ret)
4099                goto fib6_rules_init;
4100
4101        ret = -ENOBUFS;
4102        if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
4103            __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
4104            __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
4105                goto out_register_late_subsys;
4106
4107        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4108        if (ret)
4109                goto out_register_late_subsys;
4110
4111        for_each_possible_cpu(cpu) {
4112                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4113
4114                INIT_LIST_HEAD(&ul->head);
4115                spin_lock_init(&ul->lock);
4116        }
4117
4118out:
4119        return ret;
4120
4121out_register_late_subsys:
4122        unregister_pernet_subsys(&ip6_route_net_late_ops);
4123fib6_rules_init:
4124        fib6_rules_cleanup();
4125xfrm6_init:
4126        xfrm6_fini();
4127out_fib6_init:
4128        fib6_gc_cleanup();
4129out_register_subsys:
4130        unregister_pernet_subsys(&ip6_route_net_ops);
4131out_register_inetpeer:
4132        unregister_pernet_subsys(&ipv6_inetpeer_ops);
4133out_dst_entries:
4134        dst_entries_destroy(&ip6_dst_blackhole_ops);
4135out_kmem_cache:
4136        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4137        goto out;
4138}
4139
4140void ip6_route_cleanup(void)
4141{
4142        unregister_netdevice_notifier(&ip6_route_dev_notifier);
4143        unregister_pernet_subsys(&ip6_route_net_late_ops);
4144        fib6_rules_cleanup();
4145        xfrm6_fini();
4146        fib6_gc_cleanup();
4147        unregister_pernet_subsys(&ipv6_inetpeer_ops);
4148        unregister_pernet_subsys(&ip6_route_net_ops);
4149        dst_entries_destroy(&ip6_dst_blackhole_ops);
4150        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4151}
4152